radeonsi: use shader_info::cs::local_size_variable to clean up some code
[mesa.git] src/gallium/drivers/radeonsi/si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "ac_exp_param.h"
26 #include "ac_rtld.h"
27 #include "compiler/nir/nir.h"
28 #include "compiler/nir/nir_serialize.h"
29 #include "si_pipe.h"
30 #include "si_shader_internal.h"
31 #include "sid.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_strings.h"
34 #include "util/u_memory.h"
35
36 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
37
38 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
39
40 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
41
42 /** Whether the shader runs as a combination of multiple API shaders */
43 bool si_is_multi_part_shader(struct si_shader *shader)
44 {
45 if (shader->selector->screen->info.chip_class <= GFX8)
46 return false;
47
48 return shader->key.as_ls || shader->key.as_es ||
49 shader->selector->info.stage == MESA_SHADER_TESS_CTRL ||
50 shader->selector->info.stage == MESA_SHADER_GEOMETRY;
51 }
52
53 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
54 bool si_is_merged_shader(struct si_shader *shader)
55 {
56 return shader->key.as_ngg || si_is_multi_part_shader(shader);
57 }
58
59 /**
60 * Returns a unique index for a per-patch semantic name and index. The index
61 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
62 * can be calculated.
63 */
64 unsigned si_shader_io_get_unique_index_patch(unsigned semantic)
65 {
66 switch (semantic) {
67 case VARYING_SLOT_TESS_LEVEL_OUTER:
68 return 0;
69 case VARYING_SLOT_TESS_LEVEL_INNER:
70 return 1;
71 default:
72 if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
73 return 2 + (semantic - VARYING_SLOT_PATCH0);
74
75 assert(!"invalid semantic");
76 return 0;
77 }
78 }
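/* For reference, the complete mapping produced above: TESS_LEVEL_OUTER -> 0,
 * TESS_LEVEL_INNER -> 1, and PATCH0..PATCH29 -> 2..31, so every per-patch
 * slot fits in a single 32-bit bitmask as the comment requires. */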
79
80 /**
81 * Returns a unique index for a semantic name and index. The index must be
82 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
83 * calculated.
84 */
85 unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
86 {
87 switch (semantic) {
88 case VARYING_SLOT_POS:
89 return 0;
90 default:
91 /* Since some shader stages use the highest used IO index
92 * to determine the size to allocate for inputs/outputs
93 * (in LDS, tess and GS rings), GENERIC should be placed right
94 * after POSITION to make that size as small as possible.
95 */
96 if (semantic >= VARYING_SLOT_VAR0 &&
97 semantic < VARYING_SLOT_VAR0 + SI_MAX_IO_GENERIC)
98 return 1 + (semantic - VARYING_SLOT_VAR0);
99
100 assert(!"invalid generic index");
101 return 0;
102 case VARYING_SLOT_FOGC:
103 return SI_MAX_IO_GENERIC + 1;
104 case VARYING_SLOT_COL0:
105 return SI_MAX_IO_GENERIC + 2;
106 case VARYING_SLOT_COL1:
107 return SI_MAX_IO_GENERIC + 3;
108 case VARYING_SLOT_BFC0:
109 /* If it's a varying, COLOR and BCOLOR alias. */
110 if (is_varying)
111 return SI_MAX_IO_GENERIC + 2;
112 else
113 return SI_MAX_IO_GENERIC + 4;
114 case VARYING_SLOT_BFC1:
115 if (is_varying)
116 return SI_MAX_IO_GENERIC + 3;
117 else
118 return SI_MAX_IO_GENERIC + 5;
119 case VARYING_SLOT_TEX0:
120 case VARYING_SLOT_TEX1:
121 case VARYING_SLOT_TEX2:
122 case VARYING_SLOT_TEX3:
123 case VARYING_SLOT_TEX4:
124 case VARYING_SLOT_TEX5:
125 case VARYING_SLOT_TEX6:
126 case VARYING_SLOT_TEX7:
127 return SI_MAX_IO_GENERIC + 6 + (semantic - VARYING_SLOT_TEX0);
128
129 /* These are rarely used between LS and HS or ES and GS. */
130 case VARYING_SLOT_CLIP_DIST0:
131 return SI_MAX_IO_GENERIC + 6 + 8;
132 case VARYING_SLOT_CLIP_DIST1:
133 return SI_MAX_IO_GENERIC + 6 + 8 + 1;
134 case VARYING_SLOT_CLIP_VERTEX:
135 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
136 case VARYING_SLOT_PSIZ:
137 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
138
139 /* These can't be written by LS, HS, and ES. */
140 case VARYING_SLOT_LAYER:
141 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
142 case VARYING_SLOT_VIEWPORT:
143 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
144 case VARYING_SLOT_PRIMITIVE_ID:
145 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
146 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
147 }
148 }
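/* A worked example of the layout produced by si_shader_io_get_unique_index(),
 * assuming SI_MAX_IO_GENERIC == 32 (illustrative; the authoritative value is
 * defined in the radeonsi headers):
 *
 *   POS                  -> 0
 *   VAR0..VAR31          -> 1..32
 *   FOGC                 -> 33
 *   COL0 / COL1          -> 34 / 35
 *   BFC0 / BFC1          -> 34 / 35 as varyings, 36 / 37 otherwise
 *   TEX0..TEX7           -> 38..45
 *   CLIP_DIST0/1         -> 46 / 47
 *   CLIP_VERTEX / PSIZ   -> 48 / 49
 *   LAYER / VIEWPORT     -> 50 / 51
 *   PRIMITIVE_ID         -> 52
 *
 * The STATIC_ASSERT above guarantees the largest index never exceeds 63, so a
 * 64-bit mask of used inputs/outputs is always sufficient. */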
149
150 static void si_dump_streamout(struct pipe_stream_output_info *so)
151 {
152 unsigned i;
153
154 if (so->num_outputs)
155 fprintf(stderr, "STREAMOUT\n");
156
157 for (i = 0; i < so->num_outputs; i++) {
158 unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
159 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,
160 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
161 so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
162 mask & 4 ? "z" : "", mask & 8 ? "w" : "");
163 }
164 }
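/* The dump above prints one line per streamout output, roughly of the form
 * (made-up values):
 *
 *   STREAMOUT
 *    0: BUF0[0..3] <- OUT[4].xyzw
 *
 * i.e. output 4 of the shader is written as 4 components starting at dword 0
 * of streamout buffer 0. */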
165
166 static void declare_streamout_params(struct si_shader_context *ctx,
167 struct pipe_stream_output_info *so)
168 {
169 if (ctx->screen->use_ngg_streamout) {
170 if (ctx->stage == MESA_SHADER_TESS_EVAL)
171 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
172 return;
173 }
174
175 /* Streamout SGPRs. */
176 if (so->num_outputs) {
177 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
178 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
179 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
180 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
181 }
182
183 /* A streamout buffer offset is loaded if the stride is non-zero. */
184 for (int i = 0; i < 4; i++) {
185 if (!so->stride[i])
186 continue;
187
188 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
189 }
190 }
191
192 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
193 {
194 switch (shader->selector->info.stage) {
195 case MESA_SHADER_VERTEX:
196 case MESA_SHADER_TESS_EVAL:
197 return shader->key.as_ngg ? 128 : 0;
198
199 case MESA_SHADER_TESS_CTRL:
200 /* Return this so that LLVM doesn't remove s_barrier
201 * instructions on chips where we use s_barrier. */
202 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
203
204 case MESA_SHADER_GEOMETRY:
205 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
206
207 case MESA_SHADER_COMPUTE:
208 break; /* see below */
209
210 default:
211 return 0;
212 }
213
214 /* Compile a variable block size using the maximum variable size. */
215 if (shader->selector->info.base.cs.local_size_variable)
216 return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
217
218 uint16_t *local_size = shader->selector->info.base.cs.local_size;
219 unsigned max_work_group_size = (uint32_t)local_size[0] *
220 (uint32_t)local_size[1] *
221 (uint32_t)local_size[2];
222 assert(max_work_group_size);
223 return max_work_group_size;
224 }
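/* A quick sketch of the two compute paths above. If local_size_variable is
 * set (variable work-group sizes, e.g. ARB_compute_variable_group_size), the
 * block size is unknown at compile time and the conservative maximum
 * SI_MAX_VARIABLE_THREADS_PER_BLOCK is used. With a fixed local size such as
 * {8, 8, 4} (illustrative numbers), the result is simply 8 * 8 * 4 = 256
 * threads per block. */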
225
226 static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
227 {
228 enum ac_arg_type const_shader_buf_type;
229
230 if (ctx->shader->selector->info.base.num_ubos == 1 &&
231 ctx->shader->selector->info.base.num_ssbos == 0)
232 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
233 else
234 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
235
236 ac_add_arg(
237 &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
238 assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
239 }
240
241 static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
242 {
243 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
244 assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
245 }
246
247 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
248 {
249 declare_const_and_shader_buffers(ctx, assign_params);
250 declare_samplers_and_images(ctx, assign_params);
251 }
252
253 static void declare_global_desc_pointers(struct si_shader_context *ctx)
254 {
255 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers);
256 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
257 &ctx->bindless_samplers_and_images);
258 }
259
260 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
261 {
262 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
263 if (!ctx->shader->is_gs_copy_shader) {
264 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
265 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
266 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
267 }
268 }
269
270 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
271 {
272 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
273
274 unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
275 if (num_vbos_in_user_sgprs) {
276 unsigned user_sgprs = ctx->args.num_sgprs_used;
277
278 if (si_is_merged_shader(ctx->shader))
279 user_sgprs -= 8;
280 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
281
282 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
283 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
284 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
285
286 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
287 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
288 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
289 }
290 }
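/* Illustration of the padding above, with made-up numbers: if 9 user SGPRs
 * were already declared and SI_SGPR_VS_VB_DESCRIPTOR_FIRST were 12, three
 * dummy SGPR args would be added so that the first 4-dword vertex buffer
 * descriptor starts on a 4-SGPR boundary, as the hardware requires. */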
291
292 static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs,
293 bool ngg_cull_shader)
294 {
295 struct si_shader *shader = ctx->shader;
296
297 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
298 if (shader->key.as_ls) {
299 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
300 if (ctx->screen->info.chip_class >= GFX10) {
301 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
302 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
303 } else {
304 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
305 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
306 }
307 } else if (ctx->screen->info.chip_class >= GFX10) {
308 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
309 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
310 &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
311 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
312 } else {
313 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
314 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
315 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
316 }
317
318 if (!shader->is_gs_copy_shader) {
319 if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
320 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
321 }
322
323 /* Vertex load indices. */
324 if (shader->selector->info.num_inputs) {
325 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
326 for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
327 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
328 }
329 *num_prolog_vgprs += shader->selector->info.num_inputs;
330 }
331 }
332
333 static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
334 {
335 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
336 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
337 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
338
339 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
340 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
341 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
342 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
343 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
344 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
345 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
346 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
347 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
348 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
349 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
350 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
351 }
352 }
353
354 static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
355 {
356 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
357 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
358 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
359 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
360
361 if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
362 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
363 }
364 }
365
366 enum
367 {
368 /* Convenient merged shader definitions. */
369 SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
370 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
371 };
372
373 void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
374 enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
375 {
376 assert(args->arg_count == idx);
377 ac_add_arg(args, file, registers, type, arg);
378 }
379
380 void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
381 {
382 struct si_shader *shader = ctx->shader;
383 LLVMTypeRef returns[AC_MAX_ARGS];
384 unsigned i, num_return_sgprs;
385 unsigned num_returns = 0;
386 unsigned num_prolog_vgprs = 0;
387 unsigned stage = ctx->stage;
388
389 memset(&ctx->args, 0, sizeof(ctx->args));
390
391 /* Set MERGED shaders. */
392 if (ctx->screen->info.chip_class >= GFX9) {
393 if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL)
394 stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
395 else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY)
396 stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
397 }
398
399 switch (stage) {
400 case MESA_SHADER_VERTEX:
401 declare_global_desc_pointers(ctx);
402
403 if (shader->selector->info.base.vs.blit_sgprs_amd) {
404 declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
405
406 /* VGPRs */
407 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
408 break;
409 }
410
411 declare_per_stage_desc_pointers(ctx, true);
412 declare_vs_specific_input_sgprs(ctx);
413 if (!shader->is_gs_copy_shader)
414 declare_vb_descriptor_input_sgprs(ctx);
415
416 if (shader->key.as_es) {
417 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
418 } else if (shader->key.as_ls) {
419 /* no extra parameters */
420 } else {
421 /* The locations of the other parameters are assigned dynamically. */
422 declare_streamout_params(ctx, &shader->selector->so);
423 }
424
425 /* VGPRs */
426 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
427
428 /* Return values */
429 if (shader->key.opt.vs_as_prim_discard_cs) {
430 for (i = 0; i < 4; i++)
431 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
432 }
433 break;
434
435 case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
436 declare_global_desc_pointers(ctx);
437 declare_per_stage_desc_pointers(ctx, true);
438 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
439 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
440 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
441 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
442 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
443 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
444
445 /* VGPRs */
446 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
447 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
448
449 /* param_tcs_offchip_offset and param_tcs_factor_offset are
450 * placed after the user SGPRs.
451 */
452 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
453 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
454 for (i = 0; i < 11; i++)
455 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
456 break;
457
458 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
459 /* Merged stages have 8 system SGPRs at the beginning. */
460 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
461 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
462 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
463 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
464 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
465 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
466 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
467 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
468
469 declare_global_desc_pointers(ctx);
470 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
471 declare_vs_specific_input_sgprs(ctx);
472
473 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
474 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
475 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
476 declare_vb_descriptor_input_sgprs(ctx);
477
478 /* VGPRs (first TCS, then VS) */
479 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
480 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
481
482 if (ctx->stage == MESA_SHADER_VERTEX) {
483 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
484
485 /* LS return values are inputs to the TCS main shader part. */
486 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
487 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
488 for (i = 0; i < 2; i++)
489 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
490 } else {
491 /* TCS return values are inputs to the TCS epilog.
492 *
493 * param_tcs_offchip_offset, param_tcs_factor_offset,
494 * param_tcs_offchip_layout, and param_rw_buffers
495 * should be passed to the epilog.
496 */
497 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
498 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
499 for (i = 0; i < 11; i++)
500 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
501 }
502 break;
503
504 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
505 /* Merged stages have 8 system SGPRs at the beginning. */
506 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
507 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);
508
509 if (ctx->shader->key.as_ngg)
510 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
511 else
512 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
513
514 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
515 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
516 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
517 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
518 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
519 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
520 NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
521
522 declare_global_desc_pointers(ctx);
523 if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {
524 declare_per_stage_desc_pointers(
525 ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
526 }
527
528 if (ctx->stage == MESA_SHADER_VERTEX) {
529 if (shader->selector->info.base.vs.blit_sgprs_amd)
530 declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
531 else
532 declare_vs_specific_input_sgprs(ctx);
533 } else {
534 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
535 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
536 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
537 /* Declare as many input SGPRs as the VS has. */
538 }
539
540 if (ctx->stage == MESA_SHADER_VERTEX)
541 declare_vb_descriptor_input_sgprs(ctx);
542
543 /* VGPRs (first GS, then VS/TES) */
544 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
545 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
546 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
547 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
548 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
549
550 if (ctx->stage == MESA_SHADER_VERTEX) {
551 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
552 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
553 declare_tes_input_vgprs(ctx, ngg_cull_shader);
554 }
555
556 if ((ctx->shader->key.as_es || ngg_cull_shader) &&
557 (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
558 unsigned num_user_sgprs, num_vgprs;
559
560 if (ctx->stage == MESA_SHADER_VERTEX) {
561 /* For the NGG cull shader, add 1 SGPR to hold
562 * the vertex buffer pointer.
563 */
564 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
565
566 if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
567 assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
568 num_user_sgprs =
569 SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
570 }
571 } else {
572 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
573 }
574
575 /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
576 *
577 * The normal merged ESGS shader only has to return the 5 VGPRs
578 * for the GS stage.
579 */
580 num_vgprs = ngg_cull_shader ? 10 : 5;
581
582 /* ES return values are inputs to GS. */
583 for (i = 0; i < 8 + num_user_sgprs; i++)
584 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
585 for (i = 0; i < num_vgprs; i++)
586 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
587 }
588 break;
589
590 case MESA_SHADER_TESS_EVAL:
591 declare_global_desc_pointers(ctx);
592 declare_per_stage_desc_pointers(ctx, true);
593 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
594 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
595 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
596
597 if (shader->key.as_es) {
598 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
599 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
600 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
601 } else {
602 declare_streamout_params(ctx, &shader->selector->so);
603 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
604 }
605
606 /* VGPRs */
607 declare_tes_input_vgprs(ctx, ngg_cull_shader);
608 break;
609
610 case MESA_SHADER_GEOMETRY:
611 declare_global_desc_pointers(ctx);
612 declare_per_stage_desc_pointers(ctx, true);
613 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
614 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
615
616 /* VGPRs */
617 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
618 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
619 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
620 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
621 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
622 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
623 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
624 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
625 break;
626
627 case MESA_SHADER_FRAGMENT:
628 declare_global_desc_pointers(ctx);
629 declare_per_stage_desc_pointers(ctx, true);
630 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
631 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
632 SI_PARAM_PRIM_MASK);
633
634 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
635 SI_PARAM_PERSP_SAMPLE);
636 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
637 SI_PARAM_PERSP_CENTER);
638 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
639 SI_PARAM_PERSP_CENTROID);
640 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
641 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
642 SI_PARAM_LINEAR_SAMPLE);
643 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
644 SI_PARAM_LINEAR_CENTER);
645 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
646 SI_PARAM_LINEAR_CENTROID);
647 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
648 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
649 SI_PARAM_POS_X_FLOAT);
650 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
651 SI_PARAM_POS_Y_FLOAT);
652 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
653 SI_PARAM_POS_Z_FLOAT);
654 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
655 SI_PARAM_POS_W_FLOAT);
656 shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
657 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
658 SI_PARAM_FRONT_FACE);
659 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
660 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
661 SI_PARAM_ANCILLARY);
662 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
663 SI_PARAM_SAMPLE_COVERAGE);
664 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
665 SI_PARAM_POS_FIXED_PT);
666
667 /* Color inputs from the prolog. */
668 if (shader->selector->info.colors_read) {
669 unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
670
671 for (i = 0; i < num_color_elements; i++)
672 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
673
674 num_prolog_vgprs += num_color_elements;
675 }
676
677 /* Outputs for the epilog. */
678 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
679 num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
680 shader->selector->info.writes_z + shader->selector->info.writes_stencil +
681 shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
682
683 num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
684
685 for (i = 0; i < num_return_sgprs; i++)
686 returns[i] = ctx->ac.i32;
687 for (; i < num_returns; i++)
688 returns[i] = ctx->ac.f32;
689 break;
690
691 case MESA_SHADER_COMPUTE:
692 declare_global_desc_pointers(ctx);
693 declare_per_stage_desc_pointers(ctx, true);
694 if (shader->selector->info.uses_grid_size)
695 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
696 if (shader->selector->info.uses_variable_block_size)
697 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
698
699 unsigned cs_user_data_dwords =
700 shader->selector->info.base.cs.user_data_components_amd;
701 if (cs_user_data_dwords) {
702 ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
703 }
704
705 /* Some descriptors can be in user SGPRs. */
706 /* Shader buffers in user SGPRs. */
707 for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
708 while (ctx->args.num_sgprs_used % 4 != 0)
709 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
710
711 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
712 }
713 /* Images in user SGPRs. */
714 for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
715 unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8;
716
717 while (ctx->args.num_sgprs_used % num_sgprs != 0)
718 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
719
720 ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
721 }
722
723 /* Hardware SGPRs. */
724 for (i = 0; i < 3; i++) {
725 if (shader->selector->info.uses_block_id[i]) {
726 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
727 }
728 }
729 if (shader->selector->info.uses_subgroup_info)
730 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
731
732 /* Hardware VGPRs. */
733 ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
734 break;
735 default:
736 assert(0 && "unimplemented shader");
737 return;
738 }
739
740 si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns,
741 si_get_max_workgroup_size(shader));
742
743 /* Reserve register locations for VGPR inputs the PS prolog may need. */
744 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
745 ac_llvm_add_target_dep_function_attr(
746 ctx->main_fn, "InitialPSInputAddr",
747 S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
748 S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
749 S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
750 S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
751 }
752
753 shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
754 shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
755
756 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
757 shader->info.num_input_vgprs -= num_prolog_vgprs;
758
759 if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) {
760 if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
761 /* The LSHS size is not known until draw time, so we append it
762 * at the end of whatever LDS use there may be in the rest of
763 * the shader (currently none, unless LLVM decides to do its
764 * own LDS-based lowering).
765 */
766 ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
767 "__lds_end", AC_ADDR_SPACE_LDS);
768 LLVMSetAlignment(ctx->ac.lds, 256);
769 } else {
770 ac_declare_lds_as_pointer(&ctx->ac);
771 }
772 }
773
774 /* Unlike radv, we override these arguments in the prolog, so to the
775 * API shader they appear as normal arguments.
776 */
777 if (ctx->stage == MESA_SHADER_VERTEX) {
778 ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
779 ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
780 } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
781 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
782 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
783 }
784 }
785
786 /* For the UMR disassembler. */
787 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
788 #define DEBUGGER_NUM_MARKERS 5
789
790 static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
791 struct ac_rtld_binary *rtld)
792 {
793 const struct si_shader_selector *sel = shader->selector;
794 const char *part_elfs[5];
795 size_t part_sizes[5];
796 unsigned num_parts = 0;
797
798 #define add_part(shader_or_part) \
799 if (shader_or_part) { \
800 part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
801 part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
802 num_parts++; \
803 }
804
805 add_part(shader->prolog);
806 add_part(shader->previous_stage);
807 add_part(shader->prolog2);
808 add_part(shader);
809 add_part(shader->epilog);
810
811 #undef add_part
812
813 struct ac_rtld_symbol lds_symbols[2];
814 unsigned num_lds_symbols = 0;
815
816 if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
817 (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {
818 /* We add this symbol even on LLVM <= 8 to ensure that
819 * shader->config.lds_size is set correctly below.
820 */
821 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
822 sym->name = "esgs_ring";
823 sym->size = shader->gs_info.esgs_ring_size * 4;
824 sym->align = 64 * 1024;
825 }
826
827 if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) {
828 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
829 sym->name = "ngg_emit";
830 sym->size = shader->ngg.ngg_emit_size * 4;
831 sym->align = 4;
832 }
833
834 bool ok = ac_rtld_open(
835 rtld, (struct ac_rtld_open_info){.info = &screen->info,
836 .options =
837 {
838 .halt_at_entry = screen->options.halt_shaders,
839 },
840 .shader_type = sel->info.stage,
841 .wave_size = si_get_shader_wave_size(shader),
842 .num_parts = num_parts,
843 .elf_ptrs = part_elfs,
844 .elf_sizes = part_sizes,
845 .num_shared_lds_symbols = num_lds_symbols,
846 .shared_lds_symbols = lds_symbols});
847
848 if (rtld->lds_size > 0) {
849 unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
850 shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
851 }
852
853 return ok;
854 }
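/* Rounding example for the lds_size computation above (illustrative numbers):
 * with the GFX7+ granularity of 512 bytes, rtld->lds_size = 1300 bytes becomes
 * align(1300, 512) / 512 = 3 allocation units, which is what the "LDS: %d
 * blocks" statistic later reports; GFX6 uses 256-byte units instead. */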
855
856 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
857 {
858 struct ac_rtld_binary rtld;
859 si_shader_binary_open(screen, shader, &rtld);
860 return rtld.exec_size;
861 }
862
863 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
864 {
865 uint64_t *scratch_va = data;
866
867 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
868 *value = (uint32_t)*scratch_va;
869 return true;
870 }
871 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
872 /* Enable scratch coalescing. */
873 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
874 return true;
875 }
876
877 return false;
878 }
879
880 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
881 uint64_t scratch_va)
882 {
883 struct ac_rtld_binary binary;
884 if (!si_shader_binary_open(sscreen, shader, &binary))
885 return false;
886
887 si_resource_reference(&shader->bo, NULL);
888 shader->bo = si_aligned_buffer_create(
889 &sscreen->b, sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY,
890 PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
891 if (!shader->bo)
892 return false;
893
894 /* Upload. */
895 struct ac_rtld_upload_info u = {};
896 u.binary = &binary;
897 u.get_external_symbol = si_get_external_symbol;
898 u.cb_data = &scratch_va;
899 u.rx_va = shader->bo->gpu_address;
900 u.rx_ptr = sscreen->ws->buffer_map(
901 shader->bo->buf, NULL,
902 PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY);
903 if (!u.rx_ptr)
904 return false;
905
906 bool ok = ac_rtld_upload(&u);
907
908 sscreen->ws->buffer_unmap(shader->bo->buf);
909 ac_rtld_close(&binary);
910
911 return ok;
912 }
913
914 static void si_shader_dump_disassembly(struct si_screen *screen,
915 const struct si_shader_binary *binary,
916 gl_shader_stage stage, unsigned wave_size,
917 struct pipe_debug_callback *debug, const char *name,
918 FILE *file)
919 {
920 struct ac_rtld_binary rtld_binary;
921
922 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
923 .info = &screen->info,
924 .shader_type = stage,
925 .wave_size = wave_size,
926 .num_parts = 1,
927 .elf_ptrs = &binary->elf_buffer,
928 .elf_sizes = &binary->elf_size}))
929 return;
930
931 const char *disasm;
932 size_t nbytes;
933
934 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
935 goto out;
936
937 if (nbytes > INT_MAX)
938 goto out;
939
940 if (debug && debug->debug_message) {
941 /* Very long debug messages are cut off, so send the
942 * disassembly one line at a time. This causes more
943 * overhead, but on the plus side it simplifies
944 * parsing of resulting logs.
945 */
946 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
947
948 uint64_t line = 0;
949 while (line < nbytes) {
950 int count = nbytes - line;
951 const char *nl = memchr(disasm + line, '\n', nbytes - line);
952 if (nl)
953 count = nl - (disasm + line);
954
955 if (count) {
956 pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
957 }
958
959 line += count + 1;
960 }
961
962 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
963 }
964
965 if (file) {
966 fprintf(file, "Shader %s disassembly:\n", name);
967 fprintf(file, "%*s", (int)nbytes, disasm);
968 }
969
970 out:
971 ac_rtld_close(&rtld_binary);
972 }
973
974 static void si_calculate_max_simd_waves(struct si_shader *shader)
975 {
976 struct si_screen *sscreen = shader->selector->screen;
977 struct ac_shader_config *conf = &shader->config;
978 unsigned num_inputs = shader->selector->info.num_inputs;
979 unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
980 unsigned lds_per_wave = 0;
981 unsigned max_simd_waves;
982
983 max_simd_waves = sscreen->info.max_wave64_per_simd;
984
985 /* Compute LDS usage for PS. */
986 switch (shader->selector->info.stage) {
987 case MESA_SHADER_FRAGMENT:
988 /* The minimum usage per wave is (num_inputs * 48). The maximum
989 * usage is (num_inputs * 48 * 16).
990 * We can get anything in between and it varies between waves.
991 *
992 * The 48 bytes per input for a single primitive is equal to
993 * 4 bytes/component * 4 components/input * 3 points.
994 *
995 * Other stages don't know the size at compile time or don't
996 * allocate LDS per wave, but instead they do it per thread group.
997 */
998 lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
999 break;
1000 case MESA_SHADER_COMPUTE:
1001 if (shader->selector) {
1002 unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
1003 lds_per_wave = (conf->lds_size * lds_increment) /
1004 DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
1005 }
1006 break;
1007 default:;
1008 }
1009
1010 /* Compute the per-SIMD wave counts. */
1011 if (conf->num_sgprs) {
1012 max_simd_waves =
1013 MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
1014 }
1015
1016 if (conf->num_vgprs) {
1017 /* Always print wave limits as Wave64, so that we can compare
1018 * Wave32 and Wave64 with shader-db fairly. */
1019 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
1020 max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
1021 }
1022
1023 unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
1024 if (lds_per_wave)
1025 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1026
1027 shader->info.max_simd_waves = max_simd_waves;
1028 }
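/* Worked example for the per-SIMD clamping above, with illustrative register
 * file sizes rather than the numbers of any specific chip: if a shader needs
 * 106 SGPRs and 40 VGPRs per wave, and the SIMD provides 800 physical SGPRs
 * and 1024 Wave64 VGPRs, then
 *   max_simd_waves = MIN2(800 / 106, 1024 / 40) = MIN2(7, 25) = 7
 * before the LDS-based limit is applied on top. */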
1029
1030 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1031 struct pipe_debug_callback *debug)
1032 {
1033 const struct ac_shader_config *conf = &shader->config;
1034
1035 if (screen->options.debug_disassembly)
1036 si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
1037 si_get_shader_wave_size(shader), debug, "main", NULL);
1038
1039 pipe_debug_message(debug, SHADER_INFO,
1040 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1041 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1042 "Spilled VGPRs: %d PrivMem VGPRs: %d",
1043 conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1044 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1045 conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
1046 }
1047
1048 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1049 bool check_debug_option)
1050 {
1051 const struct ac_shader_config *conf = &shader->config;
1052
1053 if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {
1054 if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {
1055 fprintf(file,
1056 "*** SHADER CONFIG ***\n"
1057 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1058 "SPI_PS_INPUT_ENA = 0x%04x\n",
1059 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1060 }
1061
1062 fprintf(file,
1063 "*** SHADER STATS ***\n"
1064 "SGPRS: %d\n"
1065 "VGPRS: %d\n"
1066 "Spilled SGPRs: %d\n"
1067 "Spilled VGPRs: %d\n"
1068 "Private memory VGPRs: %d\n"
1069 "Code Size: %d bytes\n"
1070 "LDS: %d blocks\n"
1071 "Scratch: %d bytes per wave\n"
1072 "Max Waves: %d\n"
1073 "********************\n\n\n",
1074 conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1075 shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1076 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1077 }
1078 }
1079
1080 const char *si_get_shader_name(const struct si_shader *shader)
1081 {
1082 switch (shader->selector->info.stage) {
1083 case MESA_SHADER_VERTEX:
1084 if (shader->key.as_es)
1085 return "Vertex Shader as ES";
1086 else if (shader->key.as_ls)
1087 return "Vertex Shader as LS";
1088 else if (shader->key.opt.vs_as_prim_discard_cs)
1089 return "Vertex Shader as Primitive Discard CS";
1090 else if (shader->key.as_ngg)
1091 return "Vertex Shader as ESGS";
1092 else
1093 return "Vertex Shader as VS";
1094 case MESA_SHADER_TESS_CTRL:
1095 return "Tessellation Control Shader";
1096 case MESA_SHADER_TESS_EVAL:
1097 if (shader->key.as_es)
1098 return "Tessellation Evaluation Shader as ES";
1099 else if (shader->key.as_ngg)
1100 return "Tessellation Evaluation Shader as ESGS";
1101 else
1102 return "Tessellation Evaluation Shader as VS";
1103 case MESA_SHADER_GEOMETRY:
1104 if (shader->is_gs_copy_shader)
1105 return "GS Copy Shader as VS";
1106 else
1107 return "Geometry Shader";
1108 case MESA_SHADER_FRAGMENT:
1109 return "Pixel Shader";
1110 case MESA_SHADER_COMPUTE:
1111 return "Compute Shader";
1112 default:
1113 return "Unknown Shader";
1114 }
1115 }
1116
1117 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1118 struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
1119 {
1120 gl_shader_stage stage = shader->selector->info.stage;
1121
1122 if (!check_debug_option || si_can_dump_shader(sscreen, stage))
1123 si_dump_shader_key(shader, file);
1124
1125 if (!check_debug_option && shader->binary.llvm_ir_string) {
1126 if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1127 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1128 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1129 }
1130
1131 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1132 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1133 }
1134
1135 if (!check_debug_option ||
1136 (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
1137 unsigned wave_size = si_get_shader_wave_size(shader);
1138
1139 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1140
1141 if (shader->prolog)
1142 si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,
1143 "prolog", file);
1144 if (shader->previous_stage)
1145 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1146 wave_size, debug, "previous stage", file);
1147 if (shader->prolog2)
1148 si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,
1149 debug, "prolog2", file);
1150
1151 si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",
1152 file);
1153
1154 if (shader->epilog)
1155 si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,
1156 "epilog", file);
1157 fprintf(file, "\n");
1158 }
1159
1160 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1161 }
1162
1163 static void si_dump_shader_key_vs(const struct si_shader_key *key,
1164 const struct si_vs_prolog_bits *prolog, const char *prefix,
1165 FILE *f)
1166 {
1167 fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
1168 fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,
1169 prolog->instance_divisor_is_fetched);
1170 fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix,
1171 prolog->unpack_instance_id_from_vertex_id);
1172 fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
1173
1174 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
1175 fprintf(f, " mono.vs.fix_fetch = {");
1176 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1177 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
1178 if (i)
1179 fprintf(f, ", ");
1180 if (!fix.bits)
1181 fprintf(f, "0");
1182 else
1183 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1184 fix.u.format);
1185 }
1186 fprintf(f, "}\n");
1187 }
1188
1189 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1190 {
1191 const struct si_shader_key *key = &shader->key;
1192 gl_shader_stage stage = shader->selector->info.stage;
1193
1194 fprintf(f, "SHADER KEY\n");
1195
1196 switch (stage) {
1197 case MESA_SHADER_VERTEX:
1198 si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
1199 fprintf(f, " as_es = %u\n", key->as_es);
1200 fprintf(f, " as_ls = %u\n", key->as_ls);
1201 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1202 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
1203 fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
1204 fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
1205 fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
1206 fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing);
1207 fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart);
1208 fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
1209 fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation);
1210 fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
1211 fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
1212 fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z);
1213 fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space);
1214 break;
1215
1216 case MESA_SHADER_TESS_CTRL:
1217 if (shader->selector->screen->info.chip_class >= GFX9) {
1218 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
1219 }
1220 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
1221 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
1222 key->mono.u.ff_tcs_inputs_to_copy);
1223 break;
1224
1225 case MESA_SHADER_TESS_EVAL:
1226 fprintf(f, " as_es = %u\n", key->as_es);
1227 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1228 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
1229 break;
1230
1231 case MESA_SHADER_GEOMETRY:
1232 if (shader->is_gs_copy_shader)
1233 break;
1234
1235 if (shader->selector->screen->info.chip_class >= GFX9 &&
1236 key->part.gs.es->info.stage == MESA_SHADER_VERTEX) {
1237 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
1238 }
1239 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n",
1240 key->part.gs.prolog.tri_strip_adj_fix);
1241 fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
1242 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1243 break;
1244
1245 case MESA_SHADER_COMPUTE:
1246 break;
1247
1248 case MESA_SHADER_FRAGMENT:
1249 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
1250 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
1251 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
1252 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n",
1253 key->part.ps.prolog.force_persp_sample_interp);
1254 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n",
1255 key->part.ps.prolog.force_linear_sample_interp);
1256 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n",
1257 key->part.ps.prolog.force_persp_center_interp);
1258 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n",
1259 key->part.ps.prolog.force_linear_center_interp);
1260 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n",
1261 key->part.ps.prolog.bc_optimize_for_persp);
1262 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n",
1263 key->part.ps.prolog.bc_optimize_for_linear);
1264 fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n",
1265 key->part.ps.prolog.samplemask_log_ps_iter);
1266 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n",
1267 key->part.ps.epilog.spi_shader_col_format);
1268 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
1269 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
1270 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
1271 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
1272 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
1273 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n",
1274 key->part.ps.epilog.poly_line_smoothing);
1275 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
1276 fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n",
1277 key->mono.u.ps.interpolate_at_sample_force_center);
1278 fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
1279 fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
1280 fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
1281 break;
1282
1283 default:
1284 assert(0);
1285 }
1286
1287 if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
1288 stage == MESA_SHADER_VERTEX) &&
1289 !key->as_es && !key->as_ls) {
1290 fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
1291 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
1292 if (stage != MESA_SHADER_GEOMETRY)
1293 fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
1294 }
1295 }
1296
1297 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
1298 {
1299 struct si_shader *shader = ctx->shader;
1300 struct si_shader_info *info = &shader->selector->info;
1301 unsigned skip_vs_optim_mask = 0;
1302
1303 if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
1304 shader->key.as_ls || shader->key.as_es)
1305 return;
1306
1307 /* Optimizing these outputs is not possible, since they might be overridden
1308 * at runtime with S_028644_PT_SPRITE_TEX. */
1309 for (int i = 0; i < info->num_outputs; i++) {
1310 if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
1311 (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
1312 info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
1313 skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
1314 }
1315 }
1316
1317 ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
1318 info->num_outputs, skip_vs_optim_mask,
1319 &shader->info.nr_param_exports);
1320 }
1321
1322 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1323 const struct si_vs_prolog_bits *prolog_key,
1324 const struct si_shader_key *key, bool ngg_cull_shader)
1325 {
1326 /* VGPR initialization fixup for Vega10 and Raven is always done in the
1327 * VS prolog. */
1328 return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
1329 prolog_key->unpack_instance_id_from_vertex_id ||
1330 (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
1331 }
1332
1333 static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader,
1334 struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
1335 {
1336 struct si_shader_selector *sel = shader->selector;
1337 const struct si_shader_info *info = &sel->info;
1338
1339 ctx->shader = shader;
1340 ctx->stage = sel->info.stage;
1341
1342 ctx->num_const_buffers = info->base.num_ubos;
1343 ctx->num_shader_buffers = info->base.num_ssbos;
1344
1345 ctx->num_samplers = util_last_bit(info->base.textures_used);
1346 ctx->num_images = info->base.num_images;
1347
1348 si_llvm_init_resource_callbacks(ctx);
1349
1350 switch (ctx->stage) {
1351 case MESA_SHADER_VERTEX:
1352 si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
1353 break;
1354 case MESA_SHADER_TESS_CTRL:
1355 si_llvm_init_tcs_callbacks(ctx);
1356 break;
1357 case MESA_SHADER_TESS_EVAL:
1358 si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
1359 break;
1360 case MESA_SHADER_GEOMETRY:
1361 si_llvm_init_gs_callbacks(ctx);
1362 break;
1363 case MESA_SHADER_FRAGMENT:
1364 si_llvm_init_ps_callbacks(ctx);
1365 break;
1366 case MESA_SHADER_COMPUTE:
1367 ctx->abi.load_local_group_size = si_llvm_get_block_size;
1368 break;
1369 default:
1370 assert(!"Unsupported shader type");
1371 return false;
1372 }
1373
1374 si_create_function(ctx, ngg_cull_shader);
1375
1376 if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
1377 si_preload_esgs_ring(ctx);
1378
1379 if (ctx->stage == MESA_SHADER_GEOMETRY)
1380 si_preload_gs_rings(ctx);
1381 else if (ctx->stage == MESA_SHADER_TESS_EVAL)
1382 si_llvm_preload_tes_rings(ctx);
1383
1384 if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
1385 for (unsigned i = 0; i < 6; i++) {
1386 ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1387 }
1388 }
1389
1390 if (ctx->stage == MESA_SHADER_GEOMETRY) {
1391 for (unsigned i = 0; i < 4; i++) {
1392 ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1393 }
1394 if (shader->key.as_ngg) {
1395 for (unsigned i = 0; i < 4; ++i) {
1396 ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1397 ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1398 }
1399
1400 assert(!ctx->gs_ngg_scratch);
1401 LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
1402 ctx->gs_ngg_scratch =
1403 LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1404 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
1405 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1406
1407 ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
1408 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
1409 LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
1410 LLVMSetAlignment(ctx->gs_ngg_emit, 4);
1411 }
1412 }
1413
1414 if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
1415 /* Unconditionally declare scratch space base for streamout and
1416 * vertex compaction. Whether space is actually allocated is
1417 * determined during linking / PM4 creation.
1418 *
1419 * Add an extra dword per vertex to ensure an odd stride, which
1420 * avoids bank conflicts for SoA accesses.
1421 */
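      /* Illustrative arithmetic (the stride value is an example, not computed
       * here): with a 17-dword vertex stride, lane i accesses dword 17 * i + c,
       * i.e. LDS bank (17 * i + c) % 32, and since gcd(17, 32) == 1 all 32
       * lanes of a wavefront hit distinct banks. */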
1422 if (!gfx10_is_ngg_passthrough(shader))
1423 si_llvm_declare_esgs_ring(ctx);
1424
1425 /* This is really only needed when streamout and / or vertex
1426 * compaction is enabled.
1427 */
1428 if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
1429 LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
1430 ctx->gs_ngg_scratch =
1431 LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1432 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
1433 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1434 }
1435 }
1436
1437 /* For GFX9 merged shaders:
1438 * - Set EXEC for the first shader. If the prolog is present, set
1439 * EXEC there instead.
1440 * - Add a barrier before the second shader.
1441 * - In the second shader, reset EXEC to ~0 and wrap the main part in
1442 * an if-statement. This is required for correctness in geometry
1443 * shaders, to ensure that empty GS waves do not send GS_EMIT and
1444 * GS_CUT messages.
1445 *
1446 * For monolithic merged shaders, the first shader is wrapped in an
1447 * if-block together with its prolog in si_build_wrapper_function.
1448 *
1449 * NGG vertex and tess eval shaders running as the last
1450 * vertex/geometry stage handle execution explicitly using
1451 * if-statements.
1452 */
1453 if (ctx->screen->info.chip_class >= GFX9) {
1454 if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
1455 (ctx->stage == MESA_SHADER_TESS_EVAL ||
1456 (ctx->stage == MESA_SHADER_VERTEX &&
1457 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
1458 si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
1459 } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
1460 (shader->key.as_ngg && !shader->key.as_es)) {
1461 LLVMValueRef thread_enabled;
1462 bool nested_barrier;
1463
1464 if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
1465 !shader->key.as_es && !shader->key.opt.ngg_culling))
1466 ac_init_exec_full_mask(&ctx->ac);
1467
1468 if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
1469 shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
1470 gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
1471
1472 /* Build the primitive export at the beginning
1473 * of the shader if possible.
1474 */
1475 if (gfx10_ngg_export_prim_early(shader))
1476 gfx10_ngg_build_export_prim(ctx, NULL, NULL);
1477 }
1478
1479 if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY) {
1480 if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg) {
1481 gfx10_ngg_gs_emit_prologue(ctx);
1482 nested_barrier = false;
1483 } else {
1484 nested_barrier = true;
1485 }
1486
1487 thread_enabled = si_is_gs_thread(ctx);
1488 } else {
1489 thread_enabled = si_is_es_thread(ctx);
1490 nested_barrier = false;
1491 }
1492
1493 ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
1494 ctx->merged_wrap_if_label = 11500;
1495 ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
1496
1497 if (nested_barrier) {
1498 /* Execute a barrier before the second shader in
1499 * a merged shader.
1500 *
1501 * Execute the barrier inside the conditional block,
1502 * so that empty waves can jump directly to s_endpgm,
1503 * which will also signal the barrier.
1504 *
1505 * This is possible in gfx9, because an empty wave
1506 * for the second shader does not participate in
1507 * the epilogue. With NGG, empty waves may still
1508 * be required to export data (e.g. GS output vertices),
1509 * so we cannot let them exit early.
1510 *
1511 * If the shader is TCS and the TCS epilog is present
1512 * and contains a barrier, it will wait there and then
1513 * reach s_endpgm.
1514 */
1515 si_llvm_emit_barrier(ctx);
1516 }
1517 }
1518 }
1519
1520 bool success = si_nir_build_llvm(ctx, nir);
1521 if (free_nir)
1522 ralloc_free(nir);
1523 if (!success) {
1524 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
1525 return false;
1526 }
1527
1528 si_llvm_build_ret(ctx, ctx->return_value);
1529 return true;
1530 }
1531
1532 /**
1533 * Compute the VS prolog key, which contains all the information needed to
1534 * build the VS prolog function, and set shader->info bits where needed.
1535 *
1536 * \param info Shader info of the vertex shader.
1537 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
1538  * \param ngg_cull_shader  Whether the prolog is built for the NGG cull shader part.
1539  * \param prolog_key       VS prolog states from the shader key.
1540 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
1541 * \param key Output shader part key.
1542 */
1543 static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
1544 bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
1545 struct si_shader *shader_out, union si_shader_part_key *key)
1546 {
1547 memset(key, 0, sizeof(*key));
1548 key->vs_prolog.states = *prolog_key;
1549 key->vs_prolog.num_input_sgprs = num_input_sgprs;
1550 key->vs_prolog.num_inputs = info->num_inputs;
1551 key->vs_prolog.as_ls = shader_out->key.as_ls;
1552 key->vs_prolog.as_es = shader_out->key.as_es;
1553 key->vs_prolog.as_ngg = shader_out->key.as_ngg;
1554 key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
1555
1556 if (ngg_cull_shader) {
1557 key->vs_prolog.gs_fast_launch_tri_list =
1558 !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
1559 key->vs_prolog.gs_fast_launch_tri_strip =
1560 !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
1561 } else {
1562 key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
1563 }
1564
1565 if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
1566 key->vs_prolog.as_ls = 1;
1567 key->vs_prolog.num_merged_next_stage_vgprs = 2;
1568 } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {
1569 key->vs_prolog.as_es = 1;
1570 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1571 } else if (shader_out->key.as_ngg) {
1572 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1573 }
1574
1575 /* Only one of these combinations can be set. as_ngg can be set with as_es. */
1576 assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
1577 (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
1578 1);
1579
1580 /* Enable loading the InstanceID VGPR. */
1581 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
1582
1583 if ((key->vs_prolog.states.instance_divisor_is_one |
1584 key->vs_prolog.states.instance_divisor_is_fetched) &
1585 input_mask)
1586 shader_out->info.uses_instanceid = true;
1587 }
1588
1589 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
1590 struct si_shader_selector *sel)
1591 {
1592 if (!compiler->low_opt_passes)
1593 return false;
1594
1595 /* Assume a slow CPU. */
1596 assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);
1597
1598 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
1599 * buffer stores. */
1600 return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
1601 }
1602
1603 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
1604 {
1605 *free_nir = false;
1606
1607 if (sel->nir) {
1608 return sel->nir;
1609 } else if (sel->nir_binary) {
1610 struct pipe_screen *screen = &sel->screen->b;
1611 const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1612 pipe_shader_type_from_mesa(sel->info.stage));
1613
1614 struct blob_reader blob_reader;
1615 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1616 *free_nir = true;
1617 return nir_deserialize(NULL, options, &blob_reader);
1618 }
1619 return NULL;
1620 }
1621
1622 static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1623 struct si_shader *shader, struct pipe_debug_callback *debug,
1624 struct nir_shader *nir, bool free_nir)
1625 {
1626 struct si_shader_selector *sel = shader->selector;
1627 struct si_shader_context ctx;
1628
1629 si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
1630
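   /* When NGG culling is enabled, the main part is built twice: first as the
    * NGG cull shader and then as the regular shader. The monolithic VS/TES
    * paths below combine the two with si_build_wrapper_function(). */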
1631 LLVMValueRef ngg_cull_main_fn = NULL;
1632 if (shader->key.opt.ngg_culling) {
1633 if (!si_build_main_function(&ctx, shader, nir, false, true)) {
1634 si_llvm_dispose(&ctx);
1635 return false;
1636 }
1637 ngg_cull_main_fn = ctx.main_fn;
1638 ctx.main_fn = NULL;
1639 }
1640
1641 if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
1642 si_llvm_dispose(&ctx);
1643 return false;
1644 }
1645
1646 if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
1647 LLVMValueRef parts[4];
1648 unsigned num_parts = 0;
1649 bool has_prolog = false;
1650 LLVMValueRef main_fn = ctx.main_fn;
1651
1652 if (ngg_cull_main_fn) {
1653 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
1654 union si_shader_part_key prolog_key;
1655 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
1656 &shader->key.part.vs.prolog, shader, &prolog_key);
1657 prolog_key.vs_prolog.is_monolithic = true;
1658 si_llvm_build_vs_prolog(&ctx, &prolog_key);
1659 parts[num_parts++] = ctx.main_fn;
1660 has_prolog = true;
1661 }
1662 parts[num_parts++] = ngg_cull_main_fn;
1663 }
1664
1665 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
1666 union si_shader_part_key prolog_key;
1667 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
1668 &shader->key.part.vs.prolog, shader, &prolog_key);
1669 prolog_key.vs_prolog.is_monolithic = true;
1670 si_llvm_build_vs_prolog(&ctx, &prolog_key);
1671 parts[num_parts++] = ctx.main_fn;
1672 has_prolog = true;
1673 }
1674 parts[num_parts++] = main_fn;
1675
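      /* Chain the parts into a single wrapper; the last two arguments are the
       * index of the main part and the index of the first part belonging to the
       * next merged shader stage (cf. the GFX9 TCS/GS paths below). */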
1676 si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);
1677
1678 if (ctx.shader->key.opt.vs_as_prim_discard_cs)
1679 si_build_prim_discard_compute_shader(&ctx);
1680 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
1681 LLVMValueRef parts[2];
1682
1683 parts[0] = ngg_cull_main_fn;
1684 parts[1] = ctx.main_fn;
1685
1686 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
1687 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
1688 if (sscreen->info.chip_class >= GFX9) {
1689 struct si_shader_selector *ls = shader->key.part.tcs.ls;
1690 LLVMValueRef parts[4];
1691 bool vs_needs_prolog =
1692 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);
1693
1694 /* TCS main part */
1695 parts[2] = ctx.main_fn;
1696
1697 /* TCS epilog */
1698 union si_shader_part_key tcs_epilog_key;
1699 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
1700 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
1701 si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
1702 parts[3] = ctx.main_fn;
1703
1704 /* VS as LS main part */
1705 nir = get_nir_shader(ls, &free_nir);
1706 struct si_shader shader_ls = {};
1707 shader_ls.selector = ls;
1708 shader_ls.key.as_ls = 1;
1709 shader_ls.key.mono = shader->key.mono;
1710 shader_ls.key.opt = shader->key.opt;
1711 shader_ls.is_monolithic = true;
1712
1713 if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
1714 si_llvm_dispose(&ctx);
1715 return false;
1716 }
1717 shader->info.uses_instanceid |= ls->info.uses_instanceid;
1718 parts[1] = ctx.main_fn;
1719
1720 /* LS prolog */
1721 if (vs_needs_prolog) {
1722 union si_shader_part_key vs_prolog_key;
1723 si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
1724 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
1725 vs_prolog_key.vs_prolog.is_monolithic = true;
1726 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
1727 parts[0] = ctx.main_fn;
1728 }
1729
1730 /* Reset the shader context. */
1731 ctx.shader = shader;
1732 ctx.stage = MESA_SHADER_TESS_CTRL;
1733
1734 si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
1735 vs_needs_prolog, vs_needs_prolog ? 2 : 1);
1736 } else {
1737 LLVMValueRef parts[2];
1738 union si_shader_part_key epilog_key;
1739
1740 parts[0] = ctx.main_fn;
1741
1742 memset(&epilog_key, 0, sizeof(epilog_key));
1743 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
1744 si_llvm_build_tcs_epilog(&ctx, &epilog_key);
1745 parts[1] = ctx.main_fn;
1746
1747 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
1748 }
1749 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
1750 if (ctx.screen->info.chip_class >= GFX9) {
1751 struct si_shader_selector *es = shader->key.part.gs.es;
1752 LLVMValueRef es_prolog = NULL;
1753 LLVMValueRef es_main = NULL;
1754 LLVMValueRef gs_prolog = NULL;
1755 LLVMValueRef gs_main = ctx.main_fn;
1756
1757 /* GS prolog */
1758 union si_shader_part_key gs_prolog_key;
1759 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
1760 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
1761 gs_prolog_key.gs_prolog.is_monolithic = true;
1762 gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
1763 si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
1764 gs_prolog = ctx.main_fn;
1765
1766 /* ES main part */
1767 nir = get_nir_shader(es, &free_nir);
1768 struct si_shader shader_es = {};
1769 shader_es.selector = es;
1770 shader_es.key.as_es = 1;
1771 shader_es.key.as_ngg = shader->key.as_ngg;
1772 shader_es.key.mono = shader->key.mono;
1773 shader_es.key.opt = shader->key.opt;
1774 shader_es.is_monolithic = true;
1775
1776 if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
1777 si_llvm_dispose(&ctx);
1778 return false;
1779 }
1780 shader->info.uses_instanceid |= es->info.uses_instanceid;
1781 es_main = ctx.main_fn;
1782
1783 /* ES prolog */
1784 if (es->info.stage == MESA_SHADER_VERTEX &&
1785 si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
1786 union si_shader_part_key vs_prolog_key;
1787 si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
1788 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
1789 vs_prolog_key.vs_prolog.is_monolithic = true;
1790 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
1791 es_prolog = ctx.main_fn;
1792 }
1793
1794 /* Reset the shader context. */
1795 ctx.shader = shader;
1796 ctx.stage = MESA_SHADER_GEOMETRY;
1797
1798 /* Prepare the array of shader parts. */
1799 LLVMValueRef parts[4];
1800 unsigned num_parts = 0, main_part, next_first_part;
1801
1802 if (es_prolog)
1803 parts[num_parts++] = es_prolog;
1804
1805 parts[main_part = num_parts++] = es_main;
1806 parts[next_first_part = num_parts++] = gs_prolog;
1807 parts[num_parts++] = gs_main;
1808
1809 si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
1810 } else {
1811 LLVMValueRef parts[2];
1812 union si_shader_part_key prolog_key;
1813
1814 parts[1] = ctx.main_fn;
1815
1816 memset(&prolog_key, 0, sizeof(prolog_key));
1817 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
1818 si_llvm_build_gs_prolog(&ctx, &prolog_key);
1819 parts[0] = ctx.main_fn;
1820
1821 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
1822 }
1823 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
1824 si_llvm_build_monolithic_ps(&ctx, shader);
1825 }
1826
1827 si_llvm_optimize_module(&ctx);
1828
1829 /* Post-optimization transformations and analysis. */
1830 si_optimize_vs_outputs(&ctx);
1831
1832 if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.stage)) {
1833 ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
1834 }
1835
1836 /* Make sure the input is a pointer and not integer followed by inttoptr. */
1837 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
1838
1839 /* Compile to bytecode. */
1840 if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
1841 ctx.stage, si_get_shader_name(shader),
1842 si_should_optimize_less(compiler, shader->selector))) {
1843 si_llvm_dispose(&ctx);
1844 fprintf(stderr, "LLVM failed to compile shader\n");
1845 return false;
1846 }
1847
1848 si_llvm_dispose(&ctx);
1849 return true;
1850 }
1851
1852 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1853 struct si_shader *shader, struct pipe_debug_callback *debug)
1854 {
1855 struct si_shader_selector *sel = shader->selector;
1856 bool free_nir;
1857 struct nir_shader *nir = get_nir_shader(sel, &free_nir);
1858
1859 /* Dump NIR before doing NIR->LLVM conversion in case the
1860 * conversion fails. */
1861 if (si_can_dump_shader(sscreen, sel->info.stage) &&
1862 !(sscreen->debug_flags & DBG(NO_NIR))) {
1863 nir_print_shader(nir, stderr);
1864 si_dump_streamout(&sel->so);
1865 }
1866
1867 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
1868 sizeof(shader->info.vs_output_param_offset));
1869
1870 shader->info.uses_instanceid = sel->info.uses_instanceid;
1871
1872 /* TODO: ACO could compile non-monolithic shaders here (starting
1873 * with PS and NGG VS), but monolithic shaders should be compiled
1874 * by LLVM due to more complicated compilation.
1875 */
1876 if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
1877 return false;
1878
1879 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
1880 * LLVM 3.9svn has this bug.
1881 */
1882 if (sel->info.stage == MESA_SHADER_COMPUTE) {
1883 unsigned wave_size = sscreen->compute_wave_size;
1884 unsigned max_vgprs =
1885 sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
1886 unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
1887 unsigned max_sgprs_per_wave = 128;
1888 unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
1889 unsigned threads_per_tg = si_get_max_workgroup_size(shader);
1890 unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
1891 unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
1892
1893 max_vgprs = max_vgprs / waves_per_simd;
1894 max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
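      /* Worked example with assumed hardware limits (256 wave64 VGPRs and 800
       * SGPRs per SIMD): a 1024-thread workgroup at wave64 gives
       * waves_per_tg = 16 and waves_per_simd = 4, so max_vgprs = 256 / 4 = 64
       * and max_sgprs = MIN2(800 / 4, 128) = 128. */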
1895
1896 if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
1897 fprintf(stderr,
1898 "LLVM failed to compile a shader correctly: "
1899 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
1900 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
1901
1902 /* Just terminate the process, because dependent
1903 * shaders can hang due to bad input data, but use
1904 * the env var to allow shader-db to work.
1905 */
1906 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
1907 abort();
1908 }
1909 }
1910
1911 /* Add the scratch offset to input SGPRs. */
1912 if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
1913 shader->info.num_input_sgprs += 1; /* scratch byte offset */
1914
1915 /* Calculate the number of fragment input VGPRs. */
1916 if (sel->info.stage == MESA_SHADER_FRAGMENT) {
1917 shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
1918 &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
1919 }
1920
1921 si_calculate_max_simd_waves(shader);
1922 si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
1923 return true;
1924 }
1925
1926 /**
1927 * Create, compile and return a shader part (prolog or epilog).
1928 *
1929 * \param sscreen screen
1930 * \param list list of shader parts of the same category
1931  * \param stage    shader stage
1932 * \param key shader part key
1933 * \param prolog whether the part being requested is a prolog
1934  * \param compiler LLVM compiler
1935 * \param debug debug callback
1936 * \param build the callback responsible for building the main function
1937 * \return non-NULL on success
1938 */
1939 static struct si_shader_part *
1940 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
1941 gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
1942 struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
1943 void (*build)(struct si_shader_context *, union si_shader_part_key *),
1944 const char *name)
1945 {
1946 struct si_shader_part *result;
1947
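   /* Shader parts are cached per screen in a singly linked list protected by
    * shader_parts_mutex; lookup is a linear memcmp on the part key. */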
1948 simple_mtx_lock(&sscreen->shader_parts_mutex);
1949
1950 /* Find existing. */
1951 for (result = *list; result; result = result->next) {
1952 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
1953 simple_mtx_unlock(&sscreen->shader_parts_mutex);
1954 return result;
1955 }
1956 }
1957
1958 /* Compile a new one. */
1959 result = CALLOC_STRUCT(si_shader_part);
1960 result->key = *key;
1961
1962 struct si_shader_selector sel = {};
1963 sel.screen = sscreen;
1964
1965 struct si_shader shader = {};
1966 shader.selector = &sel;
1967
1968 switch (stage) {
1969 case MESA_SHADER_VERTEX:
1970 shader.key.as_ls = key->vs_prolog.as_ls;
1971 shader.key.as_es = key->vs_prolog.as_es;
1972 shader.key.as_ngg = key->vs_prolog.as_ngg;
1973 shader.key.opt.ngg_culling =
1974 (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
1975 (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
1976 shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
1977 break;
1978 case MESA_SHADER_TESS_CTRL:
1979 assert(!prolog);
1980 shader.key.part.tcs.epilog = key->tcs_epilog.states;
1981 break;
1982 case MESA_SHADER_GEOMETRY:
1983 assert(prolog);
1984 shader.key.as_ngg = key->gs_prolog.as_ngg;
1985 break;
1986 case MESA_SHADER_FRAGMENT:
1987 if (prolog)
1988 shader.key.part.ps.prolog = key->ps_prolog.states;
1989 else
1990 shader.key.part.ps.epilog = key->ps_epilog.states;
1991 break;
1992 default:
1993 unreachable("bad shader part");
1994 }
1995
1996 struct si_shader_context ctx;
1997 si_llvm_context_init(&ctx, sscreen, compiler,
1998 si_get_wave_size(sscreen, stage,
1999 shader.key.as_ngg, shader.key.as_es,
2000 shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
2001 shader.key.opt.vs_as_prim_discard_cs));
2002 ctx.shader = &shader;
2003 ctx.stage = stage;
2004
2005 build(&ctx, key);
2006
2007 /* Compile. */
2008 si_llvm_optimize_module(&ctx);
2009
2010 if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
2011 ctx.stage, name, false)) {
2012 FREE(result);
2013 result = NULL;
2014 goto out;
2015 }
2016
2017 result->next = *list;
2018 *list = result;
2019
2020 out:
2021 si_llvm_dispose(&ctx);
2022 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2023 return result;
2024 }
2025
2026 static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2027 struct si_shader *shader, struct pipe_debug_callback *debug,
2028 struct si_shader *main_part, const struct si_vs_prolog_bits *key)
2029 {
2030 struct si_shader_selector *vs = main_part->selector;
2031
2032 if (!si_vs_needs_prolog(vs, key, &shader->key, false))
2033 return true;
2034
2035 /* Get the prolog. */
2036 union si_shader_part_key prolog_key;
2037 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
2038 &prolog_key);
2039
2040 shader->prolog =
2041 si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
2042 compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
2043 return shader->prolog != NULL;
2044 }
2045
2046 /**
2047 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
2048 */
2049 static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2050 struct si_shader *shader, struct pipe_debug_callback *debug)
2051 {
2052 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
2053 }
2054
2055 /**
2056 * Select and compile (or reuse) TCS parts (epilog).
2057 */
2058 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2059 struct si_shader *shader, struct pipe_debug_callback *debug)
2060 {
2061 if (sscreen->info.chip_class >= GFX9) {
2062 struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;
2063
2064 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
2065 &shader->key.part.tcs.ls_prolog))
2066 return false;
2067
2068 shader->previous_stage = ls_main_part;
2069 }
2070
2071 /* Get the epilog. */
2072 union si_shader_part_key epilog_key;
2073 memset(&epilog_key, 0, sizeof(epilog_key));
2074 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
2075
2076 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
2077 &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
2078 "Tessellation Control Shader Epilog");
2079 return shader->epilog != NULL;
2080 }
2081
2082 /**
2083 * Select and compile (or reuse) GS parts (prolog).
2084 */
2085 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2086 struct si_shader *shader, struct pipe_debug_callback *debug)
2087 {
2088 if (sscreen->info.chip_class >= GFX9) {
2089 struct si_shader *es_main_part;
2090
2091 if (shader->key.as_ngg)
2092 es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
2093 else
2094 es_main_part = shader->key.part.gs.es->main_shader_part_es;
2095
2096 if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX &&
2097 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
2098 &shader->key.part.gs.vs_prolog))
2099 return false;
2100
2101 shader->previous_stage = es_main_part;
2102 }
2103
2104 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
2105 return true;
2106
2107 union si_shader_part_key prolog_key;
2108 memset(&prolog_key, 0, sizeof(prolog_key));
2109 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
2110 prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
2111
2112 shader->prolog2 =
2113 si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,
2114 compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
2115 return shader->prolog2 != NULL;
2116 }
2117
2118 /**
2119 * Compute the PS prolog key, which contains all the information needed to
2120 * build the PS prolog function, and set related bits in shader->config.
2121 */
2122 void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
2123 bool separate_prolog)
2124 {
2125 struct si_shader_info *info = &shader->selector->info;
2126
2127 memset(key, 0, sizeof(*key));
2128 key->ps_prolog.states = shader->key.part.ps.prolog;
2129 key->ps_prolog.colors_read = info->colors_read;
2130 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
2131 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
2132 key->ps_prolog.wqm =
2133 info->base.fs.needs_helper_invocations &&
2134 (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2135 key->ps_prolog.states.force_linear_sample_interp ||
2136 key->ps_prolog.states.force_persp_center_interp ||
2137 key->ps_prolog.states.force_linear_center_interp ||
2138 key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
2139 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
2140
2141 if (info->colors_read) {
2142 ubyte *color = shader->selector->color_attr_index;
2143
2144 if (shader->key.part.ps.prolog.color_two_side) {
2145 /* BCOLORs are stored after the last input. */
2146 key->ps_prolog.num_interp_inputs = info->num_inputs;
2147 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
2148 if (separate_prolog)
2149 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
2150 }
2151
2152 for (unsigned i = 0; i < 2; i++) {
2153 unsigned interp = info->color_interpolate[i];
2154 unsigned location = info->color_interpolate_loc[i];
2155
2156 if (!(info->colors_read & (0xf << i * 4)))
2157 continue;
2158
2159 key->ps_prolog.color_attr_index[i] = color[i];
2160
2161 if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
2162 interp = INTERP_MODE_FLAT;
2163
2164 switch (interp) {
2165 case INTERP_MODE_FLAT:
2166 key->ps_prolog.color_interp_vgpr_index[i] = -1;
2167 break;
2168 case INTERP_MODE_SMOOTH:
2169 case INTERP_MODE_COLOR:
2170 /* Force the interpolation location for colors here. */
2171 if (shader->key.part.ps.prolog.force_persp_sample_interp)
2172 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2173 if (shader->key.part.ps.prolog.force_persp_center_interp)
2174 location = TGSI_INTERPOLATE_LOC_CENTER;
2175
2176 switch (location) {
2177 case TGSI_INTERPOLATE_LOC_SAMPLE:
2178 key->ps_prolog.color_interp_vgpr_index[i] = 0;
2179 if (separate_prolog) {
2180 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2181 }
2182 break;
2183 case TGSI_INTERPOLATE_LOC_CENTER:
2184 key->ps_prolog.color_interp_vgpr_index[i] = 2;
2185 if (separate_prolog) {
2186 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2187 }
2188 break;
2189 case TGSI_INTERPOLATE_LOC_CENTROID:
2190 key->ps_prolog.color_interp_vgpr_index[i] = 4;
2191 if (separate_prolog) {
2192 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
2193 }
2194 break;
2195 default:
2196 assert(0);
2197 }
2198 break;
2199 case INTERP_MODE_NOPERSPECTIVE:
2200 /* Force the interpolation location for colors here. */
2201 if (shader->key.part.ps.prolog.force_linear_sample_interp)
2202 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2203 if (shader->key.part.ps.prolog.force_linear_center_interp)
2204 location = TGSI_INTERPOLATE_LOC_CENTER;
2205
2206 /* The VGPR assignment for non-monolithic shaders
2207 * works because InitialPSInputAddr is set on the
2208 * main shader and PERSP_PULL_MODEL is never used.
2209 */
2210 switch (location) {
2211 case TGSI_INTERPOLATE_LOC_SAMPLE:
2212 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
2213 if (separate_prolog) {
2214 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2215 }
2216 break;
2217 case TGSI_INTERPOLATE_LOC_CENTER:
2218 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
2219 if (separate_prolog) {
2220 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2221 }
2222 break;
2223 case TGSI_INTERPOLATE_LOC_CENTROID:
2224 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
2225 if (separate_prolog) {
2226 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
2227 }
2228 break;
2229 default:
2230 assert(0);
2231 }
2232 break;
2233 default:
2234 assert(0);
2235 }
2236 }
2237 }
2238 }
2239
2240 /**
2241 * Check whether a PS prolog is required based on the key.
2242 */
2243 bool si_need_ps_prolog(const union si_shader_part_key *key)
2244 {
2245 return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2246 key->ps_prolog.states.force_linear_sample_interp ||
2247 key->ps_prolog.states.force_persp_center_interp ||
2248 key->ps_prolog.states.force_linear_center_interp ||
2249 key->ps_prolog.states.bc_optimize_for_persp ||
2250 key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
2251 key->ps_prolog.states.samplemask_log_ps_iter;
2252 }
2253
2254 /**
2255 * Compute the PS epilog key, which contains all the information needed to
2256 * build the PS epilog function.
2257 */
2258 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
2259 {
2260 struct si_shader_info *info = &shader->selector->info;
2261 memset(key, 0, sizeof(*key));
2262 key->ps_epilog.colors_written = info->colors_written;
2263 key->ps_epilog.writes_z = info->writes_z;
2264 key->ps_epilog.writes_stencil = info->writes_stencil;
2265 key->ps_epilog.writes_samplemask = info->writes_samplemask;
2266 key->ps_epilog.states = shader->key.part.ps.epilog;
2267 }
2268
2269 /**
2270 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
2271 */
2272 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2273 struct si_shader *shader, struct pipe_debug_callback *debug)
2274 {
2275 union si_shader_part_key prolog_key;
2276 union si_shader_part_key epilog_key;
2277
2278 /* Get the prolog. */
2279 si_get_ps_prolog_key(shader, &prolog_key, true);
2280
2281 /* The prolog is a no-op if these aren't set. */
2282 if (si_need_ps_prolog(&prolog_key)) {
2283 shader->prolog =
2284 si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
2285 compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
2286 if (!shader->prolog)
2287 return false;
2288 }
2289
2290 /* Get the epilog. */
2291 si_get_ps_epilog_key(shader, &epilog_key);
2292
2293 shader->epilog =
2294 si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
2295 compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
2296 if (!shader->epilog)
2297 return false;
2298
2299 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2300 if (shader->key.part.ps.prolog.poly_stipple) {
2301 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2302 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
2303 }
2304
2305 /* Set up the enable bits for per-sample shading if needed. */
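   /* Forcing per-sample interpolation makes the PS prolog copy the sample
    * barycentrics over the center/centroid VGPRs, so only the *_SAMPLE weights
    * need to be requested from the SPI. */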
2306 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
2307 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2308 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2309 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2310 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2311 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2312 }
2313 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
2314 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2315 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2316 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2317 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2318 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2319 }
2320 if (shader->key.part.ps.prolog.force_persp_center_interp &&
2321 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2322 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2323 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2324 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2325 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2326 }
2327 if (shader->key.part.ps.prolog.force_linear_center_interp &&
2328 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2329 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2330 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2331 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2332 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2333 }
2334
2335    /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
2336 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2337 !(shader->config.spi_ps_input_ena & 0xf)) {
2338 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2339 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
2340 }
2341
2342 /* At least one pair of interpolation weights must be enabled. */
2343 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
2344 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2345 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
2346 }
2347
2348 /* Samplemask fixup requires the sample ID. */
2349 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
2350 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2351 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
2352 }
2353
2354 /* The sample mask input is always enabled, because the API shader always
2355 * passes it through to the epilog. Disable it here if it's unused.
2356 */
2357 if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
2358 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
2359
2360 return true;
2361 }
2362
2363 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
2364 {
2365    /* Tessellation is always off-chip and on-chip GS isn't used on the
2366     * affected chips, so this workaround is not needed anymore.
2367     */
2368 return;
2369
2370 /* SPI barrier management bug:
2371 * Make sure we have at least 4k of LDS in use to avoid the bug.
2372 * It applies to workgroup sizes of more than one wavefront.
2373 */
2374 if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
2375 *lds_size = MAX2(*lds_size, 8);
2376 }
2377
2378 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
2379 {
2380 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
2381
2382 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
2383
2384 if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&
2385 si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
2386 si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
2387 }
2388 }
2389
2390 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2391 struct si_shader *shader, struct pipe_debug_callback *debug)
2392 {
2393 struct si_shader_selector *sel = shader->selector;
2394 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
2395
2396 /* LS, ES, VS are compiled on demand if the main part hasn't been
2397 * compiled for that stage.
2398 *
2399 * GS are compiled on demand if the main part hasn't been compiled
2400 * for the chosen NGG-ness.
2401 *
2402 * Vertex shaders are compiled on demand when a vertex fetch
2403 * workaround must be applied.
2404 */
2405 if (shader->is_monolithic) {
2406 /* Monolithic shader (compiled as a whole, has many variants,
2407 * may take a long time to compile).
2408 */
2409 if (!si_compile_shader(sscreen, compiler, shader, debug))
2410 return false;
2411 } else {
2412 /* The shader consists of several parts:
2413 *
2414 * - the middle part is the user shader, it has 1 variant only
2415 * and it was compiled during the creation of the shader
2416 * selector
2417 * - the prolog part is inserted at the beginning
2418 * - the epilog part is inserted at the end
2419 *
2420 * The prolog and epilog have many (but simple) variants.
2421 *
2422 * Starting with gfx9, geometry and tessellation control
2423 * shaders also contain the prolog and user shader parts of
2424 * the previous shader stage.
2425 */
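      /* For example, a non-monolithic GFX9 TCS variant ends up as
       * [LS prolog] + LS main part + TCS main part + TCS epilog. */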
2426
2427 if (!mainp)
2428 return false;
2429
2430 /* Copy the compiled shader data over. */
2431 shader->is_binary_shared = true;
2432 shader->binary = mainp->binary;
2433 shader->config = mainp->config;
2434 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
2435 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
2436 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
2437 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
2438 memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,
2439 sizeof(mainp->info.vs_output_param_offset));
2440 shader->info.uses_instanceid = mainp->info.uses_instanceid;
2441 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
2442 shader->info.nr_param_exports = mainp->info.nr_param_exports;
2443
2444 /* Select prologs and/or epilogs. */
2445 switch (sel->info.stage) {
2446 case MESA_SHADER_VERTEX:
2447 if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
2448 return false;
2449 break;
2450 case MESA_SHADER_TESS_CTRL:
2451 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
2452 return false;
2453 break;
2454 case MESA_SHADER_TESS_EVAL:
2455 break;
2456 case MESA_SHADER_GEOMETRY:
2457 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
2458 return false;
2459 break;
2460 case MESA_SHADER_FRAGMENT:
2461 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
2462 return false;
2463
2464 /* Make sure we have at least as many VGPRs as there
2465 * are allocated inputs.
2466 */
2467 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
2468 break;
2469 default:;
2470 }
2471
2472 /* Update SGPR and VGPR counts. */
2473 if (shader->prolog) {
2474 shader->config.num_sgprs =
2475 MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
2476 shader->config.num_vgprs =
2477 MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
2478 }
2479 if (shader->previous_stage) {
2480 shader->config.num_sgprs =
2481 MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
2482 shader->config.num_vgprs =
2483 MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
2484 shader->config.spilled_sgprs =
2485 MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
2486 shader->config.spilled_vgprs =
2487 MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
2488 shader->info.private_mem_vgprs =
2489 MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
2490 shader->config.scratch_bytes_per_wave =
2491 MAX2(shader->config.scratch_bytes_per_wave,
2492 shader->previous_stage->config.scratch_bytes_per_wave);
2493 shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
2494 }
2495 if (shader->prolog2) {
2496 shader->config.num_sgprs =
2497 MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);
2498 shader->config.num_vgprs =
2499 MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);
2500 }
2501 if (shader->epilog) {
2502 shader->config.num_sgprs =
2503 MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
2504 shader->config.num_vgprs =
2505 MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
2506 }
2507 si_calculate_max_simd_waves(shader);
2508 }
2509
2510 if (shader->key.as_ngg) {
2511 assert(!shader->key.as_es && !shader->key.as_ls);
2512 if (!gfx10_ngg_calculate_subgroup_info(shader)) {
2513 fprintf(stderr, "Failed to compute subgroup info\n");
2514 return false;
2515 }
2516 } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) {
2517 gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
2518 }
2519
2520 si_fix_resource_usage(sscreen, shader);
2521 si_shader_dump(sscreen, shader, debug, stderr, true);
2522
2523 /* Upload. */
2524 if (!si_shader_binary_upload(sscreen, shader, 0)) {
2525 fprintf(stderr, "LLVM failed to upload shader\n");
2526 return false;
2527 }
2528
2529 return true;
2530 }
2531
2532 void si_shader_binary_clean(struct si_shader_binary *binary)
2533 {
2534 free((void *)binary->elf_buffer);
2535 binary->elf_buffer = NULL;
2536
2537 free(binary->llvm_ir_string);
2538 binary->llvm_ir_string = NULL;
2539 }
2540
2541 void si_shader_destroy(struct si_shader *shader)
2542 {
2543 if (shader->scratch_bo)
2544 si_resource_reference(&shader->scratch_bo, NULL);
2545
2546 si_resource_reference(&shader->bo, NULL);
2547
2548 if (!shader->is_binary_shared)
2549 si_shader_binary_clean(&shader->binary);
2550
2551 free(shader->shader_log);
2552 }