radeonsi: remove redundant si_shader_info::images_declared
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "ac_exp_param.h"
26 #include "ac_rtld.h"
27 #include "compiler/nir/nir.h"
28 #include "compiler/nir/nir_serialize.h"
29 #include "si_pipe.h"
30 #include "si_shader_internal.h"
31 #include "sid.h"
32 #include "tgsi/tgsi_from_mesa.h"
33 #include "tgsi/tgsi_strings.h"
34 #include "util/u_memory.h"
35
36 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
37
38 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
39
40 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
41
42 /** Whether the shader runs as a combination of multiple API shaders */
43 bool si_is_multi_part_shader(struct si_shader *shader)
44 {
45 if (shader->selector->screen->info.chip_class <= GFX8)
46 return false;
47
48 return shader->key.as_ls || shader->key.as_es ||
49 shader->selector->info.stage == MESA_SHADER_TESS_CTRL ||
50 shader->selector->info.stage == MESA_SHADER_GEOMETRY;
51 }
52
53 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
54 bool si_is_merged_shader(struct si_shader *shader)
55 {
56 return shader->key.as_ngg || si_is_multi_part_shader(shader);
57 }
58
59 /**
60 * Returns a unique index for a per-patch semantic name and index. The index
61 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
62 * can be calculated.
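 * (Illustrative mapping derived from the code below: VARYING_SLOT_PATCH3 gets
 * index 2 + 3 = 5, while the outer and inner tess level slots take 0 and 1.)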
63 */
64 unsigned si_shader_io_get_unique_index_patch(unsigned semantic)
65 {
66 switch (semantic) {
67 case VARYING_SLOT_TESS_LEVEL_OUTER:
68 return 0;
69 case VARYING_SLOT_TESS_LEVEL_INNER:
70 return 1;
71 default:
72 if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
73 return 2 + (semantic - VARYING_SLOT_PATCH0);
74
75 assert(!"invalid semantic");
76 return 0;
77 }
78 }
79
80 /**
81 * Returns a unique index for a semantic name and index. The index must be
82 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
83 * calculated.
84 */
85 unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
86 {
87 switch (semantic) {
88 case VARYING_SLOT_POS:
89 return 0;
90 default:
91 /* Some shader stages use the highest used IO index to determine how much
92 * space to allocate for inputs/outputs (in LDS and the tess/GS rings), so
93 * GENERIC should be placed right after POSITION to keep that size as small
94 * as possible.
95 */
96 if (semantic >= VARYING_SLOT_VAR0 &&
97 semantic < VARYING_SLOT_VAR0 + SI_MAX_IO_GENERIC)
98 return 1 + (semantic - VARYING_SLOT_VAR0);
99
100 assert(!"invalid generic index");
101 return 0;
102 case VARYING_SLOT_FOGC:
103 return SI_MAX_IO_GENERIC + 1;
104 case VARYING_SLOT_COL0:
105 return SI_MAX_IO_GENERIC + 2;
106 case VARYING_SLOT_COL1:
107 return SI_MAX_IO_GENERIC + 3;
108 case VARYING_SLOT_BFC0:
109 /* If it's a varying, COLOR and BCOLOR alias. */
110 if (is_varying)
111 return SI_MAX_IO_GENERIC + 2;
112 else
113 return SI_MAX_IO_GENERIC + 4;
114 case VARYING_SLOT_BFC1:
115 if (is_varying)
116 return SI_MAX_IO_GENERIC + 3;
117 else
118 return SI_MAX_IO_GENERIC + 5;
119 case VARYING_SLOT_TEX0:
120 case VARYING_SLOT_TEX1:
121 case VARYING_SLOT_TEX2:
122 case VARYING_SLOT_TEX3:
123 case VARYING_SLOT_TEX4:
124 case VARYING_SLOT_TEX5:
125 case VARYING_SLOT_TEX6:
126 case VARYING_SLOT_TEX7:
127 return SI_MAX_IO_GENERIC + 6 + (semantic - VARYING_SLOT_TEX0);
128
129 /* These are rarely used between LS and HS or ES and GS. */
130 case VARYING_SLOT_CLIP_DIST0:
131 return SI_MAX_IO_GENERIC + 6 + 8;
132 case VARYING_SLOT_CLIP_DIST1:
133 return SI_MAX_IO_GENERIC + 6 + 8 + 1;
134 case VARYING_SLOT_CLIP_VERTEX:
135 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
136 case VARYING_SLOT_PSIZ:
137 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
138
139 /* These can't be written by LS, HS, and ES. */
140 case VARYING_SLOT_LAYER:
141 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
142 case VARYING_SLOT_VIEWPORT:
143 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
144 case VARYING_SLOT_PRIMITIVE_ID:
145 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
146 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
147 }
148 }
149
150 static void si_dump_streamout(struct pipe_stream_output_info *so)
151 {
152 unsigned i;
153
154 if (so->num_outputs)
155 fprintf(stderr, "STREAMOUT\n");
156
157 for (i = 0; i < so->num_outputs; i++) {
158 unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
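/* Illustrative example: num_components=2 and start_component=1 give mask 0b0110,
 * so the line below prints the components as ".yz". */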
159 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,
160 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
161 so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
162 mask & 4 ? "z" : "", mask & 8 ? "w" : "");
163 }
164 }
165
166 static void declare_streamout_params(struct si_shader_context *ctx,
167 struct pipe_stream_output_info *so)
168 {
169 if (ctx->screen->use_ngg_streamout) {
170 if (ctx->stage == MESA_SHADER_TESS_EVAL)
171 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
172 return;
173 }
174
175 /* Streamout SGPRs. */
176 if (so->num_outputs) {
177 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
178 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
179 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
180 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
181 }
182
183 /* A streamout buffer offset is loaded if the stride is non-zero. */
184 for (int i = 0; i < 4; i++) {
185 if (!so->stride[i])
186 continue;
187
188 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
189 }
190 }
191
192 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
193 {
194 switch (shader->selector->info.stage) {
195 case MESA_SHADER_VERTEX:
196 case MESA_SHADER_TESS_EVAL:
197 return shader->key.as_ngg ? 128 : 0;
198
199 case MESA_SHADER_TESS_CTRL:
200 /* Return this so that LLVM doesn't remove s_barrier
201 * instructions on chips where we use s_barrier. */
202 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
203
204 case MESA_SHADER_GEOMETRY:
205 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
206
207 case MESA_SHADER_COMPUTE:
208 break; /* see below */
209
210 default:
211 return 0;
212 }
213
214 uint16_t *local_size = shader->selector->info.base.cs.local_size;
215 unsigned max_work_group_size = (uint32_t)local_size[0] *
216 (uint32_t)local_size[1] *
217 (uint32_t)local_size[2];
218
219 if (!max_work_group_size) {
220 /* This is a variable group size compute shader;
221 * compile it for the maximum possible group size.
222 */
223 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
224 }
225 return max_work_group_size;
226 }
227
228 static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
229 {
230 enum ac_arg_type const_shader_buf_type;
231
232 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
233 ctx->shader->selector->info.base.num_ssbos == 0)
234 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
235 else
236 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
237
238 ac_add_arg(
239 &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
240 assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
241 }
242
243 static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
244 {
245 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
246 assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
247 }
248
249 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
250 {
251 declare_const_and_shader_buffers(ctx, assign_params);
252 declare_samplers_and_images(ctx, assign_params);
253 }
254
255 static void declare_global_desc_pointers(struct si_shader_context *ctx)
256 {
257 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->rw_buffers);
258 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
259 &ctx->bindless_samplers_and_images);
260 }
261
262 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
263 {
264 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
265 if (!ctx->shader->is_gs_copy_shader) {
266 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
267 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
268 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
269 }
270 }
271
272 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
273 {
274 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
275
276 unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
277 if (num_vbos_in_user_sgprs) {
278 unsigned user_sgprs = ctx->args.num_sgprs_used;
279
280 if (si_is_merged_shader(ctx->shader))
281 user_sgprs -= 8;
282 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
283
284 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
285 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
286 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
287
288 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
289 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
290 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
291 }
292 }
293
294 static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs,
295 bool ngg_cull_shader)
296 {
297 struct si_shader *shader = ctx->shader;
298
299 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
300 if (shader->key.as_ls) {
301 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
302 if (ctx->screen->info.chip_class >= GFX10) {
303 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
304 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
305 } else {
306 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
307 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
308 }
309 } else if (ctx->screen->info.chip_class >= GFX10) {
310 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
311 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
312 &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
313 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
314 } else {
315 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
316 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
317 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
318 }
319
320 if (!shader->is_gs_copy_shader) {
321 if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
322 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
323 }
324
325 /* Vertex load indices. */
326 if (shader->selector->info.num_inputs) {
327 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
328 for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
329 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
330 }
331 *num_prolog_vgprs += shader->selector->info.num_inputs;
332 }
333 }
334
335 static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
336 {
337 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
338 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
339 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
340
341 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
342 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
343 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
344 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
345 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
346 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
347 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
348 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
349 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
350 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
351 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
352 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
353 }
354 }
355
356 static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
357 {
358 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
359 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
360 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
361 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
362
363 if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
364 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
365 }
366 }
367
368 enum
369 {
370 /* Convenient merged shader definitions. */
371 SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
372 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
373 };
374
375 void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
376 enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
377 {
378 assert(args->arg_count == idx);
379 ac_add_arg(args, file, registers, type, arg);
380 }
381
382 void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
383 {
384 struct si_shader *shader = ctx->shader;
385 LLVMTypeRef returns[AC_MAX_ARGS];
386 unsigned i, num_return_sgprs;
387 unsigned num_returns = 0;
388 unsigned num_prolog_vgprs = 0;
389 unsigned stage = ctx->stage;
390
391 memset(&ctx->args, 0, sizeof(ctx->args));
392
393 /* Set MERGED shaders. */
394 if (ctx->screen->info.chip_class >= GFX9) {
395 if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL)
396 stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
397 else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY)
398 stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
399 }
400
401 switch (stage) {
402 case MESA_SHADER_VERTEX:
403 declare_global_desc_pointers(ctx);
404
405 if (shader->selector->info.base.vs.blit_sgprs_amd) {
406 declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
407
408 /* VGPRs */
409 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
410 break;
411 }
412
413 declare_per_stage_desc_pointers(ctx, true);
414 declare_vs_specific_input_sgprs(ctx);
415 if (!shader->is_gs_copy_shader)
416 declare_vb_descriptor_input_sgprs(ctx);
417
418 if (shader->key.as_es) {
419 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
420 } else if (shader->key.as_ls) {
421 /* no extra parameters */
422 } else {
423 /* The locations of the other parameters are assigned dynamically. */
424 declare_streamout_params(ctx, &shader->selector->so);
425 }
426
427 /* VGPRs */
428 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
429
430 /* Return values */
431 if (shader->key.opt.vs_as_prim_discard_cs) {
432 for (i = 0; i < 4; i++)
433 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
434 }
435 break;
436
437 case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
438 declare_global_desc_pointers(ctx);
439 declare_per_stage_desc_pointers(ctx, true);
440 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
441 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
442 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
443 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
444 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
445 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
446
447 /* VGPRs */
448 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
449 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
450
451 /* param_tcs_offchip_offset and param_tcs_factor_offset are
452 * placed after the user SGPRs.
453 */
454 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
455 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
456 for (i = 0; i < 11; i++)
457 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
458 break;
459
460 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
461 /* Merged stages have 8 system SGPRs at the beginning. */
462 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
463 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
464 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
465 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
466 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
467 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
468 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
469 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
470
471 declare_global_desc_pointers(ctx);
472 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
473 declare_vs_specific_input_sgprs(ctx);
474
475 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
476 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
477 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
478 declare_vb_descriptor_input_sgprs(ctx);
479
480 /* VGPRs (first TCS, then VS) */
481 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
482 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
483
484 if (ctx->stage == MESA_SHADER_VERTEX) {
485 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
486
487 /* LS return values are inputs to the TCS main shader part. */
488 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
489 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
490 for (i = 0; i < 2; i++)
491 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
492 } else {
493 /* TCS return values are inputs to the TCS epilog.
494 *
495 * param_tcs_offchip_offset, param_tcs_factor_offset,
496 * param_tcs_offchip_layout, and param_rw_buffers
497 * should be passed to the epilog.
498 */
499 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
500 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
501 for (i = 0; i < 11; i++)
502 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
503 }
504 break;
505
506 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
507 /* Merged stages have 8 system SGPRs at the beginning. */
508 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
509 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);
510
511 if (ctx->shader->key.as_ngg)
512 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
513 else
514 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
515
516 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
517 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
518 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
519 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
520 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
521 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
522 NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
523
524 declare_global_desc_pointers(ctx);
525 if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {
526 declare_per_stage_desc_pointers(
527 ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
528 }
529
530 if (ctx->stage == MESA_SHADER_VERTEX) {
531 if (shader->selector->info.base.vs.blit_sgprs_amd)
532 declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
533 else
534 declare_vs_specific_input_sgprs(ctx);
535 } else {
536 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
537 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
538 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
539 /* Declare as many input SGPRs as the VS has. */
540 }
541
542 if (ctx->stage == MESA_SHADER_VERTEX)
543 declare_vb_descriptor_input_sgprs(ctx);
544
545 /* VGPRs (first GS, then VS/TES) */
546 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
547 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
548 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
549 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
550 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
551
552 if (ctx->stage == MESA_SHADER_VERTEX) {
553 declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
554 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
555 declare_tes_input_vgprs(ctx, ngg_cull_shader);
556 }
557
558 if ((ctx->shader->key.as_es || ngg_cull_shader) &&
559 (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
560 unsigned num_user_sgprs, num_vgprs;
561
562 if (ctx->stage == MESA_SHADER_VERTEX) {
563 /* For the NGG cull shader, add 1 SGPR to hold
564 * the vertex buffer pointer.
565 */
566 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + ngg_cull_shader;
567
568 if (ngg_cull_shader && shader->selector->num_vbos_in_user_sgprs) {
569 assert(num_user_sgprs <= 8 + SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
570 num_user_sgprs =
571 SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
572 }
573 } else {
574 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
575 }
576
577 /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
578 *
579 * The normal merged ESGS shader only has to return the 5 VGPRs
580 * for the GS stage.
581 */
582 num_vgprs = ngg_cull_shader ? 10 : 5;
583
584 /* ES return values are inputs to GS. */
585 for (i = 0; i < 8 + num_user_sgprs; i++)
586 returns[num_returns++] = ctx->ac.i32; /* SGPRs */
587 for (i = 0; i < num_vgprs; i++)
588 returns[num_returns++] = ctx->ac.f32; /* VGPRs */
589 }
590 break;
591
592 case MESA_SHADER_TESS_EVAL:
593 declare_global_desc_pointers(ctx);
594 declare_per_stage_desc_pointers(ctx, true);
595 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
596 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
597 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
598
599 if (shader->key.as_es) {
600 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
601 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
602 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
603 } else {
604 declare_streamout_params(ctx, &shader->selector->so);
605 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
606 }
607
608 /* VGPRs */
609 declare_tes_input_vgprs(ctx, ngg_cull_shader);
610 break;
611
612 case MESA_SHADER_GEOMETRY:
613 declare_global_desc_pointers(ctx);
614 declare_per_stage_desc_pointers(ctx, true);
615 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
616 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);
617
618 /* VGPRs */
619 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
620 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
621 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
622 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
623 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
624 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
625 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
626 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
627 break;
628
629 case MESA_SHADER_FRAGMENT:
630 declare_global_desc_pointers(ctx);
631 declare_per_stage_desc_pointers(ctx, true);
632 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
633 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
634 SI_PARAM_PRIM_MASK);
635
636 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
637 SI_PARAM_PERSP_SAMPLE);
638 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
639 SI_PARAM_PERSP_CENTER);
640 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
641 SI_PARAM_PERSP_CENTROID);
642 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
643 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
644 SI_PARAM_LINEAR_SAMPLE);
645 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
646 SI_PARAM_LINEAR_CENTER);
647 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
648 SI_PARAM_LINEAR_CENTROID);
649 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
650 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
651 SI_PARAM_POS_X_FLOAT);
652 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
653 SI_PARAM_POS_Y_FLOAT);
654 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
655 SI_PARAM_POS_Z_FLOAT);
656 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
657 SI_PARAM_POS_W_FLOAT);
658 shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
659 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
660 SI_PARAM_FRONT_FACE);
661 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
662 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
663 SI_PARAM_ANCILLARY);
664 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
665 SI_PARAM_SAMPLE_COVERAGE);
666 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
667 SI_PARAM_POS_FIXED_PT);
668
669 /* Color inputs from the prolog. */
670 if (shader->selector->info.colors_read) {
671 unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
672
673 for (i = 0; i < num_color_elements; i++)
674 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
675
676 num_prolog_vgprs += num_color_elements;
677 }
678
679 /* Outputs for the epilog. */
680 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
681 num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
682 shader->selector->info.writes_z + shader->selector->info.writes_stencil +
683 shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
684
685 num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
686
687 for (i = 0; i < num_return_sgprs; i++)
688 returns[i] = ctx->ac.i32;
689 for (; i < num_returns; i++)
690 returns[i] = ctx->ac.f32;
691 break;
692
693 case MESA_SHADER_COMPUTE:
694 declare_global_desc_pointers(ctx);
695 declare_per_stage_desc_pointers(ctx, true);
696 if (shader->selector->info.uses_grid_size)
697 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
698 if (shader->selector->info.uses_block_size &&
699 shader->selector->info.base.cs.local_size[0] == 0)
700 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);
701
702 unsigned cs_user_data_dwords =
703 shader->selector->info.base.cs.user_data_components_amd;
704 if (cs_user_data_dwords) {
705 ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
706 }
707
708 /* Some descriptors can be in user SGPRs. */
709 /* Shader buffers in user SGPRs. */
710 for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
711 while (ctx->args.num_sgprs_used % 4 != 0)
712 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
713
714 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
715 }
716 /* Images in user SGPRs. */
717 for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
718 unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8;
719
720 while (ctx->args.num_sgprs_used % num_sgprs != 0)
721 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
722
723 ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
724 }
725
726 /* Hardware SGPRs. */
727 for (i = 0; i < 3; i++) {
728 if (shader->selector->info.uses_block_id[i]) {
729 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
730 }
731 }
732 if (shader->selector->info.uses_subgroup_info)
733 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
734
735 /* Hardware VGPRs. */
736 ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
737 break;
738 default:
739 assert(0 && "unimplemented shader");
740 return;
741 }
742
743 si_llvm_create_func(ctx, ngg_cull_shader ? "ngg_cull_main" : "main", returns, num_returns,
744 si_get_max_workgroup_size(shader));
745
746 /* Reserve register locations for VGPR inputs the PS prolog may need. */
747 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
748 ac_llvm_add_target_dep_function_attr(
749 ctx->main_fn, "InitialPSInputAddr",
750 S_0286D0_PERSP_SAMPLE_ENA(1) | S_0286D0_PERSP_CENTER_ENA(1) |
751 S_0286D0_PERSP_CENTROID_ENA(1) | S_0286D0_LINEAR_SAMPLE_ENA(1) |
752 S_0286D0_LINEAR_CENTER_ENA(1) | S_0286D0_LINEAR_CENTROID_ENA(1) |
753 S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_ANCILLARY_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1));
754 }
755
756 shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
757 shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
758
759 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
760 shader->info.num_input_vgprs -= num_prolog_vgprs;
761
762 if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) {
763 if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
764 /* The LSHS size is not known until draw time, so we append it
765 * at the end of whatever LDS use there may be in the rest of
766 * the shader (currently none, unless LLVM decides to do its
767 * own LDS-based lowering).
768 */
769 ctx->ac.lds = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
770 "__lds_end", AC_ADDR_SPACE_LDS);
771 LLVMSetAlignment(ctx->ac.lds, 256);
772 } else {
773 ac_declare_lds_as_pointer(&ctx->ac);
774 }
775 }
776
777 /* Unlike radv, we override these arguments in the prolog, so to the
778 * API shader they appear as normal arguments.
779 */
780 if (ctx->stage == MESA_SHADER_VERTEX) {
781 ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
782 ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
783 } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
784 ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
785 ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
786 }
787 }
788
789 /* For the UMR disassembler. */
790 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
791 #define DEBUGGER_NUM_MARKERS 5
792
793 static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
794 struct ac_rtld_binary *rtld)
795 {
796 const struct si_shader_selector *sel = shader->selector;
797 const char *part_elfs[5];
798 size_t part_sizes[5];
799 unsigned num_parts = 0;
800
801 #define add_part(shader_or_part) \
802 if (shader_or_part) { \
803 part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
804 part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
805 num_parts++; \
806 }
807
808 add_part(shader->prolog);
809 add_part(shader->previous_stage);
810 add_part(shader->prolog2);
811 add_part(shader);
812 add_part(shader->epilog);
813
814 #undef add_part
815
816 struct ac_rtld_symbol lds_symbols[2];
817 unsigned num_lds_symbols = 0;
818
819 if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
820 (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {
821 /* We add this symbol even on LLVM <= 8 to ensure that
822 * shader->config.lds_size is set correctly below.
823 */
824 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
825 sym->name = "esgs_ring";
826 sym->size = shader->gs_info.esgs_ring_size * 4;
827 sym->align = 64 * 1024;
828 }
829
830 if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) {
831 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
832 sym->name = "ngg_emit";
833 sym->size = shader->ngg.ngg_emit_size * 4;
834 sym->align = 4;
835 }
836
837 bool ok = ac_rtld_open(
838 rtld, (struct ac_rtld_open_info){.info = &screen->info,
839 .options =
840 {
841 .halt_at_entry = screen->options.halt_shaders,
842 },
843 .shader_type = sel->info.stage,
844 .wave_size = si_get_shader_wave_size(shader),
845 .num_parts = num_parts,
846 .elf_ptrs = part_elfs,
847 .elf_sizes = part_sizes,
848 .num_shared_lds_symbols = num_lds_symbols,
849 .shared_lds_symbols = lds_symbols});
850
851 if (rtld->lds_size > 0) {
852 unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
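/* Illustrative example: a 5000-byte LDS footprint with the GFX7+ granularity
 * becomes align(5000, 512) / 512 = 10 allocation units below. */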
853 shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
854 }
855
856 return ok;
857 }
858
859 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
860 {
861 struct ac_rtld_binary rtld;
862 si_shader_binary_open(screen, shader, &rtld);
863 return rtld.exec_size;
864 }
865
866 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
867 {
868 uint64_t *scratch_va = data;
869
870 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
871 *value = (uint32_t)*scratch_va;
872 return true;
873 }
874 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
875 /* Enable scratch coalescing. */
876 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
877 return true;
878 }
879
880 return false;
881 }
882
883 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
884 uint64_t scratch_va)
885 {
886 struct ac_rtld_binary binary;
887 if (!si_shader_binary_open(sscreen, shader, &binary))
888 return false;
889
890 si_resource_reference(&shader->bo, NULL);
891 shader->bo = si_aligned_buffer_create(
892 &sscreen->b, sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY,
893 PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
894 if (!shader->bo)
895 return false;
896
897 /* Upload. */
898 struct ac_rtld_upload_info u = {};
899 u.binary = &binary;
900 u.get_external_symbol = si_get_external_symbol;
901 u.cb_data = &scratch_va;
902 u.rx_va = shader->bo->gpu_address;
903 u.rx_ptr = sscreen->ws->buffer_map(
904 shader->bo->buf, NULL,
905 PIPE_TRANSFER_READ_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED | RADEON_TRANSFER_TEMPORARY);
906 if (!u.rx_ptr)
907 return false;
908
909 bool ok = ac_rtld_upload(&u);
910
911 sscreen->ws->buffer_unmap(shader->bo->buf);
912 ac_rtld_close(&binary);
913
914 return ok;
915 }
916
917 static void si_shader_dump_disassembly(struct si_screen *screen,
918 const struct si_shader_binary *binary,
919 gl_shader_stage stage, unsigned wave_size,
920 struct pipe_debug_callback *debug, const char *name,
921 FILE *file)
922 {
923 struct ac_rtld_binary rtld_binary;
924
925 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
926 .info = &screen->info,
927 .shader_type = stage,
928 .wave_size = wave_size,
929 .num_parts = 1,
930 .elf_ptrs = &binary->elf_buffer,
931 .elf_sizes = &binary->elf_size}))
932 return;
933
934 const char *disasm;
935 size_t nbytes;
936
937 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
938 goto out;
939
940 if (nbytes > INT_MAX)
941 goto out;
942
943 if (debug && debug->debug_message) {
944 /* Very long debug messages are cut off, so send the
945 * disassembly one line at a time. This causes more
946 * overhead, but on the plus side it simplifies
947 * parsing of resulting logs.
948 */
949 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
950
951 uint64_t line = 0;
952 while (line < nbytes) {
953 int count = nbytes - line;
954 const char *nl = memchr(disasm + line, '\n', nbytes - line);
955 if (nl)
956 count = nl - (disasm + line);
957
958 if (count) {
959 pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
960 }
961
962 line += count + 1;
963 }
964
965 pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
966 }
967
968 if (file) {
969 fprintf(file, "Shader %s disassembly:\n", name);
970 fprintf(file, "%*s", (int)nbytes, disasm);
971 }
972
973 out:
974 ac_rtld_close(&rtld_binary);
975 }
976
977 static void si_calculate_max_simd_waves(struct si_shader *shader)
978 {
979 struct si_screen *sscreen = shader->selector->screen;
980 struct ac_shader_config *conf = &shader->config;
981 unsigned num_inputs = shader->selector->info.num_inputs;
982 unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
983 unsigned lds_per_wave = 0;
984 unsigned max_simd_waves;
985
986 max_simd_waves = sscreen->info.max_wave64_per_simd;
987
988 /* Compute LDS usage for PS. */
989 switch (shader->selector->info.stage) {
990 case MESA_SHADER_FRAGMENT:
991 /* The minimum usage per wave is (num_inputs * 48). The maximum
992 * usage is (num_inputs * 48 * 16).
993 * We can get anything in between and it varies between waves.
994 *
995 * The 48 bytes per input for a single primitive is equal to
996 * 4 bytes/component * 4 components/input * 3 points.
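 *
 * Illustrative example: with 8 interpolated inputs, the minimum is
 * 8 * 48 = 384 bytes per wave and the maximum is 8 * 48 * 16 = 6144 bytes.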
997 *
998 * Other stages don't know the size at compile time or don't
999 * allocate LDS per wave, but instead they do it per thread group.
1000 */
1001 lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
1002 break;
1003 case MESA_SHADER_COMPUTE:
1004 if (shader->selector) {
1005 unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
1006 lds_per_wave = (conf->lds_size * lds_increment) /
1007 DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
1008 }
1009 break;
1010 default:;
1011 }
1012
1013 /* Compute the per-SIMD wave counts. */
1014 if (conf->num_sgprs) {
1015 max_simd_waves =
1016 MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
1017 }
1018
1019 if (conf->num_vgprs) {
1020 /* Always print wave limits as Wave64, so that we can compare
1021 * Wave32 and Wave64 with shader-db fairly. */
1022 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
1023 max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
1024 }
1025
1026 unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
1027 if (lds_per_wave)
1028 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1029
1030 shader->info.max_simd_waves = max_simd_waves;
1031 }
1032
1033 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1034 struct pipe_debug_callback *debug)
1035 {
1036 const struct ac_shader_config *conf = &shader->config;
1037
1038 if (screen->options.debug_disassembly)
1039 si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
1040 si_get_shader_wave_size(shader), debug, "main", NULL);
1041
1042 pipe_debug_message(debug, SHADER_INFO,
1043 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1044 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1045 "Spilled VGPRs: %d PrivMem VGPRs: %d",
1046 conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1047 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1048 conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
1049 }
1050
1051 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1052 bool check_debug_option)
1053 {
1054 const struct ac_shader_config *conf = &shader->config;
1055
1056 if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {
1057 if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {
1058 fprintf(file,
1059 "*** SHADER CONFIG ***\n"
1060 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1061 "SPI_PS_INPUT_ENA = 0x%04x\n",
1062 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1063 }
1064
1065 fprintf(file,
1066 "*** SHADER STATS ***\n"
1067 "SGPRS: %d\n"
1068 "VGPRS: %d\n"
1069 "Spilled SGPRs: %d\n"
1070 "Spilled VGPRs: %d\n"
1071 "Private memory VGPRs: %d\n"
1072 "Code Size: %d bytes\n"
1073 "LDS: %d blocks\n"
1074 "Scratch: %d bytes per wave\n"
1075 "Max Waves: %d\n"
1076 "********************\n\n\n",
1077 conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1078 shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1079 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1080 }
1081 }
1082
1083 const char *si_get_shader_name(const struct si_shader *shader)
1084 {
1085 switch (shader->selector->info.stage) {
1086 case MESA_SHADER_VERTEX:
1087 if (shader->key.as_es)
1088 return "Vertex Shader as ES";
1089 else if (shader->key.as_ls)
1090 return "Vertex Shader as LS";
1091 else if (shader->key.opt.vs_as_prim_discard_cs)
1092 return "Vertex Shader as Primitive Discard CS";
1093 else if (shader->key.as_ngg)
1094 return "Vertex Shader as ESGS";
1095 else
1096 return "Vertex Shader as VS";
1097 case MESA_SHADER_TESS_CTRL:
1098 return "Tessellation Control Shader";
1099 case MESA_SHADER_TESS_EVAL:
1100 if (shader->key.as_es)
1101 return "Tessellation Evaluation Shader as ES";
1102 else if (shader->key.as_ngg)
1103 return "Tessellation Evaluation Shader as ESGS";
1104 else
1105 return "Tessellation Evaluation Shader as VS";
1106 case MESA_SHADER_GEOMETRY:
1107 if (shader->is_gs_copy_shader)
1108 return "GS Copy Shader as VS";
1109 else
1110 return "Geometry Shader";
1111 case MESA_SHADER_FRAGMENT:
1112 return "Pixel Shader";
1113 case MESA_SHADER_COMPUTE:
1114 return "Compute Shader";
1115 default:
1116 return "Unknown Shader";
1117 }
1118 }
1119
1120 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1121 struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
1122 {
1123 gl_shader_stage stage = shader->selector->info.stage;
1124
1125 if (!check_debug_option || si_can_dump_shader(sscreen, stage))
1126 si_dump_shader_key(shader, file);
1127
1128 if (!check_debug_option && shader->binary.llvm_ir_string) {
1129 if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1130 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1131 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1132 }
1133
1134 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1135 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1136 }
1137
1138 if (!check_debug_option ||
1139 (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
1140 unsigned wave_size = si_get_shader_wave_size(shader);
1141
1142 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1143
1144 if (shader->prolog)
1145 si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,
1146 "prolog", file);
1147 if (shader->previous_stage)
1148 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1149 wave_size, debug, "previous stage", file);
1150 if (shader->prolog2)
1151 si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,
1152 debug, "prolog2", file);
1153
1154 si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",
1155 file);
1156
1157 if (shader->epilog)
1158 si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,
1159 "epilog", file);
1160 fprintf(file, "\n");
1161 }
1162
1163 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1164 }
1165
1166 static void si_dump_shader_key_vs(const struct si_shader_key *key,
1167 const struct si_vs_prolog_bits *prolog, const char *prefix,
1168 FILE *f)
1169 {
1170 fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
1171 fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,
1172 prolog->instance_divisor_is_fetched);
1173 fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix,
1174 prolog->unpack_instance_id_from_vertex_id);
1175 fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
1176
1177 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
1178 fprintf(f, " mono.vs.fix_fetch = {");
1179 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1180 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
1181 if (i)
1182 fprintf(f, ", ");
1183 if (!fix.bits)
1184 fprintf(f, "0");
1185 else
1186 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1187 fix.u.format);
1188 }
1189 fprintf(f, "}\n");
1190 }
1191
1192 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1193 {
1194 const struct si_shader_key *key = &shader->key;
1195 gl_shader_stage stage = shader->selector->info.stage;
1196
1197 fprintf(f, "SHADER KEY\n");
1198
1199 switch (stage) {
1200 case MESA_SHADER_VERTEX:
1201 si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
1202 fprintf(f, " as_es = %u\n", key->as_es);
1203 fprintf(f, " as_ls = %u\n", key->as_ls);
1204 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1205 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
1206 fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs);
1207 fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]);
1208 fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed);
1209 fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing);
1210 fprintf(f, " opt.cs_primitive_restart = %u\n", key->opt.cs_primitive_restart);
1211 fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first);
1212 fprintf(f, " opt.cs_need_correct_orientation = %u\n", key->opt.cs_need_correct_orientation);
1213 fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front);
1214 fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back);
1215 fprintf(f, " opt.cs_cull_z = %u\n", key->opt.cs_cull_z);
1216 fprintf(f, " opt.cs_halfz_clip_space = %u\n", key->opt.cs_halfz_clip_space);
1217 break;
1218
1219 case MESA_SHADER_TESS_CTRL:
1220 if (shader->selector->screen->info.chip_class >= GFX9) {
1221 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
1222 }
1223 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
1224 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
1225 key->mono.u.ff_tcs_inputs_to_copy);
1226 break;
1227
1228 case MESA_SHADER_TESS_EVAL:
1229 fprintf(f, " as_es = %u\n", key->as_es);
1230 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1231 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
1232 break;
1233
1234 case MESA_SHADER_GEOMETRY:
1235 if (shader->is_gs_copy_shader)
1236 break;
1237
1238 if (shader->selector->screen->info.chip_class >= GFX9 &&
1239 key->part.gs.es->info.stage == MESA_SHADER_VERTEX) {
1240 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
1241 }
1242 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n",
1243 key->part.gs.prolog.tri_strip_adj_fix);
1244 fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
1245 fprintf(f, " as_ngg = %u\n", key->as_ngg);
1246 break;
1247
1248 case MESA_SHADER_COMPUTE:
1249 break;
1250
1251 case MESA_SHADER_FRAGMENT:
1252 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
1253 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
1254 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
1255 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n",
1256 key->part.ps.prolog.force_persp_sample_interp);
1257 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n",
1258 key->part.ps.prolog.force_linear_sample_interp);
1259 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n",
1260 key->part.ps.prolog.force_persp_center_interp);
1261 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n",
1262 key->part.ps.prolog.force_linear_center_interp);
1263 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n",
1264 key->part.ps.prolog.bc_optimize_for_persp);
1265 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n",
1266 key->part.ps.prolog.bc_optimize_for_linear);
1267 fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n",
1268 key->part.ps.prolog.samplemask_log_ps_iter);
1269 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n",
1270 key->part.ps.epilog.spi_shader_col_format);
1271 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
1272 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
1273 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
1274 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
1275 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
1276 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n",
1277 key->part.ps.epilog.poly_line_smoothing);
1278 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
1279 fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n",
1280 key->mono.u.ps.interpolate_at_sample_force_center);
1281 fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
1282 fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
1283 fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
1284 break;
1285
1286 default:
1287 assert(0);
1288 }
1289
1290 if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
1291 stage == MESA_SHADER_VERTEX) &&
1292 !key->as_es && !key->as_ls) {
1293 fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
1294 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
1295 if (stage != MESA_SHADER_GEOMETRY)
1296 fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
1297 }
1298 }
1299
1300 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
1301 {
1302 struct si_shader *shader = ctx->shader;
1303 struct si_shader_info *info = &shader->selector->info;
1304 unsigned skip_vs_optim_mask = 0;
1305
1306 if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
1307 shader->key.as_ls || shader->key.as_es)
1308 return;
1309
1310 /* Optimizing these outputs is not possible, since they might be overridden
1311 * at runtime with S_028644_PT_SPRITE_TEX. */
1312 for (int i = 0; i < info->num_outputs; i++) {
1313 if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
1314 (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
1315 info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
1316 skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
1317 }
1318 }
1319
1320 ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
1321 info->num_outputs, skip_vs_optim_mask,
1322 &shader->info.nr_param_exports);
1323 }
1324
1325 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1326 const struct si_vs_prolog_bits *prolog_key,
1327 const struct si_shader_key *key, bool ngg_cull_shader)
1328 {
1329 /* VGPR initialization fixup for Vega10 and Raven is always done in the
1330 * VS prolog. */
1331 return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
1332 prolog_key->unpack_instance_id_from_vertex_id ||
1333 (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
1334 }
1335
1336 static bool si_build_main_function(struct si_shader_context *ctx, struct si_shader *shader,
1337 struct nir_shader *nir, bool free_nir, bool ngg_cull_shader)
1338 {
1339 struct si_shader_selector *sel = shader->selector;
1340 const struct si_shader_info *info = &sel->info;
1341
1342 ctx->shader = shader;
1343 ctx->stage = sel->info.stage;
1344
1345 ctx->num_const_buffers = util_last_bit(info->const_buffers_declared);
1346 ctx->num_shader_buffers = info->base.num_ssbos;
1347
1348 ctx->num_samplers = util_last_bit(info->base.textures_used);
1349 ctx->num_images = info->base.num_images;
1350
1351 si_llvm_init_resource_callbacks(ctx);
1352
1353 switch (ctx->stage) {
1354 case MESA_SHADER_VERTEX:
1355 si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
1356 break;
1357 case MESA_SHADER_TESS_CTRL:
1358 si_llvm_init_tcs_callbacks(ctx);
1359 break;
1360 case MESA_SHADER_TESS_EVAL:
1361 si_llvm_init_tes_callbacks(ctx, ngg_cull_shader);
1362 break;
1363 case MESA_SHADER_GEOMETRY:
1364 si_llvm_init_gs_callbacks(ctx);
1365 break;
1366 case MESA_SHADER_FRAGMENT:
1367 si_llvm_init_ps_callbacks(ctx);
1368 break;
1369 case MESA_SHADER_COMPUTE:
1370 ctx->abi.load_local_group_size = si_llvm_get_block_size;
1371 break;
1372 default:
1373 assert(!"Unsupported shader type");
1374 return false;
1375 }
1376
1377 si_create_function(ctx, ngg_cull_shader);
1378
1379 if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY)
1380 si_preload_esgs_ring(ctx);
1381
1382 if (ctx->stage == MESA_SHADER_GEOMETRY)
1383 si_preload_gs_rings(ctx);
1384 else if (ctx->stage == MESA_SHADER_TESS_EVAL)
1385 si_llvm_preload_tes_rings(ctx);
1386
1387 if (ctx->stage == MESA_SHADER_TESS_CTRL && sel->info.tessfactors_are_def_in_all_invocs) {
1388 for (unsigned i = 0; i < 6; i++) {
1389 ctx->invoc0_tess_factors[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
1390 }
1391 }
1392
1393 if (ctx->stage == MESA_SHADER_GEOMETRY) {
1394 for (unsigned i = 0; i < 4; i++) {
1395 ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1396 }
1397 if (shader->key.as_ngg) {
1398 for (unsigned i = 0; i < 4; ++i) {
1399 ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1400 ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
1401 }
1402
1403 assert(!ctx->gs_ngg_scratch);
1404 LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
1405 ctx->gs_ngg_scratch =
1406 LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1407 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
1408 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1409
1410 ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
1411 ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
1412 LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
1413 LLVMSetAlignment(ctx->gs_ngg_emit, 4);
1414 }
1415 }
1416
1417 if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) {
1418 /* Unconditionally declare scratch space base for streamout and
1419 * vertex compaction. Whether space is actually allocated is
1420 * determined during linking / PM4 creation.
1421 *
1422 * Add an extra dword per vertex to ensure an odd stride, which
1423 * avoids bank conflicts for SoA accesses.
1424 */
1425 if (!gfx10_is_ngg_passthrough(shader))
1426 si_llvm_declare_esgs_ring(ctx);
1427
1428 /* This is really only needed when streamout and / or vertex
1429 * compaction is enabled.
1430 */
1431 if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) {
1432 LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
1433 ctx->gs_ngg_scratch =
1434 LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
1435 LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
1436 LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
1437 }
1438 }
1439
1440 /* For GFX9 merged shaders:
1441 * - Set EXEC for the first shader. If the prolog is present, set
1442 * EXEC there instead.
1443 * - Add a barrier before the second shader.
1444 * - In the second shader, reset EXEC to ~0 and wrap the main part in
1445 * an if-statement. This is required for correctness in geometry
1446 * shaders, to ensure that empty GS waves do not send GS_EMIT and
1447 * GS_CUT messages.
1448 *
1449 * For monolithic merged shaders, the first shader is wrapped in an
1450 * if-block together with its prolog in si_build_wrapper_function.
1451 *
1452 * NGG vertex and tess eval shaders running as the last
1453 * vertex/geometry stage handle execution explicitly using
1454 * if-statements.
1455 */
1456 if (ctx->screen->info.chip_class >= GFX9) {
1457 if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
1458 (ctx->stage == MESA_SHADER_TESS_EVAL ||
1459 (ctx->stage == MESA_SHADER_VERTEX &&
1460 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
1461 si_init_exec_from_input(ctx, ctx->merged_wave_info, 0);
1462 } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
1463 (shader->key.as_ngg && !shader->key.as_es)) {
1464 LLVMValueRef thread_enabled;
1465 bool nested_barrier;
1466
1467 if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
1468 !shader->key.as_es && !shader->key.opt.ngg_culling))
1469 ac_init_exec_full_mask(&ctx->ac);
1470
1471 if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
1472 shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
1473 gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
1474
1475 /* Build the primitive export at the beginning
1476 * of the shader if possible.
1477 */
1478 if (gfx10_ngg_export_prim_early(shader))
1479 gfx10_ngg_build_export_prim(ctx, NULL, NULL);
1480 }
1481
1482 if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY) {
1483 if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg) {
1484 gfx10_ngg_gs_emit_prologue(ctx);
1485 nested_barrier = false;
1486 } else {
1487 nested_barrier = true;
1488 }
1489
1490 thread_enabled = si_is_gs_thread(ctx);
1491 } else {
1492 thread_enabled = si_is_es_thread(ctx);
1493 nested_barrier = false;
1494 }
1495
1496 ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
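         /* The label value below is presumably just an arbitrary unique flow id for
          * ac_build_ifcc; it is remembered so the matching endif can be emitted later. */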
1497 ctx->merged_wrap_if_label = 11500;
1498 ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
1499
1500 if (nested_barrier) {
1501 /* Execute a barrier before the second shader in
1502 * a merged shader.
1503 *
1504 * Execute the barrier inside the conditional block,
1505 * so that empty waves can jump directly to s_endpgm,
1506 * which will also signal the barrier.
1507 *
1508 * This is possible in gfx9, because an empty wave
1509 * for the second shader does not participate in
1510 * the epilogue. With NGG, empty waves may still
1511 * be required to export data (e.g. GS output vertices),
1512 * so we cannot let them exit early.
1513 *
1514 * If the shader is TCS and the TCS epilog is present
1515 * and contains a barrier, it will wait there and then
1516 * reach s_endpgm.
1517 */
1518 si_llvm_emit_barrier(ctx);
1519 }
1520 }
1521 }
1522
1523 bool success = si_nir_build_llvm(ctx, nir);
1524 if (free_nir)
1525 ralloc_free(nir);
1526 if (!success) {
1527 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
1528 return false;
1529 }
1530
1531 si_llvm_build_ret(ctx, ctx->return_value);
1532 return true;
1533 }
1534
1535 /**
1536 * Compute the VS prolog key, which contains all the information needed to
1537 * build the VS prolog function, and set shader->info bits where needed.
1538 *
1539 * \param info Shader info of the vertex shader.
1540 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
1541 * \param ngg_cull_shader Whether the preceding shader part is the NGG cull shader.

1542 * \param prolog_key Key of the VS prolog
1543 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
1544 * \param key Output shader part key.
1545 */
1546 static void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
1547 bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
1548 struct si_shader *shader_out, union si_shader_part_key *key)
1549 {
1550 memset(key, 0, sizeof(*key));
1551 key->vs_prolog.states = *prolog_key;
1552 key->vs_prolog.num_input_sgprs = num_input_sgprs;
1553 key->vs_prolog.num_inputs = info->num_inputs;
1554 key->vs_prolog.as_ls = shader_out->key.as_ls;
1555 key->vs_prolog.as_es = shader_out->key.as_es;
1556 key->vs_prolog.as_ngg = shader_out->key.as_ngg;
1557 key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs;
1558
1559 if (ngg_cull_shader) {
1560 key->vs_prolog.gs_fast_launch_tri_list =
1561 !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
1562 key->vs_prolog.gs_fast_launch_tri_strip =
1563 !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
1564 } else {
1565 key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
1566 }
1567
1568 if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
1569 key->vs_prolog.as_ls = 1;
1570 key->vs_prolog.num_merged_next_stage_vgprs = 2;
1571 } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {
1572 key->vs_prolog.as_es = 1;
1573 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1574 } else if (shader_out->key.as_ngg) {
1575 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1576 }
1577
1578 /* Only one of these combinations can be set. as_ngg can be set with as_es. */
1579 assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
1580 (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <=
1581 1);
1582
1583 /* Enable loading the InstanceID VGPR. */
1584 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
1585
1586 if ((key->vs_prolog.states.instance_divisor_is_one |
1587 key->vs_prolog.states.instance_divisor_is_fetched) &
1588 input_mask)
1589 shader_out->info.uses_instanceid = true;
1590 }
1591
1592 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
1593 struct si_shader_selector *sel)
1594 {
1595 if (!compiler->low_opt_passes)
1596 return false;
1597
1598 /* Assume a slow CPU. */
1599 assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.chip_class <= GFX8);
1600
1601 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
1602 * buffer stores. */
1603 return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_instructions > 1000;
1604 }
1605
1606 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
1607 {
1608 *free_nir = false;
1609
1610 if (sel->nir) {
1611 return sel->nir;
1612 } else if (sel->nir_binary) {
1613 struct pipe_screen *screen = &sel->screen->b;
1614 const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1615 pipe_shader_type_from_mesa(sel->info.stage));
1616
1617 struct blob_reader blob_reader;
1618 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1619 *free_nir = true;
1620 return nir_deserialize(NULL, options, &blob_reader);
1621 }
1622 return NULL;
1623 }
1624
1625 static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1626 struct si_shader *shader, struct pipe_debug_callback *debug,
1627 struct nir_shader *nir, bool free_nir)
1628 {
1629 struct si_shader_selector *sel = shader->selector;
1630 struct si_shader_context ctx;
1631
1632 si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
1633
1634 LLVMValueRef ngg_cull_main_fn = NULL;
1635 if (shader->key.opt.ngg_culling) {
1636 if (!si_build_main_function(&ctx, shader, nir, false, true)) {
1637 si_llvm_dispose(&ctx);
1638 return false;
1639 }
1640 ngg_cull_main_fn = ctx.main_fn;
1641 ctx.main_fn = NULL;
1642 }
1643
1644 if (!si_build_main_function(&ctx, shader, nir, free_nir, false)) {
1645 si_llvm_dispose(&ctx);
1646 return false;
1647 }
1648
1649 if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
1650 LLVMValueRef parts[4];
1651 unsigned num_parts = 0;
1652 bool has_prolog = false;
1653 LLVMValueRef main_fn = ctx.main_fn;
1654
1655 if (ngg_cull_main_fn) {
1656 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) {
1657 union si_shader_part_key prolog_key;
1658 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true,
1659 &shader->key.part.vs.prolog, shader, &prolog_key);
1660 prolog_key.vs_prolog.is_monolithic = true;
1661 si_llvm_build_vs_prolog(&ctx, &prolog_key);
1662 parts[num_parts++] = ctx.main_fn;
1663 has_prolog = true;
1664 }
1665 parts[num_parts++] = ngg_cull_main_fn;
1666 }
1667
1668 if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) {
1669 union si_shader_part_key prolog_key;
1670 si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false,
1671 &shader->key.part.vs.prolog, shader, &prolog_key);
1672 prolog_key.vs_prolog.is_monolithic = true;
1673 si_llvm_build_vs_prolog(&ctx, &prolog_key);
1674 parts[num_parts++] = ctx.main_fn;
1675 has_prolog = true;
1676 }
1677 parts[num_parts++] = main_fn;
1678
1679 si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);
1680
1681 if (ctx.shader->key.opt.vs_as_prim_discard_cs)
1682 si_build_prim_discard_compute_shader(&ctx);
1683 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
1684 LLVMValueRef parts[2];
1685
1686 parts[0] = ngg_cull_main_fn;
1687 parts[1] = ctx.main_fn;
1688
1689 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
1690 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
1691 if (sscreen->info.chip_class >= GFX9) {
1692 struct si_shader_selector *ls = shader->key.part.tcs.ls;
1693 LLVMValueRef parts[4];
1694 bool vs_needs_prolog =
1695 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false);
1696
1697 /* TCS main part */
1698 parts[2] = ctx.main_fn;
1699
1700 /* TCS epilog */
1701 union si_shader_part_key tcs_epilog_key;
1702 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
1703 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
1704 si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
1705 parts[3] = ctx.main_fn;
1706
1707 /* VS as LS main part */
1708 nir = get_nir_shader(ls, &free_nir);
1709 struct si_shader shader_ls = {};
1710 shader_ls.selector = ls;
1711 shader_ls.key.as_ls = 1;
1712 shader_ls.key.mono = shader->key.mono;
1713 shader_ls.key.opt = shader->key.opt;
1714 shader_ls.is_monolithic = true;
1715
1716 if (!si_build_main_function(&ctx, &shader_ls, nir, free_nir, false)) {
1717 si_llvm_dispose(&ctx);
1718 return false;
1719 }
1720 shader->info.uses_instanceid |= ls->info.uses_instanceid;
1721 parts[1] = ctx.main_fn;
1722
1723 /* LS prolog */
1724 if (vs_needs_prolog) {
1725 union si_shader_part_key vs_prolog_key;
1726 si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false,
1727 &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key);
1728 vs_prolog_key.vs_prolog.is_monolithic = true;
1729 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
1730 parts[0] = ctx.main_fn;
1731 }
1732
1733 /* Reset the shader context. */
1734 ctx.shader = shader;
1735 ctx.stage = MESA_SHADER_TESS_CTRL;
1736
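         /* Part layout: [LS prolog, LS main, TCS main, TCS epilog]; when the prolog is
          * absent the array is shifted down by one, which is what the !vs_needs_prolog
          * adjustments in the call below express. */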
1737 si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
1738 vs_needs_prolog, vs_needs_prolog ? 2 : 1);
1739 } else {
1740 LLVMValueRef parts[2];
1741 union si_shader_part_key epilog_key;
1742
1743 parts[0] = ctx.main_fn;
1744
1745 memset(&epilog_key, 0, sizeof(epilog_key));
1746 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
1747 si_llvm_build_tcs_epilog(&ctx, &epilog_key);
1748 parts[1] = ctx.main_fn;
1749
1750 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
1751 }
1752 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
1753 if (ctx.screen->info.chip_class >= GFX9) {
1754 struct si_shader_selector *es = shader->key.part.gs.es;
1755 LLVMValueRef es_prolog = NULL;
1756 LLVMValueRef es_main = NULL;
1757 LLVMValueRef gs_prolog = NULL;
1758 LLVMValueRef gs_main = ctx.main_fn;
1759
1760 /* GS prolog */
1761 union si_shader_part_key gs_prolog_key;
1762 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
1763 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
1764 gs_prolog_key.gs_prolog.is_monolithic = true;
1765 gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
1766 si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
1767 gs_prolog = ctx.main_fn;
1768
1769 /* ES main part */
1770 nir = get_nir_shader(es, &free_nir);
1771 struct si_shader shader_es = {};
1772 shader_es.selector = es;
1773 shader_es.key.as_es = 1;
1774 shader_es.key.as_ngg = shader->key.as_ngg;
1775 shader_es.key.mono = shader->key.mono;
1776 shader_es.key.opt = shader->key.opt;
1777 shader_es.is_monolithic = true;
1778
1779 if (!si_build_main_function(&ctx, &shader_es, nir, free_nir, false)) {
1780 si_llvm_dispose(&ctx);
1781 return false;
1782 }
1783 shader->info.uses_instanceid |= es->info.uses_instanceid;
1784 es_main = ctx.main_fn;
1785
1786 /* ES prolog */
1787 if (es->info.stage == MESA_SHADER_VERTEX &&
1788 si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) {
1789 union si_shader_part_key vs_prolog_key;
1790 si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false,
1791 &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key);
1792 vs_prolog_key.vs_prolog.is_monolithic = true;
1793 si_llvm_build_vs_prolog(&ctx, &vs_prolog_key);
1794 es_prolog = ctx.main_fn;
1795 }
1796
1797 /* Reset the shader context. */
1798 ctx.shader = shader;
1799 ctx.stage = MESA_SHADER_GEOMETRY;
1800
1801 /* Prepare the array of shader parts. */
1802 LLVMValueRef parts[4];
1803 unsigned num_parts = 0, main_part, next_first_part;
1804
1805 if (es_prolog)
1806 parts[num_parts++] = es_prolog;
1807
1808 parts[main_part = num_parts++] = es_main;
1809 parts[next_first_part = num_parts++] = gs_prolog;
1810 parts[num_parts++] = gs_main;
1811
1812 si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
1813 } else {
1814 LLVMValueRef parts[2];
1815 union si_shader_part_key prolog_key;
1816
1817 parts[1] = ctx.main_fn;
1818
1819 memset(&prolog_key, 0, sizeof(prolog_key));
1820 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
1821 si_llvm_build_gs_prolog(&ctx, &prolog_key);
1822 parts[0] = ctx.main_fn;
1823
1824 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
1825 }
1826 } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
1827 si_llvm_build_monolithic_ps(&ctx, shader);
1828 }
1829
1830 si_llvm_optimize_module(&ctx);
1831
1832 /* Post-optimization transformations and analysis. */
1833 si_optimize_vs_outputs(&ctx);
1834
1835 if ((debug && debug->debug_message) || si_can_dump_shader(sscreen, ctx.stage)) {
1836 ctx.shader->info.private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_fn);
1837 }
1838
1839 /* Make sure the input is a pointer and not integer followed by inttoptr. */
1840 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
1841
1842 /* Compile to bytecode. */
1843 if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
1844 ctx.stage, si_get_shader_name(shader),
1845 si_should_optimize_less(compiler, shader->selector))) {
1846 si_llvm_dispose(&ctx);
1847 fprintf(stderr, "LLVM failed to compile shader\n");
1848 return false;
1849 }
1850
1851 si_llvm_dispose(&ctx);
1852 return true;
1853 }
1854
1855 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
1856 struct si_shader *shader, struct pipe_debug_callback *debug)
1857 {
1858 struct si_shader_selector *sel = shader->selector;
1859 bool free_nir;
1860 struct nir_shader *nir = get_nir_shader(sel, &free_nir);
1861
1862 /* Dump NIR before doing NIR->LLVM conversion in case the
1863 * conversion fails. */
1864 if (si_can_dump_shader(sscreen, sel->info.stage) &&
1865 !(sscreen->debug_flags & DBG(NO_NIR))) {
1866 nir_print_shader(nir, stderr);
1867 si_dump_streamout(&sel->so);
1868 }
1869
1870 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
1871 sizeof(shader->info.vs_output_param_offset));
1872
1873 shader->info.uses_instanceid = sel->info.uses_instanceid;
1874
1875 /* TODO: ACO could compile non-monolithic shaders here (starting
1876 * with PS and NGG VS), but monolithic shaders should be compiled
1877 * by LLVM due to more complicated compilation.
1878 */
1879 if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
1880 return false;
1881
1882 /* Validate SGPR and VGPR usage for compute to detect compiler bugs
1883 * (LLVM 3.9svn was known to exceed these limits).
1884 */
1885 if (sel->info.stage == MESA_SHADER_COMPUTE) {
1886 unsigned wave_size = sscreen->compute_wave_size;
1887 unsigned max_vgprs =
1888 sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
1889 unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
1890 unsigned max_sgprs_per_wave = 128;
1891 unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
1892 unsigned threads_per_tg = si_get_max_workgroup_size(shader);
1893 unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
1894 unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
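      /* Worked example (assuming wave64 and a 256-thread workgroup): waves_per_tg =
       * DIV_ROUND_UP(256, 64) = 4 and waves_per_simd = DIV_ROUND_UP(4, 4) = 1, so each
       * wave may use the full per-SIMD register budget. */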
1895
1896 max_vgprs = max_vgprs / waves_per_simd;
1897 max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
1898
1899 if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
1900 fprintf(stderr,
1901 "LLVM failed to compile a shader correctly: "
1902 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
1903 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
1904
1905 /* Just terminate the process, because dependent
1906 * shaders can hang due to bad input data, but use
1907 * the env var to allow shader-db to work.
1908 */
1909 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
1910 abort();
1911 }
1912 }
1913
1914 /* Add the scratch offset to input SGPRs. */
1915 if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
1916 shader->info.num_input_sgprs += 1; /* scratch byte offset */
1917
1918 /* Calculate the number of fragment input VGPRs. */
1919 if (sel->info.stage == MESA_SHADER_FRAGMENT) {
1920 shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
1921 &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
1922 }
1923
1924 si_calculate_max_simd_waves(shader);
1925 si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
1926 return true;
1927 }
1928
1929 /**
1930 * Create, compile and return a shader part (prolog or epilog).
1931 *
1932 * \param sscreen screen
1933 * \param list list of shader parts of the same category
1934 * \param stage shader stage
1935 * \param key shader part key
1936 * \param prolog whether the part being requested is a prolog
1937 * \param compiler LLVM compiler
1938 * \param debug debug callback
1939 * \param build the callback responsible for building the main function
1940 * \return non-NULL on success
1941 */
1942 static struct si_shader_part *
1943 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
1944 gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
1945 struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
1946 void (*build)(struct si_shader_context *, union si_shader_part_key *),
1947 const char *name)
1948 {
1949 struct si_shader_part *result;
1950
1951 simple_mtx_lock(&sscreen->shader_parts_mutex);
1952
1953 /* Find existing. */
1954 for (result = *list; result; result = result->next) {
1955 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
1956 simple_mtx_unlock(&sscreen->shader_parts_mutex);
1957 return result;
1958 }
1959 }
1960
1961 /* Compile a new one. */
1962 result = CALLOC_STRUCT(si_shader_part);
1963 result->key = *key;
1964
1965 struct si_shader_selector sel = {};
1966 sel.screen = sscreen;
1967
1968 struct si_shader shader = {};
1969 shader.selector = &sel;
1970
1971 switch (stage) {
1972 case MESA_SHADER_VERTEX:
1973 shader.key.as_ls = key->vs_prolog.as_ls;
1974 shader.key.as_es = key->vs_prolog.as_es;
1975 shader.key.as_ngg = key->vs_prolog.as_ngg;
1976 shader.key.opt.ngg_culling =
1977 (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
1978 (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
1979 shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
1980 break;
1981 case MESA_SHADER_TESS_CTRL:
1982 assert(!prolog);
1983 shader.key.part.tcs.epilog = key->tcs_epilog.states;
1984 break;
1985 case MESA_SHADER_GEOMETRY:
1986 assert(prolog);
1987 shader.key.as_ngg = key->gs_prolog.as_ngg;
1988 break;
1989 case MESA_SHADER_FRAGMENT:
1990 if (prolog)
1991 shader.key.part.ps.prolog = key->ps_prolog.states;
1992 else
1993 shader.key.part.ps.epilog = key->ps_epilog.states;
1994 break;
1995 default:
1996 unreachable("bad shader part");
1997 }
1998
1999 struct si_shader_context ctx;
2000 si_llvm_context_init(&ctx, sscreen, compiler,
2001 si_get_wave_size(sscreen, stage,
2002 shader.key.as_ngg, shader.key.as_es,
2003 shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
2004 shader.key.opt.vs_as_prim_discard_cs));
2005 ctx.shader = &shader;
2006 ctx.stage = stage;
2007
2008 build(&ctx, key);
2009
2010 /* Compile. */
2011 si_llvm_optimize_module(&ctx);
2012
2013 if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
2014 ctx.stage, name, false)) {
2015 FREE(result);
2016 result = NULL;
2017 goto out;
2018 }
2019
2020 result->next = *list;
2021 *list = result;
2022
2023 out:
2024 si_llvm_dispose(&ctx);
2025 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2026 return result;
2027 }
2028
2029 static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2030 struct si_shader *shader, struct pipe_debug_callback *debug,
2031 struct si_shader *main_part, const struct si_vs_prolog_bits *key)
2032 {
2033 struct si_shader_selector *vs = main_part->selector;
2034
2035 if (!si_vs_needs_prolog(vs, key, &shader->key, false))
2036 return true;
2037
2038 /* Get the prolog. */
2039 union si_shader_part_key prolog_key;
2040 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
2041 &prolog_key);
2042
2043 shader->prolog =
2044 si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
2045 compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
2046 return shader->prolog != NULL;
2047 }
2048
2049 /**
2050 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
2051 */
2052 static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2053 struct si_shader *shader, struct pipe_debug_callback *debug)
2054 {
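   /* A plain VS is its own main part, hence 'shader' is passed both as the shader
    * and as the main part below. */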
2055 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
2056 }
2057
2058 /**
2059 * Select and compile (or reuse) TCS parts (epilog).
2060 */
2061 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2062 struct si_shader *shader, struct pipe_debug_callback *debug)
2063 {
2064 if (sscreen->info.chip_class >= GFX9) {
2065 struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;
2066
2067 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
2068 &shader->key.part.tcs.ls_prolog))
2069 return false;
2070
2071 shader->previous_stage = ls_main_part;
2072 }
2073
2074 /* Get the epilog. */
2075 union si_shader_part_key epilog_key;
2076 memset(&epilog_key, 0, sizeof(epilog_key));
2077 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
2078
2079 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
2080 &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
2081 "Tessellation Control Shader Epilog");
2082 return shader->epilog != NULL;
2083 }
2084
2085 /**
2086 * Select and compile (or reuse) GS parts (prolog).
2087 */
2088 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2089 struct si_shader *shader, struct pipe_debug_callback *debug)
2090 {
2091 if (sscreen->info.chip_class >= GFX9) {
2092 struct si_shader *es_main_part;
2093
2094 if (shader->key.as_ngg)
2095 es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
2096 else
2097 es_main_part = shader->key.part.gs.es->main_shader_part_es;
2098
2099 if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX &&
2100 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
2101 &shader->key.part.gs.vs_prolog))
2102 return false;
2103
2104 shader->previous_stage = es_main_part;
2105 }
2106
2107 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
2108 return true;
2109
2110 union si_shader_part_key prolog_key;
2111 memset(&prolog_key, 0, sizeof(prolog_key));
2112 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
2113 prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
2114
2115 shader->prolog2 =
2116 si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,
2117 compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
2118 return shader->prolog2 != NULL;
2119 }
2120
2121 /**
2122 * Compute the PS prolog key, which contains all the information needed to
2123 * build the PS prolog function, and set related bits in shader->config.
2124 */
2125 void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
2126 bool separate_prolog)
2127 {
2128 struct si_shader_info *info = &shader->selector->info;
2129
2130 memset(key, 0, sizeof(*key));
2131 key->ps_prolog.states = shader->key.part.ps.prolog;
2132 key->ps_prolog.colors_read = info->colors_read;
2133 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
2134 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
2135 key->ps_prolog.wqm =
2136 info->uses_derivatives &&
2137 (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2138 key->ps_prolog.states.force_linear_sample_interp ||
2139 key->ps_prolog.states.force_persp_center_interp ||
2140 key->ps_prolog.states.force_linear_center_interp ||
2141 key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
2142 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
2143
2144 if (info->colors_read) {
2145 unsigned *color = shader->selector->color_attr_index;
2146
2147 if (shader->key.part.ps.prolog.color_two_side) {
2148 /* BCOLORs are stored after the last input. */
2149 key->ps_prolog.num_interp_inputs = info->num_inputs;
2150 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
2151 if (separate_prolog)
2152 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
2153 }
2154
2155 for (unsigned i = 0; i < 2; i++) {
2156 unsigned interp = info->color_interpolate[i];
2157 unsigned location = info->color_interpolate_loc[i];
2158
2159 if (!(info->colors_read & (0xf << i * 4)))
2160 continue;
2161
2162 key->ps_prolog.color_attr_index[i] = color[i];
2163
2164 if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
2165 interp = INTERP_MODE_FLAT;
2166
2167 switch (interp) {
2168 case INTERP_MODE_FLAT:
2169 key->ps_prolog.color_interp_vgpr_index[i] = -1;
2170 break;
2171 case INTERP_MODE_SMOOTH:
2172 case INTERP_MODE_COLOR:
2173 /* Force the interpolation location for colors here. */
2174 if (shader->key.part.ps.prolog.force_persp_sample_interp)
2175 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2176 if (shader->key.part.ps.prolog.force_persp_center_interp)
2177 location = TGSI_INTERPOLATE_LOC_CENTER;
2178
2179 switch (location) {
2180 case TGSI_INTERPOLATE_LOC_SAMPLE:
2181 key->ps_prolog.color_interp_vgpr_index[i] = 0;
2182 if (separate_prolog) {
2183 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2184 }
2185 break;
2186 case TGSI_INTERPOLATE_LOC_CENTER:
2187 key->ps_prolog.color_interp_vgpr_index[i] = 2;
2188 if (separate_prolog) {
2189 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2190 }
2191 break;
2192 case TGSI_INTERPOLATE_LOC_CENTROID:
2193 key->ps_prolog.color_interp_vgpr_index[i] = 4;
2194 if (separate_prolog) {
2195 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
2196 }
2197 break;
2198 default:
2199 assert(0);
2200 }
2201 break;
2202 case INTERP_MODE_NOPERSPECTIVE:
2203 /* Force the interpolation location for colors here. */
2204 if (shader->key.part.ps.prolog.force_linear_sample_interp)
2205 location = TGSI_INTERPOLATE_LOC_SAMPLE;
2206 if (shader->key.part.ps.prolog.force_linear_center_interp)
2207 location = TGSI_INTERPOLATE_LOC_CENTER;
2208
2209 /* The VGPR assignment for non-monolithic shaders
2210 * works because InitialPSInputAddr is set on the
2211 * main shader and PERSP_PULL_MODEL is never used.
2212 */
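            /* Presumably these indices are (i,j) VGPR pair offsets: linear
             * sample/center/centroid land at 6/8/10 when the 3 PERSP_PULL_MODEL VGPRs
             * are not counted (separate prolog), or at 9/11/13 when they are
             * (monolithic). */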
2213 switch (location) {
2214 case TGSI_INTERPOLATE_LOC_SAMPLE:
2215 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
2216 if (separate_prolog) {
2217 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2218 }
2219 break;
2220 case TGSI_INTERPOLATE_LOC_CENTER:
2221 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
2222 if (separate_prolog) {
2223 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2224 }
2225 break;
2226 case TGSI_INTERPOLATE_LOC_CENTROID:
2227 key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
2228 if (separate_prolog) {
2229 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
2230 }
2231 break;
2232 default:
2233 assert(0);
2234 }
2235 break;
2236 default:
2237 assert(0);
2238 }
2239 }
2240 }
2241 }
2242
2243 /**
2244 * Check whether a PS prolog is required based on the key.
2245 */
2246 bool si_need_ps_prolog(const union si_shader_part_key *key)
2247 {
2248 return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2249 key->ps_prolog.states.force_linear_sample_interp ||
2250 key->ps_prolog.states.force_persp_center_interp ||
2251 key->ps_prolog.states.force_linear_center_interp ||
2252 key->ps_prolog.states.bc_optimize_for_persp ||
2253 key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
2254 key->ps_prolog.states.samplemask_log_ps_iter;
2255 }
2256
2257 /**
2258 * Compute the PS epilog key, which contains all the information needed to
2259 * build the PS epilog function.
2260 */
2261 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
2262 {
2263 struct si_shader_info *info = &shader->selector->info;
2264 memset(key, 0, sizeof(*key));
2265 key->ps_epilog.colors_written = info->colors_written;
2266 key->ps_epilog.writes_z = info->writes_z;
2267 key->ps_epilog.writes_stencil = info->writes_stencil;
2268 key->ps_epilog.writes_samplemask = info->writes_samplemask;
2269 key->ps_epilog.states = shader->key.part.ps.epilog;
2270 }
2271
2272 /**
2273 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
2274 */
2275 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2276 struct si_shader *shader, struct pipe_debug_callback *debug)
2277 {
2278 union si_shader_part_key prolog_key;
2279 union si_shader_part_key epilog_key;
2280
2281 /* Get the prolog. */
2282 si_get_ps_prolog_key(shader, &prolog_key, true);
2283
2284 /* The prolog is a no-op if these aren't set. */
2285 if (si_need_ps_prolog(&prolog_key)) {
2286 shader->prolog =
2287 si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
2288 compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
2289 if (!shader->prolog)
2290 return false;
2291 }
2292
2293 /* Get the epilog. */
2294 si_get_ps_epilog_key(shader, &epilog_key);
2295
2296 shader->epilog =
2297 si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
2298 compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
2299 if (!shader->epilog)
2300 return false;
2301
2302 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2303 if (shader->key.part.ps.prolog.poly_stipple) {
2304 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2305 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
2306 }
2307
2308 /* Set up the enable bits for per-sample shading if needed. */
2309 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
2310 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2311 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2312 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2313 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2314 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2315 }
2316 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
2317 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2318 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2319 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2320 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2321 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2322 }
2323 if (shader->key.part.ps.prolog.force_persp_center_interp &&
2324 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2325 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2326 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2327 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2328 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2329 }
2330 if (shader->key.part.ps.prolog.force_linear_center_interp &&
2331 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2332 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2333 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2334 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2335 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2336 }
2337
2338 /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
2339 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2340 !(shader->config.spi_ps_input_ena & 0xf)) {
2341 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2342 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
2343 }
2344
2345 /* At least one pair of interpolation weights must be enabled. */
2346 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
2347 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2348 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
2349 }
2350
2351 /* Samplemask fixup requires the sample ID. */
2352 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
2353 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2354 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
2355 }
2356
2357 /* The sample mask input is always enabled, because the API shader always
2358 * passes it through to the epilog. Disable it here if it's unused.
2359 */
2360 if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
2361 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
2362
2363 return true;
2364 }
2365
2366 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
2367 {
2368 /* If tessellation is all offchip and on-chip GS isn't used, this
2369 * workaround is not needed.
2370 */
2371 return;
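   /* NOTE: the unconditional return above disables the workaround below; presumably the
    * offchip-tessellation condition from the comment holds for all current users. */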
2372
2373 /* SPI barrier management bug:
2374 * Make sure we have at least 4k of LDS in use to avoid the bug.
2375 * It applies to workgroup sizes of more than one wavefront.
2376 */
2377 if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
2378 *lds_size = MAX2(*lds_size, 8);
2379 }
2380
2381 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
2382 {
2383 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
2384
2385 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
2386
2387 if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&
2388 si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
2389 si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
2390 }
2391 }
2392
2393 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2394 struct si_shader *shader, struct pipe_debug_callback *debug)
2395 {
2396 struct si_shader_selector *sel = shader->selector;
2397 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
2398
2399 /* LS, ES, VS are compiled on demand if the main part hasn't been
2400 * compiled for that stage.
2401 *
2402 * GS is compiled on demand if the main part hasn't been compiled
2403 * for the chosen NGG-ness.
2404 *
2405 * Vertex shaders are compiled on demand when a vertex fetch
2406 * workaround must be applied.
2407 */
2408 if (shader->is_monolithic) {
2409 /* Monolithic shader (compiled as a whole, has many variants,
2410 * may take a long time to compile).
2411 */
2412 if (!si_compile_shader(sscreen, compiler, shader, debug))
2413 return false;
2414 } else {
2415 /* The shader consists of several parts:
2416 *
2417 * - the middle part is the user shader, it has 1 variant only
2418 * and it was compiled during the creation of the shader
2419 * selector
2420 * - the prolog part is inserted at the beginning
2421 * - the epilog part is inserted at the end
2422 *
2423 * The prolog and epilog have many (but simple) variants.
2424 *
2425 * Starting with gfx9, geometry and tessellation control
2426 * shaders also contain the prolog and user shader parts of
2427 * the previous shader stage.
2428 */
2429
2430 if (!mainp)
2431 return false;
2432
2433 /* Copy the compiled shader data over. */
2434 shader->is_binary_shared = true;
2435 shader->binary = mainp->binary;
2436 shader->config = mainp->config;
2437 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
2438 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
2439 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
2440 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
2441 memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset,
2442 sizeof(mainp->info.vs_output_param_offset));
2443 shader->info.uses_instanceid = mainp->info.uses_instanceid;
2444 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
2445 shader->info.nr_param_exports = mainp->info.nr_param_exports;
2446
2447 /* Select prologs and/or epilogs. */
2448 switch (sel->info.stage) {
2449 case MESA_SHADER_VERTEX:
2450 if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
2451 return false;
2452 break;
2453 case MESA_SHADER_TESS_CTRL:
2454 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
2455 return false;
2456 break;
2457 case MESA_SHADER_TESS_EVAL:
2458 break;
2459 case MESA_SHADER_GEOMETRY:
2460 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
2461 return false;
2462 break;
2463 case MESA_SHADER_FRAGMENT:
2464 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
2465 return false;
2466
2467 /* Make sure we have at least as many VGPRs as there
2468 * are allocated inputs.
2469 */
2470 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
2471 break;
2472 default:;
2473 }
2474
2475 /* Update SGPR and VGPR counts. */
2476 if (shader->prolog) {
2477 shader->config.num_sgprs =
2478 MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
2479 shader->config.num_vgprs =
2480 MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
2481 }
2482 if (shader->previous_stage) {
2483 shader->config.num_sgprs =
2484 MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
2485 shader->config.num_vgprs =
2486 MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
2487 shader->config.spilled_sgprs =
2488 MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
2489 shader->config.spilled_vgprs =
2490 MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
2491 shader->info.private_mem_vgprs =
2492 MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
2493 shader->config.scratch_bytes_per_wave =
2494 MAX2(shader->config.scratch_bytes_per_wave,
2495 shader->previous_stage->config.scratch_bytes_per_wave);
2496 shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
2497 }
2498 if (shader->prolog2) {
2499 shader->config.num_sgprs =
2500 MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);
2501 shader->config.num_vgprs =
2502 MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);
2503 }
2504 if (shader->epilog) {
2505 shader->config.num_sgprs =
2506 MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
2507 shader->config.num_vgprs =
2508 MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
2509 }
2510 si_calculate_max_simd_waves(shader);
2511 }
2512
2513 if (shader->key.as_ngg) {
2514 assert(!shader->key.as_es && !shader->key.as_ls);
2515 if (!gfx10_ngg_calculate_subgroup_info(shader)) {
2516 fprintf(stderr, "Failed to compute subgroup info\n");
2517 return false;
2518 }
2519 } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) {
2520 gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
2521 }
2522
2523 si_fix_resource_usage(sscreen, shader);
2524 si_shader_dump(sscreen, shader, debug, stderr, true);
2525
2526 /* Upload. */
2527 if (!si_shader_binary_upload(sscreen, shader, 0)) {
2528 fprintf(stderr, "LLVM failed to upload shader\n");
2529 return false;
2530 }
2531
2532 return true;
2533 }
2534
2535 void si_shader_binary_clean(struct si_shader_binary *binary)
2536 {
2537 free((void *)binary->elf_buffer);
2538 binary->elf_buffer = NULL;
2539
2540 free(binary->llvm_ir_string);
2541 binary->llvm_ir_string = NULL;
2542 }
2543
2544 void si_shader_destroy(struct si_shader *shader)
2545 {
2546 if (shader->scratch_bo)
2547 si_resource_reference(&shader->scratch_bo, NULL);
2548
2549 si_resource_reference(&shader->bo, NULL);
2550
2551 if (!shader->is_binary_shared)
2552 si_shader_binary_clean(&shader->binary);
2553
2554 free(shader->shader_log);
2555 }