radeonsi: move code for shader resources into si_shader_llvm_resources.c
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include <llvm/Config/llvm-config.h>
26
27 #include "util/u_memory.h"
28 #include "tgsi/tgsi_strings.h"
29 #include "tgsi/tgsi_from_mesa.h"
30
31 #include "ac_exp_param.h"
32 #include "ac_shader_util.h"
33 #include "ac_rtld.h"
34 #include "ac_llvm_util.h"
35 #include "si_shader_internal.h"
36 #include "si_pipe.h"
37 #include "sid.h"
38
39 #include "compiler/nir/nir.h"
40 #include "compiler/nir/nir_serialize.h"
41
42 static const char scratch_rsrc_dword0_symbol[] =
43 "SCRATCH_RSRC_DWORD0";
44
45 static const char scratch_rsrc_dword1_symbol[] =
46 "SCRATCH_RSRC_DWORD1";
47
48 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
49
50 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
51 union si_shader_part_key *key);
52
53 /** Whether the shader runs as a combination of multiple API shaders */
54 static bool is_multi_part_shader(struct si_shader_context *ctx)
55 {
56 if (ctx->screen->info.chip_class <= GFX8)
57 return false;
58
59 return ctx->shader->key.as_ls ||
60 ctx->shader->key.as_es ||
61 ctx->type == PIPE_SHADER_TESS_CTRL ||
62 ctx->type == PIPE_SHADER_GEOMETRY;
63 }
64
65 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
66 bool si_is_merged_shader(struct si_shader_context *ctx)
67 {
68 return ctx->shader->key.as_ngg || is_multi_part_shader(ctx);
69 }
70
71 /**
72 * Returns a unique index for a per-patch semantic name and index. The index
73 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
74 * can be calculated.
75 */
76 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
77 {
78 switch (semantic_name) {
79 case TGSI_SEMANTIC_TESSOUTER:
80 return 0;
81 case TGSI_SEMANTIC_TESSINNER:
82 return 1;
83 case TGSI_SEMANTIC_PATCH:
84 assert(index < 30);
85 return 2 + index;
86
87 default:
88 assert(!"invalid semantic name");
89 return 0;
90 }
91 }
92
93 /**
94 * Returns a unique index for a semantic name and index. The index must be
95 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
96 * calculated.
97 */
98 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
99 unsigned is_varying)
100 {
101 switch (semantic_name) {
102 case TGSI_SEMANTIC_POSITION:
103 return 0;
104 case TGSI_SEMANTIC_GENERIC:
105 /* Since some shader stages use the the highest used IO index
106 * to determine the size to allocate for inputs/outputs
107 * (in LDS, tess and GS rings). GENERIC should be placed right
108 * after POSITION to make that size as small as possible.
109 */
110 if (index < SI_MAX_IO_GENERIC)
111 return 1 + index;
112
113 assert(!"invalid generic index");
114 return 0;
115 case TGSI_SEMANTIC_FOG:
116 return SI_MAX_IO_GENERIC + 1;
117 case TGSI_SEMANTIC_COLOR:
118 assert(index < 2);
119 return SI_MAX_IO_GENERIC + 2 + index;
120 case TGSI_SEMANTIC_BCOLOR:
121 assert(index < 2);
122 /* If it's a varying, COLOR and BCOLOR alias. */
123 if (is_varying)
124 return SI_MAX_IO_GENERIC + 2 + index;
125 else
126 return SI_MAX_IO_GENERIC + 4 + index;
127 case TGSI_SEMANTIC_TEXCOORD:
128 assert(index < 8);
129 return SI_MAX_IO_GENERIC + 6 + index;
130
131 /* These are rarely used between LS and HS or ES and GS. */
132 case TGSI_SEMANTIC_CLIPDIST:
133 assert(index < 2);
134 return SI_MAX_IO_GENERIC + 6 + 8 + index;
135 case TGSI_SEMANTIC_CLIPVERTEX:
136 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
137 case TGSI_SEMANTIC_PSIZE:
138 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
139
140 /* These can't be written by LS, HS, and ES. */
141 case TGSI_SEMANTIC_LAYER:
142 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
143 case TGSI_SEMANTIC_VIEWPORT_INDEX:
144 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
145 case TGSI_SEMANTIC_PRIMID:
146 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
147 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
148 default:
149 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
150 assert(!"invalid semantic name");
151 return 0;
152 }
153 }
154
155 /**
156 * Get the value of a shader input parameter and extract a bitfield.
157 */
158 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
159 LLVMValueRef value, unsigned rshift,
160 unsigned bitwidth)
161 {
162 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
163 value = ac_to_integer(&ctx->ac, value);
164
165 if (rshift)
166 value = LLVMBuildLShr(ctx->ac.builder, value,
167 LLVMConstInt(ctx->i32, rshift, 0), "");
168
169 if (rshift + bitwidth < 32) {
170 unsigned mask = (1 << bitwidth) - 1;
171 value = LLVMBuildAnd(ctx->ac.builder, value,
172 LLVMConstInt(ctx->i32, mask, 0), "");
173 }
174
175 return value;
176 }
177
178 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
179 struct ac_arg param, unsigned rshift,
180 unsigned bitwidth)
181 {
182 LLVMValueRef value = ac_get_arg(&ctx->ac, param);
183
184 return unpack_llvm_param(ctx, value, rshift, bitwidth);
185 }
186
187 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
188 LLVMValueRef i32, unsigned index)
189 {
190 assert(index <= 1);
191
192 if (index == 1)
193 return LLVMBuildAShr(ctx->ac.builder, i32,
194 LLVMConstInt(ctx->i32, 16, 0), "");
195
196 return LLVMBuildSExt(ctx->ac.builder,
197 LLVMBuildTrunc(ctx->ac.builder, i32,
198 ctx->ac.i16, ""),
199 ctx->i32, "");
200 }
201
/**
 * Load one vertex shader input attribute.
 *
 * For VS blit shaders (TGSI_PROPERTY_VS_BLIT_SGPRS_AMD set), the inputs are
 * packed into user SGPRs instead of vertex buffers and are reconstructed
 * per-vertex here. For normal VS, the attribute is fetched from its vertex
 * buffer descriptor, with open-coded conversion for formats the hardware
 * cannot fetch directly (see key.mono.vs_fix_fetch / vs_fetch_opencode).
 *
 * \param ctx          shader context
 * \param input_index  index of the VS input
 * \param out          receives the 4 channels of the input, as floats
 *                     (integer data is bitcast via ac_to_float)
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	const struct si_shader_info *info = &ctx->shader->selector->info;
	unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];

	if (vs_blit_property) {
		LLVMValueRef vertex_id = ctx->abi.vertex_id;
		/* Vertices 0 and 1 use (x1, *); vertex 2 uses (x2, *). */
		LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntULE, vertex_id,
						    ctx->i32_1, "");
		/* Use LLVMIntNE, because we have 3 vertices and only
		 * the middle one should use y2.
		 */
		LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntNE, vertex_id,
						    ctx->i32_1, "");

		unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
		if (input_index == 0) {
			/* Position: two SGPRs hold the rectangle corners as
			 * packed signed 16-bit pairs (x1,y1) and (x2,y2). */
			LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
							 param_vs_blit_inputs);
			LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
							 param_vs_blit_inputs + 1);

			LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
			LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
			LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
			LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

			/* Select this vertex's corner. */
			LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
							 x1, x2, "");
			LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
							 y1, y2, "");

			out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
			out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
			/* Third SGPR is passed through as Z — presumably the
			 * blit depth; TODO(review): confirm against the caller
			 * that sets up vs_blit_inputs. */
			out[2] = LLVMGetParam(ctx->main_fn,
					      param_vs_blit_inputs + 2);
			out[3] = ctx->ac.f32_1;
			return;
		}

		/* Color or texture coordinates: */
		assert(input_index == 1);

		if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
			/* A constant color: 4 SGPRs, same for all vertices. */
			for (int i = 0; i < 4; i++) {
				out[i] = LLVMGetParam(ctx->main_fn,
						      param_vs_blit_inputs + 3 + i);
			}
		} else {
			assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
			/* Texcoords: corner coordinates selected like the
			 * position above, plus two pass-through components. */
			LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 3);
			LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 4);
			LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 5);
			LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 6);

			out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
						 x1, x2, "");
			out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
						 y1, y2, "");
			out[2] = LLVMGetParam(ctx->main_fn,
					      param_vs_blit_inputs + 7);
			out[3] = LLVMGetParam(ctx->main_fn,
					      param_vs_blit_inputs + 8);
		}
		return;
	}

	unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
	union si_vs_fix_fetch fix_fetch;
	LLVMValueRef vb_desc;
	LLVMValueRef vertex_index;
	LLVMValueRef tmp;

	/* Get the vertex buffer descriptor: either inlined in user SGPRs
	 * or loaded from the descriptor list. */
	if (input_index < num_vbos_in_user_sgprs) {
		vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
	} else {
		unsigned index= input_index - num_vbos_in_user_sgprs;
		vb_desc = ac_build_load_to_sgpr(&ctx->ac,
						ac_get_arg(&ctx->ac, ctx->vertex_buffers),
						LLVMConstInt(ctx->i32, index, 0));
	}

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->vertex_index0.arg_index +
				    input_index);

	/* Use the open-coded implementation for all loads of doubles and
	 * of dword-sized data that needs fixups. We need to insert conversion
	 * code anyway, and the amd/common code does it for us.
	 *
	 * Note: On LLVM <= 8, we can only open-code formats with
	 * channel size >= 4 bytes.
	 */
	bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
	fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
	if (opencode ||
	    (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
	    (fix_fetch.u.log_size == 2)) {
		tmp = ac_build_opencoded_load_format(
				&ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
				fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
				vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
		for (unsigned i = 0; i < 4; ++i)
			out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
		return;
	}

	/* Do multiple loads for special formats. */
	unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
	LLVMValueRef fetches[4];
	unsigned num_fetches;
	unsigned fetch_stride;
	unsigned channels_per_fetch;

	if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
		/* 3-channel 8/16-bit formats: one fetch per channel. */
		num_fetches = MIN2(required_channels, 3);
		fetch_stride = 1 << fix_fetch.u.log_size;
		channels_per_fetch = 1;
	} else {
		/* Everything else: a single multi-channel fetch. */
		num_fetches = 1;
		fetch_stride = 0;
		channels_per_fetch = required_channels;
	}

	for (unsigned i = 0; i < num_fetches; ++i) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
		fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
							 channels_per_fetch, 0, true);
	}

	/* Scalarize a single vector fetch into per-channel values. */
	if (num_fetches == 1 && channels_per_fetch > 1) {
		LLVMValueRef fetch = fetches[0];
		for (unsigned i = 0; i < channels_per_fetch; ++i) {
			tmp = LLVMConstInt(ctx->i32, i, false);
			fetches[i] = LLVMBuildExtractElement(
				ctx->ac.builder, fetch, tmp, "");
		}
		num_fetches = channels_per_fetch;
		channels_per_fetch = 1;
	}

	for (unsigned i = num_fetches; i < 4; ++i)
		fetches[i] = LLVMGetUndef(ctx->f32);

	if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
	    required_channels == 4) {
		/* 3-channel format read as 4 channels: synthesize W = 1. */
		if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
			fetches[3] = ctx->ac.i32_1;
		else
			fetches[3] = ctx->ac.f32_1;
	} else if (fix_fetch.u.log_size == 3 &&
		   (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
		    fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
		    fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
		   required_channels == 4) {
		/* For 2_10_10_10, the hardware returns an unsigned value;
		 * convert it to a signed one.
		 */
		LLVMValueRef tmp = fetches[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
			tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
		else
			tmp = ac_to_integer(&ctx->ac, tmp);

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(ctx->ac.builder, tmp,
				   fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
			/* Clamp to -1.0: SNORM has two encodings of -1. */
			clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
		}

		fetches[3] = tmp;
	}

	for (unsigned i = 0; i < 4; ++i)
		out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
407
408 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
409 unsigned swizzle)
410 {
411 if (swizzle > 0)
412 return ctx->i32_0;
413
414 switch (ctx->type) {
415 case PIPE_SHADER_VERTEX:
416 return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
417 case PIPE_SHADER_TESS_CTRL:
418 return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
419 case PIPE_SHADER_TESS_EVAL:
420 return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
421 case PIPE_SHADER_GEOMETRY:
422 return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
423 default:
424 assert(0);
425 return ctx->i32_0;
426 }
427 }
428
429 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
430 {
431 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
432
433 /* For non-indexed draws, the base vertex set by the driver
434 * (for direct draws) or the CP (for indirect draws) is the
435 * first vertex ID, but GLSL expects 0 to be returned.
436 */
437 LLVMValueRef vs_state = ac_get_arg(&ctx->ac,
438 ctx->vs_state_bits);
439 LLVMValueRef indexed;
440
441 indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
442 indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
443
444 return LLVMBuildSelect(ctx->ac.builder, indexed,
445 ac_get_arg(&ctx->ac, ctx->args.base_vertex),
446 ctx->i32_0, "");
447 }
448
449 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
450 {
451 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
452
453 LLVMValueRef values[3];
454 LLVMValueRef result;
455 unsigned i;
456 unsigned *properties = ctx->shader->selector->info.properties;
457
458 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
459 unsigned sizes[3] = {
460 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
461 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
462 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
463 };
464
465 for (i = 0; i < 3; ++i)
466 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
467
468 result = ac_build_gather_values(&ctx->ac, values, 3);
469 } else {
470 result = ac_get_arg(&ctx->ac, ctx->block_size);
471 }
472
473 return result;
474 }
475
476 void si_declare_compute_memory(struct si_shader_context *ctx)
477 {
478 struct si_shader_selector *sel = ctx->shader->selector;
479 unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
480
481 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
482 LLVMValueRef var;
483
484 assert(!ctx->ac.lds);
485
486 var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
487 LLVMArrayType(ctx->i8, lds_size),
488 "compute_lds",
489 AC_ADDR_SPACE_LDS);
490 LLVMSetAlignment(var, 64 * 1024);
491
492 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
493 }
494
495 /* Initialize arguments for the shader export intrinsic */
496 static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
497 LLVMValueRef *values,
498 unsigned target,
499 struct ac_export_args *args)
500 {
501 args->enabled_channels = 0xf; /* writemask - default is 0xf */
502 args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
503 args->done = 0; /* Specify whether this is the last export */
504 args->target = target; /* Specify the target we are exporting */
505 args->compr = false;
506
507 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
508 }
509
/**
 * Compute user clip distances from a CLIPVERTEX output.
 *
 * Loads the user clip planes from the driver-internal constant buffer
 * (slot SI_VS_CONST_CLIP_PLANES in rw_buffers) and fills position export
 * slots pos[2] and pos[3] with the dot products of the clip vertex and
 * each plane — 4 distances per export (targets POS+2 and POS+3).
 *
 * \param ctx       shader context
 * \param pos       position export array; entries [2] and [3] are written
 * \param out_elts  the 4 components of the CLIPVERTEX output
 */
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		/* Start each accumulated distance at 0. */
		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < 4; chan++) {
			for (const_chan = 0; const_chan < 4; const_chan++) {
				/* Byte offset of component const_chan of clip
				 * plane (reg_index * 4 + chan): planes are
				 * vec4s of 4-byte floats. */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = si_buffer_load_const(ctx, const_resource,
								addr);
				args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
								out_elts[const_chan], args->out[chan]);
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
550
551 static void si_dump_streamout(struct pipe_stream_output_info *so)
552 {
553 unsigned i;
554
555 if (so->num_outputs)
556 fprintf(stderr, "STREAMOUT\n");
557
558 for (i = 0; i < so->num_outputs; i++) {
559 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
560 so->output[i].start_component;
561 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
562 i, so->output[i].output_buffer,
563 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
564 so->output[i].register_index,
565 mask & 1 ? "x" : "",
566 mask & 2 ? "y" : "",
567 mask & 4 ? "z" : "",
568 mask & 8 ? "w" : "");
569 }
570 }
571
/**
 * Store one shader output to its streamout buffer.
 *
 * Packs the selected components into a scalar or vector and emits a
 * buffer store at this thread's write offset.
 *
 * \param so_buffers        descriptors of the 4 streamout buffers
 * \param so_write_offsets  per-buffer write offsets for this thread
 * \param stream_out        which components go to which buffer/offset
 * \param shader_out        the output's values and per-component streams
 */
void si_emit_streamout_output(struct si_shader_context *ctx,
			      LLVMValueRef const *so_buffers,
			      LLVMValueRef const *so_write_offsets,
			      struct pipe_stream_output *stream_out,
			      struct si_shader_output_values *shader_out)
{
	unsigned buf_idx = stream_out->output_buffer;
	unsigned start = stream_out->start_component;
	unsigned num_comps = stream_out->num_components;
	LLVMValueRef out[4];

	/* Defensive release-build check; the assert documents the contract. */
	assert(num_comps && num_comps <= 4);
	if (!num_comps || num_comps > 4)
		return;

	/* Load the output as int. */
	for (int j = 0; j < num_comps; j++) {
		assert(stream_out->stream == shader_out->vertex_stream[start + j]);

		out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
	}

	/* Pack the output. */
	LLVMValueRef vdata = NULL;

	switch (num_comps) {
	case 1: /* as i32 */
		vdata = out[0];
		break;
	case 2: /* as v2i32 */
	case 3: /* as v3i32 */
		if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
			vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
			break;
		}
		/* as v4i32 (aligned to 4) — pad with undef and fall into
		 * the 4-component case below. */
		out[3] = LLVMGetUndef(ctx->i32);
		/* fall through */
	case 4: /* as v4i32 */
		vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
		break;
	}

	ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
				    vdata, num_comps,
				    so_write_offsets[buf_idx],
				    ctx->i32_0,
				    stream_out->dst_offset * 4, ac_glc | ac_slc);
}
621
/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 *
 * \param outputs  all shader outputs with their values and streams
 * \param noutput  number of entries in \p outputs
 * \param stream   the vertex stream to write (0-3)
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx,
			    struct si_shader_output_values *outputs,
			    unsigned noutput, unsigned stream)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct pipe_stream_output_info *so = &sel->so;
	LLVMBuilderRef builder = ctx->ac.builder;
	int i;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		si_unpack_param(ctx, ctx->streamout_config, 16, 7);

	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	ac_build_ifcc(&ctx->ac, can_emit, 6501);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			ac_get_arg(&ctx->ac,
				   ctx->streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4];
		LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac,
						  ctx->rw_buffers);

		for (i = 0; i < 4; i++) {
			/* A zero stride means the buffer is unused. */
			if (!so->stride[i])
				continue;

			LLVMValueRef offset = LLVMConstInt(ctx->i32,
							   SI_VS_STREAMOUT_BUF0 + i, 0);

			so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
							    ctx->streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
							   LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
							   so_offset);
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned reg = so->output[i].register_index;

			if (reg >= noutput)
				continue;

			if (stream != so->output[i].stream)
				continue;

			si_emit_streamout_output(ctx, so_buffers, so_write_offset,
						 &so->output[i], &outputs[reg]);
		}
	}
	ac_build_endif(&ctx->ac, 6501);
}
704
705 static void si_export_param(struct si_shader_context *ctx, unsigned index,
706 LLVMValueRef *values)
707 {
708 struct ac_export_args args;
709
710 si_llvm_init_vs_export_args(ctx, values,
711 V_008DFC_SQ_EXP_PARAM + index, &args);
712 ac_build_export(&ctx->ac, &args);
713 }
714
/**
 * Emit all parameter (varying) exports for a hardware VS.
 *
 * Skips outputs with non-exportable semantics, outputs whose components
 * all go to non-zero vertex streams, and outputs killed by the
 * key.opt.kill_outputs optimization. Fills in
 * shader->info.vs_output_param_offset and nr_param_exports.
 */
static void si_build_param_exports(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	unsigned param_count = 0;

	for (unsigned i = 0; i < noutput; i++) {
		unsigned semantic_name = outputs[i].semantic_name;
		unsigned semantic_index = outputs[i].semantic_index;

		/* Skip outputs with no component in vertex stream 0. */
		if (outputs[i].vertex_stream[0] != 0 &&
		    outputs[i].vertex_stream[1] != 0 &&
		    outputs[i].vertex_stream[2] != 0 &&
		    outputs[i].vertex_stream[3] != 0)
			continue;

		switch (semantic_name) {
		case TGSI_SEMANTIC_LAYER:
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
		case TGSI_SEMANTIC_CLIPDIST:
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			break;
		default:
			/* Not an exportable parameter semantic. */
			continue;
		}

		/* Skip outputs the shader key marks as unused (killed). */
		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
		     semantic_index < SI_MAX_IO_GENERIC) &&
		    shader->key.opt.kill_outputs &
		    (1ull << si_shader_io_get_unique_index(semantic_name,
							   semantic_index, true)))
			continue;

		si_export_param(ctx, param_count, outputs[i].values);

		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[i] = param_count++;
	}

	shader->info.nr_param_exports = param_count;
}
762
763 /**
764 * Vertex color clamping.
765 *
766 * This uses a state constant loaded in a user data SGPR and
767 * an IF statement is added that clamps all colors if the constant
768 * is true.
769 */
770 static void si_vertex_color_clamping(struct si_shader_context *ctx,
771 struct si_shader_output_values *outputs,
772 unsigned noutput)
773 {
774 LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
775 bool has_colors = false;
776
777 /* Store original colors to alloca variables. */
778 for (unsigned i = 0; i < noutput; i++) {
779 if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
780 outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
781 continue;
782
783 for (unsigned j = 0; j < 4; j++) {
784 addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->f32, "");
785 LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
786 }
787 has_colors = true;
788 }
789
790 if (!has_colors)
791 return;
792
793 /* The state is in the first bit of the user SGPR. */
794 LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
795 cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->i1, "");
796
797 ac_build_ifcc(&ctx->ac, cond, 6502);
798
799 /* Store clamped colors to alloca variables within the conditional block. */
800 for (unsigned i = 0; i < noutput; i++) {
801 if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
802 outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
803 continue;
804
805 for (unsigned j = 0; j < 4; j++) {
806 LLVMBuildStore(ctx->ac.builder,
807 ac_build_clamp(&ctx->ac, outputs[i].values[j]),
808 addr[i][j]);
809 }
810 }
811 ac_build_endif(&ctx->ac, 6502);
812
813 /* Load clamped colors */
814 for (unsigned i = 0; i < noutput; i++) {
815 if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
816 outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
817 continue;
818
819 for (unsigned j = 0; j < 4; j++) {
820 outputs[i].values[j] =
821 LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
822 }
823 }
824 }
825
/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 *
 * Collects position-related outputs (POSITION, PSIZE, LAYER, VIEWPORT_INDEX,
 * EDGEFLAG, CLIPDIST/CLIPVERTEX) into up to 4 position exports, emits them,
 * and then emits the parameter exports. Updates shader->info.nr_pos_exports.
 */
void si_llvm_export_vs(struct si_shader_context *ctx,
		       struct si_shader_output_values *outputs,
		       unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	struct ac_export_args pos_args[4] = {};
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned pos_idx;
	int i;

	si_vertex_color_clamping(ctx, outputs, noutput);

	/* Build position exports. */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			si_llvm_init_vs_export_args(ctx, outputs[i].values,
						    V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (!shader->key.opt.clip_disable) {
				/* Clip distances occupy POS+2 and POS+3. */
				unsigned index = 2 + outputs[i].semantic_index;
				si_llvm_init_vs_export_args(ctx, outputs[i].values,
							    V_008DFC_SQ_EXP_POS + index,
							    &pos_args[index]);
			}
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (!shader->key.opt.clip_disable) {
				si_llvm_emit_clipvertex(ctx, pos_args,
							outputs[i].values);
			}
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = ctx->ac.f32_0; /* X */
		pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
		pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
		pos_args[0].out[3] = ctx->ac.f32_1; /* W */
	}

	/* The edge flag is not exported from here when NGG is enabled. */
	bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
				   !shader->key.as_ngg;

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    pos_writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (pos_writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2);

		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = ctx->ac.f32_0; /* X */
		pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
		pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
		pos_args[1].out[3] = ctx->ac.f32_0; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (pos_writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = ac_build_umin(&ctx->ac,
						       edgeflag_value,
						       ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
		}

		if (ctx->screen->info.chip_class >= GFX9) {
			/* GFX9 has the layer in out.z[10:0] and the viewport
			 * index in out.z[19:16].
			 */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				LLVMValueRef v = viewport_index_value;

				/* Merge the viewport index into Z at bit 16. */
				v = ac_to_integer(&ctx->ac, v);
				v = LLVMBuildShl(ctx->ac.builder, v,
						 LLVMConstInt(ctx->i32, 16, 0), "");
				v = LLVMBuildOr(ctx->ac.builder, v,
						ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
				pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
				pos_args[1].enabled_channels |= 1 << 2;
			}
		} else {
			/* Pre-GFX9: layer in Z, viewport index in W. */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	/* Count the exports that were actually filled in. */
	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	/* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
	 * Setting valid_mask=1 prevents it and has no other effect.
	 */
	if (ctx->screen->info.family == CHIP_NAVI10 ||
	    ctx->screen->info.family == CHIP_NAVI12 ||
	    ctx->screen->info.family == CHIP_NAVI14)
		pos_args[0].valid_mask = 1;

	/* Emit the position exports with densely packed targets. */
	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports. */
	si_build_param_exports(ctx, outputs, noutput);
}
987
/* Epilogue for a hardware VS: gather all written outputs from their LLVM
 * allocas, optionally emit legacy streamout, optionally append PrimitiveID,
 * and export everything (positions + parameters) to the hardware.
 *
 * \param abi          shader ABI; converted back to si_shader_context
 * \param max_outputs  upper bound on outputs (asserted, not enforced)
 * \param addrs        flat array of allocas, 4 channels per output
 */
static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* +1 reserves room for the optional PrimitiveID output below.
	 * NOTE(review): MALLOC result is not checked before use. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Load every output channel and record its vertex stream (2 bits
	 * per channel packed in output_streams[i]). */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		for (j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(ctx->ac.builder,
					      addrs[4 * i + j],
					      "");
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	/* Legacy (non-NGG) streamout uses the loaded values directly. */
	if (!ctx->screen->use_ngg_streamout &&
	    ctx->shader->selector->so.num_outputs)
		si_llvm_emit_streamout(ctx, outputs, i, 0);

	/* Export PrimitiveID. */
	if (ctx->shader->key.mono.u.vs_export_prim_id) {
		/* Appended in the reserved extra slot; `i` is bumped so the
		 * export below includes it. */
		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
		outputs[i].semantic_index = 0;
		outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
		for (j = 1; j < 4; j++)
			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);

		memset(outputs[i].vertex_stream, 0,
		       sizeof(outputs[i].vertex_stream));
		i++;
	}

	si_llvm_export_vs(ctx, outputs, i);
	FREE(outputs);
}
1036
1037 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
1038 unsigned max_outputs,
1039 LLVMValueRef *addrs)
1040 {
1041 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1042 struct si_shader_info *info = &ctx->shader->selector->info;
1043 LLVMValueRef pos[4] = {};
1044
1045 assert(info->num_outputs <= max_outputs);
1046
1047 for (unsigned i = 0; i < info->num_outputs; i++) {
1048 if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
1049 continue;
1050
1051 for (unsigned chan = 0; chan < 4; chan++)
1052 pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
1053 break;
1054 }
1055 assert(pos[0] != NULL);
1056
1057 /* Return the position output. */
1058 LLVMValueRef ret = ctx->return_value;
1059 for (unsigned chan = 0; chan < 4; chan++)
1060 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
1061 ctx->return_value = ret;
1062 }
1063
/* Declare the SGPR inputs used by legacy streamout (transform feedback).
 * The order of ac_add_arg calls defines the shader's input ABI, so it
 * must not be changed.
 *
 * With NGG streamout the state is handled elsewhere; TES still declares a
 * placeholder SGPR, presumably to keep its input layout consistent with
 * the non-streamout case — TODO confirm. */
static void declare_streamout_params(struct si_shader_context *ctx,
				     struct pipe_stream_output_info *so)
{
	if (ctx->screen->use_ngg_streamout) {
		if (ctx->type == PIPE_SHADER_TESS_EVAL)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		return;
	}

	/* Streamout SGPRs. */
	if (so->num_outputs) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
	} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
		/* Placeholder so TES keeps the same SGPR layout without streamout. */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
	}

	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (int i = 0; i < 4; i++) {
		if (!so->stride[i])
			continue;

		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
	}
}
1089
1090 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
1091 {
1092 switch (shader->selector->type) {
1093 case PIPE_SHADER_VERTEX:
1094 case PIPE_SHADER_TESS_EVAL:
1095 return shader->key.as_ngg ? 128 : 0;
1096
1097 case PIPE_SHADER_TESS_CTRL:
1098 /* Return this so that LLVM doesn't remove s_barrier
1099 * instructions on chips where we use s_barrier. */
1100 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
1101
1102 case PIPE_SHADER_GEOMETRY:
1103 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
1104
1105 case PIPE_SHADER_COMPUTE:
1106 break; /* see below */
1107
1108 default:
1109 return 0;
1110 }
1111
1112 const unsigned *properties = shader->selector->info.properties;
1113 unsigned max_work_group_size =
1114 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
1115 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
1116 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
1117
1118 if (!max_work_group_size) {
1119 /* This is a variable group size compute shader,
1120 * compile it for the maximum possible group size.
1121 */
1122 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
1123 }
1124 return max_work_group_size;
1125 }
1126
1127 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
1128 bool assign_params)
1129 {
1130 enum ac_arg_type const_shader_buf_type;
1131
1132 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
1133 ctx->shader->selector->info.shader_buffers_declared == 0)
1134 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
1135 else
1136 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
1137
1138 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
1139 assign_params ? &ctx->const_and_shader_buffers :
1140 &ctx->other_const_and_shader_buffers);
1141 }
1142
1143 static void declare_samplers_and_images(struct si_shader_context *ctx,
1144 bool assign_params)
1145 {
1146 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
1147 assign_params ? &ctx->samplers_and_images :
1148 &ctx->other_samplers_and_images);
1149 }
1150
/* Declare the per-stage descriptor pointers: const/shader buffers first,
 * then samplers/images. The call order defines the SGPR layout (ABI). */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
					    bool assign_params)
{
	declare_const_and_shader_buffers(ctx, assign_params);
	declare_samplers_and_images(ctx, assign_params);
}
1157
/* Declare the descriptor pointers shared by all stages: the internal
 * RW-buffer descriptors and the bindless sampler/image heap. The call
 * order defines the SGPR layout (ABI). */
static void declare_global_desc_pointers(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
		   &ctx->rw_buffers);
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
		   &ctx->bindless_samplers_and_images);
}
1165
1166 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
1167 {
1168 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
1169 if (!ctx->shader->is_gs_copy_shader) {
1170 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
1171 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
1172 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
1173 }
1174 }
1175
1176 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
1177 {
1178 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);
1179
1180 unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
1181 if (num_vbos_in_user_sgprs) {
1182 unsigned user_sgprs = ctx->args.num_sgprs_used;
1183
1184 if (si_is_merged_shader(ctx))
1185 user_sgprs -= 8;
1186 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
1187
1188 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
1189 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
1190 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
1191
1192 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
1193 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
1194 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
1195 }
1196 }
1197
/* Declare the VS input VGPRs. The VGPR order differs per chip generation
 * and per LS/ES/VS configuration and is fixed by hardware — do not reorder.
 *
 * \param num_prolog_vgprs  incremented by the number of VGPRs that the
 *                          prolog (not the API shader) provides
 */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   unsigned *num_prolog_vgprs)
{
	struct si_shader *shader = ctx->shader;

	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
	if (shader->key.as_ls) {
		/* LS (VS before TCS): rel_auto_id + instance ID; GFX10 swaps
		 * the instance-ID slot with a user VGPR. */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
		if (ctx->screen->info.chip_class >= GFX10) {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
		} else {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
		}
	} else if (ctx->screen->info.chip_class >= GFX10) {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
			   &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
	} else {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
	}

	if (!shader->is_gs_copy_shader) {
		/* Vertex load indices: one VGPR per VS input, filled in by the
		 * prolog, hence counted into *num_prolog_vgprs. */
		if (shader->selector->info.num_inputs) {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->vertex_index0);
			for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
				ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
		}
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
1235
1236 static void declare_vs_blit_inputs(struct si_shader_context *ctx,
1237 unsigned vs_blit_property)
1238 {
1239 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
1240 &ctx->vs_blit_inputs); /* i16 x1, y1 */
1241 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */
1242 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
1243
1244 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
1245 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
1246 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
1247 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
1248 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
1249 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
1250 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
1251 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
1252 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
1253 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
1254 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
1255 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
1256 }
1257 }
1258
/* Declare the TES input VGPRs: tess coords (u, v), the relative patch ID,
 * and the patch ID. Order is fixed by hardware — do not reorder. */
static void declare_tes_input_vgprs(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
}
1266
/* Extra shader-type values for the merged stages used on GFX9+. They start
 * at PIPE_SHADER_TYPES so they never collide with real pipe shader types. */
enum {
	/* Convenient merged shader definitions. */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
1272
/* Add an argument like ac_add_arg, but assert that it lands at the expected
 * argument index. Used where the hardware fixes the location of an input
 * (e.g. the PS SI_PARAM_* slots). */
void si_add_arg_checked(struct ac_shader_args *args,
			enum ac_arg_regfile file,
			unsigned registers, enum ac_arg_type type,
			struct ac_arg *arg,
			unsigned idx)
{
	assert(args->arg_count == idx);
	ac_add_arg(args, file, registers, type, arg);
}
1282
/* Declare the main LLVM function for the shader: its input SGPRs/VGPRs and
 * return values, following the hardware input layout of each stage (and of
 * the GFX9+ merged stages). The ac_add_arg call order IS the ABI — do not
 * reorder any of the declarations below.
 *
 * Also sets shader->info.num_input_sgprs/vgprs, declares LDS for LS/HS,
 * and seeds the ABI values that the prolog would otherwise provide.
 */
void si_create_function(struct si_shader_context *ctx)
{
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef returns[AC_MAX_ARGS];
	unsigned i, num_return_sgprs;
	unsigned num_returns = 0;
	unsigned num_prolog_vgprs = 0;
	unsigned type = ctx->type;
	unsigned vs_blit_property =
		shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];

	memset(&ctx->args, 0, sizeof(ctx->args));

	/* Set MERGED shaders. */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
			type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
		else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
			type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
	}

	switch (type) {
	case PIPE_SHADER_VERTEX:
		declare_global_desc_pointers(ctx);

		/* Blit shaders have a reduced, fixed SGPR layout. */
		if (vs_blit_property) {
			declare_vs_blit_inputs(ctx, vs_blit_property);

			/* VGPRs */
			declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
			break;
		}

		declare_per_stage_desc_pointers(ctx, true);
		declare_vs_specific_input_sgprs(ctx);
		if (!shader->is_gs_copy_shader)
			declare_vb_descriptor_input_sgprs(ctx);

		if (shader->key.as_es) {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
				   &ctx->es2gs_offset);
		} else if (shader->key.as_ls) {
			/* no extra parameters */
		} else {
			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so);
		}

		/* VGPRs */
		declare_vs_input_vgprs(ctx, &num_prolog_vgprs);

		/* Return values */
		if (shader->key.opt.vs_as_prim_discard_cs) {
			for (i = 0; i < 4; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);

		/* VGPRs */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);

		/* param_tcs_offchip_offset and param_tcs_factor_offset are
		 * placed after the user SGPRs.
		 */
		for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
			returns[num_returns++] = ctx->i32; /* SGPRs */
		for (i = 0; i < 11; i++)
			returns[num_returns++] = ctx->f32; /* VGPRs */
		break;

	case SI_SHADER_MERGED_VERTEX_TESSCTRL:
		/* Merged stages have 8 system SGPRs at the beginning. */
		/* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
		declare_per_stage_desc_pointers(ctx,
						ctx->type == PIPE_SHADER_TESS_CTRL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx,
						ctx->type == PIPE_SHADER_VERTEX);
		declare_vs_specific_input_sgprs(ctx);

		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
		declare_vb_descriptor_input_sgprs(ctx);

		/* VGPRs (first TCS, then VS) */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, &num_prolog_vgprs);

			/* LS return values are inputs to the TCS main shader part. */
			for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 2; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		} else {
			/* TCS return values are inputs to the TCS epilog.
			 *
			 * param_tcs_offchip_offset, param_tcs_factor_offset,
			 * param_tcs_offchip_layout, and param_rw_buffers
			 * should be passed to the epilog.
			 */
			for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 11; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
		/* Merged stages have 8 system SGPRs at the beginning. */
		/* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
		declare_per_stage_desc_pointers(ctx,
						ctx->type == PIPE_SHADER_GEOMETRY);

		if (ctx->shader->key.as_ngg)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
		else
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);

		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */

		declare_global_desc_pointers(ctx);
		if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
			declare_per_stage_desc_pointers(ctx,
							(ctx->type == PIPE_SHADER_VERTEX ||
							 ctx->type == PIPE_SHADER_TESS_EVAL));
		}

		if (ctx->type == PIPE_SHADER_VERTEX) {
			if (vs_blit_property)
				declare_vs_blit_inputs(ctx, vs_blit_property);
			else
				declare_vs_specific_input_sgprs(ctx);
		} else {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
			/* Declare as many input SGPRs as the VS has. */
		}

		if (ctx->type == PIPE_SHADER_VERTEX)
			declare_vb_descriptor_input_sgprs(ctx);

		/* VGPRs (first GS, then VS/TES) */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
			declare_tes_input_vgprs(ctx);
		}

		if (ctx->shader->key.as_es &&
		    (ctx->type == PIPE_SHADER_VERTEX ||
		     ctx->type == PIPE_SHADER_TESS_EVAL)) {
			unsigned num_user_sgprs;

			if (ctx->type == PIPE_SHADER_VERTEX)
				num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
			else
				num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;

			/* ES return values are inputs to GS. */
			for (i = 0; i < 8 + num_user_sgprs; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 5; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);

		if (shader->key.as_es) {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
		} else {
			declare_streamout_params(ctx, &shader->selector->so);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		}

		/* VGPRs */
		declare_tes_input_vgprs(ctx);
		break;

	case PIPE_SHADER_GEOMETRY:
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);

		/* VGPRs */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
		break;

	case PIPE_SHADER_FRAGMENT:
		/* PS inputs live at fixed SI_PARAM_* slots, hence the checked
		 * variant that asserts each index. */
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
				   SI_PARAM_ALPHA_REF);
		si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
				   &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);

		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
				   SI_PARAM_PERSP_SAMPLE);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
				   NULL, SI_PARAM_PERSP_PULL_MODEL);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
				   NULL, SI_PARAM_LINE_STIPPLE_TEX);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
		/* Record the VGPR indices the prolog patches in place. */
		shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->args.front_face, SI_PARAM_FRONT_FACE);
		shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->args.ancillary, SI_PARAM_ANCILLARY);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);

		/* Color inputs from the prolog. */
		if (shader->selector->info.colors_read) {
			unsigned num_color_elements =
				util_bitcount(shader->selector->info.colors_read);

			for (i = 0; i < num_color_elements; i++)
				ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

			num_prolog_vgprs += num_color_elements;
		}

		/* Outputs for the epilog. */
		num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
		num_returns =
			num_return_sgprs +
			util_bitcount(shader->selector->info.colors_written) * 4 +
			shader->selector->info.writes_z +
			shader->selector->info.writes_stencil +
			shader->selector->info.writes_samplemask +
			1 /* SampleMaskIn */;

		num_returns = MAX2(num_returns,
				   num_return_sgprs +
				   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

		for (i = 0; i < num_return_sgprs; i++)
			returns[i] = ctx->i32;
		for (; i < num_returns; i++)
			returns[i] = ctx->f32;
		break;

	case PIPE_SHADER_COMPUTE:
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		if (shader->selector->info.uses_grid_size)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
				   &ctx->args.num_work_groups);
		if (shader->selector->info.uses_block_size &&
		    shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);

		unsigned cs_user_data_dwords =
			shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
		if (cs_user_data_dwords) {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
				   &ctx->cs_user_data);
		}

		/* Hardware SGPRs. */
		for (i = 0; i < 3; i++) {
			if (shader->selector->info.uses_block_id[i]) {
				ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
					   &ctx->args.workgroup_ids[i]);
			}
		}
		if (shader->selector->info.uses_subgroup_info)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);

		/* Hardware VGPRs. */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
			   &ctx->args.local_invocation_ids);
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	si_llvm_create_func(ctx, "main", returns, num_returns,
			    si_get_max_workgroup_size(shader));

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
		ac_llvm_add_target_dep_function_attr(ctx->main_fn,
						     "InitialPSInputAddr",
						     S_0286D0_PERSP_SAMPLE_ENA(1) |
						     S_0286D0_PERSP_CENTER_ENA(1) |
						     S_0286D0_PERSP_CENTROID_ENA(1) |
						     S_0286D0_LINEAR_SAMPLE_ENA(1) |
						     S_0286D0_LINEAR_CENTER_ENA(1) |
						     S_0286D0_LINEAR_CENTROID_ENA(1) |
						     S_0286D0_FRONT_FACE_ENA(1) |
						     S_0286D0_ANCILLARY_ENA(1) |
						     S_0286D0_POS_FIXED_PT_ENA(1));
	}

	shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
	shader->info.num_input_vgprs = ctx->args.num_vgprs_used;

	/* VGPRs filled by the prolog don't count as API-shader inputs. */
	assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
	shader->info.num_input_vgprs -= num_prolog_vgprs;

	if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
			/* The LSHS size is not known until draw time, so we append it
			 * at the end of whatever LDS use there may be in the rest of
			 * the shader (currently none, unless LLVM decides to do its
			 * own LDS-based lowering).
			 */
			ctx->ac.lds = LLVMAddGlobalInAddressSpace(
				ctx->ac.module, LLVMArrayType(ctx->i32, 0),
				"__lds_end", AC_ADDR_SPACE_LDS);
			LLVMSetAlignment(ctx->ac.lds, 256);
		} else {
			ac_declare_lds_as_pointer(&ctx->ac);
		}
	}

	/* Unlike radv, we override these arguments in the prolog, so to the
	 * API shader they appear as normal arguments.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
		ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
	} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
		ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
		ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
	}
}
1680
1681 /* For the UMR disassembler. */
1682 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
1683 #define DEBUGGER_NUM_MARKERS 5
1684
/* Open all ELF parts of the shader (prolog, previous merged stage, prolog2,
 * main part, epilog) with the rtld linker, declaring the shared LDS symbols
 * (esgs_ring / ngg_emit) needed on GFX9+. Also derives config.lds_size in
 * hardware allocation-granularity units.
 *
 * Returns the ac_rtld_open result; the caller owns *rtld and must close it.
 *
 * NOTE(review): `sel` is NULL-checked in the first condition below but
 * dereferenced unconditionally in the second condition and in the
 * .shader_type initializer — either the NULL check is vestigial or those
 * uses can crash; verify whether sel can actually be NULL here.
 */
static bool si_shader_binary_open(struct si_screen *screen,
				  struct si_shader *shader,
				  struct ac_rtld_binary *rtld)
{
	const struct si_shader_selector *sel = shader->selector;
	const char *part_elfs[5];
	size_t part_sizes[5];
	unsigned num_parts = 0;

#define add_part(shader_or_part) \
	if (shader_or_part) { \
		part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
		part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
		num_parts++; \
	}

	add_part(shader->prolog);
	add_part(shader->previous_stage);
	add_part(shader->prolog2);
	add_part(shader);
	add_part(shader->epilog);

#undef add_part

	struct ac_rtld_symbol lds_symbols[2];
	unsigned num_lds_symbols = 0;

	if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
	    (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
		/* We add this symbol even on LLVM <= 8 to ensure that
		 * shader->config.lds_size is set correctly below.
		 */
		struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
		sym->name = "esgs_ring";
		sym->size = shader->gs_info.esgs_ring_size;
		sym->align = 64 * 1024;
	}

	if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
		struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
		sym->name = "ngg_emit";
		sym->size = shader->ngg.ngg_emit_size * 4;
		sym->align = 4;
	}

	bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
			.info = &screen->info,
			.options = {
				.halt_at_entry = screen->options.halt_shaders,
			},
			.shader_type = tgsi_processor_to_shader_stage(sel->type),
			.wave_size = si_get_shader_wave_size(shader),
			.num_parts = num_parts,
			.elf_ptrs = part_elfs,
			.elf_sizes = part_sizes,
			.num_shared_lds_symbols = num_lds_symbols,
			.shared_lds_symbols = lds_symbols });

	/* Convert the linked LDS size into hardware allocation units
	 * (512 dwords of granularity on GFX7+, 256 before). */
	if (rtld->lds_size > 0) {
		unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
		shader->config.lds_size =
			align(rtld->lds_size, alloc_granularity) / alloc_granularity;
	}

	return ok;
}
1751
1752 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
1753 {
1754 struct ac_rtld_binary rtld;
1755 si_shader_binary_open(screen, shader, &rtld);
1756 return rtld.exec_size;
1757 }
1758
1759 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
1760 {
1761 uint64_t *scratch_va = data;
1762
1763 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
1764 *value = (uint32_t)*scratch_va;
1765 return true;
1766 }
1767 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
1768 /* Enable scratch coalescing. */
1769 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
1770 S_008F04_SWIZZLE_ENABLE(1);
1771 return true;
1772 }
1773
1774 return false;
1775 }
1776
1777 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1778 uint64_t scratch_va)
1779 {
1780 struct ac_rtld_binary binary;
1781 if (!si_shader_binary_open(sscreen, shader, &binary))
1782 return false;
1783
1784 si_resource_reference(&shader->bo, NULL);
1785 shader->bo = si_aligned_buffer_create(&sscreen->b,
1786 sscreen->info.cpdma_prefetch_writes_memory ?
1787 0 : SI_RESOURCE_FLAG_READ_ONLY,
1788 PIPE_USAGE_IMMUTABLE,
1789 align(binary.rx_size, SI_CPDMA_ALIGNMENT),
1790 256);
1791 if (!shader->bo)
1792 return false;
1793
1794 /* Upload. */
1795 struct ac_rtld_upload_info u = {};
1796 u.binary = &binary;
1797 u.get_external_symbol = si_get_external_symbol;
1798 u.cb_data = &scratch_va;
1799 u.rx_va = shader->bo->gpu_address;
1800 u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
1801 PIPE_TRANSFER_READ_WRITE |
1802 PIPE_TRANSFER_UNSYNCHRONIZED |
1803 RADEON_TRANSFER_TEMPORARY);
1804 if (!u.rx_ptr)
1805 return false;
1806
1807 bool ok = ac_rtld_upload(&u);
1808
1809 sscreen->ws->buffer_unmap(shader->bo->buf);
1810 ac_rtld_close(&binary);
1811
1812 return ok;
1813 }
1814
1815 static void si_shader_dump_disassembly(struct si_screen *screen,
1816 const struct si_shader_binary *binary,
1817 enum pipe_shader_type shader_type,
1818 unsigned wave_size,
1819 struct pipe_debug_callback *debug,
1820 const char *name, FILE *file)
1821 {
1822 struct ac_rtld_binary rtld_binary;
1823
1824 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
1825 .info = &screen->info,
1826 .shader_type = tgsi_processor_to_shader_stage(shader_type),
1827 .wave_size = wave_size,
1828 .num_parts = 1,
1829 .elf_ptrs = &binary->elf_buffer,
1830 .elf_sizes = &binary->elf_size }))
1831 return;
1832
1833 const char *disasm;
1834 size_t nbytes;
1835
1836 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
1837 goto out;
1838
1839 if (nbytes > INT_MAX)
1840 goto out;
1841
1842 if (debug && debug->debug_message) {
1843 /* Very long debug messages are cut off, so send the
1844 * disassembly one line at a time. This causes more
1845 * overhead, but on the plus side it simplifies
1846 * parsing of resulting logs.
1847 */
1848 pipe_debug_message(debug, SHADER_INFO,
1849 "Shader Disassembly Begin");
1850
1851 uint64_t line = 0;
1852 while (line < nbytes) {
1853 int count = nbytes - line;
1854 const char *nl = memchr(disasm + line, '\n', nbytes - line);
1855 if (nl)
1856 count = nl - (disasm + line);
1857
1858 if (count) {
1859 pipe_debug_message(debug, SHADER_INFO,
1860 "%.*s", count, disasm + line);
1861 }
1862
1863 line += count + 1;
1864 }
1865
1866 pipe_debug_message(debug, SHADER_INFO,
1867 "Shader Disassembly End");
1868 }
1869
1870 if (file) {
1871 fprintf(file, "Shader %s disassembly:\n", name);
1872 fprintf(file, "%*s", (int)nbytes, disasm);
1873 }
1874
1875 out:
1876 ac_rtld_close(&rtld_binary);
1877 }
1878
/* Estimate the maximum number of Wave64 waves per SIMD this shader can
 * have in flight, limited by its SGPR, VGPR, and (for PS/CS) LDS usage.
 * The result is stored in shader->info.max_simd_waves.
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
	struct si_screen *sscreen = shader->selector->screen;
	struct ac_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector->info.num_inputs;
	/* LDS allocation granularity in bytes: 512 on GFX7+, 256 before.
	 * conf->lds_size is expressed in these blocks. */
	unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves;

	max_simd_waves = sscreen->info.max_wave64_per_simd;

	/* Compute LDS usage for PS. */
	switch (shader->selector->type) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			/* Attribute the workgroup's LDS evenly across its waves. */
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size,
						    sscreen->compute_wave_size);
		}
		break;
	default:;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		max_simd_waves =
			MIN2(max_simd_waves,
			     sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
	}

	if (conf->num_vgprs) {
		/* Always print wave limits as Wave64, so that we can compare
		 * Wave32 and Wave64 with shader-db fairly. */
		unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
		max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
	}

	/* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied).
	 *
	 * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
	 */
	unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
	unsigned max_lds_per_simd = max_lds_size / 4;
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

	shader->info.max_simd_waves = max_simd_waves;
}
1944
1945 void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
1946 struct si_shader *shader,
1947 struct pipe_debug_callback *debug)
1948 {
1949 const struct ac_shader_config *conf = &shader->config;
1950
1951 if (screen->options.debug_disassembly)
1952 si_shader_dump_disassembly(screen, &shader->binary,
1953 shader->selector->type,
1954 si_get_shader_wave_size(shader),
1955 debug, "main", NULL);
1956
1957 pipe_debug_message(debug, SHADER_INFO,
1958 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1959 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1960 "Spilled VGPRs: %d PrivMem VGPRs: %d",
1961 conf->num_sgprs, conf->num_vgprs,
1962 si_get_shader_binary_size(screen, shader),
1963 conf->lds_size, conf->scratch_bytes_per_wave,
1964 shader->info.max_simd_waves, conf->spilled_sgprs,
1965 conf->spilled_vgprs, shader->info.private_mem_vgprs);
1966 }
1967
1968 static void si_shader_dump_stats(struct si_screen *sscreen,
1969 struct si_shader *shader,
1970 FILE *file,
1971 bool check_debug_option)
1972 {
1973 const struct ac_shader_config *conf = &shader->config;
1974
1975 if (!check_debug_option ||
1976 si_can_dump_shader(sscreen, shader->selector->type)) {
1977 if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
1978 fprintf(file, "*** SHADER CONFIG ***\n"
1979 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1980 "SPI_PS_INPUT_ENA = 0x%04x\n",
1981 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1982 }
1983
1984 fprintf(file, "*** SHADER STATS ***\n"
1985 "SGPRS: %d\n"
1986 "VGPRS: %d\n"
1987 "Spilled SGPRs: %d\n"
1988 "Spilled VGPRs: %d\n"
1989 "Private memory VGPRs: %d\n"
1990 "Code Size: %d bytes\n"
1991 "LDS: %d blocks\n"
1992 "Scratch: %d bytes per wave\n"
1993 "Max Waves: %d\n"
1994 "********************\n\n\n",
1995 conf->num_sgprs, conf->num_vgprs,
1996 conf->spilled_sgprs, conf->spilled_vgprs,
1997 shader->info.private_mem_vgprs,
1998 si_get_shader_binary_size(sscreen, shader),
1999 conf->lds_size, conf->scratch_bytes_per_wave,
2000 shader->info.max_simd_waves);
2001 }
2002 }
2003
2004 const char *si_get_shader_name(const struct si_shader *shader)
2005 {
2006 switch (shader->selector->type) {
2007 case PIPE_SHADER_VERTEX:
2008 if (shader->key.as_es)
2009 return "Vertex Shader as ES";
2010 else if (shader->key.as_ls)
2011 return "Vertex Shader as LS";
2012 else if (shader->key.opt.vs_as_prim_discard_cs)
2013 return "Vertex Shader as Primitive Discard CS";
2014 else if (shader->key.as_ngg)
2015 return "Vertex Shader as ESGS";
2016 else
2017 return "Vertex Shader as VS";
2018 case PIPE_SHADER_TESS_CTRL:
2019 return "Tessellation Control Shader";
2020 case PIPE_SHADER_TESS_EVAL:
2021 if (shader->key.as_es)
2022 return "Tessellation Evaluation Shader as ES";
2023 else if (shader->key.as_ngg)
2024 return "Tessellation Evaluation Shader as ESGS";
2025 else
2026 return "Tessellation Evaluation Shader as VS";
2027 case PIPE_SHADER_GEOMETRY:
2028 if (shader->is_gs_copy_shader)
2029 return "GS Copy Shader as VS";
2030 else
2031 return "Geometry Shader";
2032 case PIPE_SHADER_FRAGMENT:
2033 return "Pixel Shader";
2034 case PIPE_SHADER_COMPUTE:
2035 return "Compute Shader";
2036 default:
2037 return "Unknown Shader";
2038 }
2039 }
2040
/* Dump everything known about a shader to \p file: the shader key, the
 * recorded LLVM IR (if any was kept — see si_compile_llvm), the
 * disassembly of all binary parts in execution order, and the stats.
 *
 * \param check_debug_option if true, respect the per-stage shader-dump
 *                           debug flags; if false, dump unconditionally
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
		    struct pipe_debug_callback *debug,
		    FILE *file, bool check_debug_option)
{
	enum pipe_shader_type shader_type = shader->selector->type;

	if (!check_debug_option ||
	    si_can_dump_shader(sscreen, shader_type))
		si_dump_shader_key(shader, file);

	/* The LLVM IR string is only present when IR recording was on;
	 * it is only dumped on the unconditional path. */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		if (shader->previous_stage &&
		    shader->previous_stage->binary.llvm_ir_string) {
			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
				si_get_shader_name(shader));
			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
		}

		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (si_can_dump_shader(sscreen, shader_type) &&
	     !(sscreen->debug_flags & DBG(NO_ASM)))) {
		unsigned wave_size = si_get_shader_wave_size(shader);

		fprintf(file, "\n%s:\n", si_get_shader_name(shader));

		/* Dump all existing binary parts in execution order:
		 * prolog, previous merged stage, prolog2, main, epilog. */
		if (shader->prolog)
			si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
						   shader_type, wave_size, debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
						   shader_type, wave_size, debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
						   shader_type, wave_size, debug, "prolog2", file);

		si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
					   wave_size, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
						   shader_type, wave_size, debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}
2092
/* Compile an LLVM module to a shader binary and read the shader config
 * (SGPR/VGPR/LDS/scratch usage) back out of the resulting ELF.
 *
 * \param binary         receives the ELF and optionally the IR string
 * \param conf           receives the config read from the ELF
 * \param less_optimized compile with the lower-optimization pass queue
 * \return 0 on success, non-zero on failure
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct si_shader_binary *binary,
		    struct ac_shader_config *conf,
		    struct ac_llvm_compiler *compiler,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    enum pipe_shader_type shader_type,
		    unsigned wave_size,
		    const char *name,
		    bool less_optimized)
{
	unsigned count = p_atomic_inc_return(&sscreen->num_compilations);

	if (si_can_dump_shader(sscreen, shader_type)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			ac_dump_module(mod);
			fprintf(stderr, "\n");
		}
	}

	/* Keep a copy of the IR for later dumping by si_shader_dump. */
	if (sscreen->record_llvm_ir) {
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* Skip compilation if si_replace_shader supplied a replacement
	 * binary for this compilation number. */
	if (!si_replace_shader(count, binary)) {
		unsigned r = si_llvm_compile(mod, binary, compiler, debug,
					     less_optimized, wave_size);
		if (r)
			return r;
	}

	/* Read the shader config out of the ELF via the runtime linker. */
	struct ac_rtld_binary rtld;
	if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
			.info = &sscreen->info,
			.shader_type = tgsi_processor_to_shader_stage(shader_type),
			.wave_size = wave_size,
			.num_parts = 1,
			.elf_ptrs = &binary->elf_buffer,
			.elf_sizes = &binary->elf_size }))
		return -1;

	bool ok = ac_rtld_read_config(&rtld, conf);
	ac_rtld_close(&rtld);
	if (!ok)
		return -1;

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - GFX6 & GFX7 would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	return 0;
}
2160
2161 static void si_dump_shader_key_vs(const struct si_shader_key *key,
2162 const struct si_vs_prolog_bits *prolog,
2163 const char *prefix, FILE *f)
2164 {
2165 fprintf(f, " %s.instance_divisor_is_one = %u\n",
2166 prefix, prolog->instance_divisor_is_one);
2167 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
2168 prefix, prolog->instance_divisor_is_fetched);
2169 fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n",
2170 prefix, prolog->unpack_instance_id_from_vertex_id);
2171 fprintf(f, " %s.ls_vgpr_fix = %u\n",
2172 prefix, prolog->ls_vgpr_fix);
2173
2174 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
2175 fprintf(f, " mono.vs.fix_fetch = {");
2176 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
2177 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
2178 if (i)
2179 fprintf(f, ", ");
2180 if (!fix.bits)
2181 fprintf(f, "0");
2182 else
2183 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
2184 fix.u.num_channels_m1, fix.u.format);
2185 }
2186 fprintf(f, "}\n");
2187 }
2188
/* Print the full shader key to \p f, one field per line, dispatching on
 * the shader stage. Used by si_shader_dump.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
	const struct si_shader_key *key = &shader->key;
	enum pipe_shader_type shader_type = shader->selector->type;

	fprintf(f, "SHADER KEY\n");

	switch (shader_type) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ls = %u\n", key->as_ls);
		fprintf(f, " as_ngg = %u\n", key->as_ngg);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		/* Primitive-discard compute shader variant of the VS. */
		fprintf(f, " opt.vs_as_prim_discard_cs = %u\n",
			key->opt.vs_as_prim_discard_cs);
		fprintf(f, " opt.cs_prim_type = %s\n",
			tgsi_primitive_names[key->opt.cs_prim_type]);
		fprintf(f, " opt.cs_indexed = %u\n",
			key->opt.cs_indexed);
		fprintf(f, " opt.cs_instancing = %u\n",
			key->opt.cs_instancing);
		fprintf(f, " opt.cs_primitive_restart = %u\n",
			key->opt.cs_primitive_restart);
		fprintf(f, " opt.cs_provoking_vertex_first = %u\n",
			key->opt.cs_provoking_vertex_first);
		fprintf(f, " opt.cs_need_correct_orientation = %u\n",
			key->opt.cs_need_correct_orientation);
		fprintf(f, " opt.cs_cull_front = %u\n",
			key->opt.cs_cull_front);
		fprintf(f, " opt.cs_cull_back = %u\n",
			key->opt.cs_cull_back);
		fprintf(f, " opt.cs_cull_z = %u\n",
			key->opt.cs_cull_z);
		fprintf(f, " opt.cs_halfz_clip_space = %u\n",
			key->opt.cs_halfz_clip_space);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the LS (vertex) stage is merged into the TCS. */
		if (shader->selector->screen->info.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ngg = %u\n", key->as_ngg);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the ES (vertex) stage is merged into the GS. */
		if (shader->selector->screen->info.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
		fprintf(f, " as_ngg = %u\n", key->as_ngg);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
		fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
		fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
		fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
		fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
		break;

	default:
		assert(0);
	}

	/* These opt fields only apply to the last hw VS-like stage. */
	if ((shader_type == PIPE_SHADER_GEOMETRY ||
	     shader_type == PIPE_SHADER_TESS_EVAL ||
	     shader_type == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
2299
2300 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
2301 {
2302 struct si_shader *shader = ctx->shader;
2303 struct si_shader_info *info = &shader->selector->info;
2304
2305 if ((ctx->type != PIPE_SHADER_VERTEX &&
2306 ctx->type != PIPE_SHADER_TESS_EVAL) ||
2307 shader->key.as_ls ||
2308 shader->key.as_es)
2309 return;
2310
2311 ac_optimize_vs_outputs(&ctx->ac,
2312 ctx->main_fn,
2313 shader->info.vs_output_param_offset,
2314 info->num_outputs,
2315 &shader->info.nr_param_exports);
2316 }
2317
2318 static void si_init_exec_from_input(struct si_shader_context *ctx,
2319 struct ac_arg param, unsigned bitoffset)
2320 {
2321 LLVMValueRef args[] = {
2322 ac_get_arg(&ctx->ac, param),
2323 LLVMConstInt(ctx->i32, bitoffset, 0),
2324 };
2325 ac_build_intrinsic(&ctx->ac,
2326 "llvm.amdgcn.init.exec.from.input",
2327 ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
2328 }
2329
2330 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
2331 const struct si_vs_prolog_bits *key)
2332 {
2333 /* VGPR initialization fixup for Vega10 and Raven is always done in the
2334 * VS prolog. */
2335 return sel->vs_needs_prolog ||
2336 key->ls_vgpr_fix ||
2337 key->unpack_instance_id_from_vertex_id;
2338 }
2339
/* Build the LLVM IR for the main part of the shader: set up per-stage
 * ABI callbacks, create the function, declare LDS/ring resources,
 * handle GFX9 merged-shader EXEC initialization, and translate the NIR.
 *
 * \param free_nir if true, \p nir is freed after translation
 * \return true on success
 */
static bool si_build_main_function(struct si_shader_context *ctx,
				   struct nir_shader *nir, bool free_nir)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;

	si_llvm_init_resource_callbacks(ctx);

	/* Choose the output-epilogue callback and other ABI hooks per stage
	 * and per key variant (LS/ES/NGG/prim-discard-CS). */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		if (shader->key.as_ls)
			ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else if (shader->key.opt.vs_as_prim_discard_cs)
			ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
		else if (shader->key.as_ngg)
			ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		ctx->abi.load_base_vertex = get_base_vertex;
		break;
	case PIPE_SHADER_TESS_CTRL:
		si_llvm_init_tcs_callbacks(ctx);
		break;
	case PIPE_SHADER_TESS_EVAL:
		si_llvm_init_tes_callbacks(ctx);

		if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else if (shader->key.as_ngg)
			ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		si_llvm_init_gs_callbacks(ctx);
		break;
	case PIPE_SHADER_FRAGMENT:
		si_llvm_init_ps_callbacks(ctx);
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->abi.load_local_group_size = get_block_size;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	si_create_function(ctx);

	/* Preload ring buffer descriptors used by the stage. */
	if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
		si_preload_esgs_ring(ctx);

	if (ctx->type == PIPE_SHADER_GEOMETRY)
		si_preload_gs_rings(ctx);
	else if (ctx->type == PIPE_SHADER_TESS_EVAL)
		si_llvm_preload_tes_rings(ctx);

	if (ctx->type == PIPE_SHADER_TESS_CTRL &&
	    sel->info.tessfactors_are_def_in_all_invocs) {
		/* Allocas for the invocation-0 tess factors. */
		for (unsigned i = 0; i < 6; i++) {
			ctx->invoc0_tess_factors[i] =
				ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream vertex counters. */
		for (unsigned i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				ac_build_alloca(&ctx->ac, ctx->i32, "");
		}
		if (shader->key.as_ngg) {
			/* Per-stream primitive bookkeeping for NGG GS. */
			for (unsigned i = 0; i < 4; ++i) {
				ctx->gs_curprim_verts[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
				ctx->gs_generated_prims[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
			}

			/* LDS scratch: 8 dwords, or 44 when streamout is used. */
			unsigned scratch_size = 8;
			if (sel->so.num_outputs)
				scratch_size = 44;

			LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);

			/* Size-0 external array: the actual size is resolved at
			 * link time via the "ngg_emit" LDS symbol. */
			ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
			LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
			LLVMSetAlignment(ctx->gs_ngg_emit, 4);
		}
	}

	if (ctx->type != PIPE_SHADER_GEOMETRY &&
	    (shader->key.as_ngg && !shader->key.as_es)) {
		/* Unconditionally declare scratch space base for streamout and
		 * vertex compaction. Whether space is actually allocated is
		 * determined during linking / PM4 creation.
		 *
		 * Add an extra dword per vertex to ensure an odd stride, which
		 * avoids bank conflicts for SoA accesses.
		 */
		if (!gfx10_is_ngg_passthrough(shader))
			si_llvm_declare_esgs_ring(ctx);

		/* This is really only needed when streamout and / or vertex
		 * compaction is enabled.
		 */
		if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
			LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
		}
	}

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 *
	 * NGG vertex and tess eval shaders running as the last
	 * vertex/geometry stage handle execution explicitly using
	 * if-statements.
	 */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (!shader->is_monolithic &&
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
			si_init_exec_from_input(ctx,
						ctx->merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY ||
			   (shader->key.as_ngg && !shader->key.as_es)) {
			LLVMValueRef thread_enabled;
			bool nested_barrier;

			if (!shader->is_monolithic ||
			    (ctx->type == PIPE_SHADER_TESS_EVAL &&
			     (shader->key.as_ngg && !shader->key.as_es)))
				ac_init_exec_full_mask(&ctx->ac);

			if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			    ctx->type == PIPE_SHADER_GEOMETRY) {
				if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
					gfx10_ngg_gs_emit_prologue(ctx);
					nested_barrier = false;
				} else {
					nested_barrier = true;
				}

				thread_enabled = si_is_gs_thread(ctx);
			} else {
				thread_enabled = si_is_es_thread(ctx);
				nested_barrier = false;
			}

			ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
			ctx->merged_wrap_if_label = 11500;
			ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);

			if (nested_barrier) {
				/* Execute a barrier before the second shader in
				 * a merged shader.
				 *
				 * Execute the barrier inside the conditional block,
				 * so that empty waves can jump directly to s_endpgm,
				 * which will also signal the barrier.
				 *
				 * This is possible in gfx9, because an empty wave
				 * for the second shader does not participate in
				 * the epilogue. With NGG, empty waves may still
				 * be required to export data (e.g. GS output vertices),
				 * so we cannot let them exit early.
				 *
				 * If the shader is TCS and the TCS epilog is present
				 * and contains a barrier, it will wait there and then
				 * reach s_endpgm.
				 */
				si_llvm_emit_barrier(ctx);
			}
		}
	}

	if (sel->force_correct_derivs_after_kill) {
		ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
		/* true = don't kill. */
		LLVMBuildStore(ctx->ac.builder, ctx->i1true,
			       ctx->postponed_kill);
	}

	bool success = si_nir_build_llvm(ctx, nir);
	if (free_nir)
		ralloc_free(nir);
	if (!success) {
		fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
		return false;
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
2556
2557 /**
2558 * Compute the VS prolog key, which contains all the information needed to
2559 * build the VS prolog function, and set shader->info bits where needed.
2560 *
2561 * \param info Shader info of the vertex shader.
2562 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
2563 * \param prolog_key Key of the VS prolog
2564 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
2565 * \param key Output shader part key.
2566 */
2567 static void si_get_vs_prolog_key(const struct si_shader_info *info,
2568 unsigned num_input_sgprs,
2569 const struct si_vs_prolog_bits *prolog_key,
2570 struct si_shader *shader_out,
2571 union si_shader_part_key *key)
2572 {
2573 memset(key, 0, sizeof(*key));
2574 key->vs_prolog.states = *prolog_key;
2575 key->vs_prolog.num_input_sgprs = num_input_sgprs;
2576 key->vs_prolog.num_inputs = info->num_inputs;
2577 key->vs_prolog.as_ls = shader_out->key.as_ls;
2578 key->vs_prolog.as_es = shader_out->key.as_es;
2579 key->vs_prolog.as_ngg = shader_out->key.as_ngg;
2580
2581 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
2582 key->vs_prolog.as_ls = 1;
2583 key->vs_prolog.num_merged_next_stage_vgprs = 2;
2584 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
2585 key->vs_prolog.as_es = 1;
2586 key->vs_prolog.num_merged_next_stage_vgprs = 5;
2587 } else if (shader_out->key.as_ngg) {
2588 key->vs_prolog.num_merged_next_stage_vgprs = 5;
2589 }
2590
2591 /* Enable loading the InstanceID VGPR. */
2592 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
2593
2594 if ((key->vs_prolog.states.instance_divisor_is_one |
2595 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
2596 shader_out->info.uses_instanceid = true;
2597 }
2598
2599 /**
2600 * Given a list of shader part functions, build a wrapper function that
2601 * runs them in sequence to form a monolithic shader.
2602 */
2603 void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
2604 unsigned num_parts, unsigned main_part,
2605 unsigned next_shader_first_part)
2606 {
2607 LLVMBuilderRef builder = ctx->ac.builder;
2608 /* PS epilog has one arg per color component; gfx9 merged shader
2609 * prologs need to forward 40 SGPRs.
2610 */
2611 LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
2612 LLVMTypeRef function_type;
2613 unsigned num_first_params;
2614 unsigned num_out, initial_num_out;
2615 ASSERTED unsigned num_out_sgpr; /* used in debug checks */
2616 ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
2617 unsigned num_sgprs, num_vgprs;
2618 unsigned gprs;
2619
2620 memset(&ctx->args, 0, sizeof(ctx->args));
2621
2622 for (unsigned i = 0; i < num_parts; ++i) {
2623 ac_add_function_attr(ctx->ac.context, parts[i], -1,
2624 AC_FUNC_ATTR_ALWAYSINLINE);
2625 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
2626 }
2627
2628 /* The parameters of the wrapper function correspond to those of the
2629 * first part in terms of SGPRs and VGPRs, but we use the types of the
2630 * main part to get the right types. This is relevant for the
2631 * dereferenceable attribute on descriptor table pointers.
2632 */
2633 num_sgprs = 0;
2634 num_vgprs = 0;
2635
2636 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
2637 num_first_params = LLVMCountParamTypes(function_type);
2638
2639 for (unsigned i = 0; i < num_first_params; ++i) {
2640 LLVMValueRef param = LLVMGetParam(parts[0], i);
2641
2642 if (ac_is_sgpr_param(param)) {
2643 assert(num_vgprs == 0);
2644 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
2645 } else {
2646 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
2647 }
2648 }
2649
2650 gprs = 0;
2651 while (gprs < num_sgprs + num_vgprs) {
2652 LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
2653 LLVMTypeRef type = LLVMTypeOf(param);
2654 unsigned size = ac_get_type_size(type) / 4;
2655
2656 /* This is going to get casted anyways, so we don't have to
2657 * have the exact same type. But we do have to preserve the
2658 * pointer-ness so that LLVM knows about it.
2659 */
2660 enum ac_arg_type arg_type = AC_ARG_INT;
2661 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
2662 type = LLVMGetElementType(type);
2663
2664 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2665 if (LLVMGetVectorSize(type) == 4)
2666 arg_type = AC_ARG_CONST_DESC_PTR;
2667 else if (LLVMGetVectorSize(type) == 8)
2668 arg_type = AC_ARG_CONST_IMAGE_PTR;
2669 else
2670 assert(0);
2671 } else if (type == ctx->f32) {
2672 arg_type = AC_ARG_CONST_FLOAT_PTR;
2673 } else {
2674 assert(0);
2675 }
2676 }
2677
2678 ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
2679 size, arg_type, NULL);
2680
2681 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
2682 assert(gprs + size <= num_sgprs + num_vgprs &&
2683 (gprs >= num_sgprs || gprs + size <= num_sgprs));
2684
2685 gprs += size;
2686 }
2687
2688 /* Prepare the return type. */
2689 unsigned num_returns = 0;
2690 LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
2691
2692 last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
2693 return_type = LLVMGetReturnType(last_func_type);
2694
2695 switch (LLVMGetTypeKind(return_type)) {
2696 case LLVMStructTypeKind:
2697 num_returns = LLVMCountStructElementTypes(return_type);
2698 assert(num_returns <= ARRAY_SIZE(returns));
2699 LLVMGetStructElementTypes(return_type, returns);
2700 break;
2701 case LLVMVoidTypeKind:
2702 break;
2703 default:
2704 unreachable("unexpected type");
2705 }
2706
2707 si_llvm_create_func(ctx, "wrapper", returns, num_returns,
2708 si_get_max_workgroup_size(ctx->shader));
2709
2710 if (si_is_merged_shader(ctx))
2711 ac_init_exec_full_mask(&ctx->ac);
2712
2713 /* Record the arguments of the function as if they were an output of
2714 * a previous part.
2715 */
2716 num_out = 0;
2717 num_out_sgpr = 0;
2718
2719 for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
2720 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
2721 LLVMTypeRef param_type = LLVMTypeOf(param);
2722 LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->i32 : ctx->f32;
2723 unsigned size = ac_get_type_size(param_type) / 4;
2724
2725 if (size == 1) {
2726 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
2727 param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
2728 param_type = ctx->i32;
2729 }
2730
2731 if (param_type != out_type)
2732 param = LLVMBuildBitCast(builder, param, out_type, "");
2733 out[num_out++] = param;
2734 } else {
2735 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
2736
2737 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
2738 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
2739 param_type = ctx->i64;
2740 }
2741
2742 if (param_type != vector_type)
2743 param = LLVMBuildBitCast(builder, param, vector_type, "");
2744
2745 for (unsigned j = 0; j < size; ++j)
2746 out[num_out++] = LLVMBuildExtractElement(
2747 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
2748 }
2749
2750 if (ctx->args.args[i].file == AC_ARG_SGPR)
2751 num_out_sgpr = num_out;
2752 }
2753
2754 memcpy(initial, out, sizeof(out));
2755 initial_num_out = num_out;
2756 initial_num_out_sgpr = num_out_sgpr;
2757
2758 /* Now chain the parts. */
2759 LLVMValueRef ret = NULL;
2760 for (unsigned part = 0; part < num_parts; ++part) {
2761 LLVMValueRef in[AC_MAX_ARGS];
2762 LLVMTypeRef ret_type;
2763 unsigned out_idx = 0;
2764 unsigned num_params = LLVMCountParams(parts[part]);
2765
2766 /* Merged shaders are executed conditionally depending
2767 * on the number of enabled threads passed in the input SGPRs. */
2768 if (is_multi_part_shader(ctx) && part == 0) {
2769 LLVMValueRef ena, count = initial[3];
2770
2771 count = LLVMBuildAnd(builder, count,
2772 LLVMConstInt(ctx->i32, 0x7f, 0), "");
2773 ena = LLVMBuildICmp(builder, LLVMIntULT,
2774 ac_get_thread_id(&ctx->ac), count, "");
2775 ac_build_ifcc(&ctx->ac, ena, 6506);
2776 }
2777
2778 /* Derive arguments for the next part from outputs of the
2779 * previous one.
2780 */
2781 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
2782 LLVMValueRef param;
2783 LLVMTypeRef param_type;
2784 bool is_sgpr;
2785 unsigned param_size;
2786 LLVMValueRef arg = NULL;
2787
2788 param = LLVMGetParam(parts[part], param_idx);
2789 param_type = LLVMTypeOf(param);
2790 param_size = ac_get_type_size(param_type) / 4;
2791 is_sgpr = ac_is_sgpr_param(param);
2792
2793 if (is_sgpr) {
2794 ac_add_function_attr(ctx->ac.context, parts[part],
2795 param_idx + 1, AC_FUNC_ATTR_INREG);
2796 } else if (out_idx < num_out_sgpr) {
2797 /* Skip returned SGPRs the current part doesn't
2798 * declare on the input. */
2799 out_idx = num_out_sgpr;
2800 }
2801
2802 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
2803
2804 if (param_size == 1)
2805 arg = out[out_idx];
2806 else
2807 arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
2808
2809 if (LLVMTypeOf(arg) != param_type) {
2810 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
2811 if (LLVMGetPointerAddressSpace(param_type) ==
2812 AC_ADDR_SPACE_CONST_32BIT) {
2813 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
2814 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
2815 } else {
2816 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
2817 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
2818 }
2819 } else {
2820 arg = LLVMBuildBitCast(builder, arg, param_type, "");
2821 }
2822 }
2823
2824 in[param_idx] = arg;
2825 out_idx += param_size;
2826 }
2827
2828 ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
2829
2830 if (is_multi_part_shader(ctx) &&
2831 part + 1 == next_shader_first_part) {
2832 ac_build_endif(&ctx->ac, 6506);
2833
2834 /* The second half of the merged shader should use
2835 * the inputs from the toplevel (wrapper) function,
2836 * not the return value from the last call.
2837 *
2838 * That's because the last call was executed condi-
2839 * tionally, so we can't consume it in the main
2840 * block.
2841 */
2842 memcpy(out, initial, sizeof(initial));
2843 num_out = initial_num_out;
2844 num_out_sgpr = initial_num_out_sgpr;
2845 continue;
2846 }
2847
2848 /* Extract the returned GPRs. */
2849 ret_type = LLVMTypeOf(ret);
2850 num_out = 0;
2851 num_out_sgpr = 0;
2852
2853 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
2854 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
2855
2856 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
2857
2858 for (unsigned i = 0; i < ret_size; ++i) {
2859 LLVMValueRef val =
2860 LLVMBuildExtractValue(builder, ret, i, "");
2861
2862 assert(num_out < ARRAY_SIZE(out));
2863 out[num_out++] = val;
2864
2865 if (LLVMTypeOf(val) == ctx->i32) {
2866 assert(num_out_sgpr + 1 == num_out);
2867 num_out_sgpr = num_out;
2868 }
2869 }
2870 }
2871 }
2872
2873 /* Return the value from the last part. */
2874 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
2875 LLVMBuildRetVoid(builder);
2876 else
2877 LLVMBuildRet(builder, ret);
2878 }
2879
2880 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
2881 struct si_shader_selector *sel)
2882 {
2883 if (!compiler->low_opt_passes)
2884 return false;
2885
2886 /* Assume a slow CPU. */
2887 assert(!sel->screen->info.has_dedicated_vram &&
2888 sel->screen->info.chip_class <= GFX8);
2889
2890 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
2891 * buffer stores. */
2892 return sel->type == PIPE_SHADER_COMPUTE &&
2893 sel->info.num_memory_instructions > 1000;
2894 }
2895
2896 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
2897 bool *free_nir)
2898 {
2899 *free_nir = false;
2900
2901 if (sel->nir) {
2902 return sel->nir;
2903 } else if (sel->nir_binary) {
2904 struct pipe_screen *screen = &sel->screen->b;
2905 const void *options =
2906 screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
2907 sel->type);
2908
2909 struct blob_reader blob_reader;
2910 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
2911 *free_nir = true;
2912 return nir_deserialize(NULL, options, &blob_reader);
2913 }
2914 return NULL;
2915 }
2916
/**
 * Compile a shader's NIR into an ISA binary.
 *
 * Builds LLVM IR for the main shader part; for monolithic shaders it also
 * builds the required prologs/epilogs (and on GFX9+ the main part of the
 * merged previous stage) and stitches all parts together with a wrapper
 * function before compiling to bytecode.
 *
 * \param sscreen   screen
 * \param compiler  LLVM compiler context
 * \param shader    shader to compile; binary, config and info are filled in
 * \param debug     debug callback
 * \return 0 on success, negative on failure
 */
int si_compile_shader(struct si_screen *sscreen,
		      struct ac_llvm_compiler *compiler,
		      struct si_shader *shader,
		      struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	bool free_nir;
	struct nir_shader *nir = get_nir_shader(sel, &free_nir);
	int r = -1;

	/* Dump NIR before doing NIR->LLVM conversion in case the
	 * conversion fails. */
	if (si_can_dump_shader(sscreen, sel->type) &&
	    !(sscreen->debug_flags & DBG(NO_NIR))) {
		nir_print_shader(nir, stderr);
		si_dump_streamout(&sel->so);
	}

	si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
	si_llvm_context_set_ir(&ctx, shader);

	/* Mark all VS output param offsets as unused until proven otherwise. */
	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(shader->info.vs_output_param_offset));

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	if (!si_build_main_function(&ctx, nir, free_nir)) {
		si_llvm_dispose(&ctx);
		return -1;
	}

	if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
		/* Monolithic VS: optionally prepend the VS prolog, then wrap
		 * the parts into a single function. */
		LLVMValueRef parts[2];
		bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);

		parts[1] = ctx.main_fn;

		if (need_prolog) {
			union si_shader_part_key prolog_key;
			si_get_vs_prolog_key(&sel->info,
					     shader->info.num_input_sgprs,
					     &shader->key.part.vs.prolog,
					     shader, &prolog_key);
			prolog_key.vs_prolog.is_monolithic = true;
			si_build_vs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_build_wrapper_function(&ctx, parts + !need_prolog,
					  1 + need_prolog, need_prolog, 0);

		if (ctx.shader->key.opt.vs_as_prim_discard_cs)
			si_build_prim_discard_compute_shader(&ctx);
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
		if (sscreen->info.chip_class >= GFX9) {
			/* GFX9 merges LS and HS into one shader:
			 * [LS prolog?] + LS main + TCS main + TCS epilog */
			struct si_shader_selector *ls = shader->key.part.tcs.ls;
			LLVMValueRef parts[4];
			bool vs_needs_prolog =
				si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);

			/* TCS main part */
			parts[2] = ctx.main_fn;

			/* TCS epilog */
			union si_shader_part_key tcs_epilog_key;
			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
			parts[3] = ctx.main_fn;

			/* VS as LS main part: build the LS stage inside the
			 * same LLVM module using a temporary shader struct. */
			nir = get_nir_shader(ls, &free_nir);
			struct si_shader shader_ls = {};
			shader_ls.selector = ls;
			shader_ls.key.as_ls = 1;
			shader_ls.key.mono = shader->key.mono;
			shader_ls.key.opt = shader->key.opt;
			shader_ls.is_monolithic = true;
			si_llvm_context_set_ir(&ctx, &shader_ls);

			if (!si_build_main_function(&ctx, nir, free_nir)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= ls->info.uses_instanceid;
			parts[1] = ctx.main_fn;

			/* LS prolog */
			if (vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&ls->info,
						     shader_ls.info.num_input_sgprs,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				parts[0] = ctx.main_fn;
			}

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_TESS_CTRL;

			si_build_wrapper_function(&ctx,
						  parts + !vs_needs_prolog,
						  4 - !vs_needs_prolog, vs_needs_prolog,
						  vs_needs_prolog ? 2 : 1);
		} else {
			/* Pre-GFX9: only the TCS epilog needs to be appended. */
			LLVMValueRef parts[2];
			union si_shader_part_key epilog_key;

			parts[0] = ctx.main_fn;

			memset(&epilog_key, 0, sizeof(epilog_key));
			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_llvm_build_tcs_epilog(&ctx, &epilog_key);
			parts[1] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
		}
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
		if (ctx.screen->info.chip_class >= GFX9) {
			/* GFX9 merges ES and GS into one shader:
			 * [ES prolog?] + ES main + GS prolog + GS main */
			struct si_shader_selector *es = shader->key.part.gs.es;
			LLVMValueRef es_prolog = NULL;
			LLVMValueRef es_main = NULL;
			LLVMValueRef gs_prolog = NULL;
			LLVMValueRef gs_main = ctx.main_fn;

			/* GS prolog */
			union si_shader_part_key gs_prolog_key;
			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			gs_prolog_key.gs_prolog.is_monolithic = true;
			gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
			si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
			gs_prolog = ctx.main_fn;

			/* ES main part: built with a temporary shader struct,
			 * same pattern as the LS main part above. */
			nir = get_nir_shader(es, &free_nir);
			struct si_shader shader_es = {};
			shader_es.selector = es;
			shader_es.key.as_es = 1;
			shader_es.key.as_ngg = shader->key.as_ngg;
			shader_es.key.mono = shader->key.mono;
			shader_es.key.opt = shader->key.opt;
			shader_es.is_monolithic = true;
			si_llvm_context_set_ir(&ctx, &shader_es);

			if (!si_build_main_function(&ctx, nir, free_nir)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= es->info.uses_instanceid;
			es_main = ctx.main_fn;

			/* ES prolog */
			if (es->type == PIPE_SHADER_VERTEX &&
			    si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&es->info,
						     shader_es.info.num_input_sgprs,
						     &shader->key.part.gs.vs_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				es_prolog = ctx.main_fn;
			}

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_GEOMETRY;

			/* Prepare the array of shader parts. */
			LLVMValueRef parts[4];
			unsigned num_parts = 0, main_part, next_first_part;

			if (es_prolog)
				parts[num_parts++] = es_prolog;

			parts[main_part = num_parts++] = es_main;
			parts[next_first_part = num_parts++] = gs_prolog;
			parts[num_parts++] = gs_main;

			si_build_wrapper_function(&ctx, parts, num_parts,
						  main_part, next_first_part);
		} else {
			/* Pre-GFX9: only the GS prolog needs to be prepended. */
			LLVMValueRef parts[2];
			union si_shader_part_key prolog_key;

			parts[1] = ctx.main_fn;

			memset(&prolog_key, 0, sizeof(prolog_key));
			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			si_llvm_build_gs_prolog(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
		}
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
		si_llvm_build_monolithic_ps(&ctx, shader);
	}

	si_llvm_optimize_module(&ctx);

	/* Post-optimization transformations and analysis. */
	si_optimize_vs_outputs(&ctx);

	/* Counting scratch private memory is only needed for reporting. */
	if ((debug && debug->debug_message) ||
	    si_can_dump_shader(sscreen, ctx.type)) {
		ctx.shader->info.private_mem_vgprs =
			ac_count_scratch_private_memory(ctx.main_fn);
	}

	/* Make sure the input is a pointer and not integer followed by inttoptr. */
	assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
	       LLVMPointerTypeKind);

	/* Compile to bytecode. */
	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
			    ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
			    si_get_shader_name(shader),
			    si_should_optimize_less(compiler, shader->selector));
	si_llvm_dispose(&ctx);
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		return r;
	}

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned wave_size = sscreen->compute_wave_size;
		unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd *
				     (wave_size == 32 ? 2 : 1);
		unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
		unsigned max_sgprs_per_wave = 128;
		unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
		unsigned threads_per_tg = si_get_max_workgroup_size(shader);
		unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
		unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

		/* Per-wave limits shrink as more waves share a SIMD. */
		max_vgprs = max_vgprs / waves_per_simd;
		max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(&ctx))
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config,
							&shader->info.face_vgpr_index,
							&shader->info.ancillary_vgpr_index);
	}

	si_calculate_max_simd_waves(shader);
	si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
	return 0;
}
3194
3195 /**
3196 * Create, compile and return a shader part (prolog or epilog).
3197 *
3198 * \param sscreen screen
3199 * \param list list of shader parts of the same category
3200 * \param type shader type
3201 * \param key shader part key
3202 * \param prolog whether the part being requested is a prolog
3203 * \param tm LLVM target machine
3204 * \param debug debug callback
3205 * \param build the callback responsible for building the main function
3206 * \return non-NULL on success
3207 */
3208 static struct si_shader_part *
3209 si_get_shader_part(struct si_screen *sscreen,
3210 struct si_shader_part **list,
3211 enum pipe_shader_type type,
3212 bool prolog,
3213 union si_shader_part_key *key,
3214 struct ac_llvm_compiler *compiler,
3215 struct pipe_debug_callback *debug,
3216 void (*build)(struct si_shader_context *,
3217 union si_shader_part_key *),
3218 const char *name)
3219 {
3220 struct si_shader_part *result;
3221
3222 simple_mtx_lock(&sscreen->shader_parts_mutex);
3223
3224 /* Find existing. */
3225 for (result = *list; result; result = result->next) {
3226 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3227 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3228 return result;
3229 }
3230 }
3231
3232 /* Compile a new one. */
3233 result = CALLOC_STRUCT(si_shader_part);
3234 result->key = *key;
3235
3236 struct si_shader shader = {};
3237
3238 switch (type) {
3239 case PIPE_SHADER_VERTEX:
3240 shader.key.as_ls = key->vs_prolog.as_ls;
3241 shader.key.as_es = key->vs_prolog.as_es;
3242 shader.key.as_ngg = key->vs_prolog.as_ngg;
3243 break;
3244 case PIPE_SHADER_TESS_CTRL:
3245 assert(!prolog);
3246 shader.key.part.tcs.epilog = key->tcs_epilog.states;
3247 break;
3248 case PIPE_SHADER_GEOMETRY:
3249 assert(prolog);
3250 shader.key.as_ngg = key->gs_prolog.as_ngg;
3251 break;
3252 case PIPE_SHADER_FRAGMENT:
3253 if (prolog)
3254 shader.key.part.ps.prolog = key->ps_prolog.states;
3255 else
3256 shader.key.part.ps.epilog = key->ps_epilog.states;
3257 break;
3258 default:
3259 unreachable("bad shader part");
3260 }
3261
3262 struct si_shader_context ctx;
3263 si_llvm_context_init(&ctx, sscreen, compiler,
3264 si_get_wave_size(sscreen, type, shader.key.as_ngg,
3265 shader.key.as_es));
3266 ctx.shader = &shader;
3267 ctx.type = type;
3268
3269 build(&ctx, key);
3270
3271 /* Compile. */
3272 si_llvm_optimize_module(&ctx);
3273
3274 if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
3275 ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
3276 name, false)) {
3277 FREE(result);
3278 result = NULL;
3279 goto out;
3280 }
3281
3282 result->next = *list;
3283 *list = result;
3284
3285 out:
3286 si_llvm_dispose(&ctx);
3287 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3288 return result;
3289 }
3290
3291 /**
3292 * Build the vertex shader prolog function.
3293 *
3294 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
3295 * All inputs are returned unmodified. The vertex load indices are
3296 * stored after them, which will be used by the API VS for fetching inputs.
3297 *
3298 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
3299 * input_v0,
3300 * input_v1,
3301 * input_v2,
3302 * input_v3,
3303 * (VertexID + BaseVertex),
3304 * (InstanceID + StartInstance),
3305 * (InstanceID / 2 + StartInstance)
3306 */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	LLVMTypeRef *returns;
	LLVMValueRef ret, func;
	int num_returns, i;
	/* VGPR layout: the merged-next-stage VGPRs (if any) come first,
	 * followed by the 4 VS system-value VGPRs. */
	unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
	struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
	struct ac_arg input_vgpr_param[9];
	LLVMValueRef input_vgprs[9];
	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
				      num_input_vgprs;
	/* Merged shaders have 8 system SGPRs before the user SGPRs. */
	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

	memset(&ctx->args, 0, sizeof(ctx->args));

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
			 sizeof(LLVMTypeRef));
	num_returns = 0;

	/* Declare input and output SGPRs. */
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
			   &input_sgpr_param[i]);
		returns[num_returns++] = ctx->i32;
	}

	/* NOTE(review): input SGPR 3 is taken to carry the merged wave
	 * info, matching the si_init_exec_from_input and si_unpack_param
	 * uses below — confirm against the merged-shader input layout. */
	struct ac_arg merged_wave_info = input_sgpr_param[3];

	/* Preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < num_input_vgprs; i++) {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
		returns[num_returns++] = ctx->f32;
	}

	/* Vertex load indices. */
	for (i = 0; i < key->vs_prolog.num_inputs; i++)
		returns[num_returns++] = ctx->f32;

	/* Create the function. */
	si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
	func = ctx->main_fn;

	for (i = 0; i < num_input_vgprs; i++) {
		input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
	}

	if (key->vs_prolog.num_merged_next_stage_vgprs) {
		/* For non-monolithic use, initialize EXEC from the wave
		 * info SGPR; monolithic shaders do this elsewhere. */
		if (!key->vs_prolog.is_monolithic)
			si_init_exec_from_input(ctx, merged_wave_info, 0);

		if (key->vs_prolog.as_ls &&
		    ctx->screen->info.has_ls_vgpr_init_bug) {
			/* If there are no HS threads, SPI loads the LS VGPRs
			 * starting at VGPR 0. Shift them back to where they
			 * belong.
			 */
			LLVMValueRef has_hs_threads =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
				    si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
				    ctx->i32_0, "");

			/* Walk backwards so a shifted VGPR is never read
			 * after it has been overwritten. */
			for (i = 4; i > 0; --i) {
				input_vgprs[i + 1] =
					LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
							input_vgprs[i + 1],
							input_vgprs[i - 1], "");
			}
		}
	}

	/* Locate VertexID and InstanceID among the input VGPRs; the
	 * InstanceID position depends on the chip generation and LS mode. */
	unsigned vertex_id_vgpr = first_vs_vgpr;
	unsigned instance_id_vgpr =
		ctx->screen->info.chip_class >= GFX10 ?
			first_vs_vgpr + 3 :
			first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

	ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
	ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

	/* InstanceID = VertexID >> 16;
	 * VertexID = VertexID & 0xffff;
	 */
	if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
		ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
						     LLVMConstInt(ctx->i32, 16, 0), "");
		ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
						  LLVMConstInt(ctx->i32, 0xffff, 0), "");
	}

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
	}
	for (i = 0; i < num_input_vgprs; i++) {
		LLVMValueRef p = input_vgprs[i];

		/* Return the possibly rewritten system values. */
		if (i == vertex_id_vgpr)
			p = ctx->abi.vertex_id;
		else if (i == instance_id_vgpr)
			p = ctx->abi.instance_id;

		p = ac_to_float(&ctx->ac, p);
		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
					   key->vs_prolog.num_input_sgprs + i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	LLVMValueRef instance_divisor_constbuf = NULL;

	if (key->vs_prolog.states.instance_divisor_is_fetched) {
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
		LLVMValueRef buf_index =
			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
		instance_divisor_constbuf =
			ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
	}

	for (i = 0; i < key->vs_prolog.num_inputs; i++) {
		bool divisor_is_one =
			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
		bool divisor_is_fetched =
			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
		LLVMValueRef index = NULL;

		if (divisor_is_one) {
			index = ctx->abi.instance_id;
		} else if (divisor_is_fetched) {
			LLVMValueRef udiv_factors[4];

			/* Each input has 16 bytes in the constbuf: 4 dwords of
			 * precomputed factors for the fast unsigned division. */
			for (unsigned j = 0; j < 4; j++) {
				udiv_factors[j] =
					si_buffer_load_const(ctx, instance_divisor_constbuf,
							     LLVMConstInt(ctx->i32, i*16 + j*4, 0));
				udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
			}
			/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
			 * Such InstanceID might not be achievable in a reasonable time though.
			 */
			index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
						       udiv_factors[0], udiv_factors[1],
						       udiv_factors[2], udiv_factors[3]);
		}

		if (divisor_is_one || divisor_is_fetched) {
			/* Add StartInstance. */
			index = LLVMBuildAdd(ctx->ac.builder, index,
					     LLVMGetParam(ctx->main_fn, user_sgpr_base +
							  SI_SGPR_START_INSTANCE), "");
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(ctx->ac.builder,
					     ctx->abi.vertex_id,
					     LLVMGetParam(func, user_sgpr_base +
							  SI_SGPR_BASE_VERTEX), "");
		}

		index = ac_to_float(&ctx->ac, index);
		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
					   ctx->args.arg_count + i, "");
	}

	si_llvm_build_ret(ctx, ret);
}
3477
3478 static bool si_get_vs_prolog(struct si_screen *sscreen,
3479 struct ac_llvm_compiler *compiler,
3480 struct si_shader *shader,
3481 struct pipe_debug_callback *debug,
3482 struct si_shader *main_part,
3483 const struct si_vs_prolog_bits *key)
3484 {
3485 struct si_shader_selector *vs = main_part->selector;
3486
3487 if (!si_vs_needs_prolog(vs, key))
3488 return true;
3489
3490 /* Get the prolog. */
3491 union si_shader_part_key prolog_key;
3492 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
3493 key, shader, &prolog_key);
3494
3495 shader->prolog =
3496 si_get_shader_part(sscreen, &sscreen->vs_prologs,
3497 PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
3498 debug, si_build_vs_prolog_function,
3499 "Vertex Shader Prolog");
3500 return shader->prolog != NULL;
3501 }
3502
3503 /**
3504 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
3505 */
3506 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
3507 struct ac_llvm_compiler *compiler,
3508 struct si_shader *shader,
3509 struct pipe_debug_callback *debug)
3510 {
3511 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
3512 &shader->key.part.vs.prolog);
3513 }
3514
3515 /**
3516 * Select and compile (or reuse) TCS parts (epilog).
3517 */
3518 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
3519 struct ac_llvm_compiler *compiler,
3520 struct si_shader *shader,
3521 struct pipe_debug_callback *debug)
3522 {
3523 if (sscreen->info.chip_class >= GFX9) {
3524 struct si_shader *ls_main_part =
3525 shader->key.part.tcs.ls->main_shader_part_ls;
3526
3527 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
3528 &shader->key.part.tcs.ls_prolog))
3529 return false;
3530
3531 shader->previous_stage = ls_main_part;
3532 }
3533
3534 /* Get the epilog. */
3535 union si_shader_part_key epilog_key;
3536 memset(&epilog_key, 0, sizeof(epilog_key));
3537 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
3538
3539 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
3540 PIPE_SHADER_TESS_CTRL, false,
3541 &epilog_key, compiler, debug,
3542 si_llvm_build_tcs_epilog,
3543 "Tessellation Control Shader Epilog");
3544 return shader->epilog != NULL;
3545 }
3546
3547 /**
3548 * Select and compile (or reuse) GS parts (prolog).
3549 */
3550 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
3551 struct ac_llvm_compiler *compiler,
3552 struct si_shader *shader,
3553 struct pipe_debug_callback *debug)
3554 {
3555 if (sscreen->info.chip_class >= GFX9) {
3556 struct si_shader *es_main_part;
3557 enum pipe_shader_type es_type = shader->key.part.gs.es->type;
3558
3559 if (shader->key.as_ngg)
3560 es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
3561 else
3562 es_main_part = shader->key.part.gs.es->main_shader_part_es;
3563
3564 if (es_type == PIPE_SHADER_VERTEX &&
3565 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
3566 &shader->key.part.gs.vs_prolog))
3567 return false;
3568
3569 shader->previous_stage = es_main_part;
3570 }
3571
3572 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
3573 return true;
3574
3575 union si_shader_part_key prolog_key;
3576 memset(&prolog_key, 0, sizeof(prolog_key));
3577 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
3578 prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
3579
3580 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
3581 PIPE_SHADER_GEOMETRY, true,
3582 &prolog_key, compiler, debug,
3583 si_llvm_build_gs_prolog,
3584 "Geometry Shader Prolog");
3585 return shader->prolog2 != NULL;
3586 }
3587
3588 /**
3589 * Compute the PS prolog key, which contains all the information needed to
3590 * build the PS prolog function, and set related bits in shader->config.
3591 */
void si_get_ps_prolog_key(struct si_shader *shader,
			  union si_shader_part_key *key,
			  bool separate_prolog)
{
	struct si_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* WQM (whole-quad mode) is required when the shader uses derivatives
	 * and the prolog (re-)interpolates inputs, i.e. when colors are read
	 * or any interpolation location is being overridden. */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);
	key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			/* Two-sided lighting needs the front-face VGPR to pick
			 * between COLOR and BCOLOR. */
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			if (separate_prolog)
				shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Determine, per color input, which interpolation VGPRs the
		 * prolog should use (index into the PS input VGPRs, or -1 for
		 * flat/constant interpolation). */
		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* Perspective I/J pairs occupy VGPRs 0-5:
				 * sample=0/1, center=2/3, centroid=4/5. */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					if (separate_prolog) {
						shader->config.spi_ps_input_ena |=
							S_0286CC_PERSP_SAMPLE_ENA(1);
					}
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					if (separate_prolog) {
						shader->config.spi_ps_input_ena |=
							S_0286CC_PERSP_CENTER_ENA(1);
					}
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					if (separate_prolog) {
						shader->config.spi_ps_input_ena |=
							S_0286CC_PERSP_CENTROID_ENA(1);
					}
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					if (separate_prolog) {
						shader->config.spi_ps_input_ena |=
							S_0286CC_LINEAR_SAMPLE_ENA(1);
					}
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					if (separate_prolog) {
						shader->config.spi_ps_input_ena |=
							S_0286CC_LINEAR_CENTER_ENA(1);
					}
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					if (separate_prolog) {
						shader->config.spi_ps_input_ena |=
							S_0286CC_LINEAR_CENTROID_ENA(1);
					}
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}
3721
3722 /**
3723 * Check whether a PS prolog is required based on the key.
3724 */
3725 bool si_need_ps_prolog(const union si_shader_part_key *key)
3726 {
3727 return key->ps_prolog.colors_read ||
3728 key->ps_prolog.states.force_persp_sample_interp ||
3729 key->ps_prolog.states.force_linear_sample_interp ||
3730 key->ps_prolog.states.force_persp_center_interp ||
3731 key->ps_prolog.states.force_linear_center_interp ||
3732 key->ps_prolog.states.bc_optimize_for_persp ||
3733 key->ps_prolog.states.bc_optimize_for_linear ||
3734 key->ps_prolog.states.poly_stipple ||
3735 key->ps_prolog.states.samplemask_log_ps_iter;
3736 }
3737
3738 /**
3739 * Compute the PS epilog key, which contains all the information needed to
3740 * build the PS epilog function.
3741 */
3742 void si_get_ps_epilog_key(struct si_shader *shader,
3743 union si_shader_part_key *key)
3744 {
3745 struct si_shader_info *info = &shader->selector->info;
3746 memset(key, 0, sizeof(*key));
3747 key->ps_epilog.colors_written = info->colors_written;
3748 key->ps_epilog.writes_z = info->writes_z;
3749 key->ps_epilog.writes_stencil = info->writes_stencil;
3750 key->ps_epilog.writes_samplemask = info->writes_samplemask;
3751 key->ps_epilog.states = shader->key.part.ps.epilog;
3752 }
3753
3754 /**
3755 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
3756 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      struct ac_llvm_compiler *compiler,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, compiler, debug,
					   si_llvm_build_ps_prolog,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog.  Unlike the prolog, the epilog is always
	 * required (it performs the exports). */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, compiler, debug,
				   si_llvm_build_ps_epilog,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* The remainder fixes up SPI_PS_INPUT_ENA so that it is consistent
	 * with what the prolog/epilog actually consume and with hardware
	 * requirements.  The asserts check the corresponding bit in
	 * spi_ps_input_addr (set by the main part), which must be a
	 * superset of spi_ps_input_ena. */

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 * When an interpolation location is forced to "sample", the
	 * center/centroid bits are replaced by the sample bit (and
	 * vice versa for forced "center"). */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* Samplemask fixup requires the sample ID. */
	if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
		shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
		assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
3856
void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
				      unsigned *lds_size)
{
	/* If tessellation is all offchip and on-chip GS isn't used, this
	 * workaround is not needed.
	 *
	 * NOTE: the early return below deliberately disables the workaround;
	 * the unreachable code is kept for reference in case it needs to be
	 * re-enabled for on-chip configurations.
	 */
	return;

	/* SPI barrier management bug:
	 *   Make sure we have at least 4k of LDS in use to avoid the bug.
	 *   It applies to workgroup sizes of more than one wavefront.
	 */
	if (sscreen->info.family == CHIP_BONAIRE ||
	    sscreen->info.family == CHIP_KABINI)
		*lds_size = MAX2(*lds_size, 8);
}
3873
3874 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3875 {
3876 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3877
3878 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3879
3880 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
3881 si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
3882 si_multiwave_lds_size_workaround(sscreen,
3883 &shader->config.lds_size);
3884 }
3885 }
3886
/**
 * Create a shader variant: either compile a monolithic shader, or reuse
 * the precompiled main part and attach the appropriate prologs/epilogs,
 * then finalize resource usage, dump and upload the binary.
 *
 * \return true on success.
 */
bool si_create_shader_variant(struct si_screen *sscreen,
			      struct ac_llvm_compiler *compiler,
			      struct si_shader *shader,
			      struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * GS are compiled on demand if the main part hasn't been compiled
	 * for the chosen NGG-ness.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_shader(sscreen, compiler, shader, debug);
		if (r)
			return false;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return false;

		/* Copy the compiled shader data over. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_TESS_EVAL:
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
				return false;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		default:;
		}

		/* Update SGPR and VGPR counts.
		 * All parts execute with the same register allocation, so the
		 * combined shader needs the maximum over all parts. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->info.private_mem_vgprs =
				MAX2(shader->info.private_mem_vgprs,
				     shader->previous_stage->info.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
		si_calculate_max_simd_waves(shader);
	}

	/* Compute subgroup/GS ring info for NGG or gfx9+ legacy GS. */
	if (shader->key.as_ngg) {
		assert(!shader->key.as_es && !shader->key.as_ls);
		gfx10_ngg_calculate_subgroup_info(shader);
	} else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
		gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, stderr, true);

	/* Upload. */
	if (!si_shader_binary_upload(sscreen, shader, 0)) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return false;
	}

	return true;
}
4035
4036 void si_shader_destroy(struct si_shader *shader)
4037 {
4038 if (shader->scratch_bo)
4039 si_resource_reference(&shader->scratch_bo, NULL);
4040
4041 si_resource_reference(&shader->bo, NULL);
4042
4043 if (!shader->is_binary_shared)
4044 si_shader_binary_clean(&shader->binary);
4045
4046 free(shader->shader_log);
4047 }