radeonsi: move geometry shader code into si_shader_llvm_gs.c
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include <llvm/Config/llvm-config.h>
26
27 #include "util/u_memory.h"
28 #include "tgsi/tgsi_strings.h"
29 #include "tgsi/tgsi_from_mesa.h"
30
31 #include "ac_exp_param.h"
32 #include "ac_shader_util.h"
33 #include "ac_rtld.h"
34 #include "ac_llvm_util.h"
35 #include "si_shader_internal.h"
36 #include "si_pipe.h"
37 #include "sid.h"
38
39 #include "compiler/nir/nir.h"
40 #include "compiler/nir/nir_serialize.h"
41
42 static const char scratch_rsrc_dword0_symbol[] =
43 "SCRATCH_RSRC_DWORD0";
44
45 static const char scratch_rsrc_dword1_symbol[] =
46 "SCRATCH_RSRC_DWORD1";
47
48 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
49
50 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
51 union si_shader_part_key *key);
52
53 /** Whether the shader runs as a combination of multiple API shaders */
54 static bool is_multi_part_shader(struct si_shader_context *ctx)
55 {
56 if (ctx->screen->info.chip_class <= GFX8)
57 return false;
58
59 return ctx->shader->key.as_ls ||
60 ctx->shader->key.as_es ||
61 ctx->type == PIPE_SHADER_TESS_CTRL ||
62 ctx->type == PIPE_SHADER_GEOMETRY;
63 }
64
65 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
66 bool si_is_merged_shader(struct si_shader_context *ctx)
67 {
68 return ctx->shader->key.as_ngg || is_multi_part_shader(ctx);
69 }
70
71 /**
72 * Returns a unique index for a per-patch semantic name and index. The index
73 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
74 * can be calculated.
75 */
76 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
77 {
78 switch (semantic_name) {
79 case TGSI_SEMANTIC_TESSOUTER:
80 return 0;
81 case TGSI_SEMANTIC_TESSINNER:
82 return 1;
83 case TGSI_SEMANTIC_PATCH:
84 assert(index < 30);
85 return 2 + index;
86
87 default:
88 assert(!"invalid semantic name");
89 return 0;
90 }
91 }
92
93 /**
94 * Returns a unique index for a semantic name and index. The index must be
95 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
96 * calculated.
97 */
98 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
99 unsigned is_varying)
100 {
101 switch (semantic_name) {
102 case TGSI_SEMANTIC_POSITION:
103 return 0;
104 case TGSI_SEMANTIC_GENERIC:
105 /* Since some shader stages use the the highest used IO index
106 * to determine the size to allocate for inputs/outputs
107 * (in LDS, tess and GS rings). GENERIC should be placed right
108 * after POSITION to make that size as small as possible.
109 */
110 if (index < SI_MAX_IO_GENERIC)
111 return 1 + index;
112
113 assert(!"invalid generic index");
114 return 0;
115 case TGSI_SEMANTIC_FOG:
116 return SI_MAX_IO_GENERIC + 1;
117 case TGSI_SEMANTIC_COLOR:
118 assert(index < 2);
119 return SI_MAX_IO_GENERIC + 2 + index;
120 case TGSI_SEMANTIC_BCOLOR:
121 assert(index < 2);
122 /* If it's a varying, COLOR and BCOLOR alias. */
123 if (is_varying)
124 return SI_MAX_IO_GENERIC + 2 + index;
125 else
126 return SI_MAX_IO_GENERIC + 4 + index;
127 case TGSI_SEMANTIC_TEXCOORD:
128 assert(index < 8);
129 return SI_MAX_IO_GENERIC + 6 + index;
130
131 /* These are rarely used between LS and HS or ES and GS. */
132 case TGSI_SEMANTIC_CLIPDIST:
133 assert(index < 2);
134 return SI_MAX_IO_GENERIC + 6 + 8 + index;
135 case TGSI_SEMANTIC_CLIPVERTEX:
136 return SI_MAX_IO_GENERIC + 6 + 8 + 2;
137 case TGSI_SEMANTIC_PSIZE:
138 return SI_MAX_IO_GENERIC + 6 + 8 + 3;
139
140 /* These can't be written by LS, HS, and ES. */
141 case TGSI_SEMANTIC_LAYER:
142 return SI_MAX_IO_GENERIC + 6 + 8 + 4;
143 case TGSI_SEMANTIC_VIEWPORT_INDEX:
144 return SI_MAX_IO_GENERIC + 6 + 8 + 5;
145 case TGSI_SEMANTIC_PRIMID:
146 STATIC_ASSERT(SI_MAX_IO_GENERIC + 6 + 8 + 6 <= 63);
147 return SI_MAX_IO_GENERIC + 6 + 8 + 6;
148 default:
149 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
150 assert(!"invalid semantic name");
151 return 0;
152 }
153 }
154
155 /**
156 * Get the value of a shader input parameter and extract a bitfield.
157 */
158 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
159 LLVMValueRef value, unsigned rshift,
160 unsigned bitwidth)
161 {
162 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
163 value = ac_to_integer(&ctx->ac, value);
164
165 if (rshift)
166 value = LLVMBuildLShr(ctx->ac.builder, value,
167 LLVMConstInt(ctx->i32, rshift, 0), "");
168
169 if (rshift + bitwidth < 32) {
170 unsigned mask = (1 << bitwidth) - 1;
171 value = LLVMBuildAnd(ctx->ac.builder, value,
172 LLVMConstInt(ctx->i32, mask, 0), "");
173 }
174
175 return value;
176 }
177
178 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
179 struct ac_arg param, unsigned rshift,
180 unsigned bitwidth)
181 {
182 LLVMValueRef value = ac_get_arg(&ctx->ac, param);
183
184 return unpack_llvm_param(ctx, value, rshift, bitwidth);
185 }
186
187 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
188 LLVMValueRef i32, unsigned index)
189 {
190 assert(index <= 1);
191
192 if (index == 1)
193 return LLVMBuildAShr(ctx->ac.builder, i32,
194 LLVMConstInt(ctx->i32, 16, 0), "");
195
196 return LLVMBuildSExt(ctx->ac.builder,
197 LLVMBuildTrunc(ctx->ac.builder, i32,
198 ctx->ac.i16, ""),
199 ctx->i32, "");
200 }
201
/**
 * Load one vertex shader input into out[0..3] (as floats).
 *
 * For blit shaders (TGSI_PROPERTY_VS_BLIT_SGPRS_AMD set), inputs come
 * packed in user SGPRs; otherwise they are fetched from vertex buffers,
 * either with the open-coded load path or with buffer_load_format plus
 * per-format fixups.
 *
 * \param ctx          shader context
 * \param input_index  index of the VS input (attribute slot)
 * \param out          receives the 4 channels of the attribute
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	const struct si_shader_info *info = &ctx->shader->selector->info;
	unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];

	if (vs_blit_property) {
		/* Blit VS: the 3 blit vertices select between the (x1,y1)
		 * and (x2,y2) rectangle corners based on the vertex ID.
		 */
		LLVMValueRef vertex_id = ctx->abi.vertex_id;
		/* sel_x1 is true for vertices 0 and 1 (they use x1). */
		LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntULE, vertex_id,
						    ctx->i32_1, "");
		/* Use LLVMIntNE, because we have 3 vertices and only
		 * the middle one should use y2.
		 */
		LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntNE, vertex_id,
						    ctx->i32_1, "");

		unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
		if (input_index == 0) {
			/* Position: two SGPRs hold packed 16-bit x1/y1 and
			 * x2/y2 corners; a third holds the depth.
			 */
			LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
							 param_vs_blit_inputs);
			LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
							 param_vs_blit_inputs + 1);

			LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
			LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
			LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
			LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

			LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
							 x1, x2, "");
			LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
							 y1, y2, "");

			out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
			out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
			out[2] = LLVMGetParam(ctx->main_fn,
					      param_vs_blit_inputs + 2);
			out[3] = ctx->ac.f32_1;
			return;
		}

		/* Color or texture coordinates: */
		assert(input_index == 1);

		if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
			/* A flat color: 4 SGPRs, used by all vertices. */
			for (int i = 0; i < 4; i++) {
				out[i] = LLVMGetParam(ctx->main_fn,
						      param_vs_blit_inputs + 3 + i);
			}
		} else {
			/* Texcoords: corner pairs selected like the position,
			 * plus 2 extra SGPRs (z, w).
			 */
			assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
			LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 3);
			LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 4);
			LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 5);
			LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
						       param_vs_blit_inputs + 6);

			out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
						 x1, x2, "");
			out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
						 y1, y2, "");
			out[2] = LLVMGetParam(ctx->main_fn,
					      param_vs_blit_inputs + 7);
			out[3] = LLVMGetParam(ctx->main_fn,
					      param_vs_blit_inputs + 8);
		}
		return;
	}

	unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
	union si_vs_fix_fetch fix_fetch;
	LLVMValueRef vb_desc;
	LLVMValueRef vertex_index;
	LLVMValueRef tmp;

	/* The vertex buffer descriptor is either passed in user SGPRs or
	 * loaded from the descriptor list in memory.
	 */
	if (input_index < num_vbos_in_user_sgprs) {
		vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
	} else {
		unsigned index= input_index - num_vbos_in_user_sgprs;
		vb_desc = ac_build_load_to_sgpr(&ctx->ac,
						ac_get_arg(&ctx->ac, ctx->vertex_buffers),
						LLVMConstInt(ctx->i32, index, 0));
	}

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->vertex_index0.arg_index +
				    input_index);

	/* Use the open-coded implementation for all loads of doubles and
	 * of dword-sized data that needs fixups. We need to insert conversion
	 * code anyway, and the amd/common code does it for us.
	 *
	 * Note: On LLVM <= 8, we can only open-code formats with
	 * channel size >= 4 bytes.
	 */
	bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
	fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
	if (opencode ||
	    (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
	    (fix_fetch.u.log_size == 2)) {
		tmp = ac_build_opencoded_load_format(
				&ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
				fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
				vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
		for (unsigned i = 0; i < 4; ++i)
			out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->i32, i, false), "");
		return;
	}

	/* Do multiple loads for special formats. */
	unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
	LLVMValueRef fetches[4];
	unsigned num_fetches;
	unsigned fetch_stride;
	unsigned channels_per_fetch;

	/* 3-channel formats with 8- or 16-bit channels are fetched one
	 * channel at a time; everything else in a single typed load.
	 */
	if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
		num_fetches = MIN2(required_channels, 3);
		fetch_stride = 1 << fix_fetch.u.log_size;
		channels_per_fetch = 1;
	} else {
		num_fetches = 1;
		fetch_stride = 0;
		channels_per_fetch = required_channels;
	}

	for (unsigned i = 0; i < num_fetches; ++i) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
		fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
							 channels_per_fetch, 0, true);
	}

	/* Split a single multi-channel fetch into scalar components. */
	if (num_fetches == 1 && channels_per_fetch > 1) {
		LLVMValueRef fetch = fetches[0];
		for (unsigned i = 0; i < channels_per_fetch; ++i) {
			tmp = LLVMConstInt(ctx->i32, i, false);
			fetches[i] = LLVMBuildExtractElement(
				ctx->ac.builder, fetch, tmp, "");
		}
		num_fetches = channels_per_fetch;
		channels_per_fetch = 1;
	}

	for (unsigned i = num_fetches; i < 4; ++i)
		fetches[i] = LLVMGetUndef(ctx->f32);

	/* 3-channel formats need an explicit default W (1 as int or float). */
	if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
	    required_channels == 4) {
		if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
			fetches[3] = ctx->ac.i32_1;
		else
			fetches[3] = ctx->ac.f32_1;
	} else if (fix_fetch.u.log_size == 3 &&
		   (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
		    fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
		    fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
		   required_channels == 4) {
		/* For 2_10_10_10, the hardware returns an unsigned value;
		 * convert it to a signed one.
		 */
		LLVMValueRef tmp = fetches[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
			tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
		else
			tmp = ac_to_integer(&ctx->ac, tmp);

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(ctx->ac.builder, tmp,
				   fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
			/* Clamp to -1 (the unordered compare also catches NaN). */
			clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
		}

		fetches[3] = tmp;
	}

	for (unsigned i = 0; i < 4; ++i)
		out[i] = ac_to_float(&ctx->ac, fetches[i]);
}
407
408 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx,
409 unsigned swizzle)
410 {
411 if (swizzle > 0)
412 return ctx->i32_0;
413
414 switch (ctx->type) {
415 case PIPE_SHADER_VERTEX:
416 return ac_get_arg(&ctx->ac, ctx->vs_prim_id);
417 case PIPE_SHADER_TESS_CTRL:
418 return ac_get_arg(&ctx->ac, ctx->args.tcs_patch_id);
419 case PIPE_SHADER_TESS_EVAL:
420 return ac_get_arg(&ctx->ac, ctx->args.tes_patch_id);
421 case PIPE_SHADER_GEOMETRY:
422 return ac_get_arg(&ctx->ac, ctx->args.gs_prim_id);
423 default:
424 assert(0);
425 return ctx->i32_0;
426 }
427 }
428
429 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
430 {
431 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
432
433 /* For non-indexed draws, the base vertex set by the driver
434 * (for direct draws) or the CP (for indirect draws) is the
435 * first vertex ID, but GLSL expects 0 to be returned.
436 */
437 LLVMValueRef vs_state = ac_get_arg(&ctx->ac,
438 ctx->vs_state_bits);
439 LLVMValueRef indexed;
440
441 indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
442 indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
443
444 return LLVMBuildSelect(ctx->ac.builder, indexed,
445 ac_get_arg(&ctx->ac, ctx->args.base_vertex),
446 ctx->i32_0, "");
447 }
448
449 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
450 {
451 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
452
453 LLVMValueRef values[3];
454 LLVMValueRef result;
455 unsigned i;
456 unsigned *properties = ctx->shader->selector->info.properties;
457
458 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
459 unsigned sizes[3] = {
460 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
461 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
462 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
463 };
464
465 for (i = 0; i < 3; ++i)
466 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
467
468 result = ac_build_gather_values(&ctx->ac, values, 3);
469 } else {
470 result = ac_get_arg(&ctx->ac, ctx->block_size);
471 }
472
473 return result;
474 }
475
476 void si_declare_compute_memory(struct si_shader_context *ctx)
477 {
478 struct si_shader_selector *sel = ctx->shader->selector;
479 unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
480
481 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
482 LLVMValueRef var;
483
484 assert(!ctx->ac.lds);
485
486 var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
487 LLVMArrayType(ctx->i8, lds_size),
488 "compute_lds",
489 AC_ADDR_SPACE_LDS);
490 LLVMSetAlignment(var, 64 * 1024);
491
492 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
493 }
494
/* Build the buffer descriptor for constant buffer 0 inline from the
 * const_and_shader_buffers pointer, instead of loading it from memory.
 * Only valid when const buffer 0 is the sole declared buffer (see load_ubo).
 */
static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
{
	LLVMValueRef ptr =
		ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
	struct si_shader_selector *sel = ctx->shader->selector;

	/* Do the bounds checking with a descriptor, because
	 * doing computation and manual bounds checking of 64-bit
	 * addresses generates horrible VALU code with very high
	 * VGPR usage and very low SIMD occupancy.
	 */
	ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");

	/* Dwords 0-1: the 48-bit base address (low bits from the pointer,
	 * high bits from the screen's 32-bit address space base).
	 */
	LLVMValueRef desc0, desc1;
	desc0 = ptr;
	desc1 = LLVMConstInt(ctx->i32,
			     S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);

	/* Dword 3: destination channel swizzles plus the data format;
	 * the format fields differ between GFX10+ and older chips.
	 */
	uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

	if (ctx->screen->info.chip_class >= GFX10)
		rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
			 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
			 S_008F0C_RESOURCE_LEVEL(1);
	else
		rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

	/* Dword 2: buffer size = number of 16-byte constant slots. */
	LLVMValueRef desc_elems[] = {
		desc0,
		desc1,
		LLVMConstInt(ctx->i32, sel->info.constbuf0_num_slots * 16, 0),
		LLVMConstInt(ctx->i32, rsrc3, false)
	};

	return ac_build_gather_values(&ctx->ac, desc_elems, 4);
}
535
536 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
537 {
538 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
539 struct si_shader_selector *sel = ctx->shader->selector;
540
541 LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
542
543 if (sel->info.const_buffers_declared == 1 &&
544 sel->info.shader_buffers_declared == 0) {
545 return load_const_buffer_desc_fast_path(ctx);
546 }
547
548 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
549 index = LLVMBuildAdd(ctx->ac.builder, index,
550 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
551
552 return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
553 }
554
555 static LLVMValueRef
556 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
557 {
558 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
559 LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac,
560 ctx->const_and_shader_buffers);
561
562 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
563 index = LLVMBuildSub(ctx->ac.builder,
564 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
565 index, "");
566
567 return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
568 }
569
570 /* Initialize arguments for the shader export intrinsic */
571 static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
572 LLVMValueRef *values,
573 unsigned target,
574 struct ac_export_args *args)
575 {
576 args->enabled_channels = 0xf; /* writemask - default is 0xf */
577 args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
578 args->done = 0; /* Specify whether this is the last export */
579 args->target = target; /* Specify the target we are exporting */
580 args->compr = false;
581
582 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
583 }
584
/**
 * Compute and export user clip distances from a CLIPVERTEX output.
 *
 * Fills position exports POS+2 and POS+3 with the dot products of the
 * clip vertex and the user clip planes, which are loaded from the
 * SI_VS_CONST_CLIP_PLANES constant buffer.
 *
 * \param ctx       shader context
 * \param pos       position export array; entries [2] and [3] are filled
 * \param out_elts  the 4 components of the CLIPVERTEX output
 */
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		/* Start from 0; the fmad chain below accumulates the dots. */
		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < 4; chan++) {
			for (const_chan = 0; const_chan < 4; const_chan++) {
				/* Byte offset of plane (reg_index*4 + chan),
				 * component const_chan, in the constant buffer.
				 */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = si_buffer_load_const(ctx, const_resource,
								addr);
				args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
								out_elts[const_chan], args->out[chan]);
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
625
626 static void si_dump_streamout(struct pipe_stream_output_info *so)
627 {
628 unsigned i;
629
630 if (so->num_outputs)
631 fprintf(stderr, "STREAMOUT\n");
632
633 for (i = 0; i < so->num_outputs; i++) {
634 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
635 so->output[i].start_component;
636 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
637 i, so->output[i].output_buffer,
638 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
639 so->output[i].register_index,
640 mask & 1 ? "x" : "",
641 mask & 2 ? "y" : "",
642 mask & 4 ? "z" : "",
643 mask & 8 ? "w" : "");
644 }
645 }
646
/**
 * Store one shader output to its streamout (transform feedback) buffer.
 *
 * \param ctx              shader context
 * \param so_buffers       per-buffer streamout descriptors
 * \param so_write_offsets per-buffer byte write offsets for this thread
 * \param stream_out       describes which components go to which buffer
 * \param shader_out       the output values and their vertex streams
 */
void si_emit_streamout_output(struct si_shader_context *ctx,
			      LLVMValueRef const *so_buffers,
			      LLVMValueRef const *so_write_offsets,
			      struct pipe_stream_output *stream_out,
			      struct si_shader_output_values *shader_out)
{
	unsigned buf_idx = stream_out->output_buffer;
	unsigned start = stream_out->start_component;
	unsigned num_comps = stream_out->num_components;
	LLVMValueRef out[4];

	/* Defensive check in release builds; the assert catches it in debug. */
	assert(num_comps && num_comps <= 4);
	if (!num_comps || num_comps > 4)
		return;

	/* Load the output as int. */
	for (int j = 0; j < num_comps; j++) {
		assert(stream_out->stream == shader_out->vertex_stream[start + j]);

		out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
	}

	/* Pack the output. */
	LLVMValueRef vdata = NULL;

	switch (num_comps) {
	case 1: /* as i32 */
		vdata = out[0];
		break;
	case 2: /* as v2i32 */
	case 3: /* as v3i32 */
		if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
			vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
			break;
		}
		/* as v4i32 (aligned to 4) */
		out[3] = LLVMGetUndef(ctx->i32);
		/* fall through */
	case 4: /* as v4i32 */
		vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
		break;
	}

	/* Store only num_comps dwords even if vdata was padded to 4. */
	ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
				    vdata, num_comps,
				    so_write_offsets[buf_idx],
				    ctx->i32_0,
				    stream_out->dst_offset * 4, ac_glc | ac_slc);
}
696
/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 *
 * \param ctx      shader context
 * \param outputs  shader output values, indexed by output register
 * \param noutput  number of entries in @p outputs
 * \param stream   which vertex stream to write (0 except for GS copy shaders)
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx,
			    struct si_shader_output_values *outputs,
			    unsigned noutput, unsigned stream)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct pipe_stream_output_info *so = &sel->so;
	LLVMBuilderRef builder = ctx->ac.builder;
	int i;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		si_unpack_param(ctx, ctx->streamout_config, 16, 7);

	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	ac_build_ifcc(&ctx->ac, can_emit, 6501);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			ac_get_arg(&ctx->ac,
				   ctx->streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4];
		LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac,
						  ctx->rw_buffers);

		for (i = 0; i < 4; i++) {
			/* A zero stride means the buffer is unused. */
			if (!so->stride[i])
				continue;

			LLVMValueRef offset = LLVMConstInt(ctx->i32,
							   SI_VS_STREAMOUT_BUF0 + i, 0);

			so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

			/* The driver-supplied offset is in dwords; scale to bytes. */
			LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
							    ctx->streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
							   LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
							   so_offset);
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned reg = so->output[i].register_index;

			if (reg >= noutput)
				continue;

			/* Only outputs belonging to the requested stream. */
			if (stream != so->output[i].stream)
				continue;

			si_emit_streamout_output(ctx, so_buffers, so_write_offset,
						 &so->output[i], &outputs[reg]);
		}
	}
	ac_build_endif(&ctx->ac, 6501);
}
779
780 static void si_export_param(struct si_shader_context *ctx, unsigned index,
781 LLVMValueRef *values)
782 {
783 struct ac_export_args args;
784
785 si_llvm_init_vs_export_args(ctx, values,
786 V_008DFC_SQ_EXP_PARAM + index, &args);
787 ac_build_export(&ctx->ac, &args);
788 }
789
/**
 * Export all PARAM attributes and record each output's export slot in
 * shader->info.vs_output_param_offset; the total ends up in
 * shader->info.nr_param_exports.
 */
static void si_build_param_exports(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	unsigned param_count = 0;

	for (unsigned i = 0; i < noutput; i++) {
		unsigned semantic_name = outputs[i].semantic_name;
		unsigned semantic_index = outputs[i].semantic_index;

		/* Skip outputs whose components all belong to non-zero
		 * vertex streams — there is nothing to export for stream 0.
		 */
		if (outputs[i].vertex_stream[0] != 0 &&
		    outputs[i].vertex_stream[1] != 0 &&
		    outputs[i].vertex_stream[2] != 0 &&
		    outputs[i].vertex_stream[3] != 0)
			continue;

		/* Only these semantics are exported as PARAMs; the rest
		 * (position, psize, ...) go through position exports.
		 */
		switch (semantic_name) {
		case TGSI_SEMANTIC_LAYER:
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
		case TGSI_SEMANTIC_CLIPDIST:
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			break;
		default:
			continue;
		}

		/* Skip outputs the key marked as killed (not read by the
		 * fragment shader).
		 */
		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
		     semantic_index < SI_MAX_IO_GENERIC) &&
		    shader->key.opt.kill_outputs &
		    (1ull << si_shader_io_get_unique_index(semantic_name,
							   semantic_index, true)))
			continue;

		si_export_param(ctx, param_count, outputs[i].values);

		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[i] = param_count++;
	}

	shader->info.nr_param_exports = param_count;
}
837
/**
 * Vertex color clamping.
 *
 * This uses a state constant loaded in a user data SGPR and
 * an IF statement is added that clamps all colors if the constant
 * is true.
 *
 * \param ctx      shader context
 * \param outputs  shader outputs; COLOR/BCOLOR values are replaced in place
 * \param noutput  number of entries in @p outputs
 */
static void si_vertex_color_clamping(struct si_shader_context *ctx,
				     struct si_shader_output_values *outputs,
				     unsigned noutput)
{
	/* Alloca slots so the clamped values can be written inside the
	 * conditional block and read back afterwards.
	 */
	LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
	bool has_colors = false;

	/* Store original colors to alloca variables. */
	for (unsigned i = 0; i < noutput; i++) {
		if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
		    outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
			continue;

		for (unsigned j = 0; j < 4; j++) {
			addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->f32, "");
			LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
		}
		has_colors = true;
	}

	/* Nothing to clamp if the shader writes no colors. */
	if (!has_colors)
		return;

	/* The state is in the first bit of the user SGPR. */
	LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
	cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->i1, "");

	ac_build_ifcc(&ctx->ac, cond, 6502);

	/* Store clamped colors to alloca variables within the conditional block. */
	for (unsigned i = 0; i < noutput; i++) {
		if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
		    outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
			continue;

		for (unsigned j = 0; j < 4; j++) {
			LLVMBuildStore(ctx->ac.builder,
				       ac_build_clamp(&ctx->ac, outputs[i].values[j]),
				       addr[i][j]);
		}
	}
	ac_build_endif(&ctx->ac, 6502);

	/* Load clamped colors */
	for (unsigned i = 0; i < noutput; i++) {
		if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
		    outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
			continue;

		for (unsigned j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
		}
	}
}
900
901 /* Generate export instructions for hardware VS shader stage or NGG GS stage
902 * (position and parameter data only).
903 */
904 void si_llvm_export_vs(struct si_shader_context *ctx,
905 struct si_shader_output_values *outputs,
906 unsigned noutput)
907 {
908 struct si_shader *shader = ctx->shader;
909 struct ac_export_args pos_args[4] = {};
910 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
911 unsigned pos_idx;
912 int i;
913
914 si_vertex_color_clamping(ctx, outputs, noutput);
915
916 /* Build position exports. */
917 for (i = 0; i < noutput; i++) {
918 switch (outputs[i].semantic_name) {
919 case TGSI_SEMANTIC_POSITION:
920 si_llvm_init_vs_export_args(ctx, outputs[i].values,
921 V_008DFC_SQ_EXP_POS, &pos_args[0]);
922 break;
923 case TGSI_SEMANTIC_PSIZE:
924 psize_value = outputs[i].values[0];
925 break;
926 case TGSI_SEMANTIC_LAYER:
927 layer_value = outputs[i].values[0];
928 break;
929 case TGSI_SEMANTIC_VIEWPORT_INDEX:
930 viewport_index_value = outputs[i].values[0];
931 break;
932 case TGSI_SEMANTIC_EDGEFLAG:
933 edgeflag_value = outputs[i].values[0];
934 break;
935 case TGSI_SEMANTIC_CLIPDIST:
936 if (!shader->key.opt.clip_disable) {
937 unsigned index = 2 + outputs[i].semantic_index;
938 si_llvm_init_vs_export_args(ctx, outputs[i].values,
939 V_008DFC_SQ_EXP_POS + index,
940 &pos_args[index]);
941 }
942 break;
943 case TGSI_SEMANTIC_CLIPVERTEX:
944 if (!shader->key.opt.clip_disable) {
945 si_llvm_emit_clipvertex(ctx, pos_args,
946 outputs[i].values);
947 }
948 break;
949 }
950 }
951
952 /* We need to add the position output manually if it's missing. */
953 if (!pos_args[0].out[0]) {
954 pos_args[0].enabled_channels = 0xf; /* writemask */
955 pos_args[0].valid_mask = 0; /* EXEC mask */
956 pos_args[0].done = 0; /* last export? */
957 pos_args[0].target = V_008DFC_SQ_EXP_POS;
958 pos_args[0].compr = 0; /* COMPR flag */
959 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
960 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
961 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
962 pos_args[0].out[3] = ctx->ac.f32_1; /* W */
963 }
964
965 bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
966 !shader->key.as_ngg;
967
968 /* Write the misc vector (point size, edgeflag, layer, viewport). */
969 if (shader->selector->info.writes_psize ||
970 pos_writes_edgeflag ||
971 shader->selector->info.writes_viewport_index ||
972 shader->selector->info.writes_layer) {
973 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
974 (pos_writes_edgeflag << 1) |
975 (shader->selector->info.writes_layer << 2);
976
977 pos_args[1].valid_mask = 0; /* EXEC mask */
978 pos_args[1].done = 0; /* last export? */
979 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
980 pos_args[1].compr = 0; /* COMPR flag */
981 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
982 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
983 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
984 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
985
986 if (shader->selector->info.writes_psize)
987 pos_args[1].out[0] = psize_value;
988
989 if (pos_writes_edgeflag) {
990 /* The output is a float, but the hw expects an integer
991 * with the first bit containing the edge flag. */
992 edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
993 edgeflag_value,
994 ctx->i32, "");
995 edgeflag_value = ac_build_umin(&ctx->ac,
996 edgeflag_value,
997 ctx->i32_1);
998
999 /* The LLVM intrinsic expects a float. */
1000 pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
1001 }
1002
1003 if (ctx->screen->info.chip_class >= GFX9) {
1004 /* GFX9 has the layer in out.z[10:0] and the viewport
1005 * index in out.z[19:16].
1006 */
1007 if (shader->selector->info.writes_layer)
1008 pos_args[1].out[2] = layer_value;
1009
1010 if (shader->selector->info.writes_viewport_index) {
1011 LLVMValueRef v = viewport_index_value;
1012
1013 v = ac_to_integer(&ctx->ac, v);
1014 v = LLVMBuildShl(ctx->ac.builder, v,
1015 LLVMConstInt(ctx->i32, 16, 0), "");
1016 v = LLVMBuildOr(ctx->ac.builder, v,
1017 ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
1018 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
1019 pos_args[1].enabled_channels |= 1 << 2;
1020 }
1021 } else {
1022 if (shader->selector->info.writes_layer)
1023 pos_args[1].out[2] = layer_value;
1024
1025 if (shader->selector->info.writes_viewport_index) {
1026 pos_args[1].out[3] = viewport_index_value;
1027 pos_args[1].enabled_channels |= 1 << 3;
1028 }
1029 }
1030 }
1031
1032 for (i = 0; i < 4; i++)
1033 if (pos_args[i].out[0])
1034 shader->info.nr_pos_exports++;
1035
1036 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
1037 * Setting valid_mask=1 prevents it and has no other effect.
1038 */
1039 if (ctx->screen->info.family == CHIP_NAVI10 ||
1040 ctx->screen->info.family == CHIP_NAVI12 ||
1041 ctx->screen->info.family == CHIP_NAVI14)
1042 pos_args[0].valid_mask = 1;
1043
1044 pos_idx = 0;
1045 for (i = 0; i < 4; i++) {
1046 if (!pos_args[i].out[0])
1047 continue;
1048
1049 /* Specify the target we are exporting */
1050 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
1051
1052 if (pos_idx == shader->info.nr_pos_exports)
1053 /* Specify that this is the last export */
1054 pos_args[i].done = 1;
1055
1056 ac_build_export(&ctx->ac, &pos_args[i]);
1057 }
1058
1059 /* Build parameter exports. */
1060 si_build_param_exports(ctx, outputs, noutput);
1061 }
1062
/* Emit the end-of-main code for a hardware vertex shader (non-NGG path):
 * read every shader output back from its alloca slot in "addrs", run
 * legacy streamout if enabled, optionally append a PrimitiveID output,
 * and finally emit all position/parameter exports via si_llvm_export_vs.
 */
static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	/* The GS copy shader has its own export path. */
	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* "+ 1" reserves a slot for the PrimitiveID output that may be
	 * appended below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Load all output values from their temporaries. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		for (j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(ctx->ac.builder,
					      addrs[4 * i + j],
					      "");
			/* 2 bits per component select the vertex stream. */
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	/* Legacy (non-NGG) transform feedback. Note that "i" equals
	 * info->num_outputs at this point. */
	if (!ctx->screen->use_ngg_streamout &&
	    ctx->shader->selector->so.num_outputs)
		si_llvm_emit_streamout(ctx, outputs, i, 0);

	/* Export PrimitiveID. */
	if (ctx->shader->key.mono.u.vs_export_prim_id) {
		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
		outputs[i].semantic_index = 0;
		outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
		for (j = 1; j < 4; j++)
			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);

		memset(outputs[i].vertex_stream, 0,
		       sizeof(outputs[i].vertex_stream));
		i++;
	}

	si_llvm_export_vs(ctx, outputs, i);
	FREE(outputs);
}
1111
/* End-of-main code for a VS compiled as a primitive discard compute shader:
 * instead of exporting, pass the 4 position components back to the caller
 * through the function's return value (the first 4 return VGPRs declared in
 * si_create_function for the vs_as_prim_discard_cs case).
 */
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
						  unsigned max_outputs,
						  LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader_info *info = &ctx->shader->selector->info;
	LLVMValueRef pos[4] = {};

	assert(info->num_outputs <= max_outputs);

	/* Find the POSITION output and load its 4 components. */
	for (unsigned i = 0; i < info->num_outputs; i++) {
		if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
			continue;

		for (unsigned chan = 0; chan < 4; chan++)
			pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
		break;
	}
	/* A position output must exist for this path. */
	assert(pos[0] != NULL);

	/* Return the position output. */
	LLVMValueRef ret = ctx->return_value;
	for (unsigned chan = 0; chan < 4; chan++)
		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
	ctx->return_value = ret;
}
1138
/* Declare the SGPR inputs used by legacy (non-NGG) transform feedback.
 * The declaration order defines the hardware input layout, so it must not
 * be changed.
 */
static void declare_streamout_params(struct si_shader_context *ctx,
				     struct pipe_stream_output_info *so)
{
	if (ctx->screen->use_ngg_streamout) {
		/* NGG streamout needs none of the legacy SGPRs; TES still
		 * declares one unused SGPR, presumably to keep the SGPR
		 * layout consistent with the non-streamout TES layout —
		 * TODO confirm against the TES argument list below. */
		if (ctx->type == PIPE_SHADER_TESS_EVAL)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		return;
	}

	/* Streamout SGPRs. */
	if (so->num_outputs) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_config);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_write_index);
	} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
	}

	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (int i = 0; i < 4; i++) {
		if (!so->stride[i])
			continue;

		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->streamout_offset[i]);
	}
}
1164
1165 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
1166 {
1167 switch (shader->selector->type) {
1168 case PIPE_SHADER_VERTEX:
1169 case PIPE_SHADER_TESS_EVAL:
1170 return shader->key.as_ngg ? 128 : 0;
1171
1172 case PIPE_SHADER_TESS_CTRL:
1173 /* Return this so that LLVM doesn't remove s_barrier
1174 * instructions on chips where we use s_barrier. */
1175 return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
1176
1177 case PIPE_SHADER_GEOMETRY:
1178 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
1179
1180 case PIPE_SHADER_COMPUTE:
1181 break; /* see below */
1182
1183 default:
1184 return 0;
1185 }
1186
1187 const unsigned *properties = shader->selector->info.properties;
1188 unsigned max_work_group_size =
1189 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
1190 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
1191 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
1192
1193 if (!max_work_group_size) {
1194 /* This is a variable group size compute shader,
1195 * compile it for the maximum possible group size.
1196 */
1197 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
1198 }
1199 return max_work_group_size;
1200 }
1201
1202 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
1203 bool assign_params)
1204 {
1205 enum ac_arg_type const_shader_buf_type;
1206
1207 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
1208 ctx->shader->selector->info.shader_buffers_declared == 0)
1209 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
1210 else
1211 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
1212
1213 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
1214 assign_params ? &ctx->const_and_shader_buffers :
1215 &ctx->other_const_and_shader_buffers);
1216 }
1217
1218 static void declare_samplers_and_images(struct si_shader_context *ctx,
1219 bool assign_params)
1220 {
1221 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
1222 assign_params ? &ctx->samplers_and_images :
1223 &ctx->other_samplers_and_images);
1224 }
1225
/* Declare the two per-stage descriptor SGPRs (buffers, then samplers/
 * images). In merged shaders this is called once per half; assign_params
 * is true only for the half matching ctx->type, so the other half's
 * pointers land in the "other_*" fields.
 */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
					    bool assign_params)
{
	declare_const_and_shader_buffers(ctx, assign_params);
	declare_samplers_and_images(ctx, assign_params);
}
1232
/* Declare the descriptor SGPRs common to all stages: the internal RW
 * buffer descriptor array and the bindless samplers/images pointer.
 */
static void declare_global_desc_pointers(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
		   &ctx->rw_buffers);
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
		   &ctx->bindless_samplers_and_images);
}
1240
/* Declare the VS-specific input SGPRs. The GS copy shader only needs the
 * VS state bits; real vertex shaders additionally receive base vertex,
 * start instance, and draw ID.
 */
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
	if (!ctx->shader->is_gs_copy_shader) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
	}
}
1250
/* Declare the vertex-buffer descriptor inputs: a pointer to the VB
 * descriptor list, plus optionally some descriptors passed inline in user
 * SGPRs (4 SGPRs each), padded so the first inline descriptor starts at
 * SI_SGPR_VS_VB_DESCRIPTOR_FIRST.
 */
static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->vertex_buffers);

	unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
	if (num_vbos_in_user_sgprs) {
		unsigned user_sgprs = ctx->args.num_sgprs_used;

		/* Merged shaders have 8 system SGPRs before the user SGPRs,
		 * which don't count toward the user SGPR budget. */
		if (si_is_merged_shader(ctx))
			user_sgprs -= 8;
		assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

		/* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
		for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

		assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
		for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
	}
}
1272
/* Declare the VS input VGPRs. The layout after VertexID depends on
 * whether the VS runs as LS (before tessellation) and on the chip
 * generation (GFX10 reorders the system VGPRs). Vertex load indices are
 * virtual inputs filled in by the VS prolog, so they are counted in
 * *num_prolog_vgprs rather than in the API shader's input count.
 */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   unsigned *num_prolog_vgprs)
{
	struct si_shader *shader = ctx->shader;

	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
	if (shader->key.as_ls) {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->rel_auto_id);
		if (ctx->screen->info.chip_class >= GFX10) {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
		} else {
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
		}
	} else if (ctx->screen->info.chip_class >= GFX10) {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
			   &ctx->vs_prim_id); /* user vgpr or PrimID (legacy) */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
	} else {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vs_prim_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
	}

	if (!shader->is_gs_copy_shader) {
		/* Vertex load indices. */
		if (shader->selector->info.num_inputs) {
			/* Only the first index gets a named arg; the rest
			 * are anonymous but must still be declared. */
			ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->vertex_index0);
			for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
				ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
		}
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
1310
/* Declare the SGPR inputs of an internal blit vertex shader, which gets
 * its "vertex data" entirely from SGPRs instead of vertex buffers. The
 * first three SGPRs (packed rect coordinates + depth) are always present;
 * the rest depend on whether the blit carries colors or texcoords.
 */
static void declare_vs_blit_inputs(struct si_shader_context *ctx,
				   unsigned vs_blit_property)
{
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
		   &ctx->vs_blit_inputs); /* i16 x1, y1 */
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
	ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */

	if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
	} else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
	}
}
1333
/* Declare the TES input VGPRs: the (u, v) tessellation coordinates, the
 * relative patch ID within the threadgroup, and the patch ID.
 */
static void declare_tes_input_vgprs(struct si_shader_context *ctx)
{
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
	ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
}
1341
/* Pseudo shader types used by si_create_function to select the merged
 * (GFX9+) argument layouts. They start after the real PIPE_SHADER_*
 * values so they never collide with them.
 */
enum {
	/* Convenient merged shader definitions. */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
1347
/* Like ac_add_arg, but asserts that the argument lands at the expected
 * index "idx". Used for PS inputs whose locations are fixed by the
 * SI_PARAM_* definitions.
 */
void si_add_arg_checked(struct ac_shader_args *args,
			enum ac_arg_regfile file,
			unsigned registers, enum ac_arg_type type,
			struct ac_arg *arg,
			unsigned idx)
{
	assert(args->arg_count == idx);
	ac_add_arg(args, file, registers, type, arg);
}
1357
/* Declare the full input/output ABI of the shader's "main" LLVM function
 * for the current stage and create the function.
 *
 * The ac_add_arg / si_add_arg_checked call order defines the hardware
 * register layout, so it must not be changed. "returns" collects the
 * types of values passed through registers to the next merged-shader half
 * or to an epilog part. Also declares LDS where the stage needs it and
 * records the input SGPR/VGPR counts in shader->info.
 */
void si_create_function(struct si_shader_context *ctx)
{
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef returns[AC_MAX_ARGS];
	unsigned i, num_return_sgprs;
	unsigned num_returns = 0;
	unsigned num_prolog_vgprs = 0;
	unsigned type = ctx->type;
	unsigned vs_blit_property =
		shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];

	memset(&ctx->args, 0, sizeof(ctx->args));

	/* Set MERGED shaders. */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
			type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
		else if (shader->key.as_es || shader->key.as_ngg || type == PIPE_SHADER_GEOMETRY)
			type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
	}

	switch (type) {
	case PIPE_SHADER_VERTEX:
		declare_global_desc_pointers(ctx);

		if (vs_blit_property) {
			/* Blit shaders take all inputs from SGPRs; they have
			 * no vertex buffers or streamout. */
			declare_vs_blit_inputs(ctx, vs_blit_property);

			/* VGPRs */
			declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
			break;
		}

		declare_per_stage_desc_pointers(ctx, true);
		declare_vs_specific_input_sgprs(ctx);
		if (!shader->is_gs_copy_shader)
			declare_vb_descriptor_input_sgprs(ctx);

		if (shader->key.as_es) {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
				   &ctx->es2gs_offset);
		} else if (shader->key.as_ls) {
			/* no extra parameters */
		} else {
			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so);
		}

		/* VGPRs */
		declare_vs_input_vgprs(ctx, &num_prolog_vgprs);

		/* Return values */
		if (shader->key.opt.vs_as_prim_discard_cs) {
			/* The 4 position components returned by
			 * si_llvm_emit_prim_discard_cs_epilogue. */
			for (i = 0; i < 4; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);

		/* VGPRs */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);

		/* param_tcs_offchip_offset and param_tcs_factor_offset are
		 * placed after the user SGPRs.
		 */
		for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
			returns[num_returns++] = ctx->i32; /* SGPRs */
		for (i = 0; i < 11; i++)
			returns[num_returns++] = ctx->f32; /* VGPRs */
		break;

	case SI_SHADER_MERGED_VERTEX_TESSCTRL:
		/* Merged stages have 8 system SGPRs at the beginning. */
		/* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
		declare_per_stage_desc_pointers(ctx,
						ctx->type == PIPE_SHADER_TESS_CTRL);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_factor_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx,
						ctx->type == PIPE_SHADER_VERTEX);
		declare_vs_specific_input_sgprs(ctx);

		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
		declare_vb_descriptor_input_sgprs(ctx);

		/* VGPRs (first TCS, then VS) */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, &num_prolog_vgprs);

			/* LS return values are inputs to the TCS main shader part. */
			for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 2; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		} else {
			/* TCS return values are inputs to the TCS epilog.
			 *
			 * param_tcs_offchip_offset, param_tcs_factor_offset,
			 * param_tcs_offchip_layout, and param_rw_buffers
			 * should be passed to the epilog.
			 */
			for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 11; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
		/* Merged stages have 8 system SGPRs at the beginning. */
		/* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
		declare_per_stage_desc_pointers(ctx,
						ctx->type == PIPE_SHADER_GEOMETRY);

		if (ctx->shader->key.as_ngg)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_tg_info);
		else
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);

		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_wave_info);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->merged_scratch_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */

		declare_global_desc_pointers(ctx);
		if (ctx->type != PIPE_SHADER_VERTEX || !vs_blit_property) {
			declare_per_stage_desc_pointers(ctx,
							(ctx->type == PIPE_SHADER_VERTEX ||
							 ctx->type == PIPE_SHADER_TESS_EVAL));
		}

		if (ctx->type == PIPE_SHADER_VERTEX) {
			if (vs_blit_property)
				declare_vs_blit_inputs(ctx, vs_blit_property);
			else
				declare_vs_specific_input_sgprs(ctx);
		} else {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
			/* Declare as many input SGPRs as the VS has. */
		}

		if (ctx->type == PIPE_SHADER_VERTEX)
			declare_vb_descriptor_input_sgprs(ctx);

		/* VGPRs (first GS, then VS/TES) */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
			declare_tes_input_vgprs(ctx);
		}

		if (ctx->shader->key.as_es &&
		    (ctx->type == PIPE_SHADER_VERTEX ||
		     ctx->type == PIPE_SHADER_TESS_EVAL)) {
			unsigned num_user_sgprs;

			if (ctx->type == PIPE_SHADER_VERTEX)
				num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
			else
				num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;

			/* ES return values are inputs to GS. */
			for (i = 0; i < 8 + num_user_sgprs; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 5; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);

		if (shader->key.as_es) {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->es2gs_offset);
		} else {
			declare_streamout_params(ctx, &shader->selector->so);
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_offset);
		}

		/* VGPRs */
		declare_tes_input_vgprs(ctx);
		break;

	case PIPE_SHADER_GEOMETRY:
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs2vs_offset);
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_wave_id);

		/* VGPRs */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[0]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[1]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[2]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[3]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[4]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx_offset[5]);
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
		break;

	case PIPE_SHADER_FRAGMENT:
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		/* PS inputs have fixed locations (SI_PARAM_*); the checked
		 * variant asserts each one lands at its expected index. */
		si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL,
				   SI_PARAM_ALPHA_REF);
		si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
				   &ctx->args.prim_mask, SI_PARAM_PRIM_MASK);

		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
				   SI_PARAM_PERSP_SAMPLE);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.persp_center, SI_PARAM_PERSP_CENTER);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.persp_centroid, SI_PARAM_PERSP_CENTROID);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
				   NULL, SI_PARAM_PERSP_PULL_MODEL);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.linear_sample, SI_PARAM_LINEAR_SAMPLE);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.linear_center, SI_PARAM_LINEAR_CENTER);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT,
				   &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT,
				   NULL, SI_PARAM_LINE_STIPPLE_TEX);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.frag_pos[3], SI_PARAM_POS_W_FLOAT);
		/* Record where these system VGPRs ended up; the PS prolog
		 * needs their locations. */
		shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->args.front_face, SI_PARAM_FRONT_FACE);
		shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->args.ancillary, SI_PARAM_ANCILLARY);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT,
				   &ctx->args.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
		si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
				   &ctx->pos_fixed_pt, SI_PARAM_POS_FIXED_PT);

		/* Color inputs from the prolog. */
		if (shader->selector->info.colors_read) {
			unsigned num_color_elements =
				util_bitcount(shader->selector->info.colors_read);

			for (i = 0; i < num_color_elements; i++)
				ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

			num_prolog_vgprs += num_color_elements;
		}

		/* Outputs for the epilog. */
		num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
		num_returns =
			num_return_sgprs +
			util_bitcount(shader->selector->info.colors_written) * 4 +
			shader->selector->info.writes_z +
			shader->selector->info.writes_stencil +
			shader->selector->info.writes_samplemask +
			1 /* SampleMaskIn */;

		num_returns = MAX2(num_returns,
				   num_return_sgprs +
				   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

		for (i = 0; i < num_return_sgprs; i++)
			returns[i] = ctx->i32;
		for (; i < num_returns; i++)
			returns[i] = ctx->f32;
		break;

	case PIPE_SHADER_COMPUTE:
		declare_global_desc_pointers(ctx);
		declare_per_stage_desc_pointers(ctx, true);
		if (shader->selector->info.uses_grid_size)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT,
				   &ctx->args.num_work_groups);
		if (shader->selector->info.uses_block_size &&
		    shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->block_size);

		unsigned cs_user_data_dwords =
			shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD];
		if (cs_user_data_dwords) {
			ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT,
				   &ctx->cs_user_data);
		}

		/* Hardware SGPRs. */
		for (i = 0; i < 3; i++) {
			if (shader->selector->info.uses_block_id[i]) {
				ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
					   &ctx->args.workgroup_ids[i]);
			}
		}
		if (shader->selector->info.uses_subgroup_info)
			ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);

		/* Hardware VGPRs. */
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT,
			   &ctx->args.local_invocation_ids);
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	si_llvm_create_func(ctx, "main", returns, num_returns,
			    si_get_max_workgroup_size(shader));

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
		ac_llvm_add_target_dep_function_attr(ctx->main_fn,
						     "InitialPSInputAddr",
						     S_0286D0_PERSP_SAMPLE_ENA(1) |
						     S_0286D0_PERSP_CENTER_ENA(1) |
						     S_0286D0_PERSP_CENTROID_ENA(1) |
						     S_0286D0_LINEAR_SAMPLE_ENA(1) |
						     S_0286D0_LINEAR_CENTER_ENA(1) |
						     S_0286D0_LINEAR_CENTROID_ENA(1) |
						     S_0286D0_FRONT_FACE_ENA(1) |
						     S_0286D0_ANCILLARY_ENA(1) |
						     S_0286D0_POS_FIXED_PT_ENA(1));
	}

	shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
	shader->info.num_input_vgprs = ctx->args.num_vgprs_used;

	/* Prolog-provided VGPRs are not inputs of the API shader part. */
	assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
	shader->info.num_input_vgprs -= num_prolog_vgprs;

	if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (USE_LDS_SYMBOLS && LLVM_VERSION_MAJOR >= 9) {
			/* The LSHS size is not known until draw time, so we append it
			 * at the end of whatever LDS use there may be in the rest of
			 * the shader (currently none, unless LLVM decides to do its
			 * own LDS-based lowering).
			 */
			ctx->ac.lds = LLVMAddGlobalInAddressSpace(
				ctx->ac.module, LLVMArrayType(ctx->i32, 0),
				"__lds_end", AC_ADDR_SPACE_LDS);
			LLVMSetAlignment(ctx->ac.lds, 256);
		} else {
			ac_declare_lds_as_pointer(&ctx->ac);
		}
	}

	/* Unlike radv, we override these arguments in the prolog, so to the
	 * API shader they appear as normal arguments.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args.vertex_id);
		ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args.instance_id);
	} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
		ctx->abi.persp_centroid = ac_get_arg(&ctx->ac, ctx->args.persp_centroid);
		ctx->abi.linear_centroid = ac_get_arg(&ctx->ac, ctx->args.linear_centroid);
	}
}
1755
/* For the UMR disassembler. */
/* NOTE(review): presumably these markers are appended after the shader
 * binary so tools can find the end of the code — the usage site is
 * outside this chunk; confirm before relying on this. */
#define DEBUGGER_END_OF_CODE_MARKER	0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS	5
1759
/* Open all ELF parts of a shader (prolog, previous merged stage, prolog2,
 * main part, epilog) as one ac_rtld binary, declaring the shared LDS
 * symbols needed on GFX9+ for GS/NGG. Also derives config.lds_size from
 * the linked LDS usage. Returns the result of ac_rtld_open; on success the
 * caller owns "rtld" and must release it with ac_rtld_close.
 */
static bool si_shader_binary_open(struct si_screen *screen,
				  struct si_shader *shader,
				  struct ac_rtld_binary *rtld)
{
	const struct si_shader_selector *sel = shader->selector;
	const char *part_elfs[5];
	size_t part_sizes[5];
	unsigned num_parts = 0;

/* Append one shader part's ELF to the list if the part exists. */
#define add_part(shader_or_part) \
	if (shader_or_part) { \
		part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
		part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
		num_parts++; \
	}

	add_part(shader->prolog);
	add_part(shader->previous_stage);
	add_part(shader->prolog2);
	add_part(shader);
	add_part(shader->epilog);

#undef add_part

	struct ac_rtld_symbol lds_symbols[2];
	unsigned num_lds_symbols = 0;

	/* ESGS ring in LDS for merged GS / NGG shaders on GFX9+. */
	if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
	    (sel->type == PIPE_SHADER_GEOMETRY || shader->key.as_ngg)) {
		/* We add this symbol even on LLVM <= 8 to ensure that
		 * shader->config.lds_size is set correctly below.
		 */
		struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
		sym->name = "esgs_ring";
		sym->size = shader->gs_info.esgs_ring_size;
		sym->align = 64 * 1024;
	}

	/* NGG GS emit area in LDS (size is stored in dwords). */
	if (shader->key.as_ngg && sel->type == PIPE_SHADER_GEOMETRY) {
		struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
		sym->name = "ngg_emit";
		sym->size = shader->ngg.ngg_emit_size * 4;
		sym->align = 4;
	}

	bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){
			.info = &screen->info,
			.options = {
				.halt_at_entry = screen->options.halt_shaders,
			},
			.shader_type = tgsi_processor_to_shader_stage(sel->type),
			.wave_size = si_get_shader_wave_size(shader),
			.num_parts = num_parts,
			.elf_ptrs = part_elfs,
			.elf_sizes = part_sizes,
			.num_shared_lds_symbols = num_lds_symbols,
			.shared_lds_symbols = lds_symbols });

	if (rtld->lds_size > 0) {
		/* LDS is allocated in 512-dword granules on GFX7+,
		 * 256-dword granules before; lds_size is in granules. */
		unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
		shader->config.lds_size =
			align(rtld->lds_size, alloc_granularity) / alloc_granularity;
	}

	return ok;
}
1826
1827 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
1828 {
1829 struct ac_rtld_binary rtld;
1830 si_shader_binary_open(screen, shader, &rtld);
1831 return rtld.exec_size;
1832 }
1833
1834 static bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
1835 {
1836 uint64_t *scratch_va = data;
1837
1838 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
1839 *value = (uint32_t)*scratch_va;
1840 return true;
1841 }
1842 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
1843 /* Enable scratch coalescing. */
1844 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) |
1845 S_008F04_SWIZZLE_ENABLE(1);
1846 return true;
1847 }
1848
1849 return false;
1850 }
1851
1852 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1853 uint64_t scratch_va)
1854 {
1855 struct ac_rtld_binary binary;
1856 if (!si_shader_binary_open(sscreen, shader, &binary))
1857 return false;
1858
1859 si_resource_reference(&shader->bo, NULL);
1860 shader->bo = si_aligned_buffer_create(&sscreen->b,
1861 sscreen->info.cpdma_prefetch_writes_memory ?
1862 0 : SI_RESOURCE_FLAG_READ_ONLY,
1863 PIPE_USAGE_IMMUTABLE,
1864 align(binary.rx_size, SI_CPDMA_ALIGNMENT),
1865 256);
1866 if (!shader->bo)
1867 return false;
1868
1869 /* Upload. */
1870 struct ac_rtld_upload_info u = {};
1871 u.binary = &binary;
1872 u.get_external_symbol = si_get_external_symbol;
1873 u.cb_data = &scratch_va;
1874 u.rx_va = shader->bo->gpu_address;
1875 u.rx_ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
1876 PIPE_TRANSFER_READ_WRITE |
1877 PIPE_TRANSFER_UNSYNCHRONIZED |
1878 RADEON_TRANSFER_TEMPORARY);
1879 if (!u.rx_ptr)
1880 return false;
1881
1882 bool ok = ac_rtld_upload(&u);
1883
1884 sscreen->ws->buffer_unmap(shader->bo->buf);
1885 ac_rtld_close(&binary);
1886
1887 return ok;
1888 }
1889
1890 static void si_shader_dump_disassembly(struct si_screen *screen,
1891 const struct si_shader_binary *binary,
1892 enum pipe_shader_type shader_type,
1893 unsigned wave_size,
1894 struct pipe_debug_callback *debug,
1895 const char *name, FILE *file)
1896 {
1897 struct ac_rtld_binary rtld_binary;
1898
1899 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
1900 .info = &screen->info,
1901 .shader_type = tgsi_processor_to_shader_stage(shader_type),
1902 .wave_size = wave_size,
1903 .num_parts = 1,
1904 .elf_ptrs = &binary->elf_buffer,
1905 .elf_sizes = &binary->elf_size }))
1906 return;
1907
1908 const char *disasm;
1909 size_t nbytes;
1910
1911 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
1912 goto out;
1913
1914 if (nbytes > INT_MAX)
1915 goto out;
1916
1917 if (debug && debug->debug_message) {
1918 /* Very long debug messages are cut off, so send the
1919 * disassembly one line at a time. This causes more
1920 * overhead, but on the plus side it simplifies
1921 * parsing of resulting logs.
1922 */
1923 pipe_debug_message(debug, SHADER_INFO,
1924 "Shader Disassembly Begin");
1925
1926 uint64_t line = 0;
1927 while (line < nbytes) {
1928 int count = nbytes - line;
1929 const char *nl = memchr(disasm + line, '\n', nbytes - line);
1930 if (nl)
1931 count = nl - (disasm + line);
1932
1933 if (count) {
1934 pipe_debug_message(debug, SHADER_INFO,
1935 "%.*s", count, disasm + line);
1936 }
1937
1938 line += count + 1;
1939 }
1940
1941 pipe_debug_message(debug, SHADER_INFO,
1942 "Shader Disassembly End");
1943 }
1944
1945 if (file) {
1946 fprintf(file, "Shader %s disassembly:\n", name);
1947 fprintf(file, "%*s", (int)nbytes, disasm);
1948 }
1949
1950 out:
1951 ac_rtld_close(&rtld_binary);
1952 }
1953
/* Estimate how many waves of this shader can be resident on one SIMD at a
 * time, limited by SGPR count, VGPR count and LDS usage, and store the
 * result in shader->info.max_simd_waves.
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
	struct si_screen *sscreen = shader->selector->screen;
	struct ac_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector->info.num_inputs;
	/* LDS allocation granularity: 512 on GFX7+, 256 before. */
	unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves;

	max_simd_waves = sscreen->info.max_wave64_per_simd;

	/* Compute LDS usage for PS. */
	switch (shader->selector->type) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			/* Spread the workgroup's LDS across its waves. */
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size,
						    sscreen->compute_wave_size);
		}
		break;
	default:;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		max_simd_waves =
			MIN2(max_simd_waves,
			     sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
	}

	if (conf->num_vgprs) {
		/* Always print wave limits as Wave64, so that we can compare
		 * Wave32 and Wave64 with shader-db fairly. */
		unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
		max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
	}

	/* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied).
	 *
	 * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
	 */
	unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
	unsigned max_lds_per_simd = max_lds_size / 4;
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

	shader->info.max_simd_waves = max_simd_waves;
}
2019
2020 void si_shader_dump_stats_for_shader_db(struct si_screen *screen,
2021 struct si_shader *shader,
2022 struct pipe_debug_callback *debug)
2023 {
2024 const struct ac_shader_config *conf = &shader->config;
2025
2026 if (screen->options.debug_disassembly)
2027 si_shader_dump_disassembly(screen, &shader->binary,
2028 shader->selector->type,
2029 si_get_shader_wave_size(shader),
2030 debug, "main", NULL);
2031
2032 pipe_debug_message(debug, SHADER_INFO,
2033 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
2034 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
2035 "Spilled VGPRs: %d PrivMem VGPRs: %d",
2036 conf->num_sgprs, conf->num_vgprs,
2037 si_get_shader_binary_size(screen, shader),
2038 conf->lds_size, conf->scratch_bytes_per_wave,
2039 shader->info.max_simd_waves, conf->spilled_sgprs,
2040 conf->spilled_vgprs, shader->info.private_mem_vgprs);
2041 }
2042
2043 static void si_shader_dump_stats(struct si_screen *sscreen,
2044 struct si_shader *shader,
2045 FILE *file,
2046 bool check_debug_option)
2047 {
2048 const struct ac_shader_config *conf = &shader->config;
2049
2050 if (!check_debug_option ||
2051 si_can_dump_shader(sscreen, shader->selector->type)) {
2052 if (shader->selector->type == PIPE_SHADER_FRAGMENT) {
2053 fprintf(file, "*** SHADER CONFIG ***\n"
2054 "SPI_PS_INPUT_ADDR = 0x%04x\n"
2055 "SPI_PS_INPUT_ENA = 0x%04x\n",
2056 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
2057 }
2058
2059 fprintf(file, "*** SHADER STATS ***\n"
2060 "SGPRS: %d\n"
2061 "VGPRS: %d\n"
2062 "Spilled SGPRs: %d\n"
2063 "Spilled VGPRs: %d\n"
2064 "Private memory VGPRs: %d\n"
2065 "Code Size: %d bytes\n"
2066 "LDS: %d blocks\n"
2067 "Scratch: %d bytes per wave\n"
2068 "Max Waves: %d\n"
2069 "********************\n\n\n",
2070 conf->num_sgprs, conf->num_vgprs,
2071 conf->spilled_sgprs, conf->spilled_vgprs,
2072 shader->info.private_mem_vgprs,
2073 si_get_shader_binary_size(sscreen, shader),
2074 conf->lds_size, conf->scratch_bytes_per_wave,
2075 shader->info.max_simd_waves);
2076 }
2077 }
2078
2079 const char *si_get_shader_name(const struct si_shader *shader)
2080 {
2081 switch (shader->selector->type) {
2082 case PIPE_SHADER_VERTEX:
2083 if (shader->key.as_es)
2084 return "Vertex Shader as ES";
2085 else if (shader->key.as_ls)
2086 return "Vertex Shader as LS";
2087 else if (shader->key.opt.vs_as_prim_discard_cs)
2088 return "Vertex Shader as Primitive Discard CS";
2089 else if (shader->key.as_ngg)
2090 return "Vertex Shader as ESGS";
2091 else
2092 return "Vertex Shader as VS";
2093 case PIPE_SHADER_TESS_CTRL:
2094 return "Tessellation Control Shader";
2095 case PIPE_SHADER_TESS_EVAL:
2096 if (shader->key.as_es)
2097 return "Tessellation Evaluation Shader as ES";
2098 else if (shader->key.as_ngg)
2099 return "Tessellation Evaluation Shader as ESGS";
2100 else
2101 return "Tessellation Evaluation Shader as VS";
2102 case PIPE_SHADER_GEOMETRY:
2103 if (shader->is_gs_copy_shader)
2104 return "GS Copy Shader as VS";
2105 else
2106 return "Geometry Shader";
2107 case PIPE_SHADER_FRAGMENT:
2108 return "Pixel Shader";
2109 case PIPE_SHADER_COMPUTE:
2110 return "Compute Shader";
2111 default:
2112 return "Unknown Shader";
2113 }
2114 }
2115
/* Dump a shader for debugging: key, recorded LLVM IR (if any), the
 * disassembly of every part (prolog, previous stage, prolog2, main,
 * epilog), and finally the statistics.
 *
 * \param check_debug_option  if true, each section is gated on the
 *                            corresponding radeonsi debug flags
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
		    struct pipe_debug_callback *debug,
		    FILE *file, bool check_debug_option)
{
	enum pipe_shader_type shader_type = shader->selector->type;

	if (!check_debug_option ||
	    si_can_dump_shader(sscreen, shader_type))
		si_dump_shader_key(shader, file);

	/* LLVM IR is only recorded when sscreen->record_llvm_ir is set
	 * (see si_compile_llvm). */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		if (shader->previous_stage &&
		    shader->previous_stage->binary.llvm_ir_string) {
			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
				si_get_shader_name(shader));
			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
		}

		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (si_can_dump_shader(sscreen, shader_type) &&
	     !(sscreen->debug_flags & DBG(NO_ASM)))) {
		unsigned wave_size = si_get_shader_wave_size(shader);

		fprintf(file, "\n%s:\n", si_get_shader_name(shader));

		/* Dump the parts in execution order. */
		if (shader->prolog)
			si_shader_dump_disassembly(sscreen, &shader->prolog->binary,
						   shader_type, wave_size, debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary,
						   shader_type, wave_size, debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(sscreen, &shader->prolog2->binary,
						   shader_type, wave_size, debug, "prolog2", file);

		si_shader_dump_disassembly(sscreen, &shader->binary, shader_type,
					   wave_size, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(sscreen, &shader->epilog->binary,
						   shader_type, wave_size, debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, file, check_debug_option);
}
2167
/* Compile an LLVM module to machine code and read back the shader config.
 *
 * \param binary          receives the ELF binary and, if recording is
 *                        enabled, the LLVM IR string
 * \param conf            receives the register/LDS/scratch configuration
 *                        read from the compiled binary
 * \param mod             the LLVM module to compile
 * \param less_optimized  forwarded to si_llvm_compile (selects the
 *                        less-optimizing compiler)
 * \return 0 on success, non-zero on failure
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct si_shader_binary *binary,
		    struct ac_shader_config *conf,
		    struct ac_llvm_compiler *compiler,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    enum pipe_shader_type shader_type,
		    unsigned wave_size,
		    const char *name,
		    bool less_optimized)
{
	unsigned count = p_atomic_inc_return(&sscreen->num_compilations);

	if (si_can_dump_shader(sscreen, shader_type)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			ac_dump_module(mod);
			fprintf(stderr, "\n");
		}
	}

	/* Keep a copy of the IR so si_shader_dump can print it later. */
	if (sscreen->record_llvm_ir) {
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* si_replace_shader may substitute a binary from disk for debugging;
	 * only compile when no replacement was loaded. */
	if (!si_replace_shader(count, binary)) {
		unsigned r = si_llvm_compile(mod, binary, compiler, debug,
					     less_optimized, wave_size);
		if (r)
			return r;
	}

	/* Read the config registers (SGPR/VGPR counts, LDS, scratch, ...)
	 * out of the compiled ELF. */
	struct ac_rtld_binary rtld;
	if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
			.info = &sscreen->info,
			.shader_type = tgsi_processor_to_shader_stage(shader_type),
			.wave_size = wave_size,
			.num_parts = 1,
			.elf_ptrs = &binary->elf_buffer,
			.elf_sizes = &binary->elf_size }))
		return -1;

	bool ok = ac_rtld_read_config(&rtld, conf);
	ac_rtld_close(&rtld);
	if (!ok)
		return -1;

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - GFX6 & GFX7 would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	return 0;
}
2235
2236 static void si_dump_shader_key_vs(const struct si_shader_key *key,
2237 const struct si_vs_prolog_bits *prolog,
2238 const char *prefix, FILE *f)
2239 {
2240 fprintf(f, " %s.instance_divisor_is_one = %u\n",
2241 prefix, prolog->instance_divisor_is_one);
2242 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
2243 prefix, prolog->instance_divisor_is_fetched);
2244 fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n",
2245 prefix, prolog->unpack_instance_id_from_vertex_id);
2246 fprintf(f, " %s.ls_vgpr_fix = %u\n",
2247 prefix, prolog->ls_vgpr_fix);
2248
2249 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
2250 fprintf(f, " mono.vs.fix_fetch = {");
2251 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
2252 union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
2253 if (i)
2254 fprintf(f, ", ");
2255 if (!fix.bits)
2256 fprintf(f, "0");
2257 else
2258 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size,
2259 fix.u.num_channels_m1, fix.u.format);
2260 }
2261 fprintf(f, "}\n");
2262 }
2263
/* Print the full shader key for debugging, selecting the relevant fields
 * per shader stage.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
	const struct si_shader_key *key = &shader->key;
	enum pipe_shader_type shader_type = shader->selector->type;

	fprintf(f, "SHADER KEY\n");

	switch (shader_type) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ls = %u\n", key->as_ls);
		fprintf(f, " as_ngg = %u\n", key->as_ngg);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		/* Primitive-discard compute shader variant state. */
		fprintf(f, " opt.vs_as_prim_discard_cs = %u\n",
			key->opt.vs_as_prim_discard_cs);
		fprintf(f, " opt.cs_prim_type = %s\n",
			tgsi_primitive_names[key->opt.cs_prim_type]);
		fprintf(f, " opt.cs_indexed = %u\n",
			key->opt.cs_indexed);
		fprintf(f, " opt.cs_instancing = %u\n",
			key->opt.cs_instancing);
		fprintf(f, " opt.cs_primitive_restart = %u\n",
			key->opt.cs_primitive_restart);
		fprintf(f, " opt.cs_provoking_vertex_first = %u\n",
			key->opt.cs_provoking_vertex_first);
		fprintf(f, " opt.cs_need_correct_orientation = %u\n",
			key->opt.cs_need_correct_orientation);
		fprintf(f, " opt.cs_cull_front = %u\n",
			key->opt.cs_cull_front);
		fprintf(f, " opt.cs_cull_back = %u\n",
			key->opt.cs_cull_back);
		fprintf(f, " opt.cs_cull_z = %u\n",
			key->opt.cs_cull_z);
		fprintf(f, " opt.cs_halfz_clip_space = %u\n",
			key->opt.cs_halfz_clip_space);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9, the VS is merged into the TCS, so it has an LS prolog. */
		if (shader->selector->screen->info.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ngg = %u\n", key->as_ngg);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9, a VS may be merged into the GS, giving it a VS prolog. */
		if (shader->selector->screen->info.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		fprintf(f, " part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
		fprintf(f, " as_ngg = %u\n", key->as_ngg);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
		fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
		fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
		fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
		fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
		break;

	default:
		assert(0);
	}

	/* Output-elimination state only applies to the last pre-raster stage. */
	if ((shader_type == PIPE_SHADER_GEOMETRY ||
	     shader_type == PIPE_SHADER_TESS_EVAL ||
	     shader_type == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
2374
2375 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
2376 {
2377 struct si_shader *shader = ctx->shader;
2378 struct si_shader_info *info = &shader->selector->info;
2379
2380 if ((ctx->type != PIPE_SHADER_VERTEX &&
2381 ctx->type != PIPE_SHADER_TESS_EVAL) ||
2382 shader->key.as_ls ||
2383 shader->key.as_es)
2384 return;
2385
2386 ac_optimize_vs_outputs(&ctx->ac,
2387 ctx->main_fn,
2388 shader->info.vs_output_param_offset,
2389 info->num_outputs,
2390 &shader->info.nr_param_exports);
2391 }
2392
2393 static void si_init_exec_from_input(struct si_shader_context *ctx,
2394 struct ac_arg param, unsigned bitoffset)
2395 {
2396 LLVMValueRef args[] = {
2397 ac_get_arg(&ctx->ac, param),
2398 LLVMConstInt(ctx->i32, bitoffset, 0),
2399 };
2400 ac_build_intrinsic(&ctx->ac,
2401 "llvm.amdgcn.init.exec.from.input",
2402 ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
2403 }
2404
2405 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
2406 const struct si_vs_prolog_bits *key)
2407 {
2408 /* VGPR initialization fixup for Vega10 and Raven is always done in the
2409 * VS prolog. */
2410 return sel->vs_needs_prolog ||
2411 key->ls_vgpr_fix ||
2412 key->unpack_instance_id_from_vertex_id;
2413 }
2414
/* Build the LLVM IR for the main part of a shader: set up the per-stage
 * ABI callbacks, create the function, declare rings/LDS objects, handle
 * GFX9 merged-shader EXEC setup, and translate the NIR body.
 *
 * \param free_nir  if true, "nir" is freed after translation
 * \return true on success
 */
static bool si_build_main_function(struct si_shader_context *ctx,
				   struct nir_shader *nir, bool free_nir)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;

	/* Pick the epilogue/output callbacks for the stage variant. */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		if (shader->key.as_ls)
			ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else if (shader->key.opt.vs_as_prim_discard_cs)
			ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
		else if (shader->key.as_ngg)
			ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		ctx->abi.load_base_vertex = get_base_vertex;
		break;
	case PIPE_SHADER_TESS_CTRL:
		si_llvm_init_tcs_callbacks(ctx);
		break;
	case PIPE_SHADER_TESS_EVAL:
		si_llvm_init_tes_callbacks(ctx);

		if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else if (shader->key.as_ngg)
			ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		si_llvm_init_gs_callbacks(ctx);
		break;
	case PIPE_SHADER_FRAGMENT:
		si_llvm_init_ps_callbacks(ctx);
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->abi.load_local_group_size = get_block_size;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	ctx->abi.load_ubo = load_ubo;
	ctx->abi.load_ssbo = load_ssbo;

	si_create_function(ctx);

	/* Preload ring descriptors used by the stage. */
	if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)
		si_preload_esgs_ring(ctx);

	if (ctx->type == PIPE_SHADER_GEOMETRY)
		si_preload_gs_rings(ctx);
	else if (ctx->type == PIPE_SHADER_TESS_EVAL)
		si_llvm_preload_tes_rings(ctx);

	if (ctx->type == PIPE_SHADER_TESS_CTRL &&
	    sel->info.tessfactors_are_def_in_all_invocs) {
		/* Allocas for the invocation-0 tess factors. */
		for (unsigned i = 0; i < 6; i++) {
			ctx->invoc0_tess_factors[i] =
				ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream vertex counters. */
		for (unsigned i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				ac_build_alloca(&ctx->ac, ctx->i32, "");
		}
		if (shader->key.as_ngg) {
			for (unsigned i = 0; i < 4; ++i) {
				ctx->gs_curprim_verts[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
				ctx->gs_generated_prims[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
			}

			/* Larger scratch area when streamout is used. */
			unsigned scratch_size = 8;
			if (sel->so.num_outputs)
				scratch_size = 44;

			LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);

			/* Size is resolved at link time (external linkage,
			 * zero-length array). */
			ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
			LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
			LLVMSetAlignment(ctx->gs_ngg_emit, 4);
		}
	}

	if (ctx->type != PIPE_SHADER_GEOMETRY &&
	    (shader->key.as_ngg && !shader->key.as_es)) {
		/* Unconditionally declare scratch space base for streamout and
		 * vertex compaction. Whether space is actually allocated is
		 * determined during linking / PM4 creation.
		 *
		 * Add an extra dword per vertex to ensure an odd stride, which
		 * avoids bank conflicts for SoA accesses.
		 */
		if (!gfx10_is_ngg_passthrough(shader))
			si_llvm_declare_esgs_ring(ctx);

		/* This is really only needed when streamout and / or vertex
		 * compaction is enabled.
		 */
		if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
			LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
		}
	}

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 *
	 * NGG vertex and tess eval shaders running as the last
	 * vertex/geometry stage handle execution explicitly using
	 * if-statements.
	 */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (!shader->is_monolithic &&
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
			si_init_exec_from_input(ctx,
						ctx->merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY ||
			   (shader->key.as_ngg && !shader->key.as_es)) {
			LLVMValueRef thread_enabled;
			bool nested_barrier;

			if (!shader->is_monolithic ||
			    (ctx->type == PIPE_SHADER_TESS_EVAL &&
			     (shader->key.as_ngg && !shader->key.as_es)))
				ac_init_exec_full_mask(&ctx->ac);

			if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			    ctx->type == PIPE_SHADER_GEOMETRY) {
				if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
					gfx10_ngg_gs_emit_prologue(ctx);
					nested_barrier = false;
				} else {
					nested_barrier = true;
				}

				thread_enabled = si_is_gs_thread(ctx);
			} else {
				thread_enabled = si_is_es_thread(ctx);
				nested_barrier = false;
			}

			ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
			ctx->merged_wrap_if_label = 11500;
			ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);

			if (nested_barrier) {
				/* Execute a barrier before the second shader in
				 * a merged shader.
				 *
				 * Execute the barrier inside the conditional block,
				 * so that empty waves can jump directly to s_endpgm,
				 * which will also signal the barrier.
				 *
				 * This is possible in gfx9, because an empty wave
				 * for the second shader does not participate in
				 * the epilogue. With NGG, empty waves may still
				 * be required to export data (e.g. GS output vertices),
				 * so we cannot let them exit early.
				 *
				 * If the shader is TCS and the TCS epilog is present
				 * and contains a barrier, it will wait there and then
				 * reach s_endpgm.
				 */
				si_llvm_emit_barrier(ctx);
			}
		}
	}

	if (sel->force_correct_derivs_after_kill) {
		ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
		/* true = don't kill. */
		LLVMBuildStore(ctx->ac.builder, ctx->i1true,
			       ctx->postponed_kill);
	}

	bool success = si_nir_build_llvm(ctx, nir);
	if (free_nir)
		ralloc_free(nir);
	if (!success) {
		fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
		return false;
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
2632
2633 /**
2634 * Compute the VS prolog key, which contains all the information needed to
2635 * build the VS prolog function, and set shader->info bits where needed.
2636 *
2637 * \param info Shader info of the vertex shader.
2638 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
2639 * \param prolog_key Key of the VS prolog
2640 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
2641 * \param key Output shader part key.
2642 */
2643 static void si_get_vs_prolog_key(const struct si_shader_info *info,
2644 unsigned num_input_sgprs,
2645 const struct si_vs_prolog_bits *prolog_key,
2646 struct si_shader *shader_out,
2647 union si_shader_part_key *key)
2648 {
2649 memset(key, 0, sizeof(*key));
2650 key->vs_prolog.states = *prolog_key;
2651 key->vs_prolog.num_input_sgprs = num_input_sgprs;
2652 key->vs_prolog.num_inputs = info->num_inputs;
2653 key->vs_prolog.as_ls = shader_out->key.as_ls;
2654 key->vs_prolog.as_es = shader_out->key.as_es;
2655 key->vs_prolog.as_ngg = shader_out->key.as_ngg;
2656
2657 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
2658 key->vs_prolog.as_ls = 1;
2659 key->vs_prolog.num_merged_next_stage_vgprs = 2;
2660 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
2661 key->vs_prolog.as_es = 1;
2662 key->vs_prolog.num_merged_next_stage_vgprs = 5;
2663 } else if (shader_out->key.as_ngg) {
2664 key->vs_prolog.num_merged_next_stage_vgprs = 5;
2665 }
2666
2667 /* Enable loading the InstanceID VGPR. */
2668 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
2669
2670 if ((key->vs_prolog.states.instance_divisor_is_one |
2671 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
2672 shader_out->info.uses_instanceid = true;
2673 }
2674
2675 /**
2676 * Given a list of shader part functions, build a wrapper function that
2677 * runs them in sequence to form a monolithic shader.
2678 */
2679 void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
2680 unsigned num_parts, unsigned main_part,
2681 unsigned next_shader_first_part)
2682 {
2683 LLVMBuilderRef builder = ctx->ac.builder;
2684 /* PS epilog has one arg per color component; gfx9 merged shader
2685 * prologs need to forward 40 SGPRs.
2686 */
2687 LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
2688 LLVMTypeRef function_type;
2689 unsigned num_first_params;
2690 unsigned num_out, initial_num_out;
2691 ASSERTED unsigned num_out_sgpr; /* used in debug checks */
2692 ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
2693 unsigned num_sgprs, num_vgprs;
2694 unsigned gprs;
2695
2696 memset(&ctx->args, 0, sizeof(ctx->args));
2697
2698 for (unsigned i = 0; i < num_parts; ++i) {
2699 ac_add_function_attr(ctx->ac.context, parts[i], -1,
2700 AC_FUNC_ATTR_ALWAYSINLINE);
2701 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
2702 }
2703
2704 /* The parameters of the wrapper function correspond to those of the
2705 * first part in terms of SGPRs and VGPRs, but we use the types of the
2706 * main part to get the right types. This is relevant for the
2707 * dereferenceable attribute on descriptor table pointers.
2708 */
2709 num_sgprs = 0;
2710 num_vgprs = 0;
2711
2712 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
2713 num_first_params = LLVMCountParamTypes(function_type);
2714
2715 for (unsigned i = 0; i < num_first_params; ++i) {
2716 LLVMValueRef param = LLVMGetParam(parts[0], i);
2717
2718 if (ac_is_sgpr_param(param)) {
2719 assert(num_vgprs == 0);
2720 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
2721 } else {
2722 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
2723 }
2724 }
2725
2726 gprs = 0;
2727 while (gprs < num_sgprs + num_vgprs) {
2728 LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
2729 LLVMTypeRef type = LLVMTypeOf(param);
2730 unsigned size = ac_get_type_size(type) / 4;
2731
2732 /* This is going to get casted anyways, so we don't have to
2733 * have the exact same type. But we do have to preserve the
2734 * pointer-ness so that LLVM knows about it.
2735 */
2736 enum ac_arg_type arg_type = AC_ARG_INT;
2737 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
2738 type = LLVMGetElementType(type);
2739
2740 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2741 if (LLVMGetVectorSize(type) == 4)
2742 arg_type = AC_ARG_CONST_DESC_PTR;
2743 else if (LLVMGetVectorSize(type) == 8)
2744 arg_type = AC_ARG_CONST_IMAGE_PTR;
2745 else
2746 assert(0);
2747 } else if (type == ctx->f32) {
2748 arg_type = AC_ARG_CONST_FLOAT_PTR;
2749 } else {
2750 assert(0);
2751 }
2752 }
2753
2754 ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
2755 size, arg_type, NULL);
2756
2757 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
2758 assert(gprs + size <= num_sgprs + num_vgprs &&
2759 (gprs >= num_sgprs || gprs + size <= num_sgprs));
2760
2761 gprs += size;
2762 }
2763
2764 /* Prepare the return type. */
2765 unsigned num_returns = 0;
2766 LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;
2767
2768 last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
2769 return_type = LLVMGetReturnType(last_func_type);
2770
2771 switch (LLVMGetTypeKind(return_type)) {
2772 case LLVMStructTypeKind:
2773 num_returns = LLVMCountStructElementTypes(return_type);
2774 assert(num_returns <= ARRAY_SIZE(returns));
2775 LLVMGetStructElementTypes(return_type, returns);
2776 break;
2777 case LLVMVoidTypeKind:
2778 break;
2779 default:
2780 unreachable("unexpected type");
2781 }
2782
2783 si_llvm_create_func(ctx, "wrapper", returns, num_returns,
2784 si_get_max_workgroup_size(ctx->shader));
2785
2786 if (si_is_merged_shader(ctx))
2787 ac_init_exec_full_mask(&ctx->ac);
2788
2789 /* Record the arguments of the function as if they were an output of
2790 * a previous part.
2791 */
2792 num_out = 0;
2793 num_out_sgpr = 0;
2794
2795 for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
2796 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
2797 LLVMTypeRef param_type = LLVMTypeOf(param);
2798 LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->i32 : ctx->f32;
2799 unsigned size = ac_get_type_size(param_type) / 4;
2800
2801 if (size == 1) {
2802 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
2803 param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
2804 param_type = ctx->i32;
2805 }
2806
2807 if (param_type != out_type)
2808 param = LLVMBuildBitCast(builder, param, out_type, "");
2809 out[num_out++] = param;
2810 } else {
2811 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
2812
2813 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
2814 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
2815 param_type = ctx->i64;
2816 }
2817
2818 if (param_type != vector_type)
2819 param = LLVMBuildBitCast(builder, param, vector_type, "");
2820
2821 for (unsigned j = 0; j < size; ++j)
2822 out[num_out++] = LLVMBuildExtractElement(
2823 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
2824 }
2825
2826 if (ctx->args.args[i].file == AC_ARG_SGPR)
2827 num_out_sgpr = num_out;
2828 }
2829
2830 memcpy(initial, out, sizeof(out));
2831 initial_num_out = num_out;
2832 initial_num_out_sgpr = num_out_sgpr;
2833
2834 /* Now chain the parts. */
2835 LLVMValueRef ret = NULL;
2836 for (unsigned part = 0; part < num_parts; ++part) {
2837 LLVMValueRef in[AC_MAX_ARGS];
2838 LLVMTypeRef ret_type;
2839 unsigned out_idx = 0;
2840 unsigned num_params = LLVMCountParams(parts[part]);
2841
2842 /* Merged shaders are executed conditionally depending
2843 * on the number of enabled threads passed in the input SGPRs. */
2844 if (is_multi_part_shader(ctx) && part == 0) {
2845 LLVMValueRef ena, count = initial[3];
2846
2847 count = LLVMBuildAnd(builder, count,
2848 LLVMConstInt(ctx->i32, 0x7f, 0), "");
2849 ena = LLVMBuildICmp(builder, LLVMIntULT,
2850 ac_get_thread_id(&ctx->ac), count, "");
2851 ac_build_ifcc(&ctx->ac, ena, 6506);
2852 }
2853
2854 /* Derive arguments for the next part from outputs of the
2855 * previous one.
2856 */
2857 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
2858 LLVMValueRef param;
2859 LLVMTypeRef param_type;
2860 bool is_sgpr;
2861 unsigned param_size;
2862 LLVMValueRef arg = NULL;
2863
2864 param = LLVMGetParam(parts[part], param_idx);
2865 param_type = LLVMTypeOf(param);
2866 param_size = ac_get_type_size(param_type) / 4;
2867 is_sgpr = ac_is_sgpr_param(param);
2868
2869 if (is_sgpr) {
2870 ac_add_function_attr(ctx->ac.context, parts[part],
2871 param_idx + 1, AC_FUNC_ATTR_INREG);
2872 } else if (out_idx < num_out_sgpr) {
2873 /* Skip returned SGPRs the current part doesn't
2874 * declare on the input. */
2875 out_idx = num_out_sgpr;
2876 }
2877
2878 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
2879
2880 if (param_size == 1)
2881 arg = out[out_idx];
2882 else
2883 arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
2884
2885 if (LLVMTypeOf(arg) != param_type) {
2886 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
2887 if (LLVMGetPointerAddressSpace(param_type) ==
2888 AC_ADDR_SPACE_CONST_32BIT) {
2889 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
2890 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
2891 } else {
2892 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
2893 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
2894 }
2895 } else {
2896 arg = LLVMBuildBitCast(builder, arg, param_type, "");
2897 }
2898 }
2899
2900 in[param_idx] = arg;
2901 out_idx += param_size;
2902 }
2903
2904 ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
2905
2906 if (is_multi_part_shader(ctx) &&
2907 part + 1 == next_shader_first_part) {
2908 ac_build_endif(&ctx->ac, 6506);
2909
2910 /* The second half of the merged shader should use
2911 * the inputs from the toplevel (wrapper) function,
2912 * not the return value from the last call.
2913 *
2914 * That's because the last call was executed condi-
2915 * tionally, so we can't consume it in the main
2916 * block.
2917 */
2918 memcpy(out, initial, sizeof(initial));
2919 num_out = initial_num_out;
2920 num_out_sgpr = initial_num_out_sgpr;
2921 continue;
2922 }
2923
2924 /* Extract the returned GPRs. */
2925 ret_type = LLVMTypeOf(ret);
2926 num_out = 0;
2927 num_out_sgpr = 0;
2928
2929 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
2930 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
2931
2932 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
2933
2934 for (unsigned i = 0; i < ret_size; ++i) {
2935 LLVMValueRef val =
2936 LLVMBuildExtractValue(builder, ret, i, "");
2937
2938 assert(num_out < ARRAY_SIZE(out));
2939 out[num_out++] = val;
2940
2941 if (LLVMTypeOf(val) == ctx->i32) {
2942 assert(num_out_sgpr + 1 == num_out);
2943 num_out_sgpr = num_out;
2944 }
2945 }
2946 }
2947 }
2948
2949 /* Return the value from the last part. */
2950 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
2951 LLVMBuildRetVoid(builder);
2952 else
2953 LLVMBuildRet(builder, ret);
2954 }
2955
2956 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
2957 struct si_shader_selector *sel)
2958 {
2959 if (!compiler->low_opt_passes)
2960 return false;
2961
2962 /* Assume a slow CPU. */
2963 assert(!sel->screen->info.has_dedicated_vram &&
2964 sel->screen->info.chip_class <= GFX8);
2965
2966 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
2967 * buffer stores. */
2968 return sel->type == PIPE_SHADER_COMPUTE &&
2969 sel->info.num_memory_instructions > 1000;
2970 }
2971
2972 static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
2973 bool *free_nir)
2974 {
2975 *free_nir = false;
2976
2977 if (sel->nir) {
2978 return sel->nir;
2979 } else if (sel->nir_binary) {
2980 struct pipe_screen *screen = &sel->screen->b;
2981 const void *options =
2982 screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
2983 sel->type);
2984
2985 struct blob_reader blob_reader;
2986 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
2987 *free_nir = true;
2988 return nir_deserialize(NULL, options, &blob_reader);
2989 }
2990 return NULL;
2991 }
2992
/**
 * Compile a shader to machine code: build the LLVM IR for the main part
 * (plus, for monolithic shaders, prologs/epilogs and any merged previous
 * stage), optimize it, and run the LLVM backend.
 *
 * \param sscreen   screen
 * \param compiler  LLVM compiler context
 * \param shader    shader to compile; results land in shader->binary,
 *                  shader->config and shader->info
 * \param debug     debug callback
 * \return 0 on success, negative on failure
 */
int si_compile_shader(struct si_screen *sscreen,
		      struct ac_llvm_compiler *compiler,
		      struct si_shader *shader,
		      struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	bool free_nir;
	struct nir_shader *nir = get_nir_shader(sel, &free_nir);
	int r = -1;

	/* Dump NIR before doing NIR->LLVM conversion in case the
	 * conversion fails. */
	if (si_can_dump_shader(sscreen, sel->type) &&
	    !(sscreen->debug_flags & DBG(NO_NIR))) {
		nir_print_shader(nir, stderr);
		si_dump_streamout(&sel->so);
	}

	si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader));
	si_llvm_context_set_ir(&ctx, shader);

	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(shader->info.vs_output_param_offset));

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	if (!si_build_main_function(&ctx, nir, free_nir)) {
		si_llvm_dispose(&ctx);
		return -1;
	}

	if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
		/* Monolithic VS: optionally prepend the VS prolog. */
		LLVMValueRef parts[2];
		bool need_prolog = si_vs_needs_prolog(sel, &shader->key.part.vs.prolog);

		parts[1] = ctx.main_fn;

		if (need_prolog) {
			union si_shader_part_key prolog_key;
			si_get_vs_prolog_key(&sel->info,
					     shader->info.num_input_sgprs,
					     &shader->key.part.vs.prolog,
					     shader, &prolog_key);
			prolog_key.vs_prolog.is_monolithic = true;
			si_build_vs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_build_wrapper_function(&ctx, parts + !need_prolog,
					  1 + need_prolog, need_prolog, 0);

		if (ctx.shader->key.opt.vs_as_prim_discard_cs)
			si_build_prim_discard_compute_shader(&ctx);
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
		if (sscreen->info.chip_class >= GFX9) {
			/* GFX9 merges LS+HS; assemble:
			 * [VS prolog] + VS-as-LS main + TCS main + TCS epilog. */
			struct si_shader_selector *ls = shader->key.part.tcs.ls;
			LLVMValueRef parts[4];
			bool vs_needs_prolog =
				si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);

			/* TCS main part */
			parts[2] = ctx.main_fn;

			/* TCS epilog */
			union si_shader_part_key tcs_epilog_key;
			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key);
			parts[3] = ctx.main_fn;

			/* VS as LS main part */
			nir = get_nir_shader(ls, &free_nir);
			struct si_shader shader_ls = {};
			shader_ls.selector = ls;
			shader_ls.key.as_ls = 1;
			shader_ls.key.mono = shader->key.mono;
			shader_ls.key.opt = shader->key.opt;
			shader_ls.is_monolithic = true;
			si_llvm_context_set_ir(&ctx, &shader_ls);

			if (!si_build_main_function(&ctx, nir, free_nir)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= ls->info.uses_instanceid;
			parts[1] = ctx.main_fn;

			/* LS prolog */
			if (vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&ls->info,
						     shader_ls.info.num_input_sgprs,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				parts[0] = ctx.main_fn;
			}

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_TESS_CTRL;

			si_build_wrapper_function(&ctx,
						  parts + !vs_needs_prolog,
						  4 - !vs_needs_prolog, vs_needs_prolog,
						  vs_needs_prolog ? 2 : 1);
		} else {
			/* Pre-GFX9: only TCS main + TCS epilog. */
			LLVMValueRef parts[2];
			union si_shader_part_key epilog_key;

			parts[0] = ctx.main_fn;

			memset(&epilog_key, 0, sizeof(epilog_key));
			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_llvm_build_tcs_epilog(&ctx, &epilog_key);
			parts[1] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
		}
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
		if (ctx.screen->info.chip_class >= GFX9) {
			/* GFX9 merges ES+GS; assemble:
			 * [ES prolog] + ES main + GS prolog + GS main. */
			struct si_shader_selector *es = shader->key.part.gs.es;
			LLVMValueRef es_prolog = NULL;
			LLVMValueRef es_main = NULL;
			LLVMValueRef gs_prolog = NULL;
			LLVMValueRef gs_main = ctx.main_fn;

			/* GS prolog */
			union si_shader_part_key gs_prolog_key;
			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			gs_prolog_key.gs_prolog.is_monolithic = true;
			gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
			si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
			gs_prolog = ctx.main_fn;

			/* ES main part */
			nir = get_nir_shader(es, &free_nir);
			struct si_shader shader_es = {};
			shader_es.selector = es;
			shader_es.key.as_es = 1;
			shader_es.key.as_ngg = shader->key.as_ngg;
			shader_es.key.mono = shader->key.mono;
			shader_es.key.opt = shader->key.opt;
			shader_es.is_monolithic = true;
			si_llvm_context_set_ir(&ctx, &shader_es);

			if (!si_build_main_function(&ctx, nir, free_nir)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= es->info.uses_instanceid;
			es_main = ctx.main_fn;

			/* ES prolog */
			if (es->type == PIPE_SHADER_VERTEX &&
			    si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog)) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&es->info,
						     shader_es.info.num_input_sgprs,
						     &shader->key.part.gs.vs_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				es_prolog = ctx.main_fn;
			}

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_GEOMETRY;

			/* Prepare the array of shader parts. */
			LLVMValueRef parts[4];
			unsigned num_parts = 0, main_part, next_first_part;

			if (es_prolog)
				parts[num_parts++] = es_prolog;

			parts[main_part = num_parts++] = es_main;
			parts[next_first_part = num_parts++] = gs_prolog;
			parts[num_parts++] = gs_main;

			si_build_wrapper_function(&ctx, parts, num_parts,
						  main_part, next_first_part);
		} else {
			/* Pre-GFX9: only GS prolog + GS main. */
			LLVMValueRef parts[2];
			union si_shader_part_key prolog_key;

			parts[1] = ctx.main_fn;

			memset(&prolog_key, 0, sizeof(prolog_key));
			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			si_llvm_build_gs_prolog(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
		}
	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
		si_llvm_build_monolithic_ps(&ctx, shader);
	}

	si_llvm_optimize_module(&ctx);

	/* Post-optimization transformations and analysis. */
	si_optimize_vs_outputs(&ctx);

	if ((debug && debug->debug_message) ||
	    si_can_dump_shader(sscreen, ctx.type)) {
		ctx.shader->info.private_mem_vgprs =
			ac_count_scratch_private_memory(ctx.main_fn);
	}

	/* Make sure the input is a pointer and not integer followed by inttoptr. */
	assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
	       LLVMPointerTypeKind);

	/* Compile to bytecode. */
	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
			    ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
			    si_get_shader_name(shader),
			    si_should_optimize_less(compiler, shader->selector));
	si_llvm_dispose(&ctx);
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		return r;
	}

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned wave_size = sscreen->compute_wave_size;
		unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd *
				     (wave_size == 32 ? 2 : 1);
		unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
		unsigned max_sgprs_per_wave = 128;
		unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
		unsigned threads_per_tg = si_get_max_workgroup_size(shader);
		unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
		unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

		max_vgprs = max_vgprs / waves_per_simd;
		max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(&ctx))
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config,
							&shader->info.face_vgpr_index,
							&shader->info.ancillary_vgpr_index);
	}

	si_calculate_max_simd_waves(shader);
	si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
	return 0;
}
3270
3271 /**
3272 * Create, compile and return a shader part (prolog or epilog).
3273 *
3274 * \param sscreen screen
3275 * \param list list of shader parts of the same category
3276 * \param type shader type
3277 * \param key shader part key
3278 * \param prolog whether the part being requested is a prolog
3279 * \param tm LLVM target machine
3280 * \param debug debug callback
3281 * \param build the callback responsible for building the main function
3282 * \return non-NULL on success
3283 */
3284 static struct si_shader_part *
3285 si_get_shader_part(struct si_screen *sscreen,
3286 struct si_shader_part **list,
3287 enum pipe_shader_type type,
3288 bool prolog,
3289 union si_shader_part_key *key,
3290 struct ac_llvm_compiler *compiler,
3291 struct pipe_debug_callback *debug,
3292 void (*build)(struct si_shader_context *,
3293 union si_shader_part_key *),
3294 const char *name)
3295 {
3296 struct si_shader_part *result;
3297
3298 simple_mtx_lock(&sscreen->shader_parts_mutex);
3299
3300 /* Find existing. */
3301 for (result = *list; result; result = result->next) {
3302 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3303 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3304 return result;
3305 }
3306 }
3307
3308 /* Compile a new one. */
3309 result = CALLOC_STRUCT(si_shader_part);
3310 result->key = *key;
3311
3312 struct si_shader shader = {};
3313
3314 switch (type) {
3315 case PIPE_SHADER_VERTEX:
3316 shader.key.as_ls = key->vs_prolog.as_ls;
3317 shader.key.as_es = key->vs_prolog.as_es;
3318 shader.key.as_ngg = key->vs_prolog.as_ngg;
3319 break;
3320 case PIPE_SHADER_TESS_CTRL:
3321 assert(!prolog);
3322 shader.key.part.tcs.epilog = key->tcs_epilog.states;
3323 break;
3324 case PIPE_SHADER_GEOMETRY:
3325 assert(prolog);
3326 shader.key.as_ngg = key->gs_prolog.as_ngg;
3327 break;
3328 case PIPE_SHADER_FRAGMENT:
3329 if (prolog)
3330 shader.key.part.ps.prolog = key->ps_prolog.states;
3331 else
3332 shader.key.part.ps.epilog = key->ps_epilog.states;
3333 break;
3334 default:
3335 unreachable("bad shader part");
3336 }
3337
3338 struct si_shader_context ctx;
3339 si_llvm_context_init(&ctx, sscreen, compiler,
3340 si_get_wave_size(sscreen, type, shader.key.as_ngg,
3341 shader.key.as_es));
3342 ctx.shader = &shader;
3343 ctx.type = type;
3344
3345 build(&ctx, key);
3346
3347 /* Compile. */
3348 si_llvm_optimize_module(&ctx);
3349
3350 if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
3351 ctx.ac.module, debug, ctx.type, ctx.ac.wave_size,
3352 name, false)) {
3353 FREE(result);
3354 result = NULL;
3355 goto out;
3356 }
3357
3358 result->next = *list;
3359 *list = result;
3360
3361 out:
3362 si_llvm_dispose(&ctx);
3363 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3364 return result;
3365 }
3366
3367 /**
3368 * Build the vertex shader prolog function.
3369 *
3370 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
3371 * All inputs are returned unmodified. The vertex load indices are
3372 * stored after them, which will be used by the API VS for fetching inputs.
3373 *
3374 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
3375 * input_v0,
3376 * input_v1,
3377 * input_v2,
3378 * input_v3,
3379 * (VertexID + BaseVertex),
3380 * (InstanceID + StartInstance),
3381 * (InstanceID / 2 + StartInstance)
3382 */
3383 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
3384 union si_shader_part_key *key)
3385 {
3386 LLVMTypeRef *returns;
3387 LLVMValueRef ret, func;
3388 int num_returns, i;
3389 unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
3390 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
3391 struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
3392 struct ac_arg input_vgpr_param[9];
3393 LLVMValueRef input_vgprs[9];
3394 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
3395 num_input_vgprs;
3396 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
3397
3398 memset(&ctx->args, 0, sizeof(ctx->args));
3399
3400 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
3401 returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
3402 sizeof(LLVMTypeRef));
3403 num_returns = 0;
3404
3405 /* Declare input and output SGPRs. */
3406 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
3407 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
3408 &input_sgpr_param[i]);
3409 returns[num_returns++] = ctx->i32;
3410 }
3411
3412 struct ac_arg merged_wave_info = input_sgpr_param[3];
3413
3414 /* Preloaded VGPRs (outputs must be floats) */
3415 for (i = 0; i < num_input_vgprs; i++) {
3416 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
3417 returns[num_returns++] = ctx->f32;
3418 }
3419
3420 /* Vertex load indices. */
3421 for (i = 0; i < key->vs_prolog.num_inputs; i++)
3422 returns[num_returns++] = ctx->f32;
3423
3424 /* Create the function. */
3425 si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
3426 func = ctx->main_fn;
3427
3428 for (i = 0; i < num_input_vgprs; i++) {
3429 input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
3430 }
3431
3432 if (key->vs_prolog.num_merged_next_stage_vgprs) {
3433 if (!key->vs_prolog.is_monolithic)
3434 si_init_exec_from_input(ctx, merged_wave_info, 0);
3435
3436 if (key->vs_prolog.as_ls &&
3437 ctx->screen->info.has_ls_vgpr_init_bug) {
3438 /* If there are no HS threads, SPI loads the LS VGPRs
3439 * starting at VGPR 0. Shift them back to where they
3440 * belong.
3441 */
3442 LLVMValueRef has_hs_threads =
3443 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
3444 si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
3445 ctx->i32_0, "");
3446
3447 for (i = 4; i > 0; --i) {
3448 input_vgprs[i + 1] =
3449 LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
3450 input_vgprs[i + 1],
3451 input_vgprs[i - 1], "");
3452 }
3453 }
3454 }
3455
3456 unsigned vertex_id_vgpr = first_vs_vgpr;
3457 unsigned instance_id_vgpr =
3458 ctx->screen->info.chip_class >= GFX10 ?
3459 first_vs_vgpr + 3 :
3460 first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
3461
3462 ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
3463 ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
3464
3465 /* InstanceID = VertexID >> 16;
3466 * VertexID = VertexID & 0xffff;
3467 */
3468 if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
3469 ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
3470 LLVMConstInt(ctx->i32, 16, 0), "");
3471 ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
3472 LLVMConstInt(ctx->i32, 0xffff, 0), "");
3473 }
3474
3475 /* Copy inputs to outputs. This should be no-op, as the registers match,
3476 * but it will prevent the compiler from overwriting them unintentionally.
3477 */
3478 ret = ctx->return_value;
3479 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
3480 LLVMValueRef p = LLVMGetParam(func, i);
3481 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
3482 }
3483 for (i = 0; i < num_input_vgprs; i++) {
3484 LLVMValueRef p = input_vgprs[i];
3485
3486 if (i == vertex_id_vgpr)
3487 p = ctx->abi.vertex_id;
3488 else if (i == instance_id_vgpr)
3489 p = ctx->abi.instance_id;
3490
3491 p = ac_to_float(&ctx->ac, p);
3492 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
3493 key->vs_prolog.num_input_sgprs + i, "");
3494 }
3495
3496 /* Compute vertex load indices from instance divisors. */
3497 LLVMValueRef instance_divisor_constbuf = NULL;
3498
3499 if (key->vs_prolog.states.instance_divisor_is_fetched) {
3500 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
3501 LLVMValueRef buf_index =
3502 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
3503 instance_divisor_constbuf =
3504 ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
3505 }
3506
3507 for (i = 0; i < key->vs_prolog.num_inputs; i++) {
3508 bool divisor_is_one =
3509 key->vs_prolog.states.instance_divisor_is_one & (1u << i);
3510 bool divisor_is_fetched =
3511 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
3512 LLVMValueRef index = NULL;
3513
3514 if (divisor_is_one) {
3515 index = ctx->abi.instance_id;
3516 } else if (divisor_is_fetched) {
3517 LLVMValueRef udiv_factors[4];
3518
3519 for (unsigned j = 0; j < 4; j++) {
3520 udiv_factors[j] =
3521 si_buffer_load_const(ctx, instance_divisor_constbuf,
3522 LLVMConstInt(ctx->i32, i*16 + j*4, 0));
3523 udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
3524 }
3525 /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
3526 * Such InstanceID might not be achievable in a reasonable time though.
3527 */
3528 index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
3529 udiv_factors[0], udiv_factors[1],
3530 udiv_factors[2], udiv_factors[3]);
3531 }
3532
3533 if (divisor_is_one || divisor_is_fetched) {
3534 /* Add StartInstance. */
3535 index = LLVMBuildAdd(ctx->ac.builder, index,
3536 LLVMGetParam(ctx->main_fn, user_sgpr_base +
3537 SI_SGPR_START_INSTANCE), "");
3538 } else {
3539 /* VertexID + BaseVertex */
3540 index = LLVMBuildAdd(ctx->ac.builder,
3541 ctx->abi.vertex_id,
3542 LLVMGetParam(func, user_sgpr_base +
3543 SI_SGPR_BASE_VERTEX), "");
3544 }
3545
3546 index = ac_to_float(&ctx->ac, index);
3547 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
3548 ctx->args.arg_count + i, "");
3549 }
3550
3551 si_llvm_build_ret(ctx, ret);
3552 }
3553
3554 static bool si_get_vs_prolog(struct si_screen *sscreen,
3555 struct ac_llvm_compiler *compiler,
3556 struct si_shader *shader,
3557 struct pipe_debug_callback *debug,
3558 struct si_shader *main_part,
3559 const struct si_vs_prolog_bits *key)
3560 {
3561 struct si_shader_selector *vs = main_part->selector;
3562
3563 if (!si_vs_needs_prolog(vs, key))
3564 return true;
3565
3566 /* Get the prolog. */
3567 union si_shader_part_key prolog_key;
3568 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
3569 key, shader, &prolog_key);
3570
3571 shader->prolog =
3572 si_get_shader_part(sscreen, &sscreen->vs_prologs,
3573 PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
3574 debug, si_build_vs_prolog_function,
3575 "Vertex Shader Prolog");
3576 return shader->prolog != NULL;
3577 }
3578
3579 /**
3580 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
3581 */
3582 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
3583 struct ac_llvm_compiler *compiler,
3584 struct si_shader *shader,
3585 struct pipe_debug_callback *debug)
3586 {
3587 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
3588 &shader->key.part.vs.prolog);
3589 }
3590
3591 /**
3592 * Select and compile (or reuse) TCS parts (epilog).
3593 */
3594 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
3595 struct ac_llvm_compiler *compiler,
3596 struct si_shader *shader,
3597 struct pipe_debug_callback *debug)
3598 {
3599 if (sscreen->info.chip_class >= GFX9) {
3600 struct si_shader *ls_main_part =
3601 shader->key.part.tcs.ls->main_shader_part_ls;
3602
3603 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
3604 &shader->key.part.tcs.ls_prolog))
3605 return false;
3606
3607 shader->previous_stage = ls_main_part;
3608 }
3609
3610 /* Get the epilog. */
3611 union si_shader_part_key epilog_key;
3612 memset(&epilog_key, 0, sizeof(epilog_key));
3613 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
3614
3615 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
3616 PIPE_SHADER_TESS_CTRL, false,
3617 &epilog_key, compiler, debug,
3618 si_llvm_build_tcs_epilog,
3619 "Tessellation Control Shader Epilog");
3620 return shader->epilog != NULL;
3621 }
3622
3623 /**
3624 * Select and compile (or reuse) GS parts (prolog).
3625 */
3626 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
3627 struct ac_llvm_compiler *compiler,
3628 struct si_shader *shader,
3629 struct pipe_debug_callback *debug)
3630 {
3631 if (sscreen->info.chip_class >= GFX9) {
3632 struct si_shader *es_main_part;
3633 enum pipe_shader_type es_type = shader->key.part.gs.es->type;
3634
3635 if (shader->key.as_ngg)
3636 es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
3637 else
3638 es_main_part = shader->key.part.gs.es->main_shader_part_es;
3639
3640 if (es_type == PIPE_SHADER_VERTEX &&
3641 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
3642 &shader->key.part.gs.vs_prolog))
3643 return false;
3644
3645 shader->previous_stage = es_main_part;
3646 }
3647
3648 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
3649 return true;
3650
3651 union si_shader_part_key prolog_key;
3652 memset(&prolog_key, 0, sizeof(prolog_key));
3653 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
3654 prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
3655
3656 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
3657 PIPE_SHADER_GEOMETRY, true,
3658 &prolog_key, compiler, debug,
3659 si_llvm_build_gs_prolog,
3660 "Geometry Shader Prolog");
3661 return shader->prolog2 != NULL;
3662 }
3663
3664 /**
3665 * Compute the PS prolog key, which contains all the information needed to
3666 * build the PS prolog function, and set related bits in shader->config.
3667 */
3668 void si_get_ps_prolog_key(struct si_shader *shader,
3669 union si_shader_part_key *key,
3670 bool separate_prolog)
3671 {
3672 struct si_shader_info *info = &shader->selector->info;
3673
3674 memset(key, 0, sizeof(*key));
3675 key->ps_prolog.states = shader->key.part.ps.prolog;
3676 key->ps_prolog.colors_read = info->colors_read;
3677 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
3678 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
3679 key->ps_prolog.wqm = info->uses_derivatives &&
3680 (key->ps_prolog.colors_read ||
3681 key->ps_prolog.states.force_persp_sample_interp ||
3682 key->ps_prolog.states.force_linear_sample_interp ||
3683 key->ps_prolog.states.force_persp_center_interp ||
3684 key->ps_prolog.states.force_linear_center_interp ||
3685 key->ps_prolog.states.bc_optimize_for_persp ||
3686 key->ps_prolog.states.bc_optimize_for_linear);
3687 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
3688
3689 if (info->colors_read) {
3690 unsigned *color = shader->selector->color_attr_index;
3691
3692 if (shader->key.part.ps.prolog.color_two_side) {
3693 /* BCOLORs are stored after the last input. */
3694 key->ps_prolog.num_interp_inputs = info->num_inputs;
3695 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
3696 if (separate_prolog)
3697 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
3698 }
3699
3700 for (unsigned i = 0; i < 2; i++) {
3701 unsigned interp = info->input_interpolate[color[i]];
3702 unsigned location = info->input_interpolate_loc[color[i]];
3703
3704 if (!(info->colors_read & (0xf << i*4)))
3705 continue;
3706
3707 key->ps_prolog.color_attr_index[i] = color[i];
3708
3709 if (shader->key.part.ps.prolog.flatshade_colors &&
3710 interp == TGSI_INTERPOLATE_COLOR)
3711 interp = TGSI_INTERPOLATE_CONSTANT;
3712
3713 switch (interp) {
3714 case TGSI_INTERPOLATE_CONSTANT:
3715 key->ps_prolog.color_interp_vgpr_index[i] = -1;
3716 break;
3717 case TGSI_INTERPOLATE_PERSPECTIVE:
3718 case TGSI_INTERPOLATE_COLOR:
3719 /* Force the interpolation location for colors here. */
3720 if (shader->key.part.ps.prolog.force_persp_sample_interp)
3721 location = TGSI_INTERPOLATE_LOC_SAMPLE;
3722 if (shader->key.part.ps.prolog.force_persp_center_interp)
3723 location = TGSI_INTERPOLATE_LOC_CENTER;
3724
3725 switch (location) {
3726 case TGSI_INTERPOLATE_LOC_SAMPLE:
3727 key->ps_prolog.color_interp_vgpr_index[i] = 0;
3728 if (separate_prolog) {
3729 shader->config.spi_ps_input_ena |=
3730 S_0286CC_PERSP_SAMPLE_ENA(1);
3731 }
3732 break;
3733 case TGSI_INTERPOLATE_LOC_CENTER:
3734 key->ps_prolog.color_interp_vgpr_index[i] = 2;
3735 if (separate_prolog) {
3736 shader->config.spi_ps_input_ena |=
3737 S_0286CC_PERSP_CENTER_ENA(1);
3738 }
3739 break;
3740 case TGSI_INTERPOLATE_LOC_CENTROID:
3741 key->ps_prolog.color_interp_vgpr_index[i] = 4;
3742 if (separate_prolog) {
3743 shader->config.spi_ps_input_ena |=
3744 S_0286CC_PERSP_CENTROID_ENA(1);
3745 }
3746 break;
3747 default:
3748 assert(0);
3749 }
3750 break;
3751 case TGSI_INTERPOLATE_LINEAR:
3752 /* Force the interpolation location for colors here. */
3753 if (shader->key.part.ps.prolog.force_linear_sample_interp)
3754 location = TGSI_INTERPOLATE_LOC_SAMPLE;
3755 if (shader->key.part.ps.prolog.force_linear_center_interp)
3756 location = TGSI_INTERPOLATE_LOC_CENTER;
3757
3758 /* The VGPR assignment for non-monolithic shaders
3759 * works because InitialPSInputAddr is set on the
3760 * main shader and PERSP_PULL_MODEL is never used.
3761 */
3762 switch (location) {
3763 case TGSI_INTERPOLATE_LOC_SAMPLE:
3764 key->ps_prolog.color_interp_vgpr_index[i] =
3765 separate_prolog ? 6 : 9;
3766 if (separate_prolog) {
3767 shader->config.spi_ps_input_ena |=
3768 S_0286CC_LINEAR_SAMPLE_ENA(1);
3769 }
3770 break;
3771 case TGSI_INTERPOLATE_LOC_CENTER:
3772 key->ps_prolog.color_interp_vgpr_index[i] =
3773 separate_prolog ? 8 : 11;
3774 if (separate_prolog) {
3775 shader->config.spi_ps_input_ena |=
3776 S_0286CC_LINEAR_CENTER_ENA(1);
3777 }
3778 break;
3779 case TGSI_INTERPOLATE_LOC_CENTROID:
3780 key->ps_prolog.color_interp_vgpr_index[i] =
3781 separate_prolog ? 10 : 13;
3782 if (separate_prolog) {
3783 shader->config.spi_ps_input_ena |=
3784 S_0286CC_LINEAR_CENTROID_ENA(1);
3785 }
3786 break;
3787 default:
3788 assert(0);
3789 }
3790 break;
3791 default:
3792 assert(0);
3793 }
3794 }
3795 }
3796 }
3797
3798 /**
3799 * Check whether a PS prolog is required based on the key.
3800 */
3801 bool si_need_ps_prolog(const union si_shader_part_key *key)
3802 {
3803 return key->ps_prolog.colors_read ||
3804 key->ps_prolog.states.force_persp_sample_interp ||
3805 key->ps_prolog.states.force_linear_sample_interp ||
3806 key->ps_prolog.states.force_persp_center_interp ||
3807 key->ps_prolog.states.force_linear_center_interp ||
3808 key->ps_prolog.states.bc_optimize_for_persp ||
3809 key->ps_prolog.states.bc_optimize_for_linear ||
3810 key->ps_prolog.states.poly_stipple ||
3811 key->ps_prolog.states.samplemask_log_ps_iter;
3812 }
3813
3814 /**
3815 * Compute the PS epilog key, which contains all the information needed to
3816 * build the PS epilog function.
3817 */
3818 void si_get_ps_epilog_key(struct si_shader *shader,
3819 union si_shader_part_key *key)
3820 {
3821 struct si_shader_info *info = &shader->selector->info;
3822 memset(key, 0, sizeof(*key));
3823 key->ps_epilog.colors_written = info->colors_written;
3824 key->ps_epilog.writes_z = info->writes_z;
3825 key->ps_epilog.writes_stencil = info->writes_stencil;
3826 key->ps_epilog.writes_samplemask = info->writes_samplemask;
3827 key->ps_epilog.states = shader->key.part.ps.epilog;
3828 }
3829
3830 /**
3831 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
3832 */
3833 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
3834 struct ac_llvm_compiler *compiler,
3835 struct si_shader *shader,
3836 struct pipe_debug_callback *debug)
3837 {
3838 union si_shader_part_key prolog_key;
3839 union si_shader_part_key epilog_key;
3840
3841 /* Get the prolog. */
3842 si_get_ps_prolog_key(shader, &prolog_key, true);
3843
3844 /* The prolog is a no-op if these aren't set. */
3845 if (si_need_ps_prolog(&prolog_key)) {
3846 shader->prolog =
3847 si_get_shader_part(sscreen, &sscreen->ps_prologs,
3848 PIPE_SHADER_FRAGMENT, true,
3849 &prolog_key, compiler, debug,
3850 si_llvm_build_ps_prolog,
3851 "Fragment Shader Prolog");
3852 if (!shader->prolog)
3853 return false;
3854 }
3855
3856 /* Get the epilog. */
3857 si_get_ps_epilog_key(shader, &epilog_key);
3858
3859 shader->epilog =
3860 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
3861 PIPE_SHADER_FRAGMENT, false,
3862 &epilog_key, compiler, debug,
3863 si_llvm_build_ps_epilog,
3864 "Fragment Shader Epilog");
3865 if (!shader->epilog)
3866 return false;
3867
3868 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
3869 if (shader->key.part.ps.prolog.poly_stipple) {
3870 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
3871 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
3872 }
3873
3874 /* Set up the enable bits for per-sample shading if needed. */
3875 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
3876 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
3877 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
3878 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
3879 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
3880 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
3881 }
3882 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
3883 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
3884 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
3885 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
3886 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
3887 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
3888 }
3889 if (shader->key.part.ps.prolog.force_persp_center_interp &&
3890 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
3891 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
3892 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
3893 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
3894 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
3895 }
3896 if (shader->key.part.ps.prolog.force_linear_center_interp &&
3897 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
3898 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
3899 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
3900 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
3901 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
3902 }
3903
3904 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
3905 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
3906 !(shader->config.spi_ps_input_ena & 0xf)) {
3907 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
3908 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
3909 }
3910
3911 /* At least one pair of interpolation weights must be enabled. */
3912 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
3913 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
3914 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
3915 }
3916
3917 /* Samplemask fixup requires the sample ID. */
3918 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
3919 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
3920 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
3921 }
3922
3923 /* The sample mask input is always enabled, because the API shader always
3924 * passes it through to the epilog. Disable it here if it's unused.
3925 */
3926 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
3927 !shader->selector->info.reads_samplemask)
3928 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
3929
3930 return true;
3931 }
3932
/* Workaround for an SPI barrier management hardware bug on Bonaire/Kabini.
 * NOTE: the unconditional return below deliberately disables the workaround;
 * the code after it is intentionally kept (unreachable) for reference.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
				      unsigned *lds_size)
{
	/* If tessellation is all offchip and on-chip GS isn't used, this
	 * workaround is not needed.
	 */
	return;

	/* SPI barrier management bug:
	 * Make sure we have at least 4k of LDS in use to avoid the bug.
	 * It applies to workgroup sizes of more than one wavefront.
	 */
	if (sscreen->info.family == CHIP_BONAIRE ||
	    sscreen->info.family == CHIP_KABINI)
		*lds_size = MAX2(*lds_size, 8);
}
3949
3950 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3951 {
3952 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3953
3954 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3955
3956 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
3957 si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
3958 si_multiwave_lds_size_workaround(sscreen,
3959 &shader->config.lds_size);
3960 }
3961 }
3962
/* Build a complete shader variant: either compile it monolithically, or
 * assemble it from the precompiled main part plus selected prolog/epilog
 * parts, then compute derived info, dump it, and upload the binary.
 * Returns false on any compilation, selection, or upload failure.
 */
bool si_create_shader_variant(struct si_screen *sscreen,
			      struct ac_llvm_compiler *compiler,
			      struct si_shader *shader,
			      struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * GS are compiled on demand if the main part hasn't been compiled
	 * for the chosen NGG-ness.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_shader(sscreen, compiler, shader, debug);
		if (r)
			return false;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return false;

		/* Copy the compiled shader data over. The binary is shared
		 * with the main part, not duplicated. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. TES has no separate parts. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_TESS_EVAL:
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
				return false;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
				return false;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		default:;
		}

		/* Update SGPR and VGPR counts: the combined shader needs the
		 * maximum across all of its parts. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->info.private_mem_vgprs =
				MAX2(shader->info.private_mem_vgprs,
				     shader->previous_stage->info.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
		si_calculate_max_simd_waves(shader);
	}

	/* Compute subgroup/GS grouping info for merged geometry paths. */
	if (shader->key.as_ngg) {
		assert(!shader->key.as_es && !shader->key.as_ls);
		gfx10_ngg_calculate_subgroup_info(shader);
	} else if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) {
		gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, stderr, true);

	/* Upload. */
	if (!si_shader_binary_upload(sscreen, shader, 0)) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return false;
	}

	return true;
}
4111
/* Release all resources owned by a shader variant.
 * The binary is only cleaned when it isn't shared with the main part
 * (see is_binary_shared in si_create_shader_variant).
 */
void si_shader_destroy(struct si_shader *shader)
{
	if (shader->scratch_bo)
		si_resource_reference(&shader->scratch_bo, NULL);

	si_resource_reference(&shader->bo, NULL);

	/* Shared binaries are owned by the main shader part. */
	if (!shader->is_binary_shared)
		si_shader_binary_clean(&shader->binary);

	free(shader->shader_log);
}