2 * Copyright 2012 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include <llvm/Config/llvm-config.h>
27 #include "util/u_memory.h"
28 #include "tgsi/tgsi_strings.h"
29 #include "tgsi/tgsi_from_mesa.h"
31 #include "ac_exp_param.h"
32 #include "ac_shader_util.h"
34 #include "ac_llvm_util.h"
35 #include "si_shader_internal.h"
39 #include "compiler/nir/nir.h"
40 #include "compiler/nir/nir_serialize.h"
/* Symbol names for the two dwords of the scratch buffer resource descriptor;
 * they are resolved/patched when the shader binary is uploaded. */
static const char scratch_rsrc_dword0_symbol[] =
	"SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] =
	"SCRATCH_RSRC_DWORD1";
/* Forward declarations of functions defined later in this file. */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_fix_resource_usage(struct si_screen *sscreen,
				  struct si_shader *shader);
55 /** Whether the shader runs as a combination of multiple API shaders */
56 static bool is_multi_part_shader(struct si_shader_context
*ctx
)
58 if (ctx
->screen
->info
.chip_class
<= GFX8
)
61 return ctx
->shader
->key
.as_ls
||
62 ctx
->shader
->key
.as_es
||
63 ctx
->type
== PIPE_SHADER_TESS_CTRL
||
64 ctx
->type
== PIPE_SHADER_GEOMETRY
;
67 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
68 bool si_is_merged_shader(struct si_shader_context
*ctx
)
70 return ctx
->shader
->key
.as_ngg
|| is_multi_part_shader(ctx
);
74 * Returns a unique index for a per-patch semantic name and index. The index
75 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
78 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name
, unsigned index
)
80 switch (semantic_name
) {
81 case TGSI_SEMANTIC_TESSOUTER
:
83 case TGSI_SEMANTIC_TESSINNER
:
85 case TGSI_SEMANTIC_PATCH
:
90 assert(!"invalid semantic name");
96 * Returns a unique index for a semantic name and index. The index must be
97 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
100 unsigned si_shader_io_get_unique_index(unsigned semantic_name
, unsigned index
,
103 switch (semantic_name
) {
104 case TGSI_SEMANTIC_POSITION
:
106 case TGSI_SEMANTIC_GENERIC
:
107 /* Since some shader stages use the the highest used IO index
108 * to determine the size to allocate for inputs/outputs
109 * (in LDS, tess and GS rings). GENERIC should be placed right
110 * after POSITION to make that size as small as possible.
112 if (index
< SI_MAX_IO_GENERIC
)
115 assert(!"invalid generic index");
117 case TGSI_SEMANTIC_FOG
:
118 return SI_MAX_IO_GENERIC
+ 1;
119 case TGSI_SEMANTIC_COLOR
:
121 return SI_MAX_IO_GENERIC
+ 2 + index
;
122 case TGSI_SEMANTIC_BCOLOR
:
124 /* If it's a varying, COLOR and BCOLOR alias. */
126 return SI_MAX_IO_GENERIC
+ 2 + index
;
128 return SI_MAX_IO_GENERIC
+ 4 + index
;
129 case TGSI_SEMANTIC_TEXCOORD
:
131 return SI_MAX_IO_GENERIC
+ 6 + index
;
133 /* These are rarely used between LS and HS or ES and GS. */
134 case TGSI_SEMANTIC_CLIPDIST
:
136 return SI_MAX_IO_GENERIC
+ 6 + 8 + index
;
137 case TGSI_SEMANTIC_CLIPVERTEX
:
138 return SI_MAX_IO_GENERIC
+ 6 + 8 + 2;
139 case TGSI_SEMANTIC_PSIZE
:
140 return SI_MAX_IO_GENERIC
+ 6 + 8 + 3;
142 /* These can't be written by LS, HS, and ES. */
143 case TGSI_SEMANTIC_LAYER
:
144 return SI_MAX_IO_GENERIC
+ 6 + 8 + 4;
145 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
146 return SI_MAX_IO_GENERIC
+ 6 + 8 + 5;
147 case TGSI_SEMANTIC_PRIMID
:
148 STATIC_ASSERT(SI_MAX_IO_GENERIC
+ 6 + 8 + 6 <= 63);
149 return SI_MAX_IO_GENERIC
+ 6 + 8 + 6;
151 fprintf(stderr
, "invalid semantic name = %u\n", semantic_name
);
152 assert(!"invalid semantic name");
158 * Get the value of a shader input parameter and extract a bitfield.
160 static LLVMValueRef
unpack_llvm_param(struct si_shader_context
*ctx
,
161 LLVMValueRef value
, unsigned rshift
,
164 if (LLVMGetTypeKind(LLVMTypeOf(value
)) == LLVMFloatTypeKind
)
165 value
= ac_to_integer(&ctx
->ac
, value
);
168 value
= LLVMBuildLShr(ctx
->ac
.builder
, value
,
169 LLVMConstInt(ctx
->i32
, rshift
, 0), "");
171 if (rshift
+ bitwidth
< 32) {
172 unsigned mask
= (1 << bitwidth
) - 1;
173 value
= LLVMBuildAnd(ctx
->ac
.builder
, value
,
174 LLVMConstInt(ctx
->i32
, mask
, 0), "");
180 LLVMValueRef
si_unpack_param(struct si_shader_context
*ctx
,
181 struct ac_arg param
, unsigned rshift
,
184 LLVMValueRef value
= ac_get_arg(&ctx
->ac
, param
);
186 return unpack_llvm_param(ctx
, value
, rshift
, bitwidth
);
189 static LLVMValueRef
unpack_sint16(struct si_shader_context
*ctx
,
190 LLVMValueRef i32
, unsigned index
)
195 return LLVMBuildAShr(ctx
->ac
.builder
, i32
,
196 LLVMConstInt(ctx
->i32
, 16, 0), "");
198 return LLVMBuildSExt(ctx
->ac
.builder
,
199 LLVMBuildTrunc(ctx
->ac
.builder
, i32
,
204 void si_llvm_load_input_vs(
205 struct si_shader_context
*ctx
,
206 unsigned input_index
,
209 const struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
210 unsigned vs_blit_property
= info
->properties
[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD
];
212 if (vs_blit_property
) {
213 LLVMValueRef vertex_id
= ctx
->abi
.vertex_id
;
214 LLVMValueRef sel_x1
= LLVMBuildICmp(ctx
->ac
.builder
,
215 LLVMIntULE
, vertex_id
,
217 /* Use LLVMIntNE, because we have 3 vertices and only
218 * the middle one should use y2.
220 LLVMValueRef sel_y1
= LLVMBuildICmp(ctx
->ac
.builder
,
221 LLVMIntNE
, vertex_id
,
224 unsigned param_vs_blit_inputs
= ctx
->vs_blit_inputs
.arg_index
;
225 if (input_index
== 0) {
227 LLVMValueRef x1y1
= LLVMGetParam(ctx
->main_fn
,
228 param_vs_blit_inputs
);
229 LLVMValueRef x2y2
= LLVMGetParam(ctx
->main_fn
,
230 param_vs_blit_inputs
+ 1);
232 LLVMValueRef x1
= unpack_sint16(ctx
, x1y1
, 0);
233 LLVMValueRef y1
= unpack_sint16(ctx
, x1y1
, 1);
234 LLVMValueRef x2
= unpack_sint16(ctx
, x2y2
, 0);
235 LLVMValueRef y2
= unpack_sint16(ctx
, x2y2
, 1);
237 LLVMValueRef x
= LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
,
239 LLVMValueRef y
= LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
,
242 out
[0] = LLVMBuildSIToFP(ctx
->ac
.builder
, x
, ctx
->f32
, "");
243 out
[1] = LLVMBuildSIToFP(ctx
->ac
.builder
, y
, ctx
->f32
, "");
244 out
[2] = LLVMGetParam(ctx
->main_fn
,
245 param_vs_blit_inputs
+ 2);
246 out
[3] = ctx
->ac
.f32_1
;
250 /* Color or texture coordinates: */
251 assert(input_index
== 1);
253 if (vs_blit_property
== SI_VS_BLIT_SGPRS_POS_COLOR
) {
254 for (int i
= 0; i
< 4; i
++) {
255 out
[i
] = LLVMGetParam(ctx
->main_fn
,
256 param_vs_blit_inputs
+ 3 + i
);
259 assert(vs_blit_property
== SI_VS_BLIT_SGPRS_POS_TEXCOORD
);
260 LLVMValueRef x1
= LLVMGetParam(ctx
->main_fn
,
261 param_vs_blit_inputs
+ 3);
262 LLVMValueRef y1
= LLVMGetParam(ctx
->main_fn
,
263 param_vs_blit_inputs
+ 4);
264 LLVMValueRef x2
= LLVMGetParam(ctx
->main_fn
,
265 param_vs_blit_inputs
+ 5);
266 LLVMValueRef y2
= LLVMGetParam(ctx
->main_fn
,
267 param_vs_blit_inputs
+ 6);
269 out
[0] = LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
,
271 out
[1] = LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
,
273 out
[2] = LLVMGetParam(ctx
->main_fn
,
274 param_vs_blit_inputs
+ 7);
275 out
[3] = LLVMGetParam(ctx
->main_fn
,
276 param_vs_blit_inputs
+ 8);
281 unsigned num_vbos_in_user_sgprs
= ctx
->shader
->selector
->num_vbos_in_user_sgprs
;
282 union si_vs_fix_fetch fix_fetch
;
283 LLVMValueRef vb_desc
;
284 LLVMValueRef vertex_index
;
287 if (input_index
< num_vbos_in_user_sgprs
) {
288 vb_desc
= ac_get_arg(&ctx
->ac
, ctx
->vb_descriptors
[input_index
]);
290 unsigned index
= input_index
- num_vbos_in_user_sgprs
;
291 vb_desc
= ac_build_load_to_sgpr(&ctx
->ac
,
292 ac_get_arg(&ctx
->ac
, ctx
->vertex_buffers
),
293 LLVMConstInt(ctx
->i32
, index
, 0));
296 vertex_index
= LLVMGetParam(ctx
->main_fn
,
297 ctx
->vertex_index0
.arg_index
+
300 /* Use the open-coded implementation for all loads of doubles and
301 * of dword-sized data that needs fixups. We need to insert conversion
302 * code anyway, and the amd/common code does it for us.
304 * Note: On LLVM <= 8, we can only open-code formats with
305 * channel size >= 4 bytes.
307 bool opencode
= ctx
->shader
->key
.mono
.vs_fetch_opencode
& (1 << input_index
);
308 fix_fetch
.bits
= ctx
->shader
->key
.mono
.vs_fix_fetch
[input_index
].bits
;
310 (fix_fetch
.u
.log_size
== 3 && fix_fetch
.u
.format
== AC_FETCH_FORMAT_FLOAT
) ||
311 (fix_fetch
.u
.log_size
== 2)) {
312 tmp
= ac_build_opencoded_load_format(
313 &ctx
->ac
, fix_fetch
.u
.log_size
, fix_fetch
.u
.num_channels_m1
+ 1,
314 fix_fetch
.u
.format
, fix_fetch
.u
.reverse
, !opencode
,
315 vb_desc
, vertex_index
, ctx
->ac
.i32_0
, ctx
->ac
.i32_0
, 0, true);
316 for (unsigned i
= 0; i
< 4; ++i
)
317 out
[i
] = LLVMBuildExtractElement(ctx
->ac
.builder
, tmp
, LLVMConstInt(ctx
->i32
, i
, false), "");
321 /* Do multiple loads for special formats. */
322 unsigned required_channels
= util_last_bit(info
->input_usage_mask
[input_index
]);
323 LLVMValueRef fetches
[4];
324 unsigned num_fetches
;
325 unsigned fetch_stride
;
326 unsigned channels_per_fetch
;
328 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2) {
329 num_fetches
= MIN2(required_channels
, 3);
330 fetch_stride
= 1 << fix_fetch
.u
.log_size
;
331 channels_per_fetch
= 1;
335 channels_per_fetch
= required_channels
;
338 for (unsigned i
= 0; i
< num_fetches
; ++i
) {
339 LLVMValueRef voffset
= LLVMConstInt(ctx
->i32
, fetch_stride
* i
, 0);
340 fetches
[i
] = ac_build_buffer_load_format(&ctx
->ac
, vb_desc
, vertex_index
, voffset
,
341 channels_per_fetch
, 0, true);
344 if (num_fetches
== 1 && channels_per_fetch
> 1) {
345 LLVMValueRef fetch
= fetches
[0];
346 for (unsigned i
= 0; i
< channels_per_fetch
; ++i
) {
347 tmp
= LLVMConstInt(ctx
->i32
, i
, false);
348 fetches
[i
] = LLVMBuildExtractElement(
349 ctx
->ac
.builder
, fetch
, tmp
, "");
351 num_fetches
= channels_per_fetch
;
352 channels_per_fetch
= 1;
355 for (unsigned i
= num_fetches
; i
< 4; ++i
)
356 fetches
[i
] = LLVMGetUndef(ctx
->f32
);
358 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2 &&
359 required_channels
== 4) {
360 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_UINT
|| fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
)
361 fetches
[3] = ctx
->ac
.i32_1
;
363 fetches
[3] = ctx
->ac
.f32_1
;
364 } else if (fix_fetch
.u
.log_size
== 3 &&
365 (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
||
366 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
||
367 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
) &&
368 required_channels
== 4) {
369 /* For 2_10_10_10, the hardware returns an unsigned value;
370 * convert it to a signed one.
372 LLVMValueRef tmp
= fetches
[3];
373 LLVMValueRef c30
= LLVMConstInt(ctx
->i32
, 30, 0);
375 /* First, recover the sign-extended signed integer value. */
376 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
)
377 tmp
= LLVMBuildFPToUI(ctx
->ac
.builder
, tmp
, ctx
->i32
, "");
379 tmp
= ac_to_integer(&ctx
->ac
, tmp
);
381 /* For the integer-like cases, do a natural sign extension.
383 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
384 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
387 tmp
= LLVMBuildShl(ctx
->ac
.builder
, tmp
,
388 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
?
389 LLVMConstInt(ctx
->i32
, 7, 0) : c30
, "");
390 tmp
= LLVMBuildAShr(ctx
->ac
.builder
, tmp
, c30
, "");
392 /* Convert back to the right type. */
393 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
) {
395 LLVMValueRef neg_one
= LLVMConstReal(ctx
->f32
, -1.0);
396 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->f32
, "");
397 clamp
= LLVMBuildFCmp(ctx
->ac
.builder
, LLVMRealULT
, tmp
, neg_one
, "");
398 tmp
= LLVMBuildSelect(ctx
->ac
.builder
, clamp
, neg_one
, tmp
, "");
399 } else if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
) {
400 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->f32
, "");
406 for (unsigned i
= 0; i
< 4; ++i
)
407 out
[i
] = ac_to_float(&ctx
->ac
, fetches
[i
]);
410 LLVMValueRef
si_get_primitive_id(struct si_shader_context
*ctx
,
417 case PIPE_SHADER_VERTEX
:
418 return ac_get_arg(&ctx
->ac
, ctx
->vs_prim_id
);
419 case PIPE_SHADER_TESS_CTRL
:
420 return ac_get_arg(&ctx
->ac
, ctx
->args
.tcs_patch_id
);
421 case PIPE_SHADER_TESS_EVAL
:
422 return ac_get_arg(&ctx
->ac
, ctx
->args
.tes_patch_id
);
423 case PIPE_SHADER_GEOMETRY
:
424 return ac_get_arg(&ctx
->ac
, ctx
->args
.gs_prim_id
);
431 static LLVMValueRef
si_llvm_load_input_gs(struct ac_shader_abi
*abi
,
432 unsigned input_index
,
433 unsigned vtx_offset_param
,
437 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
438 struct si_shader
*shader
= ctx
->shader
;
439 LLVMValueRef vtx_offset
, soffset
;
440 struct si_shader_info
*info
= &shader
->selector
->info
;
441 unsigned semantic_name
= info
->input_semantic_name
[input_index
];
442 unsigned semantic_index
= info
->input_semantic_index
[input_index
];
446 param
= si_shader_io_get_unique_index(semantic_name
, semantic_index
, false);
448 /* GFX9 has the ESGS ring in LDS. */
449 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
450 unsigned index
= vtx_offset_param
;
454 vtx_offset
= si_unpack_param(ctx
, ctx
->gs_vtx01_offset
,
455 index
% 2 ? 16 : 0, 16);
458 vtx_offset
= si_unpack_param(ctx
, ctx
->gs_vtx23_offset
,
459 index
% 2 ? 16 : 0, 16);
462 vtx_offset
= si_unpack_param(ctx
, ctx
->gs_vtx45_offset
,
463 index
% 2 ? 16 : 0, 16);
470 unsigned offset
= param
* 4 + swizzle
;
471 vtx_offset
= LLVMBuildAdd(ctx
->ac
.builder
, vtx_offset
,
472 LLVMConstInt(ctx
->i32
, offset
, false), "");
474 LLVMValueRef ptr
= ac_build_gep0(&ctx
->ac
, ctx
->esgs_ring
, vtx_offset
);
475 LLVMValueRef value
= LLVMBuildLoad(ctx
->ac
.builder
, ptr
, "");
476 if (llvm_type_is_64bit(ctx
, type
)) {
477 ptr
= LLVMBuildGEP(ctx
->ac
.builder
, ptr
,
478 &ctx
->ac
.i32_1
, 1, "");
479 LLVMValueRef values
[2] = {
481 LLVMBuildLoad(ctx
->ac
.builder
, ptr
, "")
483 value
= ac_build_gather_values(&ctx
->ac
, values
, 2);
485 return LLVMBuildBitCast(ctx
->ac
.builder
, value
, type
, "");
488 /* GFX6: input load from the ESGS ring in memory. */
490 LLVMValueRef values
[4];
492 for (chan
= 0; chan
< 4; chan
++) {
493 values
[chan
] = si_llvm_load_input_gs(abi
, input_index
, vtx_offset_param
,
496 return ac_build_gather_values(&ctx
->ac
, values
, 4);
499 /* Get the vertex offset parameter on GFX6. */
500 LLVMValueRef gs_vtx_offset
= ac_get_arg(&ctx
->ac
,
501 ctx
->gs_vtx_offset
[vtx_offset_param
]);
503 vtx_offset
= LLVMBuildMul(ctx
->ac
.builder
, gs_vtx_offset
,
504 LLVMConstInt(ctx
->i32
, 4, 0), "");
506 soffset
= LLVMConstInt(ctx
->i32
, (param
* 4 + swizzle
) * 256, 0);
508 value
= ac_build_buffer_load(&ctx
->ac
, ctx
->esgs_ring
, 1, ctx
->i32_0
,
509 vtx_offset
, soffset
, 0, ac_glc
, true, false);
510 if (llvm_type_is_64bit(ctx
, type
)) {
512 soffset
= LLVMConstInt(ctx
->i32
, (param
* 4 + swizzle
+ 1) * 256, 0);
514 value2
= ac_build_buffer_load(&ctx
->ac
, ctx
->esgs_ring
, 1,
515 ctx
->i32_0
, vtx_offset
, soffset
,
516 0, ac_glc
, true, false);
517 return si_build_gather_64bit(ctx
, type
, value
, value2
);
519 return LLVMBuildBitCast(ctx
->ac
.builder
, value
, type
, "");
522 static LLVMValueRef
si_nir_load_input_gs(struct ac_shader_abi
*abi
,
524 unsigned driver_location
,
526 unsigned num_components
,
527 unsigned vertex_index
,
528 unsigned const_index
,
531 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
533 LLVMValueRef value
[4];
534 for (unsigned i
= 0; i
< num_components
; i
++) {
536 if (llvm_type_is_64bit(ctx
, type
))
540 value
[i
+ component
] = si_llvm_load_input_gs(&ctx
->abi
, driver_location
/ 4 + const_index
,
541 vertex_index
, type
, offset
);
544 return ac_build_varying_gather_values(&ctx
->ac
, value
, num_components
, component
);
547 static LLVMValueRef
get_base_vertex(struct ac_shader_abi
*abi
)
549 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
551 /* For non-indexed draws, the base vertex set by the driver
552 * (for direct draws) or the CP (for indirect draws) is the
553 * first vertex ID, but GLSL expects 0 to be returned.
555 LLVMValueRef vs_state
= ac_get_arg(&ctx
->ac
,
557 LLVMValueRef indexed
;
559 indexed
= LLVMBuildLShr(ctx
->ac
.builder
, vs_state
, ctx
->i32_1
, "");
560 indexed
= LLVMBuildTrunc(ctx
->ac
.builder
, indexed
, ctx
->i1
, "");
562 return LLVMBuildSelect(ctx
->ac
.builder
, indexed
,
563 ac_get_arg(&ctx
->ac
, ctx
->args
.base_vertex
),
567 static LLVMValueRef
get_block_size(struct ac_shader_abi
*abi
)
569 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
571 LLVMValueRef values
[3];
574 unsigned *properties
= ctx
->shader
->selector
->info
.properties
;
576 if (properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH
] != 0) {
577 unsigned sizes
[3] = {
578 properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH
],
579 properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT
],
580 properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH
]
583 for (i
= 0; i
< 3; ++i
)
584 values
[i
] = LLVMConstInt(ctx
->i32
, sizes
[i
], 0);
586 result
= ac_build_gather_values(&ctx
->ac
, values
, 3);
588 result
= ac_get_arg(&ctx
->ac
, ctx
->block_size
);
594 void si_declare_compute_memory(struct si_shader_context
*ctx
)
596 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
597 unsigned lds_size
= sel
->info
.properties
[TGSI_PROPERTY_CS_LOCAL_SIZE
];
599 LLVMTypeRef i8p
= LLVMPointerType(ctx
->i8
, AC_ADDR_SPACE_LDS
);
602 assert(!ctx
->ac
.lds
);
604 var
= LLVMAddGlobalInAddressSpace(ctx
->ac
.module
,
605 LLVMArrayType(ctx
->i8
, lds_size
),
608 LLVMSetAlignment(var
, 64 * 1024);
610 ctx
->ac
.lds
= LLVMBuildBitCast(ctx
->ac
.builder
, var
, i8p
, "");
613 static LLVMValueRef
load_const_buffer_desc_fast_path(struct si_shader_context
*ctx
)
616 ac_get_arg(&ctx
->ac
, ctx
->const_and_shader_buffers
);
617 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
619 /* Do the bounds checking with a descriptor, because
620 * doing computation and manual bounds checking of 64-bit
621 * addresses generates horrible VALU code with very high
622 * VGPR usage and very low SIMD occupancy.
624 ptr
= LLVMBuildPtrToInt(ctx
->ac
.builder
, ptr
, ctx
->ac
.intptr
, "");
626 LLVMValueRef desc0
, desc1
;
628 desc1
= LLVMConstInt(ctx
->i32
,
629 S_008F04_BASE_ADDRESS_HI(ctx
->screen
->info
.address32_hi
), 0);
631 uint32_t rsrc3
= S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X
) |
632 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y
) |
633 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z
) |
634 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W
);
636 if (ctx
->screen
->info
.chip_class
>= GFX10
)
637 rsrc3
|= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT
) |
638 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW
) |
639 S_008F0C_RESOURCE_LEVEL(1);
641 rsrc3
|= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT
) |
642 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32
);
644 LLVMValueRef desc_elems
[] = {
647 LLVMConstInt(ctx
->i32
, sel
->info
.constbuf0_num_slots
* 16, 0),
648 LLVMConstInt(ctx
->i32
, rsrc3
, false)
651 return ac_build_gather_values(&ctx
->ac
, desc_elems
, 4);
654 static LLVMValueRef
load_ubo(struct ac_shader_abi
*abi
, LLVMValueRef index
)
656 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
657 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
659 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->const_and_shader_buffers
);
661 if (sel
->info
.const_buffers_declared
== 1 &&
662 sel
->info
.shader_buffers_declared
== 0) {
663 return load_const_buffer_desc_fast_path(ctx
);
666 index
= si_llvm_bound_index(ctx
, index
, ctx
->num_const_buffers
);
667 index
= LLVMBuildAdd(ctx
->ac
.builder
, index
,
668 LLVMConstInt(ctx
->i32
, SI_NUM_SHADER_BUFFERS
, 0), "");
670 return ac_build_load_to_sgpr(&ctx
->ac
, ptr
, index
);
674 load_ssbo(struct ac_shader_abi
*abi
, LLVMValueRef index
, bool write
)
676 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
677 LLVMValueRef rsrc_ptr
= ac_get_arg(&ctx
->ac
,
678 ctx
->const_and_shader_buffers
);
680 index
= si_llvm_bound_index(ctx
, index
, ctx
->num_shader_buffers
);
681 index
= LLVMBuildSub(ctx
->ac
.builder
,
682 LLVMConstInt(ctx
->i32
, SI_NUM_SHADER_BUFFERS
- 1, 0),
685 return ac_build_load_to_sgpr(&ctx
->ac
, rsrc_ptr
, index
);
688 /* Initialize arguments for the shader export intrinsic */
689 static void si_llvm_init_vs_export_args(struct si_shader_context
*ctx
,
690 LLVMValueRef
*values
,
692 struct ac_export_args
*args
)
694 args
->enabled_channels
= 0xf; /* writemask - default is 0xf */
695 args
->valid_mask
= 0; /* Specify whether the EXEC mask represents the valid mask */
696 args
->done
= 0; /* Specify whether this is the last export */
697 args
->target
= target
; /* Specify the target we are exporting */
700 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
703 static void si_llvm_emit_clipvertex(struct si_shader_context
*ctx
,
704 struct ac_export_args
*pos
, LLVMValueRef
*out_elts
)
709 LLVMValueRef base_elt
;
710 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
711 LLVMValueRef constbuf_index
= LLVMConstInt(ctx
->i32
,
712 SI_VS_CONST_CLIP_PLANES
, 0);
713 LLVMValueRef const_resource
= ac_build_load_to_sgpr(&ctx
->ac
, ptr
, constbuf_index
);
715 for (reg_index
= 0; reg_index
< 2; reg_index
++) {
716 struct ac_export_args
*args
= &pos
[2 + reg_index
];
721 args
->out
[3] = LLVMConstReal(ctx
->f32
, 0.0f
);
723 /* Compute dot products of position and user clip plane vectors */
724 for (chan
= 0; chan
< 4; chan
++) {
725 for (const_chan
= 0; const_chan
< 4; const_chan
++) {
727 LLVMConstInt(ctx
->i32
, ((reg_index
* 4 + chan
) * 4 +
729 base_elt
= si_buffer_load_const(ctx
, const_resource
,
731 args
->out
[chan
] = ac_build_fmad(&ctx
->ac
, base_elt
,
732 out_elts
[const_chan
], args
->out
[chan
]);
736 args
->enabled_channels
= 0xf;
737 args
->valid_mask
= 0;
739 args
->target
= V_008DFC_SQ_EXP_POS
+ 2 + reg_index
;
744 static void si_dump_streamout(struct pipe_stream_output_info
*so
)
749 fprintf(stderr
, "STREAMOUT\n");
751 for (i
= 0; i
< so
->num_outputs
; i
++) {
752 unsigned mask
= ((1 << so
->output
[i
].num_components
) - 1) <<
753 so
->output
[i
].start_component
;
754 fprintf(stderr
, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
755 i
, so
->output
[i
].output_buffer
,
756 so
->output
[i
].dst_offset
, so
->output
[i
].dst_offset
+ so
->output
[i
].num_components
- 1,
757 so
->output
[i
].register_index
,
761 mask
& 8 ? "w" : "");
765 void si_emit_streamout_output(struct si_shader_context
*ctx
,
766 LLVMValueRef
const *so_buffers
,
767 LLVMValueRef
const *so_write_offsets
,
768 struct pipe_stream_output
*stream_out
,
769 struct si_shader_output_values
*shader_out
)
771 unsigned buf_idx
= stream_out
->output_buffer
;
772 unsigned start
= stream_out
->start_component
;
773 unsigned num_comps
= stream_out
->num_components
;
776 assert(num_comps
&& num_comps
<= 4);
777 if (!num_comps
|| num_comps
> 4)
780 /* Load the output as int. */
781 for (int j
= 0; j
< num_comps
; j
++) {
782 assert(stream_out
->stream
== shader_out
->vertex_stream
[start
+ j
]);
784 out
[j
] = ac_to_integer(&ctx
->ac
, shader_out
->values
[start
+ j
]);
787 /* Pack the output. */
788 LLVMValueRef vdata
= NULL
;
794 case 2: /* as v2i32 */
795 case 3: /* as v3i32 */
796 if (ac_has_vec3_support(ctx
->screen
->info
.chip_class
, false)) {
797 vdata
= ac_build_gather_values(&ctx
->ac
, out
, num_comps
);
800 /* as v4i32 (aligned to 4) */
801 out
[3] = LLVMGetUndef(ctx
->i32
);
803 case 4: /* as v4i32 */
804 vdata
= ac_build_gather_values(&ctx
->ac
, out
, util_next_power_of_two(num_comps
));
808 ac_build_buffer_store_dword(&ctx
->ac
, so_buffers
[buf_idx
],
810 so_write_offsets
[buf_idx
],
812 stream_out
->dst_offset
* 4, ac_glc
| ac_slc
);
816 * Write streamout data to buffers for vertex stream @p stream (different
817 * vertex streams can occur for GS copy shaders).
819 static void si_llvm_emit_streamout(struct si_shader_context
*ctx
,
820 struct si_shader_output_values
*outputs
,
821 unsigned noutput
, unsigned stream
)
823 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
824 struct pipe_stream_output_info
*so
= &sel
->so
;
825 LLVMBuilderRef builder
= ctx
->ac
.builder
;
828 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
829 LLVMValueRef so_vtx_count
=
830 si_unpack_param(ctx
, ctx
->streamout_config
, 16, 7);
832 LLVMValueRef tid
= ac_get_thread_id(&ctx
->ac
);
834 /* can_emit = tid < so_vtx_count; */
835 LLVMValueRef can_emit
=
836 LLVMBuildICmp(builder
, LLVMIntULT
, tid
, so_vtx_count
, "");
838 /* Emit the streamout code conditionally. This actually avoids
839 * out-of-bounds buffer access. The hw tells us via the SGPR
840 * (so_vtx_count) which threads are allowed to emit streamout data. */
841 ac_build_ifcc(&ctx
->ac
, can_emit
, 6501);
843 /* The buffer offset is computed as follows:
844 * ByteOffset = streamout_offset[buffer_id]*4 +
845 * (streamout_write_index + thread_id)*stride[buffer_id] +
849 LLVMValueRef so_write_index
=
851 ctx
->streamout_write_index
);
853 /* Compute (streamout_write_index + thread_id). */
854 so_write_index
= LLVMBuildAdd(builder
, so_write_index
, tid
, "");
856 /* Load the descriptor and compute the write offset for each
858 LLVMValueRef so_write_offset
[4] = {};
859 LLVMValueRef so_buffers
[4];
860 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
,
863 for (i
= 0; i
< 4; i
++) {
867 LLVMValueRef offset
= LLVMConstInt(ctx
->i32
,
868 SI_VS_STREAMOUT_BUF0
+ i
, 0);
870 so_buffers
[i
] = ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
872 LLVMValueRef so_offset
= ac_get_arg(&ctx
->ac
,
873 ctx
->streamout_offset
[i
]);
874 so_offset
= LLVMBuildMul(builder
, so_offset
, LLVMConstInt(ctx
->i32
, 4, 0), "");
876 so_write_offset
[i
] = ac_build_imad(&ctx
->ac
, so_write_index
,
877 LLVMConstInt(ctx
->i32
, so
->stride
[i
]*4, 0),
881 /* Write streamout data. */
882 for (i
= 0; i
< so
->num_outputs
; i
++) {
883 unsigned reg
= so
->output
[i
].register_index
;
888 if (stream
!= so
->output
[i
].stream
)
891 si_emit_streamout_output(ctx
, so_buffers
, so_write_offset
,
892 &so
->output
[i
], &outputs
[reg
]);
895 ac_build_endif(&ctx
->ac
, 6501);
898 static void si_export_param(struct si_shader_context
*ctx
, unsigned index
,
899 LLVMValueRef
*values
)
901 struct ac_export_args args
;
903 si_llvm_init_vs_export_args(ctx
, values
,
904 V_008DFC_SQ_EXP_PARAM
+ index
, &args
);
905 ac_build_export(&ctx
->ac
, &args
);
908 static void si_build_param_exports(struct si_shader_context
*ctx
,
909 struct si_shader_output_values
*outputs
,
912 struct si_shader
*shader
= ctx
->shader
;
913 unsigned param_count
= 0;
915 for (unsigned i
= 0; i
< noutput
; i
++) {
916 unsigned semantic_name
= outputs
[i
].semantic_name
;
917 unsigned semantic_index
= outputs
[i
].semantic_index
;
919 if (outputs
[i
].vertex_stream
[0] != 0 &&
920 outputs
[i
].vertex_stream
[1] != 0 &&
921 outputs
[i
].vertex_stream
[2] != 0 &&
922 outputs
[i
].vertex_stream
[3] != 0)
925 switch (semantic_name
) {
926 case TGSI_SEMANTIC_LAYER
:
927 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
928 case TGSI_SEMANTIC_CLIPDIST
:
929 case TGSI_SEMANTIC_COLOR
:
930 case TGSI_SEMANTIC_BCOLOR
:
931 case TGSI_SEMANTIC_PRIMID
:
932 case TGSI_SEMANTIC_FOG
:
933 case TGSI_SEMANTIC_TEXCOORD
:
934 case TGSI_SEMANTIC_GENERIC
:
940 if ((semantic_name
!= TGSI_SEMANTIC_GENERIC
||
941 semantic_index
< SI_MAX_IO_GENERIC
) &&
942 shader
->key
.opt
.kill_outputs
&
943 (1ull << si_shader_io_get_unique_index(semantic_name
,
944 semantic_index
, true)))
947 si_export_param(ctx
, param_count
, outputs
[i
].values
);
949 assert(i
< ARRAY_SIZE(shader
->info
.vs_output_param_offset
));
950 shader
->info
.vs_output_param_offset
[i
] = param_count
++;
953 shader
->info
.nr_param_exports
= param_count
;
957 * Vertex color clamping.
959 * This uses a state constant loaded in a user data SGPR and
960 * an IF statement is added that clamps all colors if the constant
963 static void si_vertex_color_clamping(struct si_shader_context
*ctx
,
964 struct si_shader_output_values
*outputs
,
967 LLVMValueRef addr
[SI_MAX_VS_OUTPUTS
][4];
968 bool has_colors
= false;
970 /* Store original colors to alloca variables. */
971 for (unsigned i
= 0; i
< noutput
; i
++) {
972 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
973 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
976 for (unsigned j
= 0; j
< 4; j
++) {
977 addr
[i
][j
] = ac_build_alloca_undef(&ctx
->ac
, ctx
->f32
, "");
978 LLVMBuildStore(ctx
->ac
.builder
, outputs
[i
].values
[j
], addr
[i
][j
]);
986 /* The state is in the first bit of the user SGPR. */
987 LLVMValueRef cond
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
988 cond
= LLVMBuildTrunc(ctx
->ac
.builder
, cond
, ctx
->i1
, "");
990 ac_build_ifcc(&ctx
->ac
, cond
, 6502);
992 /* Store clamped colors to alloca variables within the conditional block. */
993 for (unsigned i
= 0; i
< noutput
; i
++) {
994 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
995 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
998 for (unsigned j
= 0; j
< 4; j
++) {
999 LLVMBuildStore(ctx
->ac
.builder
,
1000 ac_build_clamp(&ctx
->ac
, outputs
[i
].values
[j
]),
1004 ac_build_endif(&ctx
->ac
, 6502);
1006 /* Load clamped colors */
1007 for (unsigned i
= 0; i
< noutput
; i
++) {
1008 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
1009 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
1012 for (unsigned j
= 0; j
< 4; j
++) {
1013 outputs
[i
].values
[j
] =
1014 LLVMBuildLoad(ctx
->ac
.builder
, addr
[i
][j
], "");
1019 /* Generate export instructions for hardware VS shader stage or NGG GS stage
1020 * (position and parameter data only).
1022 void si_llvm_export_vs(struct si_shader_context
*ctx
,
1023 struct si_shader_output_values
*outputs
,
1026 struct si_shader
*shader
= ctx
->shader
;
1027 struct ac_export_args pos_args
[4] = {};
1028 LLVMValueRef psize_value
= NULL
, edgeflag_value
= NULL
, layer_value
= NULL
, viewport_index_value
= NULL
;
1032 si_vertex_color_clamping(ctx
, outputs
, noutput
);
1034 /* Build position exports. */
1035 for (i
= 0; i
< noutput
; i
++) {
1036 switch (outputs
[i
].semantic_name
) {
1037 case TGSI_SEMANTIC_POSITION
:
1038 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
,
1039 V_008DFC_SQ_EXP_POS
, &pos_args
[0]);
1041 case TGSI_SEMANTIC_PSIZE
:
1042 psize_value
= outputs
[i
].values
[0];
1044 case TGSI_SEMANTIC_LAYER
:
1045 layer_value
= outputs
[i
].values
[0];
1047 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
1048 viewport_index_value
= outputs
[i
].values
[0];
1050 case TGSI_SEMANTIC_EDGEFLAG
:
1051 edgeflag_value
= outputs
[i
].values
[0];
1053 case TGSI_SEMANTIC_CLIPDIST
:
1054 if (!shader
->key
.opt
.clip_disable
) {
1055 unsigned index
= 2 + outputs
[i
].semantic_index
;
1056 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
,
1057 V_008DFC_SQ_EXP_POS
+ index
,
1061 case TGSI_SEMANTIC_CLIPVERTEX
:
1062 if (!shader
->key
.opt
.clip_disable
) {
1063 si_llvm_emit_clipvertex(ctx
, pos_args
,
1070 /* We need to add the position output manually if it's missing. */
1071 if (!pos_args
[0].out
[0]) {
1072 pos_args
[0].enabled_channels
= 0xf; /* writemask */
1073 pos_args
[0].valid_mask
= 0; /* EXEC mask */
1074 pos_args
[0].done
= 0; /* last export? */
1075 pos_args
[0].target
= V_008DFC_SQ_EXP_POS
;
1076 pos_args
[0].compr
= 0; /* COMPR flag */
1077 pos_args
[0].out
[0] = ctx
->ac
.f32_0
; /* X */
1078 pos_args
[0].out
[1] = ctx
->ac
.f32_0
; /* Y */
1079 pos_args
[0].out
[2] = ctx
->ac
.f32_0
; /* Z */
1080 pos_args
[0].out
[3] = ctx
->ac
.f32_1
; /* W */
1083 bool pos_writes_edgeflag
= shader
->selector
->info
.writes_edgeflag
&&
1084 !shader
->key
.as_ngg
;
1086 /* Write the misc vector (point size, edgeflag, layer, viewport). */
1087 if (shader
->selector
->info
.writes_psize
||
1088 pos_writes_edgeflag
||
1089 shader
->selector
->info
.writes_viewport_index
||
1090 shader
->selector
->info
.writes_layer
) {
1091 pos_args
[1].enabled_channels
= shader
->selector
->info
.writes_psize
|
1092 (pos_writes_edgeflag
<< 1) |
1093 (shader
->selector
->info
.writes_layer
<< 2);
1095 pos_args
[1].valid_mask
= 0; /* EXEC mask */
1096 pos_args
[1].done
= 0; /* last export? */
1097 pos_args
[1].target
= V_008DFC_SQ_EXP_POS
+ 1;
1098 pos_args
[1].compr
= 0; /* COMPR flag */
1099 pos_args
[1].out
[0] = ctx
->ac
.f32_0
; /* X */
1100 pos_args
[1].out
[1] = ctx
->ac
.f32_0
; /* Y */
1101 pos_args
[1].out
[2] = ctx
->ac
.f32_0
; /* Z */
1102 pos_args
[1].out
[3] = ctx
->ac
.f32_0
; /* W */
1104 if (shader
->selector
->info
.writes_psize
)
1105 pos_args
[1].out
[0] = psize_value
;
1107 if (pos_writes_edgeflag
) {
1108 /* The output is a float, but the hw expects an integer
1109 * with the first bit containing the edge flag. */
1110 edgeflag_value
= LLVMBuildFPToUI(ctx
->ac
.builder
,
1113 edgeflag_value
= ac_build_umin(&ctx
->ac
,
1117 /* The LLVM intrinsic expects a float. */
1118 pos_args
[1].out
[1] = ac_to_float(&ctx
->ac
, edgeflag_value
);
1121 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
1122 /* GFX9 has the layer in out.z[10:0] and the viewport
1123 * index in out.z[19:16].
1125 if (shader
->selector
->info
.writes_layer
)
1126 pos_args
[1].out
[2] = layer_value
;
1128 if (shader
->selector
->info
.writes_viewport_index
) {
1129 LLVMValueRef v
= viewport_index_value
;
1131 v
= ac_to_integer(&ctx
->ac
, v
);
1132 v
= LLVMBuildShl(ctx
->ac
.builder
, v
,
1133 LLVMConstInt(ctx
->i32
, 16, 0), "");
1134 v
= LLVMBuildOr(ctx
->ac
.builder
, v
,
1135 ac_to_integer(&ctx
->ac
, pos_args
[1].out
[2]), "");
1136 pos_args
[1].out
[2] = ac_to_float(&ctx
->ac
, v
);
1137 pos_args
[1].enabled_channels
|= 1 << 2;
1140 if (shader
->selector
->info
.writes_layer
)
1141 pos_args
[1].out
[2] = layer_value
;
1143 if (shader
->selector
->info
.writes_viewport_index
) {
1144 pos_args
[1].out
[3] = viewport_index_value
;
1145 pos_args
[1].enabled_channels
|= 1 << 3;
1150 for (i
= 0; i
< 4; i
++)
1151 if (pos_args
[i
].out
[0])
1152 shader
->info
.nr_pos_exports
++;
1154 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
1155 * Setting valid_mask=1 prevents it and has no other effect.
1157 if (ctx
->screen
->info
.family
== CHIP_NAVI10
||
1158 ctx
->screen
->info
.family
== CHIP_NAVI12
||
1159 ctx
->screen
->info
.family
== CHIP_NAVI14
)
1160 pos_args
[0].valid_mask
= 1;
1163 for (i
= 0; i
< 4; i
++) {
1164 if (!pos_args
[i
].out
[0])
1167 /* Specify the target we are exporting */
1168 pos_args
[i
].target
= V_008DFC_SQ_EXP_POS
+ pos_idx
++;
1170 if (pos_idx
== shader
->info
.nr_pos_exports
)
1171 /* Specify that this is the last export */
1172 pos_args
[i
].done
= 1;
1174 ac_build_export(&ctx
->ac
, &pos_args
[i
]);
1177 /* Build parameter exports. */
1178 si_build_param_exports(ctx
, outputs
, noutput
);
1181 /* Pass GS inputs from ES to GS on GFX9. */
1182 static void si_set_es_return_value_for_gs(struct si_shader_context
*ctx
)
1184 LLVMValueRef ret
= ctx
->return_value
;
1186 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->other_const_and_shader_buffers
, 0);
1187 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->other_samplers_and_images
, 1);
1188 if (ctx
->shader
->key
.as_ngg
)
1189 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->gs_tg_info
, 2);
1191 ret
= si_insert_input_ret(ctx
, ret
, ctx
->gs2vs_offset
, 2);
1192 ret
= si_insert_input_ret(ctx
, ret
, ctx
->merged_wave_info
, 3);
1193 ret
= si_insert_input_ret(ctx
, ret
, ctx
->merged_scratch_offset
, 5);
1195 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->rw_buffers
,
1196 8 + SI_SGPR_RW_BUFFERS
);
1197 ret
= si_insert_input_ptr(ctx
, ret
,
1198 ctx
->bindless_samplers_and_images
,
1199 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES
);
1200 if (ctx
->screen
->use_ngg
) {
1201 ret
= si_insert_input_ptr(ctx
, ret
, ctx
->vs_state_bits
,
1202 8 + SI_SGPR_VS_STATE_BITS
);
1206 if (ctx
->type
== PIPE_SHADER_VERTEX
)
1207 vgpr
= 8 + GFX9_VSGS_NUM_USER_SGPR
;
1209 vgpr
= 8 + GFX9_TESGS_NUM_USER_SGPR
;
1211 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->gs_vtx01_offset
, vgpr
++);
1212 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->gs_vtx23_offset
, vgpr
++);
1213 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->args
.gs_prim_id
, vgpr
++);
1214 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->args
.gs_invocation_id
, vgpr
++);
1215 ret
= si_insert_input_ret_float(ctx
, ret
, ctx
->gs_vtx45_offset
, vgpr
++);
1216 ctx
->return_value
= ret
;
1219 static void si_llvm_emit_es_epilogue(struct ac_shader_abi
*abi
,
1220 unsigned max_outputs
,
1221 LLVMValueRef
*addrs
)
1223 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
1224 struct si_shader
*es
= ctx
->shader
;
1225 struct si_shader_info
*info
= &es
->selector
->info
;
1226 LLVMValueRef lds_base
= NULL
;
1230 if (ctx
->screen
->info
.chip_class
>= GFX9
&& info
->num_outputs
) {
1231 unsigned itemsize_dw
= es
->selector
->esgs_itemsize
/ 4;
1232 LLVMValueRef vertex_idx
= ac_get_thread_id(&ctx
->ac
);
1233 LLVMValueRef wave_idx
= si_unpack_param(ctx
, ctx
->merged_wave_info
, 24, 4);
1234 vertex_idx
= LLVMBuildOr(ctx
->ac
.builder
, vertex_idx
,
1235 LLVMBuildMul(ctx
->ac
.builder
, wave_idx
,
1236 LLVMConstInt(ctx
->i32
, ctx
->ac
.wave_size
, false), ""), "");
1237 lds_base
= LLVMBuildMul(ctx
->ac
.builder
, vertex_idx
,
1238 LLVMConstInt(ctx
->i32
, itemsize_dw
, 0), "");
1241 for (i
= 0; i
< info
->num_outputs
; i
++) {
1244 if (info
->output_semantic_name
[i
] == TGSI_SEMANTIC_VIEWPORT_INDEX
||
1245 info
->output_semantic_name
[i
] == TGSI_SEMANTIC_LAYER
)
1248 param
= si_shader_io_get_unique_index(info
->output_semantic_name
[i
],
1249 info
->output_semantic_index
[i
], false);
1251 for (chan
= 0; chan
< 4; chan
++) {
1252 if (!(info
->output_usagemask
[i
] & (1 << chan
)))
1255 LLVMValueRef out_val
= LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
1256 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
1258 /* GFX9 has the ESGS ring in LDS. */
1259 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
1260 LLVMValueRef idx
= LLVMConstInt(ctx
->i32
, param
* 4 + chan
, false);
1261 idx
= LLVMBuildAdd(ctx
->ac
.builder
, lds_base
, idx
, "");
1262 ac_build_indexed_store(&ctx
->ac
, ctx
->esgs_ring
, idx
, out_val
);
1266 ac_build_buffer_store_dword(&ctx
->ac
,
1269 ac_get_arg(&ctx
->ac
, ctx
->es2gs_offset
),
1270 (4 * param
+ chan
) * 4,
1271 ac_glc
| ac_slc
| ac_swizzled
);
1275 if (ctx
->screen
->info
.chip_class
>= GFX9
)
1276 si_set_es_return_value_for_gs(ctx
);
1279 static LLVMValueRef
si_get_gs_wave_id(struct si_shader_context
*ctx
)
1281 if (ctx
->screen
->info
.chip_class
>= GFX9
)
1282 return si_unpack_param(ctx
, ctx
->merged_wave_info
, 16, 8);
1284 return ac_get_arg(&ctx
->ac
, ctx
->gs_wave_id
);
1287 static void emit_gs_epilogue(struct si_shader_context
*ctx
)
1289 if (ctx
->shader
->key
.as_ngg
) {
1290 gfx10_ngg_gs_emit_epilogue(ctx
);
1294 if (ctx
->screen
->info
.chip_class
>= GFX10
)
1295 LLVMBuildFence(ctx
->ac
.builder
, LLVMAtomicOrderingRelease
, false, "");
1297 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_OP_NOP
| AC_SENDMSG_GS_DONE
,
1298 si_get_gs_wave_id(ctx
));
1300 if (ctx
->screen
->info
.chip_class
>= GFX9
)
1301 ac_build_endif(&ctx
->ac
, ctx
->merged_wrap_if_label
);
1304 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi
*abi
,
1305 unsigned max_outputs
,
1306 LLVMValueRef
*addrs
)
1308 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
1309 struct si_shader_info UNUSED
*info
= &ctx
->shader
->selector
->info
;
1311 assert(info
->num_outputs
<= max_outputs
);
1313 emit_gs_epilogue(ctx
);
1316 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi
*abi
,
1317 unsigned max_outputs
,
1318 LLVMValueRef
*addrs
)
1320 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
1321 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
1322 struct si_shader_output_values
*outputs
= NULL
;
1325 assert(!ctx
->shader
->is_gs_copy_shader
);
1326 assert(info
->num_outputs
<= max_outputs
);
1328 outputs
= MALLOC((info
->num_outputs
+ 1) * sizeof(outputs
[0]));
1330 for (i
= 0; i
< info
->num_outputs
; i
++) {
1331 outputs
[i
].semantic_name
= info
->output_semantic_name
[i
];
1332 outputs
[i
].semantic_index
= info
->output_semantic_index
[i
];
1334 for (j
= 0; j
< 4; j
++) {
1335 outputs
[i
].values
[j
] =
1336 LLVMBuildLoad(ctx
->ac
.builder
,
1339 outputs
[i
].vertex_stream
[j
] =
1340 (info
->output_streams
[i
] >> (2 * j
)) & 3;
1344 if (!ctx
->screen
->use_ngg_streamout
&&
1345 ctx
->shader
->selector
->so
.num_outputs
)
1346 si_llvm_emit_streamout(ctx
, outputs
, i
, 0);
1348 /* Export PrimitiveID. */
1349 if (ctx
->shader
->key
.mono
.u
.vs_export_prim_id
) {
1350 outputs
[i
].semantic_name
= TGSI_SEMANTIC_PRIMID
;
1351 outputs
[i
].semantic_index
= 0;
1352 outputs
[i
].values
[0] = ac_to_float(&ctx
->ac
, si_get_primitive_id(ctx
, 0));
1353 for (j
= 1; j
< 4; j
++)
1354 outputs
[i
].values
[j
] = LLVMConstReal(ctx
->f32
, 0);
1356 memset(outputs
[i
].vertex_stream
, 0,
1357 sizeof(outputs
[i
].vertex_stream
));
1361 si_llvm_export_vs(ctx
, outputs
, i
);
1365 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi
*abi
,
1366 unsigned max_outputs
,
1367 LLVMValueRef
*addrs
)
1369 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
1370 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
1371 LLVMValueRef pos
[4] = {};
1373 assert(info
->num_outputs
<= max_outputs
);
1375 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
1376 if (info
->output_semantic_name
[i
] != TGSI_SEMANTIC_POSITION
)
1379 for (unsigned chan
= 0; chan
< 4; chan
++)
1380 pos
[chan
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
1383 assert(pos
[0] != NULL
);
1385 /* Return the position output. */
1386 LLVMValueRef ret
= ctx
->return_value
;
1387 for (unsigned chan
= 0; chan
< 4; chan
++)
1388 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, pos
[chan
], chan
, "");
1389 ctx
->return_value
= ret
;
1392 /* Emit one vertex from the geometry shader */
1393 static void si_llvm_emit_vertex(struct ac_shader_abi
*abi
,
1395 LLVMValueRef
*addrs
)
1397 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
1399 if (ctx
->shader
->key
.as_ngg
) {
1400 gfx10_ngg_gs_emit_vertex(ctx
, stream
, addrs
);
1404 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
1405 struct si_shader
*shader
= ctx
->shader
;
1406 LLVMValueRef soffset
= ac_get_arg(&ctx
->ac
, ctx
->gs2vs_offset
);
1407 LLVMValueRef gs_next_vertex
;
1408 LLVMValueRef can_emit
;
1409 unsigned chan
, offset
;
1412 /* Write vertex attribute values to GSVS ring */
1413 gs_next_vertex
= LLVMBuildLoad(ctx
->ac
.builder
,
1414 ctx
->gs_next_vertex
[stream
],
1417 /* If this thread has already emitted the declared maximum number of
1418 * vertices, skip the write: excessive vertex emissions are not
1419 * supposed to have any effect.
1421 * If the shader has no writes to memory, kill it instead. This skips
1422 * further memory loads and may allow LLVM to skip to the end
1425 can_emit
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULT
, gs_next_vertex
,
1426 LLVMConstInt(ctx
->i32
,
1427 shader
->selector
->gs_max_out_vertices
, 0), "");
1429 bool use_kill
= !info
->writes_memory
;
1431 ac_build_kill_if_false(&ctx
->ac
, can_emit
);
1433 ac_build_ifcc(&ctx
->ac
, can_emit
, 6505);
1437 for (i
= 0; i
< info
->num_outputs
; i
++) {
1438 for (chan
= 0; chan
< 4; chan
++) {
1439 if (!(info
->output_usagemask
[i
] & (1 << chan
)) ||
1440 ((info
->output_streams
[i
] >> (2 * chan
)) & 3) != stream
)
1443 LLVMValueRef out_val
= LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
1444 LLVMValueRef voffset
=
1445 LLVMConstInt(ctx
->i32
, offset
*
1446 shader
->selector
->gs_max_out_vertices
, 0);
1449 voffset
= LLVMBuildAdd(ctx
->ac
.builder
, voffset
, gs_next_vertex
, "");
1450 voffset
= LLVMBuildMul(ctx
->ac
.builder
, voffset
,
1451 LLVMConstInt(ctx
->i32
, 4, 0), "");
1453 out_val
= ac_to_integer(&ctx
->ac
, out_val
);
1455 ac_build_buffer_store_dword(&ctx
->ac
,
1456 ctx
->gsvs_ring
[stream
],
1458 voffset
, soffset
, 0,
1459 ac_glc
| ac_slc
| ac_swizzled
);
1463 gs_next_vertex
= LLVMBuildAdd(ctx
->ac
.builder
, gs_next_vertex
, ctx
->i32_1
, "");
1464 LLVMBuildStore(ctx
->ac
.builder
, gs_next_vertex
, ctx
->gs_next_vertex
[stream
]);
1466 /* Signal vertex emission if vertex data was written. */
1468 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_OP_EMIT
| AC_SENDMSG_GS
| (stream
<< 8),
1469 si_get_gs_wave_id(ctx
));
1473 ac_build_endif(&ctx
->ac
, 6505);
1476 /* Cut one primitive from the geometry shader */
1477 static void si_llvm_emit_primitive(struct ac_shader_abi
*abi
,
1480 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
1482 if (ctx
->shader
->key
.as_ngg
) {
1483 LLVMBuildStore(ctx
->ac
.builder
, ctx
->ac
.i32_0
, ctx
->gs_curprim_verts
[stream
]);
1487 /* Signal primitive cut */
1488 ac_build_sendmsg(&ctx
->ac
, AC_SENDMSG_GS_OP_CUT
| AC_SENDMSG_GS
| (stream
<< 8),
1489 si_get_gs_wave_id(ctx
));
1492 static void declare_streamout_params(struct si_shader_context
*ctx
,
1493 struct pipe_stream_output_info
*so
)
1495 if (ctx
->screen
->use_ngg_streamout
) {
1496 if (ctx
->type
== PIPE_SHADER_TESS_EVAL
)
1497 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
);
1501 /* Streamout SGPRs. */
1502 if (so
->num_outputs
) {
1503 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->streamout_config
);
1504 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->streamout_write_index
);
1505 } else if (ctx
->type
== PIPE_SHADER_TESS_EVAL
) {
1506 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
);
1509 /* A streamout buffer offset is loaded if the stride is non-zero. */
1510 for (int i
= 0; i
< 4; i
++) {
1514 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->streamout_offset
[i
]);
1518 static unsigned si_get_max_workgroup_size(const struct si_shader
*shader
)
1520 switch (shader
->selector
->type
) {
1521 case PIPE_SHADER_VERTEX
:
1522 case PIPE_SHADER_TESS_EVAL
:
1523 return shader
->key
.as_ngg
? 128 : 0;
1525 case PIPE_SHADER_TESS_CTRL
:
1526 /* Return this so that LLVM doesn't remove s_barrier
1527 * instructions on chips where we use s_barrier. */
1528 return shader
->selector
->screen
->info
.chip_class
>= GFX7
? 128 : 0;
1530 case PIPE_SHADER_GEOMETRY
:
1531 return shader
->selector
->screen
->info
.chip_class
>= GFX9
? 128 : 0;
1533 case PIPE_SHADER_COMPUTE
:
1534 break; /* see below */
1540 const unsigned *properties
= shader
->selector
->info
.properties
;
1541 unsigned max_work_group_size
=
1542 properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH
] *
1543 properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT
] *
1544 properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH
];
1546 if (!max_work_group_size
) {
1547 /* This is a variable group size compute shader,
1548 * compile it for the maximum possible group size.
1550 max_work_group_size
= SI_MAX_VARIABLE_THREADS_PER_BLOCK
;
1552 return max_work_group_size
;
1555 static void declare_const_and_shader_buffers(struct si_shader_context
*ctx
,
1558 enum ac_arg_type const_shader_buf_type
;
1560 if (ctx
->shader
->selector
->info
.const_buffers_declared
== 1 &&
1561 ctx
->shader
->selector
->info
.shader_buffers_declared
== 0)
1562 const_shader_buf_type
= AC_ARG_CONST_FLOAT_PTR
;
1564 const_shader_buf_type
= AC_ARG_CONST_DESC_PTR
;
1566 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, const_shader_buf_type
,
1567 assign_params
? &ctx
->const_and_shader_buffers
:
1568 &ctx
->other_const_and_shader_buffers
);
1571 static void declare_samplers_and_images(struct si_shader_context
*ctx
,
1574 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_CONST_IMAGE_PTR
,
1575 assign_params
? &ctx
->samplers_and_images
:
1576 &ctx
->other_samplers_and_images
);
/* Declare both per-stage descriptor pointers (buffers, then samplers/images). */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
					    bool assign_params)
{
	declare_const_and_shader_buffers(ctx, assign_params);
	declare_samplers_and_images(ctx, assign_params);
}
1586 static void declare_global_desc_pointers(struct si_shader_context
*ctx
)
1588 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_CONST_DESC_PTR
,
1590 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_CONST_IMAGE_PTR
,
1591 &ctx
->bindless_samplers_and_images
);
1594 static void declare_vs_specific_input_sgprs(struct si_shader_context
*ctx
)
1596 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->vs_state_bits
);
1597 if (!ctx
->shader
->is_gs_copy_shader
) {
1598 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->args
.base_vertex
);
1599 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->args
.start_instance
);
1600 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->args
.draw_id
);
1604 static void declare_vb_descriptor_input_sgprs(struct si_shader_context
*ctx
)
1606 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_CONST_DESC_PTR
, &ctx
->vertex_buffers
);
1608 unsigned num_vbos_in_user_sgprs
= ctx
->shader
->selector
->num_vbos_in_user_sgprs
;
1609 if (num_vbos_in_user_sgprs
) {
1610 unsigned user_sgprs
= ctx
->args
.num_sgprs_used
;
1612 if (si_is_merged_shader(ctx
))
1614 assert(user_sgprs
<= SI_SGPR_VS_VB_DESCRIPTOR_FIRST
);
1616 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
1617 for (unsigned i
= user_sgprs
; i
< SI_SGPR_VS_VB_DESCRIPTOR_FIRST
; i
++)
1618 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
); /* unused */
1620 assert(num_vbos_in_user_sgprs
<= ARRAY_SIZE(ctx
->vb_descriptors
));
1621 for (unsigned i
= 0; i
< num_vbos_in_user_sgprs
; i
++)
1622 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 4, AC_ARG_INT
, &ctx
->vb_descriptors
[i
]);
1626 static void declare_vs_input_vgprs(struct si_shader_context
*ctx
,
1627 unsigned *num_prolog_vgprs
)
1629 struct si_shader
*shader
= ctx
->shader
;
1631 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.vertex_id
);
1632 if (shader
->key
.as_ls
) {
1633 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->rel_auto_id
);
1634 if (ctx
->screen
->info
.chip_class
>= GFX10
) {
1635 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, NULL
); /* user VGPR */
1636 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.instance_id
);
1638 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.instance_id
);
1639 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, NULL
); /* unused */
1641 } else if (ctx
->screen
->info
.chip_class
>= GFX10
) {
1642 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, NULL
); /* user VGPR */
1643 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
,
1644 &ctx
->vs_prim_id
); /* user vgpr or PrimID (legacy) */
1645 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.instance_id
);
1647 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.instance_id
);
1648 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->vs_prim_id
);
1649 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, NULL
); /* unused */
1652 if (!shader
->is_gs_copy_shader
) {
1653 /* Vertex load indices. */
1654 if (shader
->selector
->info
.num_inputs
) {
1655 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
,
1656 &ctx
->vertex_index0
);
1657 for (unsigned i
= 1; i
< shader
->selector
->info
.num_inputs
; i
++)
1658 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, NULL
);
1660 *num_prolog_vgprs
+= shader
->selector
->info
.num_inputs
;
1664 static void declare_vs_blit_inputs(struct si_shader_context
*ctx
,
1665 unsigned vs_blit_property
)
1667 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
,
1668 &ctx
->vs_blit_inputs
); /* i16 x1, y1 */
1669 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
); /* i16 x1, y1 */
1670 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* depth */
1672 if (vs_blit_property
== SI_VS_BLIT_SGPRS_POS_COLOR
) {
1673 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* color0 */
1674 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* color1 */
1675 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* color2 */
1676 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* color3 */
1677 } else if (vs_blit_property
== SI_VS_BLIT_SGPRS_POS_TEXCOORD
) {
1678 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* texcoord.x1 */
1679 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* texcoord.y1 */
1680 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* texcoord.x2 */
1681 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* texcoord.y2 */
1682 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* texcoord.z */
1683 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_FLOAT
, NULL
); /* texcoord.w */
1687 static void declare_tes_input_vgprs(struct si_shader_context
*ctx
)
1689 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
, &ctx
->tes_u
);
1690 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
, &ctx
->tes_v
);
1691 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->tes_rel_patch_id
);
1692 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.tes_patch_id
);
1696 /* Convenient merged shader definitions. */
1697 SI_SHADER_MERGED_VERTEX_TESSCTRL
= PIPE_SHADER_TYPES
,
1698 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY
,
1701 void si_add_arg_checked(struct ac_shader_args
*args
,
1702 enum ac_arg_regfile file
,
1703 unsigned registers
, enum ac_arg_type type
,
1707 assert(args
->arg_count
== idx
);
1708 ac_add_arg(args
, file
, registers
, type
, arg
);
1711 static void create_function(struct si_shader_context
*ctx
)
1713 struct si_shader
*shader
= ctx
->shader
;
1714 LLVMTypeRef returns
[AC_MAX_ARGS
];
1715 unsigned i
, num_return_sgprs
;
1716 unsigned num_returns
= 0;
1717 unsigned num_prolog_vgprs
= 0;
1718 unsigned type
= ctx
->type
;
1719 unsigned vs_blit_property
=
1720 shader
->selector
->info
.properties
[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD
];
1722 memset(&ctx
->args
, 0, sizeof(ctx
->args
));
1724 /* Set MERGED shaders. */
1725 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
1726 if (shader
->key
.as_ls
|| type
== PIPE_SHADER_TESS_CTRL
)
1727 type
= SI_SHADER_MERGED_VERTEX_TESSCTRL
; /* LS or HS */
1728 else if (shader
->key
.as_es
|| shader
->key
.as_ngg
|| type
== PIPE_SHADER_GEOMETRY
)
1729 type
= SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY
;
1733 case PIPE_SHADER_VERTEX
:
1734 declare_global_desc_pointers(ctx
);
1736 if (vs_blit_property
) {
1737 declare_vs_blit_inputs(ctx
, vs_blit_property
);
1740 declare_vs_input_vgprs(ctx
, &num_prolog_vgprs
);
1744 declare_per_stage_desc_pointers(ctx
, true);
1745 declare_vs_specific_input_sgprs(ctx
);
1746 if (!shader
->is_gs_copy_shader
)
1747 declare_vb_descriptor_input_sgprs(ctx
);
1749 if (shader
->key
.as_es
) {
1750 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
,
1751 &ctx
->es2gs_offset
);
1752 } else if (shader
->key
.as_ls
) {
1753 /* no extra parameters */
1755 /* The locations of the other parameters are assigned dynamically. */
1756 declare_streamout_params(ctx
, &shader
->selector
->so
);
1760 declare_vs_input_vgprs(ctx
, &num_prolog_vgprs
);
1763 if (shader
->key
.opt
.vs_as_prim_discard_cs
) {
1764 for (i
= 0; i
< 4; i
++)
1765 returns
[num_returns
++] = ctx
->f32
; /* VGPRs */
1769 case PIPE_SHADER_TESS_CTRL
: /* GFX6-GFX8 */
1770 declare_global_desc_pointers(ctx
);
1771 declare_per_stage_desc_pointers(ctx
, true);
1772 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_layout
);
1773 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_out_lds_offsets
);
1774 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_out_lds_layout
);
1775 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->vs_state_bits
);
1776 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_offset
);
1777 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_factor_offset
);
1780 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.tcs_patch_id
);
1781 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.tcs_rel_ids
);
1783 /* param_tcs_offchip_offset and param_tcs_factor_offset are
1784 * placed after the user SGPRs.
1786 for (i
= 0; i
< GFX6_TCS_NUM_USER_SGPR
+ 2; i
++)
1787 returns
[num_returns
++] = ctx
->i32
; /* SGPRs */
1788 for (i
= 0; i
< 11; i
++)
1789 returns
[num_returns
++] = ctx
->f32
; /* VGPRs */
1792 case SI_SHADER_MERGED_VERTEX_TESSCTRL
:
1793 /* Merged stages have 8 system SGPRs at the beginning. */
1794 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
1795 declare_per_stage_desc_pointers(ctx
,
1796 ctx
->type
== PIPE_SHADER_TESS_CTRL
);
1797 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_offset
);
1798 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->merged_wave_info
);
1799 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_factor_offset
);
1800 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->merged_scratch_offset
);
1801 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
); /* unused */
1802 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
); /* unused */
1804 declare_global_desc_pointers(ctx
);
1805 declare_per_stage_desc_pointers(ctx
,
1806 ctx
->type
== PIPE_SHADER_VERTEX
);
1807 declare_vs_specific_input_sgprs(ctx
);
1809 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_layout
);
1810 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_out_lds_offsets
);
1811 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_out_lds_layout
);
1812 declare_vb_descriptor_input_sgprs(ctx
);
1814 /* VGPRs (first TCS, then VS) */
1815 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.tcs_patch_id
);
1816 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.tcs_rel_ids
);
1818 if (ctx
->type
== PIPE_SHADER_VERTEX
) {
1819 declare_vs_input_vgprs(ctx
, &num_prolog_vgprs
);
1821 /* LS return values are inputs to the TCS main shader part. */
1822 for (i
= 0; i
< 8 + GFX9_TCS_NUM_USER_SGPR
; i
++)
1823 returns
[num_returns
++] = ctx
->i32
; /* SGPRs */
1824 for (i
= 0; i
< 2; i
++)
1825 returns
[num_returns
++] = ctx
->f32
; /* VGPRs */
1827 /* TCS return values are inputs to the TCS epilog.
1829 * param_tcs_offchip_offset, param_tcs_factor_offset,
1830 * param_tcs_offchip_layout, and param_rw_buffers
1831 * should be passed to the epilog.
1833 for (i
= 0; i
<= 8 + GFX9_SGPR_TCS_OUT_LAYOUT
; i
++)
1834 returns
[num_returns
++] = ctx
->i32
; /* SGPRs */
1835 for (i
= 0; i
< 11; i
++)
1836 returns
[num_returns
++] = ctx
->f32
; /* VGPRs */
1840 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY
:
1841 /* Merged stages have 8 system SGPRs at the beginning. */
1842 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
1843 declare_per_stage_desc_pointers(ctx
,
1844 ctx
->type
== PIPE_SHADER_GEOMETRY
);
1846 if (ctx
->shader
->key
.as_ngg
)
1847 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->gs_tg_info
);
1849 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->gs2vs_offset
);
1851 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->merged_wave_info
);
1852 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_offset
);
1853 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->merged_scratch_offset
);
1854 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
1855 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
1857 declare_global_desc_pointers(ctx
);
1858 if (ctx
->type
!= PIPE_SHADER_VERTEX
|| !vs_blit_property
) {
1859 declare_per_stage_desc_pointers(ctx
,
1860 (ctx
->type
== PIPE_SHADER_VERTEX
||
1861 ctx
->type
== PIPE_SHADER_TESS_EVAL
));
1864 if (ctx
->type
== PIPE_SHADER_VERTEX
) {
1865 if (vs_blit_property
)
1866 declare_vs_blit_inputs(ctx
, vs_blit_property
);
1868 declare_vs_specific_input_sgprs(ctx
);
1870 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->vs_state_bits
);
1871 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_layout
);
1872 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tes_offchip_addr
);
1873 /* Declare as many input SGPRs as the VS has. */
1876 if (ctx
->type
== PIPE_SHADER_VERTEX
)
1877 declare_vb_descriptor_input_sgprs(ctx
);
1879 /* VGPRs (first GS, then VS/TES) */
1880 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx01_offset
);
1881 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx23_offset
);
1882 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.gs_prim_id
);
1883 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.gs_invocation_id
);
1884 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx45_offset
);
1886 if (ctx
->type
== PIPE_SHADER_VERTEX
) {
1887 declare_vs_input_vgprs(ctx
, &num_prolog_vgprs
);
1888 } else if (ctx
->type
== PIPE_SHADER_TESS_EVAL
) {
1889 declare_tes_input_vgprs(ctx
);
1892 if (ctx
->shader
->key
.as_es
&&
1893 (ctx
->type
== PIPE_SHADER_VERTEX
||
1894 ctx
->type
== PIPE_SHADER_TESS_EVAL
)) {
1895 unsigned num_user_sgprs
;
1897 if (ctx
->type
== PIPE_SHADER_VERTEX
)
1898 num_user_sgprs
= GFX9_VSGS_NUM_USER_SGPR
;
1900 num_user_sgprs
= GFX9_TESGS_NUM_USER_SGPR
;
1902 /* ES return values are inputs to GS. */
1903 for (i
= 0; i
< 8 + num_user_sgprs
; i
++)
1904 returns
[num_returns
++] = ctx
->i32
; /* SGPRs */
1905 for (i
= 0; i
< 5; i
++)
1906 returns
[num_returns
++] = ctx
->f32
; /* VGPRs */
1910 case PIPE_SHADER_TESS_EVAL
:
1911 declare_global_desc_pointers(ctx
);
1912 declare_per_stage_desc_pointers(ctx
, true);
1913 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->vs_state_bits
);
1914 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_layout
);
1915 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tes_offchip_addr
);
1917 if (shader
->key
.as_es
) {
1918 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_offset
);
1919 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
);
1920 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->es2gs_offset
);
1922 declare_streamout_params(ctx
, &shader
->selector
->so
);
1923 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->tcs_offchip_offset
);
1927 declare_tes_input_vgprs(ctx
);
1930 case PIPE_SHADER_GEOMETRY
:
1931 declare_global_desc_pointers(ctx
);
1932 declare_per_stage_desc_pointers(ctx
, true);
1933 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->gs2vs_offset
);
1934 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->gs_wave_id
);
1937 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx_offset
[0]);
1938 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx_offset
[1]);
1939 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.gs_prim_id
);
1940 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx_offset
[2]);
1941 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx_offset
[3]);
1942 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx_offset
[4]);
1943 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->gs_vtx_offset
[5]);
1944 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &ctx
->args
.gs_invocation_id
);
1947 case PIPE_SHADER_FRAGMENT
:
1948 declare_global_desc_pointers(ctx
);
1949 declare_per_stage_desc_pointers(ctx
, true);
1950 si_add_arg_checked(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, NULL
,
1951 SI_PARAM_ALPHA_REF
);
1952 si_add_arg_checked(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
,
1953 &ctx
->args
.prim_mask
, SI_PARAM_PRIM_MASK
);
1955 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 2, AC_ARG_INT
, &ctx
->args
.persp_sample
,
1956 SI_PARAM_PERSP_SAMPLE
);
1957 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 2, AC_ARG_INT
,
1958 &ctx
->args
.persp_center
, SI_PARAM_PERSP_CENTER
);
1959 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 2, AC_ARG_INT
,
1960 &ctx
->args
.persp_centroid
, SI_PARAM_PERSP_CENTROID
);
1961 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 3, AC_ARG_INT
,
1962 NULL
, SI_PARAM_PERSP_PULL_MODEL
);
1963 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 2, AC_ARG_INT
,
1964 &ctx
->args
.linear_sample
, SI_PARAM_LINEAR_SAMPLE
);
1965 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 2, AC_ARG_INT
,
1966 &ctx
->args
.linear_center
, SI_PARAM_LINEAR_CENTER
);
1967 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 2, AC_ARG_INT
,
1968 &ctx
->args
.linear_centroid
, SI_PARAM_LINEAR_CENTROID
);
1969 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 3, AC_ARG_FLOAT
,
1970 NULL
, SI_PARAM_LINE_STIPPLE_TEX
);
1971 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
,
1972 &ctx
->args
.frag_pos
[0], SI_PARAM_POS_X_FLOAT
);
1973 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
,
1974 &ctx
->args
.frag_pos
[1], SI_PARAM_POS_Y_FLOAT
);
1975 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
,
1976 &ctx
->args
.frag_pos
[2], SI_PARAM_POS_Z_FLOAT
);
1977 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
,
1978 &ctx
->args
.frag_pos
[3], SI_PARAM_POS_W_FLOAT
);
1979 shader
->info
.face_vgpr_index
= ctx
->args
.num_vgprs_used
;
1980 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
,
1981 &ctx
->args
.front_face
, SI_PARAM_FRONT_FACE
);
1982 shader
->info
.ancillary_vgpr_index
= ctx
->args
.num_vgprs_used
;
1983 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
,
1984 &ctx
->args
.ancillary
, SI_PARAM_ANCILLARY
);
1985 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
,
1986 &ctx
->args
.sample_coverage
, SI_PARAM_SAMPLE_COVERAGE
);
1987 si_add_arg_checked(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
,
1988 &ctx
->pos_fixed_pt
, SI_PARAM_POS_FIXED_PT
);
1990 /* Color inputs from the prolog. */
1991 if (shader
->selector
->info
.colors_read
) {
1992 unsigned num_color_elements
=
1993 util_bitcount(shader
->selector
->info
.colors_read
);
1995 for (i
= 0; i
< num_color_elements
; i
++)
1996 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_FLOAT
, NULL
);
1998 num_prolog_vgprs
+= num_color_elements
;
2001 /* Outputs for the epilog. */
2002 num_return_sgprs
= SI_SGPR_ALPHA_REF
+ 1;
2005 util_bitcount(shader
->selector
->info
.colors_written
) * 4 +
2006 shader
->selector
->info
.writes_z
+
2007 shader
->selector
->info
.writes_stencil
+
2008 shader
->selector
->info
.writes_samplemask
+
2009 1 /* SampleMaskIn */;
2011 num_returns
= MAX2(num_returns
,
2013 PS_EPILOG_SAMPLEMASK_MIN_LOC
+ 1);
2015 for (i
= 0; i
< num_return_sgprs
; i
++)
2016 returns
[i
] = ctx
->i32
;
2017 for (; i
< num_returns
; i
++)
2018 returns
[i
] = ctx
->f32
;
2021 case PIPE_SHADER_COMPUTE
:
2022 declare_global_desc_pointers(ctx
);
2023 declare_per_stage_desc_pointers(ctx
, true);
2024 if (shader
->selector
->info
.uses_grid_size
)
2025 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 3, AC_ARG_INT
,
2026 &ctx
->args
.num_work_groups
);
2027 if (shader
->selector
->info
.uses_block_size
&&
2028 shader
->selector
->info
.properties
[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH
] == 0)
2029 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 3, AC_ARG_INT
, &ctx
->block_size
);
2031 unsigned cs_user_data_dwords
=
2032 shader
->selector
->info
.properties
[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD
];
2033 if (cs_user_data_dwords
) {
2034 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, cs_user_data_dwords
, AC_ARG_INT
,
2035 &ctx
->cs_user_data
);
2038 /* Hardware SGPRs. */
2039 for (i
= 0; i
< 3; i
++) {
2040 if (shader
->selector
->info
.uses_block_id
[i
]) {
2041 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
,
2042 &ctx
->args
.workgroup_ids
[i
]);
2045 if (shader
->selector
->info
.uses_subgroup_info
)
2046 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &ctx
->args
.tg_size
);
2048 /* Hardware VGPRs. */
2049 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 3, AC_ARG_INT
,
2050 &ctx
->args
.local_invocation_ids
);
2053 assert(0 && "unimplemented shader");
2057 si_llvm_create_func(ctx
, "main", returns
, num_returns
,
2058 si_get_max_workgroup_size(shader
));
2060 /* Reserve register locations for VGPR inputs the PS prolog may need. */
2061 if (ctx
->type
== PIPE_SHADER_FRAGMENT
&& !ctx
->shader
->is_monolithic
) {
2062 ac_llvm_add_target_dep_function_attr(ctx
->main_fn
,
2063 "InitialPSInputAddr",
2064 S_0286D0_PERSP_SAMPLE_ENA(1) |
2065 S_0286D0_PERSP_CENTER_ENA(1) |
2066 S_0286D0_PERSP_CENTROID_ENA(1) |
2067 S_0286D0_LINEAR_SAMPLE_ENA(1) |
2068 S_0286D0_LINEAR_CENTER_ENA(1) |
2069 S_0286D0_LINEAR_CENTROID_ENA(1) |
2070 S_0286D0_FRONT_FACE_ENA(1) |
2071 S_0286D0_ANCILLARY_ENA(1) |
2072 S_0286D0_POS_FIXED_PT_ENA(1));
2075 shader
->info
.num_input_sgprs
= ctx
->args
.num_sgprs_used
;
2076 shader
->info
.num_input_vgprs
= ctx
->args
.num_vgprs_used
;
2078 assert(shader
->info
.num_input_vgprs
>= num_prolog_vgprs
);
2079 shader
->info
.num_input_vgprs
-= num_prolog_vgprs
;
2081 if (shader
->key
.as_ls
|| ctx
->type
== PIPE_SHADER_TESS_CTRL
) {
2082 if (USE_LDS_SYMBOLS
&& LLVM_VERSION_MAJOR
>= 9) {
2083 /* The LSHS size is not known until draw time, so we append it
2084 * at the end of whatever LDS use there may be in the rest of
2085 * the shader (currently none, unless LLVM decides to do its
2086 * own LDS-based lowering).
2088 ctx
->ac
.lds
= LLVMAddGlobalInAddressSpace(
2089 ctx
->ac
.module
, LLVMArrayType(ctx
->i32
, 0),
2090 "__lds_end", AC_ADDR_SPACE_LDS
);
2091 LLVMSetAlignment(ctx
->ac
.lds
, 256);
2093 ac_declare_lds_as_pointer(&ctx
->ac
);
2097 /* Unlike radv, we override these arguments in the prolog, so to the
2098 * API shader they appear as normal arguments.
2100 if (ctx
->type
== PIPE_SHADER_VERTEX
) {
2101 ctx
->abi
.vertex_id
= ac_get_arg(&ctx
->ac
, ctx
->args
.vertex_id
);
2102 ctx
->abi
.instance_id
= ac_get_arg(&ctx
->ac
, ctx
->args
.instance_id
);
2103 } else if (ctx
->type
== PIPE_SHADER_FRAGMENT
) {
2104 ctx
->abi
.persp_centroid
= ac_get_arg(&ctx
->ac
, ctx
->args
.persp_centroid
);
2105 ctx
->abi
.linear_centroid
= ac_get_arg(&ctx
->ac
, ctx
->args
.linear_centroid
);
2109 /* Ensure that the esgs ring is declared.
2111 * We declare it with 64KB alignment as a hint that the
2112 * pointer value will always be 0.
2114 static void declare_esgs_ring(struct si_shader_context
*ctx
)
2119 assert(!LLVMGetNamedGlobal(ctx
->ac
.module
, "esgs_ring"));
2121 ctx
->esgs_ring
= LLVMAddGlobalInAddressSpace(
2122 ctx
->ac
.module
, LLVMArrayType(ctx
->i32
, 0),
2125 LLVMSetLinkage(ctx
->esgs_ring
, LLVMExternalLinkage
);
2126 LLVMSetAlignment(ctx
->esgs_ring
, 64 * 1024);
2130 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
2133 static void preload_ring_buffers(struct si_shader_context
*ctx
)
2135 LLVMBuilderRef builder
= ctx
->ac
.builder
;
2137 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
2139 if (ctx
->shader
->key
.as_es
|| ctx
->type
== PIPE_SHADER_GEOMETRY
) {
2140 if (ctx
->screen
->info
.chip_class
<= GFX8
) {
2142 ctx
->type
== PIPE_SHADER_GEOMETRY
? SI_GS_RING_ESGS
2144 LLVMValueRef offset
= LLVMConstInt(ctx
->i32
, ring
, 0);
2147 ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
2149 if (USE_LDS_SYMBOLS
&& LLVM_VERSION_MAJOR
>= 9) {
2150 /* Declare the ESGS ring as an explicit LDS symbol. */
2151 declare_esgs_ring(ctx
);
2153 ac_declare_lds_as_pointer(&ctx
->ac
);
2154 ctx
->esgs_ring
= ctx
->ac
.lds
;
2159 if (ctx
->shader
->is_gs_copy_shader
) {
2160 LLVMValueRef offset
= LLVMConstInt(ctx
->i32
, SI_RING_GSVS
, 0);
2163 ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
2164 } else if (ctx
->type
== PIPE_SHADER_GEOMETRY
) {
2165 const struct si_shader_selector
*sel
= ctx
->shader
->selector
;
2166 LLVMValueRef offset
= LLVMConstInt(ctx
->i32
, SI_RING_GSVS
, 0);
2167 LLVMValueRef base_ring
;
2169 base_ring
= ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
2171 /* The conceptual layout of the GSVS ring is
2172 * v0c0 .. vLv0 v0c1 .. vLc1 ..
2173 * but the real memory layout is swizzled across
2175 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
2177 * Override the buffer descriptor accordingly.
2179 LLVMTypeRef v2i64
= LLVMVectorType(ctx
->i64
, 2);
2180 uint64_t stream_offset
= 0;
2182 for (unsigned stream
= 0; stream
< 4; ++stream
) {
2183 unsigned num_components
;
2185 unsigned num_records
;
2186 LLVMValueRef ring
, tmp
;
2188 num_components
= sel
->info
.num_stream_output_components
[stream
];
2189 if (!num_components
)
2192 stride
= 4 * num_components
* sel
->gs_max_out_vertices
;
2194 /* Limit on the stride field for <= GFX7. */
2195 assert(stride
< (1 << 14));
2197 num_records
= ctx
->ac
.wave_size
;
2199 ring
= LLVMBuildBitCast(builder
, base_ring
, v2i64
, "");
2200 tmp
= LLVMBuildExtractElement(builder
, ring
, ctx
->i32_0
, "");
2201 tmp
= LLVMBuildAdd(builder
, tmp
,
2202 LLVMConstInt(ctx
->i64
,
2203 stream_offset
, 0), "");
2204 stream_offset
+= stride
* ctx
->ac
.wave_size
;
2206 ring
= LLVMBuildInsertElement(builder
, ring
, tmp
, ctx
->i32_0
, "");
2207 ring
= LLVMBuildBitCast(builder
, ring
, ctx
->v4i32
, "");
2208 tmp
= LLVMBuildExtractElement(builder
, ring
, ctx
->i32_1
, "");
2209 tmp
= LLVMBuildOr(builder
, tmp
,
2210 LLVMConstInt(ctx
->i32
,
2211 S_008F04_STRIDE(stride
) |
2212 S_008F04_SWIZZLE_ENABLE(1), 0), "");
2213 ring
= LLVMBuildInsertElement(builder
, ring
, tmp
, ctx
->i32_1
, "");
2214 ring
= LLVMBuildInsertElement(builder
, ring
,
2215 LLVMConstInt(ctx
->i32
, num_records
, 0),
2216 LLVMConstInt(ctx
->i32
, 2, 0), "");
2219 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X
) |
2220 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y
) |
2221 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z
) |
2222 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W
) |
2223 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
2224 S_008F0C_ADD_TID_ENABLE(1);
2226 if (ctx
->ac
.chip_class
>= GFX10
) {
2227 rsrc3
|= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT
) |
2228 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_DISABLED
) |
2229 S_008F0C_RESOURCE_LEVEL(1);
2231 rsrc3
|= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT
) |
2232 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32
) |
2233 S_008F0C_ELEMENT_SIZE(1); /* element_size = 4 (bytes) */
2236 ring
= LLVMBuildInsertElement(builder
, ring
,
2237 LLVMConstInt(ctx
->i32
, rsrc3
, false),
2238 LLVMConstInt(ctx
->i32
, 3, 0), "");
2240 ctx
->gsvs_ring
[stream
] = ring
;
2242 } else if (ctx
->type
== PIPE_SHADER_TESS_EVAL
) {
2243 si_llvm_preload_tes_rings(ctx
);
2247 /* For the UMR disassembler. */
2248 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
2249 #define DEBUGGER_NUM_MARKERS 5
2251 static bool si_shader_binary_open(struct si_screen
*screen
,
2252 struct si_shader
*shader
,
2253 struct ac_rtld_binary
*rtld
)
2255 const struct si_shader_selector
*sel
= shader
->selector
;
2256 const char *part_elfs
[5];
2257 size_t part_sizes
[5];
2258 unsigned num_parts
= 0;
2260 #define add_part(shader_or_part) \
2261 if (shader_or_part) { \
2262 part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
2263 part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
2267 add_part(shader
->prolog
);
2268 add_part(shader
->previous_stage
);
2269 add_part(shader
->prolog2
);
2271 add_part(shader
->epilog
);
2275 struct ac_rtld_symbol lds_symbols
[2];
2276 unsigned num_lds_symbols
= 0;
2278 if (sel
&& screen
->info
.chip_class
>= GFX9
&& !shader
->is_gs_copy_shader
&&
2279 (sel
->type
== PIPE_SHADER_GEOMETRY
|| shader
->key
.as_ngg
)) {
2280 /* We add this symbol even on LLVM <= 8 to ensure that
2281 * shader->config.lds_size is set correctly below.
2283 struct ac_rtld_symbol
*sym
= &lds_symbols
[num_lds_symbols
++];
2284 sym
->name
= "esgs_ring";
2285 sym
->size
= shader
->gs_info
.esgs_ring_size
;
2286 sym
->align
= 64 * 1024;
2289 if (shader
->key
.as_ngg
&& sel
->type
== PIPE_SHADER_GEOMETRY
) {
2290 struct ac_rtld_symbol
*sym
= &lds_symbols
[num_lds_symbols
++];
2291 sym
->name
= "ngg_emit";
2292 sym
->size
= shader
->ngg
.ngg_emit_size
* 4;
2296 bool ok
= ac_rtld_open(rtld
, (struct ac_rtld_open_info
){
2297 .info
= &screen
->info
,
2299 .halt_at_entry
= screen
->options
.halt_shaders
,
2301 .shader_type
= tgsi_processor_to_shader_stage(sel
->type
),
2302 .wave_size
= si_get_shader_wave_size(shader
),
2303 .num_parts
= num_parts
,
2304 .elf_ptrs
= part_elfs
,
2305 .elf_sizes
= part_sizes
,
2306 .num_shared_lds_symbols
= num_lds_symbols
,
2307 .shared_lds_symbols
= lds_symbols
});
2309 if (rtld
->lds_size
> 0) {
2310 unsigned alloc_granularity
= screen
->info
.chip_class
>= GFX7
? 512 : 256;
2311 shader
->config
.lds_size
=
2312 align(rtld
->lds_size
, alloc_granularity
) / alloc_granularity
;
2318 static unsigned si_get_shader_binary_size(struct si_screen
*screen
, struct si_shader
*shader
)
2320 struct ac_rtld_binary rtld
;
2321 si_shader_binary_open(screen
, shader
, &rtld
);
2322 return rtld
.exec_size
;
2325 static bool si_get_external_symbol(void *data
, const char *name
, uint64_t *value
)
2327 uint64_t *scratch_va
= data
;
2329 if (!strcmp(scratch_rsrc_dword0_symbol
, name
)) {
2330 *value
= (uint32_t)*scratch_va
;
2333 if (!strcmp(scratch_rsrc_dword1_symbol
, name
)) {
2334 /* Enable scratch coalescing. */
2335 *value
= S_008F04_BASE_ADDRESS_HI(*scratch_va
>> 32) |
2336 S_008F04_SWIZZLE_ENABLE(1);
2343 bool si_shader_binary_upload(struct si_screen
*sscreen
, struct si_shader
*shader
,
2344 uint64_t scratch_va
)
2346 struct ac_rtld_binary binary
;
2347 if (!si_shader_binary_open(sscreen
, shader
, &binary
))
2350 si_resource_reference(&shader
->bo
, NULL
);
2351 shader
->bo
= si_aligned_buffer_create(&sscreen
->b
,
2352 sscreen
->info
.cpdma_prefetch_writes_memory
?
2353 0 : SI_RESOURCE_FLAG_READ_ONLY
,
2354 PIPE_USAGE_IMMUTABLE
,
2355 align(binary
.rx_size
, SI_CPDMA_ALIGNMENT
),
2361 struct ac_rtld_upload_info u
= {};
2363 u
.get_external_symbol
= si_get_external_symbol
;
2364 u
.cb_data
= &scratch_va
;
2365 u
.rx_va
= shader
->bo
->gpu_address
;
2366 u
.rx_ptr
= sscreen
->ws
->buffer_map(shader
->bo
->buf
, NULL
,
2367 PIPE_TRANSFER_READ_WRITE
|
2368 PIPE_TRANSFER_UNSYNCHRONIZED
|
2369 RADEON_TRANSFER_TEMPORARY
);
2373 bool ok
= ac_rtld_upload(&u
);
2375 sscreen
->ws
->buffer_unmap(shader
->bo
->buf
);
2376 ac_rtld_close(&binary
);
2381 static void si_shader_dump_disassembly(struct si_screen
*screen
,
2382 const struct si_shader_binary
*binary
,
2383 enum pipe_shader_type shader_type
,
2385 struct pipe_debug_callback
*debug
,
2386 const char *name
, FILE *file
)
2388 struct ac_rtld_binary rtld_binary
;
2390 if (!ac_rtld_open(&rtld_binary
, (struct ac_rtld_open_info
){
2391 .info
= &screen
->info
,
2392 .shader_type
= tgsi_processor_to_shader_stage(shader_type
),
2393 .wave_size
= wave_size
,
2395 .elf_ptrs
= &binary
->elf_buffer
,
2396 .elf_sizes
= &binary
->elf_size
}))
2402 if (!ac_rtld_get_section_by_name(&rtld_binary
, ".AMDGPU.disasm", &disasm
, &nbytes
))
2405 if (nbytes
> INT_MAX
)
2408 if (debug
&& debug
->debug_message
) {
2409 /* Very long debug messages are cut off, so send the
2410 * disassembly one line at a time. This causes more
2411 * overhead, but on the plus side it simplifies
2412 * parsing of resulting logs.
2414 pipe_debug_message(debug
, SHADER_INFO
,
2415 "Shader Disassembly Begin");
2418 while (line
< nbytes
) {
2419 int count
= nbytes
- line
;
2420 const char *nl
= memchr(disasm
+ line
, '\n', nbytes
- line
);
2422 count
= nl
- (disasm
+ line
);
2425 pipe_debug_message(debug
, SHADER_INFO
,
2426 "%.*s", count
, disasm
+ line
);
2432 pipe_debug_message(debug
, SHADER_INFO
,
2433 "Shader Disassembly End");
2437 fprintf(file
, "Shader %s disassembly:\n", name
);
2438 fprintf(file
, "%*s", (int)nbytes
, disasm
);
2442 ac_rtld_close(&rtld_binary
);
2445 static void si_calculate_max_simd_waves(struct si_shader
*shader
)
2447 struct si_screen
*sscreen
= shader
->selector
->screen
;
2448 struct ac_shader_config
*conf
= &shader
->config
;
2449 unsigned num_inputs
= shader
->selector
->info
.num_inputs
;
2450 unsigned lds_increment
= sscreen
->info
.chip_class
>= GFX7
? 512 : 256;
2451 unsigned lds_per_wave
= 0;
2452 unsigned max_simd_waves
;
2454 max_simd_waves
= sscreen
->info
.max_wave64_per_simd
;
2456 /* Compute LDS usage for PS. */
2457 switch (shader
->selector
->type
) {
2458 case PIPE_SHADER_FRAGMENT
:
2459 /* The minimum usage per wave is (num_inputs * 48). The maximum
2460 * usage is (num_inputs * 48 * 16).
2461 * We can get anything in between and it varies between waves.
2463 * The 48 bytes per input for a single primitive is equal to
2464 * 4 bytes/component * 4 components/input * 3 points.
2466 * Other stages don't know the size at compile time or don't
2467 * allocate LDS per wave, but instead they do it per thread group.
2469 lds_per_wave
= conf
->lds_size
* lds_increment
+
2470 align(num_inputs
* 48, lds_increment
);
2472 case PIPE_SHADER_COMPUTE
:
2473 if (shader
->selector
) {
2474 unsigned max_workgroup_size
=
2475 si_get_max_workgroup_size(shader
);
2476 lds_per_wave
= (conf
->lds_size
* lds_increment
) /
2477 DIV_ROUND_UP(max_workgroup_size
,
2478 sscreen
->compute_wave_size
);
2484 /* Compute the per-SIMD wave counts. */
2485 if (conf
->num_sgprs
) {
2487 MIN2(max_simd_waves
,
2488 sscreen
->info
.num_physical_sgprs_per_simd
/ conf
->num_sgprs
);
2491 if (conf
->num_vgprs
) {
2492 /* Always print wave limits as Wave64, so that we can compare
2493 * Wave32 and Wave64 with shader-db fairly. */
2494 unsigned max_vgprs
= sscreen
->info
.num_physical_wave64_vgprs_per_simd
;
2495 max_simd_waves
= MIN2(max_simd_waves
, max_vgprs
/ conf
->num_vgprs
);
2498 /* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
2499 * 16KB makes some SIMDs unoccupied).
2501 * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
2503 unsigned max_lds_size
= sscreen
->info
.chip_class
>= GFX10
? 128*1024 : 64*1024;
2504 unsigned max_lds_per_simd
= max_lds_size
/ 4;
2506 max_simd_waves
= MIN2(max_simd_waves
, max_lds_per_simd
/ lds_per_wave
);
2508 shader
->info
.max_simd_waves
= max_simd_waves
;
2511 void si_shader_dump_stats_for_shader_db(struct si_screen
*screen
,
2512 struct si_shader
*shader
,
2513 struct pipe_debug_callback
*debug
)
2515 const struct ac_shader_config
*conf
= &shader
->config
;
2517 if (screen
->options
.debug_disassembly
)
2518 si_shader_dump_disassembly(screen
, &shader
->binary
,
2519 shader
->selector
->type
,
2520 si_get_shader_wave_size(shader
),
2521 debug
, "main", NULL
);
2523 pipe_debug_message(debug
, SHADER_INFO
,
2524 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
2525 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
2526 "Spilled VGPRs: %d PrivMem VGPRs: %d",
2527 conf
->num_sgprs
, conf
->num_vgprs
,
2528 si_get_shader_binary_size(screen
, shader
),
2529 conf
->lds_size
, conf
->scratch_bytes_per_wave
,
2530 shader
->info
.max_simd_waves
, conf
->spilled_sgprs
,
2531 conf
->spilled_vgprs
, shader
->info
.private_mem_vgprs
);
2534 static void si_shader_dump_stats(struct si_screen
*sscreen
,
2535 struct si_shader
*shader
,
2537 bool check_debug_option
)
2539 const struct ac_shader_config
*conf
= &shader
->config
;
2541 if (!check_debug_option
||
2542 si_can_dump_shader(sscreen
, shader
->selector
->type
)) {
2543 if (shader
->selector
->type
== PIPE_SHADER_FRAGMENT
) {
2544 fprintf(file
, "*** SHADER CONFIG ***\n"
2545 "SPI_PS_INPUT_ADDR = 0x%04x\n"
2546 "SPI_PS_INPUT_ENA = 0x%04x\n",
2547 conf
->spi_ps_input_addr
, conf
->spi_ps_input_ena
);
2550 fprintf(file
, "*** SHADER STATS ***\n"
2553 "Spilled SGPRs: %d\n"
2554 "Spilled VGPRs: %d\n"
2555 "Private memory VGPRs: %d\n"
2556 "Code Size: %d bytes\n"
2558 "Scratch: %d bytes per wave\n"
2560 "********************\n\n\n",
2561 conf
->num_sgprs
, conf
->num_vgprs
,
2562 conf
->spilled_sgprs
, conf
->spilled_vgprs
,
2563 shader
->info
.private_mem_vgprs
,
2564 si_get_shader_binary_size(sscreen
, shader
),
2565 conf
->lds_size
, conf
->scratch_bytes_per_wave
,
2566 shader
->info
.max_simd_waves
);
2570 const char *si_get_shader_name(const struct si_shader
*shader
)
2572 switch (shader
->selector
->type
) {
2573 case PIPE_SHADER_VERTEX
:
2574 if (shader
->key
.as_es
)
2575 return "Vertex Shader as ES";
2576 else if (shader
->key
.as_ls
)
2577 return "Vertex Shader as LS";
2578 else if (shader
->key
.opt
.vs_as_prim_discard_cs
)
2579 return "Vertex Shader as Primitive Discard CS";
2580 else if (shader
->key
.as_ngg
)
2581 return "Vertex Shader as ESGS";
2583 return "Vertex Shader as VS";
2584 case PIPE_SHADER_TESS_CTRL
:
2585 return "Tessellation Control Shader";
2586 case PIPE_SHADER_TESS_EVAL
:
2587 if (shader
->key
.as_es
)
2588 return "Tessellation Evaluation Shader as ES";
2589 else if (shader
->key
.as_ngg
)
2590 return "Tessellation Evaluation Shader as ESGS";
2592 return "Tessellation Evaluation Shader as VS";
2593 case PIPE_SHADER_GEOMETRY
:
2594 if (shader
->is_gs_copy_shader
)
2595 return "GS Copy Shader as VS";
2597 return "Geometry Shader";
2598 case PIPE_SHADER_FRAGMENT
:
2599 return "Pixel Shader";
2600 case PIPE_SHADER_COMPUTE
:
2601 return "Compute Shader";
2603 return "Unknown Shader";
2607 void si_shader_dump(struct si_screen
*sscreen
, struct si_shader
*shader
,
2608 struct pipe_debug_callback
*debug
,
2609 FILE *file
, bool check_debug_option
)
2611 enum pipe_shader_type shader_type
= shader
->selector
->type
;
2613 if (!check_debug_option
||
2614 si_can_dump_shader(sscreen
, shader_type
))
2615 si_dump_shader_key(shader
, file
);
2617 if (!check_debug_option
&& shader
->binary
.llvm_ir_string
) {
2618 if (shader
->previous_stage
&&
2619 shader
->previous_stage
->binary
.llvm_ir_string
) {
2620 fprintf(file
, "\n%s - previous stage - LLVM IR:\n\n",
2621 si_get_shader_name(shader
));
2622 fprintf(file
, "%s\n", shader
->previous_stage
->binary
.llvm_ir_string
);
2625 fprintf(file
, "\n%s - main shader part - LLVM IR:\n\n",
2626 si_get_shader_name(shader
));
2627 fprintf(file
, "%s\n", shader
->binary
.llvm_ir_string
);
2630 if (!check_debug_option
||
2631 (si_can_dump_shader(sscreen
, shader_type
) &&
2632 !(sscreen
->debug_flags
& DBG(NO_ASM
)))) {
2633 unsigned wave_size
= si_get_shader_wave_size(shader
);
2635 fprintf(file
, "\n%s:\n", si_get_shader_name(shader
));
2638 si_shader_dump_disassembly(sscreen
, &shader
->prolog
->binary
,
2639 shader_type
, wave_size
, debug
, "prolog", file
);
2640 if (shader
->previous_stage
)
2641 si_shader_dump_disassembly(sscreen
, &shader
->previous_stage
->binary
,
2642 shader_type
, wave_size
, debug
, "previous stage", file
);
2643 if (shader
->prolog2
)
2644 si_shader_dump_disassembly(sscreen
, &shader
->prolog2
->binary
,
2645 shader_type
, wave_size
, debug
, "prolog2", file
);
2647 si_shader_dump_disassembly(sscreen
, &shader
->binary
, shader_type
,
2648 wave_size
, debug
, "main", file
);
2651 si_shader_dump_disassembly(sscreen
, &shader
->epilog
->binary
,
2652 shader_type
, wave_size
, debug
, "epilog", file
);
2653 fprintf(file
, "\n");
2656 si_shader_dump_stats(sscreen
, shader
, file
, check_debug_option
);
2659 static int si_compile_llvm(struct si_screen
*sscreen
,
2660 struct si_shader_binary
*binary
,
2661 struct ac_shader_config
*conf
,
2662 struct ac_llvm_compiler
*compiler
,
2664 struct pipe_debug_callback
*debug
,
2665 enum pipe_shader_type shader_type
,
2668 bool less_optimized
)
2670 unsigned count
= p_atomic_inc_return(&sscreen
->num_compilations
);
2672 if (si_can_dump_shader(sscreen
, shader_type
)) {
2673 fprintf(stderr
, "radeonsi: Compiling shader %d\n", count
);
2675 if (!(sscreen
->debug_flags
& (DBG(NO_IR
) | DBG(PREOPT_IR
)))) {
2676 fprintf(stderr
, "%s LLVM IR:\n\n", name
);
2677 ac_dump_module(mod
);
2678 fprintf(stderr
, "\n");
2682 if (sscreen
->record_llvm_ir
) {
2683 char *ir
= LLVMPrintModuleToString(mod
);
2684 binary
->llvm_ir_string
= strdup(ir
);
2685 LLVMDisposeMessage(ir
);
2688 if (!si_replace_shader(count
, binary
)) {
2689 unsigned r
= si_llvm_compile(mod
, binary
, compiler
, debug
,
2690 less_optimized
, wave_size
);
2695 struct ac_rtld_binary rtld
;
2696 if (!ac_rtld_open(&rtld
, (struct ac_rtld_open_info
){
2697 .info
= &sscreen
->info
,
2698 .shader_type
= tgsi_processor_to_shader_stage(shader_type
),
2699 .wave_size
= wave_size
,
2701 .elf_ptrs
= &binary
->elf_buffer
,
2702 .elf_sizes
= &binary
->elf_size
}))
2705 bool ok
= ac_rtld_read_config(&rtld
, conf
);
2706 ac_rtld_close(&rtld
);
2710 /* Enable 64-bit and 16-bit denormals, because there is no performance
2713 * If denormals are enabled, all floating-point output modifiers are
2716 * Don't enable denormals for 32-bit floats, because:
2717 * - Floating-point output modifiers would be ignored by the hw.
2718 * - Some opcodes don't support denormals, such as v_mad_f32. We would
2719 * have to stop using those.
2720 * - GFX6 & GFX7 would be very slow.
2722 conf
->float_mode
|= V_00B028_FP_64_DENORMS
;
2727 /* Generate code for the hardware VS shader stage to go with a geometry shader */
2729 si_generate_gs_copy_shader(struct si_screen
*sscreen
,
2730 struct ac_llvm_compiler
*compiler
,
2731 struct si_shader_selector
*gs_selector
,
2732 struct pipe_debug_callback
*debug
)
2734 struct si_shader_context ctx
;
2735 struct si_shader
*shader
;
2736 LLVMBuilderRef builder
;
2737 struct si_shader_output_values outputs
[SI_MAX_VS_OUTPUTS
];
2738 struct si_shader_info
*gsinfo
= &gs_selector
->info
;
2742 shader
= CALLOC_STRUCT(si_shader
);
2746 /* We can leave the fence as permanently signaled because the GS copy
2747 * shader only becomes visible globally after it has been compiled. */
2748 util_queue_fence_init(&shader
->ready
);
2750 shader
->selector
= gs_selector
;
2751 shader
->is_gs_copy_shader
= true;
2753 si_llvm_context_init(&ctx
, sscreen
, compiler
,
2754 si_get_wave_size(sscreen
, PIPE_SHADER_VERTEX
, false, false));
2755 ctx
.shader
= shader
;
2756 ctx
.type
= PIPE_SHADER_VERTEX
;
2758 builder
= ctx
.ac
.builder
;
2760 create_function(&ctx
);
2761 preload_ring_buffers(&ctx
);
2763 LLVMValueRef voffset
=
2764 LLVMBuildMul(ctx
.ac
.builder
, ctx
.abi
.vertex_id
,
2765 LLVMConstInt(ctx
.i32
, 4, 0), "");
2767 /* Fetch the vertex stream ID.*/
2768 LLVMValueRef stream_id
;
2770 if (!sscreen
->use_ngg_streamout
&& gs_selector
->so
.num_outputs
)
2771 stream_id
= si_unpack_param(&ctx
, ctx
.streamout_config
, 24, 2);
2773 stream_id
= ctx
.i32_0
;
2775 /* Fill in output information. */
2776 for (i
= 0; i
< gsinfo
->num_outputs
; ++i
) {
2777 outputs
[i
].semantic_name
= gsinfo
->output_semantic_name
[i
];
2778 outputs
[i
].semantic_index
= gsinfo
->output_semantic_index
[i
];
2780 for (int chan
= 0; chan
< 4; chan
++) {
2781 outputs
[i
].vertex_stream
[chan
] =
2782 (gsinfo
->output_streams
[i
] >> (2 * chan
)) & 3;
2786 LLVMBasicBlockRef end_bb
;
2787 LLVMValueRef switch_inst
;
2789 end_bb
= LLVMAppendBasicBlockInContext(ctx
.ac
.context
, ctx
.main_fn
, "end");
2790 switch_inst
= LLVMBuildSwitch(builder
, stream_id
, end_bb
, 4);
2792 for (int stream
= 0; stream
< 4; stream
++) {
2793 LLVMBasicBlockRef bb
;
2796 if (!gsinfo
->num_stream_output_components
[stream
])
2799 if (stream
> 0 && !gs_selector
->so
.num_outputs
)
2802 bb
= LLVMInsertBasicBlockInContext(ctx
.ac
.context
, end_bb
, "out");
2803 LLVMAddCase(switch_inst
, LLVMConstInt(ctx
.i32
, stream
, 0), bb
);
2804 LLVMPositionBuilderAtEnd(builder
, bb
);
2806 /* Fetch vertex data from GSVS ring */
2808 for (i
= 0; i
< gsinfo
->num_outputs
; ++i
) {
2809 for (unsigned chan
= 0; chan
< 4; chan
++) {
2810 if (!(gsinfo
->output_usagemask
[i
] & (1 << chan
)) ||
2811 outputs
[i
].vertex_stream
[chan
] != stream
) {
2812 outputs
[i
].values
[chan
] = LLVMGetUndef(ctx
.f32
);
2816 LLVMValueRef soffset
= LLVMConstInt(ctx
.i32
,
2817 offset
* gs_selector
->gs_max_out_vertices
* 16 * 4, 0);
2820 outputs
[i
].values
[chan
] =
2821 ac_build_buffer_load(&ctx
.ac
,
2822 ctx
.gsvs_ring
[0], 1,
2824 soffset
, 0, ac_glc
| ac_slc
,
2829 /* Streamout and exports. */
2830 if (!sscreen
->use_ngg_streamout
&& gs_selector
->so
.num_outputs
) {
2831 si_llvm_emit_streamout(&ctx
, outputs
,
2832 gsinfo
->num_outputs
,
2837 si_llvm_export_vs(&ctx
, outputs
, gsinfo
->num_outputs
);
2839 LLVMBuildBr(builder
, end_bb
);
2842 LLVMPositionBuilderAtEnd(builder
, end_bb
);
2844 LLVMBuildRetVoid(ctx
.ac
.builder
);
2846 ctx
.type
= PIPE_SHADER_GEOMETRY
; /* override for shader dumping */
2847 si_llvm_optimize_module(&ctx
);
2850 if (si_compile_llvm(sscreen
, &ctx
.shader
->binary
,
2851 &ctx
.shader
->config
, ctx
.compiler
,
2853 debug
, PIPE_SHADER_GEOMETRY
, ctx
.ac
.wave_size
,
2854 "GS Copy Shader", false) == 0) {
2855 if (si_can_dump_shader(sscreen
, PIPE_SHADER_GEOMETRY
))
2856 fprintf(stderr
, "GS Copy Shader:\n");
2857 si_shader_dump(sscreen
, ctx
.shader
, debug
, stderr
, true);
2859 if (!ctx
.shader
->config
.scratch_bytes_per_wave
)
2860 ok
= si_shader_binary_upload(sscreen
, ctx
.shader
, 0);
2865 si_llvm_dispose(&ctx
);
2871 si_fix_resource_usage(sscreen
, shader
);
2876 static void si_dump_shader_key_vs(const struct si_shader_key
*key
,
2877 const struct si_vs_prolog_bits
*prolog
,
2878 const char *prefix
, FILE *f
)
2880 fprintf(f
, " %s.instance_divisor_is_one = %u\n",
2881 prefix
, prolog
->instance_divisor_is_one
);
2882 fprintf(f
, " %s.instance_divisor_is_fetched = %u\n",
2883 prefix
, prolog
->instance_divisor_is_fetched
);
2884 fprintf(f
, " %s.unpack_instance_id_from_vertex_id = %u\n",
2885 prefix
, prolog
->unpack_instance_id_from_vertex_id
);
2886 fprintf(f
, " %s.ls_vgpr_fix = %u\n",
2887 prefix
, prolog
->ls_vgpr_fix
);
2889 fprintf(f
, " mono.vs.fetch_opencode = %x\n", key
->mono
.vs_fetch_opencode
);
2890 fprintf(f
, " mono.vs.fix_fetch = {");
2891 for (int i
= 0; i
< SI_MAX_ATTRIBS
; i
++) {
2892 union si_vs_fix_fetch fix
= key
->mono
.vs_fix_fetch
[i
];
2898 fprintf(f
, "%u.%u.%u.%u", fix
.u
.reverse
, fix
.u
.log_size
,
2899 fix
.u
.num_channels_m1
, fix
.u
.format
);
/* Print the full shader key of \p shader to \p f in human-readable form.
 * Used by shader dump debugging (R600_DEBUG / AMD_DEBUG shader dumps).
 * Dispatches on the shader stage and prints only the key fields relevant
 * to that stage; the common kill_outputs/clip_disable optimizations are
 * printed for any stage that feeds the fixed-function VS output path. */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
	const struct si_shader_key *key = &shader->key;
	enum pipe_shader_type shader_type = shader->selector->type;

	fprintf(f, "SHADER KEY\n");

	switch (shader_type) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ls = %u\n", key->as_ls);
		fprintf(f, "  as_ngg = %u\n", key->as_ngg);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		/* Fields below describe the "VS as primitive discard compute
		 * shader" fast path. */
		fprintf(f, "  opt.vs_as_prim_discard_cs = %u\n",
			key->opt.vs_as_prim_discard_cs);
		fprintf(f, "  opt.cs_prim_type = %s\n",
			tgsi_primitive_names[key->opt.cs_prim_type]);
		fprintf(f, "  opt.cs_indexed = %u\n",
			key->opt.cs_indexed);
		fprintf(f, "  opt.cs_instancing = %u\n",
			key->opt.cs_instancing);
		fprintf(f, "  opt.cs_primitive_restart = %u\n",
			key->opt.cs_primitive_restart);
		fprintf(f, "  opt.cs_provoking_vertex_first = %u\n",
			key->opt.cs_provoking_vertex_first);
		fprintf(f, "  opt.cs_need_correct_orientation = %u\n",
			key->opt.cs_need_correct_orientation);
		fprintf(f, "  opt.cs_cull_front = %u\n",
			key->opt.cs_cull_front);
		fprintf(f, "  opt.cs_cull_back = %u\n",
			key->opt.cs_cull_back);
		fprintf(f, "  opt.cs_cull_z = %u\n",
			key->opt.cs_cull_z);
		fprintf(f, "  opt.cs_halfz_clip_space = %u\n",
			key->opt.cs_halfz_clip_space);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9+ the HW merges LS into HS, so the TCS key carries a
		 * VS (LS) prolog key as well. */
		if (shader->selector->screen->info.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ngg = %u\n", key->as_ngg);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no meaningful key of its own. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9+ the preceding ES (VS) is merged into the GS wave,
		 * so dump the merged-in VS prolog key too. */
		if (shader->selector->screen->info.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		fprintf(f, "  part.gs.prolog.gfx9_prev_is_vs = %u\n", key->part.gs.prolog.gfx9_prev_is_vs);
		fprintf(f, "  as_ngg = %u\n", key->as_ngg);
		break;

	case PIPE_SHADER_COMPUTE:
		/* Compute shaders have no key fields to dump. */
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  part.ps.prolog.samplemask_log_ps_iter = %u\n", key->part.ps.prolog.samplemask_log_ps_iter);
		fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		fprintf(f, "  mono.u.ps.interpolate_at_sample_force_center = %u\n", key->mono.u.ps.interpolate_at_sample_force_center);
		fprintf(f, "  mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
		fprintf(f, "  mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
		fprintf(f, "  mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
		break;

	default:
		assert(0);
	}

	/* These optimizations apply to the last pre-rasterization stage
	 * (the one that writes the actual hardware VS outputs). */
	if ((shader_type == PIPE_SHADER_GEOMETRY ||
	     shader_type == PIPE_SHADER_TESS_EVAL ||
	     shader_type == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
/* Remove unused VS/TES parameter exports and compact the remaining ones.
 * Only applies to shader stages that perform parameter exports themselves
 * (hardware VS), so LS/ES variants bail out early.  Updates
 * shader->info.vs_output_param_offset and nr_param_exports in place. */
static void si_optimize_vs_outputs(struct si_shader_context *ctx)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_info *info = &shader->selector->info;

	/* Only the last VS-like stage exports parameters; skip otherwise. */
	if ((ctx->type != PIPE_SHADER_VERTEX &&
	     ctx->type != PIPE_SHADER_TESS_EVAL) ||
	    shader->key.as_ls ||
	    shader->key.as_es)
		return;

	ac_optimize_vs_outputs(&ctx->ac,
			       ctx->main_fn,
			       shader->info.vs_output_param_offset,
			       info->num_outputs,
			       &shader->info.nr_param_exports);
}
/* Initialize the EXEC mask from a bitfield inside an input SGPR.
 * Emits the llvm.amdgcn.init.exec.from.input intrinsic, which sets EXEC to
 * ((1 << count) - 1) where count is the 8-bit field at \p bitoffset in
 * \p param.  Used for GFX9+ merged shaders where the enabled thread count
 * of each half-wave is passed in MERGED_WAVE_INFO. */
static void si_init_exec_from_input(struct si_shader_context *ctx,
				    struct ac_arg param, unsigned bitoffset)
{
	LLVMValueRef args[] = {
		ac_get_arg(&ctx->ac, param),
		LLVMConstInt(ctx->i32, bitoffset, 0),
	};
	/* The intrinsic is convergent: it must not be sunk into or moved
	 * across divergent control flow. */
	ac_build_intrinsic(&ctx->ac,
			   "llvm.amdgcn.init.exec.from.input",
			   ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
}
/* Return whether a vertex shader with key \p key requires a prolog part
 * (vertex fetch / instance-ID handling done in a separate function). */
static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
			       const struct si_vs_prolog_bits *key)
{
	/* VGPR initialization fixup for Vega10 and Raven is always done in the
	 * prolog, so a prolog is also needed when the ls_vgpr_fix key bit is
	 * set.  NOTE(review): the ls_vgpr_fix term was dropped by the source
	 * extraction and reconstructed here — verify against upstream. */
	return sel->vs_needs_prolog ||
	       key->ls_vgpr_fix ||
	       key->unpack_instance_id_from_vertex_id;
}
/* Build an i1 value that is true for lanes that should run the ES half of a
 * merged (GFX9+) shader: thread_id < ES thread count, which is stored in
 * bits [0..7] of the merged_wave_info SGPR. */
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx)
{
	/* Return true if the current thread should execute an ES thread. */
	return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
			     ac_get_thread_id(&ctx->ac),
			     si_unpack_param(ctx, ctx->merged_wave_info, 0, 8), "");
}
/* Build an i1 value that is true for lanes that should run the GS half of a
 * merged (GFX9+) shader: thread_id < GS thread count, which is stored in
 * bits [8..15] of the merged_wave_info SGPR. */
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx)
{
	/* Return true if the current thread should execute a GS thread. */
	return LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
			     ac_get_thread_id(&ctx->ac),
			     si_unpack_param(ctx, ctx->merged_wave_info, 8, 8), "");
}
/* Build the LLVM IR for the main part of \p shader from its NIR.
 * Sets up per-stage ABI callbacks, creates the function signature, declares
 * LDS objects needed by GS/NGG, handles GFX9+ merged-shader EXEC setup, and
 * finally translates the NIR body.  Returns false on translation failure.
 * If \p free_nir is set, the NIR is freed after translation.
 * NOTE(review): structural lines (braces/else/break) were dropped by the
 * source extraction and have been reconstructed — verify against upstream. */
static bool si_build_main_function(struct si_shader_context *ctx,
				   struct nir_shader *nir, bool free_nir)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;

	/* Select the epilogue/input callbacks for this stage and key. */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		if (shader->key.as_ls)
			ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else if (shader->key.opt.vs_as_prim_discard_cs)
			ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
		else if (shader->key.as_ngg)
			ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		ctx->abi.load_base_vertex = get_base_vertex;
		break;
	case PIPE_SHADER_TESS_CTRL:
		si_llvm_init_tcs_callbacks(ctx);
		break;
	case PIPE_SHADER_TESS_EVAL:
		si_llvm_init_tes_callbacks(ctx);

		if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else if (shader->key.as_ngg)
			ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		ctx->abi.load_inputs = si_nir_load_input_gs;
		ctx->abi.emit_vertex = si_llvm_emit_vertex;
		ctx->abi.emit_primitive = si_llvm_emit_primitive;
		ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		si_llvm_init_ps_callbacks(ctx);
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->abi.load_local_group_size = get_block_size;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	ctx->abi.load_ubo = load_ubo;
	ctx->abi.load_ssbo = load_ssbo;

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* TCS that must make the tess factors defined in all invocations gets
	 * per-factor allocas filled by invocation 0. */
	if (ctx->type == PIPE_SHADER_TESS_CTRL &&
	    sel->info.tessfactors_are_def_in_all_invocs) {
		for (unsigned i = 0; i < 6; i++) {
			ctx->invoc0_tess_factors[i] =
				ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream counters of emitted vertices. */
		for (unsigned i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				ac_build_alloca(&ctx->ac, ctx->i32, "");
		}
		if (shader->key.as_ngg) {
			/* NGG GS tracks current-primitive vertex counts and
			 * generated-primitive counts per stream. */
			for (unsigned i = 0; i < 4; ++i) {
				ctx->gs_curprim_verts[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
				ctx->gs_generated_prims[i] =
					ac_build_alloca(&ctx->ac, ctx->ac.i32, "");
			}

			/* LDS scratch for NGG bookkeeping; streamout needs a
			 * larger area.  NOTE(review): the streamout size was
			 * dropped by the extraction and reconstructed. */
			unsigned scratch_size = 8;
			if (sel->so.num_outputs)
				scratch_size = 44;

			LLVMTypeRef ai32 = LLVMArrayType(ctx->i32, scratch_size);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				ai32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(ai32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);

			/* Unsized LDS array for NGG GS vertex emission; its
			 * real size is resolved at link time (external). */
			ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				LLVMArrayType(ctx->i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
			LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
			LLVMSetAlignment(ctx->gs_ngg_emit, 4);
		}
	}

	if (ctx->type != PIPE_SHADER_GEOMETRY &&
	    (shader->key.as_ngg && !shader->key.as_es)) {
		/* Unconditionally declare scratch space base for streamout and
		 * vertex compaction. Whether space is actually allocated is
		 * determined during linking / PM4 creation.
		 *
		 * Add an extra dword per vertex to ensure an odd stride, which
		 * avoids bank conflicts for SoA accesses.
		 */
		if (!gfx10_is_ngg_passthrough(shader))
			declare_esgs_ring(ctx);

		/* This is really only needed when streamout and / or vertex
		 * compaction is enabled.
		 */
		if (sel->so.num_outputs && !ctx->gs_ngg_scratch) {
			LLVMTypeRef asi32 = LLVMArrayType(ctx->i32, 8);
			ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module,
				asi32, "ngg_scratch", AC_ADDR_SPACE_LDS);
			LLVMSetInitializer(ctx->gs_ngg_scratch, LLVMGetUndef(asi32));
			LLVMSetAlignment(ctx->gs_ngg_scratch, 4);
		}
	}

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 *
	 * NGG vertex and tess eval shaders running as the last
	 * vertex/geometry stage handle execution explicitly using
	 * if-statements.
	 */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (!shader->is_monolithic &&
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
			si_init_exec_from_input(ctx,
						ctx->merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY ||
			   (shader->key.as_ngg && !shader->key.as_es)) {
			LLVMValueRef thread_enabled;
			bool nested_barrier;

			if (!shader->is_monolithic ||
			    (ctx->type == PIPE_SHADER_TESS_EVAL &&
			     (shader->key.as_ngg && !shader->key.as_es)))
				ac_init_exec_full_mask(&ctx->ac);

			if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			    ctx->type == PIPE_SHADER_GEOMETRY) {
				if (ctx->type == PIPE_SHADER_GEOMETRY && shader->key.as_ngg) {
					gfx10_ngg_gs_emit_prologue(ctx);
					nested_barrier = false;
				} else {
					nested_barrier = true;
				}

				thread_enabled = si_is_gs_thread(ctx);
			} else {
				thread_enabled = si_is_es_thread(ctx);
				nested_barrier = false;
			}

			ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
			ctx->merged_wrap_if_label = 11500;
			ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);

			if (nested_barrier) {
				/* Execute a barrier before the second shader in
				 * a merged shader.
				 *
				 * Execute the barrier inside the conditional block,
				 * so that empty waves can jump directly to s_endpgm,
				 * which will also signal the barrier.
				 *
				 * This is possible in gfx9, because an empty wave
				 * for the second shader does not participate in
				 * the epilogue. With NGG, empty waves may still
				 * be required to export data (e.g. GS output vertices),
				 * so we cannot let them exit early.
				 *
				 * If the shader is TCS and the TCS epilog is present
				 * and contains a barrier, it will wait there and then
				 * reach s_endpgm.
				 */
				si_llvm_emit_barrier(ctx);
			}
		}
	}

	/* Shaders that kill but need correct derivatives afterwards postpone
	 * the kill to the end using this flag alloca. */
	if (sel->force_correct_derivs_after_kill) {
		ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
		/* true = don't kill. */
		LLVMBuildStore(ctx->ac.builder, ctx->i1true,
			       ctx->postponed_kill);
	}

	bool success = si_nir_build_llvm(ctx, nir);
	if (free_nir)
		ralloc_free(nir);
	if (!success) {
		fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
		return false;
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
/**
 * Compute the VS prolog key, which contains all the information needed to
 * build the VS prolog function, and set shader->info bits where needed.
 *
 * \param info             Shader info of the vertex shader.
 * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
 * \param prolog_key       Key of the VS prolog
 * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
 * \param key              Output shader part key.
 */
static void si_get_vs_prolog_key(const struct si_shader_info *info,
				 unsigned num_input_sgprs,
				 const struct si_vs_prolog_bits *prolog_key,
				 struct si_shader *shader_out,
				 union si_shader_part_key *key)
{
	memset(key, 0, sizeof(*key));
	key->vs_prolog.states = *prolog_key;
	key->vs_prolog.num_input_sgprs = num_input_sgprs;
	key->vs_prolog.num_inputs = info->num_inputs;
	key->vs_prolog.as_ls = shader_out->key.as_ls;
	key->vs_prolog.as_es = shader_out->key.as_es;
	key->vs_prolog.as_ngg = shader_out->key.as_ngg;

	/* For GFX9+ merged shaders the next stage dictates the LS/ES role and
	 * how many VGPRs the merged next stage occupies before VS inputs. */
	if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
		key->vs_prolog.as_ls = 1;
		key->vs_prolog.num_merged_next_stage_vgprs = 2;
	} else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
		key->vs_prolog.as_es = 1;
		key->vs_prolog.num_merged_next_stage_vgprs = 5;
	} else if (shader_out->key.as_ngg) {
		key->vs_prolog.num_merged_next_stage_vgprs = 5;
	}

	/* Enable loading the InstanceID VGPR. */
	uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);

	if ((key->vs_prolog.states.instance_divisor_is_one |
	     key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
		shader_out->info.uses_instanceid = true;
}
/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency, so that the GS sees a consistent provoking-vertex order for
 * every other primitive.
 * NOTE(review): dropped structural lines (else branches, closing braces, the
 * gfx6 num_vgprs value, the `base` assignment) were reconstructed from the
 * surrounding code — verify against upstream.
 */
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	unsigned num_sgprs, num_vgprs;
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMTypeRef returns[AC_MAX_ARGS];
	LLVMValueRef func, ret;

	memset(&ctx->args, 0, sizeof(ctx->args));

	/* The SGPR/VGPR layout differs between GFX9+ merged shaders and the
	 * older separate-stage layout. */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (key->gs_prolog.states.gfx9_prev_is_vs)
			num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
		else
			num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
		num_vgprs = 5; /* ES inputs are not needed by GS */
	} else {
		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
		num_vgprs = 8;
	}

	/* Declare the pass-through SGPR inputs/returns. */
	for (unsigned i = 0; i < num_sgprs; ++i) {
		ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
		returns[i] = ctx->i32;
	}

	/* Declare the pass-through VGPR inputs/returns (returned as floats). */
	for (unsigned i = 0; i < num_vgprs; ++i) {
		ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
		returns[num_sgprs + i] = ctx->f32;
	}

	/* Create the function. */
	si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
	func = ctx->main_fn;

	/* Set the full EXEC mask for the prolog, because we are only fiddling
	 * with registers here. The main shader part will set the correct EXEC
	 * mask.
	 */
	if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
		ac_init_exec_full_mask(&ctx->ac);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (unsigned i = 0; i < num_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
	}
	for (unsigned i = 0; i < num_vgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
		p = ac_to_float(&ctx->ac, p);
		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
	}

	if (key->gs_prolog.states.tri_strip_adj_fix) {
		/* Remap the input vertices for every other primitive. */
		const struct ac_arg gfx6_vtx_params[6] = {
			{ .used = true, .arg_index = num_sgprs },
			{ .used = true, .arg_index = num_sgprs + 1 },
			{ .used = true, .arg_index = num_sgprs + 3 },
			{ .used = true, .arg_index = num_sgprs + 4 },
			{ .used = true, .arg_index = num_sgprs + 5 },
			{ .used = true, .arg_index = num_sgprs + 6 },
		};
		const struct ac_arg gfx9_vtx_params[3] = {
			{ .used = true, .arg_index = num_sgprs },
			{ .used = true, .arg_index = num_sgprs + 1 },
			{ .used = true, .arg_index = num_sgprs + 4 },
		};
		LLVMValueRef vtx_in[6], vtx_out[6];
		LLVMValueRef prim_id, rotate;

		/* On GFX9 two 16-bit vertex indices are packed per VGPR. */
		if (ctx->screen->info.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
				vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
			}
		} else {
			for (unsigned i = 0; i < 6; i++)
				vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]);
		}

		/* Odd primitives (prim_id bit 0 set) get their vertices
		 * rotated by 4 positions. */
		prim_id = LLVMGetParam(func, num_sgprs + 2);
		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");

		for (unsigned i = 0; i < 6; ++i) {
			LLVMValueRef base, rotated;
			base = vtx_in[i];
			rotated = vtx_in[(i + 4) % 6];
			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
		}

		/* Write the (possibly rotated) vertices back into the return
		 * value, re-packing pairs on GFX9. */
		if (ctx->screen->info.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef hi, out;

				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
						  LLVMConstInt(ctx->i32, 16, 0), "");
				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
				out = ac_to_float(&ctx->ac, out);
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx9_vtx_params[i].arg_index, "");
			}
		} else {
			for (unsigned i = 0; i < 6; i++) {
				LLVMValueRef out;

				out = ac_to_float(&ctx->ac, vtx_out[i]);
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx6_vtx_params[i].arg_index, "");
			}
		}
	}

	LLVMBuildRet(builder, ret);
}
/**
 * Given a list of shader part functions, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 *
 * \param parts                  array of part functions, in execution order
 * \param num_parts              number of entries in \p parts
 * \param main_part              index of the main shader part (used for
 *                               parameter types)
 * \param next_shader_first_part index of the first part of the second shader
 *                               in a merged shader (0 if not merged)
 *
 * NOTE(review): dropped structural lines (initializations of num_sgprs,
 * num_vgprs, gprs, num_out and several braces/else branches) were
 * reconstructed — verify against upstream.
 */
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
			       unsigned num_parts, unsigned main_part,
			       unsigned next_shader_first_part)
{
	LLVMBuilderRef builder = ctx->ac.builder;
	/* PS epilog has one arg per color component; gfx9 merged shader
	 * prologs need to forward 40 SGPRs.
	 */
	LLVMValueRef initial[AC_MAX_ARGS], out[AC_MAX_ARGS];
	LLVMTypeRef function_type;
	unsigned num_first_params;
	unsigned num_out, initial_num_out;
	ASSERTED unsigned num_out_sgpr; /* used in debug checks */
	ASSERTED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;

	memset(&ctx->args, 0, sizeof(ctx->args));

	/* Force-inline all parts and hide them from the final binary. */
	for (unsigned i = 0; i < num_parts; ++i) {
		ac_add_function_attr(ctx->ac.context, parts[i], -1,
				     AC_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_first_params = LLVMCountParamTypes(function_type);

	/* Count the first part's GPRs (in dwords). */
	for (unsigned i = 0; i < num_first_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0);
			num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}

	/* Declare the wrapper's arguments using the main part's types. */
	unsigned gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], ctx->args.arg_count);
		LLVMTypeRef type = LLVMTypeOf(param);
		unsigned size = ac_get_type_size(type) / 4;

		/* This is going to get casted anyways, so we don't have to
		 * have the exact same type. But we do have to preserve the
		 * pointer-ness so that LLVM knows about it.
		 */
		enum ac_arg_type arg_type = AC_ARG_INT;
		if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
			type = LLVMGetElementType(type);

			if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
				if (LLVMGetVectorSize(type) == 4)
					arg_type = AC_ARG_CONST_DESC_PTR;
				else if (LLVMGetVectorSize(type) == 8)
					arg_type = AC_ARG_CONST_IMAGE_PTR;
				else
					assert(0);
			} else if (type == ctx->f32) {
				arg_type = AC_ARG_CONST_FLOAT_PTR;
			} else {
				assert(0);
			}
		}

		ac_add_arg(&ctx->args, gprs < num_sgprs ? AC_ARG_SGPR : AC_ARG_VGPR,
			   size, arg_type, NULL);

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));
		gprs += size;
	}

	/* Prepare the return type. */
	unsigned num_returns = 0;
	LLVMTypeRef returns[AC_MAX_ARGS], last_func_type, return_type;

	last_func_type = LLVMGetElementType(LLVMTypeOf(parts[num_parts - 1]));
	return_type = LLVMGetReturnType(last_func_type);

	switch (LLVMGetTypeKind(return_type)) {
	case LLVMStructTypeKind:
		num_returns = LLVMCountStructElementTypes(return_type);
		assert(num_returns <= ARRAY_SIZE(returns));
		LLVMGetStructElementTypes(return_type, returns);
		break;
	case LLVMVoidTypeKind:
		break;
	default:
		unreachable("unexpected type");
	}

	si_llvm_create_func(ctx, "wrapper", returns, num_returns,
			    si_get_max_workgroup_size(ctx->shader));

	if (si_is_merged_shader(ctx))
		ac_init_exec_full_mask(&ctx->ac);

	/* Record the arguments of the function as if they were an output of
	 * a previous part, so they can feed the first part's inputs.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	for (unsigned i = 0; i < ctx->args.arg_count; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		LLVMTypeRef out_type = ctx->args.args[i].file == AC_ARG_SGPR ? ctx->i32 : ctx->f32;
		unsigned size = ac_get_type_size(param_type) / 4;

		if (size == 1) {
			/* Scalars: pointers become i32, then cast to out_type. */
			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
				param_type = ctx->i32;
			}

			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			/* Multi-dword values are split into scalar elements. */
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (ctx->args.args[i].file == AC_ARG_SGPR)
			num_out_sgpr = num_out;
	}

	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	LLVMValueRef ret = NULL;
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[AC_MAX_ARGS];
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;
		unsigned num_params = LLVMCountParams(parts[part]);

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_multi_part_shader(ctx) && part == 0) {
			LLVMValueRef ena, count = initial[3];

			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			ac_build_ifcc(&ctx->ac, ena, 6506);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = ac_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
				ac_add_function_attr(ctx->ac.context, parts[part],
						     param_idx + 1, AC_FUNC_ATTR_INREG);
			} else if (out_idx < num_out_sgpr) {
				/* Skip returned SGPRs the current part doesn't
				 * declare on the input. */
				out_idx = num_out_sgpr;
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));

			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					if (LLVMGetPointerAddressSpace(param_type) ==
					    AC_ADDR_SPACE_CONST_32BIT) {
						arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
						arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
					} else {
						arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
						arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
					}
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = ac_build_call(&ctx->ac, parts[part], in, num_params);

		if (is_multi_part_shader(ctx) &&
		    part + 1 == next_shader_first_part) {
			ac_build_endif(&ctx->ac, 6506);

			/* The second half of the merged shader should use
			 * the inputs from the toplevel (wrapper) function,
			 * not the return value from the last call.
			 *
			 * That's because the last call was executed condi-
			 * tionally, so we can't consume it in the main
			 * block.
			 */
			memcpy(out, initial, sizeof(initial));
			num_out = initial_num_out;
			num_out_sgpr = initial_num_out_sgpr;
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				assert(num_out < ARRAY_SIZE(out));
				out[num_out++] = val;

				/* i32 elements are SGPRs; they must all come
				 * before the VGPR (float) elements. */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	/* Return the value from the last part. */
	if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
		LLVMBuildRetVoid(builder);
	else
		LLVMBuildRet(builder, ret);
}
/* Decide whether to compile \p sel with fewer LLVM optimization passes to
 * reduce compile time.  Only relevant when the compiler was created with
 * low-opt passes available; currently limited to very large compute shaders. */
static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
				    struct si_shader_selector *sel)
{
	if (!compiler->low_opt_passes)
		return false;

	/* Assume a slow CPU. */
	assert(!sel->screen->info.has_dedicated_vram &&
	       sel->screen->info.chip_class <= GFX8);

	/* For a crazy dEQP test containing 2597 memory opcodes, mostly
	 * buffer stores, the difference in compile time is significant. */
	return sel->type == PIPE_SHADER_COMPUTE &&
	       sel->info.num_memory_instructions > 1000;
}
/* Return the NIR for \p sel, deserializing it from the selector's serialized
 * blob when the live NIR is not available.  Sets *free_nir when the caller
 * owns (and must free) the returned shader.
 * NOTE(review): the first branch of this function was dropped by the source
 * extraction and reconstructed — verify against upstream. */
static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
					 bool *free_nir)
{
	*free_nir = false;

	if (sel->nir) {
		/* Live NIR is owned by the selector; do not free. */
		return sel->nir;
	} else if (sel->nir_binary) {
		struct pipe_screen *screen = &sel->screen->b;
		const void *options =
			screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
						     sel->type);

		struct blob_reader blob_reader;
		blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
		*free_nir = true;
		return nir_deserialize(NULL, options, &blob_reader);
	}
	return NULL;
}
3768 int si_compile_shader(struct si_screen
*sscreen
,
3769 struct ac_llvm_compiler
*compiler
,
3770 struct si_shader
*shader
,
3771 struct pipe_debug_callback
*debug
)
3773 struct si_shader_selector
*sel
= shader
->selector
;
3774 struct si_shader_context ctx
;
3776 struct nir_shader
*nir
= get_nir_shader(sel
, &free_nir
);
3779 /* Dump NIR before doing NIR->LLVM conversion in case the
3780 * conversion fails. */
3781 if (si_can_dump_shader(sscreen
, sel
->type
) &&
3782 !(sscreen
->debug_flags
& DBG(NO_NIR
))) {
3783 nir_print_shader(nir
, stderr
);
3784 si_dump_streamout(&sel
->so
);
3787 si_llvm_context_init(&ctx
, sscreen
, compiler
, si_get_shader_wave_size(shader
));
3788 si_llvm_context_set_ir(&ctx
, shader
);
3790 memset(shader
->info
.vs_output_param_offset
, AC_EXP_PARAM_UNDEFINED
,
3791 sizeof(shader
->info
.vs_output_param_offset
));
3793 shader
->info
.uses_instanceid
= sel
->info
.uses_instanceid
;
3795 if (!si_build_main_function(&ctx
, nir
, free_nir
)) {
3796 si_llvm_dispose(&ctx
);
3800 if (shader
->is_monolithic
&& ctx
.type
== PIPE_SHADER_VERTEX
) {
3801 LLVMValueRef parts
[2];
3802 bool need_prolog
= si_vs_needs_prolog(sel
, &shader
->key
.part
.vs
.prolog
);
3804 parts
[1] = ctx
.main_fn
;
3807 union si_shader_part_key prolog_key
;
3808 si_get_vs_prolog_key(&sel
->info
,
3809 shader
->info
.num_input_sgprs
,
3810 &shader
->key
.part
.vs
.prolog
,
3811 shader
, &prolog_key
);
3812 prolog_key
.vs_prolog
.is_monolithic
= true;
3813 si_build_vs_prolog_function(&ctx
, &prolog_key
);
3814 parts
[0] = ctx
.main_fn
;
3817 si_build_wrapper_function(&ctx
, parts
+ !need_prolog
,
3818 1 + need_prolog
, need_prolog
, 0);
3820 if (ctx
.shader
->key
.opt
.vs_as_prim_discard_cs
)
3821 si_build_prim_discard_compute_shader(&ctx
);
3822 } else if (shader
->is_monolithic
&& ctx
.type
== PIPE_SHADER_TESS_CTRL
) {
3823 if (sscreen
->info
.chip_class
>= GFX9
) {
3824 struct si_shader_selector
*ls
= shader
->key
.part
.tcs
.ls
;
3825 LLVMValueRef parts
[4];
3826 bool vs_needs_prolog
=
3827 si_vs_needs_prolog(ls
, &shader
->key
.part
.tcs
.ls_prolog
);
3830 parts
[2] = ctx
.main_fn
;
3833 union si_shader_part_key tcs_epilog_key
;
3834 memset(&tcs_epilog_key
, 0, sizeof(tcs_epilog_key
));
3835 tcs_epilog_key
.tcs_epilog
.states
= shader
->key
.part
.tcs
.epilog
;
3836 si_llvm_build_tcs_epilog(&ctx
, &tcs_epilog_key
);
3837 parts
[3] = ctx
.main_fn
;
3839 /* VS as LS main part */
3840 nir
= get_nir_shader(ls
, &free_nir
);
3841 struct si_shader shader_ls
= {};
3842 shader_ls
.selector
= ls
;
3843 shader_ls
.key
.as_ls
= 1;
3844 shader_ls
.key
.mono
= shader
->key
.mono
;
3845 shader_ls
.key
.opt
= shader
->key
.opt
;
3846 shader_ls
.is_monolithic
= true;
3847 si_llvm_context_set_ir(&ctx
, &shader_ls
);
3849 if (!si_build_main_function(&ctx
, nir
, free_nir
)) {
3850 si_llvm_dispose(&ctx
);
3853 shader
->info
.uses_instanceid
|= ls
->info
.uses_instanceid
;
3854 parts
[1] = ctx
.main_fn
;
3857 if (vs_needs_prolog
) {
3858 union si_shader_part_key vs_prolog_key
;
3859 si_get_vs_prolog_key(&ls
->info
,
3860 shader_ls
.info
.num_input_sgprs
,
3861 &shader
->key
.part
.tcs
.ls_prolog
,
3862 shader
, &vs_prolog_key
);
3863 vs_prolog_key
.vs_prolog
.is_monolithic
= true;
3864 si_build_vs_prolog_function(&ctx
, &vs_prolog_key
);
3865 parts
[0] = ctx
.main_fn
;
3868 /* Reset the shader context. */
3869 ctx
.shader
= shader
;
3870 ctx
.type
= PIPE_SHADER_TESS_CTRL
;
3872 si_build_wrapper_function(&ctx
,
3873 parts
+ !vs_needs_prolog
,
3874 4 - !vs_needs_prolog
, vs_needs_prolog
,
3875 vs_needs_prolog
? 2 : 1);
3877 LLVMValueRef parts
[2];
3878 union si_shader_part_key epilog_key
;
3880 parts
[0] = ctx
.main_fn
;
3882 memset(&epilog_key
, 0, sizeof(epilog_key
));
3883 epilog_key
.tcs_epilog
.states
= shader
->key
.part
.tcs
.epilog
;
3884 si_llvm_build_tcs_epilog(&ctx
, &epilog_key
);
3885 parts
[1] = ctx
.main_fn
;
3887 si_build_wrapper_function(&ctx
, parts
, 2, 0, 0);
3889 } else if (shader
->is_monolithic
&& ctx
.type
== PIPE_SHADER_GEOMETRY
) {
3890 if (ctx
.screen
->info
.chip_class
>= GFX9
) {
3891 struct si_shader_selector
*es
= shader
->key
.part
.gs
.es
;
3892 LLVMValueRef es_prolog
= NULL
;
3893 LLVMValueRef es_main
= NULL
;
3894 LLVMValueRef gs_prolog
= NULL
;
3895 LLVMValueRef gs_main
= ctx
.main_fn
;
3898 union si_shader_part_key gs_prolog_key
;
3899 memset(&gs_prolog_key
, 0, sizeof(gs_prolog_key
));
3900 gs_prolog_key
.gs_prolog
.states
= shader
->key
.part
.gs
.prolog
;
3901 gs_prolog_key
.gs_prolog
.is_monolithic
= true;
3902 gs_prolog_key
.gs_prolog
.as_ngg
= shader
->key
.as_ngg
;
3903 si_build_gs_prolog_function(&ctx
, &gs_prolog_key
);
3904 gs_prolog
= ctx
.main_fn
;
3907 nir
= get_nir_shader(es
, &free_nir
);
3908 struct si_shader shader_es
= {};
3909 shader_es
.selector
= es
;
3910 shader_es
.key
.as_es
= 1;
3911 shader_es
.key
.as_ngg
= shader
->key
.as_ngg
;
3912 shader_es
.key
.mono
= shader
->key
.mono
;
3913 shader_es
.key
.opt
= shader
->key
.opt
;
3914 shader_es
.is_monolithic
= true;
3915 si_llvm_context_set_ir(&ctx
, &shader_es
);
3917 if (!si_build_main_function(&ctx
, nir
, free_nir
)) {
3918 si_llvm_dispose(&ctx
);
3921 shader
->info
.uses_instanceid
|= es
->info
.uses_instanceid
;
3922 es_main
= ctx
.main_fn
;
3925 if (es
->type
== PIPE_SHADER_VERTEX
&&
3926 si_vs_needs_prolog(es
, &shader
->key
.part
.gs
.vs_prolog
)) {
3927 union si_shader_part_key vs_prolog_key
;
3928 si_get_vs_prolog_key(&es
->info
,
3929 shader_es
.info
.num_input_sgprs
,
3930 &shader
->key
.part
.gs
.vs_prolog
,
3931 shader
, &vs_prolog_key
);
3932 vs_prolog_key
.vs_prolog
.is_monolithic
= true;
3933 si_build_vs_prolog_function(&ctx
, &vs_prolog_key
);
3934 es_prolog
= ctx
.main_fn
;
3937 /* Reset the shader context. */
3938 ctx
.shader
= shader
;
3939 ctx
.type
= PIPE_SHADER_GEOMETRY
;
3941 /* Prepare the array of shader parts. */
3942 LLVMValueRef parts
[4];
3943 unsigned num_parts
= 0, main_part
, next_first_part
;
3946 parts
[num_parts
++] = es_prolog
;
3948 parts
[main_part
= num_parts
++] = es_main
;
3949 parts
[next_first_part
= num_parts
++] = gs_prolog
;
3950 parts
[num_parts
++] = gs_main
;
3952 si_build_wrapper_function(&ctx
, parts
, num_parts
,
3953 main_part
, next_first_part
);
3955 LLVMValueRef parts
[2];
3956 union si_shader_part_key prolog_key
;
3958 parts
[1] = ctx
.main_fn
;
3960 memset(&prolog_key
, 0, sizeof(prolog_key
));
3961 prolog_key
.gs_prolog
.states
= shader
->key
.part
.gs
.prolog
;
3962 si_build_gs_prolog_function(&ctx
, &prolog_key
);
3963 parts
[0] = ctx
.main_fn
;
3965 si_build_wrapper_function(&ctx
, parts
, 2, 1, 0);
3967 } else if (shader
->is_monolithic
&& ctx
.type
== PIPE_SHADER_FRAGMENT
) {
3968 si_llvm_build_monolithic_ps(&ctx
, shader
);
3971 si_llvm_optimize_module(&ctx
);
3973 /* Post-optimization transformations and analysis. */
3974 si_optimize_vs_outputs(&ctx
);
3976 if ((debug
&& debug
->debug_message
) ||
3977 si_can_dump_shader(sscreen
, ctx
.type
)) {
3978 ctx
.shader
->info
.private_mem_vgprs
=
3979 ac_count_scratch_private_memory(ctx
.main_fn
);
3982 /* Make sure the input is a pointer and not integer followed by inttoptr. */
3983 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx
.main_fn
, 0))) ==
3984 LLVMPointerTypeKind
);
3986 /* Compile to bytecode. */
3987 r
= si_compile_llvm(sscreen
, &shader
->binary
, &shader
->config
, compiler
,
3988 ctx
.ac
.module
, debug
, ctx
.type
, ctx
.ac
.wave_size
,
3989 si_get_shader_name(shader
),
3990 si_should_optimize_less(compiler
, shader
->selector
));
3991 si_llvm_dispose(&ctx
);
3993 fprintf(stderr
, "LLVM failed to compile shader\n");
3997 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
3998 * LLVM 3.9svn has this bug.
4000 if (sel
->type
== PIPE_SHADER_COMPUTE
) {
4001 unsigned wave_size
= sscreen
->compute_wave_size
;
4002 unsigned max_vgprs
= sscreen
->info
.num_physical_wave64_vgprs_per_simd
*
4003 (wave_size
== 32 ? 2 : 1);
4004 unsigned max_sgprs
= sscreen
->info
.num_physical_sgprs_per_simd
;
4005 unsigned max_sgprs_per_wave
= 128;
4006 unsigned simds_per_tg
= 4; /* assuming WGP mode on gfx10 */
4007 unsigned threads_per_tg
= si_get_max_workgroup_size(shader
);
4008 unsigned waves_per_tg
= DIV_ROUND_UP(threads_per_tg
, wave_size
);
4009 unsigned waves_per_simd
= DIV_ROUND_UP(waves_per_tg
, simds_per_tg
);
4011 max_vgprs
= max_vgprs
/ waves_per_simd
;
4012 max_sgprs
= MIN2(max_sgprs
/ waves_per_simd
, max_sgprs_per_wave
);
4014 if (shader
->config
.num_sgprs
> max_sgprs
||
4015 shader
->config
.num_vgprs
> max_vgprs
) {
4016 fprintf(stderr
, "LLVM failed to compile a shader correctly: "
4017 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
4018 shader
->config
.num_sgprs
, shader
->config
.num_vgprs
,
4019 max_sgprs
, max_vgprs
);
4021 /* Just terminate the process, because dependent
4022 * shaders can hang due to bad input data, but use
4023 * the env var to allow shader-db to work.
4025 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
4030 /* Add the scratch offset to input SGPRs. */
4031 if (shader
->config
.scratch_bytes_per_wave
&& !si_is_merged_shader(&ctx
))
4032 shader
->info
.num_input_sgprs
+= 1; /* scratch byte offset */
4034 /* Calculate the number of fragment input VGPRs. */
4035 if (ctx
.type
== PIPE_SHADER_FRAGMENT
) {
4036 shader
->info
.num_input_vgprs
= ac_get_fs_input_vgpr_cnt(&shader
->config
,
4037 &shader
->info
.face_vgpr_index
,
4038 &shader
->info
.ancillary_vgpr_index
);
4041 si_calculate_max_simd_waves(shader
);
4042 si_shader_dump_stats_for_shader_db(sscreen
, shader
, debug
);
4047 * Create, compile and return a shader part (prolog or epilog).
4049 * \param sscreen screen
4050 * \param list list of shader parts of the same category
4051 * \param type shader type
4052 * \param key shader part key
4053 * \param prolog whether the part being requested is a prolog
4054 * \param tm LLVM target machine
4055 * \param debug debug callback
4056 * \param build the callback responsible for building the main function
4057 * \return non-NULL on success
4059 static struct si_shader_part
*
4060 si_get_shader_part(struct si_screen
*sscreen
,
4061 struct si_shader_part
**list
,
4062 enum pipe_shader_type type
,
4064 union si_shader_part_key
*key
,
4065 struct ac_llvm_compiler
*compiler
,
4066 struct pipe_debug_callback
*debug
,
4067 void (*build
)(struct si_shader_context
*,
4068 union si_shader_part_key
*),
4071 struct si_shader_part
*result
;
4073 simple_mtx_lock(&sscreen
->shader_parts_mutex
);
4075 /* Find existing. */
4076 for (result
= *list
; result
; result
= result
->next
) {
4077 if (memcmp(&result
->key
, key
, sizeof(*key
)) == 0) {
4078 simple_mtx_unlock(&sscreen
->shader_parts_mutex
);
4083 /* Compile a new one. */
4084 result
= CALLOC_STRUCT(si_shader_part
);
4087 struct si_shader shader
= {};
4090 case PIPE_SHADER_VERTEX
:
4091 shader
.key
.as_ls
= key
->vs_prolog
.as_ls
;
4092 shader
.key
.as_es
= key
->vs_prolog
.as_es
;
4093 shader
.key
.as_ngg
= key
->vs_prolog
.as_ngg
;
4095 case PIPE_SHADER_TESS_CTRL
:
4097 shader
.key
.part
.tcs
.epilog
= key
->tcs_epilog
.states
;
4099 case PIPE_SHADER_GEOMETRY
:
4101 shader
.key
.as_ngg
= key
->gs_prolog
.as_ngg
;
4103 case PIPE_SHADER_FRAGMENT
:
4105 shader
.key
.part
.ps
.prolog
= key
->ps_prolog
.states
;
4107 shader
.key
.part
.ps
.epilog
= key
->ps_epilog
.states
;
4110 unreachable("bad shader part");
4113 struct si_shader_context ctx
;
4114 si_llvm_context_init(&ctx
, sscreen
, compiler
,
4115 si_get_wave_size(sscreen
, type
, shader
.key
.as_ngg
,
4117 ctx
.shader
= &shader
;
4123 si_llvm_optimize_module(&ctx
);
4125 if (si_compile_llvm(sscreen
, &result
->binary
, &result
->config
, compiler
,
4126 ctx
.ac
.module
, debug
, ctx
.type
, ctx
.ac
.wave_size
,
4133 result
->next
= *list
;
4137 si_llvm_dispose(&ctx
);
4138 simple_mtx_unlock(&sscreen
->shader_parts_mutex
);
4143 * Build the vertex shader prolog function.
4145 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
4146 * All inputs are returned unmodified. The vertex load indices are
4147 * stored after them, which will be used by the API VS for fetching inputs.
4149 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
4154 * (VertexID + BaseVertex),
4155 * (InstanceID + StartInstance),
4156 * (InstanceID / 2 + StartInstance)
4158 static void si_build_vs_prolog_function(struct si_shader_context
*ctx
,
4159 union si_shader_part_key
*key
)
4161 LLVMTypeRef
*returns
;
4162 LLVMValueRef ret
, func
;
4164 unsigned first_vs_vgpr
= key
->vs_prolog
.num_merged_next_stage_vgprs
;
4165 unsigned num_input_vgprs
= key
->vs_prolog
.num_merged_next_stage_vgprs
+ 4;
4166 struct ac_arg input_sgpr_param
[key
->vs_prolog
.num_input_sgprs
];
4167 struct ac_arg input_vgpr_param
[9];
4168 LLVMValueRef input_vgprs
[9];
4169 unsigned num_all_input_regs
= key
->vs_prolog
.num_input_sgprs
+
4171 unsigned user_sgpr_base
= key
->vs_prolog
.num_merged_next_stage_vgprs
? 8 : 0;
4173 memset(&ctx
->args
, 0, sizeof(ctx
->args
));
4175 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
4176 returns
= alloca((num_all_input_regs
+ key
->vs_prolog
.num_inputs
) *
4177 sizeof(LLVMTypeRef
));
4180 /* Declare input and output SGPRs. */
4181 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
4182 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
,
4183 &input_sgpr_param
[i
]);
4184 returns
[num_returns
++] = ctx
->i32
;
4187 struct ac_arg merged_wave_info
= input_sgpr_param
[3];
4189 /* Preloaded VGPRs (outputs must be floats) */
4190 for (i
= 0; i
< num_input_vgprs
; i
++) {
4191 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &input_vgpr_param
[i
]);
4192 returns
[num_returns
++] = ctx
->f32
;
4195 /* Vertex load indices. */
4196 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++)
4197 returns
[num_returns
++] = ctx
->f32
;
4199 /* Create the function. */
4200 si_llvm_create_func(ctx
, "vs_prolog", returns
, num_returns
, 0);
4201 func
= ctx
->main_fn
;
4203 for (i
= 0; i
< num_input_vgprs
; i
++) {
4204 input_vgprs
[i
] = ac_get_arg(&ctx
->ac
, input_vgpr_param
[i
]);
4207 if (key
->vs_prolog
.num_merged_next_stage_vgprs
) {
4208 if (!key
->vs_prolog
.is_monolithic
)
4209 si_init_exec_from_input(ctx
, merged_wave_info
, 0);
4211 if (key
->vs_prolog
.as_ls
&&
4212 ctx
->screen
->info
.has_ls_vgpr_init_bug
) {
4213 /* If there are no HS threads, SPI loads the LS VGPRs
4214 * starting at VGPR 0. Shift them back to where they
4217 LLVMValueRef has_hs_threads
=
4218 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
,
4219 si_unpack_param(ctx
, input_sgpr_param
[3], 8, 8),
4222 for (i
= 4; i
> 0; --i
) {
4223 input_vgprs
[i
+ 1] =
4224 LLVMBuildSelect(ctx
->ac
.builder
, has_hs_threads
,
4226 input_vgprs
[i
- 1], "");
4231 unsigned vertex_id_vgpr
= first_vs_vgpr
;
4232 unsigned instance_id_vgpr
=
4233 ctx
->screen
->info
.chip_class
>= GFX10
?
4235 first_vs_vgpr
+ (key
->vs_prolog
.as_ls
? 2 : 1);
4237 ctx
->abi
.vertex_id
= input_vgprs
[vertex_id_vgpr
];
4238 ctx
->abi
.instance_id
= input_vgprs
[instance_id_vgpr
];
4240 /* InstanceID = VertexID >> 16;
4241 * VertexID = VertexID & 0xffff;
4243 if (key
->vs_prolog
.states
.unpack_instance_id_from_vertex_id
) {
4244 ctx
->abi
.instance_id
= LLVMBuildLShr(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
4245 LLVMConstInt(ctx
->i32
, 16, 0), "");
4246 ctx
->abi
.vertex_id
= LLVMBuildAnd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
4247 LLVMConstInt(ctx
->i32
, 0xffff, 0), "");
4250 /* Copy inputs to outputs. This should be no-op, as the registers match,
4251 * but it will prevent the compiler from overwriting them unintentionally.
4253 ret
= ctx
->return_value
;
4254 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
4255 LLVMValueRef p
= LLVMGetParam(func
, i
);
4256 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, i
, "");
4258 for (i
= 0; i
< num_input_vgprs
; i
++) {
4259 LLVMValueRef p
= input_vgprs
[i
];
4261 if (i
== vertex_id_vgpr
)
4262 p
= ctx
->abi
.vertex_id
;
4263 else if (i
== instance_id_vgpr
)
4264 p
= ctx
->abi
.instance_id
;
4266 p
= ac_to_float(&ctx
->ac
, p
);
4267 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
,
4268 key
->vs_prolog
.num_input_sgprs
+ i
, "");
4271 /* Compute vertex load indices from instance divisors. */
4272 LLVMValueRef instance_divisor_constbuf
= NULL
;
4274 if (key
->vs_prolog
.states
.instance_divisor_is_fetched
) {
4275 LLVMValueRef list
= si_prolog_get_rw_buffers(ctx
);
4276 LLVMValueRef buf_index
=
4277 LLVMConstInt(ctx
->i32
, SI_VS_CONST_INSTANCE_DIVISORS
, 0);
4278 instance_divisor_constbuf
=
4279 ac_build_load_to_sgpr(&ctx
->ac
, list
, buf_index
);
4282 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++) {
4283 bool divisor_is_one
=
4284 key
->vs_prolog
.states
.instance_divisor_is_one
& (1u << i
);
4285 bool divisor_is_fetched
=
4286 key
->vs_prolog
.states
.instance_divisor_is_fetched
& (1u << i
);
4287 LLVMValueRef index
= NULL
;
4289 if (divisor_is_one
) {
4290 index
= ctx
->abi
.instance_id
;
4291 } else if (divisor_is_fetched
) {
4292 LLVMValueRef udiv_factors
[4];
4294 for (unsigned j
= 0; j
< 4; j
++) {
4296 si_buffer_load_const(ctx
, instance_divisor_constbuf
,
4297 LLVMConstInt(ctx
->i32
, i
*16 + j
*4, 0));
4298 udiv_factors
[j
] = ac_to_integer(&ctx
->ac
, udiv_factors
[j
]);
4300 /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
4301 * Such InstanceID might not be achievable in a reasonable time though.
4303 index
= ac_build_fast_udiv_nuw(&ctx
->ac
, ctx
->abi
.instance_id
,
4304 udiv_factors
[0], udiv_factors
[1],
4305 udiv_factors
[2], udiv_factors
[3]);
4308 if (divisor_is_one
|| divisor_is_fetched
) {
4309 /* Add StartInstance. */
4310 index
= LLVMBuildAdd(ctx
->ac
.builder
, index
,
4311 LLVMGetParam(ctx
->main_fn
, user_sgpr_base
+
4312 SI_SGPR_START_INSTANCE
), "");
4314 /* VertexID + BaseVertex */
4315 index
= LLVMBuildAdd(ctx
->ac
.builder
,
4317 LLVMGetParam(func
, user_sgpr_base
+
4318 SI_SGPR_BASE_VERTEX
), "");
4321 index
= ac_to_float(&ctx
->ac
, index
);
4322 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, index
,
4323 ctx
->args
.arg_count
+ i
, "");
4326 si_llvm_build_ret(ctx
, ret
);
4329 static bool si_get_vs_prolog(struct si_screen
*sscreen
,
4330 struct ac_llvm_compiler
*compiler
,
4331 struct si_shader
*shader
,
4332 struct pipe_debug_callback
*debug
,
4333 struct si_shader
*main_part
,
4334 const struct si_vs_prolog_bits
*key
)
4336 struct si_shader_selector
*vs
= main_part
->selector
;
4338 if (!si_vs_needs_prolog(vs
, key
))
4341 /* Get the prolog. */
4342 union si_shader_part_key prolog_key
;
4343 si_get_vs_prolog_key(&vs
->info
, main_part
->info
.num_input_sgprs
,
4344 key
, shader
, &prolog_key
);
4347 si_get_shader_part(sscreen
, &sscreen
->vs_prologs
,
4348 PIPE_SHADER_VERTEX
, true, &prolog_key
, compiler
,
4349 debug
, si_build_vs_prolog_function
,
4350 "Vertex Shader Prolog");
4351 return shader
->prolog
!= NULL
;
4355 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
4357 static bool si_shader_select_vs_parts(struct si_screen
*sscreen
,
4358 struct ac_llvm_compiler
*compiler
,
4359 struct si_shader
*shader
,
4360 struct pipe_debug_callback
*debug
)
4362 return si_get_vs_prolog(sscreen
, compiler
, shader
, debug
, shader
,
4363 &shader
->key
.part
.vs
.prolog
);
4367 * Select and compile (or reuse) TCS parts (epilog).
4369 static bool si_shader_select_tcs_parts(struct si_screen
*sscreen
,
4370 struct ac_llvm_compiler
*compiler
,
4371 struct si_shader
*shader
,
4372 struct pipe_debug_callback
*debug
)
4374 if (sscreen
->info
.chip_class
>= GFX9
) {
4375 struct si_shader
*ls_main_part
=
4376 shader
->key
.part
.tcs
.ls
->main_shader_part_ls
;
4378 if (!si_get_vs_prolog(sscreen
, compiler
, shader
, debug
, ls_main_part
,
4379 &shader
->key
.part
.tcs
.ls_prolog
))
4382 shader
->previous_stage
= ls_main_part
;
4385 /* Get the epilog. */
4386 union si_shader_part_key epilog_key
;
4387 memset(&epilog_key
, 0, sizeof(epilog_key
));
4388 epilog_key
.tcs_epilog
.states
= shader
->key
.part
.tcs
.epilog
;
4390 shader
->epilog
= si_get_shader_part(sscreen
, &sscreen
->tcs_epilogs
,
4391 PIPE_SHADER_TESS_CTRL
, false,
4392 &epilog_key
, compiler
, debug
,
4393 si_llvm_build_tcs_epilog
,
4394 "Tessellation Control Shader Epilog");
4395 return shader
->epilog
!= NULL
;
4399 * Select and compile (or reuse) GS parts (prolog).
4401 static bool si_shader_select_gs_parts(struct si_screen
*sscreen
,
4402 struct ac_llvm_compiler
*compiler
,
4403 struct si_shader
*shader
,
4404 struct pipe_debug_callback
*debug
)
4406 if (sscreen
->info
.chip_class
>= GFX9
) {
4407 struct si_shader
*es_main_part
;
4408 enum pipe_shader_type es_type
= shader
->key
.part
.gs
.es
->type
;
4410 if (shader
->key
.as_ngg
)
4411 es_main_part
= shader
->key
.part
.gs
.es
->main_shader_part_ngg_es
;
4413 es_main_part
= shader
->key
.part
.gs
.es
->main_shader_part_es
;
4415 if (es_type
== PIPE_SHADER_VERTEX
&&
4416 !si_get_vs_prolog(sscreen
, compiler
, shader
, debug
, es_main_part
,
4417 &shader
->key
.part
.gs
.vs_prolog
))
4420 shader
->previous_stage
= es_main_part
;
4423 if (!shader
->key
.part
.gs
.prolog
.tri_strip_adj_fix
)
4426 union si_shader_part_key prolog_key
;
4427 memset(&prolog_key
, 0, sizeof(prolog_key
));
4428 prolog_key
.gs_prolog
.states
= shader
->key
.part
.gs
.prolog
;
4429 prolog_key
.gs_prolog
.as_ngg
= shader
->key
.as_ngg
;
4431 shader
->prolog2
= si_get_shader_part(sscreen
, &sscreen
->gs_prologs
,
4432 PIPE_SHADER_GEOMETRY
, true,
4433 &prolog_key
, compiler
, debug
,
4434 si_build_gs_prolog_function
,
4435 "Geometry Shader Prolog");
4436 return shader
->prolog2
!= NULL
;
4440 * Compute the PS prolog key, which contains all the information needed to
4441 * build the PS prolog function, and set related bits in shader->config.
4443 void si_get_ps_prolog_key(struct si_shader
*shader
,
4444 union si_shader_part_key
*key
,
4445 bool separate_prolog
)
4447 struct si_shader_info
*info
= &shader
->selector
->info
;
4449 memset(key
, 0, sizeof(*key
));
4450 key
->ps_prolog
.states
= shader
->key
.part
.ps
.prolog
;
4451 key
->ps_prolog
.colors_read
= info
->colors_read
;
4452 key
->ps_prolog
.num_input_sgprs
= shader
->info
.num_input_sgprs
;
4453 key
->ps_prolog
.num_input_vgprs
= shader
->info
.num_input_vgprs
;
4454 key
->ps_prolog
.wqm
= info
->uses_derivatives
&&
4455 (key
->ps_prolog
.colors_read
||
4456 key
->ps_prolog
.states
.force_persp_sample_interp
||
4457 key
->ps_prolog
.states
.force_linear_sample_interp
||
4458 key
->ps_prolog
.states
.force_persp_center_interp
||
4459 key
->ps_prolog
.states
.force_linear_center_interp
||
4460 key
->ps_prolog
.states
.bc_optimize_for_persp
||
4461 key
->ps_prolog
.states
.bc_optimize_for_linear
);
4462 key
->ps_prolog
.ancillary_vgpr_index
= shader
->info
.ancillary_vgpr_index
;
4464 if (info
->colors_read
) {
4465 unsigned *color
= shader
->selector
->color_attr_index
;
4467 if (shader
->key
.part
.ps
.prolog
.color_two_side
) {
4468 /* BCOLORs are stored after the last input. */
4469 key
->ps_prolog
.num_interp_inputs
= info
->num_inputs
;
4470 key
->ps_prolog
.face_vgpr_index
= shader
->info
.face_vgpr_index
;
4471 if (separate_prolog
)
4472 shader
->config
.spi_ps_input_ena
|= S_0286CC_FRONT_FACE_ENA(1);
4475 for (unsigned i
= 0; i
< 2; i
++) {
4476 unsigned interp
= info
->input_interpolate
[color
[i
]];
4477 unsigned location
= info
->input_interpolate_loc
[color
[i
]];
4479 if (!(info
->colors_read
& (0xf << i
*4)))
4482 key
->ps_prolog
.color_attr_index
[i
] = color
[i
];
4484 if (shader
->key
.part
.ps
.prolog
.flatshade_colors
&&
4485 interp
== TGSI_INTERPOLATE_COLOR
)
4486 interp
= TGSI_INTERPOLATE_CONSTANT
;
4489 case TGSI_INTERPOLATE_CONSTANT
:
4490 key
->ps_prolog
.color_interp_vgpr_index
[i
] = -1;
4492 case TGSI_INTERPOLATE_PERSPECTIVE
:
4493 case TGSI_INTERPOLATE_COLOR
:
4494 /* Force the interpolation location for colors here. */
4495 if (shader
->key
.part
.ps
.prolog
.force_persp_sample_interp
)
4496 location
= TGSI_INTERPOLATE_LOC_SAMPLE
;
4497 if (shader
->key
.part
.ps
.prolog
.force_persp_center_interp
)
4498 location
= TGSI_INTERPOLATE_LOC_CENTER
;
4501 case TGSI_INTERPOLATE_LOC_SAMPLE
:
4502 key
->ps_prolog
.color_interp_vgpr_index
[i
] = 0;
4503 if (separate_prolog
) {
4504 shader
->config
.spi_ps_input_ena
|=
4505 S_0286CC_PERSP_SAMPLE_ENA(1);
4508 case TGSI_INTERPOLATE_LOC_CENTER
:
4509 key
->ps_prolog
.color_interp_vgpr_index
[i
] = 2;
4510 if (separate_prolog
) {
4511 shader
->config
.spi_ps_input_ena
|=
4512 S_0286CC_PERSP_CENTER_ENA(1);
4515 case TGSI_INTERPOLATE_LOC_CENTROID
:
4516 key
->ps_prolog
.color_interp_vgpr_index
[i
] = 4;
4517 if (separate_prolog
) {
4518 shader
->config
.spi_ps_input_ena
|=
4519 S_0286CC_PERSP_CENTROID_ENA(1);
4526 case TGSI_INTERPOLATE_LINEAR
:
4527 /* Force the interpolation location for colors here. */
4528 if (shader
->key
.part
.ps
.prolog
.force_linear_sample_interp
)
4529 location
= TGSI_INTERPOLATE_LOC_SAMPLE
;
4530 if (shader
->key
.part
.ps
.prolog
.force_linear_center_interp
)
4531 location
= TGSI_INTERPOLATE_LOC_CENTER
;
4533 /* The VGPR assignment for non-monolithic shaders
4534 * works because InitialPSInputAddr is set on the
4535 * main shader and PERSP_PULL_MODEL is never used.
4538 case TGSI_INTERPOLATE_LOC_SAMPLE
:
4539 key
->ps_prolog
.color_interp_vgpr_index
[i
] =
4540 separate_prolog
? 6 : 9;
4541 if (separate_prolog
) {
4542 shader
->config
.spi_ps_input_ena
|=
4543 S_0286CC_LINEAR_SAMPLE_ENA(1);
4546 case TGSI_INTERPOLATE_LOC_CENTER
:
4547 key
->ps_prolog
.color_interp_vgpr_index
[i
] =
4548 separate_prolog
? 8 : 11;
4549 if (separate_prolog
) {
4550 shader
->config
.spi_ps_input_ena
|=
4551 S_0286CC_LINEAR_CENTER_ENA(1);
4554 case TGSI_INTERPOLATE_LOC_CENTROID
:
4555 key
->ps_prolog
.color_interp_vgpr_index
[i
] =
4556 separate_prolog
? 10 : 13;
4557 if (separate_prolog
) {
4558 shader
->config
.spi_ps_input_ena
|=
4559 S_0286CC_LINEAR_CENTROID_ENA(1);
4574 * Check whether a PS prolog is required based on the key.
4576 bool si_need_ps_prolog(const union si_shader_part_key
*key
)
4578 return key
->ps_prolog
.colors_read
||
4579 key
->ps_prolog
.states
.force_persp_sample_interp
||
4580 key
->ps_prolog
.states
.force_linear_sample_interp
||
4581 key
->ps_prolog
.states
.force_persp_center_interp
||
4582 key
->ps_prolog
.states
.force_linear_center_interp
||
4583 key
->ps_prolog
.states
.bc_optimize_for_persp
||
4584 key
->ps_prolog
.states
.bc_optimize_for_linear
||
4585 key
->ps_prolog
.states
.poly_stipple
||
4586 key
->ps_prolog
.states
.samplemask_log_ps_iter
;
4590 * Compute the PS epilog key, which contains all the information needed to
4591 * build the PS epilog function.
4593 void si_get_ps_epilog_key(struct si_shader
*shader
,
4594 union si_shader_part_key
*key
)
4596 struct si_shader_info
*info
= &shader
->selector
->info
;
4597 memset(key
, 0, sizeof(*key
));
4598 key
->ps_epilog
.colors_written
= info
->colors_written
;
4599 key
->ps_epilog
.writes_z
= info
->writes_z
;
4600 key
->ps_epilog
.writes_stencil
= info
->writes_stencil
;
4601 key
->ps_epilog
.writes_samplemask
= info
->writes_samplemask
;
4602 key
->ps_epilog
.states
= shader
->key
.part
.ps
.epilog
;
4606 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
4608 static bool si_shader_select_ps_parts(struct si_screen
*sscreen
,
4609 struct ac_llvm_compiler
*compiler
,
4610 struct si_shader
*shader
,
4611 struct pipe_debug_callback
*debug
)
4613 union si_shader_part_key prolog_key
;
4614 union si_shader_part_key epilog_key
;
4616 /* Get the prolog. */
4617 si_get_ps_prolog_key(shader
, &prolog_key
, true);
4619 /* The prolog is a no-op if these aren't set. */
4620 if (si_need_ps_prolog(&prolog_key
)) {
4622 si_get_shader_part(sscreen
, &sscreen
->ps_prologs
,
4623 PIPE_SHADER_FRAGMENT
, true,
4624 &prolog_key
, compiler
, debug
,
4625 si_llvm_build_ps_prolog
,
4626 "Fragment Shader Prolog");
4627 if (!shader
->prolog
)
4631 /* Get the epilog. */
4632 si_get_ps_epilog_key(shader
, &epilog_key
);
4635 si_get_shader_part(sscreen
, &sscreen
->ps_epilogs
,
4636 PIPE_SHADER_FRAGMENT
, false,
4637 &epilog_key
, compiler
, debug
,
4638 si_llvm_build_ps_epilog
,
4639 "Fragment Shader Epilog");
4640 if (!shader
->epilog
)
4643 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
4644 if (shader
->key
.part
.ps
.prolog
.poly_stipple
) {
4645 shader
->config
.spi_ps_input_ena
|= S_0286CC_POS_FIXED_PT_ENA(1);
4646 assert(G_0286CC_POS_FIXED_PT_ENA(shader
->config
.spi_ps_input_addr
));
4649 /* Set up the enable bits for per-sample shading if needed. */
4650 if (shader
->key
.part
.ps
.prolog
.force_persp_sample_interp
&&
4651 (G_0286CC_PERSP_CENTER_ENA(shader
->config
.spi_ps_input_ena
) ||
4652 G_0286CC_PERSP_CENTROID_ENA(shader
->config
.spi_ps_input_ena
))) {
4653 shader
->config
.spi_ps_input_ena
&= C_0286CC_PERSP_CENTER_ENA
;
4654 shader
->config
.spi_ps_input_ena
&= C_0286CC_PERSP_CENTROID_ENA
;
4655 shader
->config
.spi_ps_input_ena
|= S_0286CC_PERSP_SAMPLE_ENA(1);
4657 if (shader
->key
.part
.ps
.prolog
.force_linear_sample_interp
&&
4658 (G_0286CC_LINEAR_CENTER_ENA(shader
->config
.spi_ps_input_ena
) ||
4659 G_0286CC_LINEAR_CENTROID_ENA(shader
->config
.spi_ps_input_ena
))) {
4660 shader
->config
.spi_ps_input_ena
&= C_0286CC_LINEAR_CENTER_ENA
;
4661 shader
->config
.spi_ps_input_ena
&= C_0286CC_LINEAR_CENTROID_ENA
;
4662 shader
->config
.spi_ps_input_ena
|= S_0286CC_LINEAR_SAMPLE_ENA(1);
4664 if (shader
->key
.part
.ps
.prolog
.force_persp_center_interp
&&
4665 (G_0286CC_PERSP_SAMPLE_ENA(shader
->config
.spi_ps_input_ena
) ||
4666 G_0286CC_PERSP_CENTROID_ENA(shader
->config
.spi_ps_input_ena
))) {
4667 shader
->config
.spi_ps_input_ena
&= C_0286CC_PERSP_SAMPLE_ENA
;
4668 shader
->config
.spi_ps_input_ena
&= C_0286CC_PERSP_CENTROID_ENA
;
4669 shader
->config
.spi_ps_input_ena
|= S_0286CC_PERSP_CENTER_ENA(1);
4671 if (shader
->key
.part
.ps
.prolog
.force_linear_center_interp
&&
4672 (G_0286CC_LINEAR_SAMPLE_ENA(shader
->config
.spi_ps_input_ena
) ||
4673 G_0286CC_LINEAR_CENTROID_ENA(shader
->config
.spi_ps_input_ena
))) {
4674 shader
->config
.spi_ps_input_ena
&= C_0286CC_LINEAR_SAMPLE_ENA
;
4675 shader
->config
.spi_ps_input_ena
&= C_0286CC_LINEAR_CENTROID_ENA
;
4676 shader
->config
.spi_ps_input_ena
|= S_0286CC_LINEAR_CENTER_ENA(1);
4679 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
4680 if (G_0286CC_POS_W_FLOAT_ENA(shader
->config
.spi_ps_input_ena
) &&
4681 !(shader
->config
.spi_ps_input_ena
& 0xf)) {
4682 shader
->config
.spi_ps_input_ena
|= S_0286CC_PERSP_CENTER_ENA(1);
4683 assert(G_0286CC_PERSP_CENTER_ENA(shader
->config
.spi_ps_input_addr
));
4686 /* At least one pair of interpolation weights must be enabled. */
4687 if (!(shader
->config
.spi_ps_input_ena
& 0x7f)) {
4688 shader
->config
.spi_ps_input_ena
|= S_0286CC_LINEAR_CENTER_ENA(1);
4689 assert(G_0286CC_LINEAR_CENTER_ENA(shader
->config
.spi_ps_input_addr
));
4692 /* Samplemask fixup requires the sample ID. */
4693 if (shader
->key
.part
.ps
.prolog
.samplemask_log_ps_iter
) {
4694 shader
->config
.spi_ps_input_ena
|= S_0286CC_ANCILLARY_ENA(1);
4695 assert(G_0286CC_ANCILLARY_ENA(shader
->config
.spi_ps_input_addr
));
4698 /* The sample mask input is always enabled, because the API shader always
4699 * passes it through to the epilog. Disable it here if it's unused.
4701 if (!shader
->key
.part
.ps
.epilog
.poly_line_smoothing
&&
4702 !shader
->selector
->info
.reads_samplemask
)
4703 shader
->config
.spi_ps_input_ena
&= C_0286CC_SAMPLE_COVERAGE_ENA
;
4708 void si_multiwave_lds_size_workaround(struct si_screen
*sscreen
,
4711 /* If tessellation is all offchip and on-chip GS isn't used, this
4712 * workaround is not needed.
4716 /* SPI barrier management bug:
4717 * Make sure we have at least 4k of LDS in use to avoid the bug.
4718 * It applies to workgroup sizes of more than one wavefront.
4720 if (sscreen
->info
.family
== CHIP_BONAIRE
||
4721 sscreen
->info
.family
== CHIP_KABINI
)
4722 *lds_size
= MAX2(*lds_size
, 8);
4725 static void si_fix_resource_usage(struct si_screen
*sscreen
,
4726 struct si_shader
*shader
)
4728 unsigned min_sgprs
= shader
->info
.num_input_sgprs
+ 2; /* VCC */
4730 shader
->config
.num_sgprs
= MAX2(shader
->config
.num_sgprs
, min_sgprs
);
4732 if (shader
->selector
->type
== PIPE_SHADER_COMPUTE
&&
4733 si_get_max_workgroup_size(shader
) > sscreen
->compute_wave_size
) {
4734 si_multiwave_lds_size_workaround(sscreen
,
4735 &shader
->config
.lds_size
);
4739 bool si_create_shader_variant(struct si_screen
*sscreen
,
4740 struct ac_llvm_compiler
*compiler
,
4741 struct si_shader
*shader
,
4742 struct pipe_debug_callback
*debug
)
4744 struct si_shader_selector
*sel
= shader
->selector
;
4745 struct si_shader
*mainp
= *si_get_main_shader_part(sel
, &shader
->key
);
4748 /* LS, ES, VS are compiled on demand if the main part hasn't been
4749 * compiled for that stage.
4751 * GS are compiled on demand if the main part hasn't been compiled
4752 * for the chosen NGG-ness.
4754 * Vertex shaders are compiled on demand when a vertex fetch
4755 * workaround must be applied.
4757 if (shader
->is_monolithic
) {
4758 /* Monolithic shader (compiled as a whole, has many variants,
4759 * may take a long time to compile).
4761 r
= si_compile_shader(sscreen
, compiler
, shader
, debug
);
4765 /* The shader consists of several parts:
4767 * - the middle part is the user shader, it has 1 variant only
4768 * and it was compiled during the creation of the shader
4770 * - the prolog part is inserted at the beginning
4771 * - the epilog part is inserted at the end
4773 * The prolog and epilog have many (but simple) variants.
4775 * Starting with gfx9, geometry and tessellation control
4776 * shaders also contain the prolog and user shader parts of
4777 * the previous shader stage.
4783 /* Copy the compiled shader data over. */
4784 shader
->is_binary_shared
= true;
4785 shader
->binary
= mainp
->binary
;
4786 shader
->config
= mainp
->config
;
4787 shader
->info
.num_input_sgprs
= mainp
->info
.num_input_sgprs
;
4788 shader
->info
.num_input_vgprs
= mainp
->info
.num_input_vgprs
;
4789 shader
->info
.face_vgpr_index
= mainp
->info
.face_vgpr_index
;
4790 shader
->info
.ancillary_vgpr_index
= mainp
->info
.ancillary_vgpr_index
;
4791 memcpy(shader
->info
.vs_output_param_offset
,
4792 mainp
->info
.vs_output_param_offset
,
4793 sizeof(mainp
->info
.vs_output_param_offset
));
4794 shader
->info
.uses_instanceid
= mainp
->info
.uses_instanceid
;
4795 shader
->info
.nr_pos_exports
= mainp
->info
.nr_pos_exports
;
4796 shader
->info
.nr_param_exports
= mainp
->info
.nr_param_exports
;
4798 /* Select prologs and/or epilogs. */
4799 switch (sel
->type
) {
4800 case PIPE_SHADER_VERTEX
:
4801 if (!si_shader_select_vs_parts(sscreen
, compiler
, shader
, debug
))
4804 case PIPE_SHADER_TESS_CTRL
:
4805 if (!si_shader_select_tcs_parts(sscreen
, compiler
, shader
, debug
))
4808 case PIPE_SHADER_TESS_EVAL
:
4810 case PIPE_SHADER_GEOMETRY
:
4811 if (!si_shader_select_gs_parts(sscreen
, compiler
, shader
, debug
))
4814 case PIPE_SHADER_FRAGMENT
:
4815 if (!si_shader_select_ps_parts(sscreen
, compiler
, shader
, debug
))
4818 /* Make sure we have at least as many VGPRs as there
4819 * are allocated inputs.
4821 shader
->config
.num_vgprs
= MAX2(shader
->config
.num_vgprs
,
4822 shader
->info
.num_input_vgprs
);
4827 /* Update SGPR and VGPR counts. */
4828 if (shader
->prolog
) {
4829 shader
->config
.num_sgprs
= MAX2(shader
->config
.num_sgprs
,
4830 shader
->prolog
->config
.num_sgprs
);
4831 shader
->config
.num_vgprs
= MAX2(shader
->config
.num_vgprs
,
4832 shader
->prolog
->config
.num_vgprs
);
4834 if (shader
->previous_stage
) {
4835 shader
->config
.num_sgprs
= MAX2(shader
->config
.num_sgprs
,
4836 shader
->previous_stage
->config
.num_sgprs
);
4837 shader
->config
.num_vgprs
= MAX2(shader
->config
.num_vgprs
,
4838 shader
->previous_stage
->config
.num_vgprs
);
4839 shader
->config
.spilled_sgprs
=
4840 MAX2(shader
->config
.spilled_sgprs
,
4841 shader
->previous_stage
->config
.spilled_sgprs
);
4842 shader
->config
.spilled_vgprs
=
4843 MAX2(shader
->config
.spilled_vgprs
,
4844 shader
->previous_stage
->config
.spilled_vgprs
);
4845 shader
->info
.private_mem_vgprs
=
4846 MAX2(shader
->info
.private_mem_vgprs
,
4847 shader
->previous_stage
->info
.private_mem_vgprs
);
4848 shader
->config
.scratch_bytes_per_wave
=
4849 MAX2(shader
->config
.scratch_bytes_per_wave
,
4850 shader
->previous_stage
->config
.scratch_bytes_per_wave
);
4851 shader
->info
.uses_instanceid
|=
4852 shader
->previous_stage
->info
.uses_instanceid
;
4854 if (shader
->prolog2
) {
4855 shader
->config
.num_sgprs
= MAX2(shader
->config
.num_sgprs
,
4856 shader
->prolog2
->config
.num_sgprs
);
4857 shader
->config
.num_vgprs
= MAX2(shader
->config
.num_vgprs
,
4858 shader
->prolog2
->config
.num_vgprs
);
4860 if (shader
->epilog
) {
4861 shader
->config
.num_sgprs
= MAX2(shader
->config
.num_sgprs
,
4862 shader
->epilog
->config
.num_sgprs
);
4863 shader
->config
.num_vgprs
= MAX2(shader
->config
.num_vgprs
,
4864 shader
->epilog
->config
.num_vgprs
);
4866 si_calculate_max_simd_waves(shader
);
4869 if (shader
->key
.as_ngg
) {
4870 assert(!shader
->key
.as_es
&& !shader
->key
.as_ls
);
4871 gfx10_ngg_calculate_subgroup_info(shader
);
4872 } else if (sscreen
->info
.chip_class
>= GFX9
&& sel
->type
== PIPE_SHADER_GEOMETRY
) {
4873 gfx9_get_gs_info(shader
->previous_stage_sel
, sel
, &shader
->gs_info
);
4876 si_fix_resource_usage(sscreen
, shader
);
4877 si_shader_dump(sscreen
, shader
, debug
, stderr
, true);
4880 if (!si_shader_binary_upload(sscreen
, shader
, 0)) {
4881 fprintf(stderr
, "LLVM failed to upload shader\n");
4888 void si_shader_destroy(struct si_shader
*shader
)
4890 if (shader
->scratch_bo
)
4891 si_resource_reference(&shader
->scratch_bo
, NULL
);
4893 si_resource_reference(&shader
->bo
, NULL
);
4895 if (!shader
->is_binary_shared
)
4896 si_shader_binary_clean(&shader
->binary
);
4898 free(shader
->shader_log
);