/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "si_shader_internal.h"
#include "util/u_memory.h"
30 static LLVMValueRef
unpack_sint16(struct si_shader_context
*ctx
, LLVMValueRef i32
, unsigned index
)
35 return LLVMBuildAShr(ctx
->ac
.builder
, i32
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
37 return LLVMBuildSExt(ctx
->ac
.builder
, LLVMBuildTrunc(ctx
->ac
.builder
, i32
, ctx
->ac
.i16
, ""),
41 static void load_input_vs(struct si_shader_context
*ctx
, unsigned input_index
, LLVMValueRef out
[4])
43 const struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
44 unsigned vs_blit_property
= info
->properties
[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD
];
46 if (vs_blit_property
) {
47 LLVMValueRef vertex_id
= ctx
->abi
.vertex_id
;
49 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULE
, vertex_id
, ctx
->ac
.i32_1
, "");
50 /* Use LLVMIntNE, because we have 3 vertices and only
51 * the middle one should use y2.
53 LLVMValueRef sel_y1
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
, vertex_id
, ctx
->ac
.i32_1
, "");
55 unsigned param_vs_blit_inputs
= ctx
->vs_blit_inputs
.arg_index
;
56 if (input_index
== 0) {
58 LLVMValueRef x1y1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
);
59 LLVMValueRef x2y2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 1);
61 LLVMValueRef x1
= unpack_sint16(ctx
, x1y1
, 0);
62 LLVMValueRef y1
= unpack_sint16(ctx
, x1y1
, 1);
63 LLVMValueRef x2
= unpack_sint16(ctx
, x2y2
, 0);
64 LLVMValueRef y2
= unpack_sint16(ctx
, x2y2
, 1);
66 LLVMValueRef x
= LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
, x1
, x2
, "");
67 LLVMValueRef y
= LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
, y1
, y2
, "");
69 out
[0] = LLVMBuildSIToFP(ctx
->ac
.builder
, x
, ctx
->ac
.f32
, "");
70 out
[1] = LLVMBuildSIToFP(ctx
->ac
.builder
, y
, ctx
->ac
.f32
, "");
71 out
[2] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 2);
72 out
[3] = ctx
->ac
.f32_1
;
76 /* Color or texture coordinates: */
77 assert(input_index
== 1);
79 if (vs_blit_property
== SI_VS_BLIT_SGPRS_POS_COLOR
) {
80 for (int i
= 0; i
< 4; i
++) {
81 out
[i
] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 3 + i
);
84 assert(vs_blit_property
== SI_VS_BLIT_SGPRS_POS_TEXCOORD
);
85 LLVMValueRef x1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 3);
86 LLVMValueRef y1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 4);
87 LLVMValueRef x2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 5);
88 LLVMValueRef y2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 6);
90 out
[0] = LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
, x1
, x2
, "");
91 out
[1] = LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
, y1
, y2
, "");
92 out
[2] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 7);
93 out
[3] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 8);
98 unsigned num_vbos_in_user_sgprs
= ctx
->shader
->selector
->num_vbos_in_user_sgprs
;
99 union si_vs_fix_fetch fix_fetch
;
100 LLVMValueRef vb_desc
;
101 LLVMValueRef vertex_index
;
104 if (input_index
< num_vbos_in_user_sgprs
) {
105 vb_desc
= ac_get_arg(&ctx
->ac
, ctx
->vb_descriptors
[input_index
]);
107 unsigned index
= input_index
- num_vbos_in_user_sgprs
;
108 vb_desc
= ac_build_load_to_sgpr(&ctx
->ac
, ac_get_arg(&ctx
->ac
, ctx
->vertex_buffers
),
109 LLVMConstInt(ctx
->ac
.i32
, index
, 0));
112 vertex_index
= LLVMGetParam(ctx
->main_fn
, ctx
->vertex_index0
.arg_index
+ input_index
);
114 /* Use the open-coded implementation for all loads of doubles and
115 * of dword-sized data that needs fixups. We need to insert conversion
116 * code anyway, and the amd/common code does it for us.
118 * Note: On LLVM <= 8, we can only open-code formats with
119 * channel size >= 4 bytes.
121 bool opencode
= ctx
->shader
->key
.mono
.vs_fetch_opencode
& (1 << input_index
);
122 fix_fetch
.bits
= ctx
->shader
->key
.mono
.vs_fix_fetch
[input_index
].bits
;
123 if (opencode
|| (fix_fetch
.u
.log_size
== 3 && fix_fetch
.u
.format
== AC_FETCH_FORMAT_FLOAT
) ||
124 (fix_fetch
.u
.log_size
== 2)) {
125 tmp
= ac_build_opencoded_load_format(&ctx
->ac
, fix_fetch
.u
.log_size
,
126 fix_fetch
.u
.num_channels_m1
+ 1, fix_fetch
.u
.format
,
127 fix_fetch
.u
.reverse
, !opencode
, vb_desc
, vertex_index
,
128 ctx
->ac
.i32_0
, ctx
->ac
.i32_0
, 0, true);
129 for (unsigned i
= 0; i
< 4; ++i
)
131 LLVMBuildExtractElement(ctx
->ac
.builder
, tmp
, LLVMConstInt(ctx
->ac
.i32
, i
, false), "");
135 /* Do multiple loads for special formats. */
136 unsigned required_channels
= util_last_bit(info
->input_usage_mask
[input_index
]);
137 LLVMValueRef fetches
[4];
138 unsigned num_fetches
;
139 unsigned fetch_stride
;
140 unsigned channels_per_fetch
;
142 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2) {
143 num_fetches
= MIN2(required_channels
, 3);
144 fetch_stride
= 1 << fix_fetch
.u
.log_size
;
145 channels_per_fetch
= 1;
149 channels_per_fetch
= required_channels
;
152 for (unsigned i
= 0; i
< num_fetches
; ++i
) {
153 LLVMValueRef voffset
= LLVMConstInt(ctx
->ac
.i32
, fetch_stride
* i
, 0);
154 fetches
[i
] = ac_build_buffer_load_format(&ctx
->ac
, vb_desc
, vertex_index
, voffset
,
155 channels_per_fetch
, 0, true, false);
158 if (num_fetches
== 1 && channels_per_fetch
> 1) {
159 LLVMValueRef fetch
= fetches
[0];
160 for (unsigned i
= 0; i
< channels_per_fetch
; ++i
) {
161 tmp
= LLVMConstInt(ctx
->ac
.i32
, i
, false);
162 fetches
[i
] = LLVMBuildExtractElement(ctx
->ac
.builder
, fetch
, tmp
, "");
164 num_fetches
= channels_per_fetch
;
165 channels_per_fetch
= 1;
168 for (unsigned i
= num_fetches
; i
< 4; ++i
)
169 fetches
[i
] = LLVMGetUndef(ctx
->ac
.f32
);
171 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2 && required_channels
== 4) {
172 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_UINT
|| fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
)
173 fetches
[3] = ctx
->ac
.i32_1
;
175 fetches
[3] = ctx
->ac
.f32_1
;
176 } else if (fix_fetch
.u
.log_size
== 3 &&
177 (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
||
178 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
||
179 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
) &&
180 required_channels
== 4) {
181 /* For 2_10_10_10, the hardware returns an unsigned value;
182 * convert it to a signed one.
184 LLVMValueRef tmp
= fetches
[3];
185 LLVMValueRef c30
= LLVMConstInt(ctx
->ac
.i32
, 30, 0);
187 /* First, recover the sign-extended signed integer value. */
188 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
)
189 tmp
= LLVMBuildFPToUI(ctx
->ac
.builder
, tmp
, ctx
->ac
.i32
, "");
191 tmp
= ac_to_integer(&ctx
->ac
, tmp
);
193 /* For the integer-like cases, do a natural sign extension.
195 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
196 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
200 ctx
->ac
.builder
, tmp
,
201 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
? LLVMConstInt(ctx
->ac
.i32
, 7, 0) : c30
, "");
202 tmp
= LLVMBuildAShr(ctx
->ac
.builder
, tmp
, c30
, "");
204 /* Convert back to the right type. */
205 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
) {
207 LLVMValueRef neg_one
= LLVMConstReal(ctx
->ac
.f32
, -1.0);
208 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
209 clamp
= LLVMBuildFCmp(ctx
->ac
.builder
, LLVMRealULT
, tmp
, neg_one
, "");
210 tmp
= LLVMBuildSelect(ctx
->ac
.builder
, clamp
, neg_one
, tmp
, "");
211 } else if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
) {
212 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
218 for (unsigned i
= 0; i
< 4; ++i
)
219 out
[i
] = ac_to_float(&ctx
->ac
, fetches
[i
]);
222 static void declare_input_vs(struct si_shader_context
*ctx
, unsigned input_index
)
224 LLVMValueRef input
[4];
226 load_input_vs(ctx
, input_index
/ 4, input
);
228 for (unsigned chan
= 0; chan
< 4; chan
++) {
229 ctx
->inputs
[input_index
+ chan
] =
230 LLVMBuildBitCast(ctx
->ac
.builder
, input
[chan
], ctx
->ac
.i32
, "");
234 void si_llvm_load_vs_inputs(struct si_shader_context
*ctx
, struct nir_shader
*nir
)
236 uint64_t processed_inputs
= 0;
238 nir_foreach_shader_in_variable (variable
, nir
) {
239 unsigned attrib_count
= glsl_count_attribute_slots(variable
->type
, true);
240 unsigned input_idx
= variable
->data
.driver_location
;
241 unsigned loc
= variable
->data
.location
;
243 for (unsigned i
= 0; i
< attrib_count
; i
++) {
244 /* Packed components share the same location so skip
245 * them if we have already processed the location.
247 if (processed_inputs
& ((uint64_t)1 << (loc
+ i
))) {
252 declare_input_vs(ctx
, input_idx
);
253 if (glsl_type_is_dual_slot(variable
->type
)) {
255 declare_input_vs(ctx
, input_idx
);
258 processed_inputs
|= ((uint64_t)1 << (loc
+ i
));
264 void si_llvm_streamout_store_output(struct si_shader_context
*ctx
, LLVMValueRef
const *so_buffers
,
265 LLVMValueRef
const *so_write_offsets
,
266 struct pipe_stream_output
*stream_out
,
267 struct si_shader_output_values
*shader_out
)
269 unsigned buf_idx
= stream_out
->output_buffer
;
270 unsigned start
= stream_out
->start_component
;
271 unsigned num_comps
= stream_out
->num_components
;
274 assert(num_comps
&& num_comps
<= 4);
275 if (!num_comps
|| num_comps
> 4)
278 /* Load the output as int. */
279 for (int j
= 0; j
< num_comps
; j
++) {
280 assert(stream_out
->stream
== shader_out
->vertex_stream
[start
+ j
]);
282 out
[j
] = ac_to_integer(&ctx
->ac
, shader_out
->values
[start
+ j
]);
285 /* Pack the output. */
286 LLVMValueRef vdata
= NULL
;
292 case 2: /* as v2i32 */
293 case 3: /* as v3i32 */
294 if (ac_has_vec3_support(ctx
->screen
->info
.chip_class
, false)) {
295 vdata
= ac_build_gather_values(&ctx
->ac
, out
, num_comps
);
298 /* as v4i32 (aligned to 4) */
299 out
[3] = LLVMGetUndef(ctx
->ac
.i32
);
301 case 4: /* as v4i32 */
302 vdata
= ac_build_gather_values(&ctx
->ac
, out
, util_next_power_of_two(num_comps
));
306 ac_build_buffer_store_dword(&ctx
->ac
, so_buffers
[buf_idx
], vdata
, num_comps
,
307 so_write_offsets
[buf_idx
], ctx
->ac
.i32_0
, stream_out
->dst_offset
* 4,
312 * Write streamout data to buffers for vertex stream @p stream (different
313 * vertex streams can occur for GS copy shaders).
315 void si_llvm_emit_streamout(struct si_shader_context
*ctx
, struct si_shader_output_values
*outputs
,
316 unsigned noutput
, unsigned stream
)
318 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
319 struct pipe_stream_output_info
*so
= &sel
->so
;
320 LLVMBuilderRef builder
= ctx
->ac
.builder
;
323 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
324 LLVMValueRef so_vtx_count
= si_unpack_param(ctx
, ctx
->streamout_config
, 16, 7);
326 LLVMValueRef tid
= ac_get_thread_id(&ctx
->ac
);
328 /* can_emit = tid < so_vtx_count; */
329 LLVMValueRef can_emit
= LLVMBuildICmp(builder
, LLVMIntULT
, tid
, so_vtx_count
, "");
331 /* Emit the streamout code conditionally. This actually avoids
332 * out-of-bounds buffer access. The hw tells us via the SGPR
333 * (so_vtx_count) which threads are allowed to emit streamout data. */
334 ac_build_ifcc(&ctx
->ac
, can_emit
, 6501);
336 /* The buffer offset is computed as follows:
337 * ByteOffset = streamout_offset[buffer_id]*4 +
338 * (streamout_write_index + thread_id)*stride[buffer_id] +
342 LLVMValueRef so_write_index
= ac_get_arg(&ctx
->ac
, ctx
->streamout_write_index
);
344 /* Compute (streamout_write_index + thread_id). */
345 so_write_index
= LLVMBuildAdd(builder
, so_write_index
, tid
, "");
347 /* Load the descriptor and compute the write offset for each
349 LLVMValueRef so_write_offset
[4] = {};
350 LLVMValueRef so_buffers
[4];
351 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
353 for (i
= 0; i
< 4; i
++) {
357 LLVMValueRef offset
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_STREAMOUT_BUF0
+ i
, 0);
359 so_buffers
[i
] = ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
361 LLVMValueRef so_offset
= ac_get_arg(&ctx
->ac
, ctx
->streamout_offset
[i
]);
362 so_offset
= LLVMBuildMul(builder
, so_offset
, LLVMConstInt(ctx
->ac
.i32
, 4, 0), "");
364 so_write_offset
[i
] = ac_build_imad(
365 &ctx
->ac
, so_write_index
, LLVMConstInt(ctx
->ac
.i32
, so
->stride
[i
] * 4, 0), so_offset
);
368 /* Write streamout data. */
369 for (i
= 0; i
< so
->num_outputs
; i
++) {
370 unsigned reg
= so
->output
[i
].register_index
;
375 if (stream
!= so
->output
[i
].stream
)
378 si_llvm_streamout_store_output(ctx
, so_buffers
, so_write_offset
, &so
->output
[i
],
382 ac_build_endif(&ctx
->ac
, 6501);
385 static void si_llvm_emit_clipvertex(struct si_shader_context
*ctx
, struct ac_export_args
*pos
,
386 LLVMValueRef
*out_elts
)
391 LLVMValueRef base_elt
;
392 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
393 LLVMValueRef constbuf_index
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_CONST_CLIP_PLANES
, 0);
394 LLVMValueRef const_resource
= ac_build_load_to_sgpr(&ctx
->ac
, ptr
, constbuf_index
);
396 for (reg_index
= 0; reg_index
< 2; reg_index
++) {
397 struct ac_export_args
*args
= &pos
[2 + reg_index
];
399 args
->out
[0] = args
->out
[1] = args
->out
[2] = args
->out
[3] = LLVMConstReal(ctx
->ac
.f32
, 0.0f
);
401 /* Compute dot products of position and user clip plane vectors */
402 for (chan
= 0; chan
< 4; chan
++) {
403 for (const_chan
= 0; const_chan
< 4; const_chan
++) {
405 LLVMConstInt(ctx
->ac
.i32
, ((reg_index
* 4 + chan
) * 4 + const_chan
) * 4, 0);
406 base_elt
= si_buffer_load_const(ctx
, const_resource
, addr
);
408 ac_build_fmad(&ctx
->ac
, base_elt
, out_elts
[const_chan
], args
->out
[chan
]);
412 args
->enabled_channels
= 0xf;
413 args
->valid_mask
= 0;
415 args
->target
= V_008DFC_SQ_EXP_POS
+ 2 + reg_index
;
420 /* Initialize arguments for the shader export intrinsic */
421 static void si_llvm_init_vs_export_args(struct si_shader_context
*ctx
, LLVMValueRef
*values
,
422 unsigned target
, struct ac_export_args
*args
)
424 args
->enabled_channels
= 0xf; /* writemask - default is 0xf */
425 args
->valid_mask
= 0; /* Specify whether the EXEC mask represents the valid mask */
426 args
->done
= 0; /* Specify whether this is the last export */
427 args
->target
= target
; /* Specify the target we are exporting */
430 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
433 static void si_export_param(struct si_shader_context
*ctx
, unsigned index
, LLVMValueRef
*values
)
435 struct ac_export_args args
;
437 si_llvm_init_vs_export_args(ctx
, values
, V_008DFC_SQ_EXP_PARAM
+ index
, &args
);
438 ac_build_export(&ctx
->ac
, &args
);
441 static void si_build_param_exports(struct si_shader_context
*ctx
,
442 struct si_shader_output_values
*outputs
, unsigned noutput
)
444 struct si_shader
*shader
= ctx
->shader
;
445 unsigned param_count
= 0;
447 for (unsigned i
= 0; i
< noutput
; i
++) {
448 unsigned semantic_name
= outputs
[i
].semantic_name
;
449 unsigned semantic_index
= outputs
[i
].semantic_index
;
451 if (outputs
[i
].vertex_stream
[0] != 0 && outputs
[i
].vertex_stream
[1] != 0 &&
452 outputs
[i
].vertex_stream
[2] != 0 && outputs
[i
].vertex_stream
[3] != 0)
455 switch (semantic_name
) {
456 case TGSI_SEMANTIC_LAYER
:
457 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
458 case TGSI_SEMANTIC_CLIPDIST
:
459 case TGSI_SEMANTIC_COLOR
:
460 case TGSI_SEMANTIC_BCOLOR
:
461 case TGSI_SEMANTIC_PRIMID
:
462 case TGSI_SEMANTIC_FOG
:
463 case TGSI_SEMANTIC_TEXCOORD
:
464 case TGSI_SEMANTIC_GENERIC
:
470 if ((semantic_name
!= TGSI_SEMANTIC_GENERIC
|| semantic_index
< SI_MAX_IO_GENERIC
) &&
471 shader
->key
.opt
.kill_outputs
&
472 (1ull << si_shader_io_get_unique_index(semantic_name
, semantic_index
, true)))
475 si_export_param(ctx
, param_count
, outputs
[i
].values
);
477 assert(i
< ARRAY_SIZE(shader
->info
.vs_output_param_offset
));
478 shader
->info
.vs_output_param_offset
[i
] = param_count
++;
481 shader
->info
.nr_param_exports
= param_count
;
485 * Vertex color clamping.
487 * This uses a state constant loaded in a user data SGPR and
488 * an IF statement is added that clamps all colors if the constant
491 static void si_vertex_color_clamping(struct si_shader_context
*ctx
,
492 struct si_shader_output_values
*outputs
, unsigned noutput
)
494 LLVMValueRef addr
[SI_MAX_VS_OUTPUTS
][4];
495 bool has_colors
= false;
497 /* Store original colors to alloca variables. */
498 for (unsigned i
= 0; i
< noutput
; i
++) {
499 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
500 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
503 for (unsigned j
= 0; j
< 4; j
++) {
504 addr
[i
][j
] = ac_build_alloca_undef(&ctx
->ac
, ctx
->ac
.f32
, "");
505 LLVMBuildStore(ctx
->ac
.builder
, outputs
[i
].values
[j
], addr
[i
][j
]);
513 /* The state is in the first bit of the user SGPR. */
514 LLVMValueRef cond
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
515 cond
= LLVMBuildTrunc(ctx
->ac
.builder
, cond
, ctx
->ac
.i1
, "");
517 ac_build_ifcc(&ctx
->ac
, cond
, 6502);
519 /* Store clamped colors to alloca variables within the conditional block. */
520 for (unsigned i
= 0; i
< noutput
; i
++) {
521 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
522 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
525 for (unsigned j
= 0; j
< 4; j
++) {
526 LLVMBuildStore(ctx
->ac
.builder
, ac_build_clamp(&ctx
->ac
, outputs
[i
].values
[j
]),
530 ac_build_endif(&ctx
->ac
, 6502);
532 /* Load clamped colors */
533 for (unsigned i
= 0; i
< noutput
; i
++) {
534 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
535 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
538 for (unsigned j
= 0; j
< 4; j
++) {
539 outputs
[i
].values
[j
] = LLVMBuildLoad(ctx
->ac
.builder
, addr
[i
][j
], "");
544 /* Generate export instructions for hardware VS shader stage or NGG GS stage
545 * (position and parameter data only).
547 void si_llvm_build_vs_exports(struct si_shader_context
*ctx
,
548 struct si_shader_output_values
*outputs
, unsigned noutput
)
550 struct si_shader
*shader
= ctx
->shader
;
551 struct ac_export_args pos_args
[4] = {};
552 LLVMValueRef psize_value
= NULL
, edgeflag_value
= NULL
, layer_value
= NULL
,
553 viewport_index_value
= NULL
;
557 si_vertex_color_clamping(ctx
, outputs
, noutput
);
559 /* Build position exports. */
560 for (i
= 0; i
< noutput
; i
++) {
561 switch (outputs
[i
].semantic_name
) {
562 case TGSI_SEMANTIC_POSITION
:
563 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
, V_008DFC_SQ_EXP_POS
, &pos_args
[0]);
565 case TGSI_SEMANTIC_PSIZE
:
566 psize_value
= outputs
[i
].values
[0];
568 case TGSI_SEMANTIC_LAYER
:
569 layer_value
= outputs
[i
].values
[0];
571 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
572 viewport_index_value
= outputs
[i
].values
[0];
574 case TGSI_SEMANTIC_EDGEFLAG
:
575 edgeflag_value
= outputs
[i
].values
[0];
577 case TGSI_SEMANTIC_CLIPDIST
:
578 if (!shader
->key
.opt
.clip_disable
) {
579 unsigned index
= 2 + outputs
[i
].semantic_index
;
580 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
, V_008DFC_SQ_EXP_POS
+ index
,
584 case TGSI_SEMANTIC_CLIPVERTEX
:
585 if (!shader
->key
.opt
.clip_disable
) {
586 si_llvm_emit_clipvertex(ctx
, pos_args
, outputs
[i
].values
);
592 /* We need to add the position output manually if it's missing. */
593 if (!pos_args
[0].out
[0]) {
594 pos_args
[0].enabled_channels
= 0xf; /* writemask */
595 pos_args
[0].valid_mask
= 0; /* EXEC mask */
596 pos_args
[0].done
= 0; /* last export? */
597 pos_args
[0].target
= V_008DFC_SQ_EXP_POS
;
598 pos_args
[0].compr
= 0; /* COMPR flag */
599 pos_args
[0].out
[0] = ctx
->ac
.f32_0
; /* X */
600 pos_args
[0].out
[1] = ctx
->ac
.f32_0
; /* Y */
601 pos_args
[0].out
[2] = ctx
->ac
.f32_0
; /* Z */
602 pos_args
[0].out
[3] = ctx
->ac
.f32_1
; /* W */
605 bool pos_writes_edgeflag
= shader
->selector
->info
.writes_edgeflag
&& !shader
->key
.as_ngg
;
607 /* Write the misc vector (point size, edgeflag, layer, viewport). */
608 if (shader
->selector
->info
.writes_psize
|| pos_writes_edgeflag
||
609 shader
->selector
->info
.writes_viewport_index
|| shader
->selector
->info
.writes_layer
) {
610 pos_args
[1].enabled_channels
= shader
->selector
->info
.writes_psize
|
611 (pos_writes_edgeflag
<< 1) |
612 (shader
->selector
->info
.writes_layer
<< 2);
614 pos_args
[1].valid_mask
= 0; /* EXEC mask */
615 pos_args
[1].done
= 0; /* last export? */
616 pos_args
[1].target
= V_008DFC_SQ_EXP_POS
+ 1;
617 pos_args
[1].compr
= 0; /* COMPR flag */
618 pos_args
[1].out
[0] = ctx
->ac
.f32_0
; /* X */
619 pos_args
[1].out
[1] = ctx
->ac
.f32_0
; /* Y */
620 pos_args
[1].out
[2] = ctx
->ac
.f32_0
; /* Z */
621 pos_args
[1].out
[3] = ctx
->ac
.f32_0
; /* W */
623 if (shader
->selector
->info
.writes_psize
)
624 pos_args
[1].out
[0] = psize_value
;
626 if (pos_writes_edgeflag
) {
627 /* The output is a float, but the hw expects an integer
628 * with the first bit containing the edge flag. */
629 edgeflag_value
= LLVMBuildFPToUI(ctx
->ac
.builder
, edgeflag_value
, ctx
->ac
.i32
, "");
630 edgeflag_value
= ac_build_umin(&ctx
->ac
, edgeflag_value
, ctx
->ac
.i32_1
);
632 /* The LLVM intrinsic expects a float. */
633 pos_args
[1].out
[1] = ac_to_float(&ctx
->ac
, edgeflag_value
);
636 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
637 /* GFX9 has the layer in out.z[10:0] and the viewport
638 * index in out.z[19:16].
640 if (shader
->selector
->info
.writes_layer
)
641 pos_args
[1].out
[2] = layer_value
;
643 if (shader
->selector
->info
.writes_viewport_index
) {
644 LLVMValueRef v
= viewport_index_value
;
646 v
= ac_to_integer(&ctx
->ac
, v
);
647 v
= LLVMBuildShl(ctx
->ac
.builder
, v
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
648 v
= LLVMBuildOr(ctx
->ac
.builder
, v
, ac_to_integer(&ctx
->ac
, pos_args
[1].out
[2]), "");
649 pos_args
[1].out
[2] = ac_to_float(&ctx
->ac
, v
);
650 pos_args
[1].enabled_channels
|= 1 << 2;
653 if (shader
->selector
->info
.writes_layer
)
654 pos_args
[1].out
[2] = layer_value
;
656 if (shader
->selector
->info
.writes_viewport_index
) {
657 pos_args
[1].out
[3] = viewport_index_value
;
658 pos_args
[1].enabled_channels
|= 1 << 3;
663 for (i
= 0; i
< 4; i
++)
664 if (pos_args
[i
].out
[0])
665 shader
->info
.nr_pos_exports
++;
667 /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
668 * Setting valid_mask=1 prevents it and has no other effect.
670 if (ctx
->screen
->info
.chip_class
== GFX10
)
671 pos_args
[0].valid_mask
= 1;
674 for (i
= 0; i
< 4; i
++) {
675 if (!pos_args
[i
].out
[0])
678 /* Specify the target we are exporting */
679 pos_args
[i
].target
= V_008DFC_SQ_EXP_POS
+ pos_idx
++;
681 if (pos_idx
== shader
->info
.nr_pos_exports
)
682 /* Specify that this is the last export */
683 pos_args
[i
].done
= 1;
685 ac_build_export(&ctx
->ac
, &pos_args
[i
]);
688 /* Build parameter exports. */
689 si_build_param_exports(ctx
, outputs
, noutput
);
692 void si_llvm_emit_vs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
, LLVMValueRef
*addrs
)
694 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
695 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
696 struct si_shader_output_values
*outputs
= NULL
;
699 assert(!ctx
->shader
->is_gs_copy_shader
);
700 assert(info
->num_outputs
<= max_outputs
);
702 outputs
= MALLOC((info
->num_outputs
+ 1) * sizeof(outputs
[0]));
704 for (i
= 0; i
< info
->num_outputs
; i
++) {
705 outputs
[i
].semantic_name
= info
->output_semantic_name
[i
];
706 outputs
[i
].semantic_index
= info
->output_semantic_index
[i
];
708 for (j
= 0; j
< 4; j
++) {
709 outputs
[i
].values
[j
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ j
], "");
710 outputs
[i
].vertex_stream
[j
] = (info
->output_streams
[i
] >> (2 * j
)) & 3;
714 if (!ctx
->screen
->use_ngg_streamout
&& ctx
->shader
->selector
->so
.num_outputs
)
715 si_llvm_emit_streamout(ctx
, outputs
, i
, 0);
717 /* Export PrimitiveID. */
718 if (ctx
->shader
->key
.mono
.u
.vs_export_prim_id
) {
719 outputs
[i
].semantic_name
= TGSI_SEMANTIC_PRIMID
;
720 outputs
[i
].semantic_index
= 0;
721 outputs
[i
].values
[0] = ac_to_float(&ctx
->ac
, si_get_primitive_id(ctx
, 0));
722 for (j
= 1; j
< 4; j
++)
723 outputs
[i
].values
[j
] = LLVMConstReal(ctx
->ac
.f32
, 0);
725 memset(outputs
[i
].vertex_stream
, 0, sizeof(outputs
[i
].vertex_stream
));
729 si_llvm_build_vs_exports(ctx
, outputs
, i
);
733 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
,
736 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
737 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
738 LLVMValueRef pos
[4] = {};
740 assert(info
->num_outputs
<= max_outputs
);
742 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
743 if (info
->output_semantic_name
[i
] != TGSI_SEMANTIC_POSITION
)
746 for (unsigned chan
= 0; chan
< 4; chan
++)
747 pos
[chan
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
750 assert(pos
[0] != NULL
);
752 /* Return the position output. */
753 LLVMValueRef ret
= ctx
->return_value
;
754 for (unsigned chan
= 0; chan
< 4; chan
++)
755 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, pos
[chan
], chan
, "");
756 ctx
->return_value
= ret
;
760 * Build the vertex shader prolog function.
762 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
763 * All inputs are returned unmodified. The vertex load indices are
764 * stored after them, which will be used by the API VS for fetching inputs.
766 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
771 * (VertexID + BaseVertex),
772 * (InstanceID + StartInstance),
773 * (InstanceID / 2 + StartInstance)
775 void si_llvm_build_vs_prolog(struct si_shader_context
*ctx
, union si_shader_part_key
*key
)
777 LLVMTypeRef
*returns
;
778 LLVMValueRef ret
, func
;
780 unsigned first_vs_vgpr
= key
->vs_prolog
.num_merged_next_stage_vgprs
;
781 unsigned num_input_vgprs
=
782 key
->vs_prolog
.num_merged_next_stage_vgprs
+ 4 + (key
->vs_prolog
.has_ngg_cull_inputs
? 1 : 0);
783 struct ac_arg input_sgpr_param
[key
->vs_prolog
.num_input_sgprs
];
784 struct ac_arg input_vgpr_param
[10];
785 LLVMValueRef input_vgprs
[10];
786 unsigned num_all_input_regs
= key
->vs_prolog
.num_input_sgprs
+ num_input_vgprs
;
787 unsigned user_sgpr_base
= key
->vs_prolog
.num_merged_next_stage_vgprs
? 8 : 0;
789 memset(&ctx
->args
, 0, sizeof(ctx
->args
));
791 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
792 returns
= alloca((num_all_input_regs
+ key
->vs_prolog
.num_inputs
) * sizeof(LLVMTypeRef
));
795 /* Declare input and output SGPRs. */
796 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
797 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &input_sgpr_param
[i
]);
798 returns
[num_returns
++] = ctx
->ac
.i32
;
801 struct ac_arg merged_wave_info
= input_sgpr_param
[3];
803 /* Preloaded VGPRs (outputs must be floats) */
804 for (i
= 0; i
< num_input_vgprs
; i
++) {
805 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &input_vgpr_param
[i
]);
806 returns
[num_returns
++] = ctx
->ac
.f32
;
809 /* Vertex load indices. */
810 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++)
811 returns
[num_returns
++] = ctx
->ac
.f32
;
813 /* Create the function. */
814 si_llvm_create_func(ctx
, "vs_prolog", returns
, num_returns
, 0);
817 for (i
= 0; i
< num_input_vgprs
; i
++) {
818 input_vgprs
[i
] = ac_get_arg(&ctx
->ac
, input_vgpr_param
[i
]);
821 if (key
->vs_prolog
.num_merged_next_stage_vgprs
) {
822 if (!key
->vs_prolog
.is_monolithic
)
823 si_init_exec_from_input(ctx
, merged_wave_info
, 0);
825 if (key
->vs_prolog
.as_ls
&& ctx
->screen
->info
.has_ls_vgpr_init_bug
) {
826 /* If there are no HS threads, SPI loads the LS VGPRs
827 * starting at VGPR 0. Shift them back to where they
830 LLVMValueRef has_hs_threads
=
831 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
,
832 si_unpack_param(ctx
, input_sgpr_param
[3], 8, 8), ctx
->ac
.i32_0
, "");
834 for (i
= 4; i
> 0; --i
) {
835 input_vgprs
[i
+ 1] = LLVMBuildSelect(ctx
->ac
.builder
, has_hs_threads
,
836 input_vgprs
[i
+ 1], input_vgprs
[i
- 1], "");
841 if (key
->vs_prolog
.gs_fast_launch_tri_list
|| key
->vs_prolog
.gs_fast_launch_tri_strip
) {
842 LLVMValueRef wave_id
, thread_id_in_tg
;
844 wave_id
= si_unpack_param(ctx
, input_sgpr_param
[3], 24, 4);
846 ac_build_imad(&ctx
->ac
, wave_id
, LLVMConstInt(ctx
->ac
.i32
, ctx
->ac
.wave_size
, false),
847 ac_get_thread_id(&ctx
->ac
));
849 /* The GS fast launch initializes all VGPRs to the value of
850 * the first thread, so we have to add the thread ID.
852 * Only these are initialized by the hw:
853 * VGPR2: Base Primitive ID
854 * VGPR5: Base Vertex ID
858 /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
859 * The NGG cull shader will read them from there.
861 if (key
->vs_prolog
.gs_fast_launch_tri_list
) {
862 input_vgprs
[0] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx01_offset */
863 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 0 */
864 LLVMConstInt(ctx
->ac
.i32
, 0, 0));
865 input_vgprs
[1] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx23_offset */
866 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 1 */
867 LLVMConstInt(ctx
->ac
.i32
, 1, 0));
868 input_vgprs
[4] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx45_offset */
869 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 2 */
870 LLVMConstInt(ctx
->ac
.i32
, 2, 0));
872 assert(key
->vs_prolog
.gs_fast_launch_tri_strip
);
873 LLVMBuilderRef builder
= ctx
->ac
.builder
;
874 /* Triangle indices: */
875 LLVMValueRef index
[3] = {
877 LLVMBuildAdd(builder
, thread_id_in_tg
, LLVMConstInt(ctx
->ac
.i32
, 1, 0), ""),
878 LLVMBuildAdd(builder
, thread_id_in_tg
, LLVMConstInt(ctx
->ac
.i32
, 2, 0), ""),
880 LLVMValueRef is_odd
= LLVMBuildTrunc(ctx
->ac
.builder
, thread_id_in_tg
, ctx
->ac
.i1
, "");
881 LLVMValueRef flatshade_first
= LLVMBuildICmp(
882 builder
, LLVMIntEQ
, si_unpack_param(ctx
, ctx
->vs_state_bits
, 4, 2), ctx
->ac
.i32_0
, "");
884 ac_build_triangle_strip_indices_to_triangle(&ctx
->ac
, is_odd
, flatshade_first
, index
);
885 input_vgprs
[0] = index
[0];
886 input_vgprs
[1] = index
[1];
887 input_vgprs
[4] = index
[2];
890 /* Triangles always have all edge flags set initially. */
891 input_vgprs
[3] = LLVMConstInt(ctx
->ac
.i32
, 0x7 << 8, 0);
894 LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[2], thread_id_in_tg
, ""); /* PrimID */
896 LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[5], thread_id_in_tg
, ""); /* VertexID */
897 input_vgprs
[8] = input_vgprs
[6]; /* InstanceID */
900 unsigned vertex_id_vgpr
= first_vs_vgpr
;
901 unsigned instance_id_vgpr
= ctx
->screen
->info
.chip_class
>= GFX10
903 : first_vs_vgpr
+ (key
->vs_prolog
.as_ls
? 2 : 1);
905 ctx
->abi
.vertex_id
= input_vgprs
[vertex_id_vgpr
];
906 ctx
->abi
.instance_id
= input_vgprs
[instance_id_vgpr
];
908 /* InstanceID = VertexID >> 16;
909 * VertexID = VertexID & 0xffff;
911 if (key
->vs_prolog
.states
.unpack_instance_id_from_vertex_id
) {
912 ctx
->abi
.instance_id
=
913 LLVMBuildLShr(ctx
->ac
.builder
, ctx
->abi
.vertex_id
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
914 ctx
->abi
.vertex_id
= LLVMBuildAnd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
915 LLVMConstInt(ctx
->ac
.i32
, 0xffff, 0), "");
918 /* Copy inputs to outputs. This should be no-op, as the registers match,
919 * but it will prevent the compiler from overwriting them unintentionally.
921 ret
= ctx
->return_value
;
922 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
923 LLVMValueRef p
= LLVMGetParam(func
, i
);
924 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, i
, "");
926 for (i
= 0; i
< num_input_vgprs
; i
++) {
927 LLVMValueRef p
= input_vgprs
[i
];
929 if (i
== vertex_id_vgpr
)
930 p
= ctx
->abi
.vertex_id
;
931 else if (i
== instance_id_vgpr
)
932 p
= ctx
->abi
.instance_id
;
934 p
= ac_to_float(&ctx
->ac
, p
);
935 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, key
->vs_prolog
.num_input_sgprs
+ i
, "");
938 /* Compute vertex load indices from instance divisors. */
939 LLVMValueRef instance_divisor_constbuf
= NULL
;
941 if (key
->vs_prolog
.states
.instance_divisor_is_fetched
) {
942 LLVMValueRef list
= si_prolog_get_rw_buffers(ctx
);
943 LLVMValueRef buf_index
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_CONST_INSTANCE_DIVISORS
, 0);
944 instance_divisor_constbuf
= ac_build_load_to_sgpr(&ctx
->ac
, list
, buf_index
);
947 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++) {
948 bool divisor_is_one
= key
->vs_prolog
.states
.instance_divisor_is_one
& (1u << i
);
949 bool divisor_is_fetched
= key
->vs_prolog
.states
.instance_divisor_is_fetched
& (1u << i
);
950 LLVMValueRef index
= NULL
;
952 if (divisor_is_one
) {
953 index
= ctx
->abi
.instance_id
;
954 } else if (divisor_is_fetched
) {
955 LLVMValueRef udiv_factors
[4];
957 for (unsigned j
= 0; j
< 4; j
++) {
958 udiv_factors
[j
] = si_buffer_load_const(ctx
, instance_divisor_constbuf
,
959 LLVMConstInt(ctx
->ac
.i32
, i
* 16 + j
* 4, 0));
960 udiv_factors
[j
] = ac_to_integer(&ctx
->ac
, udiv_factors
[j
]);
962 /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
963 * Such InstanceID might not be achievable in a reasonable time though.
965 index
= ac_build_fast_udiv_nuw(&ctx
->ac
, ctx
->abi
.instance_id
, udiv_factors
[0],
966 udiv_factors
[1], udiv_factors
[2], udiv_factors
[3]);
969 if (divisor_is_one
|| divisor_is_fetched
) {
970 /* Add StartInstance. */
972 LLVMBuildAdd(ctx
->ac
.builder
, index
,
973 LLVMGetParam(ctx
->main_fn
, user_sgpr_base
+ SI_SGPR_START_INSTANCE
), "");
975 /* VertexID + BaseVertex */
976 index
= LLVMBuildAdd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
977 LLVMGetParam(func
, user_sgpr_base
+ SI_SGPR_BASE_VERTEX
), "");
980 index
= ac_to_float(&ctx
->ac
, index
);
981 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, index
, ctx
->args
.arg_count
+ i
, "");
984 si_llvm_build_ret(ctx
, ret
);
987 static LLVMValueRef
get_base_vertex(struct ac_shader_abi
*abi
)
989 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
991 /* For non-indexed draws, the base vertex set by the driver
992 * (for direct draws) or the CP (for indirect draws) is the
993 * first vertex ID, but GLSL expects 0 to be returned.
995 LLVMValueRef vs_state
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
996 LLVMValueRef indexed
;
998 indexed
= LLVMBuildLShr(ctx
->ac
.builder
, vs_state
, ctx
->ac
.i32_1
, "");
999 indexed
= LLVMBuildTrunc(ctx
->ac
.builder
, indexed
, ctx
->ac
.i1
, "");
1001 return LLVMBuildSelect(ctx
->ac
.builder
, indexed
, ac_get_arg(&ctx
->ac
, ctx
->args
.base_vertex
),
1005 void si_llvm_init_vs_callbacks(struct si_shader_context
*ctx
, bool ngg_cull_shader
)
1007 struct si_shader
*shader
= ctx
->shader
;
1009 if (shader
->key
.as_ls
)
1010 ctx
->abi
.emit_outputs
= si_llvm_emit_ls_epilogue
;
1011 else if (shader
->key
.as_es
)
1012 ctx
->abi
.emit_outputs
= si_llvm_emit_es_epilogue
;
1013 else if (shader
->key
.opt
.vs_as_prim_discard_cs
)
1014 ctx
->abi
.emit_outputs
= si_llvm_emit_prim_discard_cs_epilogue
;
1015 else if (ngg_cull_shader
)
1016 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_culling_epilogue
;
1017 else if (shader
->key
.as_ngg
)
1018 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_epilogue
;
1020 ctx
->abi
.emit_outputs
= si_llvm_emit_vs_epilogue
;
1022 ctx
->abi
.load_base_vertex
= get_base_vertex
;