2 * Copyright 2020 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
26 #include "si_shader_internal.h"
28 #include "util/u_memory.h"
30 static LLVMValueRef
unpack_sint16(struct si_shader_context
*ctx
, LLVMValueRef i32
, unsigned index
)
35 return LLVMBuildAShr(ctx
->ac
.builder
, i32
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
37 return LLVMBuildSExt(ctx
->ac
.builder
, LLVMBuildTrunc(ctx
->ac
.builder
, i32
, ctx
->ac
.i16
, ""),
41 static void load_input_vs(struct si_shader_context
*ctx
, unsigned input_index
, LLVMValueRef out
[4])
43 const struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
44 unsigned vs_blit_property
= info
->properties
[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD
];
46 if (vs_blit_property
) {
47 LLVMValueRef vertex_id
= ctx
->abi
.vertex_id
;
49 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULE
, vertex_id
, ctx
->ac
.i32_1
, "");
50 /* Use LLVMIntNE, because we have 3 vertices and only
51 * the middle one should use y2.
53 LLVMValueRef sel_y1
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
, vertex_id
, ctx
->ac
.i32_1
, "");
55 unsigned param_vs_blit_inputs
= ctx
->vs_blit_inputs
.arg_index
;
56 if (input_index
== 0) {
58 LLVMValueRef x1y1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
);
59 LLVMValueRef x2y2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 1);
61 LLVMValueRef x1
= unpack_sint16(ctx
, x1y1
, 0);
62 LLVMValueRef y1
= unpack_sint16(ctx
, x1y1
, 1);
63 LLVMValueRef x2
= unpack_sint16(ctx
, x2y2
, 0);
64 LLVMValueRef y2
= unpack_sint16(ctx
, x2y2
, 1);
66 LLVMValueRef x
= LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
, x1
, x2
, "");
67 LLVMValueRef y
= LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
, y1
, y2
, "");
69 out
[0] = LLVMBuildSIToFP(ctx
->ac
.builder
, x
, ctx
->ac
.f32
, "");
70 out
[1] = LLVMBuildSIToFP(ctx
->ac
.builder
, y
, ctx
->ac
.f32
, "");
71 out
[2] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 2);
72 out
[3] = ctx
->ac
.f32_1
;
76 /* Color or texture coordinates: */
77 assert(input_index
== 1);
79 if (vs_blit_property
== SI_VS_BLIT_SGPRS_POS_COLOR
) {
80 for (int i
= 0; i
< 4; i
++) {
81 out
[i
] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 3 + i
);
84 assert(vs_blit_property
== SI_VS_BLIT_SGPRS_POS_TEXCOORD
);
85 LLVMValueRef x1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 3);
86 LLVMValueRef y1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 4);
87 LLVMValueRef x2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 5);
88 LLVMValueRef y2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 6);
90 out
[0] = LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
, x1
, x2
, "");
91 out
[1] = LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
, y1
, y2
, "");
92 out
[2] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 7);
93 out
[3] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 8);
98 unsigned num_vbos_in_user_sgprs
= ctx
->shader
->selector
->num_vbos_in_user_sgprs
;
99 union si_vs_fix_fetch fix_fetch
;
100 LLVMValueRef vb_desc
;
101 LLVMValueRef vertex_index
;
104 if (input_index
< num_vbos_in_user_sgprs
) {
105 vb_desc
= ac_get_arg(&ctx
->ac
, ctx
->vb_descriptors
[input_index
]);
107 unsigned index
= input_index
- num_vbos_in_user_sgprs
;
108 vb_desc
= ac_build_load_to_sgpr(&ctx
->ac
, ac_get_arg(&ctx
->ac
, ctx
->vertex_buffers
),
109 LLVMConstInt(ctx
->ac
.i32
, index
, 0));
112 vertex_index
= LLVMGetParam(ctx
->main_fn
, ctx
->vertex_index0
.arg_index
+ input_index
);
114 /* Use the open-coded implementation for all loads of doubles and
115 * of dword-sized data that needs fixups. We need to insert conversion
116 * code anyway, and the amd/common code does it for us.
118 * Note: On LLVM <= 8, we can only open-code formats with
119 * channel size >= 4 bytes.
121 bool opencode
= ctx
->shader
->key
.mono
.vs_fetch_opencode
& (1 << input_index
);
122 fix_fetch
.bits
= ctx
->shader
->key
.mono
.vs_fix_fetch
[input_index
].bits
;
123 if (opencode
|| (fix_fetch
.u
.log_size
== 3 && fix_fetch
.u
.format
== AC_FETCH_FORMAT_FLOAT
) ||
124 (fix_fetch
.u
.log_size
== 2)) {
125 tmp
= ac_build_opencoded_load_format(&ctx
->ac
, fix_fetch
.u
.log_size
,
126 fix_fetch
.u
.num_channels_m1
+ 1, fix_fetch
.u
.format
,
127 fix_fetch
.u
.reverse
, !opencode
, vb_desc
, vertex_index
,
128 ctx
->ac
.i32_0
, ctx
->ac
.i32_0
, 0, true);
129 for (unsigned i
= 0; i
< 4; ++i
)
131 LLVMBuildExtractElement(ctx
->ac
.builder
, tmp
, LLVMConstInt(ctx
->ac
.i32
, i
, false), "");
135 unsigned required_channels
= util_last_bit(info
->input_usage_mask
[input_index
]);
136 if (required_channels
== 0) {
137 for (unsigned i
= 0; i
< 4; ++i
)
138 out
[i
] = LLVMGetUndef(ctx
->ac
.f32
);
142 /* Do multiple loads for special formats. */
143 LLVMValueRef fetches
[4];
144 unsigned num_fetches
;
145 unsigned fetch_stride
;
146 unsigned channels_per_fetch
;
148 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2) {
149 num_fetches
= MIN2(required_channels
, 3);
150 fetch_stride
= 1 << fix_fetch
.u
.log_size
;
151 channels_per_fetch
= 1;
155 channels_per_fetch
= required_channels
;
158 for (unsigned i
= 0; i
< num_fetches
; ++i
) {
159 LLVMValueRef voffset
= LLVMConstInt(ctx
->ac
.i32
, fetch_stride
* i
, 0);
160 fetches
[i
] = ac_build_buffer_load_format(&ctx
->ac
, vb_desc
, vertex_index
, voffset
,
161 channels_per_fetch
, 0, true, false);
164 if (num_fetches
== 1 && channels_per_fetch
> 1) {
165 LLVMValueRef fetch
= fetches
[0];
166 for (unsigned i
= 0; i
< channels_per_fetch
; ++i
) {
167 tmp
= LLVMConstInt(ctx
->ac
.i32
, i
, false);
168 fetches
[i
] = LLVMBuildExtractElement(ctx
->ac
.builder
, fetch
, tmp
, "");
170 num_fetches
= channels_per_fetch
;
171 channels_per_fetch
= 1;
174 for (unsigned i
= num_fetches
; i
< 4; ++i
)
175 fetches
[i
] = LLVMGetUndef(ctx
->ac
.f32
);
177 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2 && required_channels
== 4) {
178 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_UINT
|| fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
)
179 fetches
[3] = ctx
->ac
.i32_1
;
181 fetches
[3] = ctx
->ac
.f32_1
;
182 } else if (fix_fetch
.u
.log_size
== 3 &&
183 (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
||
184 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
||
185 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
) &&
186 required_channels
== 4) {
187 /* For 2_10_10_10, the hardware returns an unsigned value;
188 * convert it to a signed one.
190 LLVMValueRef tmp
= fetches
[3];
191 LLVMValueRef c30
= LLVMConstInt(ctx
->ac
.i32
, 30, 0);
193 /* First, recover the sign-extended signed integer value. */
194 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
)
195 tmp
= LLVMBuildFPToUI(ctx
->ac
.builder
, tmp
, ctx
->ac
.i32
, "");
197 tmp
= ac_to_integer(&ctx
->ac
, tmp
);
199 /* For the integer-like cases, do a natural sign extension.
201 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
202 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
206 ctx
->ac
.builder
, tmp
,
207 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
? LLVMConstInt(ctx
->ac
.i32
, 7, 0) : c30
, "");
208 tmp
= LLVMBuildAShr(ctx
->ac
.builder
, tmp
, c30
, "");
210 /* Convert back to the right type. */
211 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
) {
213 LLVMValueRef neg_one
= LLVMConstReal(ctx
->ac
.f32
, -1.0);
214 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
215 clamp
= LLVMBuildFCmp(ctx
->ac
.builder
, LLVMRealULT
, tmp
, neg_one
, "");
216 tmp
= LLVMBuildSelect(ctx
->ac
.builder
, clamp
, neg_one
, tmp
, "");
217 } else if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
) {
218 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
224 for (unsigned i
= 0; i
< 4; ++i
)
225 out
[i
] = ac_to_float(&ctx
->ac
, fetches
[i
]);
228 static void declare_input_vs(struct si_shader_context
*ctx
, unsigned input_index
)
230 LLVMValueRef input
[4];
232 load_input_vs(ctx
, input_index
/ 4, input
);
234 for (unsigned chan
= 0; chan
< 4; chan
++) {
235 ctx
->inputs
[input_index
+ chan
] =
236 LLVMBuildBitCast(ctx
->ac
.builder
, input
[chan
], ctx
->ac
.i32
, "");
240 void si_llvm_load_vs_inputs(struct si_shader_context
*ctx
, struct nir_shader
*nir
)
242 uint64_t processed_inputs
= 0;
244 nir_foreach_shader_in_variable (variable
, nir
) {
245 unsigned attrib_count
= glsl_count_attribute_slots(variable
->type
, true);
246 unsigned input_idx
= variable
->data
.driver_location
;
247 unsigned loc
= variable
->data
.location
;
249 for (unsigned i
= 0; i
< attrib_count
; i
++) {
250 /* Packed components share the same location so skip
251 * them if we have already processed the location.
253 if (processed_inputs
& ((uint64_t)1 << (loc
+ i
))) {
258 declare_input_vs(ctx
, input_idx
);
259 if (glsl_type_is_dual_slot(variable
->type
)) {
261 declare_input_vs(ctx
, input_idx
);
264 processed_inputs
|= ((uint64_t)1 << (loc
+ i
));
270 void si_llvm_streamout_store_output(struct si_shader_context
*ctx
, LLVMValueRef
const *so_buffers
,
271 LLVMValueRef
const *so_write_offsets
,
272 struct pipe_stream_output
*stream_out
,
273 struct si_shader_output_values
*shader_out
)
275 unsigned buf_idx
= stream_out
->output_buffer
;
276 unsigned start
= stream_out
->start_component
;
277 unsigned num_comps
= stream_out
->num_components
;
280 assert(num_comps
&& num_comps
<= 4);
281 if (!num_comps
|| num_comps
> 4)
284 /* Load the output as int. */
285 for (int j
= 0; j
< num_comps
; j
++) {
286 assert(stream_out
->stream
== shader_out
->vertex_stream
[start
+ j
]);
288 out
[j
] = ac_to_integer(&ctx
->ac
, shader_out
->values
[start
+ j
]);
291 /* Pack the output. */
292 LLVMValueRef vdata
= NULL
;
298 case 2: /* as v2i32 */
299 case 3: /* as v3i32 */
300 if (ac_has_vec3_support(ctx
->screen
->info
.chip_class
, false)) {
301 vdata
= ac_build_gather_values(&ctx
->ac
, out
, num_comps
);
304 /* as v4i32 (aligned to 4) */
305 out
[3] = LLVMGetUndef(ctx
->ac
.i32
);
307 case 4: /* as v4i32 */
308 vdata
= ac_build_gather_values(&ctx
->ac
, out
, util_next_power_of_two(num_comps
));
312 ac_build_buffer_store_dword(&ctx
->ac
, so_buffers
[buf_idx
], vdata
, num_comps
,
313 so_write_offsets
[buf_idx
], ctx
->ac
.i32_0
, stream_out
->dst_offset
* 4,
318 * Write streamout data to buffers for vertex stream @p stream (different
319 * vertex streams can occur for GS copy shaders).
321 void si_llvm_emit_streamout(struct si_shader_context
*ctx
, struct si_shader_output_values
*outputs
,
322 unsigned noutput
, unsigned stream
)
324 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
325 struct pipe_stream_output_info
*so
= &sel
->so
;
326 LLVMBuilderRef builder
= ctx
->ac
.builder
;
329 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
330 LLVMValueRef so_vtx_count
= si_unpack_param(ctx
, ctx
->streamout_config
, 16, 7);
332 LLVMValueRef tid
= ac_get_thread_id(&ctx
->ac
);
334 /* can_emit = tid < so_vtx_count; */
335 LLVMValueRef can_emit
= LLVMBuildICmp(builder
, LLVMIntULT
, tid
, so_vtx_count
, "");
337 /* Emit the streamout code conditionally. This actually avoids
338 * out-of-bounds buffer access. The hw tells us via the SGPR
339 * (so_vtx_count) which threads are allowed to emit streamout data. */
340 ac_build_ifcc(&ctx
->ac
, can_emit
, 6501);
342 /* The buffer offset is computed as follows:
343 * ByteOffset = streamout_offset[buffer_id]*4 +
344 * (streamout_write_index + thread_id)*stride[buffer_id] +
348 LLVMValueRef so_write_index
= ac_get_arg(&ctx
->ac
, ctx
->streamout_write_index
);
350 /* Compute (streamout_write_index + thread_id). */
351 so_write_index
= LLVMBuildAdd(builder
, so_write_index
, tid
, "");
353 /* Load the descriptor and compute the write offset for each
355 LLVMValueRef so_write_offset
[4] = {};
356 LLVMValueRef so_buffers
[4];
357 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
359 for (i
= 0; i
< 4; i
++) {
363 LLVMValueRef offset
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_STREAMOUT_BUF0
+ i
, 0);
365 so_buffers
[i
] = ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
367 LLVMValueRef so_offset
= ac_get_arg(&ctx
->ac
, ctx
->streamout_offset
[i
]);
368 so_offset
= LLVMBuildMul(builder
, so_offset
, LLVMConstInt(ctx
->ac
.i32
, 4, 0), "");
370 so_write_offset
[i
] = ac_build_imad(
371 &ctx
->ac
, so_write_index
, LLVMConstInt(ctx
->ac
.i32
, so
->stride
[i
] * 4, 0), so_offset
);
374 /* Write streamout data. */
375 for (i
= 0; i
< so
->num_outputs
; i
++) {
376 unsigned reg
= so
->output
[i
].register_index
;
381 if (stream
!= so
->output
[i
].stream
)
384 si_llvm_streamout_store_output(ctx
, so_buffers
, so_write_offset
, &so
->output
[i
],
388 ac_build_endif(&ctx
->ac
, 6501);
391 static void si_llvm_emit_clipvertex(struct si_shader_context
*ctx
, struct ac_export_args
*pos
,
392 LLVMValueRef
*out_elts
)
397 LLVMValueRef base_elt
;
398 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
399 LLVMValueRef constbuf_index
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_CONST_CLIP_PLANES
, 0);
400 LLVMValueRef const_resource
= ac_build_load_to_sgpr(&ctx
->ac
, ptr
, constbuf_index
);
402 for (reg_index
= 0; reg_index
< 2; reg_index
++) {
403 struct ac_export_args
*args
= &pos
[2 + reg_index
];
405 args
->out
[0] = args
->out
[1] = args
->out
[2] = args
->out
[3] = LLVMConstReal(ctx
->ac
.f32
, 0.0f
);
407 /* Compute dot products of position and user clip plane vectors */
408 for (chan
= 0; chan
< 4; chan
++) {
409 for (const_chan
= 0; const_chan
< 4; const_chan
++) {
411 LLVMConstInt(ctx
->ac
.i32
, ((reg_index
* 4 + chan
) * 4 + const_chan
) * 4, 0);
412 base_elt
= si_buffer_load_const(ctx
, const_resource
, addr
);
414 ac_build_fmad(&ctx
->ac
, base_elt
, out_elts
[const_chan
], args
->out
[chan
]);
418 args
->enabled_channels
= 0xf;
419 args
->valid_mask
= 0;
421 args
->target
= V_008DFC_SQ_EXP_POS
+ 2 + reg_index
;
426 /* Initialize arguments for the shader export intrinsic */
427 static void si_llvm_init_vs_export_args(struct si_shader_context
*ctx
, LLVMValueRef
*values
,
428 unsigned target
, struct ac_export_args
*args
)
430 args
->enabled_channels
= 0xf; /* writemask - default is 0xf */
431 args
->valid_mask
= 0; /* Specify whether the EXEC mask represents the valid mask */
432 args
->done
= 0; /* Specify whether this is the last export */
433 args
->target
= target
; /* Specify the target we are exporting */
436 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
439 static void si_export_param(struct si_shader_context
*ctx
, unsigned index
, LLVMValueRef
*values
)
441 struct ac_export_args args
;
443 si_llvm_init_vs_export_args(ctx
, values
, V_008DFC_SQ_EXP_PARAM
+ index
, &args
);
444 ac_build_export(&ctx
->ac
, &args
);
447 static void si_build_param_exports(struct si_shader_context
*ctx
,
448 struct si_shader_output_values
*outputs
, unsigned noutput
)
450 struct si_shader
*shader
= ctx
->shader
;
451 unsigned param_count
= 0;
453 for (unsigned i
= 0; i
< noutput
; i
++) {
454 unsigned semantic_name
= outputs
[i
].semantic_name
;
455 unsigned semantic_index
= outputs
[i
].semantic_index
;
457 if (outputs
[i
].vertex_stream
[0] != 0 && outputs
[i
].vertex_stream
[1] != 0 &&
458 outputs
[i
].vertex_stream
[2] != 0 && outputs
[i
].vertex_stream
[3] != 0)
461 switch (semantic_name
) {
462 case TGSI_SEMANTIC_LAYER
:
463 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
464 case TGSI_SEMANTIC_CLIPDIST
:
465 case TGSI_SEMANTIC_COLOR
:
466 case TGSI_SEMANTIC_BCOLOR
:
467 case TGSI_SEMANTIC_PRIMID
:
468 case TGSI_SEMANTIC_FOG
:
469 case TGSI_SEMANTIC_TEXCOORD
:
470 case TGSI_SEMANTIC_GENERIC
:
476 if ((semantic_name
!= TGSI_SEMANTIC_GENERIC
|| semantic_index
< SI_MAX_IO_GENERIC
) &&
477 shader
->key
.opt
.kill_outputs
&
478 (1ull << si_shader_io_get_unique_index(semantic_name
, semantic_index
, true)))
481 si_export_param(ctx
, param_count
, outputs
[i
].values
);
483 assert(i
< ARRAY_SIZE(shader
->info
.vs_output_param_offset
));
484 shader
->info
.vs_output_param_offset
[i
] = param_count
++;
487 shader
->info
.nr_param_exports
= param_count
;
491 * Vertex color clamping.
493 * This uses a state constant loaded in a user data SGPR and
494 * an IF statement is added that clamps all colors if the constant
497 static void si_vertex_color_clamping(struct si_shader_context
*ctx
,
498 struct si_shader_output_values
*outputs
, unsigned noutput
)
500 LLVMValueRef addr
[SI_MAX_VS_OUTPUTS
][4];
501 bool has_colors
= false;
503 /* Store original colors to alloca variables. */
504 for (unsigned i
= 0; i
< noutput
; i
++) {
505 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
506 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
509 for (unsigned j
= 0; j
< 4; j
++) {
510 addr
[i
][j
] = ac_build_alloca_undef(&ctx
->ac
, ctx
->ac
.f32
, "");
511 LLVMBuildStore(ctx
->ac
.builder
, outputs
[i
].values
[j
], addr
[i
][j
]);
519 /* The state is in the first bit of the user SGPR. */
520 LLVMValueRef cond
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
521 cond
= LLVMBuildTrunc(ctx
->ac
.builder
, cond
, ctx
->ac
.i1
, "");
523 ac_build_ifcc(&ctx
->ac
, cond
, 6502);
525 /* Store clamped colors to alloca variables within the conditional block. */
526 for (unsigned i
= 0; i
< noutput
; i
++) {
527 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
528 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
531 for (unsigned j
= 0; j
< 4; j
++) {
532 LLVMBuildStore(ctx
->ac
.builder
, ac_build_clamp(&ctx
->ac
, outputs
[i
].values
[j
]),
536 ac_build_endif(&ctx
->ac
, 6502);
538 /* Load clamped colors */
539 for (unsigned i
= 0; i
< noutput
; i
++) {
540 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
541 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
544 for (unsigned j
= 0; j
< 4; j
++) {
545 outputs
[i
].values
[j
] = LLVMBuildLoad(ctx
->ac
.builder
, addr
[i
][j
], "");
550 /* Generate export instructions for hardware VS shader stage or NGG GS stage
551 * (position and parameter data only).
553 void si_llvm_build_vs_exports(struct si_shader_context
*ctx
,
554 struct si_shader_output_values
*outputs
, unsigned noutput
)
556 struct si_shader
*shader
= ctx
->shader
;
557 struct ac_export_args pos_args
[4] = {};
558 LLVMValueRef psize_value
= NULL
, edgeflag_value
= NULL
, layer_value
= NULL
,
559 viewport_index_value
= NULL
;
563 si_vertex_color_clamping(ctx
, outputs
, noutput
);
565 /* Build position exports. */
566 for (i
= 0; i
< noutput
; i
++) {
567 switch (outputs
[i
].semantic_name
) {
568 case TGSI_SEMANTIC_POSITION
:
569 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
, V_008DFC_SQ_EXP_POS
, &pos_args
[0]);
571 case TGSI_SEMANTIC_PSIZE
:
572 psize_value
= outputs
[i
].values
[0];
574 case TGSI_SEMANTIC_LAYER
:
575 layer_value
= outputs
[i
].values
[0];
577 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
578 viewport_index_value
= outputs
[i
].values
[0];
580 case TGSI_SEMANTIC_EDGEFLAG
:
581 edgeflag_value
= outputs
[i
].values
[0];
583 case TGSI_SEMANTIC_CLIPDIST
:
584 if (!shader
->key
.opt
.clip_disable
) {
585 unsigned index
= 2 + outputs
[i
].semantic_index
;
586 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
, V_008DFC_SQ_EXP_POS
+ index
,
590 case TGSI_SEMANTIC_CLIPVERTEX
:
591 if (!shader
->key
.opt
.clip_disable
) {
592 si_llvm_emit_clipvertex(ctx
, pos_args
, outputs
[i
].values
);
598 /* We need to add the position output manually if it's missing. */
599 if (!pos_args
[0].out
[0]) {
600 pos_args
[0].enabled_channels
= 0xf; /* writemask */
601 pos_args
[0].valid_mask
= 0; /* EXEC mask */
602 pos_args
[0].done
= 0; /* last export? */
603 pos_args
[0].target
= V_008DFC_SQ_EXP_POS
;
604 pos_args
[0].compr
= 0; /* COMPR flag */
605 pos_args
[0].out
[0] = ctx
->ac
.f32_0
; /* X */
606 pos_args
[0].out
[1] = ctx
->ac
.f32_0
; /* Y */
607 pos_args
[0].out
[2] = ctx
->ac
.f32_0
; /* Z */
608 pos_args
[0].out
[3] = ctx
->ac
.f32_1
; /* W */
611 bool pos_writes_edgeflag
= shader
->selector
->info
.writes_edgeflag
&& !shader
->key
.as_ngg
;
613 /* Write the misc vector (point size, edgeflag, layer, viewport). */
614 if (shader
->selector
->info
.writes_psize
|| pos_writes_edgeflag
||
615 shader
->selector
->info
.writes_viewport_index
|| shader
->selector
->info
.writes_layer
) {
616 pos_args
[1].enabled_channels
= shader
->selector
->info
.writes_psize
|
617 (pos_writes_edgeflag
<< 1) |
618 (shader
->selector
->info
.writes_layer
<< 2);
620 pos_args
[1].valid_mask
= 0; /* EXEC mask */
621 pos_args
[1].done
= 0; /* last export? */
622 pos_args
[1].target
= V_008DFC_SQ_EXP_POS
+ 1;
623 pos_args
[1].compr
= 0; /* COMPR flag */
624 pos_args
[1].out
[0] = ctx
->ac
.f32_0
; /* X */
625 pos_args
[1].out
[1] = ctx
->ac
.f32_0
; /* Y */
626 pos_args
[1].out
[2] = ctx
->ac
.f32_0
; /* Z */
627 pos_args
[1].out
[3] = ctx
->ac
.f32_0
; /* W */
629 if (shader
->selector
->info
.writes_psize
)
630 pos_args
[1].out
[0] = psize_value
;
632 if (pos_writes_edgeflag
) {
633 /* The output is a float, but the hw expects an integer
634 * with the first bit containing the edge flag. */
635 edgeflag_value
= LLVMBuildFPToUI(ctx
->ac
.builder
, edgeflag_value
, ctx
->ac
.i32
, "");
636 edgeflag_value
= ac_build_umin(&ctx
->ac
, edgeflag_value
, ctx
->ac
.i32_1
);
638 /* The LLVM intrinsic expects a float. */
639 pos_args
[1].out
[1] = ac_to_float(&ctx
->ac
, edgeflag_value
);
642 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
643 /* GFX9 has the layer in out.z[10:0] and the viewport
644 * index in out.z[19:16].
646 if (shader
->selector
->info
.writes_layer
)
647 pos_args
[1].out
[2] = layer_value
;
649 if (shader
->selector
->info
.writes_viewport_index
) {
650 LLVMValueRef v
= viewport_index_value
;
652 v
= ac_to_integer(&ctx
->ac
, v
);
653 v
= LLVMBuildShl(ctx
->ac
.builder
, v
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
654 v
= LLVMBuildOr(ctx
->ac
.builder
, v
, ac_to_integer(&ctx
->ac
, pos_args
[1].out
[2]), "");
655 pos_args
[1].out
[2] = ac_to_float(&ctx
->ac
, v
);
656 pos_args
[1].enabled_channels
|= 1 << 2;
659 if (shader
->selector
->info
.writes_layer
)
660 pos_args
[1].out
[2] = layer_value
;
662 if (shader
->selector
->info
.writes_viewport_index
) {
663 pos_args
[1].out
[3] = viewport_index_value
;
664 pos_args
[1].enabled_channels
|= 1 << 3;
669 for (i
= 0; i
< 4; i
++)
670 if (pos_args
[i
].out
[0])
671 shader
->info
.nr_pos_exports
++;
673 /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
674 * Setting valid_mask=1 prevents it and has no other effect.
676 if (ctx
->screen
->info
.chip_class
== GFX10
)
677 pos_args
[0].valid_mask
= 1;
680 for (i
= 0; i
< 4; i
++) {
681 if (!pos_args
[i
].out
[0])
684 /* Specify the target we are exporting */
685 pos_args
[i
].target
= V_008DFC_SQ_EXP_POS
+ pos_idx
++;
687 if (pos_idx
== shader
->info
.nr_pos_exports
)
688 /* Specify that this is the last export */
689 pos_args
[i
].done
= 1;
691 ac_build_export(&ctx
->ac
, &pos_args
[i
]);
694 /* Build parameter exports. */
695 si_build_param_exports(ctx
, outputs
, noutput
);
698 void si_llvm_emit_vs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
, LLVMValueRef
*addrs
)
700 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
701 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
702 struct si_shader_output_values
*outputs
= NULL
;
705 assert(!ctx
->shader
->is_gs_copy_shader
);
706 assert(info
->num_outputs
<= max_outputs
);
708 outputs
= MALLOC((info
->num_outputs
+ 1) * sizeof(outputs
[0]));
710 for (i
= 0; i
< info
->num_outputs
; i
++) {
711 outputs
[i
].semantic_name
= info
->output_semantic_name
[i
];
712 outputs
[i
].semantic_index
= info
->output_semantic_index
[i
];
714 for (j
= 0; j
< 4; j
++) {
715 outputs
[i
].values
[j
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ j
], "");
716 outputs
[i
].vertex_stream
[j
] = (info
->output_streams
[i
] >> (2 * j
)) & 3;
720 if (!ctx
->screen
->use_ngg_streamout
&& ctx
->shader
->selector
->so
.num_outputs
)
721 si_llvm_emit_streamout(ctx
, outputs
, i
, 0);
723 /* Export PrimitiveID. */
724 if (ctx
->shader
->key
.mono
.u
.vs_export_prim_id
) {
725 outputs
[i
].semantic_name
= TGSI_SEMANTIC_PRIMID
;
726 outputs
[i
].semantic_index
= 0;
727 outputs
[i
].values
[0] = ac_to_float(&ctx
->ac
, si_get_primitive_id(ctx
, 0));
728 for (j
= 1; j
< 4; j
++)
729 outputs
[i
].values
[j
] = LLVMConstReal(ctx
->ac
.f32
, 0);
731 memset(outputs
[i
].vertex_stream
, 0, sizeof(outputs
[i
].vertex_stream
));
735 si_llvm_build_vs_exports(ctx
, outputs
, i
);
739 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
,
742 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
743 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
744 LLVMValueRef pos
[4] = {};
746 assert(info
->num_outputs
<= max_outputs
);
748 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
749 if (info
->output_semantic_name
[i
] != TGSI_SEMANTIC_POSITION
)
752 for (unsigned chan
= 0; chan
< 4; chan
++)
753 pos
[chan
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
756 assert(pos
[0] != NULL
);
758 /* Return the position output. */
759 LLVMValueRef ret
= ctx
->return_value
;
760 for (unsigned chan
= 0; chan
< 4; chan
++)
761 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, pos
[chan
], chan
, "");
762 ctx
->return_value
= ret
;
766 * Build the vertex shader prolog function.
768 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
769 * All inputs are returned unmodified. The vertex load indices are
770 * stored after them, which will be used by the API VS for fetching inputs.
772 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
777 * (VertexID + BaseVertex),
778 * (InstanceID + StartInstance),
779 * (InstanceID / 2 + StartInstance)
781 void si_llvm_build_vs_prolog(struct si_shader_context
*ctx
, union si_shader_part_key
*key
)
783 LLVMTypeRef
*returns
;
784 LLVMValueRef ret
, func
;
786 unsigned first_vs_vgpr
= key
->vs_prolog
.num_merged_next_stage_vgprs
;
787 unsigned num_input_vgprs
=
788 key
->vs_prolog
.num_merged_next_stage_vgprs
+ 4 + (key
->vs_prolog
.has_ngg_cull_inputs
? 1 : 0);
789 struct ac_arg input_sgpr_param
[key
->vs_prolog
.num_input_sgprs
];
790 struct ac_arg input_vgpr_param
[10];
791 LLVMValueRef input_vgprs
[10];
792 unsigned num_all_input_regs
= key
->vs_prolog
.num_input_sgprs
+ num_input_vgprs
;
793 unsigned user_sgpr_base
= key
->vs_prolog
.num_merged_next_stage_vgprs
? 8 : 0;
795 memset(&ctx
->args
, 0, sizeof(ctx
->args
));
797 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
798 returns
= alloca((num_all_input_regs
+ key
->vs_prolog
.num_inputs
) * sizeof(LLVMTypeRef
));
801 /* Declare input and output SGPRs. */
802 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
803 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &input_sgpr_param
[i
]);
804 returns
[num_returns
++] = ctx
->ac
.i32
;
807 struct ac_arg merged_wave_info
= input_sgpr_param
[3];
809 /* Preloaded VGPRs (outputs must be floats) */
810 for (i
= 0; i
< num_input_vgprs
; i
++) {
811 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &input_vgpr_param
[i
]);
812 returns
[num_returns
++] = ctx
->ac
.f32
;
815 /* Vertex load indices. */
816 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++)
817 returns
[num_returns
++] = ctx
->ac
.f32
;
819 /* Create the function. */
820 si_llvm_create_func(ctx
, "vs_prolog", returns
, num_returns
, 0);
823 for (i
= 0; i
< num_input_vgprs
; i
++) {
824 input_vgprs
[i
] = ac_get_arg(&ctx
->ac
, input_vgpr_param
[i
]);
827 if (key
->vs_prolog
.num_merged_next_stage_vgprs
) {
828 if (!key
->vs_prolog
.is_monolithic
)
829 si_init_exec_from_input(ctx
, merged_wave_info
, 0);
831 if (key
->vs_prolog
.as_ls
&& ctx
->screen
->info
.has_ls_vgpr_init_bug
) {
832 /* If there are no HS threads, SPI loads the LS VGPRs
833 * starting at VGPR 0. Shift them back to where they
836 LLVMValueRef has_hs_threads
=
837 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
,
838 si_unpack_param(ctx
, input_sgpr_param
[3], 8, 8), ctx
->ac
.i32_0
, "");
840 for (i
= 4; i
> 0; --i
) {
841 input_vgprs
[i
+ 1] = LLVMBuildSelect(ctx
->ac
.builder
, has_hs_threads
,
842 input_vgprs
[i
+ 1], input_vgprs
[i
- 1], "");
847 if (key
->vs_prolog
.gs_fast_launch_tri_list
|| key
->vs_prolog
.gs_fast_launch_tri_strip
) {
848 LLVMValueRef wave_id
, thread_id_in_tg
;
850 wave_id
= si_unpack_param(ctx
, input_sgpr_param
[3], 24, 4);
852 ac_build_imad(&ctx
->ac
, wave_id
, LLVMConstInt(ctx
->ac
.i32
, ctx
->ac
.wave_size
, false),
853 ac_get_thread_id(&ctx
->ac
));
855 /* The GS fast launch initializes all VGPRs to the value of
856 * the first thread, so we have to add the thread ID.
858 * Only these are initialized by the hw:
859 * VGPR2: Base Primitive ID
860 * VGPR5: Base Vertex ID
864 /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
865 * The NGG cull shader will read them from there.
867 if (key
->vs_prolog
.gs_fast_launch_tri_list
) {
868 input_vgprs
[0] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx01_offset */
869 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 0 */
870 LLVMConstInt(ctx
->ac
.i32
, 0, 0));
871 input_vgprs
[1] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx23_offset */
872 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 1 */
873 LLVMConstInt(ctx
->ac
.i32
, 1, 0));
874 input_vgprs
[4] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx45_offset */
875 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 2 */
876 LLVMConstInt(ctx
->ac
.i32
, 2, 0));
878 assert(key
->vs_prolog
.gs_fast_launch_tri_strip
);
879 LLVMBuilderRef builder
= ctx
->ac
.builder
;
880 /* Triangle indices: */
881 LLVMValueRef index
[3] = {
883 LLVMBuildAdd(builder
, thread_id_in_tg
, LLVMConstInt(ctx
->ac
.i32
, 1, 0), ""),
884 LLVMBuildAdd(builder
, thread_id_in_tg
, LLVMConstInt(ctx
->ac
.i32
, 2, 0), ""),
886 LLVMValueRef is_odd
= LLVMBuildTrunc(ctx
->ac
.builder
, thread_id_in_tg
, ctx
->ac
.i1
, "");
887 LLVMValueRef flatshade_first
= LLVMBuildICmp(
888 builder
, LLVMIntEQ
, si_unpack_param(ctx
, ctx
->vs_state_bits
, 4, 2), ctx
->ac
.i32_0
, "");
890 ac_build_triangle_strip_indices_to_triangle(&ctx
->ac
, is_odd
, flatshade_first
, index
);
891 input_vgprs
[0] = index
[0];
892 input_vgprs
[1] = index
[1];
893 input_vgprs
[4] = index
[2];
896 /* Triangles always have all edge flags set initially. */
897 input_vgprs
[3] = LLVMConstInt(ctx
->ac
.i32
, 0x7 << 8, 0);
900 LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[2], thread_id_in_tg
, ""); /* PrimID */
902 LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[5], thread_id_in_tg
, ""); /* VertexID */
903 input_vgprs
[8] = input_vgprs
[6]; /* InstanceID */
906 unsigned vertex_id_vgpr
= first_vs_vgpr
;
907 unsigned instance_id_vgpr
= ctx
->screen
->info
.chip_class
>= GFX10
909 : first_vs_vgpr
+ (key
->vs_prolog
.as_ls
? 2 : 1);
911 ctx
->abi
.vertex_id
= input_vgprs
[vertex_id_vgpr
];
912 ctx
->abi
.instance_id
= input_vgprs
[instance_id_vgpr
];
914 /* InstanceID = VertexID >> 16;
915 * VertexID = VertexID & 0xffff;
917 if (key
->vs_prolog
.states
.unpack_instance_id_from_vertex_id
) {
918 ctx
->abi
.instance_id
=
919 LLVMBuildLShr(ctx
->ac
.builder
, ctx
->abi
.vertex_id
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
920 ctx
->abi
.vertex_id
= LLVMBuildAnd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
921 LLVMConstInt(ctx
->ac
.i32
, 0xffff, 0), "");
924 /* Copy inputs to outputs. This should be no-op, as the registers match,
925 * but it will prevent the compiler from overwriting them unintentionally.
927 ret
= ctx
->return_value
;
928 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
929 LLVMValueRef p
= LLVMGetParam(func
, i
);
930 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, i
, "");
932 for (i
= 0; i
< num_input_vgprs
; i
++) {
933 LLVMValueRef p
= input_vgprs
[i
];
935 if (i
== vertex_id_vgpr
)
936 p
= ctx
->abi
.vertex_id
;
937 else if (i
== instance_id_vgpr
)
938 p
= ctx
->abi
.instance_id
;
940 p
= ac_to_float(&ctx
->ac
, p
);
941 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, key
->vs_prolog
.num_input_sgprs
+ i
, "");
944 /* Compute vertex load indices from instance divisors. */
945 LLVMValueRef instance_divisor_constbuf
= NULL
;
947 if (key
->vs_prolog
.states
.instance_divisor_is_fetched
) {
948 LLVMValueRef list
= si_prolog_get_rw_buffers(ctx
);
949 LLVMValueRef buf_index
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_CONST_INSTANCE_DIVISORS
, 0);
950 instance_divisor_constbuf
= ac_build_load_to_sgpr(&ctx
->ac
, list
, buf_index
);
953 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++) {
954 bool divisor_is_one
= key
->vs_prolog
.states
.instance_divisor_is_one
& (1u << i
);
955 bool divisor_is_fetched
= key
->vs_prolog
.states
.instance_divisor_is_fetched
& (1u << i
);
956 LLVMValueRef index
= NULL
;
958 if (divisor_is_one
) {
959 index
= ctx
->abi
.instance_id
;
960 } else if (divisor_is_fetched
) {
961 LLVMValueRef udiv_factors
[4];
963 for (unsigned j
= 0; j
< 4; j
++) {
964 udiv_factors
[j
] = si_buffer_load_const(ctx
, instance_divisor_constbuf
,
965 LLVMConstInt(ctx
->ac
.i32
, i
* 16 + j
* 4, 0));
966 udiv_factors
[j
] = ac_to_integer(&ctx
->ac
, udiv_factors
[j
]);
968 /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
969 * Such InstanceID might not be achievable in a reasonable time though.
971 index
= ac_build_fast_udiv_nuw(&ctx
->ac
, ctx
->abi
.instance_id
, udiv_factors
[0],
972 udiv_factors
[1], udiv_factors
[2], udiv_factors
[3]);
975 if (divisor_is_one
|| divisor_is_fetched
) {
976 /* Add StartInstance. */
978 LLVMBuildAdd(ctx
->ac
.builder
, index
,
979 LLVMGetParam(ctx
->main_fn
, user_sgpr_base
+ SI_SGPR_START_INSTANCE
), "");
981 /* VertexID + BaseVertex */
982 index
= LLVMBuildAdd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
983 LLVMGetParam(func
, user_sgpr_base
+ SI_SGPR_BASE_VERTEX
), "");
986 index
= ac_to_float(&ctx
->ac
, index
);
987 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, index
, ctx
->args
.arg_count
+ i
, "");
990 si_llvm_build_ret(ctx
, ret
);
993 static LLVMValueRef
get_base_vertex(struct ac_shader_abi
*abi
)
995 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
997 /* For non-indexed draws, the base vertex set by the driver
998 * (for direct draws) or the CP (for indirect draws) is the
999 * first vertex ID, but GLSL expects 0 to be returned.
1001 LLVMValueRef vs_state
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
1002 LLVMValueRef indexed
;
1004 indexed
= LLVMBuildLShr(ctx
->ac
.builder
, vs_state
, ctx
->ac
.i32_1
, "");
1005 indexed
= LLVMBuildTrunc(ctx
->ac
.builder
, indexed
, ctx
->ac
.i1
, "");
1007 return LLVMBuildSelect(ctx
->ac
.builder
, indexed
, ac_get_arg(&ctx
->ac
, ctx
->args
.base_vertex
),
1011 void si_llvm_init_vs_callbacks(struct si_shader_context
*ctx
, bool ngg_cull_shader
)
1013 struct si_shader
*shader
= ctx
->shader
;
1015 if (shader
->key
.as_ls
)
1016 ctx
->abi
.emit_outputs
= si_llvm_emit_ls_epilogue
;
1017 else if (shader
->key
.as_es
)
1018 ctx
->abi
.emit_outputs
= si_llvm_emit_es_epilogue
;
1019 else if (shader
->key
.opt
.vs_as_prim_discard_cs
)
1020 ctx
->abi
.emit_outputs
= si_llvm_emit_prim_discard_cs_epilogue
;
1021 else if (ngg_cull_shader
)
1022 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_culling_epilogue
;
1023 else if (shader
->key
.as_ngg
)
1024 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_epilogue
;
1026 ctx
->abi
.emit_outputs
= si_llvm_emit_vs_epilogue
;
1028 ctx
->abi
.load_base_vertex
= get_base_vertex
;