2 * Copyright 2020 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 #include "si_shader_internal.h"
28 #include "util/u_memory.h"
30 static LLVMValueRef
unpack_sint16(struct si_shader_context
*ctx
,
31 LLVMValueRef i32
, unsigned index
)
36 return LLVMBuildAShr(ctx
->ac
.builder
, i32
,
37 LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
39 return LLVMBuildSExt(ctx
->ac
.builder
,
40 LLVMBuildTrunc(ctx
->ac
.builder
, i32
,
45 static void load_input_vs(struct si_shader_context
*ctx
, unsigned input_index
,
48 const struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
49 unsigned vs_blit_property
= info
->properties
[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD
];
51 if (vs_blit_property
) {
52 LLVMValueRef vertex_id
= ctx
->abi
.vertex_id
;
53 LLVMValueRef sel_x1
= LLVMBuildICmp(ctx
->ac
.builder
,
54 LLVMIntULE
, vertex_id
,
56 /* Use LLVMIntNE, because we have 3 vertices and only
57 * the middle one should use y2.
59 LLVMValueRef sel_y1
= LLVMBuildICmp(ctx
->ac
.builder
,
63 unsigned param_vs_blit_inputs
= ctx
->vs_blit_inputs
.arg_index
;
64 if (input_index
== 0) {
66 LLVMValueRef x1y1
= LLVMGetParam(ctx
->main_fn
,
67 param_vs_blit_inputs
);
68 LLVMValueRef x2y2
= LLVMGetParam(ctx
->main_fn
,
69 param_vs_blit_inputs
+ 1);
71 LLVMValueRef x1
= unpack_sint16(ctx
, x1y1
, 0);
72 LLVMValueRef y1
= unpack_sint16(ctx
, x1y1
, 1);
73 LLVMValueRef x2
= unpack_sint16(ctx
, x2y2
, 0);
74 LLVMValueRef y2
= unpack_sint16(ctx
, x2y2
, 1);
76 LLVMValueRef x
= LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
,
78 LLVMValueRef y
= LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
,
81 out
[0] = LLVMBuildSIToFP(ctx
->ac
.builder
, x
, ctx
->ac
.f32
, "");
82 out
[1] = LLVMBuildSIToFP(ctx
->ac
.builder
, y
, ctx
->ac
.f32
, "");
83 out
[2] = LLVMGetParam(ctx
->main_fn
,
84 param_vs_blit_inputs
+ 2);
85 out
[3] = ctx
->ac
.f32_1
;
89 /* Color or texture coordinates: */
90 assert(input_index
== 1);
92 if (vs_blit_property
== SI_VS_BLIT_SGPRS_POS_COLOR
) {
93 for (int i
= 0; i
< 4; i
++) {
94 out
[i
] = LLVMGetParam(ctx
->main_fn
,
95 param_vs_blit_inputs
+ 3 + i
);
98 assert(vs_blit_property
== SI_VS_BLIT_SGPRS_POS_TEXCOORD
);
99 LLVMValueRef x1
= LLVMGetParam(ctx
->main_fn
,
100 param_vs_blit_inputs
+ 3);
101 LLVMValueRef y1
= LLVMGetParam(ctx
->main_fn
,
102 param_vs_blit_inputs
+ 4);
103 LLVMValueRef x2
= LLVMGetParam(ctx
->main_fn
,
104 param_vs_blit_inputs
+ 5);
105 LLVMValueRef y2
= LLVMGetParam(ctx
->main_fn
,
106 param_vs_blit_inputs
+ 6);
108 out
[0] = LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
,
110 out
[1] = LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
,
112 out
[2] = LLVMGetParam(ctx
->main_fn
,
113 param_vs_blit_inputs
+ 7);
114 out
[3] = LLVMGetParam(ctx
->main_fn
,
115 param_vs_blit_inputs
+ 8);
120 unsigned num_vbos_in_user_sgprs
= ctx
->shader
->selector
->num_vbos_in_user_sgprs
;
121 union si_vs_fix_fetch fix_fetch
;
122 LLVMValueRef vb_desc
;
123 LLVMValueRef vertex_index
;
126 if (input_index
< num_vbos_in_user_sgprs
) {
127 vb_desc
= ac_get_arg(&ctx
->ac
, ctx
->vb_descriptors
[input_index
]);
129 unsigned index
= input_index
- num_vbos_in_user_sgprs
;
130 vb_desc
= ac_build_load_to_sgpr(&ctx
->ac
,
131 ac_get_arg(&ctx
->ac
, ctx
->vertex_buffers
),
132 LLVMConstInt(ctx
->ac
.i32
, index
, 0));
135 vertex_index
= LLVMGetParam(ctx
->main_fn
,
136 ctx
->vertex_index0
.arg_index
+
139 /* Use the open-coded implementation for all loads of doubles and
140 * of dword-sized data that needs fixups. We need to insert conversion
141 * code anyway, and the amd/common code does it for us.
143 * Note: On LLVM <= 8, we can only open-code formats with
144 * channel size >= 4 bytes.
146 bool opencode
= ctx
->shader
->key
.mono
.vs_fetch_opencode
& (1 << input_index
);
147 fix_fetch
.bits
= ctx
->shader
->key
.mono
.vs_fix_fetch
[input_index
].bits
;
149 (fix_fetch
.u
.log_size
== 3 && fix_fetch
.u
.format
== AC_FETCH_FORMAT_FLOAT
) ||
150 (fix_fetch
.u
.log_size
== 2)) {
151 tmp
= ac_build_opencoded_load_format(
152 &ctx
->ac
, fix_fetch
.u
.log_size
, fix_fetch
.u
.num_channels_m1
+ 1,
153 fix_fetch
.u
.format
, fix_fetch
.u
.reverse
, !opencode
,
154 vb_desc
, vertex_index
, ctx
->ac
.i32_0
, ctx
->ac
.i32_0
, 0, true);
155 for (unsigned i
= 0; i
< 4; ++i
)
156 out
[i
] = LLVMBuildExtractElement(ctx
->ac
.builder
, tmp
, LLVMConstInt(ctx
->ac
.i32
, i
, false), "");
160 /* Do multiple loads for special formats. */
161 unsigned required_channels
= util_last_bit(info
->input_usage_mask
[input_index
]);
162 LLVMValueRef fetches
[4];
163 unsigned num_fetches
;
164 unsigned fetch_stride
;
165 unsigned channels_per_fetch
;
167 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2) {
168 num_fetches
= MIN2(required_channels
, 3);
169 fetch_stride
= 1 << fix_fetch
.u
.log_size
;
170 channels_per_fetch
= 1;
174 channels_per_fetch
= required_channels
;
177 for (unsigned i
= 0; i
< num_fetches
; ++i
) {
178 LLVMValueRef voffset
= LLVMConstInt(ctx
->ac
.i32
, fetch_stride
* i
, 0);
179 fetches
[i
] = ac_build_buffer_load_format(&ctx
->ac
, vb_desc
, vertex_index
, voffset
,
180 channels_per_fetch
, 0, true);
183 if (num_fetches
== 1 && channels_per_fetch
> 1) {
184 LLVMValueRef fetch
= fetches
[0];
185 for (unsigned i
= 0; i
< channels_per_fetch
; ++i
) {
186 tmp
= LLVMConstInt(ctx
->ac
.i32
, i
, false);
187 fetches
[i
] = LLVMBuildExtractElement(
188 ctx
->ac
.builder
, fetch
, tmp
, "");
190 num_fetches
= channels_per_fetch
;
191 channels_per_fetch
= 1;
194 for (unsigned i
= num_fetches
; i
< 4; ++i
)
195 fetches
[i
] = LLVMGetUndef(ctx
->ac
.f32
);
197 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2 &&
198 required_channels
== 4) {
199 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_UINT
|| fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
)
200 fetches
[3] = ctx
->ac
.i32_1
;
202 fetches
[3] = ctx
->ac
.f32_1
;
203 } else if (fix_fetch
.u
.log_size
== 3 &&
204 (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
||
205 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
||
206 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
) &&
207 required_channels
== 4) {
208 /* For 2_10_10_10, the hardware returns an unsigned value;
209 * convert it to a signed one.
211 LLVMValueRef tmp
= fetches
[3];
212 LLVMValueRef c30
= LLVMConstInt(ctx
->ac
.i32
, 30, 0);
214 /* First, recover the sign-extended signed integer value. */
215 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
)
216 tmp
= LLVMBuildFPToUI(ctx
->ac
.builder
, tmp
, ctx
->ac
.i32
, "");
218 tmp
= ac_to_integer(&ctx
->ac
, tmp
);
220 /* For the integer-like cases, do a natural sign extension.
222 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
223 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
226 tmp
= LLVMBuildShl(ctx
->ac
.builder
, tmp
,
227 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
?
228 LLVMConstInt(ctx
->ac
.i32
, 7, 0) : c30
, "");
229 tmp
= LLVMBuildAShr(ctx
->ac
.builder
, tmp
, c30
, "");
231 /* Convert back to the right type. */
232 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
) {
234 LLVMValueRef neg_one
= LLVMConstReal(ctx
->ac
.f32
, -1.0);
235 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
236 clamp
= LLVMBuildFCmp(ctx
->ac
.builder
, LLVMRealULT
, tmp
, neg_one
, "");
237 tmp
= LLVMBuildSelect(ctx
->ac
.builder
, clamp
, neg_one
, tmp
, "");
238 } else if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
) {
239 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
245 for (unsigned i
= 0; i
< 4; ++i
)
246 out
[i
] = ac_to_float(&ctx
->ac
, fetches
[i
]);
249 static void declare_input_vs(struct si_shader_context
*ctx
, unsigned input_index
)
251 LLVMValueRef input
[4];
253 load_input_vs(ctx
, input_index
/ 4, input
);
255 for (unsigned chan
= 0; chan
< 4; chan
++) {
256 ctx
->inputs
[input_index
+ chan
] =
257 LLVMBuildBitCast(ctx
->ac
.builder
, input
[chan
], ctx
->ac
.i32
, "");
261 void si_llvm_load_vs_inputs(struct si_shader_context
*ctx
, struct nir_shader
*nir
)
263 uint64_t processed_inputs
= 0;
265 nir_foreach_variable(variable
, &nir
->inputs
) {
266 unsigned attrib_count
= glsl_count_attribute_slots(variable
->type
,
268 unsigned input_idx
= variable
->data
.driver_location
;
269 unsigned loc
= variable
->data
.location
;
271 for (unsigned i
= 0; i
< attrib_count
; i
++) {
272 /* Packed components share the same location so skip
273 * them if we have already processed the location.
275 if (processed_inputs
& ((uint64_t)1 << (loc
+ i
))) {
280 declare_input_vs(ctx
, input_idx
);
281 if (glsl_type_is_dual_slot(variable
->type
)) {
283 declare_input_vs(ctx
, input_idx
);
286 processed_inputs
|= ((uint64_t)1 << (loc
+ i
));
292 void si_llvm_streamout_store_output(struct si_shader_context
*ctx
,
293 LLVMValueRef
const *so_buffers
,
294 LLVMValueRef
const *so_write_offsets
,
295 struct pipe_stream_output
*stream_out
,
296 struct si_shader_output_values
*shader_out
)
298 unsigned buf_idx
= stream_out
->output_buffer
;
299 unsigned start
= stream_out
->start_component
;
300 unsigned num_comps
= stream_out
->num_components
;
303 assert(num_comps
&& num_comps
<= 4);
304 if (!num_comps
|| num_comps
> 4)
307 /* Load the output as int. */
308 for (int j
= 0; j
< num_comps
; j
++) {
309 assert(stream_out
->stream
== shader_out
->vertex_stream
[start
+ j
]);
311 out
[j
] = ac_to_integer(&ctx
->ac
, shader_out
->values
[start
+ j
]);
314 /* Pack the output. */
315 LLVMValueRef vdata
= NULL
;
321 case 2: /* as v2i32 */
322 case 3: /* as v3i32 */
323 if (ac_has_vec3_support(ctx
->screen
->info
.chip_class
, false)) {
324 vdata
= ac_build_gather_values(&ctx
->ac
, out
, num_comps
);
327 /* as v4i32 (aligned to 4) */
328 out
[3] = LLVMGetUndef(ctx
->ac
.i32
);
330 case 4: /* as v4i32 */
331 vdata
= ac_build_gather_values(&ctx
->ac
, out
, util_next_power_of_two(num_comps
));
335 ac_build_buffer_store_dword(&ctx
->ac
, so_buffers
[buf_idx
],
337 so_write_offsets
[buf_idx
],
339 stream_out
->dst_offset
* 4, ac_glc
| ac_slc
);
343 * Write streamout data to buffers for vertex stream @p stream (different
344 * vertex streams can occur for GS copy shaders).
346 void si_llvm_emit_streamout(struct si_shader_context
*ctx
,
347 struct si_shader_output_values
*outputs
,
348 unsigned noutput
, unsigned stream
)
350 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
351 struct pipe_stream_output_info
*so
= &sel
->so
;
352 LLVMBuilderRef builder
= ctx
->ac
.builder
;
355 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
356 LLVMValueRef so_vtx_count
=
357 si_unpack_param(ctx
, ctx
->streamout_config
, 16, 7);
359 LLVMValueRef tid
= ac_get_thread_id(&ctx
->ac
);
361 /* can_emit = tid < so_vtx_count; */
362 LLVMValueRef can_emit
=
363 LLVMBuildICmp(builder
, LLVMIntULT
, tid
, so_vtx_count
, "");
365 /* Emit the streamout code conditionally. This actually avoids
366 * out-of-bounds buffer access. The hw tells us via the SGPR
367 * (so_vtx_count) which threads are allowed to emit streamout data. */
368 ac_build_ifcc(&ctx
->ac
, can_emit
, 6501);
370 /* The buffer offset is computed as follows:
371 * ByteOffset = streamout_offset[buffer_id]*4 +
372 * (streamout_write_index + thread_id)*stride[buffer_id] +
376 LLVMValueRef so_write_index
=
378 ctx
->streamout_write_index
);
380 /* Compute (streamout_write_index + thread_id). */
381 so_write_index
= LLVMBuildAdd(builder
, so_write_index
, tid
, "");
383 /* Load the descriptor and compute the write offset for each
385 LLVMValueRef so_write_offset
[4] = {};
386 LLVMValueRef so_buffers
[4];
387 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
,
390 for (i
= 0; i
< 4; i
++) {
394 LLVMValueRef offset
= LLVMConstInt(ctx
->ac
.i32
,
395 SI_VS_STREAMOUT_BUF0
+ i
, 0);
397 so_buffers
[i
] = ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
399 LLVMValueRef so_offset
= ac_get_arg(&ctx
->ac
,
400 ctx
->streamout_offset
[i
]);
401 so_offset
= LLVMBuildMul(builder
, so_offset
, LLVMConstInt(ctx
->ac
.i32
, 4, 0), "");
403 so_write_offset
[i
] = ac_build_imad(&ctx
->ac
, so_write_index
,
404 LLVMConstInt(ctx
->ac
.i32
, so
->stride
[i
]*4, 0),
408 /* Write streamout data. */
409 for (i
= 0; i
< so
->num_outputs
; i
++) {
410 unsigned reg
= so
->output
[i
].register_index
;
415 if (stream
!= so
->output
[i
].stream
)
418 si_llvm_streamout_store_output(ctx
, so_buffers
, so_write_offset
,
419 &so
->output
[i
], &outputs
[reg
]);
422 ac_build_endif(&ctx
->ac
, 6501);
425 static void si_llvm_emit_clipvertex(struct si_shader_context
*ctx
,
426 struct ac_export_args
*pos
, LLVMValueRef
*out_elts
)
431 LLVMValueRef base_elt
;
432 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
433 LLVMValueRef constbuf_index
= LLVMConstInt(ctx
->ac
.i32
,
434 SI_VS_CONST_CLIP_PLANES
, 0);
435 LLVMValueRef const_resource
= ac_build_load_to_sgpr(&ctx
->ac
, ptr
, constbuf_index
);
437 for (reg_index
= 0; reg_index
< 2; reg_index
++) {
438 struct ac_export_args
*args
= &pos
[2 + reg_index
];
443 args
->out
[3] = LLVMConstReal(ctx
->ac
.f32
, 0.0f
);
445 /* Compute dot products of position and user clip plane vectors */
446 for (chan
= 0; chan
< 4; chan
++) {
447 for (const_chan
= 0; const_chan
< 4; const_chan
++) {
449 LLVMConstInt(ctx
->ac
.i32
, ((reg_index
* 4 + chan
) * 4 +
451 base_elt
= si_buffer_load_const(ctx
, const_resource
,
453 args
->out
[chan
] = ac_build_fmad(&ctx
->ac
, base_elt
,
454 out_elts
[const_chan
], args
->out
[chan
]);
458 args
->enabled_channels
= 0xf;
459 args
->valid_mask
= 0;
461 args
->target
= V_008DFC_SQ_EXP_POS
+ 2 + reg_index
;
466 /* Initialize arguments for the shader export intrinsic */
467 static void si_llvm_init_vs_export_args(struct si_shader_context
*ctx
,
468 LLVMValueRef
*values
,
470 struct ac_export_args
*args
)
472 args
->enabled_channels
= 0xf; /* writemask - default is 0xf */
473 args
->valid_mask
= 0; /* Specify whether the EXEC mask represents the valid mask */
474 args
->done
= 0; /* Specify whether this is the last export */
475 args
->target
= target
; /* Specify the target we are exporting */
478 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
481 static void si_export_param(struct si_shader_context
*ctx
, unsigned index
,
482 LLVMValueRef
*values
)
484 struct ac_export_args args
;
486 si_llvm_init_vs_export_args(ctx
, values
,
487 V_008DFC_SQ_EXP_PARAM
+ index
, &args
);
488 ac_build_export(&ctx
->ac
, &args
);
491 static void si_build_param_exports(struct si_shader_context
*ctx
,
492 struct si_shader_output_values
*outputs
,
495 struct si_shader
*shader
= ctx
->shader
;
496 unsigned param_count
= 0;
498 for (unsigned i
= 0; i
< noutput
; i
++) {
499 unsigned semantic_name
= outputs
[i
].semantic_name
;
500 unsigned semantic_index
= outputs
[i
].semantic_index
;
502 if (outputs
[i
].vertex_stream
[0] != 0 &&
503 outputs
[i
].vertex_stream
[1] != 0 &&
504 outputs
[i
].vertex_stream
[2] != 0 &&
505 outputs
[i
].vertex_stream
[3] != 0)
508 switch (semantic_name
) {
509 case TGSI_SEMANTIC_LAYER
:
510 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
511 case TGSI_SEMANTIC_CLIPDIST
:
512 case TGSI_SEMANTIC_COLOR
:
513 case TGSI_SEMANTIC_BCOLOR
:
514 case TGSI_SEMANTIC_PRIMID
:
515 case TGSI_SEMANTIC_FOG
:
516 case TGSI_SEMANTIC_TEXCOORD
:
517 case TGSI_SEMANTIC_GENERIC
:
523 if ((semantic_name
!= TGSI_SEMANTIC_GENERIC
||
524 semantic_index
< SI_MAX_IO_GENERIC
) &&
525 shader
->key
.opt
.kill_outputs
&
526 (1ull << si_shader_io_get_unique_index(semantic_name
,
527 semantic_index
, true)))
530 si_export_param(ctx
, param_count
, outputs
[i
].values
);
532 assert(i
< ARRAY_SIZE(shader
->info
.vs_output_param_offset
));
533 shader
->info
.vs_output_param_offset
[i
] = param_count
++;
536 shader
->info
.nr_param_exports
= param_count
;
540 * Vertex color clamping.
542 * This uses a state constant loaded in a user data SGPR and
543 * an IF statement is added that clamps all colors if the constant
546 static void si_vertex_color_clamping(struct si_shader_context
*ctx
,
547 struct si_shader_output_values
*outputs
,
550 LLVMValueRef addr
[SI_MAX_VS_OUTPUTS
][4];
551 bool has_colors
= false;
553 /* Store original colors to alloca variables. */
554 for (unsigned i
= 0; i
< noutput
; i
++) {
555 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
556 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
559 for (unsigned j
= 0; j
< 4; j
++) {
560 addr
[i
][j
] = ac_build_alloca_undef(&ctx
->ac
, ctx
->ac
.f32
, "");
561 LLVMBuildStore(ctx
->ac
.builder
, outputs
[i
].values
[j
], addr
[i
][j
]);
569 /* The state is in the first bit of the user SGPR. */
570 LLVMValueRef cond
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
571 cond
= LLVMBuildTrunc(ctx
->ac
.builder
, cond
, ctx
->ac
.i1
, "");
573 ac_build_ifcc(&ctx
->ac
, cond
, 6502);
575 /* Store clamped colors to alloca variables within the conditional block. */
576 for (unsigned i
= 0; i
< noutput
; i
++) {
577 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
578 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
581 for (unsigned j
= 0; j
< 4; j
++) {
582 LLVMBuildStore(ctx
->ac
.builder
,
583 ac_build_clamp(&ctx
->ac
, outputs
[i
].values
[j
]),
587 ac_build_endif(&ctx
->ac
, 6502);
589 /* Load clamped colors */
590 for (unsigned i
= 0; i
< noutput
; i
++) {
591 if (outputs
[i
].semantic_name
!= TGSI_SEMANTIC_COLOR
&&
592 outputs
[i
].semantic_name
!= TGSI_SEMANTIC_BCOLOR
)
595 for (unsigned j
= 0; j
< 4; j
++) {
596 outputs
[i
].values
[j
] =
597 LLVMBuildLoad(ctx
->ac
.builder
, addr
[i
][j
], "");
602 /* Generate export instructions for hardware VS shader stage or NGG GS stage
603 * (position and parameter data only).
605 void si_llvm_build_vs_exports(struct si_shader_context
*ctx
,
606 struct si_shader_output_values
*outputs
,
609 struct si_shader
*shader
= ctx
->shader
;
610 struct ac_export_args pos_args
[4] = {};
611 LLVMValueRef psize_value
= NULL
, edgeflag_value
= NULL
, layer_value
= NULL
, viewport_index_value
= NULL
;
615 si_vertex_color_clamping(ctx
, outputs
, noutput
);
617 /* Build position exports. */
618 for (i
= 0; i
< noutput
; i
++) {
619 switch (outputs
[i
].semantic_name
) {
620 case TGSI_SEMANTIC_POSITION
:
621 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
,
622 V_008DFC_SQ_EXP_POS
, &pos_args
[0]);
624 case TGSI_SEMANTIC_PSIZE
:
625 psize_value
= outputs
[i
].values
[0];
627 case TGSI_SEMANTIC_LAYER
:
628 layer_value
= outputs
[i
].values
[0];
630 case TGSI_SEMANTIC_VIEWPORT_INDEX
:
631 viewport_index_value
= outputs
[i
].values
[0];
633 case TGSI_SEMANTIC_EDGEFLAG
:
634 edgeflag_value
= outputs
[i
].values
[0];
636 case TGSI_SEMANTIC_CLIPDIST
:
637 if (!shader
->key
.opt
.clip_disable
) {
638 unsigned index
= 2 + outputs
[i
].semantic_index
;
639 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
,
640 V_008DFC_SQ_EXP_POS
+ index
,
644 case TGSI_SEMANTIC_CLIPVERTEX
:
645 if (!shader
->key
.opt
.clip_disable
) {
646 si_llvm_emit_clipvertex(ctx
, pos_args
,
653 /* We need to add the position output manually if it's missing. */
654 if (!pos_args
[0].out
[0]) {
655 pos_args
[0].enabled_channels
= 0xf; /* writemask */
656 pos_args
[0].valid_mask
= 0; /* EXEC mask */
657 pos_args
[0].done
= 0; /* last export? */
658 pos_args
[0].target
= V_008DFC_SQ_EXP_POS
;
659 pos_args
[0].compr
= 0; /* COMPR flag */
660 pos_args
[0].out
[0] = ctx
->ac
.f32_0
; /* X */
661 pos_args
[0].out
[1] = ctx
->ac
.f32_0
; /* Y */
662 pos_args
[0].out
[2] = ctx
->ac
.f32_0
; /* Z */
663 pos_args
[0].out
[3] = ctx
->ac
.f32_1
; /* W */
666 bool pos_writes_edgeflag
= shader
->selector
->info
.writes_edgeflag
&&
669 /* Write the misc vector (point size, edgeflag, layer, viewport). */
670 if (shader
->selector
->info
.writes_psize
||
671 pos_writes_edgeflag
||
672 shader
->selector
->info
.writes_viewport_index
||
673 shader
->selector
->info
.writes_layer
) {
674 pos_args
[1].enabled_channels
= shader
->selector
->info
.writes_psize
|
675 (pos_writes_edgeflag
<< 1) |
676 (shader
->selector
->info
.writes_layer
<< 2);
678 pos_args
[1].valid_mask
= 0; /* EXEC mask */
679 pos_args
[1].done
= 0; /* last export? */
680 pos_args
[1].target
= V_008DFC_SQ_EXP_POS
+ 1;
681 pos_args
[1].compr
= 0; /* COMPR flag */
682 pos_args
[1].out
[0] = ctx
->ac
.f32_0
; /* X */
683 pos_args
[1].out
[1] = ctx
->ac
.f32_0
; /* Y */
684 pos_args
[1].out
[2] = ctx
->ac
.f32_0
; /* Z */
685 pos_args
[1].out
[3] = ctx
->ac
.f32_0
; /* W */
687 if (shader
->selector
->info
.writes_psize
)
688 pos_args
[1].out
[0] = psize_value
;
690 if (pos_writes_edgeflag
) {
691 /* The output is a float, but the hw expects an integer
692 * with the first bit containing the edge flag. */
693 edgeflag_value
= LLVMBuildFPToUI(ctx
->ac
.builder
,
696 edgeflag_value
= ac_build_umin(&ctx
->ac
,
700 /* The LLVM intrinsic expects a float. */
701 pos_args
[1].out
[1] = ac_to_float(&ctx
->ac
, edgeflag_value
);
704 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
705 /* GFX9 has the layer in out.z[10:0] and the viewport
706 * index in out.z[19:16].
708 if (shader
->selector
->info
.writes_layer
)
709 pos_args
[1].out
[2] = layer_value
;
711 if (shader
->selector
->info
.writes_viewport_index
) {
712 LLVMValueRef v
= viewport_index_value
;
714 v
= ac_to_integer(&ctx
->ac
, v
);
715 v
= LLVMBuildShl(ctx
->ac
.builder
, v
,
716 LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
717 v
= LLVMBuildOr(ctx
->ac
.builder
, v
,
718 ac_to_integer(&ctx
->ac
, pos_args
[1].out
[2]), "");
719 pos_args
[1].out
[2] = ac_to_float(&ctx
->ac
, v
);
720 pos_args
[1].enabled_channels
|= 1 << 2;
723 if (shader
->selector
->info
.writes_layer
)
724 pos_args
[1].out
[2] = layer_value
;
726 if (shader
->selector
->info
.writes_viewport_index
) {
727 pos_args
[1].out
[3] = viewport_index_value
;
728 pos_args
[1].enabled_channels
|= 1 << 3;
733 for (i
= 0; i
< 4; i
++)
734 if (pos_args
[i
].out
[0])
735 shader
->info
.nr_pos_exports
++;
737 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
738 * Setting valid_mask=1 prevents it and has no other effect.
740 if (ctx
->screen
->info
.family
== CHIP_NAVI10
||
741 ctx
->screen
->info
.family
== CHIP_NAVI12
||
742 ctx
->screen
->info
.family
== CHIP_NAVI14
)
743 pos_args
[0].valid_mask
= 1;
746 for (i
= 0; i
< 4; i
++) {
747 if (!pos_args
[i
].out
[0])
750 /* Specify the target we are exporting */
751 pos_args
[i
].target
= V_008DFC_SQ_EXP_POS
+ pos_idx
++;
753 if (pos_idx
== shader
->info
.nr_pos_exports
)
754 /* Specify that this is the last export */
755 pos_args
[i
].done
= 1;
757 ac_build_export(&ctx
->ac
, &pos_args
[i
]);
760 /* Build parameter exports. */
761 si_build_param_exports(ctx
, outputs
, noutput
);
764 void si_llvm_emit_vs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
,
767 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
768 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
769 struct si_shader_output_values
*outputs
= NULL
;
772 assert(!ctx
->shader
->is_gs_copy_shader
);
773 assert(info
->num_outputs
<= max_outputs
);
775 outputs
= MALLOC((info
->num_outputs
+ 1) * sizeof(outputs
[0]));
777 for (i
= 0; i
< info
->num_outputs
; i
++) {
778 outputs
[i
].semantic_name
= info
->output_semantic_name
[i
];
779 outputs
[i
].semantic_index
= info
->output_semantic_index
[i
];
781 for (j
= 0; j
< 4; j
++) {
782 outputs
[i
].values
[j
] =
783 LLVMBuildLoad(ctx
->ac
.builder
,
786 outputs
[i
].vertex_stream
[j
] =
787 (info
->output_streams
[i
] >> (2 * j
)) & 3;
791 if (!ctx
->screen
->use_ngg_streamout
&&
792 ctx
->shader
->selector
->so
.num_outputs
)
793 si_llvm_emit_streamout(ctx
, outputs
, i
, 0);
795 /* Export PrimitiveID. */
796 if (ctx
->shader
->key
.mono
.u
.vs_export_prim_id
) {
797 outputs
[i
].semantic_name
= TGSI_SEMANTIC_PRIMID
;
798 outputs
[i
].semantic_index
= 0;
799 outputs
[i
].values
[0] = ac_to_float(&ctx
->ac
, si_get_primitive_id(ctx
, 0));
800 for (j
= 1; j
< 4; j
++)
801 outputs
[i
].values
[j
] = LLVMConstReal(ctx
->ac
.f32
, 0);
803 memset(outputs
[i
].vertex_stream
, 0,
804 sizeof(outputs
[i
].vertex_stream
));
808 si_llvm_build_vs_exports(ctx
, outputs
, i
);
812 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi
*abi
,
813 unsigned max_outputs
,
816 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
817 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
818 LLVMValueRef pos
[4] = {};
820 assert(info
->num_outputs
<= max_outputs
);
822 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
823 if (info
->output_semantic_name
[i
] != TGSI_SEMANTIC_POSITION
)
826 for (unsigned chan
= 0; chan
< 4; chan
++)
827 pos
[chan
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
830 assert(pos
[0] != NULL
);
832 /* Return the position output. */
833 LLVMValueRef ret
= ctx
->return_value
;
834 for (unsigned chan
= 0; chan
< 4; chan
++)
835 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, pos
[chan
], chan
, "");
836 ctx
->return_value
= ret
;
840 * Build the vertex shader prolog function.
842 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
843 * All inputs are returned unmodified. The vertex load indices are
844 * stored after them, which will be used by the API VS for fetching inputs.
846 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
851 * (VertexID + BaseVertex),
852 * (InstanceID + StartInstance),
853 * (InstanceID / 2 + StartInstance)
855 void si_llvm_build_vs_prolog(struct si_shader_context
*ctx
,
856 union si_shader_part_key
*key
)
858 LLVMTypeRef
*returns
;
859 LLVMValueRef ret
, func
;
861 unsigned first_vs_vgpr
= key
->vs_prolog
.num_merged_next_stage_vgprs
;
862 unsigned num_input_vgprs
= key
->vs_prolog
.num_merged_next_stage_vgprs
+ 4 +
863 (key
->vs_prolog
.has_ngg_cull_inputs
? 1 : 0);
864 struct ac_arg input_sgpr_param
[key
->vs_prolog
.num_input_sgprs
];
865 struct ac_arg input_vgpr_param
[10];
866 LLVMValueRef input_vgprs
[10];
867 unsigned num_all_input_regs
= key
->vs_prolog
.num_input_sgprs
+
869 unsigned user_sgpr_base
= key
->vs_prolog
.num_merged_next_stage_vgprs
? 8 : 0;
871 memset(&ctx
->args
, 0, sizeof(ctx
->args
));
873 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
874 returns
= alloca((num_all_input_regs
+ key
->vs_prolog
.num_inputs
) *
875 sizeof(LLVMTypeRef
));
878 /* Declare input and output SGPRs. */
879 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
880 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
,
881 &input_sgpr_param
[i
]);
882 returns
[num_returns
++] = ctx
->ac
.i32
;
885 struct ac_arg merged_wave_info
= input_sgpr_param
[3];
887 /* Preloaded VGPRs (outputs must be floats) */
888 for (i
= 0; i
< num_input_vgprs
; i
++) {
889 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &input_vgpr_param
[i
]);
890 returns
[num_returns
++] = ctx
->ac
.f32
;
893 /* Vertex load indices. */
894 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++)
895 returns
[num_returns
++] = ctx
->ac
.f32
;
897 /* Create the function. */
898 si_llvm_create_func(ctx
, "vs_prolog", returns
, num_returns
, 0);
901 for (i
= 0; i
< num_input_vgprs
; i
++) {
902 input_vgprs
[i
] = ac_get_arg(&ctx
->ac
, input_vgpr_param
[i
]);
905 if (key
->vs_prolog
.num_merged_next_stage_vgprs
) {
906 if (!key
->vs_prolog
.is_monolithic
)
907 si_init_exec_from_input(ctx
, merged_wave_info
, 0);
909 if (key
->vs_prolog
.as_ls
&&
910 ctx
->screen
->info
.has_ls_vgpr_init_bug
) {
911 /* If there are no HS threads, SPI loads the LS VGPRs
912 * starting at VGPR 0. Shift them back to where they
915 LLVMValueRef has_hs_threads
=
916 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
,
917 si_unpack_param(ctx
, input_sgpr_param
[3], 8, 8),
920 for (i
= 4; i
> 0; --i
) {
922 LLVMBuildSelect(ctx
->ac
.builder
, has_hs_threads
,
924 input_vgprs
[i
- 1], "");
929 if (key
->vs_prolog
.gs_fast_launch_tri_list
||
930 key
->vs_prolog
.gs_fast_launch_tri_strip
) {
931 LLVMValueRef wave_id
, thread_id_in_tg
;
933 wave_id
= si_unpack_param(ctx
, input_sgpr_param
[3], 24, 4);
934 thread_id_in_tg
= ac_build_imad(&ctx
->ac
, wave_id
,
935 LLVMConstInt(ctx
->ac
.i32
, ctx
->ac
.wave_size
, false),
936 ac_get_thread_id(&ctx
->ac
));
938 /* The GS fast launch initializes all VGPRs to the value of
939 * the first thread, so we have to add the thread ID.
941 * Only these are initialized by the hw:
942 * VGPR2: Base Primitive ID
943 * VGPR5: Base Vertex ID
947 /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
948 * The NGG cull shader will read them from there.
950 if (key
->vs_prolog
.gs_fast_launch_tri_list
) {
951 input_vgprs
[0] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx01_offset */
952 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 0 */
953 LLVMConstInt(ctx
->ac
.i32
, 0, 0));
954 input_vgprs
[1] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx23_offset */
955 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 1 */
956 LLVMConstInt(ctx
->ac
.i32
, 1, 0));
957 input_vgprs
[4] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx45_offset */
958 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 2 */
959 LLVMConstInt(ctx
->ac
.i32
, 2, 0));
961 assert(key
->vs_prolog
.gs_fast_launch_tri_strip
);
962 LLVMBuilderRef builder
= ctx
->ac
.builder
;
963 /* Triangle indices: */
964 LLVMValueRef index
[3] = {
966 LLVMBuildAdd(builder
, thread_id_in_tg
,
967 LLVMConstInt(ctx
->ac
.i32
, 1, 0), ""),
968 LLVMBuildAdd(builder
, thread_id_in_tg
,
969 LLVMConstInt(ctx
->ac
.i32
, 2, 0), ""),
971 LLVMValueRef is_odd
= LLVMBuildTrunc(ctx
->ac
.builder
,
972 thread_id_in_tg
, ctx
->ac
.i1
, "");
973 LLVMValueRef flatshade_first
=
974 LLVMBuildICmp(builder
, LLVMIntEQ
,
975 si_unpack_param(ctx
, ctx
->vs_state_bits
, 4, 2),
978 ac_build_triangle_strip_indices_to_triangle(&ctx
->ac
, is_odd
,
979 flatshade_first
, index
);
980 input_vgprs
[0] = index
[0];
981 input_vgprs
[1] = index
[1];
982 input_vgprs
[4] = index
[2];
985 /* Triangles always have all edge flags set initially. */
986 input_vgprs
[3] = LLVMConstInt(ctx
->ac
.i32
, 0x7 << 8, 0);
988 input_vgprs
[2] = LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[2],
989 thread_id_in_tg
, ""); /* PrimID */
990 input_vgprs
[5] = LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[5],
991 thread_id_in_tg
, ""); /* VertexID */
992 input_vgprs
[8] = input_vgprs
[6]; /* InstanceID */
995 unsigned vertex_id_vgpr
= first_vs_vgpr
;
996 unsigned instance_id_vgpr
=
997 ctx
->screen
->info
.chip_class
>= GFX10
?
999 first_vs_vgpr
+ (key
->vs_prolog
.as_ls
? 2 : 1);
1001 ctx
->abi
.vertex_id
= input_vgprs
[vertex_id_vgpr
];
1002 ctx
->abi
.instance_id
= input_vgprs
[instance_id_vgpr
];
1004 /* InstanceID = VertexID >> 16;
1005 * VertexID = VertexID & 0xffff;
1007 if (key
->vs_prolog
.states
.unpack_instance_id_from_vertex_id
) {
1008 ctx
->abi
.instance_id
= LLVMBuildLShr(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
1009 LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
1010 ctx
->abi
.vertex_id
= LLVMBuildAnd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
1011 LLVMConstInt(ctx
->ac
.i32
, 0xffff, 0), "");
1014 /* Copy inputs to outputs. This should be no-op, as the registers match,
1015 * but it will prevent the compiler from overwriting them unintentionally.
1017 ret
= ctx
->return_value
;
1018 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
1019 LLVMValueRef p
= LLVMGetParam(func
, i
);
1020 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, i
, "");
1022 for (i
= 0; i
< num_input_vgprs
; i
++) {
1023 LLVMValueRef p
= input_vgprs
[i
];
1025 if (i
== vertex_id_vgpr
)
1026 p
= ctx
->abi
.vertex_id
;
1027 else if (i
== instance_id_vgpr
)
1028 p
= ctx
->abi
.instance_id
;
1030 p
= ac_to_float(&ctx
->ac
, p
);
1031 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
,
1032 key
->vs_prolog
.num_input_sgprs
+ i
, "");
1035 /* Compute vertex load indices from instance divisors. */
1036 LLVMValueRef instance_divisor_constbuf
= NULL
;
1038 if (key
->vs_prolog
.states
.instance_divisor_is_fetched
) {
1039 LLVMValueRef list
= si_prolog_get_rw_buffers(ctx
);
1040 LLVMValueRef buf_index
=
1041 LLVMConstInt(ctx
->ac
.i32
, SI_VS_CONST_INSTANCE_DIVISORS
, 0);
1042 instance_divisor_constbuf
=
1043 ac_build_load_to_sgpr(&ctx
->ac
, list
, buf_index
);
1046 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++) {
1047 bool divisor_is_one
=
1048 key
->vs_prolog
.states
.instance_divisor_is_one
& (1u << i
);
1049 bool divisor_is_fetched
=
1050 key
->vs_prolog
.states
.instance_divisor_is_fetched
& (1u << i
);
1051 LLVMValueRef index
= NULL
;
1053 if (divisor_is_one
) {
1054 index
= ctx
->abi
.instance_id
;
1055 } else if (divisor_is_fetched
) {
1056 LLVMValueRef udiv_factors
[4];
1058 for (unsigned j
= 0; j
< 4; j
++) {
1060 si_buffer_load_const(ctx
, instance_divisor_constbuf
,
1061 LLVMConstInt(ctx
->ac
.i32
, i
*16 + j
*4, 0));
1062 udiv_factors
[j
] = ac_to_integer(&ctx
->ac
, udiv_factors
[j
]);
1064 /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
1065 * Such InstanceID might not be achievable in a reasonable time though.
1067 index
= ac_build_fast_udiv_nuw(&ctx
->ac
, ctx
->abi
.instance_id
,
1068 udiv_factors
[0], udiv_factors
[1],
1069 udiv_factors
[2], udiv_factors
[3]);
1072 if (divisor_is_one
|| divisor_is_fetched
) {
1073 /* Add StartInstance. */
1074 index
= LLVMBuildAdd(ctx
->ac
.builder
, index
,
1075 LLVMGetParam(ctx
->main_fn
, user_sgpr_base
+
1076 SI_SGPR_START_INSTANCE
), "");
1078 /* VertexID + BaseVertex */
1079 index
= LLVMBuildAdd(ctx
->ac
.builder
,
1081 LLVMGetParam(func
, user_sgpr_base
+
1082 SI_SGPR_BASE_VERTEX
), "");
1085 index
= ac_to_float(&ctx
->ac
, index
);
1086 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, index
,
1087 ctx
->args
.arg_count
+ i
, "");
1090 si_llvm_build_ret(ctx
, ret
);
1093 static LLVMValueRef
get_base_vertex(struct ac_shader_abi
*abi
)
1095 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
1097 /* For non-indexed draws, the base vertex set by the driver
1098 * (for direct draws) or the CP (for indirect draws) is the
1099 * first vertex ID, but GLSL expects 0 to be returned.
1101 LLVMValueRef vs_state
= ac_get_arg(&ctx
->ac
,
1102 ctx
->vs_state_bits
);
1103 LLVMValueRef indexed
;
1105 indexed
= LLVMBuildLShr(ctx
->ac
.builder
, vs_state
, ctx
->ac
.i32_1
, "");
1106 indexed
= LLVMBuildTrunc(ctx
->ac
.builder
, indexed
, ctx
->ac
.i1
, "");
1108 return LLVMBuildSelect(ctx
->ac
.builder
, indexed
,
1109 ac_get_arg(&ctx
->ac
, ctx
->args
.base_vertex
),
1113 void si_llvm_init_vs_callbacks(struct si_shader_context
*ctx
, bool ngg_cull_shader
)
1115 struct si_shader
*shader
= ctx
->shader
;
1117 if (shader
->key
.as_ls
)
1118 ctx
->abi
.emit_outputs
= si_llvm_emit_ls_epilogue
;
1119 else if (shader
->key
.as_es
)
1120 ctx
->abi
.emit_outputs
= si_llvm_emit_es_epilogue
;
1121 else if (shader
->key
.opt
.vs_as_prim_discard_cs
)
1122 ctx
->abi
.emit_outputs
= si_llvm_emit_prim_discard_cs_epilogue
;
1123 else if (ngg_cull_shader
)
1124 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_culling_epilogue_4x_wave32
;
1125 else if (shader
->key
.as_ngg
)
1126 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_epilogue
;
1128 ctx
->abi
.emit_outputs
= si_llvm_emit_vs_epilogue
;
1130 ctx
->abi
.load_base_vertex
= get_base_vertex
;