/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_shader_internal.h"
#include "util/u_memory.h"

30 static LLVMValueRef
unpack_sint16(struct si_shader_context
*ctx
, LLVMValueRef i32
, unsigned index
)
35 return LLVMBuildAShr(ctx
->ac
.builder
, i32
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
37 return LLVMBuildSExt(ctx
->ac
.builder
, LLVMBuildTrunc(ctx
->ac
.builder
, i32
, ctx
->ac
.i16
, ""),
41 static void load_input_vs(struct si_shader_context
*ctx
, unsigned input_index
, LLVMValueRef out
[4])
43 const struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
44 unsigned vs_blit_property
= info
->base
.vs
.blit_sgprs_amd
;
46 if (vs_blit_property
) {
47 LLVMValueRef vertex_id
= ctx
->abi
.vertex_id
;
49 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntULE
, vertex_id
, ctx
->ac
.i32_1
, "");
50 /* Use LLVMIntNE, because we have 3 vertices and only
51 * the middle one should use y2.
53 LLVMValueRef sel_y1
= LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
, vertex_id
, ctx
->ac
.i32_1
, "");
55 unsigned param_vs_blit_inputs
= ctx
->vs_blit_inputs
.arg_index
;
56 if (input_index
== 0) {
58 LLVMValueRef x1y1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
);
59 LLVMValueRef x2y2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 1);
61 LLVMValueRef x1
= unpack_sint16(ctx
, x1y1
, 0);
62 LLVMValueRef y1
= unpack_sint16(ctx
, x1y1
, 1);
63 LLVMValueRef x2
= unpack_sint16(ctx
, x2y2
, 0);
64 LLVMValueRef y2
= unpack_sint16(ctx
, x2y2
, 1);
66 LLVMValueRef x
= LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
, x1
, x2
, "");
67 LLVMValueRef y
= LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
, y1
, y2
, "");
69 out
[0] = LLVMBuildSIToFP(ctx
->ac
.builder
, x
, ctx
->ac
.f32
, "");
70 out
[1] = LLVMBuildSIToFP(ctx
->ac
.builder
, y
, ctx
->ac
.f32
, "");
71 out
[2] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 2);
72 out
[3] = ctx
->ac
.f32_1
;
76 /* Color or texture coordinates: */
77 assert(input_index
== 1);
79 if (vs_blit_property
== SI_VS_BLIT_SGPRS_POS_COLOR
) {
80 for (int i
= 0; i
< 4; i
++) {
81 out
[i
] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 3 + i
);
84 assert(vs_blit_property
== SI_VS_BLIT_SGPRS_POS_TEXCOORD
);
85 LLVMValueRef x1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 3);
86 LLVMValueRef y1
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 4);
87 LLVMValueRef x2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 5);
88 LLVMValueRef y2
= LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 6);
90 out
[0] = LLVMBuildSelect(ctx
->ac
.builder
, sel_x1
, x1
, x2
, "");
91 out
[1] = LLVMBuildSelect(ctx
->ac
.builder
, sel_y1
, y1
, y2
, "");
92 out
[2] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 7);
93 out
[3] = LLVMGetParam(ctx
->main_fn
, param_vs_blit_inputs
+ 8);
98 unsigned num_vbos_in_user_sgprs
= ctx
->shader
->selector
->num_vbos_in_user_sgprs
;
99 union si_vs_fix_fetch fix_fetch
;
100 LLVMValueRef vb_desc
;
101 LLVMValueRef vertex_index
;
104 if (input_index
< num_vbos_in_user_sgprs
) {
105 vb_desc
= ac_get_arg(&ctx
->ac
, ctx
->vb_descriptors
[input_index
]);
107 unsigned index
= input_index
- num_vbos_in_user_sgprs
;
108 vb_desc
= ac_build_load_to_sgpr(&ctx
->ac
, ac_get_arg(&ctx
->ac
, ctx
->vertex_buffers
),
109 LLVMConstInt(ctx
->ac
.i32
, index
, 0));
112 vertex_index
= LLVMGetParam(ctx
->main_fn
, ctx
->vertex_index0
.arg_index
+ input_index
);
114 /* Use the open-coded implementation for all loads of doubles and
115 * of dword-sized data that needs fixups. We need to insert conversion
116 * code anyway, and the amd/common code does it for us.
118 * Note: On LLVM <= 8, we can only open-code formats with
119 * channel size >= 4 bytes.
121 bool opencode
= ctx
->shader
->key
.mono
.vs_fetch_opencode
& (1 << input_index
);
122 fix_fetch
.bits
= ctx
->shader
->key
.mono
.vs_fix_fetch
[input_index
].bits
;
123 if (opencode
|| (fix_fetch
.u
.log_size
== 3 && fix_fetch
.u
.format
== AC_FETCH_FORMAT_FLOAT
) ||
124 (fix_fetch
.u
.log_size
== 2)) {
125 tmp
= ac_build_opencoded_load_format(&ctx
->ac
, fix_fetch
.u
.log_size
,
126 fix_fetch
.u
.num_channels_m1
+ 1, fix_fetch
.u
.format
,
127 fix_fetch
.u
.reverse
, !opencode
, vb_desc
, vertex_index
,
128 ctx
->ac
.i32_0
, ctx
->ac
.i32_0
, 0, true);
129 for (unsigned i
= 0; i
< 4; ++i
)
131 LLVMBuildExtractElement(ctx
->ac
.builder
, tmp
, LLVMConstInt(ctx
->ac
.i32
, i
, false), "");
135 unsigned required_channels
= util_last_bit(info
->input_usage_mask
[input_index
]);
136 if (required_channels
== 0) {
137 for (unsigned i
= 0; i
< 4; ++i
)
138 out
[i
] = LLVMGetUndef(ctx
->ac
.f32
);
142 /* Do multiple loads for special formats. */
143 LLVMValueRef fetches
[4];
144 unsigned num_fetches
;
145 unsigned fetch_stride
;
146 unsigned channels_per_fetch
;
148 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2) {
149 num_fetches
= MIN2(required_channels
, 3);
150 fetch_stride
= 1 << fix_fetch
.u
.log_size
;
151 channels_per_fetch
= 1;
155 channels_per_fetch
= required_channels
;
158 for (unsigned i
= 0; i
< num_fetches
; ++i
) {
159 LLVMValueRef voffset
= LLVMConstInt(ctx
->ac
.i32
, fetch_stride
* i
, 0);
160 fetches
[i
] = ac_build_buffer_load_format(&ctx
->ac
, vb_desc
, vertex_index
, voffset
,
161 channels_per_fetch
, 0, true, false);
164 if (num_fetches
== 1 && channels_per_fetch
> 1) {
165 LLVMValueRef fetch
= fetches
[0];
166 for (unsigned i
= 0; i
< channels_per_fetch
; ++i
) {
167 tmp
= LLVMConstInt(ctx
->ac
.i32
, i
, false);
168 fetches
[i
] = LLVMBuildExtractElement(ctx
->ac
.builder
, fetch
, tmp
, "");
170 num_fetches
= channels_per_fetch
;
171 channels_per_fetch
= 1;
174 for (unsigned i
= num_fetches
; i
< 4; ++i
)
175 fetches
[i
] = LLVMGetUndef(ctx
->ac
.f32
);
177 if (fix_fetch
.u
.log_size
<= 1 && fix_fetch
.u
.num_channels_m1
== 2 && required_channels
== 4) {
178 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_UINT
|| fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
)
179 fetches
[3] = ctx
->ac
.i32_1
;
181 fetches
[3] = ctx
->ac
.f32_1
;
182 } else if (fix_fetch
.u
.log_size
== 3 &&
183 (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
||
184 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
||
185 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SINT
) &&
186 required_channels
== 4) {
187 /* For 2_10_10_10, the hardware returns an unsigned value;
188 * convert it to a signed one.
190 LLVMValueRef tmp
= fetches
[3];
191 LLVMValueRef c30
= LLVMConstInt(ctx
->ac
.i32
, 30, 0);
193 /* First, recover the sign-extended signed integer value. */
194 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
)
195 tmp
= LLVMBuildFPToUI(ctx
->ac
.builder
, tmp
, ctx
->ac
.i32
, "");
197 tmp
= ac_to_integer(&ctx
->ac
, tmp
);
199 /* For the integer-like cases, do a natural sign extension.
201 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
202 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
206 ctx
->ac
.builder
, tmp
,
207 fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
? LLVMConstInt(ctx
->ac
.i32
, 7, 0) : c30
, "");
208 tmp
= LLVMBuildAShr(ctx
->ac
.builder
, tmp
, c30
, "");
210 /* Convert back to the right type. */
211 if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SNORM
) {
213 LLVMValueRef neg_one
= LLVMConstReal(ctx
->ac
.f32
, -1.0);
214 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
215 clamp
= LLVMBuildFCmp(ctx
->ac
.builder
, LLVMRealULT
, tmp
, neg_one
, "");
216 tmp
= LLVMBuildSelect(ctx
->ac
.builder
, clamp
, neg_one
, tmp
, "");
217 } else if (fix_fetch
.u
.format
== AC_FETCH_FORMAT_SSCALED
) {
218 tmp
= LLVMBuildSIToFP(ctx
->ac
.builder
, tmp
, ctx
->ac
.f32
, "");
224 for (unsigned i
= 0; i
< 4; ++i
)
225 out
[i
] = ac_to_float(&ctx
->ac
, fetches
[i
]);
228 void si_llvm_load_vs_inputs(struct si_shader_context
*ctx
, struct nir_shader
*nir
)
230 const struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
232 for (unsigned i
= 0; i
< info
->num_inputs
; i
++) {
233 LLVMValueRef values
[4];
235 load_input_vs(ctx
, i
, values
);
237 for (unsigned chan
= 0; chan
< 4; chan
++) {
238 ctx
->inputs
[i
* 4 + chan
] =
239 LLVMBuildBitCast(ctx
->ac
.builder
, values
[chan
], ctx
->ac
.i32
, "");
244 void si_llvm_streamout_store_output(struct si_shader_context
*ctx
, LLVMValueRef
const *so_buffers
,
245 LLVMValueRef
const *so_write_offsets
,
246 struct pipe_stream_output
*stream_out
,
247 struct si_shader_output_values
*shader_out
)
249 unsigned buf_idx
= stream_out
->output_buffer
;
250 unsigned start
= stream_out
->start_component
;
251 unsigned num_comps
= stream_out
->num_components
;
254 assert(num_comps
&& num_comps
<= 4);
255 if (!num_comps
|| num_comps
> 4)
258 /* Load the output as int. */
259 for (int j
= 0; j
< num_comps
; j
++) {
260 assert(stream_out
->stream
== shader_out
->vertex_stream
[start
+ j
]);
262 out
[j
] = ac_to_integer(&ctx
->ac
, shader_out
->values
[start
+ j
]);
265 /* Pack the output. */
266 LLVMValueRef vdata
= NULL
;
272 case 2: /* as v2i32 */
273 case 3: /* as v3i32 */
274 if (ac_has_vec3_support(ctx
->screen
->info
.chip_class
, false)) {
275 vdata
= ac_build_gather_values(&ctx
->ac
, out
, num_comps
);
278 /* as v4i32 (aligned to 4) */
279 out
[3] = LLVMGetUndef(ctx
->ac
.i32
);
281 case 4: /* as v4i32 */
282 vdata
= ac_build_gather_values(&ctx
->ac
, out
, util_next_power_of_two(num_comps
));
286 ac_build_buffer_store_dword(&ctx
->ac
, so_buffers
[buf_idx
], vdata
, num_comps
,
287 so_write_offsets
[buf_idx
], ctx
->ac
.i32_0
, stream_out
->dst_offset
* 4,
292 * Write streamout data to buffers for vertex stream @p stream (different
293 * vertex streams can occur for GS copy shaders).
295 void si_llvm_emit_streamout(struct si_shader_context
*ctx
, struct si_shader_output_values
*outputs
,
296 unsigned noutput
, unsigned stream
)
298 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
299 struct pipe_stream_output_info
*so
= &sel
->so
;
300 LLVMBuilderRef builder
= ctx
->ac
.builder
;
303 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
304 LLVMValueRef so_vtx_count
= si_unpack_param(ctx
, ctx
->streamout_config
, 16, 7);
306 LLVMValueRef tid
= ac_get_thread_id(&ctx
->ac
);
308 /* can_emit = tid < so_vtx_count; */
309 LLVMValueRef can_emit
= LLVMBuildICmp(builder
, LLVMIntULT
, tid
, so_vtx_count
, "");
311 /* Emit the streamout code conditionally. This actually avoids
312 * out-of-bounds buffer access. The hw tells us via the SGPR
313 * (so_vtx_count) which threads are allowed to emit streamout data. */
314 ac_build_ifcc(&ctx
->ac
, can_emit
, 6501);
316 /* The buffer offset is computed as follows:
317 * ByteOffset = streamout_offset[buffer_id]*4 +
318 * (streamout_write_index + thread_id)*stride[buffer_id] +
322 LLVMValueRef so_write_index
= ac_get_arg(&ctx
->ac
, ctx
->streamout_write_index
);
324 /* Compute (streamout_write_index + thread_id). */
325 so_write_index
= LLVMBuildAdd(builder
, so_write_index
, tid
, "");
327 /* Load the descriptor and compute the write offset for each
329 LLVMValueRef so_write_offset
[4] = {};
330 LLVMValueRef so_buffers
[4];
331 LLVMValueRef buf_ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
333 for (i
= 0; i
< 4; i
++) {
337 LLVMValueRef offset
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_STREAMOUT_BUF0
+ i
, 0);
339 so_buffers
[i
] = ac_build_load_to_sgpr(&ctx
->ac
, buf_ptr
, offset
);
341 LLVMValueRef so_offset
= ac_get_arg(&ctx
->ac
, ctx
->streamout_offset
[i
]);
342 so_offset
= LLVMBuildMul(builder
, so_offset
, LLVMConstInt(ctx
->ac
.i32
, 4, 0), "");
344 so_write_offset
[i
] = ac_build_imad(
345 &ctx
->ac
, so_write_index
, LLVMConstInt(ctx
->ac
.i32
, so
->stride
[i
] * 4, 0), so_offset
);
348 /* Write streamout data. */
349 for (i
= 0; i
< so
->num_outputs
; i
++) {
350 unsigned reg
= so
->output
[i
].register_index
;
355 if (stream
!= so
->output
[i
].stream
)
358 si_llvm_streamout_store_output(ctx
, so_buffers
, so_write_offset
, &so
->output
[i
],
362 ac_build_endif(&ctx
->ac
, 6501);
365 static void si_llvm_emit_clipvertex(struct si_shader_context
*ctx
, struct ac_export_args
*pos
,
366 LLVMValueRef
*out_elts
)
371 LLVMValueRef base_elt
;
372 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->rw_buffers
);
373 LLVMValueRef constbuf_index
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_CONST_CLIP_PLANES
, 0);
374 LLVMValueRef const_resource
= ac_build_load_to_sgpr(&ctx
->ac
, ptr
, constbuf_index
);
376 for (reg_index
= 0; reg_index
< 2; reg_index
++) {
377 struct ac_export_args
*args
= &pos
[2 + reg_index
];
379 args
->out
[0] = args
->out
[1] = args
->out
[2] = args
->out
[3] = LLVMConstReal(ctx
->ac
.f32
, 0.0f
);
381 /* Compute dot products of position and user clip plane vectors */
382 for (chan
= 0; chan
< 4; chan
++) {
383 for (const_chan
= 0; const_chan
< 4; const_chan
++) {
385 LLVMConstInt(ctx
->ac
.i32
, ((reg_index
* 4 + chan
) * 4 + const_chan
) * 4, 0);
386 base_elt
= si_buffer_load_const(ctx
, const_resource
, addr
);
388 ac_build_fmad(&ctx
->ac
, base_elt
, out_elts
[const_chan
], args
->out
[chan
]);
392 args
->enabled_channels
= 0xf;
393 args
->valid_mask
= 0;
395 args
->target
= V_008DFC_SQ_EXP_POS
+ 2 + reg_index
;
400 /* Initialize arguments for the shader export intrinsic */
401 static void si_llvm_init_vs_export_args(struct si_shader_context
*ctx
, LLVMValueRef
*values
,
402 unsigned target
, struct ac_export_args
*args
)
404 args
->enabled_channels
= 0xf; /* writemask - default is 0xf */
405 args
->valid_mask
= 0; /* Specify whether the EXEC mask represents the valid mask */
406 args
->done
= 0; /* Specify whether this is the last export */
407 args
->target
= target
; /* Specify the target we are exporting */
410 memcpy(&args
->out
[0], values
, sizeof(values
[0]) * 4);
413 static void si_export_param(struct si_shader_context
*ctx
, unsigned index
, LLVMValueRef
*values
)
415 struct ac_export_args args
;
417 si_llvm_init_vs_export_args(ctx
, values
, V_008DFC_SQ_EXP_PARAM
+ index
, &args
);
418 ac_build_export(&ctx
->ac
, &args
);
421 static void si_build_param_exports(struct si_shader_context
*ctx
,
422 struct si_shader_output_values
*outputs
, unsigned noutput
)
424 struct si_shader
*shader
= ctx
->shader
;
425 unsigned param_count
= 0;
427 for (unsigned i
= 0; i
< noutput
; i
++) {
428 unsigned semantic
= outputs
[i
].semantic
;
430 if (outputs
[i
].vertex_stream
[0] != 0 && outputs
[i
].vertex_stream
[1] != 0 &&
431 outputs
[i
].vertex_stream
[2] != 0 && outputs
[i
].vertex_stream
[3] != 0)
435 case VARYING_SLOT_LAYER
:
436 case VARYING_SLOT_VIEWPORT
:
437 case VARYING_SLOT_CLIP_DIST0
:
438 case VARYING_SLOT_CLIP_DIST1
:
439 case VARYING_SLOT_COL0
:
440 case VARYING_SLOT_COL1
:
441 case VARYING_SLOT_BFC0
:
442 case VARYING_SLOT_BFC1
:
443 case VARYING_SLOT_PRIMITIVE_ID
:
444 case VARYING_SLOT_FOGC
:
447 if ((semantic
>= VARYING_SLOT_TEX0
&& semantic
<= VARYING_SLOT_TEX7
) ||
448 semantic
>= VARYING_SLOT_VAR0
)
454 if (semantic
< VARYING_SLOT_VAR0
+ SI_MAX_IO_GENERIC
&&
455 shader
->key
.opt
.kill_outputs
&
456 (1ull << si_shader_io_get_unique_index(semantic
, true)))
459 si_export_param(ctx
, param_count
, outputs
[i
].values
);
461 assert(i
< ARRAY_SIZE(shader
->info
.vs_output_param_offset
));
462 shader
->info
.vs_output_param_offset
[i
] = param_count
++;
465 shader
->info
.nr_param_exports
= param_count
;
469 * Vertex color clamping.
471 * This uses a state constant loaded in a user data SGPR and
472 * an IF statement is added that clamps all colors if the constant
475 static void si_vertex_color_clamping(struct si_shader_context
*ctx
,
476 struct si_shader_output_values
*outputs
, unsigned noutput
)
478 LLVMValueRef addr
[SI_MAX_VS_OUTPUTS
][4];
479 bool has_colors
= false;
481 /* Store original colors to alloca variables. */
482 for (unsigned i
= 0; i
< noutput
; i
++) {
483 if (outputs
[i
].semantic
!= VARYING_SLOT_COL0
&&
484 outputs
[i
].semantic
!= VARYING_SLOT_COL1
&&
485 outputs
[i
].semantic
!= VARYING_SLOT_BFC0
&&
486 outputs
[i
].semantic
!= VARYING_SLOT_BFC1
)
489 for (unsigned j
= 0; j
< 4; j
++) {
490 addr
[i
][j
] = ac_build_alloca_undef(&ctx
->ac
, ctx
->ac
.f32
, "");
491 LLVMBuildStore(ctx
->ac
.builder
, outputs
[i
].values
[j
], addr
[i
][j
]);
499 /* The state is in the first bit of the user SGPR. */
500 LLVMValueRef cond
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
501 cond
= LLVMBuildTrunc(ctx
->ac
.builder
, cond
, ctx
->ac
.i1
, "");
503 ac_build_ifcc(&ctx
->ac
, cond
, 6502);
505 /* Store clamped colors to alloca variables within the conditional block. */
506 for (unsigned i
= 0; i
< noutput
; i
++) {
507 if (outputs
[i
].semantic
!= VARYING_SLOT_COL0
&&
508 outputs
[i
].semantic
!= VARYING_SLOT_COL1
&&
509 outputs
[i
].semantic
!= VARYING_SLOT_BFC0
&&
510 outputs
[i
].semantic
!= VARYING_SLOT_BFC1
)
513 for (unsigned j
= 0; j
< 4; j
++) {
514 LLVMBuildStore(ctx
->ac
.builder
, ac_build_clamp(&ctx
->ac
, outputs
[i
].values
[j
]),
518 ac_build_endif(&ctx
->ac
, 6502);
520 /* Load clamped colors */
521 for (unsigned i
= 0; i
< noutput
; i
++) {
522 if (outputs
[i
].semantic
!= VARYING_SLOT_COL0
&&
523 outputs
[i
].semantic
!= VARYING_SLOT_COL1
&&
524 outputs
[i
].semantic
!= VARYING_SLOT_BFC0
&&
525 outputs
[i
].semantic
!= VARYING_SLOT_BFC1
)
528 for (unsigned j
= 0; j
< 4; j
++) {
529 outputs
[i
].values
[j
] = LLVMBuildLoad(ctx
->ac
.builder
, addr
[i
][j
], "");
534 /* Generate export instructions for hardware VS shader stage or NGG GS stage
535 * (position and parameter data only).
537 void si_llvm_build_vs_exports(struct si_shader_context
*ctx
,
538 struct si_shader_output_values
*outputs
, unsigned noutput
)
540 struct si_shader
*shader
= ctx
->shader
;
541 struct ac_export_args pos_args
[4] = {};
542 LLVMValueRef psize_value
= NULL
, edgeflag_value
= NULL
, layer_value
= NULL
,
543 viewport_index_value
= NULL
;
547 si_vertex_color_clamping(ctx
, outputs
, noutput
);
549 /* Build position exports. */
550 for (i
= 0; i
< noutput
; i
++) {
551 switch (outputs
[i
].semantic
) {
552 case VARYING_SLOT_POS
:
553 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
, V_008DFC_SQ_EXP_POS
, &pos_args
[0]);
555 case VARYING_SLOT_PSIZ
:
556 psize_value
= outputs
[i
].values
[0];
558 case VARYING_SLOT_LAYER
:
559 layer_value
= outputs
[i
].values
[0];
561 case VARYING_SLOT_VIEWPORT
:
562 viewport_index_value
= outputs
[i
].values
[0];
564 case VARYING_SLOT_EDGE
:
565 edgeflag_value
= outputs
[i
].values
[0];
567 case VARYING_SLOT_CLIP_DIST0
:
568 case VARYING_SLOT_CLIP_DIST1
:
569 if (!shader
->key
.opt
.clip_disable
) {
570 unsigned index
= 2 + (outputs
[i
].semantic
- VARYING_SLOT_CLIP_DIST0
);
571 si_llvm_init_vs_export_args(ctx
, outputs
[i
].values
, V_008DFC_SQ_EXP_POS
+ index
,
575 case VARYING_SLOT_CLIP_VERTEX
:
576 if (!shader
->key
.opt
.clip_disable
) {
577 si_llvm_emit_clipvertex(ctx
, pos_args
, outputs
[i
].values
);
583 /* We need to add the position output manually if it's missing. */
584 if (!pos_args
[0].out
[0]) {
585 pos_args
[0].enabled_channels
= 0xf; /* writemask */
586 pos_args
[0].valid_mask
= 0; /* EXEC mask */
587 pos_args
[0].done
= 0; /* last export? */
588 pos_args
[0].target
= V_008DFC_SQ_EXP_POS
;
589 pos_args
[0].compr
= 0; /* COMPR flag */
590 pos_args
[0].out
[0] = ctx
->ac
.f32_0
; /* X */
591 pos_args
[0].out
[1] = ctx
->ac
.f32_0
; /* Y */
592 pos_args
[0].out
[2] = ctx
->ac
.f32_0
; /* Z */
593 pos_args
[0].out
[3] = ctx
->ac
.f32_1
; /* W */
596 bool pos_writes_edgeflag
= shader
->selector
->info
.writes_edgeflag
&& !shader
->key
.as_ngg
;
598 /* Write the misc vector (point size, edgeflag, layer, viewport). */
599 if (shader
->selector
->info
.writes_psize
|| pos_writes_edgeflag
||
600 shader
->selector
->info
.writes_viewport_index
|| shader
->selector
->info
.writes_layer
) {
601 pos_args
[1].enabled_channels
= shader
->selector
->info
.writes_psize
|
602 (pos_writes_edgeflag
<< 1) |
603 (shader
->selector
->info
.writes_layer
<< 2);
605 pos_args
[1].valid_mask
= 0; /* EXEC mask */
606 pos_args
[1].done
= 0; /* last export? */
607 pos_args
[1].target
= V_008DFC_SQ_EXP_POS
+ 1;
608 pos_args
[1].compr
= 0; /* COMPR flag */
609 pos_args
[1].out
[0] = ctx
->ac
.f32_0
; /* X */
610 pos_args
[1].out
[1] = ctx
->ac
.f32_0
; /* Y */
611 pos_args
[1].out
[2] = ctx
->ac
.f32_0
; /* Z */
612 pos_args
[1].out
[3] = ctx
->ac
.f32_0
; /* W */
614 if (shader
->selector
->info
.writes_psize
)
615 pos_args
[1].out
[0] = psize_value
;
617 if (pos_writes_edgeflag
) {
618 /* The output is a float, but the hw expects an integer
619 * with the first bit containing the edge flag. */
620 edgeflag_value
= LLVMBuildFPToUI(ctx
->ac
.builder
, edgeflag_value
, ctx
->ac
.i32
, "");
621 edgeflag_value
= ac_build_umin(&ctx
->ac
, edgeflag_value
, ctx
->ac
.i32_1
);
623 /* The LLVM intrinsic expects a float. */
624 pos_args
[1].out
[1] = ac_to_float(&ctx
->ac
, edgeflag_value
);
627 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
628 /* GFX9 has the layer in out.z[10:0] and the viewport
629 * index in out.z[19:16].
631 if (shader
->selector
->info
.writes_layer
)
632 pos_args
[1].out
[2] = layer_value
;
634 if (shader
->selector
->info
.writes_viewport_index
) {
635 LLVMValueRef v
= viewport_index_value
;
637 v
= ac_to_integer(&ctx
->ac
, v
);
638 v
= LLVMBuildShl(ctx
->ac
.builder
, v
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
639 v
= LLVMBuildOr(ctx
->ac
.builder
, v
, ac_to_integer(&ctx
->ac
, pos_args
[1].out
[2]), "");
640 pos_args
[1].out
[2] = ac_to_float(&ctx
->ac
, v
);
641 pos_args
[1].enabled_channels
|= 1 << 2;
644 if (shader
->selector
->info
.writes_layer
)
645 pos_args
[1].out
[2] = layer_value
;
647 if (shader
->selector
->info
.writes_viewport_index
) {
648 pos_args
[1].out
[3] = viewport_index_value
;
649 pos_args
[1].enabled_channels
|= 1 << 3;
654 for (i
= 0; i
< 4; i
++)
655 if (pos_args
[i
].out
[0])
656 shader
->info
.nr_pos_exports
++;
658 /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
659 * Setting valid_mask=1 prevents it and has no other effect.
661 if (ctx
->screen
->info
.chip_class
== GFX10
)
662 pos_args
[0].valid_mask
= 1;
665 for (i
= 0; i
< 4; i
++) {
666 if (!pos_args
[i
].out
[0])
669 /* Specify the target we are exporting */
670 pos_args
[i
].target
= V_008DFC_SQ_EXP_POS
+ pos_idx
++;
672 if (pos_idx
== shader
->info
.nr_pos_exports
)
673 /* Specify that this is the last export */
674 pos_args
[i
].done
= 1;
676 ac_build_export(&ctx
->ac
, &pos_args
[i
]);
679 /* Build parameter exports. */
680 si_build_param_exports(ctx
, outputs
, noutput
);
683 void si_llvm_emit_vs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
, LLVMValueRef
*addrs
)
685 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
686 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
687 struct si_shader_output_values
*outputs
= NULL
;
690 assert(!ctx
->shader
->is_gs_copy_shader
);
691 assert(info
->num_outputs
<= max_outputs
);
693 outputs
= MALLOC((info
->num_outputs
+ 1) * sizeof(outputs
[0]));
695 for (i
= 0; i
< info
->num_outputs
; i
++) {
696 outputs
[i
].semantic
= info
->output_semantic
[i
];
698 for (j
= 0; j
< 4; j
++) {
699 outputs
[i
].values
[j
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ j
], "");
700 outputs
[i
].vertex_stream
[j
] = (info
->output_streams
[i
] >> (2 * j
)) & 3;
704 if (!ctx
->screen
->use_ngg_streamout
&& ctx
->shader
->selector
->so
.num_outputs
)
705 si_llvm_emit_streamout(ctx
, outputs
, i
, 0);
707 /* Export PrimitiveID. */
708 if (ctx
->shader
->key
.mono
.u
.vs_export_prim_id
) {
709 outputs
[i
].semantic
= VARYING_SLOT_PRIMITIVE_ID
;
710 outputs
[i
].values
[0] = ac_to_float(&ctx
->ac
, si_get_primitive_id(ctx
, 0));
711 for (j
= 1; j
< 4; j
++)
712 outputs
[i
].values
[j
] = LLVMConstReal(ctx
->ac
.f32
, 0);
714 memset(outputs
[i
].vertex_stream
, 0, sizeof(outputs
[i
].vertex_stream
));
718 si_llvm_build_vs_exports(ctx
, outputs
, i
);
722 static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi
*abi
, unsigned max_outputs
,
725 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
726 struct si_shader_info
*info
= &ctx
->shader
->selector
->info
;
727 LLVMValueRef pos
[4] = {};
729 assert(info
->num_outputs
<= max_outputs
);
731 for (unsigned i
= 0; i
< info
->num_outputs
; i
++) {
732 if (info
->output_semantic
[i
] != VARYING_SLOT_POS
)
735 for (unsigned chan
= 0; chan
< 4; chan
++)
736 pos
[chan
] = LLVMBuildLoad(ctx
->ac
.builder
, addrs
[4 * i
+ chan
], "");
739 assert(pos
[0] != NULL
);
741 /* Return the position output. */
742 LLVMValueRef ret
= ctx
->return_value
;
743 for (unsigned chan
= 0; chan
< 4; chan
++)
744 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, pos
[chan
], chan
, "");
745 ctx
->return_value
= ret
;
749 * Build the vertex shader prolog function.
751 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
752 * All inputs are returned unmodified. The vertex load indices are
753 * stored after them, which will be used by the API VS for fetching inputs.
755 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
760 * (VertexID + BaseVertex),
761 * (InstanceID + StartInstance),
762 * (InstanceID / 2 + StartInstance)
764 void si_llvm_build_vs_prolog(struct si_shader_context
*ctx
, union si_shader_part_key
*key
)
766 LLVMTypeRef
*returns
;
767 LLVMValueRef ret
, func
;
769 unsigned first_vs_vgpr
= key
->vs_prolog
.num_merged_next_stage_vgprs
;
770 unsigned num_input_vgprs
=
771 key
->vs_prolog
.num_merged_next_stage_vgprs
+ 4 + (key
->vs_prolog
.has_ngg_cull_inputs
? 1 : 0);
772 struct ac_arg input_sgpr_param
[key
->vs_prolog
.num_input_sgprs
];
773 struct ac_arg input_vgpr_param
[10];
774 LLVMValueRef input_vgprs
[10];
775 unsigned num_all_input_regs
= key
->vs_prolog
.num_input_sgprs
+ num_input_vgprs
;
776 unsigned user_sgpr_base
= key
->vs_prolog
.num_merged_next_stage_vgprs
? 8 : 0;
778 memset(&ctx
->args
, 0, sizeof(ctx
->args
));
780 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
781 returns
= alloca((num_all_input_regs
+ key
->vs_prolog
.num_inputs
) * sizeof(LLVMTypeRef
));
784 /* Declare input and output SGPRs. */
785 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
786 ac_add_arg(&ctx
->args
, AC_ARG_SGPR
, 1, AC_ARG_INT
, &input_sgpr_param
[i
]);
787 returns
[num_returns
++] = ctx
->ac
.i32
;
790 struct ac_arg merged_wave_info
= input_sgpr_param
[3];
792 /* Preloaded VGPRs (outputs must be floats) */
793 for (i
= 0; i
< num_input_vgprs
; i
++) {
794 ac_add_arg(&ctx
->args
, AC_ARG_VGPR
, 1, AC_ARG_INT
, &input_vgpr_param
[i
]);
795 returns
[num_returns
++] = ctx
->ac
.f32
;
798 /* Vertex load indices. */
799 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++)
800 returns
[num_returns
++] = ctx
->ac
.f32
;
802 /* Create the function. */
803 si_llvm_create_func(ctx
, "vs_prolog", returns
, num_returns
, 0);
806 for (i
= 0; i
< num_input_vgprs
; i
++) {
807 input_vgprs
[i
] = ac_get_arg(&ctx
->ac
, input_vgpr_param
[i
]);
810 if (key
->vs_prolog
.num_merged_next_stage_vgprs
) {
811 if (!key
->vs_prolog
.is_monolithic
)
812 si_init_exec_from_input(ctx
, merged_wave_info
, 0);
814 if (key
->vs_prolog
.as_ls
&& ctx
->screen
->info
.has_ls_vgpr_init_bug
) {
815 /* If there are no HS threads, SPI loads the LS VGPRs
816 * starting at VGPR 0. Shift them back to where they
819 LLVMValueRef has_hs_threads
=
820 LLVMBuildICmp(ctx
->ac
.builder
, LLVMIntNE
,
821 si_unpack_param(ctx
, input_sgpr_param
[3], 8, 8), ctx
->ac
.i32_0
, "");
823 for (i
= 4; i
> 0; --i
) {
824 input_vgprs
[i
+ 1] = LLVMBuildSelect(ctx
->ac
.builder
, has_hs_threads
,
825 input_vgprs
[i
+ 1], input_vgprs
[i
- 1], "");
830 if (key
->vs_prolog
.gs_fast_launch_tri_list
|| key
->vs_prolog
.gs_fast_launch_tri_strip
) {
831 LLVMValueRef wave_id
, thread_id_in_tg
;
833 wave_id
= si_unpack_param(ctx
, input_sgpr_param
[3], 24, 4);
835 ac_build_imad(&ctx
->ac
, wave_id
, LLVMConstInt(ctx
->ac
.i32
, ctx
->ac
.wave_size
, false),
836 ac_get_thread_id(&ctx
->ac
));
838 /* The GS fast launch initializes all VGPRs to the value of
839 * the first thread, so we have to add the thread ID.
841 * Only these are initialized by the hw:
842 * VGPR2: Base Primitive ID
843 * VGPR5: Base Vertex ID
847 /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
848 * The NGG cull shader will read them from there.
850 if (key
->vs_prolog
.gs_fast_launch_tri_list
) {
851 input_vgprs
[0] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx01_offset */
852 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 0 */
853 LLVMConstInt(ctx
->ac
.i32
, 0, 0));
854 input_vgprs
[1] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx23_offset */
855 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 1 */
856 LLVMConstInt(ctx
->ac
.i32
, 1, 0));
857 input_vgprs
[4] = ac_build_imad(&ctx
->ac
, thread_id_in_tg
, /* gs_vtx45_offset */
858 LLVMConstInt(ctx
->ac
.i32
, 3, 0), /* Vertex 2 */
859 LLVMConstInt(ctx
->ac
.i32
, 2, 0));
861 assert(key
->vs_prolog
.gs_fast_launch_tri_strip
);
862 LLVMBuilderRef builder
= ctx
->ac
.builder
;
863 /* Triangle indices: */
864 LLVMValueRef index
[3] = {
866 LLVMBuildAdd(builder
, thread_id_in_tg
, LLVMConstInt(ctx
->ac
.i32
, 1, 0), ""),
867 LLVMBuildAdd(builder
, thread_id_in_tg
, LLVMConstInt(ctx
->ac
.i32
, 2, 0), ""),
869 LLVMValueRef is_odd
= LLVMBuildTrunc(ctx
->ac
.builder
, thread_id_in_tg
, ctx
->ac
.i1
, "");
870 LLVMValueRef flatshade_first
= LLVMBuildICmp(
871 builder
, LLVMIntEQ
, si_unpack_param(ctx
, ctx
->vs_state_bits
, 4, 2), ctx
->ac
.i32_0
, "");
873 ac_build_triangle_strip_indices_to_triangle(&ctx
->ac
, is_odd
, flatshade_first
, index
);
874 input_vgprs
[0] = index
[0];
875 input_vgprs
[1] = index
[1];
876 input_vgprs
[4] = index
[2];
879 /* Triangles always have all edge flags set initially. */
880 input_vgprs
[3] = LLVMConstInt(ctx
->ac
.i32
, 0x7 << 8, 0);
883 LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[2], thread_id_in_tg
, ""); /* PrimID */
885 LLVMBuildAdd(ctx
->ac
.builder
, input_vgprs
[5], thread_id_in_tg
, ""); /* VertexID */
886 input_vgprs
[8] = input_vgprs
[6]; /* InstanceID */
889 unsigned vertex_id_vgpr
= first_vs_vgpr
;
890 unsigned instance_id_vgpr
= ctx
->screen
->info
.chip_class
>= GFX10
892 : first_vs_vgpr
+ (key
->vs_prolog
.as_ls
? 2 : 1);
894 ctx
->abi
.vertex_id
= input_vgprs
[vertex_id_vgpr
];
895 ctx
->abi
.instance_id
= input_vgprs
[instance_id_vgpr
];
897 /* InstanceID = VertexID >> 16;
898 * VertexID = VertexID & 0xffff;
900 if (key
->vs_prolog
.states
.unpack_instance_id_from_vertex_id
) {
901 ctx
->abi
.instance_id
=
902 LLVMBuildLShr(ctx
->ac
.builder
, ctx
->abi
.vertex_id
, LLVMConstInt(ctx
->ac
.i32
, 16, 0), "");
903 ctx
->abi
.vertex_id
= LLVMBuildAnd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
904 LLVMConstInt(ctx
->ac
.i32
, 0xffff, 0), "");
907 /* Copy inputs to outputs. This should be no-op, as the registers match,
908 * but it will prevent the compiler from overwriting them unintentionally.
910 ret
= ctx
->return_value
;
911 for (i
= 0; i
< key
->vs_prolog
.num_input_sgprs
; i
++) {
912 LLVMValueRef p
= LLVMGetParam(func
, i
);
913 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, i
, "");
915 for (i
= 0; i
< num_input_vgprs
; i
++) {
916 LLVMValueRef p
= input_vgprs
[i
];
918 if (i
== vertex_id_vgpr
)
919 p
= ctx
->abi
.vertex_id
;
920 else if (i
== instance_id_vgpr
)
921 p
= ctx
->abi
.instance_id
;
923 p
= ac_to_float(&ctx
->ac
, p
);
924 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, p
, key
->vs_prolog
.num_input_sgprs
+ i
, "");
927 /* Compute vertex load indices from instance divisors. */
928 LLVMValueRef instance_divisor_constbuf
= NULL
;
930 if (key
->vs_prolog
.states
.instance_divisor_is_fetched
) {
931 LLVMValueRef list
= si_prolog_get_rw_buffers(ctx
);
932 LLVMValueRef buf_index
= LLVMConstInt(ctx
->ac
.i32
, SI_VS_CONST_INSTANCE_DIVISORS
, 0);
933 instance_divisor_constbuf
= ac_build_load_to_sgpr(&ctx
->ac
, list
, buf_index
);
936 for (i
= 0; i
< key
->vs_prolog
.num_inputs
; i
++) {
937 bool divisor_is_one
= key
->vs_prolog
.states
.instance_divisor_is_one
& (1u << i
);
938 bool divisor_is_fetched
= key
->vs_prolog
.states
.instance_divisor_is_fetched
& (1u << i
);
939 LLVMValueRef index
= NULL
;
941 if (divisor_is_one
) {
942 index
= ctx
->abi
.instance_id
;
943 } else if (divisor_is_fetched
) {
944 LLVMValueRef udiv_factors
[4];
946 for (unsigned j
= 0; j
< 4; j
++) {
947 udiv_factors
[j
] = si_buffer_load_const(ctx
, instance_divisor_constbuf
,
948 LLVMConstInt(ctx
->ac
.i32
, i
* 16 + j
* 4, 0));
949 udiv_factors
[j
] = ac_to_integer(&ctx
->ac
, udiv_factors
[j
]);
951 /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
952 * Such InstanceID might not be achievable in a reasonable time though.
954 index
= ac_build_fast_udiv_nuw(&ctx
->ac
, ctx
->abi
.instance_id
, udiv_factors
[0],
955 udiv_factors
[1], udiv_factors
[2], udiv_factors
[3]);
958 if (divisor_is_one
|| divisor_is_fetched
) {
959 /* Add StartInstance. */
961 LLVMBuildAdd(ctx
->ac
.builder
, index
,
962 LLVMGetParam(ctx
->main_fn
, user_sgpr_base
+ SI_SGPR_START_INSTANCE
), "");
964 /* VertexID + BaseVertex */
965 index
= LLVMBuildAdd(ctx
->ac
.builder
, ctx
->abi
.vertex_id
,
966 LLVMGetParam(func
, user_sgpr_base
+ SI_SGPR_BASE_VERTEX
), "");
969 index
= ac_to_float(&ctx
->ac
, index
);
970 ret
= LLVMBuildInsertValue(ctx
->ac
.builder
, ret
, index
, ctx
->args
.arg_count
+ i
, "");
973 si_llvm_build_ret(ctx
, ret
);
976 static LLVMValueRef
get_base_vertex(struct ac_shader_abi
*abi
)
978 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
980 /* For non-indexed draws, the base vertex set by the driver
981 * (for direct draws) or the CP (for indirect draws) is the
982 * first vertex ID, but GLSL expects 0 to be returned.
984 LLVMValueRef vs_state
= ac_get_arg(&ctx
->ac
, ctx
->vs_state_bits
);
985 LLVMValueRef indexed
;
987 indexed
= LLVMBuildLShr(ctx
->ac
.builder
, vs_state
, ctx
->ac
.i32_1
, "");
988 indexed
= LLVMBuildTrunc(ctx
->ac
.builder
, indexed
, ctx
->ac
.i1
, "");
990 return LLVMBuildSelect(ctx
->ac
.builder
, indexed
, ac_get_arg(&ctx
->ac
, ctx
->args
.base_vertex
),
994 void si_llvm_init_vs_callbacks(struct si_shader_context
*ctx
, bool ngg_cull_shader
)
996 struct si_shader
*shader
= ctx
->shader
;
998 if (shader
->key
.as_ls
)
999 ctx
->abi
.emit_outputs
= si_llvm_emit_ls_epilogue
;
1000 else if (shader
->key
.as_es
)
1001 ctx
->abi
.emit_outputs
= si_llvm_emit_es_epilogue
;
1002 else if (shader
->key
.opt
.vs_as_prim_discard_cs
)
1003 ctx
->abi
.emit_outputs
= si_llvm_emit_prim_discard_cs_epilogue
;
1004 else if (ngg_cull_shader
)
1005 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_culling_epilogue
;
1006 else if (shader
->key
.as_ngg
)
1007 ctx
->abi
.emit_outputs
= gfx10_emit_ngg_epilogue
;
1009 ctx
->abi
.emit_outputs
= si_llvm_emit_vs_epilogue
;
1011 ctx
->abi
.load_base_vertex
= get_base_vertex
;