radeonsi: move VS shader code into si_shader_llvm_vs.c
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"
#include "util/u_memory.h"

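/* Extract the low (index 0) or high (index 1) signed 16-bit half of a
 * 32-bit value, sign-extended to 32 bits.
 */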
static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
                                  LLVMValueRef i32, unsigned index)
{
   assert(index <= 1);

   if (index == 1)
      return LLVMBuildAShr(ctx->ac.builder, i32,
                           LLVMConstInt(ctx->ac.i32, 16, 0), "");

   return LLVMBuildSExt(ctx->ac.builder,
                        LLVMBuildTrunc(ctx->ac.builder, i32,
                                       ctx->ac.i16, ""),
                        ctx->ac.i32, "");
}

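/* Load one vertex attribute into out[0..3]. Blit shaders get their inputs
 * from user SGPRs (positions as packed signed 16-bit pairs, colors and
 * texcoords as floats); everything else is fetched from the bound vertex
 * buffers.
 */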
static void load_input_vs(struct si_shader_context *ctx, unsigned input_index,
                          LLVMValueRef out[4])
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS_AMD];

   if (vs_blit_property) {
      LLVMValueRef vertex_id = ctx->abi.vertex_id;
      LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
                                          LLVMIntULE, vertex_id,
                                          ctx->ac.i32_1, "");
      /* Use LLVMIntNE, because we have 3 vertices and only
       * the middle one should use y2.
       */
      LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
                                          LLVMIntNE, vertex_id,
                                          ctx->ac.i32_1, "");

      unsigned param_vs_blit_inputs = ctx->vs_blit_inputs.arg_index;
      if (input_index == 0) {
         /* Position: */
         LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
                                          param_vs_blit_inputs);
         LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
                                          param_vs_blit_inputs + 1);

         LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
         LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
         LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
         LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

         LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
                                          x1, x2, "");
         LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
                                          y1, y2, "");

         out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->ac.f32, "");
         out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->ac.f32, "");
         out[2] = LLVMGetParam(ctx->main_fn,
                               param_vs_blit_inputs + 2);
         out[3] = ctx->ac.f32_1;
         return;
      }

      /* Color or texture coordinates: */
      assert(input_index == 1);

      if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
         for (int i = 0; i < 4; i++) {
            out[i] = LLVMGetParam(ctx->main_fn,
                                  param_vs_blit_inputs + 3 + i);
         }
      } else {
         assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
         LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 3);
         LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 4);
         LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 5);
         LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
                                        param_vs_blit_inputs + 6);

         out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
                                  x1, x2, "");
         out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
                                  y1, y2, "");
         out[2] = LLVMGetParam(ctx->main_fn,
                               param_vs_blit_inputs + 7);
         out[3] = LLVMGetParam(ctx->main_fn,
                               param_vs_blit_inputs + 8);
      }
      return;
   }

   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
   union si_vs_fix_fetch fix_fetch;
   LLVMValueRef vb_desc;
   LLVMValueRef vertex_index;
   LLVMValueRef tmp;

   if (input_index < num_vbos_in_user_sgprs) {
      vb_desc = ac_get_arg(&ctx->ac, ctx->vb_descriptors[input_index]);
   } else {
      unsigned index = input_index - num_vbos_in_user_sgprs;
      vb_desc = ac_build_load_to_sgpr(&ctx->ac,
                                      ac_get_arg(&ctx->ac, ctx->vertex_buffers),
                                      LLVMConstInt(ctx->ac.i32, index, 0));
   }

   vertex_index = LLVMGetParam(ctx->main_fn,
                               ctx->vertex_index0.arg_index +
                               input_index);

   /* Use the open-coded implementation for all loads of doubles and
    * of dword-sized data that needs fixups. We need to insert conversion
    * code anyway, and the amd/common code does it for us.
    *
    * Note: On LLVM <= 8, we can only open-code formats with
    * channel size >= 4 bytes.
    */
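   /* log_size is log2 of the channel size in bytes: 2 selects all
    * dword-sized formats (which may need fixups), and 3 combined with
    * FLOAT selects doubles.
    */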
   bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index);
   fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits;
   if (opencode ||
       (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) ||
       (fix_fetch.u.log_size == 2)) {
      tmp = ac_build_opencoded_load_format(
            &ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1,
            fix_fetch.u.format, fix_fetch.u.reverse, !opencode,
            vb_desc, vertex_index, ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
      for (unsigned i = 0; i < 4; ++i)
         out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp,
                                          LLVMConstInt(ctx->ac.i32, i, false), "");
      return;
   }

   /* Do multiple loads for special formats. */
   unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]);
   LLVMValueRef fetches[4];
   unsigned num_fetches;
   unsigned fetch_stride;
   unsigned channels_per_fetch;

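   /* 3-channel formats with 8- or 16-bit channels have no native buffer
    * format, so fetch the channels one by one with a per-channel stride.
    */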
   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2) {
      num_fetches = MIN2(required_channels, 3);
      fetch_stride = 1 << fix_fetch.u.log_size;
      channels_per_fetch = 1;
   } else {
      num_fetches = 1;
      fetch_stride = 0;
      channels_per_fetch = required_channels;
   }

   for (unsigned i = 0; i < num_fetches; ++i) {
      LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
      fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
                                               channels_per_fetch, 0, true);
   }

   if (num_fetches == 1 && channels_per_fetch > 1) {
      LLVMValueRef fetch = fetches[0];
      for (unsigned i = 0; i < channels_per_fetch; ++i) {
         tmp = LLVMConstInt(ctx->ac.i32, i, false);
         fetches[i] = LLVMBuildExtractElement(
            ctx->ac.builder, fetch, tmp, "");
      }
      num_fetches = channels_per_fetch;
      channels_per_fetch = 1;
   }

   for (unsigned i = num_fetches; i < 4; ++i)
      fetches[i] = LLVMGetUndef(ctx->ac.f32);

   if (fix_fetch.u.log_size <= 1 && fix_fetch.u.num_channels_m1 == 2 &&
       required_channels == 4) {
      if (fix_fetch.u.format == AC_FETCH_FORMAT_UINT || fix_fetch.u.format == AC_FETCH_FORMAT_SINT)
         fetches[3] = ctx->ac.i32_1;
      else
         fetches[3] = ctx->ac.f32_1;
   } else if (fix_fetch.u.log_size == 3 &&
              (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED ||
               fix_fetch.u.format == AC_FETCH_FORMAT_SINT) &&
              required_channels == 4) {
      /* For 2_10_10_10, the hardware returns an unsigned value;
       * convert it to a signed one.
       */
      LLVMValueRef tmp = fetches[3];
      LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0);

      /* First, recover the sign-extended signed integer value. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED)
         tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->ac.i32, "");
      else
         tmp = ac_to_integer(&ctx->ac, tmp);

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
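      /* Worked example for the SNORM trick: 2/3 is 0x3f2aaaab with biased
       * exponent 126 = 0b1111110, so its two exponent LSBs (bits 23-24)
       * are 0b10 = 2. The shl by 7 moves them to bits 30-31, and the ashr
       * by 30 below recovers the 2-bit signed value.
       */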
      tmp = LLVMBuildShl(ctx->ac.builder, tmp,
                         fix_fetch.u.format == AC_FETCH_FORMAT_SNORM ?
                            LLVMConstInt(ctx->ac.i32, 7, 0) : c30, "");
      tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

      /* Convert back to the right type. */
      if (fix_fetch.u.format == AC_FETCH_FORMAT_SNORM) {
         LLVMValueRef clamp;
         LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0);
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
         clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
         tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
      } else if (fix_fetch.u.format == AC_FETCH_FORMAT_SSCALED) {
         tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->ac.f32, "");
      }

      fetches[3] = tmp;
   }

   for (unsigned i = 0; i < 4; ++i)
      out[i] = ac_to_float(&ctx->ac, fetches[i]);
}

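/* Load all 4 components of one attribute and cache them as i32 in
 * ctx->inputs, where the NIR input load intrinsics pick them up.
 */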
static void declare_input_vs(struct si_shader_context *ctx, unsigned input_index)
{
   LLVMValueRef input[4];

   load_input_vs(ctx, input_index / 4, input);

   for (unsigned chan = 0; chan < 4; chan++) {
      ctx->inputs[input_index + chan] =
         LLVMBuildBitCast(ctx->ac.builder, input[chan], ctx->ac.i32, "");
   }
}

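/* Walk all input variables declared by the NIR shader and load them,
 * loading dual-slot (64-bit) attributes twice and counting each location
 * only once.
 */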
void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir)
{
   uint64_t processed_inputs = 0;

   nir_foreach_variable(variable, &nir->inputs) {
      unsigned attrib_count = glsl_count_attribute_slots(variable->type,
                                                         true);
      unsigned input_idx = variable->data.driver_location;
      unsigned loc = variable->data.location;

      for (unsigned i = 0; i < attrib_count; i++) {
         /* Packed components share the same location so skip
          * them if we have already processed the location.
          */
         if (processed_inputs & ((uint64_t)1 << (loc + i))) {
            input_idx += 4;
            continue;
         }

         declare_input_vs(ctx, input_idx);
         if (glsl_type_is_dual_slot(variable->type)) {
            input_idx += 4;
            declare_input_vs(ctx, input_idx);
         }

         processed_inputs |= ((uint64_t)1 << (loc + i));
         input_idx += 4;
      }
   }
}

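/* Store the components of one output selected by a pipe_stream_output into
 * its streamout buffer, packing 1-4 dwords into a single typed store.
 */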
void si_llvm_streamout_store_output(struct si_shader_context *ctx,
                                    LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out)
{
   unsigned buf_idx = stream_out->output_buffer;
   unsigned start = stream_out->start_component;
   unsigned num_comps = stream_out->num_components;
   LLVMValueRef out[4];

   assert(num_comps && num_comps <= 4);
   if (!num_comps || num_comps > 4)
      return;

   /* Load the output as int. */
   for (int j = 0; j < num_comps; j++) {
      assert(stream_out->stream == shader_out->vertex_stream[start + j]);

      out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
   }

   /* Pack the output. */
   LLVMValueRef vdata = NULL;

   switch (num_comps) {
   case 1: /* as i32 */
      vdata = out[0];
      break;
   case 2: /* as v2i32 */
   case 3: /* as v3i32 */
      if (ac_has_vec3_support(ctx->screen->info.chip_class, false)) {
         vdata = ac_build_gather_values(&ctx->ac, out, num_comps);
         break;
      }
      /* as v4i32 (aligned to 4) */
      out[3] = LLVMGetUndef(ctx->ac.i32);
      /* fall through */
   case 4: /* as v4i32 */
      vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
      break;
   }

   ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
                               vdata, num_comps,
                               so_write_offsets[buf_idx],
                               ctx->ac.i32_0,
                               stream_out->dst_offset * 4, ac_glc | ac_slc);
}

/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 */
void si_llvm_emit_streamout(struct si_shader_context *ctx,
                            struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   struct pipe_stream_output_info *so = &sel->so;
   LLVMBuilderRef builder = ctx->ac.builder;
   int i;

   /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
   LLVMValueRef so_vtx_count =
      si_unpack_param(ctx, ctx->streamout_config, 16, 7);

   LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

   /* can_emit = tid < so_vtx_count; */
   LLVMValueRef can_emit =
      LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

   /* Emit the streamout code conditionally. This actually avoids
    * out-of-bounds buffer access. The hw tells us via the SGPR
    * (so_vtx_count) which threads are allowed to emit streamout data.
    */
   ac_build_ifcc(&ctx->ac, can_emit, 6501);
   {
      /* The buffer offset is computed as follows:
       *   ByteOffset = streamout_offset[buffer_id]*4 +
       *                (streamout_write_index + thread_id)*stride[buffer_id] +
       *                attrib_offset
       */

      LLVMValueRef so_write_index =
         ac_get_arg(&ctx->ac, ctx->streamout_write_index);

      /* Compute (streamout_write_index + thread_id). */
      so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

      /* Load the descriptor and compute the write offset for each
       * enabled buffer.
       */
      LLVMValueRef so_write_offset[4] = {};
      LLVMValueRef so_buffers[4];
      LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);

      for (i = 0; i < 4; i++) {
         if (!so->stride[i])
            continue;

         LLVMValueRef offset = LLVMConstInt(ctx->ac.i32,
                                            SI_VS_STREAMOUT_BUF0 + i, 0);

         so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

         LLVMValueRef so_offset = ac_get_arg(&ctx->ac,
                                             ctx->streamout_offset[i]);
         so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->ac.i32, 4, 0), "");

         so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
                                            LLVMConstInt(ctx->ac.i32, so->stride[i] * 4, 0),
                                            so_offset);
      }

      /* Write streamout data. */
      for (i = 0; i < so->num_outputs; i++) {
         unsigned reg = so->output[i].register_index;

         if (reg >= noutput)
            continue;

         if (stream != so->output[i].stream)
            continue;

         si_llvm_streamout_store_output(ctx, so_buffers, so_write_offset,
                                        &so->output[i], &outputs[reg]);
      }
   }
   ac_build_endif(&ctx->ac, 6501);
}

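/* Compute clip distances for the user clip planes as dot products of the
 * clip vertex with each plane from the SI_VS_CONST_CLIP_PLANES constant
 * buffer, and fill position exports 2 and 3 with the results.
 */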
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
                                    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
   unsigned reg_index;
   unsigned chan;
   unsigned const_chan;
   LLVMValueRef base_elt;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
   LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32,
                                              SI_VS_CONST_CLIP_PLANES, 0);
   LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);

   for (reg_index = 0; reg_index < 2; reg_index++) {
      struct ac_export_args *args = &pos[2 + reg_index];

      args->out[0] =
      args->out[1] =
      args->out[2] =
      args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);

      /* Compute dot products of position and user clip plane vectors */
      for (chan = 0; chan < 4; chan++) {
         for (const_chan = 0; const_chan < 4; const_chan++) {
            LLVMValueRef addr =
               LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 +
                                          const_chan) * 4, 0);
            base_elt = si_buffer_load_const(ctx, const_resource, addr);
            args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
                                            out_elts[const_chan], args->out[chan]);
         }
      }

      args->enabled_channels = 0xf;
      args->valid_mask = 0;
      args->done = 0;
      args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
      args->compr = 0;
   }
}

/* Initialize arguments for the shader export intrinsic */
static void si_llvm_init_vs_export_args(struct si_shader_context *ctx,
                                        LLVMValueRef *values,
                                        unsigned target,
                                        struct ac_export_args *args)
{
   args->enabled_channels = 0xf; /* writemask - default is 0xf */
   args->valid_mask = 0; /* Specify whether the EXEC mask represents the valid mask */
   args->done = 0; /* Specify whether this is the last export */
   args->target = target; /* Specify the target we are exporting */
   args->compr = false;

   memcpy(&args->out[0], values, sizeof(values[0]) * 4);
}

static void si_export_param(struct si_shader_context *ctx, unsigned index,
                            LLVMValueRef *values)
{
   struct ac_export_args args;

   si_llvm_init_vs_export_args(ctx, values,
                               V_008DFC_SQ_EXP_PARAM + index, &args);
   ac_build_export(&ctx->ac, &args);
}

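/* Export all parameter (non-position) outputs consumed by the fragment
 * shader, skipping those only written to non-zero vertex streams or killed
 * by the shader key, and record each one's export slot.
 */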
static void si_build_param_exports(struct si_shader_context *ctx,
                                   struct si_shader_output_values *outputs,
                                   unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   unsigned param_count = 0;

   for (unsigned i = 0; i < noutput; i++) {
      unsigned semantic_name = outputs[i].semantic_name;
      unsigned semantic_index = outputs[i].semantic_index;

      if (outputs[i].vertex_stream[0] != 0 &&
          outputs[i].vertex_stream[1] != 0 &&
          outputs[i].vertex_stream[2] != 0 &&
          outputs[i].vertex_stream[3] != 0)
         continue;

      switch (semantic_name) {
      case TGSI_SEMANTIC_LAYER:
      case TGSI_SEMANTIC_VIEWPORT_INDEX:
      case TGSI_SEMANTIC_CLIPDIST:
      case TGSI_SEMANTIC_COLOR:
      case TGSI_SEMANTIC_BCOLOR:
      case TGSI_SEMANTIC_PRIMID:
      case TGSI_SEMANTIC_FOG:
      case TGSI_SEMANTIC_TEXCOORD:
      case TGSI_SEMANTIC_GENERIC:
         break;
      default:
         continue;
      }

      if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
           semantic_index < SI_MAX_IO_GENERIC) &&
          shader->key.opt.kill_outputs &
          (1ull << si_shader_io_get_unique_index(semantic_name,
                                                 semantic_index, true)))
         continue;

      si_export_param(ctx, param_count, outputs[i].values);

      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
      shader->info.vs_output_param_offset[i] = param_count++;
   }

   shader->info.nr_param_exports = param_count;
}

/**
 * Vertex color clamping.
 *
 * This uses a state constant loaded from a user data SGPR; an IF statement
 * is added that clamps all colors if the constant is true.
 */
static void si_vertex_color_clamping(struct si_shader_context *ctx,
                                     struct si_shader_output_values *outputs,
                                     unsigned noutput)
{
   LLVMValueRef addr[SI_MAX_VS_OUTPUTS][4];
   bool has_colors = false;

   /* Store original colors to alloca variables. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         addr[i][j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
         LLVMBuildStore(ctx->ac.builder, outputs[i].values[j], addr[i][j]);
      }
      has_colors = true;
   }

   if (!has_colors)
      return;

   /* The state is in the first bit of the user SGPR. */
   LLVMValueRef cond = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
   cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");

   ac_build_ifcc(&ctx->ac, cond, 6502);

   /* Store clamped colors to alloca variables within the conditional block. */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         LLVMBuildStore(ctx->ac.builder,
                        ac_build_clamp(&ctx->ac, outputs[i].values[j]),
                        addr[i][j]);
      }
   }
   ac_build_endif(&ctx->ac, 6502);

   /* Load clamped colors */
   for (unsigned i = 0; i < noutput; i++) {
      if (outputs[i].semantic_name != TGSI_SEMANTIC_COLOR &&
          outputs[i].semantic_name != TGSI_SEMANTIC_BCOLOR)
         continue;

      for (unsigned j = 0; j < 4; j++) {
         outputs[i].values[j] =
            LLVMBuildLoad(ctx->ac.builder, addr[i][j], "");
      }
   }
}

/* Generate export instructions for hardware VS shader stage or NGG GS stage
 * (position and parameter data only).
 */
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs,
                              unsigned noutput)
{
   struct si_shader *shader = ctx->shader;
   struct ac_export_args pos_args[4] = {};
   LLVMValueRef psize_value = NULL, edgeflag_value = NULL,
                layer_value = NULL, viewport_index_value = NULL;
   unsigned pos_idx;
   int i;

   si_vertex_color_clamping(ctx, outputs, noutput);

   /* Build position exports. */
   for (i = 0; i < noutput; i++) {
      switch (outputs[i].semantic_name) {
      case TGSI_SEMANTIC_POSITION:
         si_llvm_init_vs_export_args(ctx, outputs[i].values,
                                     V_008DFC_SQ_EXP_POS, &pos_args[0]);
         break;
      case TGSI_SEMANTIC_PSIZE:
         psize_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_LAYER:
         layer_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_VIEWPORT_INDEX:
         viewport_index_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_EDGEFLAG:
         edgeflag_value = outputs[i].values[0];
         break;
      case TGSI_SEMANTIC_CLIPDIST:
         if (!shader->key.opt.clip_disable) {
            unsigned index = 2 + outputs[i].semantic_index;
            si_llvm_init_vs_export_args(ctx, outputs[i].values,
                                        V_008DFC_SQ_EXP_POS + index,
                                        &pos_args[index]);
         }
         break;
      case TGSI_SEMANTIC_CLIPVERTEX:
         if (!shader->key.opt.clip_disable) {
            si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
         }
         break;
      }
   }

   /* We need to add the position output manually if it's missing. */
   if (!pos_args[0].out[0]) {
      pos_args[0].enabled_channels = 0xf; /* writemask */
      pos_args[0].valid_mask = 0; /* EXEC mask */
      pos_args[0].done = 0; /* last export? */
      pos_args[0].target = V_008DFC_SQ_EXP_POS;
      pos_args[0].compr = 0; /* COMPR flag */
      pos_args[0].out[0] = ctx->ac.f32_0; /* X */
      pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[0].out[3] = ctx->ac.f32_1; /* W */
   }

   bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag &&
                              !shader->key.as_ngg;

   /* Write the misc vector (point size, edgeflag, layer, viewport). */
   if (shader->selector->info.writes_psize ||
       pos_writes_edgeflag ||
       shader->selector->info.writes_viewport_index ||
       shader->selector->info.writes_layer) {
      pos_args[1].enabled_channels = shader->selector->info.writes_psize |
                                     (pos_writes_edgeflag << 1) |
                                     (shader->selector->info.writes_layer << 2);

      pos_args[1].valid_mask = 0; /* EXEC mask */
      pos_args[1].done = 0; /* last export? */
      pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
      pos_args[1].compr = 0; /* COMPR flag */
      pos_args[1].out[0] = ctx->ac.f32_0; /* X */
      pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
      pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
      pos_args[1].out[3] = ctx->ac.f32_0; /* W */

      if (shader->selector->info.writes_psize)
         pos_args[1].out[0] = psize_value;

      if (pos_writes_edgeflag) {
         /* The output is a float, but the hw expects an integer
          * with the first bit containing the edge flag.
          */
         edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
                                          edgeflag_value,
                                          ctx->ac.i32, "");
         edgeflag_value = ac_build_umin(&ctx->ac,
                                        edgeflag_value,
                                        ctx->ac.i32_1);

         /* The LLVM intrinsic expects a float. */
         pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
      }

      if (ctx->screen->info.chip_class >= GFX9) {
         /* GFX9 has the layer in out.z[10:0] and the viewport
          * index in out.z[19:16].
          */
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            LLVMValueRef v = viewport_index_value;

            v = ac_to_integer(&ctx->ac, v);
            v = LLVMBuildShl(ctx->ac.builder, v,
                             LLVMConstInt(ctx->ac.i32, 16, 0), "");
            v = LLVMBuildOr(ctx->ac.builder, v,
                            ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
            pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
            pos_args[1].enabled_channels |= 1 << 2;
         }
      } else {
         if (shader->selector->info.writes_layer)
            pos_args[1].out[2] = layer_value;

         if (shader->selector->info.writes_viewport_index) {
            pos_args[1].out[3] = viewport_index_value;
            pos_args[1].enabled_channels |= 1 << 3;
         }
      }
   }

   for (i = 0; i < 4; i++)
      if (pos_args[i].out[0])
         shader->info.nr_pos_exports++;

   /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
    * Setting valid_mask=1 prevents it and has no other effect.
    */
   if (ctx->screen->info.family == CHIP_NAVI10 ||
       ctx->screen->info.family == CHIP_NAVI12 ||
       ctx->screen->info.family == CHIP_NAVI14)
      pos_args[0].valid_mask = 1;

   pos_idx = 0;
   for (i = 0; i < 4; i++) {
      if (!pos_args[i].out[0])
         continue;

      /* Specify the target we are exporting */
      pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

      if (pos_idx == shader->info.nr_pos_exports)
         /* Specify that this is the last export */
         pos_args[i].done = 1;

      ac_build_export(&ctx->ac, &pos_args[i]);
   }

   /* Build parameter exports. */
   si_build_param_exports(ctx, outputs, noutput);
}

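/* Epilogue of the hardware vertex shader: read back the outputs that were
 * stored to allocas, run streamout if enabled, optionally append the
 * exported PrimitiveID, and emit all position/parameter exports.
 */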
void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs,
                              LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   struct si_shader_output_values *outputs = NULL;
   int i, j;

   assert(!ctx->shader->is_gs_copy_shader);
   assert(info->num_outputs <= max_outputs);

   outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

   for (i = 0; i < info->num_outputs; i++) {
      outputs[i].semantic_name = info->output_semantic_name[i];
      outputs[i].semantic_index = info->output_semantic_index[i];

      for (j = 0; j < 4; j++) {
         outputs[i].values[j] =
            LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
         outputs[i].vertex_stream[j] =
            (info->output_streams[i] >> (2 * j)) & 3;
      }
   }

   if (!ctx->screen->use_ngg_streamout &&
       ctx->shader->selector->so.num_outputs)
      si_llvm_emit_streamout(ctx, outputs, i, 0);

   /* Export PrimitiveID. */
   if (ctx->shader->key.mono.u.vs_export_prim_id) {
      outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
      outputs[i].semantic_index = 0;
      outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0));
      for (j = 1; j < 4; j++)
         outputs[i].values[j] = LLVMConstReal(ctx->ac.f32, 0);

      memset(outputs[i].vertex_stream, 0,
             sizeof(outputs[i].vertex_stream));
      i++;
   }

   si_llvm_build_vs_exports(ctx, outputs, i);
   FREE(outputs);
}

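/* Epilogue used when the VS is compiled as a primitive discard compute
 * shader: only the position output matters, and it is returned in the
 * function's return value instead of being exported.
 */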
static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi,
                                                  unsigned max_outputs,
                                                  LLVMValueRef *addrs)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_info *info = &ctx->shader->selector->info;
   LLVMValueRef pos[4] = {};

   assert(info->num_outputs <= max_outputs);

   for (unsigned i = 0; i < info->num_outputs; i++) {
      if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION)
         continue;

      for (unsigned chan = 0; chan < 4; chan++)
         pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
      break;
   }
   assert(pos[0] != NULL);

   /* Return the position output. */
   LLVMValueRef ret = ctx->return_value;
   for (unsigned chan = 0; chan < 4; chan++)
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, "");
   ctx->return_value = ret;
}

/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 */
void si_llvm_build_vs_prolog(struct si_shader_context *ctx,
                             union si_shader_part_key *key)
{
   LLVMTypeRef *returns;
   LLVMValueRef ret, func;
   int num_returns, i;
   unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
   unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
   struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
   struct ac_arg input_vgpr_param[9];
   LLVMValueRef input_vgprs[9];
   unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
                                 num_input_vgprs;
   unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

   memset(&ctx->args, 0, sizeof(ctx->args));

   /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
   returns = alloca((num_all_input_regs + key->vs_prolog.num_inputs) *
                    sizeof(LLVMTypeRef));
   num_returns = 0;

   /* Declare input and output SGPRs. */
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
                 &input_sgpr_param[i]);
      returns[num_returns++] = ctx->ac.i32;
   }

   struct ac_arg merged_wave_info = input_sgpr_param[3];

   /* Preloaded VGPRs (outputs must be floats) */
   for (i = 0; i < num_input_vgprs; i++) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]);
      returns[num_returns++] = ctx->ac.f32;
   }

   /* Vertex load indices. */
   for (i = 0; i < key->vs_prolog.num_inputs; i++)
      returns[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "vs_prolog", returns, num_returns, 0);
   func = ctx->main_fn;

   for (i = 0; i < num_input_vgprs; i++) {
      input_vgprs[i] = ac_get_arg(&ctx->ac, input_vgpr_param[i]);
   }

   if (key->vs_prolog.num_merged_next_stage_vgprs) {
      if (!key->vs_prolog.is_monolithic)
         si_init_exec_from_input(ctx, merged_wave_info, 0);

      if (key->vs_prolog.as_ls &&
          ctx->screen->info.has_ls_vgpr_init_bug) {
         /* If there are no HS threads, SPI loads the LS VGPRs
          * starting at VGPR 0. Shift them back to where they
          * belong.
          */
         LLVMValueRef has_hs_threads =
            LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
                          si_unpack_param(ctx, input_sgpr_param[3], 8, 8),
                          ctx->ac.i32_0, "");

         for (i = 4; i > 0; --i) {
            input_vgprs[i + 1] =
               LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
                               input_vgprs[i + 1],
                               input_vgprs[i - 1], "");
         }
      }
   }

   if (key->vs_prolog.gs_fast_launch_tri_list ||
       key->vs_prolog.gs_fast_launch_tri_strip) {
      LLVMValueRef wave_id, thread_id_in_tg;

      wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4);
      thread_id_in_tg = ac_build_imad(&ctx->ac, wave_id,
                                      LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false),
                                      ac_get_thread_id(&ctx->ac));

      /* The GS fast launch initializes all VGPRs to the value of
       * the first thread, so we have to add the thread ID.
       *
       * Only these are initialized by the hw:
       *   VGPR2: Base Primitive ID
       *   VGPR5: Base Vertex ID
       *   VGPR6: Instance ID
       */

      /* Put the vertex thread IDs into VGPRs as-is instead of packing them.
       * The NGG cull shader will read them from there.
       */
      if (key->vs_prolog.gs_fast_launch_tri_list) {
         input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */
                                        LLVMConstInt(ctx->ac.i32, 0, 0));
         input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */
                                        LLVMConstInt(ctx->ac.i32, 1, 0));
         input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */
                                        LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */
                                        LLVMConstInt(ctx->ac.i32, 2, 0));
      } else {
         assert(key->vs_prolog.gs_fast_launch_tri_strip);
         LLVMBuilderRef builder = ctx->ac.builder;
         /* Triangle indices: */
         LLVMValueRef index[3] = {
            thread_id_in_tg,
            LLVMBuildAdd(builder, thread_id_in_tg,
                         LLVMConstInt(ctx->ac.i32, 1, 0), ""),
            LLVMBuildAdd(builder, thread_id_in_tg,
                         LLVMConstInt(ctx->ac.i32, 2, 0), ""),
         };
         LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder,
                                              thread_id_in_tg, ctx->ac.i1, "");
         LLVMValueRef flatshade_first =
            LLVMBuildICmp(builder, LLVMIntEQ,
                          si_unpack_param(ctx, ctx->vs_state_bits, 4, 2),
                          ctx->ac.i32_0, "");

         ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd,
                                                     flatshade_first, index);
         input_vgprs[0] = index[0];
         input_vgprs[1] = index[1];
         input_vgprs[4] = index[2];
      }

      /* Triangles always have all edge flags set initially. */
      input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0);

      input_vgprs[2] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[2],
                                    thread_id_in_tg, ""); /* PrimID */
      input_vgprs[5] = LLVMBuildAdd(ctx->ac.builder, input_vgprs[5],
                                    thread_id_in_tg, ""); /* VertexID */
      input_vgprs[8] = input_vgprs[6]; /* InstanceID */
   }

   unsigned vertex_id_vgpr = first_vs_vgpr;
   unsigned instance_id_vgpr =
      ctx->screen->info.chip_class >= GFX10 ?
         first_vs_vgpr + 3 :
         first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

   ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
   ctx->abi.instance_id = input_vgprs[instance_id_vgpr];

   /* InstanceID = VertexID >> 16;
    * VertexID = VertexID & 0xffff;
    */
   if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) {
      ctx->abi.instance_id = LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id,
                                           LLVMConstInt(ctx->ac.i32, 16, 0), "");
      ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id,
                                        LLVMConstInt(ctx->ac.i32, 0xffff, 0), "");
   }

   /* Copy inputs to outputs. This should be a no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   ret = ctx->return_value;
   for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
   }
   for (i = 0; i < num_input_vgprs; i++) {
      LLVMValueRef p = input_vgprs[i];

      if (i == vertex_id_vgpr)
         p = ctx->abi.vertex_id;
      else if (i == instance_id_vgpr)
         p = ctx->abi.instance_id;

      p = ac_to_float(&ctx->ac, p);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
                                 key->vs_prolog.num_input_sgprs + i, "");
   }

   /* Compute vertex load indices from instance divisors. */
   LLVMValueRef instance_divisor_constbuf = NULL;

   if (key->vs_prolog.states.instance_divisor_is_fetched) {
      LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
      LLVMValueRef buf_index =
         LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
      instance_divisor_constbuf =
         ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
   }

   for (i = 0; i < key->vs_prolog.num_inputs; i++) {
      bool divisor_is_one =
         key->vs_prolog.states.instance_divisor_is_one & (1u << i);
      bool divisor_is_fetched =
         key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
      LLVMValueRef index = NULL;

      if (divisor_is_one) {
         index = ctx->abi.instance_id;
      } else if (divisor_is_fetched) {
         LLVMValueRef udiv_factors[4];

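         /* Each divisor is stored as four precomputed dwords: the
          * fast-division factors consumed by ac_build_fast_udiv_nuw below.
          */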
         for (unsigned j = 0; j < 4; j++) {
            udiv_factors[j] =
               si_buffer_load_const(ctx, instance_divisor_constbuf,
                                    LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
         }
         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
          * Such InstanceID might not be achievable in a reasonable time though.
          */
         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
                                        udiv_factors[0], udiv_factors[1],
                                        udiv_factors[2], udiv_factors[3]);
      }

      if (divisor_is_one || divisor_is_fetched) {
         /* Add StartInstance. */
         index = LLVMBuildAdd(ctx->ac.builder, index,
                              LLVMGetParam(ctx->main_fn, user_sgpr_base +
                                           SI_SGPR_START_INSTANCE), "");
      } else {
         /* VertexID + BaseVertex */
         index = LLVMBuildAdd(ctx->ac.builder,
                              ctx->abi.vertex_id,
                              LLVMGetParam(func, user_sgpr_base +
                                           SI_SGPR_BASE_VERTEX), "");
      }

      index = ac_to_float(&ctx->ac, index);
      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
                                 ctx->args.arg_count + i, "");
   }

   si_llvm_build_ret(ctx, ret);
}

static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   /* For non-indexed draws, the base vertex set by the driver
    * (for direct draws) or the CP (for indirect draws) is the
    * first vertex ID, but GLSL expects 0 to be returned.
    */
   LLVMValueRef vs_state = ac_get_arg(&ctx->ac, ctx->vs_state_bits);
   LLVMValueRef indexed;

   indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->ac.i32_1, "");
   indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->ac.i1, "");

   return LLVMBuildSelect(ctx->ac.builder, indexed,
                          ac_get_arg(&ctx->ac, ctx->args.base_vertex),
                          ctx->ac.i32_0, "");
}

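/* Pick the emit_outputs epilogue matching how this VS is being compiled:
 * LS (before tessellation), ES (before GS), primitive discard CS, NGG
 * culling shader, NGG, or legacy hardware VS.
 */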
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader)
{
   struct si_shader *shader = ctx->shader;

   if (shader->key.as_ls)
      ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
   else if (shader->key.as_es)
      ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
   else if (shader->key.opt.vs_as_prim_discard_cs)
      ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue;
   else if (ngg_cull_shader)
      ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue_4x_wave32;
   else if (shader->key.as_ngg)
      ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue;
   else
      ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;

   ctx->abi.load_base_vertex = get_base_vertex;
}