radv/winsys: fix flags vs va_flags thinko.
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49 #include "compiler/nir/nir.h"
50
/* Names of the shader-binary symbols that hold dwords 0 and 1 of the
 * scratch buffer resource descriptor.  NOTE(review): presumably these are
 * patched with the real descriptor when the binary is uploaded — confirm
 * against the binary-upload code elsewhere in this file. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
56
/* A single shader output slot: its 4 component values plus the TGSI
 * semantic that identifies it. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one value per component (x, y, z, w) */
	unsigned semantic_name;	/* TGSI_SEMANTIC_* */
	unsigned semantic_index;
	ubyte vertex_stream[4];	/* per-component stream ID (GS streams) */
};
64
/**
 * Used to collect types and other info about arguments of the LLVM function
 * before the function is created.
 *
 * SGPR arguments always come first (see the assertion in add_arg_assign),
 * so num_sgpr_params is both the count of SGPR args and the index of the
 * first VGPR arg.
 */
struct si_function_info {
	LLVMTypeRef types[100];		/* type of each argument */
	LLVMValueRef *assign[100];	/* where to store each arg value, or NULL */
	unsigned num_sgpr_params;	/* args [0..num_sgpr_params) are SGPRs */
	unsigned num_params;		/* total number of args added so far */
};
75
/* Register file a function argument is passed in. */
enum si_arg_regfile {
	ARG_SGPR,	/* scalar register (uniform across the wave) */
	ARG_VGPR	/* vector register (per-lane value) */
};
80
/* Forward declarations for helpers defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

/* Builders for the non-monolithic shader parts (prologs/epilogs). */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
102
/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* LLVM address spaces used for AMDGPU memory types. */
enum {
	CONST_ADDR_SPACE = 2,	/* read-only (constant) memory */
	LOCAL_ADDR_SPACE = 3,	/* LDS (workgroup-local memory) */
};
112
113 static bool is_merged_shader(struct si_shader *shader)
114 {
115 if (shader->selector->screen->b.chip_class <= VI)
116 return false;
117
118 return shader->key.as_ls ||
119 shader->key.as_es ||
120 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
121 shader->selector->type == PIPE_SHADER_GEOMETRY;
122 }
123
124 static void si_init_function_info(struct si_function_info *fninfo)
125 {
126 fninfo->num_params = 0;
127 fninfo->num_sgpr_params = 0;
128 }
129
130 static unsigned add_arg_assign(struct si_function_info *fninfo,
131 enum si_arg_regfile regfile, LLVMTypeRef type,
132 LLVMValueRef *assign)
133 {
134 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
135
136 unsigned idx = fninfo->num_params++;
137 assert(idx < ARRAY_SIZE(fninfo->types));
138
139 if (regfile == ARG_SGPR)
140 fninfo->num_sgpr_params = fninfo->num_params;
141
142 fninfo->types[idx] = type;
143 fninfo->assign[idx] = assign;
144 return idx;
145 }
146
/* Append one argument without requesting value capture (assign == NULL). */
static unsigned add_arg(struct si_function_info *fninfo,
			enum si_arg_regfile regfile, LLVMTypeRef type)
{
	return add_arg_assign(fninfo, regfile, type, NULL);
}
152
/* Like add_arg_assign, but assert that the argument lands at index \p idx.
 * Used where the ABI fixes an argument's position. */
static void add_arg_assign_checked(struct si_function_info *fninfo,
				   enum si_arg_regfile regfile, LLVMTypeRef type,
				   LLVMValueRef *assign, unsigned idx)
{
	MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
	assert(actual == idx);
}
160
/* Position-checked variant of add_arg (no value capture). */
static void add_arg_checked(struct si_function_info *fninfo,
			    enum si_arg_regfile regfile, LLVMTypeRef type,
			    unsigned idx)
{
	add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
}
167
168 /**
169 * Returns a unique index for a per-patch semantic name and index. The index
170 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
171 * can be calculated.
172 */
173 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
174 {
175 switch (semantic_name) {
176 case TGSI_SEMANTIC_TESSOUTER:
177 return 0;
178 case TGSI_SEMANTIC_TESSINNER:
179 return 1;
180 case TGSI_SEMANTIC_PATCH:
181 assert(index < 30);
182 return 2 + index;
183
184 default:
185 assert(!"invalid semantic name");
186 return 0;
187 }
188 }
189
/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_GENERIC:
		/* Some shader stages use the highest used IO index to
		 * determine the size to allocate for inputs/outputs
		 * (in LDS, tess and GS rings), so GENERIC should be placed
		 * right after POSITION to keep that size as small as
		 * possible.
		 */
		if (index < SI_MAX_IO_GENERIC)
			return 1 + index;

		assert(!"invalid generic index");
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return SI_MAX_IO_GENERIC + 1;
	case TGSI_SEMANTIC_CLIPDIST:
		/* Two vec4 slots cover up to 8 clip distances. */
		assert(index <= 1);
		return SI_MAX_IO_GENERIC + 2 + index;
	case TGSI_SEMANTIC_FOG:
		return SI_MAX_IO_GENERIC + 4;
	case TGSI_SEMANTIC_LAYER:
		return SI_MAX_IO_GENERIC + 5;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return SI_MAX_IO_GENERIC + 6;
	case TGSI_SEMANTIC_PRIMID:
		return SI_MAX_IO_GENERIC + 7;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		assert(index < 2);
		return SI_MAX_IO_GENERIC + 8 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		assert(index < 8);
		/* Make sure the whole range still fits in the 64-slot space. */
		assert(SI_MAX_IO_GENERIC + 10 + index < 64);
		return SI_MAX_IO_GENERIC + 10 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}
237
238 /**
239 * Helper function that builds an LLVM IR PHI node and immediately adds
240 * incoming edges.
241 */
242 static LLVMValueRef
243 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
244 unsigned count_incoming, LLVMValueRef *values,
245 LLVMBasicBlockRef *blocks)
246 {
247 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
248 LLVMAddIncoming(phi, values, blocks, count_incoming);
249 return phi;
250 }
251
252 /**
253 * Get the value of a shader input parameter and extract a bitfield.
254 */
255 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
256 unsigned param, unsigned rshift,
257 unsigned bitwidth)
258 {
259 struct gallivm_state *gallivm = &ctx->gallivm;
260 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
261 param);
262
263 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
264 value = bitcast(&ctx->bld_base,
265 TGSI_TYPE_UNSIGNED, value);
266
267 if (rshift)
268 value = LLVMBuildLShr(gallivm->builder, value,
269 LLVMConstInt(ctx->i32, rshift, 0), "");
270
271 if (rshift + bitwidth < 32) {
272 unsigned mask = (1 << bitwidth) - 1;
273 value = LLVMBuildAnd(gallivm->builder, value,
274 LLVMConstInt(ctx->i32, mask, 0), "");
275 }
276
277 return value;
278 }
279
280 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
281 {
282 switch (ctx->type) {
283 case PIPE_SHADER_TESS_CTRL:
284 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
285
286 case PIPE_SHADER_TESS_EVAL:
287 return LLVMGetParam(ctx->main_fn,
288 ctx->param_tes_rel_patch_id);
289
290 default:
291 assert(0);
292 return NULL;
293 }
294 }
295
296 /* Tessellation shaders pass outputs to the next shader using LDS.
297 *
298 * LS outputs = TCS inputs
299 * TCS outputs = TES inputs
300 *
301 * The LDS layout is:
302 * - TCS inputs for patch 0
303 * - TCS inputs for patch 1
304 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
305 * - ...
306 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
307 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
308 * - TCS outputs for patch 1
309 * - Per-patch TCS outputs for patch 1
310 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
311 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
312 * - ...
313 *
314 * All three shaders VS(LS), TCS, TES share the same LDS space.
315 */
316
/* Per-patch stride of the TCS input area in LDS, taken from
 * bits [8..20] of the VS state SGPR (in dwords). */
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}
322
323 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
324 {
325 assert(ctx->type == PIPE_SHADER_TESS_CTRL);
326
327 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
328 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
329
330 return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
331 }
332
/* Same as get_tcs_out_vertex_dw_stride_constant, wrapped in an i32 constant. */
static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
{
	unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);

	return LLVMConstInt(ctx->i32, stride, 0);
}
339
340 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
341 {
342 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
343 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
344
345 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
346 unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
347 unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
348 unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
349 unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
350 num_patch_outputs * 4;
351 return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
352 }
353
/* LDS dword offset of the TCS output area for patch 0, stored as a
 * dword count in bits [0..15] of the out-LDS-offsets SGPR (hence *4). */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}
363
/* LDS dword offset of the per-patch TCS outputs for patch 0, stored in
 * bits [16..31] of the out-LDS-offsets SGPR (dword count, hence *4). */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}
373
374 static LLVMValueRef
375 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
376 {
377 struct gallivm_state *gallivm = &ctx->gallivm;
378 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
379 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
380
381 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
382 }
383
384 static LLVMValueRef
385 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
386 {
387 struct gallivm_state *gallivm = &ctx->gallivm;
388 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
389 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
390 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
391
392 return LLVMBuildAdd(gallivm->builder, patch0_offset,
393 LLVMBuildMul(gallivm->builder, patch_stride,
394 rel_patch_id, ""),
395 "");
396 }
397
/* LDS offset of the per-patch TCS outputs for the current patch:
 * per-patch base of patch 0 + patch stride * RelPatchID. */
static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}
412
/* Number of TCS output vertices per patch, as an i32.
 * Uses the compile-time value when the TCS declares it; otherwise reads
 * it from the offchip-layout SGPR (bits [6..11]). */
static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
{
	unsigned tcs_out_vertices =
		ctx->shader->selector ?
		ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;

	/* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
	if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
		return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);

	return unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
}
425
/* Dword stride of one TCS input vertex (= one LS output vertex).
 * Known at compile time in the VS (LS) and in a GFX9 monolithic
 * LS+HS shader; otherwise read from bits [24..31] of the VS state SGPR. */
static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
{
	unsigned stride;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		stride = util_last_bit64(ctx->shader->selector->outputs_written);
		return LLVMConstInt(ctx->i32, stride * 4, 0);

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the LS is merged into the monolithic shader,
		 * so its output mask is known here. */
		if (ctx->screen->b.chip_class >= GFX9 &&
		    ctx->shader->is_monolithic) {
			stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
			return LLVMConstInt(ctx->i32, stride * 4, 0);
		}
		return unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);

	default:
		assert(0);
		return NULL;
	}
}
448
449 static LLVMValueRef get_instance_index_for_fetch(
450 struct si_shader_context *ctx,
451 unsigned param_start_instance, LLVMValueRef divisor)
452 {
453 struct gallivm_state *gallivm = &ctx->gallivm;
454
455 LLVMValueRef result = ctx->abi.instance_id;
456
457 /* The division must be done before START_INSTANCE is added. */
458 if (divisor != ctx->i32_1)
459 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
460
461 return LLVMBuildAdd(gallivm->builder, result,
462 LLVMGetParam(ctx->main_fn, param_start_instance), "");
463 }
464
465 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
466 * to float. */
467 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
468 LLVMValueRef vec4,
469 unsigned double_index)
470 {
471 LLVMBuilderRef builder = ctx->gallivm.builder;
472 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
473 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
474 LLVMVectorType(f64, 2), "");
475 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
476 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
477 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
478 }
479
/**
 * Load one VS input attribute and return its 4 components in \p out.
 *
 * The vertex buffer descriptor is fetched from the vertex-buffer descriptor
 * list; formats the hardware cannot fetch natively are fixed up afterwards
 * according to key.mono.vs_fix_fetch (sign extension, normalization,
 * 64-bit splitting, RGB padding to RGBA, ...).
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;	/* byte distance between the partial fetches */

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		/* Natively supported formats need a single fetch. */
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	/* Apply the format workaround, if any. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			/* Clamp to [-1, 1]; only the low end can be out of range. */
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;	/* 16.16 fixed point */
		else
			scale = 1.0 / INT_MAX;	/* signed normalization */

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* One double per partial fetch. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two doubles per partial fetch. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* Each component came from its own 1-component fetch. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			/* Integer formats get an integer 1 in alpha. */
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}
684
/* TGSI input-declaration hook for the VS; \p decl is unused because all
 * the needed information is in the shader key and context. */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_vs(ctx, input_index, out);
}
693
694 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
695 unsigned swizzle)
696 {
697 if (swizzle > 0)
698 return ctx->i32_0;
699
700 switch (ctx->type) {
701 case PIPE_SHADER_VERTEX:
702 return LLVMGetParam(ctx->main_fn,
703 ctx->param_vs_prim_id);
704 case PIPE_SHADER_TESS_CTRL:
705 return LLVMGetParam(ctx->main_fn,
706 ctx->param_tcs_patch_id);
707 case PIPE_SHADER_TESS_EVAL:
708 return LLVMGetParam(ctx->main_fn,
709 ctx->param_tes_patch_id);
710 case PIPE_SHADER_GEOMETRY:
711 return LLVMGetParam(ctx->main_fn,
712 ctx->param_gs_prim_id);
713 default:
714 assert(0);
715 return ctx->i32_0;
716 }
717 }
718
/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 */
LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
				   const struct tgsi_ind_register *ind,
				   int rel_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef result;

	/* Load the address register's current value... */
	result = ctx->addrs[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	/* ...and add the constant part of the index. */
	result = LLVMBuildAdd(gallivm->builder, result,
			      LLVMConstInt(ctx->i32, rel_index, 0), "");
	return result;
}
736
/**
 * Like si_get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
					   const struct tgsi_ind_register *ind,
					   int rel_index, unsigned num)
{
	LLVMValueRef result = si_get_indirect_index(ctx, ind, rel_index);

	return si_llvm_bound_index(ctx, result, num);
}
749
750
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * Exactly one of \p dst and \p src must be non-NULL; the address
 * computation is identical for both.  \p base_addr is the starting LDS
 * address (in dwords), \p vertex_dw_stride the dword stride between
 * vertices for 2-dimensional (per-vertex) registers.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = si_get_indirect_index(ctx, &reg.DimIndirect,
						      reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* Indirect accesses through a declared array are relative
		 * to the array's first register. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = si_get_indirect_index(ctx, &reg.Indirect,
						  reg.Register.Index - first);

		/* Each IO slot is 4 dwords wide. */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		/* 2-dimensional registers are per-vertex; 1-dimensional ones
		 * are per-patch. */
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[first], index[first]) :
			si_shader_io_get_unique_index_patch(name[first], index[first]);
	} else {
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]) :
			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
							    index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
840
841 /* The offchip buffer layout for TCS->TES is
842 *
843 * - attribute 0 of patch 0 vertex 0
844 * - attribute 0 of patch 0 vertex 1
845 * - attribute 0 of patch 0 vertex 2
846 * ...
847 * - attribute 0 of patch 1 vertex 0
848 * - attribute 0 of patch 1 vertex 1
849 * ...
850 * - attribute 1 of patch 0 vertex 0
851 * - attribute 1 of patch 0 vertex 1
852 * ...
853 * - per patch attribute 0 of patch 0
854 * - per patch attribute 0 of patch 1
855 * ...
856 *
857 * Note that every attribute has 4 components.
858 */
/* Compute the byte address of one attribute in the offchip TCS->TES
 * buffer (layout described in the comment above this function).
 *
 * \param vertex_index  per-vertex attribute index, or NULL for a
 *                      per-patch attribute
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = get_num_tcs_out_vertices(ctx);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	/* Every attribute is vec4 = 16 bytes. */
	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		/* Per-vertex: attributes are grouped by attribute, then
		 * patch, then vertex (see layout comment). */
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: one slot per patch. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data lives after all per-vertex data; the offset
		 * comes from bits [12..31] of the offchip-layout SGPR. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
902
/* Like get_tcs_tes_buffer_address, but derives the vertex and parameter
 * indices from a TGSI register (either \p dst or \p src, exactly one
 * non-NULL). */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
					struct si_shader_context *ctx,
					const struct tgsi_full_dst_register *dst,
					const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2-dimensional registers address a specific vertex of the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
							     reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect accesses through a declared array are relative
		 * to the array's first register. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = si_get_indirect_index(ctx, &reg.Indirect,
						    reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	/* 2-dimensional registers are per-vertex; 1-dimensional ones are
	 * per-patch. */
	param_index_base = reg.Register.Dimension ?
		si_shader_io_get_unique_index(name[param_base], index[param_base]) :
		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}
966
/**
 * Load from a memory buffer.
 *
 * \param type     output value type
 * \param swizzle  component to load (0..3), or ~0 to load a whole vec4
 * \param buffer   buffer resource descriptor
 * \param offset   byte offset into the buffer
 * \param base     additional offset (see ac_build_buffer_load —
 *                 NOTE(review): presumably the hardware soffset; confirm)
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool can_speculate)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	/* Whole vec4 requested. */
	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	/* 32-bit types: load the vec4 and extract the requested component. */
	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	/* 64-bit types: load the two dwords separately and combine them. */
	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, can_speculate, false);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, can_speculate, false);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
1002
1003 /**
1004 * Load from LDS.
1005 *
1006 * \param type output value type
1007 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
1008 * \param dw_addr address in dwords
1009 */
1010 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1011 enum tgsi_opcode_type type, unsigned swizzle,
1012 LLVMValueRef dw_addr)
1013 {
1014 struct si_shader_context *ctx = si_shader_context(bld_base);
1015 struct gallivm_state *gallivm = &ctx->gallivm;
1016 LLVMValueRef value;
1017
1018 if (swizzle == ~0) {
1019 LLVMValueRef values[TGSI_NUM_CHANNELS];
1020
1021 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1022 values[chan] = lds_load(bld_base, type, chan, dw_addr);
1023
1024 return lp_build_gather_values(gallivm, values,
1025 TGSI_NUM_CHANNELS);
1026 }
1027
1028 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1029 LLVMConstInt(ctx->i32, swizzle, 0));
1030
1031 value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
1032 if (tgsi_type_is_64bit(type)) {
1033 LLVMValueRef value2;
1034 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1035 ctx->i32_1);
1036 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
1037 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1038 }
1039
1040 return LLVMBuildBitCast(gallivm->builder, value,
1041 tgsi2llvmtype(bld_base, type), "");
1042 }
1043
/**
 * Store to on-chip LDS.
 *
 * \param dw_offset_imm	immediate dword offset (typically the channel
 *			index, 0..3) added to \p dw_addr
 * \param dw_addr	address in dwords
 * \param value		value to store (bitcast to i32 before storing)
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	/* LDS is addressed as i32 here; reinterpret whatever we were given. */
	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}
1065
/* Build a v4i32 buffer resource descriptor from a shader parameter that
 * holds an address in 64KB units: the value is zero-extended to i64 and
 * shifted left by 16 to form the base address (descriptor dwords 0-1).
 * Dwords 2-3 are the constants below: 0xffffffff (presumably an unbounded
 * num_records field — standard SI buffer descriptor layout; confirm
 * against the ISA docs) plus identity DST_SEL and a 32-bit float
 * number/data format.
 */
static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
					   unsigned param)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;

	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
	addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
	addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");

	uint64_t desc2 = 0xffffffff;
	uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
	LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);

	/* Assemble as <2 x i64>, then reinterpret as the v4i32 expected by
	 * the buffer intrinsics. */
	LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
	desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
	desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
	return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
}
1089
1090 static LLVMValueRef fetch_input_tcs(
1091 struct lp_build_tgsi_context *bld_base,
1092 const struct tgsi_full_src_register *reg,
1093 enum tgsi_opcode_type type, unsigned swizzle)
1094 {
1095 struct si_shader_context *ctx = si_shader_context(bld_base);
1096 LLVMValueRef dw_addr, stride;
1097
1098 stride = get_tcs_in_vertex_dw_stride(ctx);
1099 dw_addr = get_tcs_in_current_patch_offset(ctx);
1100 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1101
1102 return lds_load(bld_base, type, swizzle, dw_addr);
1103 }
1104
1105 static LLVMValueRef fetch_output_tcs(
1106 struct lp_build_tgsi_context *bld_base,
1107 const struct tgsi_full_src_register *reg,
1108 enum tgsi_opcode_type type, unsigned swizzle)
1109 {
1110 struct si_shader_context *ctx = si_shader_context(bld_base);
1111 LLVMValueRef dw_addr, stride;
1112
1113 if (reg->Register.Dimension) {
1114 stride = get_tcs_out_vertex_dw_stride(ctx);
1115 dw_addr = get_tcs_out_current_patch_offset(ctx);
1116 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1117 } else {
1118 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1119 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1120 }
1121
1122 return lds_load(bld_base, type, swizzle, dw_addr);
1123 }
1124
/* Fetch a TES input: read from the off-chip TCS->TES buffer in memory. */
static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef buffer, base, addr;

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	/* Note the argument order: "base" lands in buffer_load's "offset"
	 * parameter (the scalar soffset of the load) and "addr" in its
	 * "base" parameter (the per-element voffset). */
	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}
1140
/**
 * Store a TCS output.
 *
 * Outputs go to LDS (so this TCS and its epilog can re-read them) and to
 * the off-chip buffer read by the TES. The LDS store is skipped when the
 * shader info says this kind of output is never read back — except tess
 * factors, which are always written to LDS for the TCS epilog. Tess
 * factors are, conversely, never written to the off-chip buffer here.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		/* Per-vertex output. */
		stride = get_tcs_out_vertex_dw_stride(ctx);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled channel separately. */
		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	/* Full writemask: store all four channels with a single vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}
1221
/**
 * Fetch a GS input (an output of one of the input primitive's vertices).
 *
 * On GFX9 the ESGS ring lives in LDS and the per-vertex dword offsets
 * arrive packed two per SGPR (gs_vtx01/23/45); before GFX9 the input is
 * read from the ESGS ring buffer in memory using per-vertex offset VGPRs.
 * PRIMID is handled specially, and non-2D registers return NULL.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(ctx, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		/* Two 16-bit vertex offsets are packed per 32-bit param. */
		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		/* vec4: recurse per channel and gather. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit: the second dword lives one attribute slot further. */
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true, false);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1316
1317 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1318 {
1319 switch (interpolate) {
1320 case TGSI_INTERPOLATE_CONSTANT:
1321 return 0;
1322
1323 case TGSI_INTERPOLATE_LINEAR:
1324 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1325 return SI_PARAM_LINEAR_SAMPLE;
1326 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1327 return SI_PARAM_LINEAR_CENTROID;
1328 else
1329 return SI_PARAM_LINEAR_CENTER;
1330 break;
1331 case TGSI_INTERPOLATE_COLOR:
1332 case TGSI_INTERPOLATE_PERSPECTIVE:
1333 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1334 return SI_PARAM_PERSP_SAMPLE;
1335 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1336 return SI_PARAM_PERSP_CENTROID;
1337 else
1338 return SI_PARAM_PERSP_CENTER;
1339 break;
1340 default:
1341 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1342 return -1;
1343 }
1344 }
1345
1346 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1347 unsigned attr_index, unsigned chan,
1348 LLVMValueRef prim_mask,
1349 LLVMValueRef i, LLVMValueRef j)
1350 {
1351 if (i || j) {
1352 return ac_build_fs_interp(&ctx->ac,
1353 LLVMConstInt(ctx->i32, chan, 0),
1354 LLVMConstInt(ctx->i32, attr_index, 0),
1355 prim_mask, i, j);
1356 }
1357 return ac_build_fs_interp_mov(&ctx->ac,
1358 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1359 LLVMConstInt(ctx->i32, chan, 0),
1360 LLVMConstInt(ctx->i32, attr_index, 0),
1361 prim_mask);
1362 }
1363
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j); NULL for flat
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef i = NULL, j = NULL;
	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	if (interp) {
		/* Split the (i,j) weight vector into its two scalars. */
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and back
		 * attribute and select per-fragment based on the face. */
		LLVMValueRef is_face_positive;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef front, back;

			front = si_build_fs_interp(ctx,
						   input_index, chan,
						   prim_mask, i, j);
			back = si_build_fs_interp(ctx,
						  back_attr_offset, chan,
						  prim_mask, i, j);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* FOG: only .x is interpolated; y,z are 0 and w is 1. */
		result[0] = si_build_fs_interp(ctx, input_index,
					       0, prim_mask, i, j);
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			result[chan] = si_build_fs_interp(ctx,
							  input_index, chan,
							  prim_mask, i, j);
		}
	}
}
1462
/**
 * Load one fragment shader input into out[0..3].
 *
 * COLOR inputs are taken directly from VGPRs filled by the PS prolog
 * (per-channel, based on colors_read; unread channels are undef). All
 * other inputs go through interp_fs_input with the (i,j) weights chosen
 * by the input's interpolation mode and location.
 */
void si_llvm_load_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;
	enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
	unsigned semantic_index = info->input_semantic_index[input_index];
	enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
	enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];

	/* Get colors from input VGPRs (set by the prolog). */
	if (semantic_name == TGSI_SEMANTIC_COLOR) {
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (semantic_index * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (semantic_index ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		/* Index 0 means constant interpolation: no weights needed,
		 * interp_param stays NULL. */
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, semantic_name,
			semantic_index, 0, /* this param is unused */
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}
1507
/* TGSI input-declaration hook for fragment shaders: a thin adapter that
 * forwards to si_llvm_load_input_fs (the declaration itself is unused). */
static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_fs(ctx, input_index, out);
}
1516
/* Extract the sample ID from bits 8..11 of the ancillary VGPR. */
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}
1521
1522
/**
 * Load a dword from a constant buffer.
 *
 * \param resource	buffer resource descriptor
 * \param offset	byte offset of the dword (callers pass dword index * 4)
 */
static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
				      LLVMValueRef resource,
				      LLVMValueRef offset)
{
	return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
				    0, 0, 0, true, true);
}
1533
/* Fetch the position of the given sample from the driver-provided
 * SI_PS_CONST_SAMPLE_POSITIONS constant buffer. Each sample occupies
 * 8 bytes (two floats: x, y); z and w of the result are 0. */
static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
{
	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);

	/* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(ctx, resource, offset0),
		buffer_load_const(ctx, resource, offset1),
		LLVMConstReal(ctx->f32, 0),
		LLVMConstReal(ctx->f32, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}
1556
/**
 * Build the value of one TGSI system-value declaration and store it in
 * ctx->system_values[index].
 *
 * Covers VS semantics (vertex/instance/draw IDs), TCS/TES (invocation ID,
 * tess coordinates and levels), GS (primitive ID, invocation ID), PS
 * (position, face, sample info) and compute (grid/block/thread IDs), plus
 * the subgroup queries. Unknown semantics assert and leave the slot
 * untouched.
 */
void si_load_system_value(struct si_shader_context *ctx,
			  unsigned index,
			  const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = ctx->abi.instance_id;
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID includes the base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     ctx->abi.vertex_id,
				     ctx->abi.base_vertex, "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		/* Bit 1 of the VS state bits says whether the draw is indexed. */
		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					ctx->abi.base_vertex, ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = ctx->abi.start_instance;
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = ctx->abi.draw_id;
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; w is delivered as 1/w and inverted here. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = ctx->abi.front_face;
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position within the pixel = fractional part of the
		 * fixed-point fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = get_num_tcs_out_vertices(ctx);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip TCS buffer. */
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Driver-set default tess levels: outer at dwords 0-3,
		 * inner at dwords 4-7 of the const buffer slot. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(ctx, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		/* A fixed block size is baked in as constants; otherwise it
		 * comes in through a parameter. */
		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		LLVMValueRef values[3];

		/* Missing block-ID parameters (negative index) read as 0. */
		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* Helper invocation = NOT llvm.amdgcn.ps.live, sign-extended
		 * to an i32 boolean. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* 1 << thread_id, returned as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		/* GE/GT masks are built by shifting; LE/LT are their
		 * complements. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1833
/**
 * Declare the compute shader's shared (LDS) memory block.
 *
 * Creates a single global i8 array of sel->local_size bytes in the local
 * address space and stores an i8* to it in ctx->shared_memory. Only one
 * shared declaration covering the whole range is supported (asserted).
 */
void si_declare_compute_memory(struct si_shader_context *ctx,
			       const struct tgsi_full_declaration *decl)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
	LLVMValueRef var;

	assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
	assert(decl->Range.First == decl->Range.Last);
	assert(!ctx->shared_memory);

	var = LLVMAddGlobalInAddressSpace(gallivm->module,
					  LLVMArrayType(ctx->i8, sel->local_size),
					  "compute_lds",
					  LOCAL_ADDR_SPACE);
	LLVMSetAlignment(var, 4);

	ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
}
1855
/* Load the resource descriptor of constant buffer slot i from the
 * combined const/shader buffer list (slot mapped via si_get_constbuf_slot). */
static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
{
	LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
					     ctx->param_const_and_shader_buffers);

	return ac_build_indexed_load_const(&ctx->ac, list_ptr,
					   LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
}
1864
/* ac_shader_abi callback: load a UBO descriptor. The index is bounded to
 * num_const_buffers first; constant buffers sit after the
 * SI_NUM_SHADER_BUFFERS shader buffers in the combined list, hence the
 * added offset. */
static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);

	index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
	index = LLVMBuildAdd(ctx->gallivm.builder, index,
			     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");

	return ac_build_indexed_load_const(&ctx->ac, ptr, index);
}
1876
/* ac_shader_abi callback: load an SSBO descriptor. Shader buffers are
 * stored in reverse order in the combined list: slot i lives at list
 * index SI_NUM_SHADER_BUFFERS - 1 - i. The "write" flag is unused here. */
static LLVMValueRef
load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
					     ctx->param_const_and_shader_buffers);

	index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
	index = LLVMBuildSub(ctx->gallivm.builder,
			     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
			     index, "");

	return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
}
1891
/**
 * Fetch a value from a constant buffer (TGSI CONST file).
 *
 * Handles vec4 fetches (LP_CHAN_ALL) by recursing per channel, indirect
 * buffer indexing (2D dimension), indirect register indexing via the
 * address register, and 64-bit types (two consecutive dword loads).
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	assert(reg->Register.Dimension);
	buf = reg->Dimension.Index;
	idx = reg->Register.Index * 4 + swizzle;

	if (reg->Dimension.Indirect) {
		/* Indirectly indexed constant buffer: bound the index and
		 * skip past the shader buffers in the combined list. */
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
		LLVMValueRef index;
		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
						      reg->Dimension.Index,
						      ctx->num_const_buffers);
		index = LLVMBuildAdd(ctx->gallivm.builder, index,
				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		/* Indirect register index: addr-reg value * 16 bytes per
		 * vec4, plus the constant byte offset. */
		addr = ctx->addrs[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
				    LLVMConstInt(ctx->i32, idx * 4, 0));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		/* 64-bit: load the next dword and combine the pair. */
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = si_llvm_emit_fetch_64bit(bld_base, type,
						  result, result2);
	}
	return result;
}
1957
1958 /* Upper 16 bits must be zero. */
1959 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1960 LLVMValueRef val[2])
1961 {
1962 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1963 LLVMBuildShl(ctx->gallivm.builder, val[1],
1964 LLVMConstInt(ctx->i32, 16, 0),
1965 ""), "");
1966 }
1967
1968 /* Upper 16 bits are ignored and will be dropped. */
1969 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1970 LLVMValueRef val[2])
1971 {
1972 LLVMValueRef v[2] = {
1973 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1974 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1975 val[1],
1976 };
1977 return si_llvm_pack_two_int16(ctx, v);
1978 }
1979
1980 /* Initialize arguments for the shader export intrinsic */
1981 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1982 LLVMValueRef *values,
1983 unsigned target,
1984 struct ac_export_args *args)
1985 {
1986 struct si_shader_context *ctx = si_shader_context(bld_base);
1987 struct lp_build_context *base = &bld_base->base;
1988 LLVMBuilderRef builder = ctx->gallivm.builder;
1989 LLVMValueRef val[4];
1990 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1991 unsigned chan;
1992 bool is_int8, is_int10;
1993
1994 /* Default is 0xf. Adjusted below depending on the format. */
1995 args->enabled_channels = 0xf; /* writemask */
1996
1997 /* Specify whether the EXEC mask represents the valid mask */
1998 args->valid_mask = 0;
1999
2000 /* Specify whether this is the last export */
2001 args->done = 0;
2002
2003 /* Specify the target we are exporting */
2004 args->target = target;
2005
2006 if (ctx->type == PIPE_SHADER_FRAGMENT) {
2007 const struct si_shader_key *key = &ctx->shader->key;
2008 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2009 int cbuf = target - V_008DFC_SQ_EXP_MRT;
2010
2011 assert(cbuf >= 0 && cbuf < 8);
2012 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2013 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2014 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2015 }
2016
2017 args->compr = false;
2018 args->out[0] = base->undef;
2019 args->out[1] = base->undef;
2020 args->out[2] = base->undef;
2021 args->out[3] = base->undef;
2022
2023 switch (spi_shader_col_format) {
2024 case V_028714_SPI_SHADER_ZERO:
2025 args->enabled_channels = 0; /* writemask */
2026 args->target = V_008DFC_SQ_EXP_NULL;
2027 break;
2028
2029 case V_028714_SPI_SHADER_32_R:
2030 args->enabled_channels = 1; /* writemask */
2031 args->out[0] = values[0];
2032 break;
2033
2034 case V_028714_SPI_SHADER_32_GR:
2035 args->enabled_channels = 0x3; /* writemask */
2036 args->out[0] = values[0];
2037 args->out[1] = values[1];
2038 break;
2039
2040 case V_028714_SPI_SHADER_32_AR:
2041 args->enabled_channels = 0x9; /* writemask */
2042 args->out[0] = values[0];
2043 args->out[3] = values[3];
2044 break;
2045
2046 case V_028714_SPI_SHADER_FP16_ABGR:
2047 args->compr = 1; /* COMPR flag */
2048
2049 for (chan = 0; chan < 2; chan++) {
2050 LLVMValueRef pack_args[2] = {
2051 values[2 * chan],
2052 values[2 * chan + 1]
2053 };
2054 LLVMValueRef packed;
2055
2056 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
2057 args->out[chan] =
2058 LLVMBuildBitCast(ctx->gallivm.builder,
2059 packed, ctx->f32, "");
2060 }
2061 break;
2062
2063 case V_028714_SPI_SHADER_UNORM16_ABGR:
2064 for (chan = 0; chan < 4; chan++) {
2065 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
2066 val[chan] = LLVMBuildFMul(builder, val[chan],
2067 LLVMConstReal(ctx->f32, 65535), "");
2068 val[chan] = LLVMBuildFAdd(builder, val[chan],
2069 LLVMConstReal(ctx->f32, 0.5), "");
2070 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2071 ctx->i32, "");
2072 }
2073
2074 args->compr = 1; /* COMPR flag */
2075 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2076 si_llvm_pack_two_int16(ctx, val));
2077 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2078 si_llvm_pack_two_int16(ctx, val+2));
2079 break;
2080
2081 case V_028714_SPI_SHADER_SNORM16_ABGR:
2082 for (chan = 0; chan < 4; chan++) {
2083 /* Clamp between [-1, 1]. */
2084 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2085 values[chan],
2086 LLVMConstReal(ctx->f32, 1));
2087 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2088 val[chan],
2089 LLVMConstReal(ctx->f32, -1));
2090 /* Convert to a signed integer in [-32767, 32767]. */
2091 val[chan] = LLVMBuildFMul(builder, val[chan],
2092 LLVMConstReal(ctx->f32, 32767), "");
2093 /* If positive, add 0.5, else add -0.5. */
2094 val[chan] = LLVMBuildFAdd(builder, val[chan],
2095 LLVMBuildSelect(builder,
2096 LLVMBuildFCmp(builder, LLVMRealOGE,
2097 val[chan], base->zero, ""),
2098 LLVMConstReal(ctx->f32, 0.5),
2099 LLVMConstReal(ctx->f32, -0.5), ""), "");
2100 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2101 }
2102
2103 args->compr = 1; /* COMPR flag */
2104 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2105 si_llvm_pack_two_int32_as_int16(ctx, val));
2106 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2107 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2108 break;
2109
2110 case V_028714_SPI_SHADER_UINT16_ABGR: {
2111 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2112 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2113 LLVMValueRef max_alpha =
2114 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2115
2116 /* Clamp. */
2117 for (chan = 0; chan < 4; chan++) {
2118 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2119 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2120 val[chan],
2121 chan == 3 ? max_alpha : max_rgb);
2122 }
2123
2124 args->compr = 1; /* COMPR flag */
2125 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2126 si_llvm_pack_two_int16(ctx, val));
2127 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2128 si_llvm_pack_two_int16(ctx, val+2));
2129 break;
2130 }
2131
2132 case V_028714_SPI_SHADER_SINT16_ABGR: {
2133 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2134 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2135 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2136 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2137 LLVMValueRef max_alpha =
2138 !is_int10 ? max_rgb : ctx->i32_1;
2139 LLVMValueRef min_alpha =
2140 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2141
2142 /* Clamp. */
2143 for (chan = 0; chan < 4; chan++) {
2144 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2145 val[chan] = lp_build_emit_llvm_binary(bld_base,
2146 TGSI_OPCODE_IMIN,
2147 val[chan], chan == 3 ? max_alpha : max_rgb);
2148 val[chan] = lp_build_emit_llvm_binary(bld_base,
2149 TGSI_OPCODE_IMAX,
2150 val[chan], chan == 3 ? min_alpha : min_rgb);
2151 }
2152
2153 args->compr = 1; /* COMPR flag */
2154 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2155 si_llvm_pack_two_int32_as_int16(ctx, val));
2156 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2157 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2158 break;
2159 }
2160
2161 case V_028714_SPI_SHADER_32_ABGR:
2162 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2163 break;
2164 }
2165 }
2166
2167 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2168 LLVMValueRef alpha)
2169 {
2170 struct si_shader_context *ctx = si_shader_context(bld_base);
2171
2172 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2173 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2174 SI_PARAM_ALPHA_REF);
2175
2176 LLVMValueRef alpha_pass =
2177 lp_build_cmp(&bld_base->base,
2178 ctx->shader->key.part.ps.epilog.alpha_func,
2179 alpha, alpha_ref);
2180 LLVMValueRef arg =
2181 lp_build_select(&bld_base->base,
2182 alpha_pass,
2183 LLVMConstReal(ctx->f32, 1.0f),
2184 LLVMConstReal(ctx->f32, -1.0f));
2185
2186 ac_build_kill(&ctx->ac, arg);
2187 } else {
2188 ac_build_kill(&ctx->ac, NULL);
2189 }
2190 }
2191
2192 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2193 LLVMValueRef alpha,
2194 unsigned samplemask_param)
2195 {
2196 struct si_shader_context *ctx = si_shader_context(bld_base);
2197 struct gallivm_state *gallivm = &ctx->gallivm;
2198 LLVMValueRef coverage;
2199
2200 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2201 coverage = LLVMGetParam(ctx->main_fn,
2202 samplemask_param);
2203 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2204
2205 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2206 ctx->i32,
2207 &coverage, 1, LP_FUNC_ATTR_READNONE);
2208
2209 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2210 ctx->f32, "");
2211
2212 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2213 LLVMConstReal(ctx->f32,
2214 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2215
2216 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2217 }
2218
/* Compute the two clip-distance position exports (pos[2] and pos[3])
 * from a CLIPVERTEX output: each of the 8 clip distances is the dot
 * product of the vertex position with a user clip plane read from the
 * SI_VS_CONST_CLIP_PLANES constant buffer.
 *
 * \param pos      the 4 position export slots; slots [2] and [3] are filled
 * \param out_elts the 4 components of the CLIPVERTEX output
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);

	/* Two exports of 4 clip distances each. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		/* Start each distance at 0 and accumulate below. */
		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of plane (reg_index*4 + chan),
				 * component const_chan (4 bytes per dword). */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = buffer_load_const(ctx, const_resource,
							     addr);
				args->out[chan] =
					lp_build_add(base, args->out[chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
2263
2264 static void si_dump_streamout(struct pipe_stream_output_info *so)
2265 {
2266 unsigned i;
2267
2268 if (so->num_outputs)
2269 fprintf(stderr, "STREAMOUT\n");
2270
2271 for (i = 0; i < so->num_outputs; i++) {
2272 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2273 so->output[i].start_component;
2274 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2275 i, so->output[i].output_buffer,
2276 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2277 so->output[i].register_index,
2278 mask & 1 ? "x" : "",
2279 mask & 2 ? "y" : "",
2280 mask & 4 ? "z" : "",
2281 mask & 8 ? "w" : "");
2282 }
2283 }
2284
/* Store one stream-output declaration's components to its streamout
 * buffer: bitcast the selected components to i32, pack them into a
 * scalar or vector, and emit a single buffer store.
 *
 * \param so_buffers        per-buffer descriptors (indexed by output_buffer)
 * \param so_write_offsets  per-buffer byte offsets for this thread
 */
static void emit_streamout_output(struct si_shader_context *ctx,
				  LLVMValueRef const *so_buffers,
				  LLVMValueRef const *so_write_offsets,
				  struct pipe_stream_output *stream_out,
				  struct si_shader_output_values *shader_out)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned buf_idx = stream_out->output_buffer;
	unsigned start = stream_out->start_component;
	unsigned num_comps = stream_out->num_components;
	LLVMValueRef out[4];

	/* Defensive: the declaration must name 1-4 components. */
	assert(num_comps && num_comps <= 4);
	if (!num_comps || num_comps > 4)
		return;

	/* Load the output as int. */
	for (int j = 0; j < num_comps; j++) {
		/* All packed components must belong to the same vertex stream. */
		assert(stream_out->stream == shader_out->vertex_stream[start + j]);

		out[j] = LLVMBuildBitCast(builder,
					  shader_out->values[start + j],
					  ctx->i32, "");
	}

	/* Pack the output. */
	LLVMValueRef vdata = NULL;

	switch (num_comps) {
	case 1: /* as i32 */
		vdata = out[0];
		break;
	case 2: /* as v2i32 */
	case 3: /* as v4i32 (aligned to 4) */
	case 4: /* as v4i32 */
		/* Build a vector of the next power-of-two size and fill the
		 * used lanes; unused lanes stay undef. */
		vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
		for (int j = 0; j < num_comps; j++) {
			vdata = LLVMBuildInsertElement(builder, vdata, out[j],
						       LLVMConstInt(ctx->i32, j, 0), "");
		}
		break;
	}

	/* dst_offset is in dwords; *4 converts to bytes — assumes gallium's
	 * pipe_stream_output convention; verify against state tracker. */
	ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
				    vdata, num_comps,
				    so_write_offsets[buf_idx],
				    ctx->i32_0,
				    stream_out->dst_offset * 4, 1, 1, true, false);
}
2335
/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 *
 * Only the first so_vtx_count threads (told to us by the hardware via an
 * SGPR) actually store; each stores its own vertex at
 * streamout_offset*4 + (write_index + tid)*stride + attrib_offset.
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput, unsigned stream)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct pipe_stream_output_info *so = &sel->so;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
                 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4];
		LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
						    ctx->param_rw_buffers);

		for (i = 0; i < 4; i++) {
			/* stride == 0 means the buffer is unused. */
			if (!so->stride[i])
				continue;

			LLVMValueRef offset = LLVMConstInt(ctx->i32,
							   SI_VS_STREAMOUT_BUF0 + i, 0);

			so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

			LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
							      ctx->param_streamout_offset[i]);
			/* The offset SGPR is in dwords; convert to bytes. */
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned reg = so->output[i].register_index;

			/* Skip outputs the shader didn't write. */
			if (reg >= noutput)
				continue;

			/* Only write outputs belonging to the requested stream. */
			if (stream != so->output[i].stream)
				continue;

			emit_streamout_output(ctx, so_buffers, so_write_offset,
					      &so->output[i], &outputs[reg]);
		}
	}
	lp_build_endif(&if_ctx);
}
2420
2421 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2422 LLVMValueRef *values)
2423 {
2424 struct ac_export_args args;
2425
2426 si_llvm_init_export_args(&ctx->bld_base, values,
2427 V_008DFC_SQ_EXP_PARAM + index, &args);
2428 ac_build_export(&ctx->ac, &args);
2429 }
2430
/* Emit all PARAM exports for a hardware VS and record each output's
 * export slot in shader->info.vs_output_param_offset; the total count
 * goes to shader->info.nr_param_exports.
 *
 * Outputs are skipped when they have no component in vertex stream 0,
 * when their semantic is not passed as a parameter, or when the key's
 * kill_outputs mask says the next stage doesn't read them.
 */
static void si_build_param_exports(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
			           unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	unsigned param_count = 0;

	for (unsigned i = 0; i < noutput; i++) {
		unsigned semantic_name = outputs[i].semantic_name;
		unsigned semantic_index = outputs[i].semantic_index;

		/* Skip if no component of this output belongs to stream 0
		 * (streams != 0 only matter for GS streamout, not exports). */
		if (outputs[i].vertex_stream[0] != 0 &&
		    outputs[i].vertex_stream[1] != 0 &&
		    outputs[i].vertex_stream[2] != 0 &&
		    outputs[i].vertex_stream[3] != 0)
			continue;

		/* Only these semantics are exported as parameters. */
		switch (semantic_name) {
		case TGSI_SEMANTIC_LAYER:
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
		case TGSI_SEMANTIC_CLIPDIST:
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			break;
		default:
			continue;
		}

		/* Skip killed outputs. GENERIC indices >= SI_MAX_IO_GENERIC
		 * have no bit in the mask and are never killed here. */
		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
		     semantic_index < SI_MAX_IO_GENERIC) &&
		    shader->key.opt.kill_outputs &
		    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
			continue;

		si_export_param(ctx, param_count, outputs[i].values);

		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[i] = param_count++;
	}

	shader->info.nr_param_exports = param_count;
}
2477
/* Generate export instructions for hardware VS shader stage.
 *
 * Builds up to 4 position exports (position, misc vector, and two
 * clip-distance vectors), marks the last one "done", then emits the
 * parameter exports. Also updates shader->info.nr_pos_exports.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args pos_args[4] = {};
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned pos_idx;
	int i;

	/* Build position exports. */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			si_llvm_init_export_args(bld_base, outputs[i].values,
						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (!shader->key.opt.clip_disable) {
				/* Clip distances go to POS+2 and POS+3. */
				unsigned index = 2 + outputs[i].semantic_index;
				si_llvm_init_export_args(bld_base, outputs[i].values,
							 V_008DFC_SQ_EXP_POS + index,
							 &pos_args[index]);
			}
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (!shader->key.opt.clip_disable) {
				si_llvm_emit_clipvertex(bld_base, pos_args,
							outputs[i].values);
			}
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = base->zero; /* X */
		pos_args[0].out[1] = base->zero; /* Y */
		pos_args[0].out[2] = base->zero; /* Z */
		pos_args[0].out[3] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (shader->selector->info.writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2);

		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = base->zero; /* X */
		pos_args[1].out[1] = base->zero; /* Y */
		pos_args[1].out[2] = base->zero; /* Z */
		pos_args[1].out[3] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = ac_build_umin(&ctx->ac,
						       edgeflag_value,
						       ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
							      edgeflag_value,
							      ctx->f32, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* GFX9 has the layer in out.z[10:0] and the viewport
			 * index in out.z[19:16].
			 */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				LLVMValueRef v = viewport_index_value;

				/* OR the viewport index (shifted to [19:16])
				 * into the layer dword. */
				v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
				v = LLVMBuildShl(ctx->gallivm.builder, v,
						 LLVMConstInt(ctx->i32, 16, 0), "");
				v = LLVMBuildOr(ctx->gallivm.builder, v,
						bitcast(bld_base, TGSI_TYPE_UNSIGNED,
							pos_args[1].out[2]), "");
				pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
				pos_args[1].enabled_channels |= 1 << 2;
			}
		} else {
			/* Pre-GFX9: layer in Z, viewport index in W. */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	/* Count the initialized position exports. */
	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports. */
	si_build_param_exports(ctx, outputs, noutput);
}
2629
/**
 * Forward all outputs from the vertex shader to the TES. This is only used
 * for the fixed function TCS.
 *
 * For every input selected by the key's ff_tcs_inputs_to_copy bitmask,
 * read the value from LDS (where the VS wrote it) and store it to the
 * off-chip TCS->TES buffer at this invocation's patch address.
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef invocation_id, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

	/* LDS address of this invocation's vertex within the current patch. */
	lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Iterate over the set bits of the inputs-to-copy mask. */
	inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		/* Each input occupies 4 dwords in LDS. */
		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                            LLVMConstInt(ctx->i32, 4 * i, 0),
		                             "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
					              get_rel_patch_id(ctx),
		                                      invocation_id,
		                                      LLVMConstInt(ctx->i32, i, 0));

		/* Load the whole vec4 from LDS (~0 = all channels). */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
					    buffer_offset, 0, 1, 0, true, false);
	}
}
2672
/* Read the per-patch tessellation factors from LDS and write them to the
 * tess-factor ring buffer (and, if the TES reads them, also to the
 * off-chip buffer). Only invocation 0 performs the stores; a barrier
 * first ensures all invocations' LDS writes are visible.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, ctx->i32_0, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_inner_index * 4, 0), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_outer_index * 4, 0), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->i32);
		outer[i] = LLVMGetUndef(ctx->i32);
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
		outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* Quads need a second store (stride 6 > 4 components). */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       ctx->param_tcs_factor_offset);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");

	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, ctx->i32_0, ""));

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->b.chip_class <= VI) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->i32, 0x80000000, 0),
					    1, ctx->i32_0, tf_base,
					    offset, 1, 0, true, false);
		offset += 4;
	}

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, 1, 0, true, false);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, 1, 0, true, false);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index_patch(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_outer, 0));

		outer_vec = lp_build_gather_values(gallivm, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index_patch(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				    lp_build_gather_values(gallivm, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, 1, 0, true, false);
		}
	}

	lp_build_endif(&if_ctx);
}
2835
2836 static LLVMValueRef
2837 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2838 unsigned param, unsigned return_index)
2839 {
2840 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2841 LLVMGetParam(ctx->main_fn, param),
2842 return_index, "");
2843 }
2844
2845 static LLVMValueRef
2846 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2847 unsigned param, unsigned return_index)
2848 {
2849 LLVMBuilderRef builder = ctx->gallivm.builder;
2850 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2851
2852 return LLVMBuildInsertValue(builder, ret,
2853 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2854 return_index, "");
2855 }
2856
2857 static LLVMValueRef
2858 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2859 unsigned param, unsigned return_index)
2860 {
2861 LLVMBuilderRef builder = ctx->gallivm.builder;
2862 LLVMValueRef ptr, lo, hi;
2863
2864 ptr = LLVMGetParam(ctx->main_fn, param);
2865 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2866 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2867 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2868 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2869 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2870 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2871 }
2872
/* This only writes the tessellation factor levels.
 *
 * The actual TF stores happen in the epilog; this function only gathers
 * the values the epilog needs (rel_patch_id, invocation_id, the TF LDS
 * offset, and several SGPR inputs) into the return aggregate.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	/* invocation_id = bits [12:8] of tcs_rel_ids. */
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* GFX9 wraps the merged-shader body in an if-block
		 * (merged_wrap_if_state).  Close it here and build phis so
		 * the epilog values are defined on both paths; threads that
		 * skipped the body get undef (or a skip marker for
		 * invocation_id). */
		LLVMBasicBlockRef blocks[2] = {
			LLVMGetInsertBlock(builder),
			ctx->merged_wrap_if_state.entry_block
		};
		LLVMValueRef values[2];

		lp_build_endif(&ctx->merged_wrap_if_state);

		values[0] = rel_patch_id;
		values[1] = LLVMGetUndef(ctx->i32);
		rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = tf_lds_offset;
		values[1] = LLVMGetUndef(ctx->i32);
		tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = invocation_id;
		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
		invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
	}

	/* Return epilog parameters from this function. */
	LLVMValueRef ret = ctx->return_value;
	unsigned vgpr;

	if (ctx->screen->b.chip_class >= GFX9) {
		/* GFX9: user SGPRs start at return slot 8. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
		vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
	} else {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
					  GFX6_TCS_NUM_USER_SGPR);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
					  GFX6_TCS_NUM_USER_SGPR + 1);
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs */
	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

	/* Leave a hole corresponding to the two input VGPRs. This ensures that
	 * the invocation_id output does not alias the param_tcs_rel_ids input,
	 * which saves a V_MOV on gfx9.
	 */
	vgpr += 2;

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	ctx->return_value = ret;
}
2954
/* Pass TCS inputs from LS to TCS on GFX9.
 *
 * On GFX9, LS and HS are merged into one shader; the LS part hands every
 * SGPR/VGPR input the HS part needs to the HS via fixed return slots:
 * system SGPRs in slots 0..7, user SGPRs starting at slot 8, VGPRs after
 * GFX9_TCS_NUM_USER_SGPR.
 */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* System SGPRs (pointers occupy two i32 slots each). */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret,
		ctx->param_bindless_samplers_and_images,
		8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

	/* User SGPRs. */
	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
				  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
				  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);

	/* Descriptor pointers follow tcs_factor_addr_base64k in the input
	 * parameter list. */
	unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);

	/* VGPRs (returned as floats). */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_patch_id, vgpr++);
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_rel_ids, vgpr++);
	ctx->return_value = ret;
}
2995
/* Pass GS inputs from ES to GS on GFX9.
 *
 * Same scheme as si_set_ls_return_value_for_tcs: the ES part of the
 * merged ES+GS shader forwards SGPR/VGPR inputs to the GS part through
 * fixed return slots.
 */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* System SGPRs (pointers occupy two i32 slots each). */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);

	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret,
		ctx->param_bindless_samplers_and_images,
		8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

	/* Descriptor pointers follow vs_state_bits in the input list. */
	unsigned desc_param = ctx->param_vs_state_bits + 1;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);

	/* VGPRs: the five GS vertex-offset/primitive/instance inputs,
	 * starting at param_gs_vtx01_offset, returned as floats. */
	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
	for (unsigned i = 0; i < 5; i++) {
		unsigned param = ctx->param_gs_vtx01_offset + i;
		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
	}
	ctx->return_value = ret;
}
3023
/* Emit the VS-as-LS epilogue: store all vertex outputs to LDS so the
 * next stage (TCS aka HS) can read them as its inputs. */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
	/* Base LDS dword address for this vertex's outputs. */
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		if (name == TGSI_SEMANTIC_LAYER ||
		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
			continue;

		/* Each output occupies 4 dwords at a slot determined by its
		 * semantic, shared with the TCS input addressing. */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					LLVMConstInt(ctx->i32, param * 4, 0), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}

	/* On GFX9 (merged LS+HS), also forward inputs to the HS part. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}
3076
/* Emit the ES (VS/TES before GS) epilogue: write all outputs to the
 * ESGS ring — a memory ring buffer before GFX9, LDS on GFX9. */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	LLVMValueRef lds_base = NULL;
	unsigned chan;
	int i;

	if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
		/* Compute this vertex's base LDS address:
		 * global vertex index = wave_idx * 64 + thread id,
		 * scaled by the per-vertex ESGS item size in dwords. */
		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
		LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
		vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
					 LLVMBuildMul(gallivm->builder, wave_idx,
						      LLVMConstInt(ctx->i32, 64, false), ""), "");
		lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
	}

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		int param;

		/* See the ARB_shader_viewport_layer_array note in
		 * si_llvm_emit_ls_epilogue: these are ignored here. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
						      info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* GFX9 has the ESGS ring in LDS. */
			if (ctx->screen->b.chip_class >= GFX9) {
				lds_store(bld_base, param * 4 + chan, lds_base, out_val);
				continue;
			}

			/* Pre-GFX9: buffer store to the ESGS ring at
			 * (4 * param + chan) dwords + es2gs offset. */
			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->esgs_ring,
						    out_val, 1, NULL, soffset,
						    (4 * param + chan) * 4,
						    1, 1, true, true);
		}
	}

	/* On GFX9 (merged ES+GS), also forward inputs to the GS part. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_es_return_value_for_gs(ctx);
}
3132
3133 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3134 {
3135 if (ctx->screen->b.chip_class >= GFX9)
3136 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3137 else
3138 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3139 }
3140
/* Emit the GS epilogue: send the GS_DONE message and, on GFX9, close the
 * merged-shader wrapper if-block. */
static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
			 si_get_gs_wave_id(ctx));

	/* GFX9 wraps the GS body of the merged ES+GS shader in an if. */
	if (ctx->screen->b.chip_class >= GFX9)
		lp_build_endif(&ctx->merged_wrap_if_state);
}
3151
/* Emit the hardware-VS epilogue: optional vertex color clamping,
 * streamout, an optional PrimitiveID export, and the final position and
 * parameter exports (via si_llvm_export_vs). */
static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* +1 leaves room for the optional PrimitiveID output below.
	 * NOTE(review): the allocation result is not checked; a failed
	 * MALLOC would crash in the loops below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->main_fn,
						    ctx->param_vs_state_bits);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all 4 channels of the color output to [0, 1]. */
			for (j = 0; j < 4; j++) {
				addr = addrs[4 * i + j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = ac_build_clamp(&ctx->ac, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Read all outputs into the outputs[] array for export. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		for (j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      addrs[4 * i + j],
					      "");
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	if (ctx->shader->selector->so.num_outputs)
		si_llvm_emit_streamout(ctx, outputs, i, 0);

	/* Export PrimitiveID. */
	if (ctx->shader->key.mono.u.vs_export_prim_id) {
		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
		outputs[i].semantic_index = 0;
		outputs[i].values[0] = LLVMBuildBitCast(gallivm->builder,
				get_primitive_id(ctx, 0), ctx->f32, "");
		for (j = 1; j < 4; j++)
			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);

		memset(outputs[i].vertex_stream, 0,
		       sizeof(outputs[i].vertex_stream));
		i++;
	}

	si_llvm_export_vs(&ctx->bld_base, outputs, i);
	FREE(outputs);
}
3239
3240 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3241 {
3242 struct si_shader_context *ctx = si_shader_context(bld_base);
3243
3244 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3245 &ctx->outputs[0][0]);
3246 }
3247
/* Queue of pixel-shader export instructions, emitted together at the end
 * of the shader by si_emit_ps_exports. */
struct si_ps_exports {
	unsigned num;	/* number of valid entries in args[] */
	struct ac_export_args args[10];
};
3252
3253 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3254 bool writes_samplemask)
3255 {
3256 if (writes_z) {
3257 /* Z needs 32 bits. */
3258 if (writes_samplemask)
3259 return V_028710_SPI_SHADER_32_ABGR;
3260 else if (writes_stencil)
3261 return V_028710_SPI_SHADER_32_GR;
3262 else
3263 return V_028710_SPI_SHADER_32_R;
3264 } else if (writes_stencil || writes_samplemask) {
3265 /* Both stencil and sample mask need only 16 bits. */
3266 return V_028710_SPI_SHADER_UINT16_ABGR;
3267 } else {
3268 return V_028710_SPI_SHADER_ZERO;
3269 }
3270 }
3271
/* Queue the MRTZ export carrying depth, stencil, and/or sample mask. */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args;
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args.valid_mask = 1; /* whether the EXEC mask is valid */
	args.done = 1; /* DONE bit */

	/* Specify the target we are exporting */
	args.target = V_008DFC_SQ_EXP_MRTZ;

	args.compr = 0; /* COMPR flag */
	args.out[0] = base->undef; /* R, depth */
	args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args.out[2] = base->undef; /* B, sample mask */
	args.out[3] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		/* 16-bit compressed export: no depth, stencil in X,
		 * sample mask in Y. */
		assert(!depth);
		args.compr = 1; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			/* Compressed exports enable channels in pairs. */
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args.out[1] = samplemask;
			mask |= 0xc;
		}
	} else {
		/* Uncompressed export: one 32-bit channel per value. */
		if (depth) {
			args.out[0] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args.out[1] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args.out[2] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND and HAINAN) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND &&
	    ctx->screen->b.family != CHIP_HAINAN)
		mask |= 0x1;

	/* Specify which components to enable */
	args.enabled_channels = mask;

	memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
3342
/* Queue the color export(s) for one MRT, applying the epilog key's
 * color-clamp, alpha-to-one, alpha-test, and smoothing modifications.
 *
 * is_last marks the final color export of the shader: that export gets
 * the DONE bit and a valid EXEC mask.
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.part.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = ac_build_clamp(&ctx->ac, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only applied to color 0. */
	if (index == 0 &&
	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
		struct ac_export_args args[8];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
			if (args[c].enabled_channels)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
				args[c].done = 1; /* DONE bit */
			} else if (!args[c].enabled_channels)
				continue; /* unnecessary NULL export */

			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
		}
	} else {
		struct ac_export_args args;

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 &args);
		if (is_last) {
			args.valid_mask = 1; /* whether the EXEC mask is valid */
			args.done = 1; /* DONE bit */
		} else if (!args.enabled_channels)
			return; /* unnecessary NULL export */

		memcpy(&exp->args[exp->num++], &args, sizeof(args));
	}
}
3409
3410 static void si_emit_ps_exports(struct si_shader_context *ctx,
3411 struct si_ps_exports *exp)
3412 {
3413 for (unsigned i = 0; i < exp->num; i++)
3414 ac_build_export(&ctx->ac, &exp->args[i]);
3415 }
3416
3417 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3418 {
3419 struct si_shader_context *ctx = si_shader_context(bld_base);
3420 struct lp_build_context *base = &bld_base->base;
3421 struct ac_export_args args;
3422
3423 args.enabled_channels = 0x0; /* enabled channels */
3424 args.valid_mask = 1; /* whether the EXEC mask is valid */
3425 args.done = 1; /* DONE bit */
3426 args.target = V_008DFC_SQ_EXP_NULL;
3427 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3428 args.out[0] = base->undef; /* R */
3429 args.out[1] = base->undef; /* G */
3430 args.out[2] = base->undef; /* B */
3431 args.out[3] = base->undef; /* A */
3432
3433 ac_build_export(&ctx->ac, &args);
3434 }
3435
3436 /**
3437 * Return PS outputs in this order:
3438 *
3439 * v[0:3] = color0.xyzw
3440 * v[4:7] = color1.xyzw
3441 * ...
3442 * vN+0 = Depth
3443 * vN+1 = Stencil
3444 * vN+2 = SampleMask
3445 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3446 *
3447 * The alpha-ref SGPR is returned via its original location.
3448 */
static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
				      unsigned max_outputs,
				      LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Apply any KILLs that were deferred until the end of the shader. */
	if (ctx->postponed_kill)
		ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = addrs[4 * i + j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of position is returned. */
			depth = LLVMBuildLoad(builder,
					      addrs[4 * i + 2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						addrs[4 * i + 1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   addrs[4 * i + 0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMBuildBitCast(ctx->ac.builder,
						LLVMGetParam(ctx->main_fn,
							SI_PARAM_ALPHA_REF),
						ctx->i32, ""),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs: colors first, then depth/stencil/samplemask, skipping
	 * colors that were never written (see the header comment above for
	 * the full layout). */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3534
3535 /* Prevent optimizations (at least of memory accesses) across the current
3536 * point in the program by emitting empty inline assembly that is marked as
3537 * having side effects.
3538 *
3539 * Optionally, a value can be passed through the inline assembly to prevent
3540 * LLVM from hoisting calls to ReadNone functions.
3541 */
static void emit_optimization_barrier(struct si_shader_context *ctx,
				      LLVMValueRef *pvgpr)
{
	/* Each barrier gets unique asm text so LLVM cannot merge or CSE
	 * distinct barriers. */
	static int counter = 0;

	LLVMBuilderRef builder = ctx->gallivm.builder;
	char code[16];

	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

	if (!pvgpr) {
		/* Pure barrier: void inline asm marked as having side
		 * effects. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
	} else {
		/* Thread the first dword of *pvgpr through the asm
		 * ("=v,0": VGPR output tied to input operand 0) so LLVM
		 * must order the value's computation relative to the
		 * barrier. The value is round-tripped through an i32
		 * vector and rebuilt afterwards. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
		LLVMValueRef vgpr = *pvgpr;
		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
		unsigned vgpr_size = llvm_get_type_size(vgpr_type);
		LLVMValueRef vgpr0;

		assert(vgpr_size % 4 == 0);

		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

		*pvgpr = vgpr;
	}
}
3575
3576 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3577 {
3578 struct gallivm_state *gallivm = &ctx->gallivm;
3579 LLVMBuilderRef builder = gallivm->builder;
3580 LLVMValueRef args[1] = {
3581 LLVMConstInt(ctx->i32, simm16, 0)
3582 };
3583 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3584 ctx->voidt, args, 1, 0);
3585 }
3586
/* TGSI MEMBAR: emit a memory barrier by waiting on the relevant hardware
 * counters. Starting from NOOP_WAITCNT, each "waitcnt &= MASK" selects an
 * additional counter for s_waitcnt to drain. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	/* The barrier flags are a constant operand of the instruction. */
	LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
	unsigned flags = LLVMConstIntGetZExtValue(src0);
	unsigned waitcnt = NOOP_WAITCNT;

	/* A thread-group barrier covers both VMEM and LDS traffic. */
	if (flags & TGSI_MEMBAR_THREAD_GROUP)
		waitcnt &= VM_CNT & LGKM_CNT;

	/* Buffer and image memory traffic is tracked by VM_CNT. */
	if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
		     TGSI_MEMBAR_SHADER_BUFFER |
		     TGSI_MEMBAR_SHADER_IMAGE))
		waitcnt &= VM_CNT;

	/* Shared (LDS) traffic is tracked by LGKM_CNT. */
	if (flags & TGSI_MEMBAR_SHARED)
		waitcnt &= LGKM_CNT;

	if (waitcnt != NOOP_WAITCNT)
		si_emit_waitcnt(ctx, waitcnt);
}
3611
3612 static void clock_emit(
3613 const struct lp_build_tgsi_action *action,
3614 struct lp_build_tgsi_context *bld_base,
3615 struct lp_build_emit_data *emit_data)
3616 {
3617 struct si_shader_context *ctx = si_shader_context(bld_base);
3618 struct gallivm_state *gallivm = &ctx->gallivm;
3619 LLVMValueRef tmp;
3620
3621 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3622 ctx->i64, NULL, 0, 0);
3623 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3624
3625 emit_data->output[0] =
3626 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3627 emit_data->output[1] =
3628 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3629 }
3630
/* Build the LLVM type "pointer to [num_elements x elem_type] in the
 * constant address space". */
LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
{
	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
			       CONST_ADDR_SPACE);
}
3636
/* TGSI DDX/DDY and their _FINE variants: compute a screen-space
 * derivative of the source value. */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned opcode = emit_data->info->opcode;
	LLVMValueRef val;
	int idx;
	unsigned mask;

	/* Select the base lane of the 2x2 quad to difference against:
	 * coarse derivatives always use the top-left pixel. */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = AC_TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = AC_TID_MASK_TOP;
	else
		mask = AC_TID_MASK_TOP_LEFT;

	/* for DDX we want the next X pixel, for DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;

	val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
	val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
			    mask, idx, val);
	emit_data->output[emit_data->chan] = val;
}
3664
3665 /*
3666 * this takes an I,J coordinate pair,
3667 * and works out the X and Y derivatives.
3668 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3669 */
3670 static LLVMValueRef si_llvm_emit_ddxy_interp(
3671 struct lp_build_tgsi_context *bld_base,
3672 LLVMValueRef interp_ij)
3673 {
3674 struct si_shader_context *ctx = si_shader_context(bld_base);
3675 struct gallivm_state *gallivm = &ctx->gallivm;
3676 LLVMValueRef result[4], a;
3677 unsigned i;
3678
3679 for (i = 0; i < 2; i++) {
3680 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3681 LLVMConstInt(ctx->i32, i, 0), "");
3682 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3683 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3684 }
3685
3686 return lp_build_gather_values(gallivm, result, 4);
3687 }
3688
/* Fetch the arguments for INTERP_OFFSET / INTERP_SAMPLE: produce the
 * (x, y) position within the pixel at which to interpolate, in
 * args[0]/args[1]. INTERP_CENTROID needs no extra arguments. */
static void interp_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
		/* offset is in second src, first two channels */
		emit_data->args[0] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_X);
		emit_data->args[1] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_Y);
		emit_data->arg_count = 2;
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef sample_position;
		LLVMValueRef sample_id;
		LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);

		/* fetch sample ID, then fetch its sample position,
		 * and place into first two channels.
		 */
		sample_id = lp_build_emit_fetch(bld_base,
						emit_data->inst, 1, TGSI_CHAN_X);
		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
					     ctx->i32, "");
		sample_position = load_sample_position(ctx, sample_id);

		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_0, "");

		/* Sample positions are in [0, 1); re-center them around the
		 * pixel center by subtracting 0.5. */
		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_1, "");
		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
		emit_data->arg_count = 2;
	}
}
3732
/* Emit TGSI_OPCODE_INTERP_CENTROID/SAMPLE/OFFSET: interpolate a fragment
 * shader input with explicitly chosen interpolation coordinates.
 *
 * For INTERP_OFFSET/INTERP_SAMPLE, interp_fetch_args has already placed the
 * (x, y) deltas relative to the pixel center in emit_data->args[0..1]; the
 * base I/J pair is adjusted here using DDX/DDY of the center-based I/J.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				   struct lp_build_tgsi_context *bld_base,
				   struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *input = &inst->Src[0];
	int input_base, input_array_size;
	int chan;
	int i;
	LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
	LLVMValueRef array_idx;
	int interp_param_idx;
	unsigned interp;
	unsigned location;

	assert(input->Register.File == TGSI_FILE_INPUT);

	/* Compute the range of inputs [input_base, input_base + input_array_size)
	 * that an indirect index may select, plus the dynamic index into it. */
	if (input->Register.Indirect) {
		unsigned array_id = input->Indirect.ArrayID;

		if (array_id) {
			/* A declared input array: use its exact bounds. */
			input_base = info->input_array_first[array_id];
			input_array_size = info->input_array_last[array_id] - input_base + 1;
		} else {
			/* No array declaration: anything up to the last input
			 * may be addressed. */
			input_base = inst->Src[0].Register.Index;
			input_array_size = info->num_inputs - input_base;
		}

		array_idx = si_get_indirect_index(ctx, &input->Indirect,
						  input->Register.Index - input_base);
	} else {
		input_base = inst->Src[0].Register.Index;
		input_array_size = 1;
		array_idx = ctx->i32_0;
	}

	interp = shader->selector->info.input_interpolate[input_base];

	/* OFFSET/SAMPLE start from the center I/J pair; CENTROID uses the
	 * centroid I/J pair directly. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	else
		interp_param = NULL; /* constant/flat interpolation: no I/J */

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
			/* ddxy_out layout: [ddx.i, ddx.j, ddy.i, ddy.j] */
			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
		}
		interp_param = lp_build_gather_values(gallivm, ij_out, 2);
	}

	if (interp_param) {
		interp_param = LLVMBuildBitCast(gallivm->builder,
			interp_param, LLVMVectorType(ctx->f32, 2), "");
	}

	/* Interpolate each selected channel: gather the value for every
	 * possible array element, then pick the dynamically indexed one. */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);

		for (unsigned idx = 0; idx < input_array_size; ++idx) {
			LLVMValueRef v, i = NULL, j = NULL;

			if (interp_param) {
				interp_param = LLVMBuildBitCast(gallivm->builder,
					interp_param, LLVMVectorType(ctx->f32, 2), "");
				i = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_0, "");
				j = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_1, "");
			}
			v = si_build_fs_interp(ctx, input_base + idx, schan,
					       prim_mask, i, j);

			gather = LLVMBuildInsertElement(gallivm->builder,
				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
		}

		emit_data->output[chan] = LLVMBuildExtractElement(
			gallivm->builder, gather, array_idx, "");
	}
}
3859
3860 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3861 LLVMValueRef value)
3862 {
3863 struct gallivm_state *gallivm = &ctx->gallivm;
3864 LLVMValueRef args[3] = {
3865 value,
3866 ctx->i32_0,
3867 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3868 };
3869
3870 /* We currently have no other way to prevent LLVM from lifting the icmp
3871 * calls to a dominating basic block.
3872 */
3873 emit_optimization_barrier(ctx, &args[0]);
3874
3875 if (LLVMTypeOf(args[0]) != ctx->i32)
3876 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3877
3878 return lp_build_intrinsic(gallivm->builder,
3879 "llvm.amdgcn.icmp.i32",
3880 ctx->i64, args, 3,
3881 LP_FUNC_ATTR_NOUNWIND |
3882 LP_FUNC_ATTR_READNONE |
3883 LP_FUNC_ATTR_CONVERGENT);
3884 }
3885
3886 static void vote_all_emit(
3887 const struct lp_build_tgsi_action *action,
3888 struct lp_build_tgsi_context *bld_base,
3889 struct lp_build_emit_data *emit_data)
3890 {
3891 struct si_shader_context *ctx = si_shader_context(bld_base);
3892 struct gallivm_state *gallivm = &ctx->gallivm;
3893 LLVMValueRef active_set, vote_set;
3894 LLVMValueRef tmp;
3895
3896 active_set = si_emit_ballot(ctx, ctx->i32_1);
3897 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3898
3899 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3900 emit_data->output[emit_data->chan] =
3901 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3902 }
3903
3904 static void vote_any_emit(
3905 const struct lp_build_tgsi_action *action,
3906 struct lp_build_tgsi_context *bld_base,
3907 struct lp_build_emit_data *emit_data)
3908 {
3909 struct si_shader_context *ctx = si_shader_context(bld_base);
3910 struct gallivm_state *gallivm = &ctx->gallivm;
3911 LLVMValueRef vote_set;
3912 LLVMValueRef tmp;
3913
3914 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3915
3916 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3917 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3918 emit_data->output[emit_data->chan] =
3919 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3920 }
3921
3922 static void vote_eq_emit(
3923 const struct lp_build_tgsi_action *action,
3924 struct lp_build_tgsi_context *bld_base,
3925 struct lp_build_emit_data *emit_data)
3926 {
3927 struct si_shader_context *ctx = si_shader_context(bld_base);
3928 struct gallivm_state *gallivm = &ctx->gallivm;
3929 LLVMValueRef active_set, vote_set;
3930 LLVMValueRef all, none, tmp;
3931
3932 active_set = si_emit_ballot(ctx, ctx->i32_1);
3933 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3934
3935 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3936 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3937 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3938 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3939 emit_data->output[emit_data->chan] =
3940 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3941 }
3942
3943 static void ballot_emit(
3944 const struct lp_build_tgsi_action *action,
3945 struct lp_build_tgsi_context *bld_base,
3946 struct lp_build_emit_data *emit_data)
3947 {
3948 struct si_shader_context *ctx = si_shader_context(bld_base);
3949 LLVMBuilderRef builder = ctx->gallivm.builder;
3950 LLVMValueRef tmp;
3951
3952 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3953 tmp = si_emit_ballot(ctx, tmp);
3954 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3955
3956 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3957 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3958 }
3959
3960 static void read_invoc_fetch_args(
3961 struct lp_build_tgsi_context *bld_base,
3962 struct lp_build_emit_data *emit_data)
3963 {
3964 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
3965 0, emit_data->src_chan);
3966
3967 /* Always read the source invocation (= lane) from the X channel. */
3968 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
3969 1, TGSI_CHAN_X);
3970 emit_data->arg_count = 2;
3971 }
3972
3973 static void read_lane_emit(
3974 const struct lp_build_tgsi_action *action,
3975 struct lp_build_tgsi_context *bld_base,
3976 struct lp_build_emit_data *emit_data)
3977 {
3978 struct si_shader_context *ctx = si_shader_context(bld_base);
3979 LLVMBuilderRef builder = ctx->gallivm.builder;
3980
3981 /* We currently have no other way to prevent LLVM from lifting the icmp
3982 * calls to a dominating basic block.
3983 */
3984 emit_optimization_barrier(ctx, &emit_data->args[0]);
3985
3986 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3987 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3988 ctx->i32, "");
3989 }
3990
3991 emit_data->output[emit_data->chan] =
3992 ac_build_intrinsic(&ctx->ac, action->intr_name,
3993 ctx->i32, emit_data->args, emit_data->arg_count,
3994 AC_FUNC_ATTR_READNONE |
3995 AC_FUNC_ATTR_CONVERGENT);
3996 }
3997
3998 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3999 struct lp_build_emit_data *emit_data)
4000 {
4001 struct si_shader_context *ctx = si_shader_context(bld_base);
4002 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4003 LLVMValueRef imm;
4004 unsigned stream;
4005
4006 assert(src0.File == TGSI_FILE_IMMEDIATE);
4007
4008 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4009 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4010 return stream;
4011 }
4012
/* Emit one vertex from the geometry shader: write all enabled outputs of
 * the chosen stream to the GSVS ring, bump the per-stream vertex counter,
 * and send the EMIT message to the GS hardware.
 */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_if_state if_state;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_gs2vs_offset);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	unsigned chan, offset;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 *
	 * If the shader has no writes to memory, kill it instead. This skips
	 * further memory loads and may allow LLVM to skip to the end
	 * altogether.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->i32,
					      shader->selector->gs_max_out_vertices, 0), "");

	bool use_kill = !info->writes_memory;
	if (use_kill) {
		/* A negative kill value terminates the thread. */
		kill = lp_build_select(&bld_base->base, can_emit,
				       LLVMConstReal(ctx->f32, 1.0f),
				       LLVMConstReal(ctx->f32, -1.0f));

		ac_build_kill(&ctx->ac, kill);
	} else {
		lp_build_if(&if_state, gallivm, can_emit);
	}

	/* Store every enabled output channel belonging to this stream. */
	offset = 0;
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];

		for (chan = 0; chan < 4; chan++) {
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Ring layout: each component occupies a run of
			 * gs_max_out_vertices dwords, indexed by the
			 * current vertex. */
			LLVMValueRef voffset =
				LLVMConstInt(ctx->i32, offset *
					     shader->selector->gs_max_out_vertices, 0);
			offset++;

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, soffset, 0,
						    1, 1, true, true);
		}
	}

	/* Remember how many vertices this thread has emitted on this stream. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      ctx->i32_1);

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 si_get_gs_wave_id(ctx));
	if (!use_kill)
		lp_build_endif(&if_state);
}
4102
4103 /* Cut one primitive from the geometry shader */
4104 static void si_llvm_emit_primitive(
4105 const struct lp_build_tgsi_action *action,
4106 struct lp_build_tgsi_context *bld_base,
4107 struct lp_build_emit_data *emit_data)
4108 {
4109 struct si_shader_context *ctx = si_shader_context(bld_base);
4110 unsigned stream;
4111
4112 /* Signal primitive cut */
4113 stream = si_llvm_get_stream(bld_base, emit_data);
4114 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4115 si_get_gs_wave_id(ctx));
4116 }
4117
/* Emit TGSI_OPCODE_BARRIER: synchronize the threads of a workgroup. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	/* SI only (thanks to a hw bug workaround):
	 * The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* NOTE(review): "LGKM_CNT & VM_CNT" looks like it should be
		 * "|", but s_waitcnt masks conventionally combine with AND
		 * (each mask clears only its own counter field) — confirm
		 * against the LGKM_CNT/VM_CNT definitions before changing. */
		si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
		return;
	}

	lp_build_intrinsic(gallivm->builder,
			   "llvm.amdgcn.s.barrier",
			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
}
4139
/* Shared TGSI action for the INTERP_CENTROID/SAMPLE/OFFSET opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4144
/* Create the LLVM function for the current shader part and apply the
 * parameter and target-dependent function attributes.
 *
 * returns/num_returns describe the return value types (used by merged
 * shaders and parts with an epilog); fninfo carries the parameter types
 * and optional assignment destinations filled in by add_arg_assign();
 * max_workgroup_size, if non-zero, is set as the
 * "amdgpu-max-work-group-size" attribute.
 */
static void si_create_function(struct si_shader_context *ctx,
			       const char *name,
			       LLVMTypeRef *returns, unsigned num_returns,
			       struct si_function_info *fninfo,
			       unsigned max_workgroup_size)
{
	int i;

	si_llvm_create_func(ctx, name, returns, num_returns,
			    fninfo->types, fninfo->num_params);
	ctx->return_value = LLVMGetUndef(ctx->return_type);

	/* SGPR parameters come first; decorate each one. */
	for (i = 0; i < fninfo->num_sgpr_params; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			/* Attribute indices are 1-based (0 is the return value). */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
	}

	/* Hand parameter values to the destinations requested via add_arg_assign(). */
	for (i = 0; i < fninfo->num_params; ++i) {
		if (fninfo->assign[i])
			*fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
	}

	if (max_workgroup_size) {
		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
				      max_workgroup_size);
	}
	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
					   "no-signed-zeros-fp-math",
					   "true");

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
4204
4205 static void declare_streamout_params(struct si_shader_context *ctx,
4206 struct pipe_stream_output_info *so,
4207 struct si_function_info *fninfo)
4208 {
4209 int i;
4210
4211 /* Streamout SGPRs. */
4212 if (so->num_outputs) {
4213 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4214 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4215 else
4216 ctx->param_streamout_config = fninfo->num_params - 1;
4217
4218 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4219 }
4220 /* A streamout buffer offset is loaded if the stride is non-zero. */
4221 for (i = 0; i < 4; i++) {
4222 if (!so->stride[i])
4223 continue;
4224
4225 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4226 }
4227 }
4228
4229 static unsigned llvm_get_type_size(LLVMTypeRef type)
4230 {
4231 LLVMTypeKind kind = LLVMGetTypeKind(type);
4232
4233 switch (kind) {
4234 case LLVMIntegerTypeKind:
4235 return LLVMGetIntTypeWidth(type) / 8;
4236 case LLVMFloatTypeKind:
4237 return 4;
4238 case LLVMPointerTypeKind:
4239 return 8;
4240 case LLVMVectorTypeKind:
4241 return LLVMGetVectorSize(type) *
4242 llvm_get_type_size(LLVMGetElementType(type));
4243 case LLVMArrayTypeKind:
4244 return LLVMGetArrayLength(type) *
4245 llvm_get_type_size(LLVMGetElementType(type));
4246 default:
4247 assert(0);
4248 return 0;
4249 }
4250 }
4251
4252 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4253 {
4254 struct gallivm_state *gallivm = &ctx->gallivm;
4255
4256 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4257 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4258 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4259 "lds");
4260 }
4261
4262 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4263 {
4264 switch (shader->selector->type) {
4265 case PIPE_SHADER_TESS_CTRL:
4266 /* Return this so that LLVM doesn't remove s_barrier
4267 * instructions on chips where we use s_barrier. */
4268 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4269
4270 case PIPE_SHADER_GEOMETRY:
4271 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4272
4273 case PIPE_SHADER_COMPUTE:
4274 break; /* see below */
4275
4276 default:
4277 return 0;
4278 }
4279
4280 const unsigned *properties = shader->selector->info.properties;
4281 unsigned max_work_group_size =
4282 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4283 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4284 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4285
4286 if (!max_work_group_size) {
4287 /* This is a variable group size compute shader,
4288 * compile it for the maximum possible group size.
4289 */
4290 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4291 }
4292 return max_work_group_size;
4293 }
4294
4295 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4296 struct si_function_info *fninfo,
4297 bool assign_params)
4298 {
4299 unsigned const_and_shader_buffers =
4300 add_arg(fninfo, ARG_SGPR,
4301 si_const_array(ctx->v4i32,
4302 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
4303 unsigned samplers_and_images =
4304 add_arg(fninfo, ARG_SGPR,
4305 si_const_array(ctx->v8i32,
4306 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4307
4308 if (assign_params) {
4309 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4310 ctx->param_samplers_and_images = samplers_and_images;
4311 }
4312 }
4313
4314 static void declare_default_desc_pointers(struct si_shader_context *ctx,
4315 struct si_function_info *fninfo)
4316 {
4317 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4318 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4319 ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4320 si_const_array(ctx->v8i32, 0));
4321 declare_per_stage_desc_pointers(ctx, fninfo, true);
4322 }
4323
/* Declare the VS-specific input SGPRs, in their fixed ABI order:
 * vertex buffer descriptors, base vertex, start instance, draw id,
 * and the VS state bits. */
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
					    struct si_function_info *fninfo)
{
	ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
		si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
	ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
}
4334
/* Declare the VS input VGPRs. The VGPR order differs between the LS
 * (as_ls) and plain VS hardware stages. Vertex load indices are appended
 * for the prolog and counted in *num_prolog_vgprs. */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   struct si_function_info *fninfo,
				   unsigned *num_prolog_vgprs)
{
	struct si_shader *shader = ctx->shader;

	add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
	if (shader->key.as_ls) {
		ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
	} else {
		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
		ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
	}
	add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */

	if (!shader->is_gs_copy_shader) {
		/* Vertex load indices. */
		ctx->param_vertex_index0 = fninfo->num_params;
		for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
			add_arg(fninfo, ARG_VGPR, ctx->i32);
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
4359
/* Declare the TES input VGPRs, in their fixed ABI order:
 * tess coord u, tess coord v, relative patch id, patch id. */
static void declare_tes_input_vgprs(struct si_shader_context *ctx,
				    struct si_function_info *fninfo)
{
	ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
	ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
}
4368
/* Extra stage identifiers (beyond PIPE_SHADER_*) used by create_function()
 * for the GFX9 merged hardware stages. */
enum {
	/* Convenient merged shader definitions. */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
4374
4375 static void create_function(struct si_shader_context *ctx)
4376 {
4377 struct si_shader *shader = ctx->shader;
4378 struct si_function_info fninfo;
4379 LLVMTypeRef returns[16+32*4];
4380 unsigned i, num_return_sgprs;
4381 unsigned num_returns = 0;
4382 unsigned num_prolog_vgprs = 0;
4383 unsigned type = ctx->type;
4384
4385 si_init_function_info(&fninfo);
4386
4387 /* Set MERGED shaders. */
4388 if (ctx->screen->b.chip_class >= GFX9) {
4389 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4390 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4391 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4392 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4393 }
4394
4395 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4396
4397 switch (type) {
4398 case PIPE_SHADER_VERTEX:
4399 declare_default_desc_pointers(ctx, &fninfo);
4400 declare_vs_specific_input_sgprs(ctx, &fninfo);
4401
4402 if (shader->key.as_es) {
4403 assert(!shader->selector->nir);
4404 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4405 } else if (shader->key.as_ls) {
4406 assert(!shader->selector->nir);
4407 /* no extra parameters */
4408 } else {
4409 if (shader->is_gs_copy_shader) {
4410 fninfo.num_params = ctx->param_rw_buffers + 1;
4411 fninfo.num_sgpr_params = fninfo.num_params;
4412 }
4413
4414 /* The locations of the other parameters are assigned dynamically. */
4415 declare_streamout_params(ctx, &shader->selector->so,
4416 &fninfo);
4417 }
4418
4419 /* VGPRs */
4420 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4421 break;
4422
4423 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4424 declare_default_desc_pointers(ctx, &fninfo);
4425 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4426 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4427 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4428 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4429 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4430 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4431 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4432 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4433
4434 /* VGPRs */
4435 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4436 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4437
4438 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4439 * placed after the user SGPRs.
4440 */
4441 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4442 returns[num_returns++] = ctx->i32; /* SGPRs */
4443 for (i = 0; i < 5; i++)
4444 returns[num_returns++] = ctx->f32; /* VGPRs */
4445 break;
4446
4447 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4448 /* Merged stages have 8 system SGPRs at the beginning. */
4449 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4450 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4451 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4452 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4453 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4454 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4455 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4456 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4457
4458 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4459 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4460
4461 ctx->param_bindless_samplers_and_images =
4462 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
4463
4464 declare_per_stage_desc_pointers(ctx, &fninfo,
4465 ctx->type == PIPE_SHADER_VERTEX);
4466 declare_vs_specific_input_sgprs(ctx, &fninfo);
4467
4468 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4469 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4470 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4471 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4472 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4473 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4474
4475 declare_per_stage_desc_pointers(ctx, &fninfo,
4476 ctx->type == PIPE_SHADER_TESS_CTRL);
4477
4478 /* VGPRs (first TCS, then VS) */
4479 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4480 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4481
4482 if (ctx->type == PIPE_SHADER_VERTEX) {
4483 declare_vs_input_vgprs(ctx, &fninfo,
4484 &num_prolog_vgprs);
4485
4486 /* LS return values are inputs to the TCS main shader part. */
4487 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4488 returns[num_returns++] = ctx->i32; /* SGPRs */
4489 for (i = 0; i < 2; i++)
4490 returns[num_returns++] = ctx->f32; /* VGPRs */
4491 } else {
4492 /* TCS return values are inputs to the TCS epilog.
4493 *
4494 * param_tcs_offchip_offset, param_tcs_factor_offset,
4495 * param_tcs_offchip_layout, and param_rw_buffers
4496 * should be passed to the epilog.
4497 */
4498 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4499 returns[num_returns++] = ctx->i32; /* SGPRs */
4500 for (i = 0; i < 5; i++)
4501 returns[num_returns++] = ctx->f32; /* VGPRs */
4502 }
4503 break;
4504
4505 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4506 /* Merged stages have 8 system SGPRs at the beginning. */
4507 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4508 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4509 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4510 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4511 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4512 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4513 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4514 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4515
4516 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4517 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4518
4519 ctx->param_bindless_samplers_and_images =
4520 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
4521
4522 declare_per_stage_desc_pointers(ctx, &fninfo,
4523 (ctx->type == PIPE_SHADER_VERTEX ||
4524 ctx->type == PIPE_SHADER_TESS_EVAL));
4525 if (ctx->type == PIPE_SHADER_VERTEX) {
4526 declare_vs_specific_input_sgprs(ctx, &fninfo);
4527 } else {
4528 /* TESS_EVAL (and also GEOMETRY):
4529 * Declare as many input SGPRs as the VS has. */
4530 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4531 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4532 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4533 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4534 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4535 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4536 }
4537
4538 declare_per_stage_desc_pointers(ctx, &fninfo,
4539 ctx->type == PIPE_SHADER_GEOMETRY);
4540
4541 /* VGPRs (first GS, then VS/TES) */
4542 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4543 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4544 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4545 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4546 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4547
4548 if (ctx->type == PIPE_SHADER_VERTEX) {
4549 declare_vs_input_vgprs(ctx, &fninfo,
4550 &num_prolog_vgprs);
4551 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4552 declare_tes_input_vgprs(ctx, &fninfo);
4553 }
4554
4555 if (ctx->type == PIPE_SHADER_VERTEX ||
4556 ctx->type == PIPE_SHADER_TESS_EVAL) {
4557 /* ES return values are inputs to GS. */
4558 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4559 returns[num_returns++] = ctx->i32; /* SGPRs */
4560 for (i = 0; i < 5; i++)
4561 returns[num_returns++] = ctx->f32; /* VGPRs */
4562 }
4563 break;
4564
4565 case PIPE_SHADER_TESS_EVAL:
4566 declare_default_desc_pointers(ctx, &fninfo);
4567 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4568 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4569
4570 if (shader->key.as_es) {
4571 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4572 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4573 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4574 } else {
4575 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4576 declare_streamout_params(ctx, &shader->selector->so,
4577 &fninfo);
4578 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4579 }
4580
4581 /* VGPRs */
4582 declare_tes_input_vgprs(ctx, &fninfo);
4583 break;
4584
4585 case PIPE_SHADER_GEOMETRY:
4586 declare_default_desc_pointers(ctx, &fninfo);
4587 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4588 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4589
4590 /* VGPRs */
4591 ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4592 ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4593 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4594 ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4595 ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4596 ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4597 ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4598 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4599 break;
4600
4601 case PIPE_SHADER_FRAGMENT:
4602 declare_default_desc_pointers(ctx, &fninfo);
4603 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4604 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4605
4606 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4607 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4608 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4609 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4610 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4611 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4612 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4613 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4614 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4615 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4616 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4617 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4618 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4619 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4620 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4621 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4622 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4623 &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4624 shader->info.face_vgpr_index = 20;
4625 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4626 &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4627 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4628 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4629 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4630
4631 /* Color inputs from the prolog. */
4632 if (shader->selector->info.colors_read) {
4633 unsigned num_color_elements =
4634 util_bitcount(shader->selector->info.colors_read);
4635
4636 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4637 for (i = 0; i < num_color_elements; i++)
4638 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4639
4640 num_prolog_vgprs += num_color_elements;
4641 }
4642
4643 /* Outputs for the epilog. */
4644 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4645 num_returns =
4646 num_return_sgprs +
4647 util_bitcount(shader->selector->info.colors_written) * 4 +
4648 shader->selector->info.writes_z +
4649 shader->selector->info.writes_stencil +
4650 shader->selector->info.writes_samplemask +
4651 1 /* SampleMaskIn */;
4652
4653 num_returns = MAX2(num_returns,
4654 num_return_sgprs +
4655 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4656
4657 for (i = 0; i < num_return_sgprs; i++)
4658 returns[i] = ctx->i32;
4659 for (; i < num_returns; i++)
4660 returns[i] = ctx->f32;
4661 break;
4662
4663 case PIPE_SHADER_COMPUTE:
4664 declare_default_desc_pointers(ctx, &fninfo);
4665 if (shader->selector->info.uses_grid_size)
4666 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4667 if (shader->selector->info.uses_block_size)
4668 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4669
4670 for (i = 0; i < 3; i++) {
4671 ctx->param_block_id[i] = -1;
4672 if (shader->selector->info.uses_block_id[i])
4673 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4674 }
4675
4676 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4677 break;
4678 default:
4679 assert(0 && "unimplemented shader");
4680 return;
4681 }
4682
4683 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4684 si_get_max_workgroup_size(shader));
4685
4686 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4687 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4688 ctx->separate_prolog) {
4689 si_llvm_add_attribute(ctx->main_fn,
4690 "InitialPSInputAddr",
4691 S_0286D0_PERSP_SAMPLE_ENA(1) |
4692 S_0286D0_PERSP_CENTER_ENA(1) |
4693 S_0286D0_PERSP_CENTROID_ENA(1) |
4694 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4695 S_0286D0_LINEAR_CENTER_ENA(1) |
4696 S_0286D0_LINEAR_CENTROID_ENA(1) |
4697 S_0286D0_FRONT_FACE_ENA(1) |
4698 S_0286D0_POS_FIXED_PT_ENA(1));
4699 }
4700
4701 shader->info.num_input_sgprs = 0;
4702 shader->info.num_input_vgprs = 0;
4703
4704 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4705 shader->info.num_input_sgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4706
4707 for (; i < fninfo.num_params; ++i)
4708 shader->info.num_input_vgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4709
4710 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4711 shader->info.num_input_vgprs -= num_prolog_vgprs;
4712
4713 if (shader->key.as_ls ||
4714 ctx->type == PIPE_SHADER_TESS_CTRL ||
4715 /* GFX9 has the ESGS ring buffer in LDS. */
4716 (ctx->screen->b.chip_class >= GFX9 &&
4717 (shader->key.as_es ||
4718 ctx->type == PIPE_SHADER_GEOMETRY)))
4719 declare_lds_as_pointer(ctx);
4720 }
4721
/**
 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
 * for later use.
 *
 * The ESGS descriptor is loaded as-is; the GSVS descriptors are patched
 * per vertex stream to describe the swizzled ring layout.
 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	/* Ring descriptors are read from the RW-buffer descriptor array. */
	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
					    ctx->param_rw_buffers);

	/* ESGS ring: only needed on <= VI (GFX9 keeps the ESGS ring in LDS,
	 * see the LDS-pointer declaration elsewhere in this file), and only
	 * for the ES output / GS input stages. */
	if (ctx->screen->b.chip_class <= VI &&
	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							  : SI_ES_RING_ESGS;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);

		ctx->esgs_ring =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	}

	if (ctx->shader->is_gs_copy_shader) {
		/* The GS copy shader only ever reads ring slot 0. */
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);

		ctx->gsvs_ring[0] =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
		const struct si_shader_selector *sel = ctx->shader->selector;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
		LLVMValueRef base_ring;

		base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

		/* The conceptual layout of the GSVS ring is
		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
		 * but the real memory layout is swizzled across
		 * threads:
		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
		 *   t16v0c0 ..
		 * Override the buffer descriptor accordingly.
		 */
		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
		uint64_t stream_offset = 0;

		/* Build one patched descriptor per stream that has any
		 * stream-output components. */
		for (unsigned stream = 0; stream < 4; ++stream) {
			unsigned num_components;
			unsigned stride;
			unsigned num_records;
			LLVMValueRef ring, tmp;

			num_components = sel->info.num_stream_output_components[stream];
			if (!num_components)
				continue;

			/* 4 bytes/component * components * max vertices per primitive. */
			stride = 4 * num_components * sel->gs_max_out_vertices;

			/* Limit on the stride field for <= CIK. */
			assert(stride < (1 << 14));

			/* 64 records — matches the stride * 64 per-stream
			 * offset step below (presumably one per wave lane). */
			num_records = 64;

			/* Add this stream's byte offset to the 64-bit base
			 * address held in descriptor dwords 0-1. */
			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
			tmp = LLVMBuildAdd(builder, tmp,
					   LLVMConstInt(ctx->i64,
							stream_offset, 0), "");
			stream_offset += stride * 64;

			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
			/* Dword 1: OR in the stride and enable swizzling. */
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
			tmp = LLVMBuildOr(builder, tmp,
					  LLVMConstInt(ctx->i32,
						       S_008F04_STRIDE(stride) |
						       S_008F04_SWIZZLE_ENABLE(1), 0), "");
			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
			/* Dword 2: num_records. */
			ring = LLVMBuildInsertElement(builder, ring,
						      LLVMConstInt(ctx->i32, num_records, 0),
						      LLVMConstInt(ctx->i32, 2, 0), "");
			/* Dword 3: format, element/index stride and
			 * thread-ID addressing. */
			ring = LLVMBuildInsertElement(builder, ring,
				LLVMConstInt(ctx->i32,
					     S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
					     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
					     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
					     S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
					     S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
					     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
					     S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
					     S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
					     S_008F0C_ADD_TID_ENABLE(1),
					     0),
				LLVMConstInt(ctx->i32, 3, 0), "");

			ctx->gsvs_ring[stream] = ring;
		}
	}
}
4821
4822 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4823 LLVMValueRef param_rw_buffers,
4824 unsigned param_pos_fixed_pt)
4825 {
4826 struct gallivm_state *gallivm = &ctx->gallivm;
4827 LLVMBuilderRef builder = gallivm->builder;
4828 LLVMValueRef slot, desc, offset, row, bit, address[2];
4829
4830 /* Use the fixed-point gl_FragCoord input.
4831 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4832 * per coordinate to get the repeating effect.
4833 */
4834 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4835 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4836
4837 /* Load the buffer descriptor. */
4838 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4839 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
4840
4841 /* The stipple pattern is 32x32, each row has 32 bits. */
4842 offset = LLVMBuildMul(builder, address[1],
4843 LLVMConstInt(ctx->i32, 4, 0), "");
4844 row = buffer_load_const(ctx, desc, offset);
4845 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
4846 bit = LLVMBuildLShr(builder, row, address[0], "");
4847 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4848
4849 /* The intrinsic kills the thread if arg < 0. */
4850 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
4851 LLVMConstReal(ctx->f32, -1), "");
4852 ac_build_kill(&ctx->ac, bit);
4853 }
4854
/**
 * Parse the config section LLVM emits into a shader binary and fill in
 * the corresponding si_shader_config fields.
 *
 * The config section is a sequence of (register offset, value) dword
 * pairs. Several fields are accumulated with MAX2, so this can be called
 * repeatedly for multiple symbols/parts of a combined shader.
 *
 * \param binary         the compiled shader binary
 * \param conf           output register configuration
 * \param symbol_offset  offset of the symbol whose config to read
 */
void si_shader_binary_read_config(struct ac_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 * Only binaries containing relocations against the scratch resource
	 * symbols actually access scratch memory.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct ac_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* Walk the (register, value) pairs, 8 bytes per entry. */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* The PS accessors are used for every stage here,
			 * so these fields presumably share the same bit
			 * layout in all RSRC1 registers. SGPRS/VGPRS are
			 * encoded in units of 8 resp. 4 registers. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode =  G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			/* Ignore LLVM's scratch size when only SGPR spills
			 * inflated it (see the reloc scan above). */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn once per process about registers we don't
			 * know how to interpret. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* If INPUT_ADDR was not emitted, mirror INPUT_ENA into it. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
4938
4939 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4940 uint64_t scratch_va)
4941 {
4942 unsigned i;
4943 uint32_t scratch_rsrc_dword0 = scratch_va;
4944 uint32_t scratch_rsrc_dword1 =
4945 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
4946
4947 /* Enable scratch coalescing. */
4948 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
4949
4950 for (i = 0 ; i < shader->binary.reloc_count; i++) {
4951 const struct ac_shader_reloc *reloc =
4952 &shader->binary.relocs[i];
4953 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
4954 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4955 &scratch_rsrc_dword0, 4);
4956 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4957 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4958 &scratch_rsrc_dword1, 4);
4959 }
4960 }
4961 }
4962
4963 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4964 {
4965 unsigned size = shader->binary.code_size;
4966
4967 if (shader->prolog)
4968 size += shader->prolog->binary.code_size;
4969 if (shader->previous_stage)
4970 size += shader->previous_stage->binary.code_size;
4971 if (shader->prolog2)
4972 size += shader->prolog2->binary.code_size;
4973 if (shader->epilog)
4974 size += shader->epilog->binary.code_size;
4975 return size;
4976 }
4977
4978 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4979 {
4980 const struct ac_shader_binary *prolog =
4981 shader->prolog ? &shader->prolog->binary : NULL;
4982 const struct ac_shader_binary *previous_stage =
4983 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4984 const struct ac_shader_binary *prolog2 =
4985 shader->prolog2 ? &shader->prolog2->binary : NULL;
4986 const struct ac_shader_binary *epilog =
4987 shader->epilog ? &shader->epilog->binary : NULL;
4988 const struct ac_shader_binary *mainb = &shader->binary;
4989 unsigned bo_size = si_get_shader_binary_size(shader) +
4990 (!epilog ? mainb->rodata_size : 0);
4991 unsigned char *ptr;
4992
4993 assert(!prolog || !prolog->rodata_size);
4994 assert(!previous_stage || !previous_stage->rodata_size);
4995 assert(!prolog2 || !prolog2->rodata_size);
4996 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4997 !mainb->rodata_size);
4998 assert(!epilog || !epilog->rodata_size);
4999
5000 r600_resource_reference(&shader->bo, NULL);
5001 shader->bo = (struct r600_resource*)
5002 pipe_buffer_create(&sscreen->b.b, 0,
5003 PIPE_USAGE_IMMUTABLE,
5004 align(bo_size, SI_CPDMA_ALIGNMENT));
5005 if (!shader->bo)
5006 return -ENOMEM;
5007
5008 /* Upload. */
5009 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
5010 PIPE_TRANSFER_READ_WRITE |
5011 PIPE_TRANSFER_UNSYNCHRONIZED);
5012
5013 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
5014 * endian-independent. */
5015 if (prolog) {
5016 memcpy(ptr, prolog->code, prolog->code_size);
5017 ptr += prolog->code_size;
5018 }
5019 if (previous_stage) {
5020 memcpy(ptr, previous_stage->code, previous_stage->code_size);
5021 ptr += previous_stage->code_size;
5022 }
5023 if (prolog2) {
5024 memcpy(ptr, prolog2->code, prolog2->code_size);
5025 ptr += prolog2->code_size;
5026 }
5027
5028 memcpy(ptr, mainb->code, mainb->code_size);
5029 ptr += mainb->code_size;
5030
5031 if (epilog)
5032 memcpy(ptr, epilog->code, epilog->code_size);
5033 else if (mainb->rodata_size > 0)
5034 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5035
5036 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5037 return 0;
5038 }
5039
5040 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5041 struct pipe_debug_callback *debug,
5042 const char *name, FILE *file)
5043 {
5044 char *line, *p;
5045 unsigned i, count;
5046
5047 if (binary->disasm_string) {
5048 fprintf(file, "Shader %s disassembly:\n", name);
5049 fprintf(file, "%s", binary->disasm_string);
5050
5051 if (debug && debug->debug_message) {
5052 /* Very long debug messages are cut off, so send the
5053 * disassembly one line at a time. This causes more
5054 * overhead, but on the plus side it simplifies
5055 * parsing of resulting logs.
5056 */
5057 pipe_debug_message(debug, SHADER_INFO,
5058 "Shader Disassembly Begin");
5059
5060 line = binary->disasm_string;
5061 while (*line) {
5062 p = util_strchrnul(line, '\n');
5063 count = p - line;
5064
5065 if (count) {
5066 pipe_debug_message(debug, SHADER_INFO,
5067 "%.*s", count, line);
5068 }
5069
5070 if (!*p)
5071 break;
5072 line = p + 1;
5073 }
5074
5075 pipe_debug_message(debug, SHADER_INFO,
5076 "Shader Disassembly End");
5077 }
5078 } else {
5079 fprintf(file, "Shader %s binary:\n", name);
5080 for (i = 0; i < binary->code_size; i += 4) {
5081 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5082 binary->code[i + 3], binary->code[i + 2],
5083 binary->code[i + 1], binary->code[i]);
5084 }
5085 }
5086 }
5087
/**
 * Print shader statistics (register usage, code size, LDS, scratch, and
 * an estimated max waves/SIMD) to "file" and the debug callback.
 *
 * \param processor           PIPE_SHADER_* stage of the shader
 * \param check_debug_option  if true, only print to "file" when shader
 *                            dumping is enabled for this stage
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 const struct si_shader *shader,
			         struct pipe_debug_callback *debug,
			         unsigned processor,
				 FILE *file,
				 bool check_debug_option)
{
	const struct si_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
	unsigned code_size = si_get_shader_binary_size(shader);
	/* LDS allocation granularity: 512 on CIK+, 256 before.
	 * NOTE(review): units appear to be bytes given the 16384-byte
	 * per-SIMD division below — confirm. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves;

	switch (sscreen->b.family) {
	/* These always have 8 waves: */
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
		max_simd_waves = 8;
		break;
	default:
		max_simd_waves = 10;
	}

	/* Compute LDS usage for PS. */
	switch (processor) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			/* LDS is allocated per thread group; amortize it
			 * over the number of waves in the group. */
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size, 64);
		}
		break;
	}

	/* Compute the per-SIMD wave counts. */
	/* SGPR-limited occupancy: 800 addressable SGPRs on VI+, 512 before. */
	if (conf->num_sgprs) {
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	/* VGPR-limited occupancy: 256 VGPRs per SIMD. */
	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied). */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Private memory VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs,
			conf->private_mem_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Always send the stats to the debug callback, regardless of the
	 * dump option. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs, conf->private_mem_vgprs);
}
5191
5192 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5193 {
5194 switch (processor) {
5195 case PIPE_SHADER_VERTEX:
5196 if (shader->key.as_es)
5197 return "Vertex Shader as ES";
5198 else if (shader->key.as_ls)
5199 return "Vertex Shader as LS";
5200 else
5201 return "Vertex Shader as VS";
5202 case PIPE_SHADER_TESS_CTRL:
5203 return "Tessellation Control Shader";
5204 case PIPE_SHADER_TESS_EVAL:
5205 if (shader->key.as_es)
5206 return "Tessellation Evaluation Shader as ES";
5207 else
5208 return "Tessellation Evaluation Shader as VS";
5209 case PIPE_SHADER_GEOMETRY:
5210 if (shader->is_gs_copy_shader)
5211 return "GS Copy Shader as VS";
5212 else
5213 return "Geometry Shader";
5214 case PIPE_SHADER_FRAGMENT:
5215 return "Pixel Shader";
5216 case PIPE_SHADER_COMPUTE:
5217 return "Compute Shader";
5218 default:
5219 return "Unknown Shader";
5220 }
5221 }
5222
/**
 * Dump everything known about a compiled shader: its key, recorded LLVM IR
 * (if any), the disassembly of all parts, and the statistics.
 *
 * \param check_debug_option  if true, gate each section on the per-stage
 *                            shader-dump debug options
 */
void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file, bool check_debug_option)
{
	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, shader, file);

	/* llvm_ir_string is only present when IR recording was enabled
	 * (see sscreen->record_llvm_ir in si_compile_llvm). */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		if (shader->previous_stage &&
		    shader->previous_stage->binary.llvm_ir_string) {
			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
				si_get_shader_name(shader, processor));
			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
		}

		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		/* Dump all parts in execution order. */
		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(&shader->previous_stage->binary,
						   debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(&shader->prolog2->binary,
						   debug, "prolog2", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, debug, processor, file,
			     check_debug_option);
}
5270
5271 static int si_compile_llvm(struct si_screen *sscreen,
5272 struct ac_shader_binary *binary,
5273 struct si_shader_config *conf,
5274 LLVMTargetMachineRef tm,
5275 LLVMModuleRef mod,
5276 struct pipe_debug_callback *debug,
5277 unsigned processor,
5278 const char *name)
5279 {
5280 int r = 0;
5281 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5282
5283 if (r600_can_dump_shader(&sscreen->b, processor)) {
5284 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5285
5286 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5287 fprintf(stderr, "%s LLVM IR:\n\n", name);
5288 ac_dump_module(mod);
5289 fprintf(stderr, "\n");
5290 }
5291 }
5292
5293 if (sscreen->record_llvm_ir) {
5294 char *ir = LLVMPrintModuleToString(mod);
5295 binary->llvm_ir_string = strdup(ir);
5296 LLVMDisposeMessage(ir);
5297 }
5298
5299 if (!si_replace_shader(count, binary)) {
5300 r = si_llvm_compile(mod, binary, tm, debug);
5301 if (r)
5302 return r;
5303 }
5304
5305 si_shader_binary_read_config(binary, conf, 0);
5306
5307 /* Enable 64-bit and 16-bit denormals, because there is no performance
5308 * cost.
5309 *
5310 * If denormals are enabled, all floating-point output modifiers are
5311 * ignored.
5312 *
5313 * Don't enable denormals for 32-bit floats, because:
5314 * - Floating-point output modifiers would be ignored by the hw.
5315 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5316 * have to stop using those.
5317 * - SI & CI would be very slow.
5318 */
5319 conf->float_mode |= V_00B028_FP_64_DENORMS;
5320
5321 FREE(binary->config);
5322 FREE(binary->global_symbol_offsets);
5323 binary->config = NULL;
5324 binary->global_symbol_offsets = NULL;
5325
5326 /* Some shaders can't have rodata because their binaries can be
5327 * concatenated.
5328 */
5329 if (binary->rodata_size &&
5330 (processor == PIPE_SHADER_VERTEX ||
5331 processor == PIPE_SHADER_TESS_CTRL ||
5332 processor == PIPE_SHADER_TESS_EVAL ||
5333 processor == PIPE_SHADER_FRAGMENT)) {
5334 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5335 return -EINVAL;
5336 }
5337
5338 return r;
5339 }
5340
5341 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5342 {
5343 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5344 LLVMBuildRetVoid(ctx->gallivm.builder);
5345 else
5346 LLVMBuildRet(ctx->gallivm.builder, ret);
5347 }
5348
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The copy shader reads the GS outputs back from the GSVS ring buffer
 * (one switch case per vertex stream), performs streamout, and exports
 * stream 0 as the final vertex output. Returns NULL on failure; the
 * caller owns the returned si_shader.
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	/* Taking the address here is fine; the pointee is filled in by
	 * si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.gallivm;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	/* One entry per GS output; freed before returning. */
	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}


	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The copy shader runs as a hardware VS stage. */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = gallivm->builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Byte offset of this vertex's data within the GSVS ring. */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* output_streams packs 2 bits of stream index per channel. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	/* Branch on the stream ID: one basic block per active stream,
	 * all converging on "end". */
	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams > 0 only matter when streamout is active. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Skip channels that are unused or belong to
				 * a different stream. */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				/* Each used channel occupies a contiguous
				 * region of gs_max_out_vertices * 16 * 4
				 * bytes in the ring. */
				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is exported as vertex output. */
		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	/* On compile or upload failure the caller gets NULL. */
	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
5498
5499 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5500 const struct si_vs_prolog_bits *prolog,
5501 const char *prefix, FILE *f)
5502 {
5503 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5504 prefix, prolog->instance_divisor_is_one);
5505 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5506 prefix, prolog->instance_divisor_is_fetched);
5507 fprintf(f, " %s.ls_vgpr_fix = %u\n",
5508 prefix, prolog->ls_vgpr_fix);
5509
5510 fprintf(f, " mono.vs.fix_fetch = {");
5511 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5512 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5513 fprintf(f, "}\n");
5514 }
5515
/* Dump the shader key of \p shader to \p f for debugging.
 *
 * Only the key fields that are meaningful for the given \p processor
 * (a PIPE_SHADER_* enum value) are printed.
 */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f)
{
	const struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ls = %u\n", key->as_ls);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the LS (VS) prolog is part of the merged LS+HS
		 * binary, so its key is dumped here as well. */
		if (shader->selector->screen->b.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no key of its own. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the ES (VS) prolog is part of the merged ES+GS
		 * binary. */
		if (shader->selector->screen->b.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* opt.* fields are only used when this is the last vertex-processing
	 * stage before rasterization (not compiled as ES or LS). */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
5595
/* Initialize a shader context: set up the common LLVM state via
 * si_llvm_context_init and register the radeonsi-specific TGSI opcode
 * actions and operand fetch callbacks.
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;

	ctx->abi.chip_class = sscreen->b.chip_class;

	si_llvm_context_init(ctx, sscreen, tm);

	bld_base = &ctx->bld_base;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes share one action struct. */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;

	/* All derivative opcodes go through the same emitter. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Subgroup / wave operations map to amdgcn intrinsics. */
	bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
	bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;

	/* Geometry shader vertex/primitive emission and barriers. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
}
5636
5637 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5638 {
5639 struct si_shader *shader = ctx->shader;
5640 struct tgsi_shader_info *info = &shader->selector->info;
5641
5642 if ((ctx->type != PIPE_SHADER_VERTEX &&
5643 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5644 shader->key.as_ls ||
5645 shader->key.as_es)
5646 return;
5647
5648 ac_optimize_vs_outputs(&ctx->ac,
5649 ctx->main_fn,
5650 shader->info.vs_output_param_offset,
5651 info->num_outputs,
5652 &shader->info.nr_param_exports);
5653 }
5654
/* Estimate the amount of "private memory" (scratch) used by the shader
 * in VGPR dwords by walking all LLVM basic blocks and summing the sizes
 * of every alloca instruction found in the main function.
 *
 * The result is stored in ctx->shader->config.private_mem_vgprs.
 */
static void si_count_scratch_private_memory(struct si_shader_context *ctx)
{
	ctx->shader->config.private_mem_vgprs = 0;

	/* Process all LLVM instructions. */
	LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
	while (bb) {
		LLVMValueRef next = LLVMGetFirstInstruction(bb);

		while (next) {
			/* Advance before inspecting, so "continue" is safe. */
			LLVMValueRef inst = next;
			next = LLVMGetNextInstruction(next);

			if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
				continue;

			LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
			/* No idea why LLVM aligns allocas to 4 elements. */
			unsigned alignment = LLVMGetAlignment(inst);
			/* Size in dwords, rounded up to the alloca alignment. */
			unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
			ctx->shader->config.private_mem_vgprs += dw_size;
		}
		bb = LLVMGetNextBasicBlock(bb);
	}
}
5680
5681 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5682 {
5683 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5684 lp_build_intrinsic(ctx->gallivm.builder,
5685 "llvm.amdgcn.init.exec", ctx->voidt,
5686 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5687 }
5688
5689 static void si_init_exec_from_input(struct si_shader_context *ctx,
5690 unsigned param, unsigned bitoffset)
5691 {
5692 LLVMValueRef args[] = {
5693 LLVMGetParam(ctx->main_fn, param),
5694 LLVMConstInt(ctx->i32, bitoffset, 0),
5695 };
5696 lp_build_intrinsic(ctx->gallivm.builder,
5697 "llvm.amdgcn.init.exec.from.input",
5698 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5699 }
5700
5701 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5702 const struct si_vs_prolog_bits *key)
5703 {
5704 /* VGPR initialization fixup for Vega10 and Raven is always done in the
5705 * VS prolog. */
5706 return sel->vs_needs_prolog || key->ls_vgpr_fix;
5707 }
5708
/* Translate the main part of the shader from TGSI (or NIR) into LLVM IR.
 *
 * Hooks up the per-stage input/output callbacks, creates the LLVM
 * function, handles GFX9 merged-shader EXEC-mask setup, and emits the
 * shader body.
 *
 * \param is_monolithic  whether this main part is built as part of a
 *                       monolithic shader (with inlined prolog/epilog)
 * \return false on translation failure
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 bool is_monolithic)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	/* Select input-fetch and epilogue callbacks per shader stage. */
	// TODO clean all this up!
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		if (shader->key.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		break;
	case PIPE_SHADER_COMPUTE:
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	ctx->abi.load_ubo = load_ubo;
	ctx->abi.load_ssbo = load_ssbo;

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 */
	if (ctx->screen->b.chip_class >= GFX9) {
		if (!is_monolithic &&
		    sel->info.num_instructions > 1 && /* not empty shader */
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
			/* First half of a merged shader, no prolog: init EXEC
			 * from the merged wave info SGPR. */
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY) {
			if (!is_monolithic)
				si_init_exec_full_mask(ctx);

			/* The barrier must execute for all shaders in a
			 * threadgroup.
			 */
			si_llvm_emit_barrier(NULL, bld_base, NULL);

			/* Second half of a merged shader: only run the lanes
			 * below the thread count packed in bits [8..15] of
			 * the merged wave info. */
			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
			LLVMValueRef ena =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), num_threads, "");
			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream counters of vertices emitted so far. */
		int i;
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(&ctx->gallivm,
						ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
	    ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
		/* This is initialized to 0.0 = not kill. */
		ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
	}

	/* Emit the shader body from either TGSI tokens or NIR. */
	if (sel->tokens) {
		if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
			fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
			return false;
		}
	} else {
		if (!si_nir_build_llvm(ctx, sel->nir)) {
			fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
			return false;
		}
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
5835
5836 /**
5837 * Compute the VS prolog key, which contains all the information needed to
5838 * build the VS prolog function, and set shader->info bits where needed.
5839 *
5840 * \param info Shader info of the vertex shader.
5841 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5842 * \param prolog_key Key of the VS prolog
5843 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5844 * \param key Output shader part key.
5845 */
static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
				 unsigned num_input_sgprs,
				 const struct si_vs_prolog_bits *prolog_key,
				 struct si_shader *shader_out,
				 union si_shader_part_key *key)
{
	memset(key, 0, sizeof(*key));
	key->vs_prolog.states = *prolog_key;
	key->vs_prolog.num_input_sgprs = num_input_sgprs;
	/* Index of the last vertex attribute; at least 0 even with no inputs. */
	key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
	key->vs_prolog.as_ls = shader_out->key.as_ls;

	if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
		/* Merged LS+HS (VS runs as LS before the TCS). */
		key->vs_prolog.as_ls = 1;
		key->vs_prolog.num_merged_next_stage_vgprs = 2;
	} else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
		/* Merged ES+GS. */
		key->vs_prolog.num_merged_next_stage_vgprs = 5;
	}

	/* Enable loading the InstanceID VGPR. */
	uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);

	/* Any per-instance divisor on a used attribute implies InstanceID use. */
	if ((key->vs_prolog.states.instance_divisor_is_one |
	     key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
		shader_out->info.uses_instanceid = true;
}
5872
5873 /**
5874 * Compute the PS prolog key, which contains all the information needed to
5875 * build the PS prolog function, and set related bits in shader->config.
5876 */
static void si_get_ps_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key,
				 bool separate_prolog)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs WQM whenever derivatives are used together with
	 * color interpolation or forced interpolation modes. */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Determine the interpolation VGPR index for each of the two
		 * color inputs (COLOR0/COLOR1). */
		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			/* Skip colors whose components are never read. */
			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* -1 = no interpolation VGPR needed. */
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}
5992
5993 /**
5994 * Check whether a PS prolog is required based on the key.
5995 */
5996 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5997 {
5998 return key->ps_prolog.colors_read ||
5999 key->ps_prolog.states.force_persp_sample_interp ||
6000 key->ps_prolog.states.force_linear_sample_interp ||
6001 key->ps_prolog.states.force_persp_center_interp ||
6002 key->ps_prolog.states.force_linear_center_interp ||
6003 key->ps_prolog.states.bc_optimize_for_persp ||
6004 key->ps_prolog.states.bc_optimize_for_linear ||
6005 key->ps_prolog.states.poly_stipple;
6006 }
6007
6008 /**
6009 * Compute the PS epilog key, which contains all the information needed to
6010 * build the PS epilog function.
6011 */
6012 static void si_get_ps_epilog_key(struct si_shader *shader,
6013 union si_shader_part_key *key)
6014 {
6015 struct tgsi_shader_info *info = &shader->selector->info;
6016 memset(key, 0, sizeof(*key));
6017 key->ps_epilog.colors_written = info->colors_written;
6018 key->ps_epilog.writes_z = info->writes_z;
6019 key->ps_epilog.writes_stencil = info->writes_stencil;
6020 key->ps_epilog.writes_samplemask = info->writes_samplemask;
6021 key->ps_epilog.states = shader->key.part.ps.epilog;
6022 }
6023
6024 /**
6025 * Build the GS prolog function. Rotate the input vertices for triangle strips
6026 * with adjacency.
6027 */
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	unsigned num_sgprs, num_vgprs;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMTypeRef returns[48];
	LLVMValueRef func, ret;

	si_init_function_info(&fninfo);

	/* GPR counts differ between merged (GFX9) and separate GS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
		num_vgprs = 5; /* ES inputs are not needed by GS */
	} else {
		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
		num_vgprs = 8;
	}

	/* All SGPR inputs are returned as i32... */
	for (unsigned i = 0; i < num_sgprs; ++i) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[i] = ctx->i32;
	}

	/* ...and all VGPR inputs as f32. */
	for (unsigned i = 0; i < num_vgprs; ++i) {
		add_arg(&fninfo, ARG_VGPR, ctx->i32);
		returns[num_sgprs + i] = ctx->f32;
	}

	/* Create the function. */
	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
			   &fninfo, 0);
	func = ctx->main_fn;

	/* Set the full EXEC mask for the prolog, because we are only fiddling
	 * with registers here. The main shader part will set the correct EXEC
	 * mask.
	 */
	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
		si_init_exec_full_mask(ctx);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (unsigned i = 0; i < num_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
	}
	for (unsigned i = 0; i < num_vgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
		p = LLVMBuildBitCast(builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
	}

	if (key->gs_prolog.states.tri_strip_adj_fix) {
		/* Remap the input vertices for every other primitive. */
		const unsigned gfx6_vtx_params[6] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 3,
			num_sgprs + 4,
			num_sgprs + 5,
			num_sgprs + 6
		};
		/* On GFX9, two 16-bit vertex indices are packed per VGPR. */
		const unsigned gfx9_vtx_params[3] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 4,
		};
		LLVMValueRef vtx_in[6], vtx_out[6];
		LLVMValueRef prim_id, rotate;

		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
				vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
			}
		} else {
			for (unsigned i = 0; i < 6; i++)
				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
		}

		/* Odd primitives (lowest bit of PrimitiveID set) get rotated. */
		prim_id = LLVMGetParam(func, num_sgprs + 2);
		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");

		for (unsigned i = 0; i < 6; ++i) {
			LLVMValueRef base, rotated;
			base = vtx_in[i];
			rotated = vtx_in[(i + 4) % 6];
			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* Re-pack the two 16-bit indices back into each VGPR. */
			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef hi, out;

				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
						  LLVMConstInt(ctx->i32, 16, 0), "");
				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
				out = LLVMBuildBitCast(builder, out, ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx9_vtx_params[i], "");
			}
		} else {
			for (unsigned i = 0; i < 6; i++) {
				LLVMValueRef out;

				out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx6_vtx_params[i], "");
			}
		}
	}

	LLVMBuildRet(builder, ret);
}
6146
6147 /**
6148 * Given a list of shader part functions, build a wrapper function that
6149 * runs them in sequence to form a monolithic shader.
6150 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
				      LLVMValueRef *parts,
				      unsigned num_parts,
				      unsigned main_part,
				      unsigned next_shader_first_part)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	/* PS epilog has one arg per color component; gfx9 merged shader
	 * prologs need to forward 32 user SGPRs.
	 */
	struct si_function_info fninfo;
	LLVMValueRef initial[64], out[64];
	LLVMTypeRef function_type;
	unsigned num_first_params;
	unsigned num_out, initial_num_out;
	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned gprs;
	struct lp_build_if_state if_state;

	si_init_function_info(&fninfo);

	/* Force all parts to be inlined into the wrapper. */
	for (unsigned i = 0; i < num_parts; ++i) {
		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_first_params = LLVMCountParamTypes(function_type);

	/* Count the SGPR and VGPR dwords of the first part's parameters. */
	for (unsigned i = 0; i < num_first_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0);
			num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}

	/* Declare the wrapper arguments using the main part's param types. */
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
		LLVMTypeRef type = LLVMTypeOf(param);
		unsigned size = llvm_get_type_size(type) / 4;

		add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
			   si_get_max_workgroup_size(ctx->shader));

	if (is_merged_shader(ctx->shader))
		si_init_exec_full_mask(ctx);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	for (unsigned i = 0; i < fninfo.num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		/* Outputs are tracked as i32 (SGPR) or f32 (VGPR) scalars. */
		LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
		unsigned size = llvm_get_type_size(param_type) / 4;

		if (size == 1) {
			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			/* Split multi-dword params into scalar components. */
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (i < fninfo.num_sgpr_params)
			num_out_sgpr = num_out;
	}

	/* Keep a snapshot of the wrapper inputs for the second merged half. */
	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[48];
		LLVMValueRef ret;
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;
		unsigned num_params = LLVMCountParams(parts[part]);

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_merged_shader(ctx->shader) && part == 0) {
			/* NOTE(review): initial[3] looks like the merged wave
			 * info SGPR whose low 7 bits hold the thread count —
			 * confirm against create_function's argument layout. */
			LLVMValueRef ena, count = initial[3];

			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			lp_build_if(&if_state, &ctx->gallivm, ena);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = llvm_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
#if HAVE_LLVM < 0x0400
				LLVMRemoveAttribute(param, LLVMByValAttribute);
#else
				unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
				LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
#endif
				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
			assert(is_sgpr || out_idx >= num_out_sgpr);

			/* Re-assemble multi-dword arguments from scalars. */
			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
					arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");

		if (is_merged_shader(ctx->shader) &&
		    part + 1 == next_shader_first_part) {
			lp_build_endif(&if_state);

			/* The second half of the merged shader should use
			 * the inputs from the toplevel (wrapper) function,
			 * not the return value from the last call.
			 *
			 * That's because the last call was executed condi-
			 * tionally, so we can't consume it in the main
			 * block.
			 */
			memcpy(out, initial, sizeof(initial));
			num_out = initial_num_out;
			num_out_sgpr = initial_num_out_sgpr;
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				assert(num_out < ARRAY_SIZE(out));
				out[num_out++] = val;

				/* i32 return values are SGPRs and must all
				 * precede the f32 (VGPR) values. */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	LLVMBuildRetVoid(builder);
}
6376
6377 int si_compile_tgsi_shader(struct si_screen *sscreen,
6378 LLVMTargetMachineRef tm,
6379 struct si_shader *shader,
6380 bool is_monolithic,
6381 struct pipe_debug_callback *debug)
6382 {
6383 struct si_shader_selector *sel = shader->selector;
6384 struct si_shader_context ctx;
6385 int r = -1;
6386
6387 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6388 * conversion fails. */
6389 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6390 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6391 if (sel->tokens)
6392 tgsi_dump(sel->tokens, 0);
6393 else
6394 nir_print_shader(sel->nir, stderr);
6395 si_dump_streamout(&sel->so);
6396 }
6397
6398 si_init_shader_ctx(&ctx, sscreen, tm);
6399 si_llvm_context_set_tgsi(&ctx, shader);
6400 ctx.separate_prolog = !is_monolithic;
6401
6402 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6403 sizeof(shader->info.vs_output_param_offset));
6404
6405 shader->info.uses_instanceid = sel->info.uses_instanceid;
6406
6407 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6408 si_llvm_dispose(&ctx);
6409 return -1;
6410 }
6411
6412 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6413 LLVMValueRef parts[2];
6414 bool need_prolog = sel->vs_needs_prolog;
6415
6416 parts[1] = ctx.main_fn;
6417
6418 if (need_prolog) {
6419 union si_shader_part_key prolog_key;
6420 si_get_vs_prolog_key(&sel->info,
6421 shader->info.num_input_sgprs,
6422 &shader->key.part.vs.prolog,
6423 shader, &prolog_key);
6424 si_build_vs_prolog_function(&ctx, &prolog_key);
6425 parts[0] = ctx.main_fn;
6426 }
6427
6428 si_build_wrapper_function(&ctx, parts + !need_prolog,
6429 1 + need_prolog, need_prolog, 0);
6430 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6431 if (sscreen->b.chip_class >= GFX9) {
6432 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6433 LLVMValueRef parts[4];
6434 bool vs_needs_prolog =
6435 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6436
6437 /* TCS main part */
6438 parts[2] = ctx.main_fn;
6439
6440 /* TCS epilog */
6441 union si_shader_part_key tcs_epilog_key;
6442 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6443 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6444 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6445 parts[3] = ctx.main_fn;
6446
6447 /* VS prolog */
6448 if (vs_needs_prolog) {
6449 union si_shader_part_key vs_prolog_key;
6450 si_get_vs_prolog_key(&ls->info,
6451 shader->info.num_input_sgprs,
6452 &shader->key.part.tcs.ls_prolog,
6453 shader, &vs_prolog_key);
6454 vs_prolog_key.vs_prolog.is_monolithic = true;
6455 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6456 parts[0] = ctx.main_fn;
6457 }
6458
6459 /* VS as LS main part */
6460 struct si_shader shader_ls = {};
6461 shader_ls.selector = ls;
6462 shader_ls.key.as_ls = 1;
6463 shader_ls.key.mono = shader->key.mono;
6464 shader_ls.key.opt = shader->key.opt;
6465 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6466
6467 if (!si_compile_tgsi_main(&ctx, true)) {
6468 si_llvm_dispose(&ctx);
6469 return -1;
6470 }
6471 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6472 parts[1] = ctx.main_fn;
6473
6474 /* Reset the shader context. */
6475 ctx.shader = shader;
6476 ctx.type = PIPE_SHADER_TESS_CTRL;
6477
6478 si_build_wrapper_function(&ctx,
6479 parts + !vs_needs_prolog,
6480 4 - !vs_needs_prolog, 0,
6481 vs_needs_prolog ? 2 : 1);
6482 } else {
6483 LLVMValueRef parts[2];
6484 union si_shader_part_key epilog_key;
6485
6486 parts[0] = ctx.main_fn;
6487
6488 memset(&epilog_key, 0, sizeof(epilog_key));
6489 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6490 si_build_tcs_epilog_function(&ctx, &epilog_key);
6491 parts[1] = ctx.main_fn;
6492
6493 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6494 }
6495 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6496 if (ctx.screen->b.chip_class >= GFX9) {
6497 struct si_shader_selector *es = shader->key.part.gs.es;
6498 LLVMValueRef es_prolog = NULL;
6499 LLVMValueRef es_main = NULL;
6500 LLVMValueRef gs_prolog = NULL;
6501 LLVMValueRef gs_main = ctx.main_fn;
6502
6503 /* GS prolog */
6504 union si_shader_part_key gs_prolog_key;
6505 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6506 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6507 gs_prolog_key.gs_prolog.is_monolithic = true;
6508 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6509 gs_prolog = ctx.main_fn;
6510
6511 /* ES prolog */
6512 if (es->vs_needs_prolog) {
6513 union si_shader_part_key vs_prolog_key;
6514 si_get_vs_prolog_key(&es->info,
6515 shader->info.num_input_sgprs,
6516 &shader->key.part.tcs.ls_prolog,
6517 shader, &vs_prolog_key);
6518 vs_prolog_key.vs_prolog.is_monolithic = true;
6519 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6520 es_prolog = ctx.main_fn;
6521 }
6522
6523 /* ES main part */
6524 struct si_shader shader_es = {};
6525 shader_es.selector = es;
6526 shader_es.key.as_es = 1;
6527 shader_es.key.mono = shader->key.mono;
6528 shader_es.key.opt = shader->key.opt;
6529 si_llvm_context_set_tgsi(&ctx, &shader_es);
6530
6531 if (!si_compile_tgsi_main(&ctx, true)) {
6532 si_llvm_dispose(&ctx);
6533 return -1;
6534 }
6535 shader->info.uses_instanceid |= es->info.uses_instanceid;
6536 es_main = ctx.main_fn;
6537
6538 /* Reset the shader context. */
6539 ctx.shader = shader;
6540 ctx.type = PIPE_SHADER_GEOMETRY;
6541
6542 /* Prepare the array of shader parts. */
6543 LLVMValueRef parts[4];
6544 unsigned num_parts = 0, main_part, next_first_part;
6545
6546 if (es_prolog)
6547 parts[num_parts++] = es_prolog;
6548
6549 parts[main_part = num_parts++] = es_main;
6550 parts[next_first_part = num_parts++] = gs_prolog;
6551 parts[num_parts++] = gs_main;
6552
6553 si_build_wrapper_function(&ctx, parts, num_parts,
6554 main_part, next_first_part);
6555 } else {
6556 LLVMValueRef parts[2];
6557 union si_shader_part_key prolog_key;
6558
6559 parts[1] = ctx.main_fn;
6560
6561 memset(&prolog_key, 0, sizeof(prolog_key));
6562 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6563 si_build_gs_prolog_function(&ctx, &prolog_key);
6564 parts[0] = ctx.main_fn;
6565
6566 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6567 }
6568 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6569 LLVMValueRef parts[3];
6570 union si_shader_part_key prolog_key;
6571 union si_shader_part_key epilog_key;
6572 bool need_prolog;
6573
6574 si_get_ps_prolog_key(shader, &prolog_key, false);
6575 need_prolog = si_need_ps_prolog(&prolog_key);
6576
6577 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6578
6579 if (need_prolog) {
6580 si_build_ps_prolog_function(&ctx, &prolog_key);
6581 parts[0] = ctx.main_fn;
6582 }
6583
6584 si_get_ps_epilog_key(shader, &epilog_key);
6585 si_build_ps_epilog_function(&ctx, &epilog_key);
6586 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6587
6588 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6589 need_prolog ? 1 : 0, 0);
6590 }
6591
6592 si_llvm_optimize_module(&ctx);
6593
6594 /* Post-optimization transformations and analysis. */
6595 si_optimize_vs_outputs(&ctx);
6596
6597 if ((debug && debug->debug_message) ||
6598 r600_can_dump_shader(&sscreen->b, ctx.type))
6599 si_count_scratch_private_memory(&ctx);
6600
6601 /* Compile to bytecode. */
6602 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6603 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6604 si_llvm_dispose(&ctx);
6605 if (r) {
6606 fprintf(stderr, "LLVM failed to compile shader\n");
6607 return r;
6608 }
6609
6610 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6611 * LLVM 3.9svn has this bug.
6612 */
6613 if (sel->type == PIPE_SHADER_COMPUTE) {
6614 unsigned wave_size = 64;
6615 unsigned max_vgprs = 256;
6616 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6617 unsigned max_sgprs_per_wave = 128;
6618 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6619 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6620 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6621
6622 max_vgprs = max_vgprs / min_waves_per_simd;
6623 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6624
6625 if (shader->config.num_sgprs > max_sgprs ||
6626 shader->config.num_vgprs > max_vgprs) {
6627 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6628 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6629 shader->config.num_sgprs, shader->config.num_vgprs,
6630 max_sgprs, max_vgprs);
6631
6632 /* Just terminate the process, because dependent
6633 * shaders can hang due to bad input data, but use
6634 * the env var to allow shader-db to work.
6635 */
6636 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6637 abort();
6638 }
6639 }
6640
6641 /* Add the scratch offset to input SGPRs. */
6642 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6643 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6644
6645 /* Calculate the number of fragment input VGPRs. */
6646 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6647 shader->info.num_input_vgprs = 0;
6648 shader->info.face_vgpr_index = -1;
6649
6650 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6651 shader->info.num_input_vgprs += 2;
6652 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6653 shader->info.num_input_vgprs += 2;
6654 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6655 shader->info.num_input_vgprs += 2;
6656 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6657 shader->info.num_input_vgprs += 3;
6658 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6659 shader->info.num_input_vgprs += 2;
6660 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6661 shader->info.num_input_vgprs += 2;
6662 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6663 shader->info.num_input_vgprs += 2;
6664 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6665 shader->info.num_input_vgprs += 1;
6666 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6667 shader->info.num_input_vgprs += 1;
6668 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6669 shader->info.num_input_vgprs += 1;
6670 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6671 shader->info.num_input_vgprs += 1;
6672 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6673 shader->info.num_input_vgprs += 1;
6674 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6675 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6676 shader->info.num_input_vgprs += 1;
6677 }
6678 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6679 shader->info.num_input_vgprs += 1;
6680 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6681 shader->info.num_input_vgprs += 1;
6682 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6683 shader->info.num_input_vgprs += 1;
6684 }
6685
6686 return 0;
6687 }
6688
6689 /**
6690 * Create, compile and return a shader part (prolog or epilog).
6691 *
6692 * \param sscreen screen
6693 * \param list list of shader parts of the same category
6694 * \param type shader type
6695 * \param key shader part key
6696 * \param prolog whether the part being requested is a prolog
6697 * \param tm LLVM target machine
6698 * \param debug debug callback
6699 * \param build the callback responsible for building the main function
6700 * \return non-NULL on success
6701 */
6702 static struct si_shader_part *
6703 si_get_shader_part(struct si_screen *sscreen,
6704 struct si_shader_part **list,
6705 enum pipe_shader_type type,
6706 bool prolog,
6707 union si_shader_part_key *key,
6708 LLVMTargetMachineRef tm,
6709 struct pipe_debug_callback *debug,
6710 void (*build)(struct si_shader_context *,
6711 union si_shader_part_key *),
6712 const char *name)
6713 {
6714 struct si_shader_part *result;
6715
6716 mtx_lock(&sscreen->shader_parts_mutex);
6717
6718 /* Find existing. */
6719 for (result = *list; result; result = result->next) {
6720 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6721 mtx_unlock(&sscreen->shader_parts_mutex);
6722 return result;
6723 }
6724 }
6725
6726 /* Compile a new one. */
6727 result = CALLOC_STRUCT(si_shader_part);
6728 result->key = *key;
6729
6730 struct si_shader shader = {};
6731 struct si_shader_context ctx;
6732 struct gallivm_state *gallivm = &ctx.gallivm;
6733
6734 si_init_shader_ctx(&ctx, sscreen, tm);
6735 ctx.shader = &shader;
6736 ctx.type = type;
6737
6738 switch (type) {
6739 case PIPE_SHADER_VERTEX:
6740 break;
6741 case PIPE_SHADER_TESS_CTRL:
6742 assert(!prolog);
6743 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6744 break;
6745 case PIPE_SHADER_GEOMETRY:
6746 assert(prolog);
6747 break;
6748 case PIPE_SHADER_FRAGMENT:
6749 if (prolog)
6750 shader.key.part.ps.prolog = key->ps_prolog.states;
6751 else
6752 shader.key.part.ps.epilog = key->ps_epilog.states;
6753 break;
6754 default:
6755 unreachable("bad shader part");
6756 }
6757
6758 build(&ctx, key);
6759
6760 /* Compile. */
6761 si_llvm_optimize_module(&ctx);
6762
6763 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6764 gallivm->module, debug, ctx.type, name)) {
6765 FREE(result);
6766 result = NULL;
6767 goto out;
6768 }
6769
6770 result->next = *list;
6771 *list = result;
6772
6773 out:
6774 si_llvm_dispose(&ctx);
6775 mtx_unlock(&sscreen->shader_parts_mutex);
6776 return result;
6777 }
6778
6779 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6780 {
6781 struct gallivm_state *gallivm = &ctx->gallivm;
6782 LLVMValueRef ptr[2], list;
6783
6784 /* Get the pointer to rw buffers. */
6785 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6786 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6787 list = lp_build_gather_values(gallivm, ptr, 2);
6788 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6789 list = LLVMBuildIntToPtr(gallivm->builder, list,
6790 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6791 return list;
6792 }
6793
6794 /**
6795 * Build the vertex shader prolog function.
6796 *
6797 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6798 * All inputs are returned unmodified. The vertex load indices are
6799 * stored after them, which will be used by the API VS for fetching inputs.
6800 *
6801 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6802 * input_v0,
6803 * input_v1,
6804 * input_v2,
6805 * input_v3,
6806 * (VertexID + BaseVertex),
6807 * (InstanceID + StartInstance),
6808 * (InstanceID / 2 + StartInstance)
6809 */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMTypeRef *returns;
	LLVMValueRef ret, func;
	int num_returns, i;
	/* Index of the first VS system-value VGPR; nonzero when this prolog
	 * runs as part of a merged shader and extra VGPRs precede the VS ones.
	 */
	unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
	LLVMValueRef input_vgprs[9];
	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
				      num_input_vgprs;
	/* Merged shaders have 8 extra leading SGPRs before the user SGPRs. */
	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

	si_init_function_info(&fninfo);

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
			 sizeof(LLVMTypeRef));
	num_returns = 0;

	/* Declare input and output SGPRs. */
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[num_returns++] = ctx->i32;
	}

	/* Preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < num_input_vgprs; i++) {
		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
		returns[num_returns++] = ctx->f32;
	}

	/* Vertex load indices. */
	for (i = 0; i <= key->vs_prolog.last_input; i++)
		returns[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
	func = ctx->main_fn;

	if (key->vs_prolog.num_merged_next_stage_vgprs) {
		/* Non-monolithic merged shaders initialize EXEC from an
		 * input SGPR here; monolithic ones have done it already.
		 */
		if (!key->vs_prolog.is_monolithic)
			si_init_exec_from_input(ctx, 3, 0);

		if (key->vs_prolog.as_ls &&
		    (ctx->screen->b.family == CHIP_VEGA10 ||
		     ctx->screen->b.family == CHIP_RAVEN)) {
			/* If there are no HS threads, SPI loads the LS VGPRs
			 * starting at VGPR 0. Shift them back to where they
			 * belong.
			 */
			LLVMValueRef has_hs_threads =
				LLVMBuildICmp(gallivm->builder, LLVMIntNE,
					      unpack_param(ctx, 3, 8, 8),
					      ctx->i32_0, "");

			/* Walk downwards so unshifted source VGPRs are read
			 * before they are overwritten.
			 */
			for (i = 4; i > 0; --i) {
				input_vgprs[i + 1] =
					LLVMBuildSelect(gallivm->builder, has_hs_threads,
							input_vgprs[i + 1],
							input_vgprs[i - 1], "");
			}
		}
	}

	/* As LS, InstanceID sits one VGPR further (after REL_PATCH_ID). */
	ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
	ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}
	for (i = 0; i < num_input_vgprs; i++) {
		LLVMValueRef p = input_vgprs[i];
		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
					   key->vs_prolog.num_input_sgprs + i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	LLVMValueRef instance_divisor_constbuf = NULL;

	if (key->vs_prolog.states.instance_divisor_is_fetched) {
		/* Divisors that are not compile-time constants are read from
		 * a constant buffer in the RW buffer descriptor list.
		 */
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
		LLVMValueRef buf_index =
			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
		instance_divisor_constbuf =
			ac_build_indexed_load_const(&ctx->ac, list, buf_index);
	}

	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		bool divisor_is_one =
			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
		bool divisor_is_fetched =
			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
		LLVMValueRef index;

		if (divisor_is_one || divisor_is_fetched) {
			LLVMValueRef divisor = ctx->i32_1;

			if (divisor_is_fetched) {
				/* One dword per attribute at offset i*4. */
				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
							    LLVMConstInt(ctx->i32, i * 4, 0));
				divisor = LLVMBuildBitCast(gallivm->builder, divisor,
							   ctx->i32, "");
			}

			/* InstanceID / Divisor + StartInstance */
			index = get_instance_index_for_fetch(ctx,
							     user_sgpr_base +
							     SI_SGPR_START_INSTANCE,
							     divisor);
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(gallivm->builder,
					     ctx->abi.vertex_id,
					     LLVMGetParam(func, user_sgpr_base +
							  SI_SGPR_BASE_VERTEX), "");
		}

		/* Load indices are returned in the f32 slots appended after
		 * the pass-through inputs.
		 */
		index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
					   fninfo.num_params + i, "");
	}

	si_llvm_build_ret(ctx, ret);
}
6943
6944 static bool si_get_vs_prolog(struct si_screen *sscreen,
6945 LLVMTargetMachineRef tm,
6946 struct si_shader *shader,
6947 struct pipe_debug_callback *debug,
6948 struct si_shader *main_part,
6949 const struct si_vs_prolog_bits *key)
6950 {
6951 struct si_shader_selector *vs = main_part->selector;
6952
6953 if (!si_vs_needs_prolog(vs, key))
6954 return true;
6955
6956 /* Get the prolog. */
6957 union si_shader_part_key prolog_key;
6958 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6959 key, shader, &prolog_key);
6960
6961 shader->prolog =
6962 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6963 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6964 debug, si_build_vs_prolog_function,
6965 "Vertex Shader Prolog");
6966 return shader->prolog != NULL;
6967 }
6968
6969 /**
6970 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6971 */
6972 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6973 LLVMTargetMachineRef tm,
6974 struct si_shader *shader,
6975 struct pipe_debug_callback *debug)
6976 {
6977 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6978 &shader->key.part.vs.prolog);
6979 }
6980
6981 /**
6982 * Compile the TCS epilog function. This writes tesselation factors to memory
6983 * based on the output primitive type of the tesselator (determined by TES).
6984 */
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef func;

	si_init_function_info(&fninfo);

	/* Declare the epilog inputs in exactly the order the TCS main part
	 * leaves them in registers; only the named parameters are actually
	 * read, the rest are placeholders to keep the layout aligned.
	 */
	if (ctx->screen->b.chip_class >= GFX9) {
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	} else {
		/* Pre-GFX9 SGPR layout differs; same parameters, different slots. */
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	}

	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	unsigned tess_factors_idx =
		add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
	declare_lds_as_pointer(ctx);
	func = ctx->main_fn;

	/* Write the tess factors gathered by the main part out to memory. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, tess_factors_idx),
			      LLVMGetParam(func, tess_factors_idx + 1),
			      LLVMGetParam(func, tess_factors_idx + 2));

	LLVMBuildRetVoid(gallivm->builder);
}
7052
7053 /**
7054 * Select and compile (or reuse) TCS parts (epilog).
7055 */
7056 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7057 LLVMTargetMachineRef tm,
7058 struct si_shader *shader,
7059 struct pipe_debug_callback *debug)
7060 {
7061 if (sscreen->b.chip_class >= GFX9) {
7062 struct si_shader *ls_main_part =
7063 shader->key.part.tcs.ls->main_shader_part_ls;
7064
7065 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
7066 &shader->key.part.tcs.ls_prolog))
7067 return false;
7068
7069 shader->previous_stage = ls_main_part;
7070 }
7071
7072 /* Get the epilog. */
7073 union si_shader_part_key epilog_key;
7074 memset(&epilog_key, 0, sizeof(epilog_key));
7075 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7076
7077 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7078 PIPE_SHADER_TESS_CTRL, false,
7079 &epilog_key, tm, debug,
7080 si_build_tcs_epilog_function,
7081 "Tessellation Control Shader Epilog");
7082 return shader->epilog != NULL;
7083 }
7084
7085 /**
7086 * Select and compile (or reuse) GS parts (prolog).
7087 */
7088 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7089 LLVMTargetMachineRef tm,
7090 struct si_shader *shader,
7091 struct pipe_debug_callback *debug)
7092 {
7093 if (sscreen->b.chip_class >= GFX9) {
7094 struct si_shader *es_main_part =
7095 shader->key.part.gs.es->main_shader_part_es;
7096
7097 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7098 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
7099 &shader->key.part.gs.vs_prolog))
7100 return false;
7101
7102 shader->previous_stage = es_main_part;
7103 }
7104
7105 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7106 return true;
7107
7108 union si_shader_part_key prolog_key;
7109 memset(&prolog_key, 0, sizeof(prolog_key));
7110 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7111
7112 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7113 PIPE_SHADER_GEOMETRY, true,
7114 &prolog_key, tm, debug,
7115 si_build_gs_prolog_function,
7116 "Geometry Shader Prolog");
7117 return shader->prolog2 != NULL;
7118 }
7119
7120 /**
7121 * Build the pixel shader prolog function. This handles:
7122 * - two-side color selection and interpolation
7123 * - overriding interpolation parameters for the API PS
7124 * - polygon stippling
7125 *
7126 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7127 * overriden by other states. (e.g. per-sample interpolation)
7128 * Interpolated colors are stored after the preloaded VGPRs.
7129 */
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMValueRef ret, func;
	int num_returns, i, num_color_channels;

	/* Callers must only build a prolog when at least one of its
	 * features is enabled in the key.
	 */
	assert(si_need_ps_prolog(key));

	si_init_function_info(&fninfo);

	/* Declare inputs. */
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		add_arg(&fninfo, ARG_SGPR, ctx->i32);

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		add_arg(&fninfo, ARG_VGPR, ctx->f32);

	/* Declare outputs (same as inputs + add colors if needed) */
	num_returns = fninfo.num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		fninfo.types[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
			   &fninfo, 0);
	func = ctx->main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < fninfo.num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);

		si_llvm_emit_polygon_stipple(ctx, list, pos);
	}

	if (key->ps_prolog.states.bc_optimize_for_persp ||
	    key->ps_prolog.states.bc_optimize_for_linear) {
		/* NOTE: inner "i" intentionally shadows the outer one; "base"
		 * is the VGPR index of the first interpolant input.
		 */
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;

		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 *
		 * PRIM_MASK is after user SGPRs.
		 */
		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
					    ctx->i1, "");

		if (key->ps_prolog.states.bc_optimize_for_persp) {
			/* Read PERSP_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 2 + i);
			/* Read PERSP_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 4 + i);
			/* Select PERSP_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 4 + i, "");
			}
		}
		if (key->ps_prolog.states.bc_optimize_for_linear) {
			/* Read LINEAR_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 8 + i);
			/* Read LINEAR_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 10 + i);
			/* Select LINEAR_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 10 + i, "");
			}
		}
	}

	/* Force per-sample interpolation. */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Force center interpolation. */
	if (key->ps_prolog.states.force_persp_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_center[2];

		/* Read PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			persp_center[i] = LLVMGetParam(func, base + 2 + i);
		/* Overwrite PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_center[2];

		/* Read LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			linear_center[i] = LLVMGetParam(func, base + 8 + i);
		/* Overwrite LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 6 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 10 + i, "");
	}

	/* Interpolate colors. Two iterations: one per COLOR semantic index. */
	unsigned color_out_idx = 0;
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Get the (i,j) updated by bc_optimize handling. */
			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr, "");
			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr + 1, "");
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
		}

		interp_fs_input(ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append the interpolated channels after the pass-through
		 * outputs, in writemask bit order.
		 */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   fninfo.num_params + color_out_idx++, "");
		}
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	si_llvm_build_ret(ctx, ret);
}
7351
7352 /**
7353 * Build the pixel shader epilog function. This handles everything that must be
7354 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7355 */
/**
 * Build the pixel shader epilog function. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * The epilog is built as a separate LLVM function that receives the main
 * shader's outputs in VGPR arguments and performs the actual MRT/Z exports.
 */
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int i;
	struct si_ps_exports exp = {};

	si_init_function_info(&fninfo);

	/* Declare input SGPRs. */
	ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);

	/* Declare input VGPRs: 4 channels per written color, plus one VGPR
	 * each for Z, stencil and sample mask when they are written. */
	unsigned required_num_params =
		     fninfo.num_sgpr_params +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Always declare enough VGPRs to reach the fixed sample-mask
	 * location, so the "fninfo.num_params - 1" index used below is
	 * guaranteed to exist. */
	required_num_params = MAX2(required_num_params,
				   fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	while (fninfo.num_params < required_num_params)
		add_arg(&fninfo, ARG_VGPR, ctx->f32);

	/* Create the function. */
	si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
	/* Disable elimination of unused inputs. */
	si_llvm_add_attribute(ctx->main_fn,
			      "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = fninfo.num_sgpr_params;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export so it can be marked as the final one.
	 * Only relevant when no Z/stencil/samplemask export follows the
	 * color exports; otherwise last_color_export stays -1. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* The last written color whose 4-bit export format is
			 * non-zero is the last export. */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Read each written color's 4 channels from consecutive VGPRs and
	 * emit its MRT export. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);

		/* NOTE(review): "fninfo.num_params - 1" appears to be the
		 * index of the sample-coverage VGPR (its existence is
		 * guaranteed by the MAX2 above) — confirm against the
		 * si_export_mrt_color prototype. */
		si_export_mrt_color(bld_base, color, mrt,
				    fninfo.num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);

	/* If nothing at all was exported, emit a null export (at least one
	 * export is required). */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
}
7451
7452 /**
7453 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7454 */
7455 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7456 LLVMTargetMachineRef tm,
7457 struct si_shader *shader,
7458 struct pipe_debug_callback *debug)
7459 {
7460 union si_shader_part_key prolog_key;
7461 union si_shader_part_key epilog_key;
7462
7463 /* Get the prolog. */
7464 si_get_ps_prolog_key(shader, &prolog_key, true);
7465
7466 /* The prolog is a no-op if these aren't set. */
7467 if (si_need_ps_prolog(&prolog_key)) {
7468 shader->prolog =
7469 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7470 PIPE_SHADER_FRAGMENT, true,
7471 &prolog_key, tm, debug,
7472 si_build_ps_prolog_function,
7473 "Fragment Shader Prolog");
7474 if (!shader->prolog)
7475 return false;
7476 }
7477
7478 /* Get the epilog. */
7479 si_get_ps_epilog_key(shader, &epilog_key);
7480
7481 shader->epilog =
7482 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7483 PIPE_SHADER_FRAGMENT, false,
7484 &epilog_key, tm, debug,
7485 si_build_ps_epilog_function,
7486 "Fragment Shader Epilog");
7487 if (!shader->epilog)
7488 return false;
7489
7490 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7491 if (shader->key.part.ps.prolog.poly_stipple) {
7492 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7493 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7494 }
7495
7496 /* Set up the enable bits for per-sample shading if needed. */
7497 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7498 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7499 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7500 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7501 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7502 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7503 }
7504 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7505 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7506 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7507 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7508 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7509 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7510 }
7511 if (shader->key.part.ps.prolog.force_persp_center_interp &&
7512 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7513 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7514 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7515 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7516 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7517 }
7518 if (shader->key.part.ps.prolog.force_linear_center_interp &&
7519 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7520 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7521 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7522 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7523 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7524 }
7525
7526 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7527 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7528 !(shader->config.spi_ps_input_ena & 0xf)) {
7529 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7530 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7531 }
7532
7533 /* At least one pair of interpolation weights must be enabled. */
7534 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7535 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7536 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7537 }
7538
7539 /* The sample mask input is always enabled, because the API shader always
7540 * passes it through to the epilog. Disable it here if it's unused.
7541 */
7542 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7543 !shader->selector->info.reads_samplemask)
7544 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7545
7546 return true;
7547 }
7548
7549 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7550 unsigned *lds_size)
7551 {
7552 /* SPI barrier management bug:
7553 * Make sure we have at least 4k of LDS in use to avoid the bug.
7554 * It applies to workgroup sizes of more than one wavefront.
7555 */
7556 if (sscreen->b.family == CHIP_BONAIRE ||
7557 sscreen->b.family == CHIP_KABINI ||
7558 sscreen->b.family == CHIP_MULLINS)
7559 *lds_size = MAX2(*lds_size, 8);
7560 }
7561
7562 static void si_fix_resource_usage(struct si_screen *sscreen,
7563 struct si_shader *shader)
7564 {
7565 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7566
7567 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7568
7569 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7570 si_get_max_workgroup_size(shader) > 64) {
7571 si_multiwave_lds_size_workaround(sscreen,
7572 &shader->config.lds_size);
7573 }
7574 }
7575
/**
 * Create a shader variant: either compile it as a monolithic shader, or
 * reuse the precompiled main part and attach prolog/epilog parts to it.
 *
 * \return 0 on success, negative on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return -1;

		/* Copy the compiled TGSI shader data over. The binary is
		 * shared with the main part, so don't free it twice (see
		 * is_binary_shared in si_shader_destroy). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. TESS_EVAL needs none. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the combined shader must
		 * reserve the maximum each part needs. Spill/scratch sizes
		 * are merged the same way for the previous stage (gfx9). */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7712
7713 void si_shader_destroy(struct si_shader *shader)
7714 {
7715 if (shader->scratch_bo)
7716 r600_resource_reference(&shader->scratch_bo, NULL);
7717
7718 r600_resource_reference(&shader->bo, NULL);
7719
7720 if (!shader->is_binary_shared)
7721 radeon_shader_binary_clean(&shader->binary);
7722
7723 free(shader->shader_log);
7724 }