628e6f80d3f856268d8923c9298d969ecc856689
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49 #include "compiler/nir/nir.h"
50
/* Names of the symbols that hold the scratch buffer resource descriptor
 * dwords. NOTE(review): presumably resolved/patched when the shader binary
 * is uploaded — confirm against the binary-upload code. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
56
/* One shader output: the LLVM values of its (up to) 4 channels together
 * with its TGSI semantic name/index. */
struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	/* Per-channel stream index — presumably the GS vertex stream each
	 * component is written to; confirm against the GS output code. */
	ubyte vertex_stream[4];
};
64
65 /**
66 * Used to collect types and other info about arguments of the LLVM function
67 * before the function is created.
68 */
69 struct si_function_info {
70 LLVMTypeRef types[100];
71 LLVMValueRef *assign[100];
72 unsigned num_sgpr_params;
73 unsigned num_params;
74 };
75
/* Register file a function argument is passed in: scalar (SGPR) or
 * vector (VGPR). */
enum si_arg_regfile {
	ARG_SGPR,
	ARG_VGPR
};
80
/* Forward declarations for helpers and shader-part builders defined later
 * in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* LLVM amdgcn address spaces. NOTE(review): these numeric values match the
 * older (pre-LLVM-7) amdgcn address-space mapping — confirm against the
 * LLVM version this tree targets. */
enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};
112
113 static bool is_merged_shader(struct si_shader *shader)
114 {
115 if (shader->selector->screen->b.chip_class <= VI)
116 return false;
117
118 return shader->key.as_ls ||
119 shader->key.as_es ||
120 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
121 shader->selector->type == PIPE_SHADER_GEOMETRY;
122 }
123
124 static void si_init_function_info(struct si_function_info *fninfo)
125 {
126 fninfo->num_params = 0;
127 fninfo->num_sgpr_params = 0;
128 }
129
130 static unsigned add_arg_assign(struct si_function_info *fninfo,
131 enum si_arg_regfile regfile, LLVMTypeRef type,
132 LLVMValueRef *assign)
133 {
134 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
135
136 unsigned idx = fninfo->num_params++;
137 assert(idx < ARRAY_SIZE(fninfo->types));
138
139 if (regfile == ARG_SGPR)
140 fninfo->num_sgpr_params = fninfo->num_params;
141
142 fninfo->types[idx] = type;
143 fninfo->assign[idx] = assign;
144 return idx;
145 }
146
/* Append one argument without requesting its value to be stored anywhere.
 * Returns the index of the new argument. */
static unsigned add_arg(struct si_function_info *fninfo,
			enum si_arg_regfile regfile, LLVMTypeRef type)
{
	return add_arg_assign(fninfo, regfile, type, NULL);
}
152
/* Like add_arg_assign, but assert that the argument lands at the expected
 * index \p idx (used where the hardware ABI fixes argument positions). */
static void add_arg_assign_checked(struct si_function_info *fninfo,
				   enum si_arg_regfile regfile, LLVMTypeRef type,
				   LLVMValueRef *assign, unsigned idx)
{
	MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
	assert(actual == idx);
}
160
/* Like add_arg, but assert that the argument lands at the expected index. */
static void add_arg_checked(struct si_function_info *fninfo,
			    enum si_arg_regfile regfile, LLVMTypeRef type,
			    unsigned idx)
{
	add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
}
167
168 /**
169 * Returns a unique index for a per-patch semantic name and index. The index
170 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
171 * can be calculated.
172 */
173 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
174 {
175 switch (semantic_name) {
176 case TGSI_SEMANTIC_TESSOUTER:
177 return 0;
178 case TGSI_SEMANTIC_TESSINNER:
179 return 1;
180 case TGSI_SEMANTIC_PATCH:
181 assert(index < 30);
182 return 2 + index;
183
184 default:
185 assert(!"invalid semantic name");
186 return 0;
187 }
188 }
189
/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_GENERIC:
		/* Since some shader stages use the highest used IO index
		 * to determine the size to allocate for inputs/outputs
		 * (in LDS, tess and GS rings). GENERIC should be placed right
		 * after POSITION to make that size as small as possible.
		 */
		if (index < SI_MAX_IO_GENERIC)
			return 1 + index;

		assert(!"invalid generic index");
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return SI_MAX_IO_GENERIC + 1;
	case TGSI_SEMANTIC_CLIPDIST:
		/* Two vec4 slots of clip/cull distances. */
		assert(index <= 1);
		return SI_MAX_IO_GENERIC + 2 + index;
	case TGSI_SEMANTIC_FOG:
		return SI_MAX_IO_GENERIC + 4;
	case TGSI_SEMANTIC_LAYER:
		return SI_MAX_IO_GENERIC + 5;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return SI_MAX_IO_GENERIC + 6;
	case TGSI_SEMANTIC_PRIMID:
		return SI_MAX_IO_GENERIC + 7;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		assert(index < 2);
		return SI_MAX_IO_GENERIC + 8 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		assert(index < 8);
		/* Make sure the whole map stays within the 64-slot budget. */
		assert(SI_MAX_IO_GENERIC + 10 + index < 64);
		return SI_MAX_IO_GENERIC + 10 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}
237
238 /**
239 * Helper function that builds an LLVM IR PHI node and immediately adds
240 * incoming edges.
241 */
242 static LLVMValueRef
243 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
244 unsigned count_incoming, LLVMValueRef *values,
245 LLVMBasicBlockRef *blocks)
246 {
247 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
248 LLVMAddIncoming(phi, values, blocks, count_incoming);
249 return phi;
250 }
251
252 /**
253 * Get the value of a shader input parameter and extract a bitfield.
254 */
255 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
256 unsigned param, unsigned rshift,
257 unsigned bitwidth)
258 {
259 struct gallivm_state *gallivm = &ctx->gallivm;
260 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
261 param);
262
263 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
264 value = bitcast(&ctx->bld_base,
265 TGSI_TYPE_UNSIGNED, value);
266
267 if (rshift)
268 value = LLVMBuildLShr(gallivm->builder, value,
269 LLVMConstInt(ctx->i32, rshift, 0), "");
270
271 if (rshift + bitwidth < 32) {
272 unsigned mask = (1 << bitwidth) - 1;
273 value = LLVMBuildAnd(gallivm->builder, value,
274 LLVMConstInt(ctx->i32, mask, 0), "");
275 }
276
277 return value;
278 }
279
280 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
281 {
282 switch (ctx->type) {
283 case PIPE_SHADER_TESS_CTRL:
284 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
285
286 case PIPE_SHADER_TESS_EVAL:
287 return LLVMGetParam(ctx->main_fn,
288 ctx->param_tes_rel_patch_id);
289
290 default:
291 assert(0);
292 return NULL;
293 }
294 }
295
296 /* Tessellation shaders pass outputs to the next shader using LDS.
297 *
298 * LS outputs = TCS inputs
299 * TCS outputs = TES inputs
300 *
301 * The LDS layout is:
302 * - TCS inputs for patch 0
303 * - TCS inputs for patch 1
304 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
305 * - ...
306 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
307 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
308 * - TCS outputs for patch 1
309 * - Per-patch TCS outputs for patch 1
310 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
311 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
312 * - ...
313 *
314 * All three shaders VS(LS), TCS, TES share the same LDS space.
315 */
316
/* Stride of one patch's TCS inputs in LDS, in dwords (13-bit field in
 * the VS state bits). */
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}
322
/* Stride of one patch's TCS outputs in LDS, in dwords (13-bit field in
 * the TCS output LDS layout word). */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}
328
/* LDS dword offset where TCS outputs for patch 0 start. The packed field
 * is stored in units of 4 dwords, hence the multiply by 4. */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}
338
/* LDS dword offset where the per-patch TCS outputs for patch 0 start
 * (upper 16 bits of the same packed word, also in units of 4 dwords). */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}
348
349 static LLVMValueRef
350 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
351 {
352 struct gallivm_state *gallivm = &ctx->gallivm;
353 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
354 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
355
356 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
357 }
358
359 static LLVMValueRef
360 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
361 {
362 struct gallivm_state *gallivm = &ctx->gallivm;
363 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
364 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
365 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
366
367 return LLVMBuildAdd(gallivm->builder, patch0_offset,
368 LLVMBuildMul(gallivm->builder, patch_stride,
369 rel_patch_id, ""),
370 "");
371 }
372
373 static LLVMValueRef
374 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
375 {
376 struct gallivm_state *gallivm = &ctx->gallivm;
377 LLVMValueRef patch0_patch_data_offset =
378 get_tcs_out_patch0_patch_data_offset(ctx);
379 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
380 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
381
382 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
383 LLVMBuildMul(gallivm->builder, patch_stride,
384 rel_patch_id, ""),
385 "");
386 }
387
388 static LLVMValueRef get_instance_index_for_fetch(
389 struct si_shader_context *ctx,
390 unsigned param_start_instance, LLVMValueRef divisor)
391 {
392 struct gallivm_state *gallivm = &ctx->gallivm;
393
394 LLVMValueRef result = ctx->abi.instance_id;
395
396 /* The division must be done before START_INSTANCE is added. */
397 if (divisor != ctx->i32_1)
398 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
399
400 return LLVMBuildAdd(gallivm->builder, result,
401 LLVMGetParam(ctx->main_fn, param_start_instance), "");
402 }
403
404 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
405 * to float. */
406 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
407 LLVMValueRef vec4,
408 unsigned double_index)
409 {
410 LLVMBuilderRef builder = ctx->gallivm.builder;
411 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
412 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
413 LLVMVectorType(f64, 2), "");
414 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
415 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
416 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
417 }
418
/**
 * Load one vertex attribute for the VS and fix up formats the hardware
 * cannot fetch natively.
 *
 * \param input_index  index of the vertex attribute
 * \param out          receives the 4 channel values (as f32-typed values)
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	/* The shader key records which attributes need a software fixup. */
	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	/* Apply the per-format fixup on top of the raw fetch result. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			/* Clamp to [-1, 1]; only the low end can be exceeded. */
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		/* Normalize a raw 32-bit unsigned fetch to [0, 1]. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		/* FIXED is 16.16 fixed point; SNORM normalizes to [-1, 1]. */
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		/* Unsigned int fetched raw, converted to float without scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		/* Signed int fetched raw, converted to float without scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		/* One fetch holds both doubles; pad B and A with 0 and 1. */
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* Three 2-dword fetches, one double each. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two 4-dword fetches, two doubles each. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* Three single-component fetches; take .x of each. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		/* Alpha defaults to 1 — as float for the norm formats and
		 * as an integer bit pattern for the _INT formats. */
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}
623
/* TGSI input-declaration hook for the VS. The declaration itself is unused;
 * everything needed is derived from the input index and the shader key. */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_vs(ctx, input_index, out);
}
632
633 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
634 unsigned swizzle)
635 {
636 if (swizzle > 0)
637 return ctx->i32_0;
638
639 switch (ctx->type) {
640 case PIPE_SHADER_VERTEX:
641 return LLVMGetParam(ctx->main_fn,
642 ctx->param_vs_prim_id);
643 case PIPE_SHADER_TESS_CTRL:
644 return LLVMGetParam(ctx->main_fn,
645 ctx->param_tcs_patch_id);
646 case PIPE_SHADER_TESS_EVAL:
647 return LLVMGetParam(ctx->main_fn,
648 ctx->param_tes_patch_id);
649 case PIPE_SHADER_GEOMETRY:
650 return LLVMGetParam(ctx->main_fn,
651 ctx->param_gs_prim_id);
652 default:
653 assert(0);
654 return ctx->i32_0;
655 }
656 }
657
658 /**
659 * Return the value of tgsi_ind_register for indexing.
660 * This is the indirect index with the constant offset added to it.
661 */
662 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
663 const struct tgsi_ind_register *ind,
664 int rel_index)
665 {
666 struct gallivm_state *gallivm = &ctx->gallivm;
667 LLVMValueRef result;
668
669 result = ctx->addrs[ind->Index][ind->Swizzle];
670 result = LLVMBuildLoad(gallivm->builder, result, "");
671 result = LLVMBuildAdd(gallivm->builder, result,
672 LLVMConstInt(ctx->i32, rel_index, 0), "");
673 return result;
674 }
675
676 /**
677 * Like si_get_indirect_index, but restricts the return value to a (possibly
678 * undefined) value inside [0..num).
679 */
680 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
681 const struct tgsi_ind_register *ind,
682 int rel_index, unsigned num)
683 {
684 LLVMValueRef result = si_get_indirect_index(ctx, ind, rel_index);
685
686 return si_llvm_bound_index(ctx, result, num);
687 }
688
689
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst               destination register, or NULL if \p src is given
 * \param src               source register, or NULL to use \p dst
 * \param vertex_dw_stride  stride between vertices in dwords (for
 *                          2-dimensional registers)
 * \param base_addr         starting dword address to add onto
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = si_get_indirect_index(ctx, &reg.DimIndirect,
						      reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* For declared arrays, the semantic lookup below must use the
		 * first register of the array. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = si_get_indirect_index(ctx, &reg.Indirect,
						  reg.Register.Index - first);

		/* Each IO slot is 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[first], index[first]) :
			si_shader_io_get_unique_index_patch(name[first], index[first]);
	} else {
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]) :
			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
							    index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
779
780 /* The offchip buffer layout for TCS->TES is
781 *
782 * - attribute 0 of patch 0 vertex 0
783 * - attribute 0 of patch 0 vertex 1
784 * - attribute 0 of patch 0 vertex 2
785 * ...
786 * - attribute 0 of patch 1 vertex 0
787 * - attribute 0 of patch 1 vertex 1
788 * ...
789 * - attribute 1 of patch 0 vertex 0
790 * - attribute 1 of patch 0 vertex 1
791 * ...
792 * - per patch attribute 0 of patch 0
793 * - per patch attribute 0 of patch 1
794 * ...
795 *
796 * Note that every attribute has 4 components.
797 */
/**
 * Compute the byte offset of an attribute in the TCS->TES off-chip buffer
 * (layout described in the comment above). Each attribute slot is 16 bytes
 * (4 components x 4 bytes).
 *
 * \param rel_patch_id  relative patch index
 * \param vertex_index  vertex index within the patch, or NULL for
 *                      per-patch attributes
 * \param param_index   attribute slot index
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		/* Per-vertex attributes: consecutive vertices of a patch are
		 * adjacent; one attribute spans all vertices of all patches. */
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attributes: one slot per patch per attribute. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data lives after all per-vertex attributes. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
841
/* Resolve a TGSI input/output register (either \p dst or \p src, one of
 * which must be non-NULL) to its address in the TCS->TES off-chip buffer. */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
				struct si_shader_context *ctx,
				const struct tgsi_full_dst_register *dst,
				const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2-dimensional registers address a specific vertex of the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
							     reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* For declared arrays, the semantic lookup must use the
		 * first register of the array. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = si_get_indirect_index(ctx, &reg.Indirect,
						    reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	/* Per-vertex vs per-patch attributes use different slot maps. */
	param_index_base = reg.Register.Dimension ?
		si_shader_io_get_unique_index(name[param_base], index[param_base]) :
		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}
905
/**
 * Load from a memory buffer via its resource descriptor.
 *
 * \param type           destination TGSI type
 * \param swizzle        component to load (0..3), or ~0 to load a full vec4
 * \param buffer         buffer resource descriptor
 * \param offset         offset added to \p base
 * \param base           base offset
 * \param can_speculate  whether the load may be speculatively executed
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool can_speculate)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	/* 64-bit types: load the two 32-bit halves separately and merge. */
	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, can_speculate, false);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, can_speculate, false);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
941
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	/* vec4 load: recurse per channel and gather the results. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	/* 64-bit types: load the second dword and merge the two halves. */
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
982
/**
 * Store to LDS.
 *
 * \param dw_offset_imm	constant dword offset added to \p dw_addr
 *			(typically the component, 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store (bitcast to i32)
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}
1004
1005 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
1006 unsigned param)
1007 {
1008 LLVMBuilderRef builder = ctx->gallivm.builder;
1009
1010 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1011 addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
1012 addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
1013
1014 uint64_t desc2 = 0xffffffff;
1015 uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1016 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1017 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1018 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1019 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1020 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1021 LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
1022
1023 LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
1024 desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
1025 desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
1026 return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
1027 }
1028
1029 static LLVMValueRef fetch_input_tcs(
1030 struct lp_build_tgsi_context *bld_base,
1031 const struct tgsi_full_src_register *reg,
1032 enum tgsi_opcode_type type, unsigned swizzle)
1033 {
1034 struct si_shader_context *ctx = si_shader_context(bld_base);
1035 LLVMValueRef dw_addr, stride;
1036
1037 stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
1038 dw_addr = get_tcs_in_current_patch_offset(ctx);
1039 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1040
1041 return lds_load(bld_base, type, swizzle, dw_addr);
1042 }
1043
1044 static LLVMValueRef fetch_output_tcs(
1045 struct lp_build_tgsi_context *bld_base,
1046 const struct tgsi_full_src_register *reg,
1047 enum tgsi_opcode_type type, unsigned swizzle)
1048 {
1049 struct si_shader_context *ctx = si_shader_context(bld_base);
1050 LLVMValueRef dw_addr, stride;
1051
1052 if (reg->Register.Dimension) {
1053 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1054 dw_addr = get_tcs_out_current_patch_offset(ctx);
1055 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1056 } else {
1057 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1058 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1059 }
1060
1061 return lds_load(bld_base, type, swizzle, dw_addr);
1062 }
1063
1064 static LLVMValueRef fetch_input_tes(
1065 struct lp_build_tgsi_context *bld_base,
1066 const struct tgsi_full_src_register *reg,
1067 enum tgsi_opcode_type type, unsigned swizzle)
1068 {
1069 struct si_shader_context *ctx = si_shader_context(bld_base);
1070 LLVMValueRef buffer, base, addr;
1071
1072 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1073
1074 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1075 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1076
1077 return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
1078 }
1079
/* Store a TCS output both to LDS (for later TCS reads / the TCS epilog)
 * and to the off-chip buffer in memory (for TES reads).
 *
 * LDS stores are skipped when no shader reads the output back, except for
 * tess factors, which the TCS epilog always reads from LDS.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		/* Per-vertex output. */
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled channel separately.
		 * Tess factors are stored by the epilog, not here.
		 */
		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	/* Full writemask: store all four channels with one vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}
1160
/* Fetch a GS input (an ES output). On GFX9 the ESGS ring lives in LDS;
 * on older chips it is a memory ring buffer accessed via ctx->esgs_ring.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* The primitive ID is a system value, not read from the ring. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(ctx, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		/* Vertex offsets are packed two per SGPR (16 bits each). */
		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		/* Load each channel separately and gather into a vec4. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values span two consecutive ring slots. */
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true, false);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1255
1256 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1257 {
1258 switch (interpolate) {
1259 case TGSI_INTERPOLATE_CONSTANT:
1260 return 0;
1261
1262 case TGSI_INTERPOLATE_LINEAR:
1263 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1264 return SI_PARAM_LINEAR_SAMPLE;
1265 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1266 return SI_PARAM_LINEAR_CENTROID;
1267 else
1268 return SI_PARAM_LINEAR_CENTER;
1269 break;
1270 case TGSI_INTERPOLATE_COLOR:
1271 case TGSI_INTERPOLATE_PERSPECTIVE:
1272 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1273 return SI_PARAM_PERSP_SAMPLE;
1274 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1275 return SI_PARAM_PERSP_CENTROID;
1276 else
1277 return SI_PARAM_PERSP_CENTER;
1278 break;
1279 default:
1280 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1281 return -1;
1282 }
1283 }
1284
1285 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1286 unsigned attr_index, unsigned chan,
1287 LLVMValueRef prim_mask,
1288 LLVMValueRef i, LLVMValueRef j)
1289 {
1290 if (i || j) {
1291 return ac_build_fs_interp(&ctx->ac,
1292 LLVMConstInt(ctx->i32, chan, 0),
1293 LLVMConstInt(ctx->i32, attr_index, 0),
1294 prim_mask, i, j);
1295 }
1296 return ac_build_fs_interp_mov(&ctx->ac,
1297 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1298 LLVMConstInt(ctx->i32, chan, 0),
1299 LLVMConstInt(ctx->i32, attr_index, 0),
1300 prim_mask);
1301 }
1302
1303 /**
1304 * Interpolate a fragment shader input.
1305 *
1306 * @param ctx context
1307 * @param input_index index of the input in hardware
1308 * @param semantic_name TGSI_SEMANTIC_*
1309 * @param semantic_index semantic index
1310 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1311 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1312 * @param interp_param interpolation weights (i,j)
1313 * @param prim_mask SI_PARAM_PRIM_MASK
1314 * @param face SI_PARAM_FRONT_FACE
1315 * @param result the return value (4 components)
1316 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef i = NULL, j = NULL;
	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	if (interp) {
		/* Split the (i,j) weight vector into its two components. */
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and the back
		 * color and select per-fragment based on the facedness.
		 */
		LLVMValueRef is_face_positive;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef front, back;

			front = si_build_fs_interp(ctx,
						   input_index, chan,
						   prim_mask, i, j);
			back = si_build_fs_interp(ctx,
						  back_attr_offset, chan,
						  prim_mask, i, j);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Fog: only the X channel is interpolated; YZW = (0, 0, 1). */
		result[0] = si_build_fs_interp(ctx, input_index,
					       0, prim_mask, i, j);
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			result[chan] = si_build_fs_interp(ctx,
							  input_index, chan,
							  prim_mask, i, j);
		}
	}
}
1401
/* Load one PS input into out[0..3]. Colors come pre-interpolated from the
 * prolog as input VGPRs; everything else is interpolated here.
 */
void si_llvm_load_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;
	enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
	unsigned semantic_index = info->input_semantic_index[input_index];
	enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
	enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];

	/* Get colors from input VGPRs (set by the prolog). */
	if (semantic_name == TGSI_SEMANTIC_COLOR) {
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (semantic_index * 4);
		/* COLOR1's VGPRs come after however many COLOR0 components
		 * were read; unread channels are undef.
		 */
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (semantic_index ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	/* -1 = unhandled mode, 0 = flat (no interp param needed). */
	interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, semantic_name,
			semantic_index, 0, /* this param is unused */
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}
1446
/* TGSI declaration hook for PS inputs; the declaration itself is unused
 * because everything needed is in the shader info.
 */
static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_fs(ctx, input_index, out);
}
1455
/* The sample ID is stored in bits [8..11] of the ANCILLARY PS input. */
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}
1460
1461
1462 /**
1463 * Load a dword from a constant buffer.
1464 */
1465 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1466 LLVMValueRef resource,
1467 LLVMValueRef offset)
1468 {
1469 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1470 0, 0, 0, true, true);
1471 }
1472
1473 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1474 {
1475 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1476 struct gallivm_state *gallivm = &ctx->gallivm;
1477 LLVMBuilderRef builder = gallivm->builder;
1478 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1479 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1480 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1481
1482 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1483 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1484 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1485
1486 LLVMValueRef pos[4] = {
1487 buffer_load_const(ctx, resource, offset0),
1488 buffer_load_const(ctx, resource, offset1),
1489 LLVMConstReal(ctx->f32, 0),
1490 LLVMConstReal(ctx->f32, 0)
1491 };
1492
1493 return lp_build_gather_values(gallivm, pos, 4);
1494 }
1495
/* Emit the LLVM value of one TGSI system value declaration and cache it in
 * ctx->system_values[index] for later fetches.
 */
static void declare_system_value(struct si_shader_context *ctx,
				 unsigned index,
				 const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = ctx->abi.instance_id;
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* The hardware vertex ID is relative; add the base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     ctx->abi.vertex_id,
				     ctx->abi.base_vertex, "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		/* Bit 1 of the VS state bits says whether the draw is indexed. */
		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					ctx->abi.base_vertex, ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = ctx->abi.start_instance;
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = ctx->abi.draw_id;
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord: hardware provides 1/W pre-reciprocal in W. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = ctx->abi.front_face;
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The fractional part of the pixel position is the sample
		 * position within the pixel.
		 */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip TCS buffer. */
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Driver-set default tess levels live in an internal constant
		 * buffer: outer levels at dwords 0-3, inner levels at 4-7.
		 */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(ctx, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* Fixed block sizes become constants; otherwise the size is
		 * passed in as a shader argument.
		 */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		/* Unused block-ID components have no parameter and read 0. */
		LLVMValueRef values[3];

		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* A helper invocation is a lane that is not "live". */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		/* Wavefront size is always 64 on these chips. */
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* 64-bit mask with only this lane's bit set, as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		/* GE/GT masks are built by shifting; LE/LT are their
		 * complements.
		 */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1772
1773 static void declare_compute_memory(struct si_shader_context *ctx,
1774 const struct tgsi_full_declaration *decl)
1775 {
1776 struct si_shader_selector *sel = ctx->shader->selector;
1777 struct gallivm_state *gallivm = &ctx->gallivm;
1778
1779 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1780 LLVMValueRef var;
1781
1782 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1783 assert(decl->Range.First == decl->Range.Last);
1784 assert(!ctx->shared_memory);
1785
1786 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1787 LLVMArrayType(ctx->i8, sel->local_size),
1788 "compute_lds",
1789 LOCAL_ADDR_SPACE);
1790 LLVMSetAlignment(var, 4);
1791
1792 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1793 }
1794
1795 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1796 {
1797 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1798 ctx->param_const_and_shader_buffers);
1799
1800 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1801 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1802 }
1803
1804 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
1805 {
1806 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1807 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1808
1809 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
1810 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1811 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1812
1813 return ac_build_indexed_load_const(&ctx->ac, ptr, index);
1814 }
1815
1816 static LLVMValueRef
1817 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
1818 {
1819 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1820 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
1821 ctx->param_const_and_shader_buffers);
1822
1823 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
1824 index = LLVMBuildSub(ctx->gallivm.builder,
1825 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
1826 index, "");
1827
1828 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
1829 }
1830
/* Fetch a constant-buffer value for a TGSI CONST source register, handling
 * indirect buffer indices, indirect element addressing and 64-bit types.
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	/* Whole vector: fetch each channel separately and gather. */
	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	if (reg->Register.Dimension && reg->Dimension.Indirect) {
		/* Indirectly indexed constant buffer: compute the descriptor
		 * slot at runtime (const buffers follow the shader buffers).
		 */
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
		LLVMValueRef index;
		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
						      reg->Dimension.Index,
						      ctx->num_const_buffers);
		index = LLVMBuildAdd(ctx->gallivm.builder, index,
				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		/* Indirect element: byte offset = addr_reg * 16 + idx * 4. */
		addr = ctx->addrs[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
				    LLVMConstInt(ctx->i32, idx * 4, 0));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		/* 64-bit types: load the adjacent dword and combine. */
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = si_llvm_emit_fetch_64bit(bld_base, type,
						  result, result2);
	}
	return result;
}
1895
1896 /* Upper 16 bits must be zero. */
1897 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1898 LLVMValueRef val[2])
1899 {
1900 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1901 LLVMBuildShl(ctx->gallivm.builder, val[1],
1902 LLVMConstInt(ctx->i32, 16, 0),
1903 ""), "");
1904 }
1905
1906 /* Upper 16 bits are ignored and will be dropped. */
1907 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1908 LLVMValueRef val[2])
1909 {
1910 LLVMValueRef v[2] = {
1911 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1912 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1913 val[1],
1914 };
1915 return si_llvm_pack_two_int16(ctx, v);
1916 }
1917
1918 /* Initialize arguments for the shader export intrinsic */
1919 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1920 LLVMValueRef *values,
1921 unsigned target,
1922 struct ac_export_args *args)
1923 {
1924 struct si_shader_context *ctx = si_shader_context(bld_base);
1925 struct lp_build_context *base = &bld_base->base;
1926 LLVMBuilderRef builder = ctx->gallivm.builder;
1927 LLVMValueRef val[4];
1928 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1929 unsigned chan;
1930 bool is_int8, is_int10;
1931
1932 /* Default is 0xf. Adjusted below depending on the format. */
1933 args->enabled_channels = 0xf; /* writemask */
1934
1935 /* Specify whether the EXEC mask represents the valid mask */
1936 args->valid_mask = 0;
1937
1938 /* Specify whether this is the last export */
1939 args->done = 0;
1940
1941 /* Specify the target we are exporting */
1942 args->target = target;
1943
1944 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1945 const struct si_shader_key *key = &ctx->shader->key;
1946 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1947 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1948
1949 assert(cbuf >= 0 && cbuf < 8);
1950 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1951 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1952 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1953 }
1954
1955 args->compr = false;
1956 args->out[0] = base->undef;
1957 args->out[1] = base->undef;
1958 args->out[2] = base->undef;
1959 args->out[3] = base->undef;
1960
1961 switch (spi_shader_col_format) {
1962 case V_028714_SPI_SHADER_ZERO:
1963 args->enabled_channels = 0; /* writemask */
1964 args->target = V_008DFC_SQ_EXP_NULL;
1965 break;
1966
1967 case V_028714_SPI_SHADER_32_R:
1968 args->enabled_channels = 1; /* writemask */
1969 args->out[0] = values[0];
1970 break;
1971
1972 case V_028714_SPI_SHADER_32_GR:
1973 args->enabled_channels = 0x3; /* writemask */
1974 args->out[0] = values[0];
1975 args->out[1] = values[1];
1976 break;
1977
1978 case V_028714_SPI_SHADER_32_AR:
1979 args->enabled_channels = 0x9; /* writemask */
1980 args->out[0] = values[0];
1981 args->out[3] = values[3];
1982 break;
1983
1984 case V_028714_SPI_SHADER_FP16_ABGR:
1985 args->compr = 1; /* COMPR flag */
1986
1987 for (chan = 0; chan < 2; chan++) {
1988 LLVMValueRef pack_args[2] = {
1989 values[2 * chan],
1990 values[2 * chan + 1]
1991 };
1992 LLVMValueRef packed;
1993
1994 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1995 args->out[chan] =
1996 LLVMBuildBitCast(ctx->gallivm.builder,
1997 packed, ctx->f32, "");
1998 }
1999 break;
2000
2001 case V_028714_SPI_SHADER_UNORM16_ABGR:
2002 for (chan = 0; chan < 4; chan++) {
2003 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
2004 val[chan] = LLVMBuildFMul(builder, val[chan],
2005 LLVMConstReal(ctx->f32, 65535), "");
2006 val[chan] = LLVMBuildFAdd(builder, val[chan],
2007 LLVMConstReal(ctx->f32, 0.5), "");
2008 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2009 ctx->i32, "");
2010 }
2011
2012 args->compr = 1; /* COMPR flag */
2013 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2014 si_llvm_pack_two_int16(ctx, val));
2015 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2016 si_llvm_pack_two_int16(ctx, val+2));
2017 break;
2018
2019 case V_028714_SPI_SHADER_SNORM16_ABGR:
2020 for (chan = 0; chan < 4; chan++) {
2021 /* Clamp between [-1, 1]. */
2022 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2023 values[chan],
2024 LLVMConstReal(ctx->f32, 1));
2025 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2026 val[chan],
2027 LLVMConstReal(ctx->f32, -1));
2028 /* Convert to a signed integer in [-32767, 32767]. */
2029 val[chan] = LLVMBuildFMul(builder, val[chan],
2030 LLVMConstReal(ctx->f32, 32767), "");
2031 /* If positive, add 0.5, else add -0.5. */
2032 val[chan] = LLVMBuildFAdd(builder, val[chan],
2033 LLVMBuildSelect(builder,
2034 LLVMBuildFCmp(builder, LLVMRealOGE,
2035 val[chan], base->zero, ""),
2036 LLVMConstReal(ctx->f32, 0.5),
2037 LLVMConstReal(ctx->f32, -0.5), ""), "");
2038 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2039 }
2040
2041 args->compr = 1; /* COMPR flag */
2042 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2043 si_llvm_pack_two_int32_as_int16(ctx, val));
2044 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2045 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2046 break;
2047
2048 case V_028714_SPI_SHADER_UINT16_ABGR: {
2049 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2050 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2051 LLVMValueRef max_alpha =
2052 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2053
2054 /* Clamp. */
2055 for (chan = 0; chan < 4; chan++) {
2056 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2057 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2058 val[chan],
2059 chan == 3 ? max_alpha : max_rgb);
2060 }
2061
2062 args->compr = 1; /* COMPR flag */
2063 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2064 si_llvm_pack_two_int16(ctx, val));
2065 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2066 si_llvm_pack_two_int16(ctx, val+2));
2067 break;
2068 }
2069
2070 case V_028714_SPI_SHADER_SINT16_ABGR: {
2071 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2072 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2073 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2074 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2075 LLVMValueRef max_alpha =
2076 !is_int10 ? max_rgb : ctx->i32_1;
2077 LLVMValueRef min_alpha =
2078 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2079
2080 /* Clamp. */
2081 for (chan = 0; chan < 4; chan++) {
2082 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2083 val[chan] = lp_build_emit_llvm_binary(bld_base,
2084 TGSI_OPCODE_IMIN,
2085 val[chan], chan == 3 ? max_alpha : max_rgb);
2086 val[chan] = lp_build_emit_llvm_binary(bld_base,
2087 TGSI_OPCODE_IMAX,
2088 val[chan], chan == 3 ? min_alpha : min_rgb);
2089 }
2090
2091 args->compr = 1; /* COMPR flag */
2092 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2093 si_llvm_pack_two_int32_as_int16(ctx, val));
2094 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2095 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2096 break;
2097 }
2098
2099 case V_028714_SPI_SHADER_32_ABGR:
2100 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2101 break;
2102 }
2103 }
2104
2105 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2106 LLVMValueRef alpha)
2107 {
2108 struct si_shader_context *ctx = si_shader_context(bld_base);
2109
2110 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2111 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2112 SI_PARAM_ALPHA_REF);
2113
2114 LLVMValueRef alpha_pass =
2115 lp_build_cmp(&bld_base->base,
2116 ctx->shader->key.part.ps.epilog.alpha_func,
2117 alpha, alpha_ref);
2118 LLVMValueRef arg =
2119 lp_build_select(&bld_base->base,
2120 alpha_pass,
2121 LLVMConstReal(ctx->f32, 1.0f),
2122 LLVMConstReal(ctx->f32, -1.0f));
2123
2124 ac_build_kill(&ctx->ac, arg);
2125 } else {
2126 ac_build_kill(&ctx->ac, NULL);
2127 }
2128 }
2129
2130 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2131 LLVMValueRef alpha,
2132 unsigned samplemask_param)
2133 {
2134 struct si_shader_context *ctx = si_shader_context(bld_base);
2135 struct gallivm_state *gallivm = &ctx->gallivm;
2136 LLVMValueRef coverage;
2137
2138 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2139 coverage = LLVMGetParam(ctx->main_fn,
2140 samplemask_param);
2141 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2142
2143 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2144 ctx->i32,
2145 &coverage, 1, LP_FUNC_ATTR_READNONE);
2146
2147 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2148 ctx->f32, "");
2149
2150 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2151 LLVMConstReal(ctx->f32,
2152 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2153
2154 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2155 }
2156
/* Compute the user clip distance exports (POS+2 and POS+3) from the clip
 * vertex: each clip distance is the dot product of the clip vertex with a
 * user clip plane loaded from the SI_VS_CONST_CLIP_PLANES constant buffer.
 *
 * \param pos       position export args; entries [2] and [3] are filled in
 * \param out_elts  the 4 f32 components of the clip vertex
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	/* Descriptor of the constant buffer holding the clip plane vectors. */
	LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);

	/* Two exports of 4 clip distances each cover the 8 clip planes. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		/* Start each distance at 0 and accumulate the dot product. */
		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of component const_chan of plane
				 * (reg_index * 4 + chan); each float is 4 bytes. */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = buffer_load_const(ctx, const_resource,
							     addr);
				args->out[chan] =
					lp_build_add(base, args->out[chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
2201
2202 static void si_dump_streamout(struct pipe_stream_output_info *so)
2203 {
2204 unsigned i;
2205
2206 if (so->num_outputs)
2207 fprintf(stderr, "STREAMOUT\n");
2208
2209 for (i = 0; i < so->num_outputs; i++) {
2210 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2211 so->output[i].start_component;
2212 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2213 i, so->output[i].output_buffer,
2214 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2215 so->output[i].register_index,
2216 mask & 1 ? "x" : "",
2217 mask & 2 ? "y" : "",
2218 mask & 4 ? "z" : "",
2219 mask & 8 ? "w" : "");
2220 }
2221 }
2222
/* Store one shader output to its streamout buffer.
 *
 * \param so_buffers        descriptors of the 4 streamout buffers
 * \param so_write_offsets  per-buffer write offsets (already includes the
 *                          per-thread vertex offset)
 * \param stream_out        which components of which output go where
 * \param shader_out        the output's LLVM values and per-channel stream IDs
 */
static void emit_streamout_output(struct si_shader_context *ctx,
				  LLVMValueRef const *so_buffers,
				  LLVMValueRef const *so_write_offsets,
				  struct pipe_stream_output *stream_out,
				  struct si_shader_output_values *shader_out)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned buf_idx = stream_out->output_buffer;
	unsigned start = stream_out->start_component;
	unsigned num_comps = stream_out->num_components;
	LLVMValueRef out[4];

	/* Defensive check: 1-4 components are expected; bail out in release
	 * builds rather than index out of bounds. */
	assert(num_comps && num_comps <= 4);
	if (!num_comps || num_comps > 4)
		return;

	/* Load the output as int. */
	for (int j = 0; j < num_comps; j++) {
		assert(stream_out->stream == shader_out->vertex_stream[start + j]);

		out[j] = LLVMBuildBitCast(builder,
					  shader_out->values[start + j],
					  ctx->i32, "");
	}

	/* Pack the output. */
	LLVMValueRef vdata = NULL;

	switch (num_comps) {
	case 1: /* as i32 */
		vdata = out[0];
		break;
	case 2: /* as v2i32 */
	case 3: /* as v4i32 (aligned to 4) */
	case 4: /* as v4i32 */
		vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
		for (int j = 0; j < num_comps; j++) {
			vdata = LLVMBuildInsertElement(builder, vdata, out[j],
						       LLVMConstInt(ctx->i32, j, 0), "");
		}
		break;
	}

	/* Store at so_write_offsets[buf_idx] + dst_offset (in dwords). */
	ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
				    vdata, num_comps,
				    so_write_offsets[buf_idx],
				    ctx->i32_0,
				    stream_out->dst_offset * 4, 1, 1, true, false);
}
2273
/**
 * Write streamout data to buffers for vertex stream @p stream (different
 * vertex streams can occur for GS copy shaders).
 *
 * \param outputs  shader output values, indexed by output register
 * \param noutput  number of entries in \p outputs
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput, unsigned stream)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct pipe_stream_output_info *so = &sel->so;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4];
		LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
						    ctx->param_rw_buffers);

		for (i = 0; i < 4; i++) {
			/* A zero stride means the buffer is unused. */
			if (!so->stride[i])
				continue;

			LLVMValueRef offset = LLVMConstInt(ctx->i32,
							   SI_VS_STREAMOUT_BUF0 + i, 0);

			so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			/* Per-vertex byte offset: write_index * stride (stride
			 * is in dwords, hence * 4). */
			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned reg = so->output[i].register_index;

			/* Skip declarations that reference outputs the shader
			 * doesn't actually have. */
			if (reg >= noutput)
				continue;

			/* Only emit outputs belonging to the requested stream. */
			if (stream != so->output[i].stream)
				continue;

			emit_streamout_output(ctx, so_buffers, so_write_offset,
					      &so->output[i], &outputs[reg]);
		}
	}
	lp_build_endif(&if_ctx);
}
2358
2359 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2360 LLVMValueRef *values)
2361 {
2362 struct ac_export_args args;
2363
2364 si_llvm_init_export_args(&ctx->bld_base, values,
2365 V_008DFC_SQ_EXP_PARAM + index, &args);
2366 ac_build_export(&ctx->ac, &args);
2367 }
2368
/* Emit PARAM exports for the vertex shader's general outputs and record
 * the export slot assigned to each output in vs_output_param_offset.
 */
static void si_build_param_exports(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	unsigned param_count = 0;

	for (unsigned i = 0; i < noutput; i++) {
		unsigned semantic_name = outputs[i].semantic_name;
		unsigned semantic_index = outputs[i].semantic_index;

		/* Skip outputs where no channel belongs to vertex stream 0;
		 * only stream-0 outputs are exported here. */
		if (outputs[i].vertex_stream[0] != 0 &&
		    outputs[i].vertex_stream[1] != 0 &&
		    outputs[i].vertex_stream[2] != 0 &&
		    outputs[i].vertex_stream[3] != 0)
			continue;

		/* Only these semantics are exported as PARAMs; everything
		 * else (e.g. POSITION, PSIZE) is handled by the POS exports. */
		switch (semantic_name) {
		case TGSI_SEMANTIC_LAYER:
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
		case TGSI_SEMANTIC_CLIPDIST:
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			break;
		default:
			continue;
		}

		/* Skip outputs the shader key marks as unused by the next
		 * stage (kill_outputs optimization). */
		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
		     semantic_index < SI_MAX_IO_GENERIC) &&
		    shader->key.opt.kill_outputs &
		    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
			continue;

		si_export_param(ctx, param_count, outputs[i].values);

		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[i] = param_count++;
	}

	shader->info.nr_param_exports = param_count;
}
2415
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits the position exports (POS0..POS3: position, misc vector, and up to
 * 8 clip distances), then the parameter exports for everything the next
 * stage interpolates.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args pos_args[4] = {};
	/* Collected while scanning the outputs; written into the misc
	 * vector (pos_args[1]) afterwards. */
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned pos_idx;
	int i;

	/* Build position exports. */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			si_llvm_init_export_args(bld_base, outputs[i].values,
						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (!shader->key.opt.clip_disable) {
				/* CLIPDIST index 0 -> POS+2, index 1 -> POS+3. */
				unsigned index = 2 + outputs[i].semantic_index;
				si_llvm_init_export_args(bld_base, outputs[i].values,
							 V_008DFC_SQ_EXP_POS + index,
							 &pos_args[index]);
			}
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (!shader->key.opt.clip_disable) {
				si_llvm_emit_clipvertex(bld_base, pos_args,
							outputs[i].values);
			}
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = base->zero; /* X */
		pos_args[0].out[1] = base->zero; /* Y */
		pos_args[0].out[2] = base->zero; /* Z */
		pos_args[0].out[3] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (shader->selector->info.writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2);

		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = base->zero; /* X */
		pos_args[1].out[1] = base->zero; /* Y */
		pos_args[1].out[2] = base->zero; /* Z */
		pos_args[1].out[3] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = ac_build_umin(&ctx->ac,
						       edgeflag_value,
						       ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
							      edgeflag_value,
							      ctx->f32, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* GFX9 has the layer in out.z[10:0] and the viewport
			 * index in out.z[19:16].
			 */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				LLVMValueRef v = viewport_index_value;

				/* OR the viewport index into bits [19:16] of
				 * the layer value in the Z channel. */
				v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
				v = LLVMBuildShl(ctx->gallivm.builder, v,
						 LLVMConstInt(ctx->i32, 16, 0), "");
				v = LLVMBuildOr(ctx->gallivm.builder, v,
						bitcast(bld_base, TGSI_TYPE_UNSIGNED,
							pos_args[1].out[2]), "");
				pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
				pos_args[1].enabled_channels |= 1 << 2;
			}
		} else {
			/* Pre-GFX9: layer in Z, viewport index in W. */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	/* Count the position exports that were actually filled in. */
	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports. */
	si_build_param_exports(ctx, outputs, noutput);
}
2567
/**
 * Forward all outputs from the vertex shader to the TES. This is only used
 * for the fixed function TCS.
 *
 * Reads each input selected by ff_tcs_inputs_to_copy from LDS (where the
 * LS stage stored it) and writes it to the offchip TES buffer.
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef invocation_id, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

	/* LDS address of this invocation's vertex inputs:
	 * current patch base + invocation_id * per-vertex stride. */
	lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
					 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* One bit per input slot to copy. */
	inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		/* Each slot is a vec4 = 4 dwords in LDS. */
		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
						    LLVMConstInt(ctx->i32, 4 * i, 0),
						    "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
								      get_rel_patch_id(ctx),
								      invocation_id,
								      LLVMConstInt(ctx->i32, i, 0));

		/* Load the whole vec4 (writemask ~0) from LDS... */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
					      lds_ptr);

		/* ...and store it to the offchip buffer. */
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
					    buffer_offset, 0, 1, 0, true, false);
	}
}
2610
/* Write the tessellation factors for the current patch to the tess factor
 * ring buffer (and, if the TES reads them, to the offchip buffer as well).
 * Only invocation 0 of each patch performs the stores.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have finished writing tess factors to
	 * LDS before invocation 0 reads them below. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, ctx->i32_0, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_inner_index * 4, 0), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_outer_index * 4, 0), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->i32);
		outer[i] = LLVMGetUndef(ctx->i32);
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
		outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* Quads need a second store for the trailing 2 dwords. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       ctx->param_tcs_factor_offset);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");

	/* Only the first patch writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, ctx->i32_0, ""));

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->b.chip_class <= VI) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->i32, 0x80000000, 0),
					    1, ctx->i32_0, tf_base,
					    offset, 1, 0, true, false);
		offset += 4;
	}

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, 1, 0, true, false);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, 1, 0, true, false);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index_patch(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_outer, 0));

		outer_vec = lp_build_gather_values(gallivm, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index_patch(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				    lp_build_gather_values(gallivm, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, 1, 0, true, false);
		}
	}

	lp_build_endif(&if_ctx);
}
2773
2774 static LLVMValueRef
2775 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2776 unsigned param, unsigned return_index)
2777 {
2778 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2779 LLVMGetParam(ctx->main_fn, param),
2780 return_index, "");
2781 }
2782
2783 static LLVMValueRef
2784 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2785 unsigned param, unsigned return_index)
2786 {
2787 LLVMBuilderRef builder = ctx->gallivm.builder;
2788 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2789
2790 return LLVMBuildInsertValue(builder, ret,
2791 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2792 return_index, "");
2793 }
2794
2795 static LLVMValueRef
2796 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2797 unsigned param, unsigned return_index)
2798 {
2799 LLVMBuilderRef builder = ctx->gallivm.builder;
2800 LLVMValueRef ptr, lo, hi;
2801
2802 ptr = LLVMGetParam(ctx->main_fn, param);
2803 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2804 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2805 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2806 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2807 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2808 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2809 }
2810
/* This only writes the tessellation factor levels.
 *
 * Copies the fixed-function TCS inputs, then packs the SGPR/VGPR values the
 * TCS epilog part expects into the function's aggregate return value.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* On GFX9, LS and HS are merged into one shader; the values
		 * above were computed inside the wrapped "if". Close it and
		 * build phis so they are valid on the fall-through path too.
		 */
		LLVMBasicBlockRef blocks[2] = {
			LLVMGetInsertBlock(builder),
			ctx->merged_wrap_if_state.entry_block
		};
		LLVMValueRef values[2];

		lp_build_endif(&ctx->merged_wrap_if_state);

		values[0] = rel_patch_id;
		values[1] = LLVMGetUndef(ctx->i32);
		rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = tf_lds_offset;
		values[1] = LLVMGetUndef(ctx->i32);
		tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = invocation_id;
		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
		invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
	}

	/* Return epilog parameters from this function. */
	LLVMValueRef ret = ctx->return_value;
	unsigned vgpr;

	if (ctx->screen->b.chip_class >= GFX9) {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
		vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
	} else {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
					  GFX6_TCS_NUM_USER_SGPR);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
					  GFX6_TCS_NUM_USER_SGPR + 1);
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs */
	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

	/* Leave a hole corresponding to the two input VGPRs. This ensures that
	 * the invocation_id output does not alias the param_tcs_rel_ids input,
	 * which saves a V_MOV on gfx9.
	 */
	vgpr += 2;

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	ctx->return_value = ret;
}
2892
/* Pass TCS inputs from LS to TCS on GFX9.
 *
 * On GFX9 the LS and HS stages are merged; the LS part returns all values
 * the TCS part needs (ring buffers, offsets, layout SGPRs and the two
 * input VGPRs) via the function return value at fixed slot positions.
 * Slot numbers here must match the merged-shader function signature.
 */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* SGPRs 0-5: descriptors and offsets shared by the merged stages. */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* User SGPRs start at slot 8. */
	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
				  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
				  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);

	/* The TCS descriptor pointers follow two params after
	 * tcs_factor_addr_base64k; pass them through as 2x i32.
	 */
	unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);

	/* VGPRs: patch id and rel ids, returned as floats. */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_patch_id, vgpr++);
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_rel_ids, vgpr++);
	ctx->return_value = ret;
}
2930
/* Pass GS inputs from ES to GS on GFX9.
 *
 * Merged ES/GS counterpart of si_set_ls_return_value_for_tcs: the ES part
 * returns the SGPRs and the five vertex-offset/id VGPRs that the GS part
 * consumes. Slot numbers must match the merged-shader signature.
 */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);

	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* GS descriptor pointers follow vs_state_bits; pass as 2x i32. */
	unsigned desc_param = ctx->param_vs_state_bits + 1;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);

	/* VGPRs: gs_vtx01_offset and the following four GS input VGPRs. */
	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
	for (unsigned i = 0; i < 5; i++) {
		unsigned param = ctx->param_gs_vtx01_offset + i;
		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
	}
	ctx->return_value = ret;
}
2955
/* VS-as-LS epilog: store all vertex outputs to LDS so the next stage
 * (TCS aka HS) can read them, addressed by vertex id * per-vertex stride.
 * On GFX9 it also fills the merged-shader return value for the TCS part.
 */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride in dwords is in bits [24:31] of vs_state_bits. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		if (name == TGSI_SEMANTIC_LAYER ||
		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
			continue;

		/* Each output occupies 4 dwords at a slot derived from its
		 * semantic, so the TCS can find it without remapping.
		 */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					LLVMConstInt(ctx->i32, param * 4, 0), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}

	if (ctx->screen->b.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}
3009
/* VS/TES-as-ES epilog: write all outputs to the ESGS ring so the GS can
 * read them. On GFX9 the ESGS ring lives in LDS (merged ES/GS) and the
 * merged-shader return value is filled for the GS part; on older chips
 * the outputs go to a memory ring buffer addressed by es2gs_offset.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	LLVMValueRef lds_base = NULL;
	unsigned chan;
	int i;

	if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
		/* Compute a wave-global vertex index: thread id within the
		 * wave OR'd with wave index * 64 (wave index is in bits
		 * [24:27] of merged_wave_info).
		 */
		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
		LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
		vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
					 LLVMBuildMul(gallivm->builder, wave_idx,
						      LLVMConstInt(ctx->i32, 64, false), ""), "");
		lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
	}

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		int param;

		/* Same rationale as in si_llvm_emit_ls_epilogue: writes to
		 * these in a non-last vertex stage are ignored.
		 */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
						      info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* GFX9 has the ESGS ring in LDS. */
			if (ctx->screen->b.chip_class >= GFX9) {
				lds_store(bld_base, param * 4 + chan, lds_base, out_val);
				continue;
			}

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->esgs_ring,
						    out_val, 1, NULL, soffset,
						    (4 * param + chan) * 4,
						    1, 1, true, true);
		}
	}

	if (ctx->screen->b.chip_class >= GFX9)
		si_set_es_return_value_for_gs(ctx);
}
3065
3066 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3067 {
3068 if (ctx->screen->b.chip_class >= GFX9)
3069 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3070 else
3071 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3072 }
3073
3074 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
3075 {
3076 struct si_shader_context *ctx = si_shader_context(bld_base);
3077
3078 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3079 si_get_gs_wave_id(ctx));
3080
3081 if (ctx->screen->b.chip_class >= GFX9)
3082 lp_build_endif(&ctx->merged_wrap_if_state);
3083 }
3084
/* VS epilog (hardware VS stage): optionally clamp vertex colors, gather
 * all outputs from their allocas, run streamout, optionally append
 * PrimitiveID, and emit the position/parameter exports.
 *
 * \param abi          shader ABI, used to recover the si_shader_context
 * \param max_outputs  capacity of addrs in units of 4-channel outputs
 * \param addrs        alloca addresses, 4 consecutive entries per output
 */
static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* +1 reserves room for the optional PrimitiveID export below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->main_fn,
						    ctx->param_vs_state_bits);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all 4 channels of this color output in place. */
			for (j = 0; j < 4; j++) {
				addr = addrs[4 * i + j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = ac_build_clamp(&ctx->ac, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Load every output channel into the export array, along with its
	 * per-channel vertex stream (2 bits per channel in output_streams).
	 */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		for (j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      addrs[4 * i + j],
					      "");
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	if (ctx->shader->selector->so.num_outputs)
		si_llvm_emit_streamout(ctx, outputs, i, 0);

	/* Export PrimitiveID. */
	if (ctx->shader->key.mono.u.vs_export_prim_id) {
		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
		outputs[i].semantic_index = 0;
		outputs[i].values[0] = LLVMBuildBitCast(gallivm->builder,
				get_primitive_id(ctx, 0), ctx->f32, "");
		for (j = 1; j < 4; j++)
			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);

		memset(outputs[i].vertex_stream, 0,
		       sizeof(outputs[i].vertex_stream));
		i++;
	}

	si_llvm_export_vs(&ctx->bld_base, outputs, i);
	FREE(outputs);
}
3172
3173 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3174 {
3175 struct si_shader_context *ctx = si_shader_context(bld_base);
3176
3177 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3178 &ctx->outputs[0][0]);
3179 }
3180
/* Queue of pending pixel-shader exports, flushed by si_emit_ps_exports. */
struct si_ps_exports {
	unsigned num;			/* number of valid entries in args[] */
	struct ac_export_args args[10];	/* queued color/MRTZ exports */
};
3185
3186 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3187 bool writes_samplemask)
3188 {
3189 if (writes_z) {
3190 /* Z needs 32 bits. */
3191 if (writes_samplemask)
3192 return V_028710_SPI_SHADER_32_ABGR;
3193 else if (writes_stencil)
3194 return V_028710_SPI_SHADER_32_GR;
3195 else
3196 return V_028710_SPI_SHADER_32_R;
3197 } else if (writes_stencil || writes_samplemask) {
3198 /* Both stencil and sample mask need only 16 bits. */
3199 return V_028710_SPI_SHADER_UINT16_ABGR;
3200 } else {
3201 return V_028710_SPI_SHADER_ZERO;
3202 }
3203 }
3204
/* Queue the MRTZ (depth/stencil/samplemask) export.
 *
 * The export format and channel layout depend on which of the three
 * values are present (see si_get_spi_shader_z_format). The export is
 * only queued in \p exp; it is emitted later by si_emit_ps_exports.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args;
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args.valid_mask = 1; /* whether the EXEC mask is valid */
	args.done = 1; /* DONE bit */

	/* Specify the target we are exporting */
	args.target = V_008DFC_SQ_EXP_MRTZ;

	args.compr = 0; /* COMPR flag */
	args.out[0] = base->undef; /* R, depth */
	args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args.out[2] = base->undef; /* B, sample mask */
	args.out[3] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		assert(!depth);
		args.compr = 1; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			/* Compressed export: 0x3 enables the low 16-bit pair. */
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args.out[1] = samplemask;
			mask |= 0xc;
		}
	} else {
		if (depth) {
			args.out[0] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args.out[1] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args.out[2] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND and HAINAN) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND &&
	    ctx->screen->b.family != CHIP_HAINAN)
		mask |= 0x1;

	/* Specify which components to enable */
	args.enabled_channels = mask;

	memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
3275
/* Queue the color export(s) for one color output.
 *
 * Applies the PS epilog key transforms in order: color clamp, alpha-to-one,
 * alpha test (MRT0 only), and line/polygon smoothing. If the key requests
 * FS_COLOR0_WRITES_ALL_CBUFS, the single color is broadcast to all bound
 * color buffers. Exports are queued in \p exp, not emitted directly.
 *
 * \param color             4 channel values (may be modified in place)
 * \param index             color buffer index of this output
 * \param samplemask_param  function param index of the sample coverage input
 * \param is_last           true for the final color export (sets DONE/valid)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.part.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = ac_build_clamp(&ctx->ac, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test */
	if (index == 0 &&
	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
		struct ac_export_args args[8];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
			if (args[c].enabled_channels)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
				args[c].done = 1; /* DONE bit */
			} else if (!args[c].enabled_channels)
				continue; /* unnecessary NULL export */

			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
		}
	} else {
		struct ac_export_args args;

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 &args);
		if (is_last) {
			args.valid_mask = 1; /* whether the EXEC mask is valid */
			args.done = 1; /* DONE bit */
		} else if (!args.enabled_channels)
			return; /* unnecessary NULL export */

		memcpy(&exp->args[exp->num++], &args, sizeof(args));
	}
}
3342
3343 static void si_emit_ps_exports(struct si_shader_context *ctx,
3344 struct si_ps_exports *exp)
3345 {
3346 for (unsigned i = 0; i < exp->num; i++)
3347 ac_build_export(&ctx->ac, &exp->args[i]);
3348 }
3349
3350 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3351 {
3352 struct si_shader_context *ctx = si_shader_context(bld_base);
3353 struct lp_build_context *base = &bld_base->base;
3354 struct ac_export_args args;
3355
3356 args.enabled_channels = 0x0; /* enabled channels */
3357 args.valid_mask = 1; /* whether the EXEC mask is valid */
3358 args.done = 1; /* DONE bit */
3359 args.target = V_008DFC_SQ_EXP_NULL;
3360 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3361 args.out[0] = base->undef; /* R */
3362 args.out[1] = base->undef; /* G */
3363 args.out[2] = base->undef; /* B */
3364 args.out[3] = base->undef; /* A */
3365
3366 ac_build_export(&ctx->ac, &args);
3367 }
3368
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 */
static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
				      unsigned max_outputs,
				      LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Apply a discard that was postponed until the end of the shader. */
	if (ctx->postponed_kill)
		ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = addrs[4 * i + j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of position is exported as depth. */
			depth = LLVMBuildLoad(builder,
					      addrs[4 * i + 2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			/* Stencil is in the Y channel. */
			stencil = LLVMBuildLoad(builder,
						addrs[4 * i + 1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   addrs[4 * i + 0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMBuildBitCast(ctx->ac.builder,
						LLVMGetParam(ctx->main_fn,
							SI_PARAM_ALPHA_REF),
						ctx->i32, ""),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		/* Skip color outputs that were never written. */
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3467
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 *
 * \param pvgpr  if non-NULL, the pointed-to value is routed through the
 *               asm ("=v,0" constraint) and replaced with the tied result.
 */
static void emit_optimization_barrier(struct si_shader_context *ctx,
				      LLVMValueRef *pvgpr)
{
	/* Makes each barrier's asm string unique so LLVM cannot merge them.
	 * NOTE(review): plain int incremented via p_atomic_inc_return;
	 * uniqueness across threads relies on that atomic.
	 */
	static int counter = 0;

	LLVMBuilderRef builder = ctx->gallivm.builder;
	char code[16];

	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

	if (!pvgpr) {
		/* Pure barrier: void asm with side effects, no operands. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
	} else {
		/* Pass the first dword of *pvgpr through the asm so the value
		 * (and anything computing it) cannot be hoisted past here.
		 */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
		LLVMValueRef vgpr = *pvgpr;
		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
		unsigned vgpr_size = llvm_get_type_size(vgpr_type);
		LLVMValueRef vgpr0;

		assert(vgpr_size % 4 == 0);

		/* View the value as a vector of i32, thread element 0 through
		 * the asm, then reassemble and cast back to the original type.
		 */
		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

		*pvgpr = vgpr;
	}
}
3508
3509 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3510 {
3511 struct gallivm_state *gallivm = &ctx->gallivm;
3512 LLVMBuilderRef builder = gallivm->builder;
3513 LLVMValueRef args[1] = {
3514 LLVMConstInt(ctx->i32, simm16, 0)
3515 };
3516 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3517 ctx->voidt, args, 1, 0);
3518 }
3519
/* Emit a TGSI MEMBAR as an s_waitcnt.
 *
 * waitcnt starts as NOOP_WAITCNT and each barrier flag ANDs in a mask;
 * presumably clearing a counter field makes s_waitcnt wait for that
 * counter (VM_CNT for memory, LGKM_CNT for LDS/constant) — semantics
 * come from the waitcnt encoding, not visible here. If no flag applies,
 * no instruction is emitted.
 */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	/* The barrier flags are an immediate in src0. */
	LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
	unsigned flags = LLVMConstIntGetZExtValue(src0);
	unsigned waitcnt = NOOP_WAITCNT;

	if (flags & TGSI_MEMBAR_THREAD_GROUP)
		waitcnt &= VM_CNT & LGKM_CNT;

	if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
		     TGSI_MEMBAR_SHADER_BUFFER |
		     TGSI_MEMBAR_SHADER_IMAGE))
		waitcnt &= VM_CNT;

	if (flags & TGSI_MEMBAR_SHARED)
		waitcnt &= LGKM_CNT;

	if (waitcnt != NOOP_WAITCNT)
		si_emit_waitcnt(ctx, waitcnt);
}
3544
3545 static void clock_emit(
3546 const struct lp_build_tgsi_action *action,
3547 struct lp_build_tgsi_context *bld_base,
3548 struct lp_build_emit_data *emit_data)
3549 {
3550 struct si_shader_context *ctx = si_shader_context(bld_base);
3551 struct gallivm_state *gallivm = &ctx->gallivm;
3552 LLVMValueRef tmp;
3553
3554 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3555 ctx->i64, NULL, 0, 0);
3556 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3557
3558 emit_data->output[0] =
3559 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3560 emit_data->output[1] =
3561 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3562 }
3563
3564 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3565 {
3566 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3567 CONST_ADDR_SPACE);
3568 }
3569
/* Emit DDX/DDY (coarse and _FINE variants) on the first argument.
 *
 * The TID mask selects which lane each quad reads its reference value
 * from; idx selects the neighbor lane (next X pixel for DDX, next Y
 * pixel for DDY). The heavy lifting is done by ac_build_ddxy.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned opcode = emit_data->info->opcode;
	LLVMValueRef val;
	int idx;
	unsigned mask;

	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = AC_TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = AC_TID_MASK_TOP;
	else
		mask = AC_TID_MASK_TOP_LEFT;

	/* for DDX we want the next X pixel, DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;

	val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
	val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
			    mask, idx, ctx->lds, val);
	emit_data->output[emit_data->chan] = val;
}
3597
3598 /*
3599 * this takes an I,J coordinate pair,
3600 * and works out the X and Y derivatives.
3601 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3602 */
3603 static LLVMValueRef si_llvm_emit_ddxy_interp(
3604 struct lp_build_tgsi_context *bld_base,
3605 LLVMValueRef interp_ij)
3606 {
3607 struct si_shader_context *ctx = si_shader_context(bld_base);
3608 struct gallivm_state *gallivm = &ctx->gallivm;
3609 LLVMValueRef result[4], a;
3610 unsigned i;
3611
3612 for (i = 0; i < 2; i++) {
3613 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3614 LLVMConstInt(ctx->i32, i, 0), "");
3615 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3616 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3617 }
3618
3619 return lp_build_gather_values(gallivm, result, 4);
3620 }
3621
/* Fetch the auxiliary arguments for INTERP_OFFSET / INTERP_SAMPLE.
 *
 * INTERP_OFFSET: args[0..1] = the X/Y offset taken from src1.
 * INTERP_SAMPLE: args[0..1] = the position of the requested sample
 * (looked up from its sample id) re-centered by subtracting 0.5.
 * INTERP_CENTROID takes no extra arguments, so nothing is fetched.
 */
static void interp_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
		/* offset is in second src, first two channels */
		emit_data->args[0] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_X);
		emit_data->args[1] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_Y);
		emit_data->arg_count = 2;
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef sample_position;
		LLVMValueRef sample_id;
		LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);

		/* fetch sample ID, then fetch its sample position,
		 * and place into first two channels.
		 */
		sample_id = lp_build_emit_fetch(bld_base,
						emit_data->inst, 1, TGSI_CHAN_X);
		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
					     ctx->i32, "");
		sample_position = load_sample_position(ctx, sample_id);

		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_0, "");

		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_1, "");
		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
		emit_data->arg_count = 2;
	}
}
3665
/* Emit INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET.
 *
 * Resolves the (possibly indirectly indexed) input being interpolated,
 * selects the base barycentric parameter for the requested location,
 * adjusts I/J with the offset or sample position (using DDX/DDY of the
 * barycentrics), and interpolates each channel with si_build_fs_interp.
 * For indirect inputs, every element of the input array is interpolated
 * into a gather vector and the wanted element is extracted at the end.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *input = &inst->Src[0];
	int input_base, input_array_size;
	int chan;
	int i;
	LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
	LLVMValueRef array_idx;
	int interp_param_idx;
	unsigned interp;
	unsigned location;

	assert(input->Register.File == TGSI_FILE_INPUT);

	/* Work out the range of inputs a (possibly indirect) index can hit. */
	if (input->Register.Indirect) {
		unsigned array_id = input->Indirect.ArrayID;

		if (array_id) {
			input_base = info->input_array_first[array_id];
			input_array_size = info->input_array_last[array_id] - input_base + 1;
		} else {
			input_base = inst->Src[0].Register.Index;
			input_array_size = info->num_inputs - input_base;
		}

		array_idx = si_get_indirect_index(ctx, &input->Indirect,
						  input->Register.Index - input_base);
	} else {
		input_base = inst->Src[0].Register.Index;
		input_array_size = 1;
		array_idx = ctx->i32_0;
	}

	interp = shader->selector->info.input_interpolate[input_base];

	/* OFFSET and SAMPLE start from the center barycentrics;
	 * INTERP_CENTROID uses the centroid ones.
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	else
		interp_param = NULL; /* flat interpolation: no barycentrics */

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
		}
		interp_param = lp_build_gather_values(gallivm, ij_out, 2);
	}

	if (interp_param) {
		interp_param = LLVMBuildBitCast(gallivm->builder,
			interp_param, LLVMVectorType(ctx->f32, 2), "");
	}

	for (chan = 0; chan < 4; chan++) {
		/* Interpolate every array element, then pick array_idx. */
		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);

		for (unsigned idx = 0; idx < input_array_size; ++idx) {
			LLVMValueRef v, i = NULL, j = NULL;

			if (interp_param) {
				interp_param = LLVMBuildBitCast(gallivm->builder,
					interp_param, LLVMVectorType(ctx->f32, 2), "");
				i = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_0, "");
				j = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_1, "");
			}
			v = si_build_fs_interp(ctx, input_base + idx, schan,
					       prim_mask, i, j);

			gather = LLVMBuildInsertElement(gallivm->builder,
				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
		}

		emit_data->output[chan] = LLVMBuildExtractElement(
			gallivm->builder, gather, array_idx, "");
	}
}
3792
3793 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3794 LLVMValueRef value)
3795 {
3796 struct gallivm_state *gallivm = &ctx->gallivm;
3797 LLVMValueRef args[3] = {
3798 value,
3799 ctx->i32_0,
3800 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3801 };
3802
3803 /* We currently have no other way to prevent LLVM from lifting the icmp
3804 * calls to a dominating basic block.
3805 */
3806 emit_optimization_barrier(ctx, &args[0]);
3807
3808 if (LLVMTypeOf(args[0]) != ctx->i32)
3809 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3810
3811 return lp_build_intrinsic(gallivm->builder,
3812 "llvm.amdgcn.icmp.i32",
3813 ctx->i64, args, 3,
3814 LP_FUNC_ATTR_NOUNWIND |
3815 LP_FUNC_ATTR_READNONE |
3816 LP_FUNC_ATTR_CONVERGENT);
3817 }
3818
3819 static void vote_all_emit(
3820 const struct lp_build_tgsi_action *action,
3821 struct lp_build_tgsi_context *bld_base,
3822 struct lp_build_emit_data *emit_data)
3823 {
3824 struct si_shader_context *ctx = si_shader_context(bld_base);
3825 struct gallivm_state *gallivm = &ctx->gallivm;
3826 LLVMValueRef active_set, vote_set;
3827 LLVMValueRef tmp;
3828
3829 active_set = si_emit_ballot(ctx, ctx->i32_1);
3830 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3831
3832 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3833 emit_data->output[emit_data->chan] =
3834 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3835 }
3836
3837 static void vote_any_emit(
3838 const struct lp_build_tgsi_action *action,
3839 struct lp_build_tgsi_context *bld_base,
3840 struct lp_build_emit_data *emit_data)
3841 {
3842 struct si_shader_context *ctx = si_shader_context(bld_base);
3843 struct gallivm_state *gallivm = &ctx->gallivm;
3844 LLVMValueRef vote_set;
3845 LLVMValueRef tmp;
3846
3847 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3848
3849 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3850 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3851 emit_data->output[emit_data->chan] =
3852 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3853 }
3854
3855 static void vote_eq_emit(
3856 const struct lp_build_tgsi_action *action,
3857 struct lp_build_tgsi_context *bld_base,
3858 struct lp_build_emit_data *emit_data)
3859 {
3860 struct si_shader_context *ctx = si_shader_context(bld_base);
3861 struct gallivm_state *gallivm = &ctx->gallivm;
3862 LLVMValueRef active_set, vote_set;
3863 LLVMValueRef all, none, tmp;
3864
3865 active_set = si_emit_ballot(ctx, ctx->i32_1);
3866 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3867
3868 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3869 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3870 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3871 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3872 emit_data->output[emit_data->chan] =
3873 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3874 }
3875
3876 static void ballot_emit(
3877 const struct lp_build_tgsi_action *action,
3878 struct lp_build_tgsi_context *bld_base,
3879 struct lp_build_emit_data *emit_data)
3880 {
3881 struct si_shader_context *ctx = si_shader_context(bld_base);
3882 LLVMBuilderRef builder = ctx->gallivm.builder;
3883 LLVMValueRef tmp;
3884
3885 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3886 tmp = si_emit_ballot(ctx, tmp);
3887 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3888
3889 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3890 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3891 }
3892
/* Fetch the operands of TGSI READ_INVOC:
 * src0 = the value to read, src1 = the source invocation (lane) index.
 */
static void read_invoc_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	/* The value is fetched from the channel currently being emitted. */
	emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 0, emit_data->src_chan);

	/* Always read the source invocation (= lane) from the X channel. */
	emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 1, TGSI_CHAN_X);
	emit_data->arg_count = 2;
}
3905
3906 static void read_lane_emit(
3907 const struct lp_build_tgsi_action *action,
3908 struct lp_build_tgsi_context *bld_base,
3909 struct lp_build_emit_data *emit_data)
3910 {
3911 struct si_shader_context *ctx = si_shader_context(bld_base);
3912 LLVMBuilderRef builder = ctx->gallivm.builder;
3913
3914 /* We currently have no other way to prevent LLVM from lifting the icmp
3915 * calls to a dominating basic block.
3916 */
3917 emit_optimization_barrier(ctx, &emit_data->args[0]);
3918
3919 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3920 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3921 ctx->i32, "");
3922 }
3923
3924 emit_data->output[emit_data->chan] =
3925 ac_build_intrinsic(&ctx->ac, action->intr_name,
3926 ctx->i32, emit_data->args, emit_data->arg_count,
3927 AC_FUNC_ATTR_READNONE |
3928 AC_FUNC_ATTR_CONVERGENT);
3929 }
3930
3931 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3932 struct lp_build_emit_data *emit_data)
3933 {
3934 struct si_shader_context *ctx = si_shader_context(bld_base);
3935 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3936 LLVMValueRef imm;
3937 unsigned stream;
3938
3939 assert(src0.File == TGSI_FILE_IMMEDIATE);
3940
3941 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
3942 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
3943 return stream;
3944 }
3945
/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_if_state if_state;
	/* Per-wave GS->VS ring offset, passed in as an SGPR. */
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_gs2vs_offset);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	unsigned chan, offset;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 *
	 * If the shader has no writes to memory, kill it instead. This skips
	 * further memory loads and may allow LLVM to skip to the end
	 * altogether.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->i32,
					      shader->selector->gs_max_out_vertices, 0), "");

	bool use_kill = !info->writes_memory;
	if (use_kill) {
		/* +1.0 keeps the thread alive, -1.0 discards it. */
		kill = lp_build_select(&bld_base->base, can_emit,
				       LLVMConstReal(ctx->f32, 1.0f),
				       LLVMConstReal(ctx->f32, -1.0f));

		ac_build_kill(&ctx->ac, kill);
	} else {
		lp_build_if(&if_state, gallivm, can_emit);
	}

	/* Store every enabled channel of this stream to the GSVS ring.
	 * The ring is attribute-major: "offset" counts stored channels, and
	 * each channel slot is gs_max_out_vertices entries wide, indexed by
	 * the current vertex.
	 */
	offset = 0;
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];

		for (chan = 0; chan < 4; chan++) {
			/* Skip unused channels and channels bound to a
			 * different vertex stream. */
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			LLVMValueRef voffset =
				LLVMConstInt(ctx->i32, offset *
					     shader->selector->gs_max_out_vertices, 0);
			offset++;

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			/* Convert the dword index to a byte offset. */
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, soffset, 0,
						    1, 1, true, true);
		}
	}

	/* Bump the per-stream emitted-vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      ctx->i32_1);

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 si_get_gs_wave_id(ctx));
	if (!use_kill)
		lp_build_endif(&if_state);
}
4035
4036 /* Cut one primitive from the geometry shader */
4037 static void si_llvm_emit_primitive(
4038 const struct lp_build_tgsi_action *action,
4039 struct lp_build_tgsi_context *bld_base,
4040 struct lp_build_emit_data *emit_data)
4041 {
4042 struct si_shader_context *ctx = si_shader_context(bld_base);
4043 unsigned stream;
4044
4045 /* Signal primitive cut */
4046 stream = si_llvm_get_stream(bld_base, emit_data);
4047 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4048 si_get_gs_wave_id(ctx));
4049 }
4050
/* Emit a workgroup barrier (TGSI BARRIER). */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	/* SI only (thanks to a hw bug workaround):
	 * The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* NOTE: "&" is intentional, not a typo for "|": waitcnt
		 * masks are combined by AND-ing (each mask clears only its
		 * own counter field), so this waits for both LGKM and VM.
		 */
		si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
		return;
	}

	lp_build_intrinsic(gallivm->builder,
			   "llvm.amdgcn.s.barrier",
			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
}
4072
/* Shared fetch/emit pair for the TGSI interpolation opcodes
 * (INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET).
 */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4077
/* Create the main LLVM function for a shader part and apply the parameter
 * and function attributes the backend relies on.
 *
 * \param returns             return value types (used by merged shaders and
 *                            parts that pass values to an epilog)
 * \param fninfo              parameter types, SGPR count, and optional
 *                            assignment targets for parameter values
 * \param max_workgroup_size  if non-zero, recorded as the
 *                            "amdgpu-max-work-group-size" attribute
 */
static void si_create_function(struct si_shader_context *ctx,
			       const char *name,
			       LLVMTypeRef *returns, unsigned num_returns,
			       struct si_function_info *fninfo,
			       unsigned max_workgroup_size)
{
	int i;

	si_llvm_create_func(ctx, name, returns, num_returns,
			    fninfo->types, fninfo->num_params);
	ctx->return_value = LLVMGetUndef(ctx->return_type);

	for (i = 0; i < fninfo->num_sgpr_params; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			/* Attribute indices are 1-based; 0 is the return value. */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			/* Non-pointer SGPR parameters are marked inreg. */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
	}

	/* Store each parameter value wherever the declaration asked for it. */
	for (i = 0; i < fninfo->num_params; ++i) {
		if (fninfo->assign[i])
			*fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
	}

	if (max_workgroup_size) {
		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
				      max_workgroup_size);
	}
	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
					   "no-signed-zeros-fp-math",
					   "true");

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
4137
4138 static void declare_streamout_params(struct si_shader_context *ctx,
4139 struct pipe_stream_output_info *so,
4140 struct si_function_info *fninfo)
4141 {
4142 int i;
4143
4144 /* Streamout SGPRs. */
4145 if (so->num_outputs) {
4146 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4147 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4148 else
4149 ctx->param_streamout_config = fninfo->num_params - 1;
4150
4151 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4152 }
4153 /* A streamout buffer offset is loaded if the stride is non-zero. */
4154 for (i = 0; i < 4; i++) {
4155 if (!so->stride[i])
4156 continue;
4157
4158 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4159 }
4160 }
4161
4162 static unsigned llvm_get_type_size(LLVMTypeRef type)
4163 {
4164 LLVMTypeKind kind = LLVMGetTypeKind(type);
4165
4166 switch (kind) {
4167 case LLVMIntegerTypeKind:
4168 return LLVMGetIntTypeWidth(type) / 8;
4169 case LLVMFloatTypeKind:
4170 return 4;
4171 case LLVMPointerTypeKind:
4172 return 8;
4173 case LLVMVectorTypeKind:
4174 return LLVMGetVectorSize(type) *
4175 llvm_get_type_size(LLVMGetElementType(type));
4176 case LLVMArrayTypeKind:
4177 return LLVMGetArrayLength(type) *
4178 llvm_get_type_size(LLVMGetElementType(type));
4179 default:
4180 assert(0);
4181 return 0;
4182 }
4183 }
4184
4185 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4186 {
4187 struct gallivm_state *gallivm = &ctx->gallivm;
4188
4189 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4190 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4191 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4192 "lds");
4193 }
4194
4195 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4196 {
4197 switch (shader->selector->type) {
4198 case PIPE_SHADER_TESS_CTRL:
4199 /* Return this so that LLVM doesn't remove s_barrier
4200 * instructions on chips where we use s_barrier. */
4201 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4202
4203 case PIPE_SHADER_GEOMETRY:
4204 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4205
4206 case PIPE_SHADER_COMPUTE:
4207 break; /* see below */
4208
4209 default:
4210 return 0;
4211 }
4212
4213 const unsigned *properties = shader->selector->info.properties;
4214 unsigned max_work_group_size =
4215 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4216 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4217 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4218
4219 if (!max_work_group_size) {
4220 /* This is a variable group size compute shader,
4221 * compile it for the maximum possible group size.
4222 */
4223 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4224 }
4225 return max_work_group_size;
4226 }
4227
4228 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4229 struct si_function_info *fninfo,
4230 bool assign_params)
4231 {
4232 unsigned const_and_shader_buffers =
4233 add_arg(fninfo, ARG_SGPR,
4234 si_const_array(ctx->v4i32,
4235 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
4236 unsigned samplers_and_images =
4237 add_arg(fninfo, ARG_SGPR,
4238 si_const_array(ctx->v8i32,
4239 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4240
4241 if (assign_params) {
4242 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4243 ctx->param_samplers_and_images = samplers_and_images;
4244 }
4245 }
4246
/* Declare the descriptor SGPRs common to non-merged shaders: the RW buffer
 * array followed by the per-stage descriptor arrays.
 */
static void declare_default_desc_pointers(struct si_shader_context *ctx,
					  struct si_function_info *fninfo)
{
	ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
		si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
	declare_per_stage_desc_pointers(ctx, fninfo, true);
}
4254
/* Declare the SGPR inputs specific to vertex shaders: vertex buffer
 * descriptors and the draw parameters. The add_arg order defines the user
 * SGPR layout.
 */
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
					    struct si_function_info *fninfo)
{
	ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
		si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
	ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
}
4265
/* Declare the VGPR inputs of a vertex shader. The declaration order defines
 * the VGPR layout: VertexID, then either RelAutoID+InstanceID (as_ls) or
 * InstanceID+PrimID, one unused VGPR, and finally one vertex-load index per
 * shader input when a prolog fetches vertices.
 */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   struct si_function_info *fninfo,
				   unsigned *num_prolog_vgprs)
{
	struct si_shader *shader = ctx->shader;

	add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
	if (shader->key.as_ls) {
		ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
	} else {
		add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
		ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
	}
	add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */

	if (!shader->is_gs_copy_shader) {
		/* Vertex load indices. */
		ctx->param_vertex_index0 = fninfo->num_params;
		for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
			add_arg(fninfo, ARG_VGPR, ctx->i32);
		/* These VGPRs are filled in by the prolog, not by callers. */
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
4290
/* Declare the VGPR inputs of a tessellation evaluation shader:
 * the (u, v) tess coordinates, the relative patch ID, and the patch ID.
 */
static void declare_tes_input_vgprs(struct si_shader_context *ctx,
				    struct si_function_info *fninfo)
{
	ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
	ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
}
4299
enum {
	/* Convenient merged shader definitions.
	 *
	 * GFX9 merges LS+HS and ES+GS into single hardware stages; these
	 * values extend the PIPE_SHADER_* namespace so create_function can
	 * switch over the merged cases as well.
	 */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
4305
4306 static void create_function(struct si_shader_context *ctx)
4307 {
4308 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
4309 struct gallivm_state *gallivm = &ctx->gallivm;
4310 struct si_shader *shader = ctx->shader;
4311 struct si_function_info fninfo;
4312 LLVMTypeRef returns[16+32*4];
4313 unsigned i, num_return_sgprs;
4314 unsigned num_returns = 0;
4315 unsigned num_prolog_vgprs = 0;
4316 unsigned type = ctx->type;
4317
4318 si_init_function_info(&fninfo);
4319
4320 /* Set MERGED shaders. */
4321 if (ctx->screen->b.chip_class >= GFX9) {
4322 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4323 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4324 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4325 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4326 }
4327
4328 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4329
4330 switch (type) {
4331 case PIPE_SHADER_VERTEX:
4332 declare_default_desc_pointers(ctx, &fninfo);
4333 declare_vs_specific_input_sgprs(ctx, &fninfo);
4334
4335 if (shader->key.as_es) {
4336 assert(!shader->selector->nir);
4337 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4338 } else if (shader->key.as_ls) {
4339 assert(!shader->selector->nir);
4340 /* no extra parameters */
4341 } else {
4342 if (shader->is_gs_copy_shader) {
4343 fninfo.num_params = ctx->param_rw_buffers + 1;
4344 fninfo.num_sgpr_params = fninfo.num_params;
4345 }
4346
4347 /* The locations of the other parameters are assigned dynamically. */
4348 declare_streamout_params(ctx, &shader->selector->so,
4349 &fninfo);
4350 }
4351
4352 /* VGPRs */
4353 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4354 break;
4355
4356 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4357 declare_default_desc_pointers(ctx, &fninfo);
4358 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4359 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4360 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4361 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4362 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4363 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4364 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4365 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4366
4367 /* VGPRs */
4368 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4369 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4370
4371 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4372 * placed after the user SGPRs.
4373 */
4374 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4375 returns[num_returns++] = ctx->i32; /* SGPRs */
4376 for (i = 0; i < 5; i++)
4377 returns[num_returns++] = ctx->f32; /* VGPRs */
4378 break;
4379
4380 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4381 /* Merged stages have 8 system SGPRs at the beginning. */
4382 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4383 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4384 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4385 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4386 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4387 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4388 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4389 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4390
4391 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4392 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4393 declare_per_stage_desc_pointers(ctx, &fninfo,
4394 ctx->type == PIPE_SHADER_VERTEX);
4395 declare_vs_specific_input_sgprs(ctx, &fninfo);
4396
4397 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4398 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4399 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4400 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4401 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4402 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4403
4404 declare_per_stage_desc_pointers(ctx, &fninfo,
4405 ctx->type == PIPE_SHADER_TESS_CTRL);
4406
4407 /* VGPRs (first TCS, then VS) */
4408 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4409 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4410
4411 if (ctx->type == PIPE_SHADER_VERTEX) {
4412 declare_vs_input_vgprs(ctx, &fninfo,
4413 &num_prolog_vgprs);
4414
4415 /* LS return values are inputs to the TCS main shader part. */
4416 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4417 returns[num_returns++] = ctx->i32; /* SGPRs */
4418 for (i = 0; i < 2; i++)
4419 returns[num_returns++] = ctx->f32; /* VGPRs */
4420 } else {
4421 /* TCS return values are inputs to the TCS epilog.
4422 *
4423 * param_tcs_offchip_offset, param_tcs_factor_offset,
4424 * param_tcs_offchip_layout, and param_rw_buffers
4425 * should be passed to the epilog.
4426 */
4427 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4428 returns[num_returns++] = ctx->i32; /* SGPRs */
4429 for (i = 0; i < 5; i++)
4430 returns[num_returns++] = ctx->f32; /* VGPRs */
4431 }
4432 break;
4433
4434 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4435 /* Merged stages have 8 system SGPRs at the beginning. */
4436 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4437 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4438 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4439 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4440 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4441 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4442 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4443 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4444
4445 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4446 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4447 declare_per_stage_desc_pointers(ctx, &fninfo,
4448 (ctx->type == PIPE_SHADER_VERTEX ||
4449 ctx->type == PIPE_SHADER_TESS_EVAL));
4450 if (ctx->type == PIPE_SHADER_VERTEX) {
4451 declare_vs_specific_input_sgprs(ctx, &fninfo);
4452 } else {
4453 /* TESS_EVAL (and also GEOMETRY):
4454 * Declare as many input SGPRs as the VS has. */
4455 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4456 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4457 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4458 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4459 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4460 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4461 }
4462
4463 declare_per_stage_desc_pointers(ctx, &fninfo,
4464 ctx->type == PIPE_SHADER_GEOMETRY);
4465
4466 /* VGPRs (first GS, then VS/TES) */
4467 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4468 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4469 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4470 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4471 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4472
4473 if (ctx->type == PIPE_SHADER_VERTEX) {
4474 declare_vs_input_vgprs(ctx, &fninfo,
4475 &num_prolog_vgprs);
4476 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4477 declare_tes_input_vgprs(ctx, &fninfo);
4478 }
4479
4480 if (ctx->type == PIPE_SHADER_VERTEX ||
4481 ctx->type == PIPE_SHADER_TESS_EVAL) {
4482 /* ES return values are inputs to GS. */
4483 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4484 returns[num_returns++] = ctx->i32; /* SGPRs */
4485 for (i = 0; i < 5; i++)
4486 returns[num_returns++] = ctx->f32; /* VGPRs */
4487 }
4488 break;
4489
4490 case PIPE_SHADER_TESS_EVAL:
4491 declare_default_desc_pointers(ctx, &fninfo);
4492 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4493 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4494
4495 if (shader->key.as_es) {
4496 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4497 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4498 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4499 } else {
4500 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4501 declare_streamout_params(ctx, &shader->selector->so,
4502 &fninfo);
4503 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4504 }
4505
4506 /* VGPRs */
4507 declare_tes_input_vgprs(ctx, &fninfo);
4508 break;
4509
4510 case PIPE_SHADER_GEOMETRY:
4511 declare_default_desc_pointers(ctx, &fninfo);
4512 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4513 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4514
4515 /* VGPRs */
4516 ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4517 ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4518 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4519 ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4520 ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4521 ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4522 ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4523 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4524 break;
4525
4526 case PIPE_SHADER_FRAGMENT:
4527 declare_default_desc_pointers(ctx, &fninfo);
4528 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4529 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4530
4531 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4532 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4533 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4534 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4535 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4536 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4537 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4538 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4539 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4540 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4541 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4542 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4543 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4544 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4545 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4546 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4547 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4548 &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4549 shader->info.face_vgpr_index = 20;
4550 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4551 &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4552 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4553 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4554 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4555
4556 /* Color inputs from the prolog. */
4557 if (shader->selector->info.colors_read) {
4558 unsigned num_color_elements =
4559 util_bitcount(shader->selector->info.colors_read);
4560
4561 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4562 for (i = 0; i < num_color_elements; i++)
4563 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4564
4565 num_prolog_vgprs += num_color_elements;
4566 }
4567
4568 /* Outputs for the epilog. */
4569 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4570 num_returns =
4571 num_return_sgprs +
4572 util_bitcount(shader->selector->info.colors_written) * 4 +
4573 shader->selector->info.writes_z +
4574 shader->selector->info.writes_stencil +
4575 shader->selector->info.writes_samplemask +
4576 1 /* SampleMaskIn */;
4577
4578 num_returns = MAX2(num_returns,
4579 num_return_sgprs +
4580 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4581
4582 for (i = 0; i < num_return_sgprs; i++)
4583 returns[i] = ctx->i32;
4584 for (; i < num_returns; i++)
4585 returns[i] = ctx->f32;
4586 break;
4587
4588 case PIPE_SHADER_COMPUTE:
4589 declare_default_desc_pointers(ctx, &fninfo);
4590 if (shader->selector->info.uses_grid_size)
4591 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4592 if (shader->selector->info.uses_block_size)
4593 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4594
4595 for (i = 0; i < 3; i++) {
4596 ctx->param_block_id[i] = -1;
4597 if (shader->selector->info.uses_block_id[i])
4598 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4599 }
4600
4601 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4602 break;
4603 default:
4604 assert(0 && "unimplemented shader");
4605 return;
4606 }
4607
4608 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4609 si_get_max_workgroup_size(shader));
4610
4611 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4612 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4613 ctx->separate_prolog) {
4614 si_llvm_add_attribute(ctx->main_fn,
4615 "InitialPSInputAddr",
4616 S_0286D0_PERSP_SAMPLE_ENA(1) |
4617 S_0286D0_PERSP_CENTER_ENA(1) |
4618 S_0286D0_PERSP_CENTROID_ENA(1) |
4619 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4620 S_0286D0_LINEAR_CENTER_ENA(1) |
4621 S_0286D0_LINEAR_CENTROID_ENA(1) |
4622 S_0286D0_FRONT_FACE_ENA(1) |
4623 S_0286D0_POS_FIXED_PT_ENA(1));
4624 }
4625
4626 shader->info.num_input_sgprs = 0;
4627 shader->info.num_input_vgprs = 0;
4628
4629 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4630 shader->info.num_input_sgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4631
4632 for (; i < fninfo.num_params; ++i)
4633 shader->info.num_input_vgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4634
4635 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4636 shader->info.num_input_vgprs -= num_prolog_vgprs;
4637
4638 if (!ctx->screen->has_ds_bpermute &&
4639 bld_base->info &&
4640 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
4641 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
4642 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
4643 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
4644 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
4645 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
4646 ctx->lds =
4647 LLVMAddGlobalInAddressSpace(gallivm->module,
4648 LLVMArrayType(ctx->i32, 64),
4649 "ddxy_lds",
4650 LOCAL_ADDR_SPACE);
4651
4652 if (shader->key.as_ls ||
4653 ctx->type == PIPE_SHADER_TESS_CTRL ||
4654 /* GFX9 has the ESGS ring buffer in LDS. */
4655 (ctx->screen->b.chip_class >= GFX9 &&
4656 (shader->key.as_es ||
4657 ctx->type == PIPE_SHADER_GEOMETRY)))
4658 declare_lds_as_pointer(ctx);
4659 }
4660
4661 /**
4662 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4663 * for later use.
4664 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	/* All ring descriptors are loaded from the RW buffer descriptor array. */
	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
					    ctx->param_rw_buffers);

	/* ESGS ring: only used on <= VI (GFX9 keeps the ESGS ring in LDS
	 * instead).  ES and GS use different descriptor slots. */
	if (ctx->screen->b.chip_class <= VI &&
	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							  : SI_ES_RING_ESGS;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);

		ctx->esgs_ring =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	}

	if (ctx->shader->is_gs_copy_shader) {
		/* The GS copy shader only reads via gsvs_ring[0]. */
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);

		ctx->gsvs_ring[0] =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
		const struct si_shader_selector *sel = ctx->shader->selector;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
		LLVMValueRef base_ring;

		base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

		/* The conceptual layout of the GSVS ring is
		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
		 * but the real memory layout is swizzled across
		 * threads:
		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
		 *   t16v0c0 ..
		 * Override the buffer descriptor accordingly.
		 */
		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
		uint64_t stream_offset = 0;

		/* Build a separate, patched descriptor per vertex stream
		 * that actually has stream outputs. */
		for (unsigned stream = 0; stream < 4; ++stream) {
			unsigned num_components;
			unsigned stride;
			unsigned num_records;
			LLVMValueRef ring, tmp;

			num_components = sel->info.num_stream_output_components[stream];
			if (!num_components)
				continue;

			stride = 4 * num_components * sel->gs_max_out_vertices;

			/* Limit on the stride field for <= CIK. */
			assert(stride < (1 << 14));

			num_records = 64;

			/* Dwords 0-1: 64-bit base address; add this
			 * stream's byte offset into the ring. */
			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
			tmp = LLVMBuildAdd(builder, tmp,
					   LLVMConstInt(ctx->i64,
							stream_offset, 0), "");
			stream_offset += stride * 64;

			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
			/* Dword 1: OR in the stride and swizzle enable. */
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
			tmp = LLVMBuildOr(builder, tmp,
					  LLVMConstInt(ctx->i32,
						       S_008F04_STRIDE(stride) |
						       S_008F04_SWIZZLE_ENABLE(1), 0), "");
			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
			/* Dword 2: number of records. */
			ring = LLVMBuildInsertElement(builder, ring,
					LLVMConstInt(ctx->i32, num_records, 0),
					LLVMConstInt(ctx->i32, 2, 0), "");
			/* Dword 3: data format and swizzle parameters. */
			ring = LLVMBuildInsertElement(builder, ring,
				LLVMConstInt(ctx->i32,
					     S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
					     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
					     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
					     S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
					     S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
					     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
					     S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
					     S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
					     S_008F0C_ADD_TID_ENABLE(1),
					     0),
				LLVMConstInt(ctx->i32, 3, 0), "");

			ctx->gsvs_ring[stream] = ring;
		}
	}
}
4760
/* Emit code that kills the current fragment when the corresponding bit
 * of the 32x32 polygon stipple pattern is 0.  The pattern is read from
 * the SI_PS_CONST_POLY_STIPPLE buffer; the fragment position comes from
 * the fixed-point position parameter (param_pos_fixed_pt).
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
	desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(ctx, desc, offset);
	row = LLVMBuildBitCast(builder, row, ctx->i32, "");
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	ac_build_kill(&ctx->ac, bit);
}
4793
/* Parse the config register section emitted into the shader binary and
 * fill in *conf: SGPR/VGPR counts, float mode, LDS size, scratch size
 * per wave, SPI PS input masks and spill statistics.
 *
 * The config section is a sequence of (register offset, value) dword
 * pairs starting at the given symbol offset.
 */
void si_shader_binary_read_config(struct ac_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct ac_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* The hw fields are in granules of 8 SGPRs / 4 VGPRs,
			 * biased by one granule. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode = G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn only once per process to avoid log spam. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* If the compiler didn't emit INPUT_ADDR, fall back to INPUT_ENA. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
4877
/* Patch the two dwords of the scratch buffer resource descriptor
 * (base address low/high) into the shader machine code at the
 * relocation sites recorded by the compiler.
 *
 * \param scratch_va  GPU virtual address of the scratch buffer.
 */
void si_shader_apply_scratch_relocs(struct si_shader *shader,
				    uint64_t scratch_va)
{
	unsigned i;
	uint32_t scratch_rsrc_dword0 = scratch_va;
	uint32_t scratch_rsrc_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);

	/* Enable scratch coalescing. */
	scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);

	for (i = 0 ; i < shader->binary.reloc_count; i++) {
		const struct ac_shader_reloc *reloc =
					&shader->binary.relocs[i];
		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword0, 4);
		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword1, 4);
		}
	}
}
4901
4902 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4903 {
4904 unsigned size = shader->binary.code_size;
4905
4906 if (shader->prolog)
4907 size += shader->prolog->binary.code_size;
4908 if (shader->previous_stage)
4909 size += shader->previous_stage->binary.code_size;
4910 if (shader->prolog2)
4911 size += shader->prolog2->binary.code_size;
4912 if (shader->epilog)
4913 size += shader->epilog->binary.code_size;
4914 return size;
4915 }
4916
4917 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4918 {
4919 const struct ac_shader_binary *prolog =
4920 shader->prolog ? &shader->prolog->binary : NULL;
4921 const struct ac_shader_binary *previous_stage =
4922 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4923 const struct ac_shader_binary *prolog2 =
4924 shader->prolog2 ? &shader->prolog2->binary : NULL;
4925 const struct ac_shader_binary *epilog =
4926 shader->epilog ? &shader->epilog->binary : NULL;
4927 const struct ac_shader_binary *mainb = &shader->binary;
4928 unsigned bo_size = si_get_shader_binary_size(shader) +
4929 (!epilog ? mainb->rodata_size : 0);
4930 unsigned char *ptr;
4931
4932 assert(!prolog || !prolog->rodata_size);
4933 assert(!previous_stage || !previous_stage->rodata_size);
4934 assert(!prolog2 || !prolog2->rodata_size);
4935 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4936 !mainb->rodata_size);
4937 assert(!epilog || !epilog->rodata_size);
4938
4939 r600_resource_reference(&shader->bo, NULL);
4940 shader->bo = (struct r600_resource*)
4941 pipe_buffer_create(&sscreen->b.b, 0,
4942 PIPE_USAGE_IMMUTABLE,
4943 align(bo_size, SI_CPDMA_ALIGNMENT));
4944 if (!shader->bo)
4945 return -ENOMEM;
4946
4947 /* Upload. */
4948 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4949 PIPE_TRANSFER_READ_WRITE |
4950 PIPE_TRANSFER_UNSYNCHRONIZED);
4951
4952 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4953 * endian-independent. */
4954 if (prolog) {
4955 memcpy(ptr, prolog->code, prolog->code_size);
4956 ptr += prolog->code_size;
4957 }
4958 if (previous_stage) {
4959 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4960 ptr += previous_stage->code_size;
4961 }
4962 if (prolog2) {
4963 memcpy(ptr, prolog2->code, prolog2->code_size);
4964 ptr += prolog2->code_size;
4965 }
4966
4967 memcpy(ptr, mainb->code, mainb->code_size);
4968 ptr += mainb->code_size;
4969
4970 if (epilog)
4971 memcpy(ptr, epilog->code, epilog->code_size);
4972 else if (mainb->rodata_size > 0)
4973 memcpy(ptr, mainb->rodata, mainb->rodata_size);
4974
4975 sscreen->b.ws->buffer_unmap(shader->bo->buf);
4976 return 0;
4977 }
4978
/* Print the disassembly (or a raw hex dump when no disassembly string
 * is available) of one shader part to \p file, and optionally forward
 * it line by line through the debug callback. */
static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				p = util_strchrnul(line, '\n');
				count = p - line;

				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				/* Stop at the terminating NUL (no trailing '\n'). */
				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		/* No disassembly available: dump the raw code dwords
		 * (bytes reversed to print each dword MSB first). */
		fprintf(file, "Shader %s binary:\n", name);
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
5026
/* Compute and print shader resource statistics (register usage, LDS,
 * scratch, estimated max waves per SIMD) to \p file and to the debug
 * callback.  When \p check_debug_option is set, printing to the file is
 * gated on the per-stage shader-dump debug option. */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 const struct si_shader *shader,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file,
				 bool check_debug_option)
{
	const struct si_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
	unsigned code_size = si_get_shader_binary_size(shader);
	/* LDS allocation granularity: 512 bytes on CIK+, 256 before. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	switch (processor) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			/* Divide the per-threadgroup LDS among its waves. */
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size, 64);
		}
		break;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* SGPR file size per SIMD: 800 on VI+, 512 before. */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied). */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Private memory VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs,
			conf->private_mem_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Always report the stats through the debug callback as well. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs, conf->private_mem_vgprs);
}
5119
5120 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5121 {
5122 switch (processor) {
5123 case PIPE_SHADER_VERTEX:
5124 if (shader->key.as_es)
5125 return "Vertex Shader as ES";
5126 else if (shader->key.as_ls)
5127 return "Vertex Shader as LS";
5128 else
5129 return "Vertex Shader as VS";
5130 case PIPE_SHADER_TESS_CTRL:
5131 return "Tessellation Control Shader";
5132 case PIPE_SHADER_TESS_EVAL:
5133 if (shader->key.as_es)
5134 return "Tessellation Evaluation Shader as ES";
5135 else
5136 return "Tessellation Evaluation Shader as VS";
5137 case PIPE_SHADER_GEOMETRY:
5138 if (shader->is_gs_copy_shader)
5139 return "GS Copy Shader as VS";
5140 else
5141 return "Geometry Shader";
5142 case PIPE_SHADER_FRAGMENT:
5143 return "Pixel Shader";
5144 case PIPE_SHADER_COMPUTE:
5145 return "Compute Shader";
5146 default:
5147 return "Unknown Shader";
5148 }
5149 }
5150
/* Dump everything known about a compiled shader to \p file: the shader
 * key, the recorded LLVM IR (if any), the disassembly of every part
 * (prolog, previous stage, prolog2, main, epilog) and the resource
 * statistics.  When \p check_debug_option is set, output is gated on
 * the per-stage shader-dump debug option. */
void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file, bool check_debug_option)
{
	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, shader, file);

	/* LLVM IR is only recorded when sscreen->record_llvm_ir is set. */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		if (shader->previous_stage &&
		    shader->previous_stage->binary.llvm_ir_string) {
			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
				si_get_shader_name(shader, processor));
			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
		}

		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		/* Dump the parts in the order they are concatenated in
		 * the binary (see si_shader_binary_upload). */
		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(&shader->previous_stage->binary,
						   debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(&shader->prolog2->binary,
						   debug, "prolog2", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, debug, processor, file,
			     check_debug_option);
}
5198
5199 static int si_compile_llvm(struct si_screen *sscreen,
5200 struct ac_shader_binary *binary,
5201 struct si_shader_config *conf,
5202 LLVMTargetMachineRef tm,
5203 LLVMModuleRef mod,
5204 struct pipe_debug_callback *debug,
5205 unsigned processor,
5206 const char *name)
5207 {
5208 int r = 0;
5209 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5210
5211 if (r600_can_dump_shader(&sscreen->b, processor)) {
5212 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5213
5214 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5215 fprintf(stderr, "%s LLVM IR:\n\n", name);
5216 ac_dump_module(mod);
5217 fprintf(stderr, "\n");
5218 }
5219 }
5220
5221 if (sscreen->record_llvm_ir) {
5222 char *ir = LLVMPrintModuleToString(mod);
5223 binary->llvm_ir_string = strdup(ir);
5224 LLVMDisposeMessage(ir);
5225 }
5226
5227 if (!si_replace_shader(count, binary)) {
5228 r = si_llvm_compile(mod, binary, tm, debug);
5229 if (r)
5230 return r;
5231 }
5232
5233 si_shader_binary_read_config(binary, conf, 0);
5234
5235 /* Enable 64-bit and 16-bit denormals, because there is no performance
5236 * cost.
5237 *
5238 * If denormals are enabled, all floating-point output modifiers are
5239 * ignored.
5240 *
5241 * Don't enable denormals for 32-bit floats, because:
5242 * - Floating-point output modifiers would be ignored by the hw.
5243 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5244 * have to stop using those.
5245 * - SI & CI would be very slow.
5246 */
5247 conf->float_mode |= V_00B028_FP_64_DENORMS;
5248
5249 FREE(binary->config);
5250 FREE(binary->global_symbol_offsets);
5251 binary->config = NULL;
5252 binary->global_symbol_offsets = NULL;
5253
5254 /* Some shaders can't have rodata because their binaries can be
5255 * concatenated.
5256 */
5257 if (binary->rodata_size &&
5258 (processor == PIPE_SHADER_VERTEX ||
5259 processor == PIPE_SHADER_TESS_CTRL ||
5260 processor == PIPE_SHADER_TESS_EVAL ||
5261 processor == PIPE_SHADER_FRAGMENT)) {
5262 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5263 return -EINVAL;
5264 }
5265
5266 return r;
5267 }
5268
5269 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5270 {
5271 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5272 LLVMBuildRetVoid(ctx->gallivm.builder);
5273 else
5274 LLVMBuildRet(ctx->gallivm.builder, ret);
5275 }
5276
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The copy shader reads the GS outputs back from the GSVS ring buffer
 * and exports them as a hardware VS would: one switch case per vertex
 * stream, with streamout for all streams and position/param exports for
 * stream 0 only.  Returns the compiled shader, or NULL on failure
 * (allocation or compile error).
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	/* Only the address is taken here; ctx is initialized below by
	 * si_init_shader_ctx before gallivm is dereferenced. */
	struct gallivm_state *gallivm = &ctx.gallivm;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}


	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The copy shader is compiled as a hardware vertex shader. */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = gallivm->builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Byte offset of this vertex's data in the GSVS ring. */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* Each channel's 2-bit stream ID is packed in output_streams. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	/* Emit one switch case per vertex stream; unknown stream IDs
	 * fall through to the end block. */
	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams > 0 are only useful for streamout. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Channels not written, or belonging to a
				 * different stream, stay undefined. */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is exported to the pixel shader. */
		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
5426
/* Print the VS-prolog portion of a shader key (instance divisor flags
 * and fix_fetch codes).  \p prefix names the sub-struct being dumped so
 * the same helper serves VS, TCS(LS) and GS(ES) keys. */
static void si_dump_shader_key_vs(const struct si_shader_key *key,
				  const struct si_vs_prolog_bits *prolog,
				  const char *prefix, FILE *f)
{
	fprintf(f, " %s.instance_divisor_is_one = %u\n",
		prefix, prolog->instance_divisor_is_one);
	fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
		prefix, prolog->instance_divisor_is_fetched);

	fprintf(f, " mono.vs.fix_fetch = {");
	for (int i = 0; i < SI_MAX_ATTRIBS; i++)
		fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
	fprintf(f, "}\n");
}
5441
/* Print the shader key fields relevant to the given stage to \p f,
 * for shader dumps (see si_shader_dump). */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f)
{
	const struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ls = %u\n", key->as_ls);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the TCS is merged with an LS, so it also carries
		 * the VS prolog key. */
		if (shader->selector->screen->b.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no key of its own. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the GS is merged with an ES, so it may also carry
		 * a VS prolog key. */
		if (shader->selector->screen->b.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* Fields shared by all stages that run as a hardware VS. */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
5521
/* Initialize the shader compilation context: set up the LLVM/gallivm
 * state and register the radeonsi-specific TGSI opcode emitters and
 * fetch functions on top of the common gallivm defaults. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;

	ctx->abi.chip_class = sscreen->b.chip_class;

	si_llvm_context_init(ctx, sscreen, tm);

	bld_base = &ctx->bld_base;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes (fragment shaders). */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;

	/* Derivatives share a single emitter. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Subgroup/wave operations. */
	bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
	bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;

	/* Geometry shader vertex/primitive emission and barriers. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
}
5562
5563 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5564 {
5565 struct si_shader *shader = ctx->shader;
5566 struct tgsi_shader_info *info = &shader->selector->info;
5567
5568 if ((ctx->type != PIPE_SHADER_VERTEX &&
5569 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5570 shader->key.as_ls ||
5571 shader->key.as_es)
5572 return;
5573
5574 ac_optimize_vs_outputs(&ctx->ac,
5575 ctx->main_fn,
5576 shader->info.vs_output_param_offset,
5577 info->num_outputs,
5578 &shader->info.nr_param_exports);
5579 }
5580
5581 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5582 {
5583 ctx->shader->config.private_mem_vgprs = 0;
5584
5585 /* Process all LLVM instructions. */
5586 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5587 while (bb) {
5588 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5589
5590 while (next) {
5591 LLVMValueRef inst = next;
5592 next = LLVMGetNextInstruction(next);
5593
5594 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5595 continue;
5596
5597 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5598 /* No idea why LLVM aligns allocas to 4 elements. */
5599 unsigned alignment = LLVMGetAlignment(inst);
5600 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
5601 ctx->shader->config.private_mem_vgprs += dw_size;
5602 }
5603 bb = LLVMGetNextBasicBlock(bb);
5604 }
5605 }
5606
5607 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5608 {
5609 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5610 lp_build_intrinsic(ctx->gallivm.builder,
5611 "llvm.amdgcn.init.exec", ctx->voidt,
5612 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5613 }
5614
5615 static void si_init_exec_from_input(struct si_shader_context *ctx,
5616 unsigned param, unsigned bitoffset)
5617 {
5618 LLVMValueRef args[] = {
5619 LLVMGetParam(ctx->main_fn, param),
5620 LLVMConstInt(ctx->i32, bitoffset, 0),
5621 };
5622 lp_build_intrinsic(ctx->gallivm.builder,
5623 "llvm.amdgcn.init.exec.from.input",
5624 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5625 }
5626
/**
 * Translate the main part of the shader (TGSI or NIR) into LLVM IR.
 *
 * Installs per-stage input-fetch and epilogue callbacks, creates the
 * LLVM function, preloads ring buffers, emits the GFX9 merged-shader
 * EXEC/barrier handling, and finally runs the TGSI->LLVM (or NIR->LLVM)
 * translation.
 *
 * \param ctx            shader context; ctx->type selects the stage
 * \param is_monolithic  true if prologs/epilogs are compiled into the
 *                       same wrapper function
 * \return true on success, false if translation failed.
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 bool is_monolithic)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	/* Select stage-specific input fetch and epilogue callbacks. */
	// TODO clean all this up!
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		if (shader->key.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	ctx->abi.load_ubo = load_ubo;
	ctx->abi.load_ssbo = load_ssbo;

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 */
	if (ctx->screen->b.chip_class >= GFX9) {
		if (!is_monolithic &&
		    sel->info.num_instructions > 1 && /* not empty shader */
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !sel->vs_needs_prolog))) {
			/* First half of a merged shader with no prolog:
			 * initialize EXEC from the merged wave info SGPR. */
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY) {
			if (!is_monolithic)
				si_init_exec_full_mask(ctx);

			/* The barrier must execute for all shaders in a
			 * threadgroup.
			 */
			si_llvm_emit_barrier(NULL, bld_base, NULL);

			/* Execute the main part only in threads that belong
			 * to the second-stage wave (bits 8..15 of the merged
			 * wave info hold its thread count). */
			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
			LLVMValueRef ena =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), num_threads, "");
			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream counters of emitted vertices (4 GS streams). */
		int i;
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(&ctx->gallivm,
						ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
	    ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
		/* This is initialized to 0.0 = not kill. */
		ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
	}

	/* Translate the shader body: TGSI tokens if present, NIR otherwise. */
	if (sel->tokens) {
		if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
			fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
			return false;
		}
	} else {
		if (!si_nir_build_llvm(ctx, sel->nir)) {
			fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
			return false;
		}
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
5754
5755 /**
5756 * Compute the VS prolog key, which contains all the information needed to
5757 * build the VS prolog function, and set shader->info bits where needed.
5758 *
5759 * \param info Shader info of the vertex shader.
5760 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5761 * \param prolog_key Key of the VS prolog
5762 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5763 * \param key Output shader part key.
5764 */
5765 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5766 unsigned num_input_sgprs,
5767 const struct si_vs_prolog_bits *prolog_key,
5768 struct si_shader *shader_out,
5769 union si_shader_part_key *key)
5770 {
5771 memset(key, 0, sizeof(*key));
5772 key->vs_prolog.states = *prolog_key;
5773 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5774 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5775 key->vs_prolog.as_ls = shader_out->key.as_ls;
5776
5777 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5778 key->vs_prolog.as_ls = 1;
5779 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5780 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5781 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5782 }
5783
5784 /* Enable loading the InstanceID VGPR. */
5785 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5786
5787 if ((key->vs_prolog.states.instance_divisor_is_one |
5788 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5789 shader_out->info.uses_instanceid = true;
5790 }
5791
5792 /**
5793 * Compute the PS prolog key, which contains all the information needed to
5794 * build the PS prolog function, and set related bits in shader->config.
5795 */
static void si_get_ps_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key,
				 bool separate_prolog)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs whole-quad mode (WQM) if the shader uses
	 * derivatives and the prolog touches interpolation state. */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Classify both color inputs: record which input slot and
		 * which interpolation VGPR each one uses, and enable the
		 * matching SPI PS inputs. */
		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			/* Skip colors whose components are not read. */
			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* Flat shading: no interpolation VGPR needed. */
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* VGPR indices 0/2/4 are the perspective
				 * sample/center/centroid I,J pairs. */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}
5911
5912 /**
5913 * Check whether a PS prolog is required based on the key.
5914 */
5915 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5916 {
5917 return key->ps_prolog.colors_read ||
5918 key->ps_prolog.states.force_persp_sample_interp ||
5919 key->ps_prolog.states.force_linear_sample_interp ||
5920 key->ps_prolog.states.force_persp_center_interp ||
5921 key->ps_prolog.states.force_linear_center_interp ||
5922 key->ps_prolog.states.bc_optimize_for_persp ||
5923 key->ps_prolog.states.bc_optimize_for_linear ||
5924 key->ps_prolog.states.poly_stipple;
5925 }
5926
5927 /**
5928 * Compute the PS epilog key, which contains all the information needed to
5929 * build the PS epilog function.
5930 */
static void si_get_ps_epilog_key(struct si_shader *shader,
				 union si_shader_part_key *key)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	memset(key, 0, sizeof(*key));
	/* What the main part writes (colors, Z, stencil, sample mask)
	 * determines which exports the epilog must emit. */
	key->ps_epilog.colors_written = info->colors_written;
	key->ps_epilog.writes_z = info->writes_z;
	key->ps_epilog.writes_stencil = info->writes_stencil;
	key->ps_epilog.writes_samplemask = info->writes_samplemask;
	/* State-dependent behavior (color format, alpha test, etc.) comes
	 * straight from the shader key. */
	key->ps_epilog.states = shader->key.part.ps.epilog;
}
5942
5943 /**
5944 * Build the GS prolog function. Rotate the input vertices for triangle strips
5945 * with adjacency.
5946 */
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	unsigned num_sgprs, num_vgprs;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMTypeRef returns[48];
	LLVMValueRef func, ret;

	si_init_function_info(&fninfo);

	/* Choose the input GPR layout for this chip generation. */
	if (ctx->screen->b.chip_class >= GFX9) {
		num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
		num_vgprs = 5; /* ES inputs are not needed by GS */
	} else {
		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
		num_vgprs = 8;
	}

	/* Declare inputs and matching return slots: SGPRs as i32,
	 * VGPRs returned as f32. */
	for (unsigned i = 0; i < num_sgprs; ++i) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[i] = ctx->i32;
	}

	for (unsigned i = 0; i < num_vgprs; ++i) {
		add_arg(&fninfo, ARG_VGPR, ctx->i32);
		returns[num_sgprs + i] = ctx->f32;
	}

	/* Create the function. */
	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
			   &fninfo, 0);
	func = ctx->main_fn;

	/* Set the full EXEC mask for the prolog, because we are only fiddling
	 * with registers here. The main shader part will set the correct EXEC
	 * mask.
	 */
	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
		si_init_exec_full_mask(ctx);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (unsigned i = 0; i < num_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
	}
	for (unsigned i = 0; i < num_vgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
		p = LLVMBuildBitCast(builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
	}

	if (key->gs_prolog.states.tri_strip_adj_fix) {
		/* Remap the input vertices for every other primitive. */
		const unsigned gfx6_vtx_params[6] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 3,
			num_sgprs + 4,
			num_sgprs + 5,
			num_sgprs + 6
		};
		/* On GFX9 each VGPR packs two 16-bit vertex indices. */
		const unsigned gfx9_vtx_params[3] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 4,
		};
		LLVMValueRef vtx_in[6], vtx_out[6];
		LLVMValueRef prim_id, rotate;

		/* Unpack the 6 vertex indices. */
		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
				vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
			}
		} else {
			for (unsigned i = 0; i < 6; i++)
				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
		}

		/* Rotate when bit 0 of the primitive ID is set, i.e. on
		 * every other primitive. */
		prim_id = LLVMGetParam(func, num_sgprs + 2);
		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");

		/* Select between the original and rotated-by-4 ordering. */
		for (unsigned i = 0; i < 6; ++i) {
			LLVMValueRef base, rotated;
			base = vtx_in[i];
			rotated = vtx_in[(i + 4) % 6];
			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
		}

		/* Repack the (possibly rotated) indices into the return value. */
		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef hi, out;

				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
						  LLVMConstInt(ctx->i32, 16, 0), "");
				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
				out = LLVMBuildBitCast(builder, out, ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx9_vtx_params[i], "");
			}
		} else {
			for (unsigned i = 0; i < 6; i++) {
				LLVMValueRef out;

				out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx6_vtx_params[i], "");
			}
		}
	}

	LLVMBuildRet(builder, ret);
}
6065
6066 /**
6067 * Given a list of shader part functions, build a wrapper function that
6068 * runs them in sequence to form a monolithic shader.
6069 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
				      LLVMValueRef *parts,
				      unsigned num_parts,
				      unsigned main_part,
				      unsigned next_shader_first_part)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	/* PS epilog has one arg per color component; gfx9 merged shader
	 * prologs need to forward 32 user SGPRs.
	 */
	struct si_function_info fninfo;
	LLVMValueRef initial[64], out[64];
	LLVMTypeRef function_type;
	unsigned num_first_params;
	unsigned num_out, initial_num_out;
	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned gprs;
	struct lp_build_if_state if_state;

	si_init_function_info(&fninfo);

	/* Force all parts to be inlined into the wrapper and hide them. */
	for (unsigned i = 0; i < num_parts; ++i) {
		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_first_params = LLVMCountParamTypes(function_type);

	/* Count the SGPR and VGPR dwords of the first part's parameters.
	 * SGPR params always come first. */
	for (unsigned i = 0; i < num_first_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0);
			num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}

	/* Declare wrapper params using the main part's types, consuming the
	 * dword budget computed above. */
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
		LLVMTypeRef type = LLVMTypeOf(param);
		unsigned size = llvm_get_type_size(type) / 4;

		add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
			   si_get_max_workgroup_size(ctx->shader));

	if (is_merged_shader(ctx->shader))
		si_init_exec_full_mask(ctx);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	/* Flatten the wrapper params into scalar i32/f32 "out" values that
	 * can be re-gathered into each part's parameter types. */
	for (unsigned i = 0; i < fninfo.num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
		unsigned size = llvm_get_type_size(param_type) / 4;

		if (size == 1) {
			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (i < fninfo.num_sgpr_params)
			num_out_sgpr = num_out;
	}

	/* Keep a copy of the flattened inputs for the second merged half. */
	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[48];
		LLVMValueRef ret;
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;
		unsigned num_params = LLVMCountParams(parts[part]);

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_merged_shader(ctx->shader) && part == 0) {
			LLVMValueRef ena, count = initial[3];

			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			lp_build_if(&if_state, &ctx->gallivm, ena);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = llvm_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
				/* byval doesn't apply once the part is inlined;
				 * mark the param inreg (SGPR) instead. */
#if HAVE_LLVM < 0x0400
				LLVMRemoveAttribute(param, LLVMByValAttribute);
#else
				unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
				LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
#endif
				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
			assert(is_sgpr || out_idx >= num_out_sgpr);

			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
					arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");

		if (is_merged_shader(ctx->shader) &&
		    part + 1 == next_shader_first_part) {
			lp_build_endif(&if_state);

			/* The second half of the merged shader should use
			 * the inputs from the toplevel (wrapper) function,
			 * not the return value from the last call.
			 *
			 * That's because the last call was executed condi-
			 * tionally, so we can't consume it in the main
			 * block.
			 */
			memcpy(out, initial, sizeof(initial));
			num_out = initial_num_out;
			num_out_sgpr = initial_num_out_sgpr;
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				assert(num_out < ARRAY_SIZE(out));
				out[num_out++] = val;

				/* i32 elements are SGPR returns; they must all
				 * precede the VGPR (f32) returns. */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	LLVMBuildRetVoid(builder);
}
6295
6296 int si_compile_tgsi_shader(struct si_screen *sscreen,
6297 LLVMTargetMachineRef tm,
6298 struct si_shader *shader,
6299 bool is_monolithic,
6300 struct pipe_debug_callback *debug)
6301 {
6302 struct si_shader_selector *sel = shader->selector;
6303 struct si_shader_context ctx;
6304 int r = -1;
6305
6306 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6307 * conversion fails. */
6308 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6309 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6310 if (sel->tokens)
6311 tgsi_dump(sel->tokens, 0);
6312 else
6313 nir_print_shader(sel->nir, stderr);
6314 si_dump_streamout(&sel->so);
6315 }
6316
6317 si_init_shader_ctx(&ctx, sscreen, tm);
6318 si_llvm_context_set_tgsi(&ctx, shader);
6319 ctx.separate_prolog = !is_monolithic;
6320
6321 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6322 sizeof(shader->info.vs_output_param_offset));
6323
6324 shader->info.uses_instanceid = sel->info.uses_instanceid;
6325
6326 ctx.load_system_value = declare_system_value;
6327
6328 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6329 si_llvm_dispose(&ctx);
6330 return -1;
6331 }
6332
6333 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6334 LLVMValueRef parts[2];
6335 bool need_prolog = sel->vs_needs_prolog;
6336
6337 parts[1] = ctx.main_fn;
6338
6339 if (need_prolog) {
6340 union si_shader_part_key prolog_key;
6341 si_get_vs_prolog_key(&sel->info,
6342 shader->info.num_input_sgprs,
6343 &shader->key.part.vs.prolog,
6344 shader, &prolog_key);
6345 si_build_vs_prolog_function(&ctx, &prolog_key);
6346 parts[0] = ctx.main_fn;
6347 }
6348
6349 si_build_wrapper_function(&ctx, parts + !need_prolog,
6350 1 + need_prolog, need_prolog, 0);
6351 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6352 if (sscreen->b.chip_class >= GFX9) {
6353 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6354 LLVMValueRef parts[4];
6355
6356 /* TCS main part */
6357 parts[2] = ctx.main_fn;
6358
6359 /* TCS epilog */
6360 union si_shader_part_key tcs_epilog_key;
6361 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6362 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6363 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6364 parts[3] = ctx.main_fn;
6365
6366 /* VS prolog */
6367 if (ls->vs_needs_prolog) {
6368 union si_shader_part_key vs_prolog_key;
6369 si_get_vs_prolog_key(&ls->info,
6370 shader->info.num_input_sgprs,
6371 &shader->key.part.tcs.ls_prolog,
6372 shader, &vs_prolog_key);
6373 vs_prolog_key.vs_prolog.is_monolithic = true;
6374 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6375 parts[0] = ctx.main_fn;
6376 }
6377
6378 /* VS as LS main part */
6379 struct si_shader shader_ls = {};
6380 shader_ls.selector = ls;
6381 shader_ls.key.as_ls = 1;
6382 shader_ls.key.mono = shader->key.mono;
6383 shader_ls.key.opt = shader->key.opt;
6384 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6385
6386 if (!si_compile_tgsi_main(&ctx, true)) {
6387 si_llvm_dispose(&ctx);
6388 return -1;
6389 }
6390 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6391 parts[1] = ctx.main_fn;
6392
6393 /* Reset the shader context. */
6394 ctx.shader = shader;
6395 ctx.type = PIPE_SHADER_TESS_CTRL;
6396
6397 si_build_wrapper_function(&ctx,
6398 parts + !ls->vs_needs_prolog,
6399 4 - !ls->vs_needs_prolog, 0,
6400 ls->vs_needs_prolog ? 2 : 1);
6401 } else {
6402 LLVMValueRef parts[2];
6403 union si_shader_part_key epilog_key;
6404
6405 parts[0] = ctx.main_fn;
6406
6407 memset(&epilog_key, 0, sizeof(epilog_key));
6408 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6409 si_build_tcs_epilog_function(&ctx, &epilog_key);
6410 parts[1] = ctx.main_fn;
6411
6412 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6413 }
6414 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6415 if (ctx.screen->b.chip_class >= GFX9) {
6416 struct si_shader_selector *es = shader->key.part.gs.es;
6417 LLVMValueRef es_prolog = NULL;
6418 LLVMValueRef es_main = NULL;
6419 LLVMValueRef gs_prolog = NULL;
6420 LLVMValueRef gs_main = ctx.main_fn;
6421
6422 /* GS prolog */
6423 union si_shader_part_key gs_prolog_key;
6424 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6425 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6426 gs_prolog_key.gs_prolog.is_monolithic = true;
6427 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6428 gs_prolog = ctx.main_fn;
6429
6430 /* ES prolog */
6431 if (es->vs_needs_prolog) {
6432 union si_shader_part_key vs_prolog_key;
6433 si_get_vs_prolog_key(&es->info,
6434 shader->info.num_input_sgprs,
6435 &shader->key.part.tcs.ls_prolog,
6436 shader, &vs_prolog_key);
6437 vs_prolog_key.vs_prolog.is_monolithic = true;
6438 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6439 es_prolog = ctx.main_fn;
6440 }
6441
6442 /* ES main part */
6443 struct si_shader shader_es = {};
6444 shader_es.selector = es;
6445 shader_es.key.as_es = 1;
6446 shader_es.key.mono = shader->key.mono;
6447 shader_es.key.opt = shader->key.opt;
6448 si_llvm_context_set_tgsi(&ctx, &shader_es);
6449
6450 if (!si_compile_tgsi_main(&ctx, true)) {
6451 si_llvm_dispose(&ctx);
6452 return -1;
6453 }
6454 shader->info.uses_instanceid |= es->info.uses_instanceid;
6455 es_main = ctx.main_fn;
6456
6457 /* Reset the shader context. */
6458 ctx.shader = shader;
6459 ctx.type = PIPE_SHADER_GEOMETRY;
6460
6461 /* Prepare the array of shader parts. */
6462 LLVMValueRef parts[4];
6463 unsigned num_parts = 0, main_part, next_first_part;
6464
6465 if (es_prolog)
6466 parts[num_parts++] = es_prolog;
6467
6468 parts[main_part = num_parts++] = es_main;
6469 parts[next_first_part = num_parts++] = gs_prolog;
6470 parts[num_parts++] = gs_main;
6471
6472 si_build_wrapper_function(&ctx, parts, num_parts,
6473 main_part, next_first_part);
6474 } else {
6475 LLVMValueRef parts[2];
6476 union si_shader_part_key prolog_key;
6477
6478 parts[1] = ctx.main_fn;
6479
6480 memset(&prolog_key, 0, sizeof(prolog_key));
6481 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6482 si_build_gs_prolog_function(&ctx, &prolog_key);
6483 parts[0] = ctx.main_fn;
6484
6485 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6486 }
6487 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6488 LLVMValueRef parts[3];
6489 union si_shader_part_key prolog_key;
6490 union si_shader_part_key epilog_key;
6491 bool need_prolog;
6492
6493 si_get_ps_prolog_key(shader, &prolog_key, false);
6494 need_prolog = si_need_ps_prolog(&prolog_key);
6495
6496 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6497
6498 if (need_prolog) {
6499 si_build_ps_prolog_function(&ctx, &prolog_key);
6500 parts[0] = ctx.main_fn;
6501 }
6502
6503 si_get_ps_epilog_key(shader, &epilog_key);
6504 si_build_ps_epilog_function(&ctx, &epilog_key);
6505 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6506
6507 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6508 need_prolog ? 1 : 0, 0);
6509 }
6510
6511 si_llvm_optimize_module(&ctx);
6512
6513 /* Post-optimization transformations and analysis. */
6514 si_optimize_vs_outputs(&ctx);
6515
6516 if ((debug && debug->debug_message) ||
6517 r600_can_dump_shader(&sscreen->b, ctx.type))
6518 si_count_scratch_private_memory(&ctx);
6519
6520 /* Compile to bytecode. */
6521 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6522 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6523 si_llvm_dispose(&ctx);
6524 if (r) {
6525 fprintf(stderr, "LLVM failed to compile shader\n");
6526 return r;
6527 }
6528
6529 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6530 * LLVM 3.9svn has this bug.
6531 */
6532 if (sel->type == PIPE_SHADER_COMPUTE) {
6533 unsigned wave_size = 64;
6534 unsigned max_vgprs = 256;
6535 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6536 unsigned max_sgprs_per_wave = 128;
6537 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6538 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6539 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6540
6541 max_vgprs = max_vgprs / min_waves_per_simd;
6542 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6543
6544 if (shader->config.num_sgprs > max_sgprs ||
6545 shader->config.num_vgprs > max_vgprs) {
6546 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6547 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6548 shader->config.num_sgprs, shader->config.num_vgprs,
6549 max_sgprs, max_vgprs);
6550
6551 /* Just terminate the process, because dependent
6552 * shaders can hang due to bad input data, but use
6553 * the env var to allow shader-db to work.
6554 */
6555 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6556 abort();
6557 }
6558 }
6559
6560 /* Add the scratch offset to input SGPRs. */
6561 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6562 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6563
6564 /* Calculate the number of fragment input VGPRs. */
6565 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6566 shader->info.num_input_vgprs = 0;
6567 shader->info.face_vgpr_index = -1;
6568
6569 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6570 shader->info.num_input_vgprs += 2;
6571 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6572 shader->info.num_input_vgprs += 2;
6573 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6574 shader->info.num_input_vgprs += 2;
6575 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6576 shader->info.num_input_vgprs += 3;
6577 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6578 shader->info.num_input_vgprs += 2;
6579 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6580 shader->info.num_input_vgprs += 2;
6581 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6582 shader->info.num_input_vgprs += 2;
6583 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6584 shader->info.num_input_vgprs += 1;
6585 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6586 shader->info.num_input_vgprs += 1;
6587 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6588 shader->info.num_input_vgprs += 1;
6589 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6590 shader->info.num_input_vgprs += 1;
6591 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6592 shader->info.num_input_vgprs += 1;
6593 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6594 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6595 shader->info.num_input_vgprs += 1;
6596 }
6597 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6598 shader->info.num_input_vgprs += 1;
6599 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6600 shader->info.num_input_vgprs += 1;
6601 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6602 shader->info.num_input_vgprs += 1;
6603 }
6604
6605 return 0;
6606 }
6607
6608 /**
6609 * Create, compile and return a shader part (prolog or epilog).
6610 *
6611 * \param sscreen screen
6612 * \param list list of shader parts of the same category
6613 * \param type shader type
6614 * \param key shader part key
6615 * \param prolog whether the part being requested is a prolog
6616 * \param tm LLVM target machine
6617 * \param debug debug callback
6618 * \param build the callback responsible for building the main function
6619 * \return non-NULL on success
6620 */
6621 static struct si_shader_part *
6622 si_get_shader_part(struct si_screen *sscreen,
6623 struct si_shader_part **list,
6624 enum pipe_shader_type type,
6625 bool prolog,
6626 union si_shader_part_key *key,
6627 LLVMTargetMachineRef tm,
6628 struct pipe_debug_callback *debug,
6629 void (*build)(struct si_shader_context *,
6630 union si_shader_part_key *),
6631 const char *name)
6632 {
6633 struct si_shader_part *result;
6634
6635 mtx_lock(&sscreen->shader_parts_mutex);
6636
6637 /* Find existing. */
6638 for (result = *list; result; result = result->next) {
6639 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6640 mtx_unlock(&sscreen->shader_parts_mutex);
6641 return result;
6642 }
6643 }
6644
6645 /* Compile a new one. */
6646 result = CALLOC_STRUCT(si_shader_part);
6647 result->key = *key;
6648
6649 struct si_shader shader = {};
6650 struct si_shader_context ctx;
6651 struct gallivm_state *gallivm = &ctx.gallivm;
6652
6653 si_init_shader_ctx(&ctx, sscreen, tm);
6654 ctx.shader = &shader;
6655 ctx.type = type;
6656
6657 switch (type) {
6658 case PIPE_SHADER_VERTEX:
6659 break;
6660 case PIPE_SHADER_TESS_CTRL:
6661 assert(!prolog);
6662 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6663 break;
6664 case PIPE_SHADER_GEOMETRY:
6665 assert(prolog);
6666 break;
6667 case PIPE_SHADER_FRAGMENT:
6668 if (prolog)
6669 shader.key.part.ps.prolog = key->ps_prolog.states;
6670 else
6671 shader.key.part.ps.epilog = key->ps_epilog.states;
6672 break;
6673 default:
6674 unreachable("bad shader part");
6675 }
6676
6677 build(&ctx, key);
6678
6679 /* Compile. */
6680 si_llvm_optimize_module(&ctx);
6681
6682 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6683 gallivm->module, debug, ctx.type, name)) {
6684 FREE(result);
6685 result = NULL;
6686 goto out;
6687 }
6688
6689 result->next = *list;
6690 *list = result;
6691
6692 out:
6693 si_llvm_dispose(&ctx);
6694 mtx_unlock(&sscreen->shader_parts_mutex);
6695 return result;
6696 }
6697
6698 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6699 {
6700 struct gallivm_state *gallivm = &ctx->gallivm;
6701 LLVMValueRef ptr[2], list;
6702
6703 /* Get the pointer to rw buffers. */
6704 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6705 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6706 list = lp_build_gather_values(gallivm, ptr, 2);
6707 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6708 list = LLVMBuildIntToPtr(gallivm->builder, list,
6709 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6710 return list;
6711 }
6712
/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMTypeRef *returns;
	LLVMValueRef ret, func;
	int num_returns, i;
	/* Index of the first VS system-value VGPR among the function params:
	 * all input SGPRs come first, then (on merged GFX9 stages) the VGPRs
	 * belonging to the next stage. */
	unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
				 key->vs_prolog.num_merged_next_stage_vgprs;
	/* The VS itself always receives 4 system-value VGPRs. */
	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
				      num_input_vgprs;
	/* On merged stages user SGPRs start after 8 system SGPRs —
	 * presumably the merged-shader ABI; confirm against si_create_function
	 * callers. */
	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

	si_init_function_info(&fninfo);

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
			 sizeof(LLVMTypeRef));
	num_returns = 0;

	/* Declare input and output SGPRs.
	 * Every input SGPR is also returned so the main part sees them. */
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[num_returns++] = ctx->i32;
	}

	/* Preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < num_input_vgprs; i++) {
		add_arg(&fninfo, ARG_VGPR, ctx->i32);
		returns[num_returns++] = ctx->f32;
	}

	/* Remember where VertexID and InstanceID live; as_ls shifts
	 * InstanceID by one extra VGPR (REL_PATCH_ID sits in between —
	 * TODO confirm against the LS input layout). */
	fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
	fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;

	/* Vertex load indices (one f32 return per used vertex attribute). */
	for (i = 0; i <= key->vs_prolog.last_input; i++)
		returns[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
	func = ctx->main_fn;

	/* For a non-monolithic merged shader, initialize EXEC from the
	 * wave-info input SGPR (arg 3, bit-offset 0). */
	if (key->vs_prolog.num_merged_next_stage_vgprs &&
	    !key->vs_prolog.is_monolithic)
		si_init_exec_from_input(ctx, 3, 0);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}
	for (; i < fninfo.num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		/* VGPR returns are declared as f32; reinterpret the i32. */
		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	LLVMValueRef instance_divisor_constbuf = NULL;

	/* Divisors that aren't 0 or 1 are fetched from a constant buffer. */
	if (key->vs_prolog.states.instance_divisor_is_fetched) {
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
		LLVMValueRef buf_index =
			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
		instance_divisor_constbuf =
			ac_build_indexed_load_const(&ctx->ac, list, buf_index);
	}

	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		bool divisor_is_one =
			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
		bool divisor_is_fetched =
			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
		LLVMValueRef index;

		if (divisor_is_one || divisor_is_fetched) {
			LLVMValueRef divisor = ctx->i32_1;

			if (divisor_is_fetched) {
				/* Each divisor is a dword at offset i*4. */
				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
							    LLVMConstInt(ctx->i32, i * 4, 0));
				divisor = LLVMBuildBitCast(gallivm->builder, divisor,
							   ctx->i32, "");
			}

			/* InstanceID / Divisor + StartInstance */
			index = get_instance_index_for_fetch(ctx,
							     user_sgpr_base +
							     SI_SGPR_START_INSTANCE,
							     divisor);
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(gallivm->builder,
					     ctx->abi.vertex_id,
					     LLVMGetParam(func, user_sgpr_base +
							  SI_SGPR_BASE_VERTEX), "");
		}

		/* Load indices are returned as f32, after all inputs. */
		index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
					   fninfo.num_params + i, "");
	}

	si_llvm_build_ret(ctx, ret);
}
6840
6841 static bool si_get_vs_prolog(struct si_screen *sscreen,
6842 LLVMTargetMachineRef tm,
6843 struct si_shader *shader,
6844 struct pipe_debug_callback *debug,
6845 struct si_shader *main_part,
6846 const struct si_vs_prolog_bits *key)
6847 {
6848 struct si_shader_selector *vs = main_part->selector;
6849
6850 /* The prolog is a no-op if there are no inputs. */
6851 if (!vs->vs_needs_prolog)
6852 return true;
6853
6854 /* Get the prolog. */
6855 union si_shader_part_key prolog_key;
6856 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6857 key, shader, &prolog_key);
6858
6859 shader->prolog =
6860 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6861 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6862 debug, si_build_vs_prolog_function,
6863 "Vertex Shader Prolog");
6864 return shader->prolog != NULL;
6865 }
6866
6867 /**
6868 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6869 */
6870 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6871 LLVMTargetMachineRef tm,
6872 struct si_shader *shader,
6873 struct pipe_debug_callback *debug)
6874 {
6875 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6876 &shader->key.part.vs.prolog);
6877 }
6878
/**
 * Compile the TCS epilog function. This writes tesselation factors to memory
 * based on the output primitive type of the tesselator (determined by TES).
 *
 * NOTE(review): the argument declarations below define the epilog's input
 * layout; their order must mirror the main TCS function signature —
 * presumably so the wrapper can pass registers straight through. Confirm
 * against the TCS create_function path before changing anything here.
 */
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef func;

	si_init_function_info(&fninfo);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* Merged LS+HS input layout. Only the parameters the epilog
		 * actually reads get a ctx->param_* slot; the unnamed args
		 * are placeholders that keep the layout intact. */
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	} else {
		/* Pre-GFX9 standalone TCS input layout. */
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	}

	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	unsigned tess_factors_idx =
		add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
	declare_lds_as_pointer(ctx);
	func = ctx->main_fn;

	/* The three consecutive VGPRs starting at tess_factors_idx:
	 * rel_patch_id, invocation_id, tf_lds_offset. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, tess_factors_idx),
			      LLVMGetParam(func, tess_factors_idx + 1),
			      LLVMGetParam(func, tess_factors_idx + 2));

	LLVMBuildRetVoid(gallivm->builder);
}
6948
6949 /**
6950 * Select and compile (or reuse) TCS parts (epilog).
6951 */
6952 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6953 LLVMTargetMachineRef tm,
6954 struct si_shader *shader,
6955 struct pipe_debug_callback *debug)
6956 {
6957 if (sscreen->b.chip_class >= GFX9) {
6958 struct si_shader *ls_main_part =
6959 shader->key.part.tcs.ls->main_shader_part_ls;
6960
6961 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
6962 &shader->key.part.tcs.ls_prolog))
6963 return false;
6964
6965 shader->previous_stage = ls_main_part;
6966 }
6967
6968 /* Get the epilog. */
6969 union si_shader_part_key epilog_key;
6970 memset(&epilog_key, 0, sizeof(epilog_key));
6971 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6972
6973 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6974 PIPE_SHADER_TESS_CTRL, false,
6975 &epilog_key, tm, debug,
6976 si_build_tcs_epilog_function,
6977 "Tessellation Control Shader Epilog");
6978 return shader->epilog != NULL;
6979 }
6980
6981 /**
6982 * Select and compile (or reuse) GS parts (prolog).
6983 */
6984 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
6985 LLVMTargetMachineRef tm,
6986 struct si_shader *shader,
6987 struct pipe_debug_callback *debug)
6988 {
6989 if (sscreen->b.chip_class >= GFX9) {
6990 struct si_shader *es_main_part =
6991 shader->key.part.gs.es->main_shader_part_es;
6992
6993 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
6994 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
6995 &shader->key.part.gs.vs_prolog))
6996 return false;
6997
6998 shader->previous_stage = es_main_part;
6999 }
7000
7001 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7002 return true;
7003
7004 union si_shader_part_key prolog_key;
7005 memset(&prolog_key, 0, sizeof(prolog_key));
7006 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7007
7008 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7009 PIPE_SHADER_GEOMETRY, true,
7010 &prolog_key, tm, debug,
7011 si_build_gs_prolog_function,
7012 "Geometry Shader Prolog");
7013 return shader->prolog2 != NULL;
7014 }
7015
/**
 * Build the pixel shader prolog function. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overriden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 *
 * NOTE(review): the fixed VGPR offsets used below (base+0 PERSP_SAMPLE,
 * base+2 PERSP_CENTER, base+4 PERSP_CENTROID, base+6 LINEAR_SAMPLE,
 * base+8 LINEAR_CENTER, base+10 LINEAR_CENTROID) follow the SPI PS input
 * layout — confirm against the SPI_PS_INPUT_* register documentation.
 */
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMValueRef ret, func;
	int num_returns, i, num_color_channels;

	assert(si_need_ps_prolog(key));

	si_init_function_info(&fninfo);

	/* Declare inputs. */
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		add_arg(&fninfo, ARG_SGPR, ctx->i32);

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		add_arg(&fninfo, ARG_VGPR, ctx->f32);

	/* Declare outputs (same as inputs + add colors if needed) */
	num_returns = fninfo.num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		fninfo.types[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
			   &fninfo, 0);
	func = ctx->main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < fninfo.num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);

		si_llvm_emit_polygon_stipple(ctx, list, pos);
	}

	if (key->ps_prolog.states.bc_optimize_for_persp ||
	    key->ps_prolog.states.bc_optimize_for_linear) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;

		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 *
		 * PRIM_MASK is after user SGPRs.
		 */
		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
		/* Extract bit 31 as an i1 condition. */
		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
					     ctx->i1, "");

		if (key->ps_prolog.states.bc_optimize_for_persp) {
			/* Read PERSP_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 2 + i);
			/* Read PERSP_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 4 + i);
			/* Select PERSP_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 4 + i, "");
			}
		}
		if (key->ps_prolog.states.bc_optimize_for_linear) {
			/* Read LINEAR_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 8 + i);
			/* Read LINEAR_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 10 + i);
			/* Select LINEAR_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 10 + i, "");
			}
		}
	}

	/* Force per-sample interpolation: replace CENTER and CENTROID
	 * barycentrics with the SAMPLE ones in the outputs. */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Force center interpolation: replace SAMPLE and CENTROID
	 * barycentrics with the CENTER ones in the outputs. */
	if (key->ps_prolog.states.force_persp_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_center[2];

		/* Read PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			persp_center[i] = LLVMGetParam(func, base + 2 + i);
		/* Overwrite PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_center[2];

		/* Read LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			linear_center[i] = LLVMGetParam(func, base + 8 + i);
		/* Overwrite LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 6 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 10 + i, "");
	}

	/* Interpolate colors. Two iterations: COLOR0 and COLOR1.
	 * Interpolated channels are appended after all pass-through params. */
	unsigned color_out_idx = 0;
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Get the (i,j) updated by bc_optimize handling. */
			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr, "");
			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr + 1, "");
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
		}

		interp_fs_input(ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   fninfo.num_params + color_out_idx++, "");
		}
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	si_llvm_build_ret(ctx, ret);
}
7247
/**
 * Build the pixel shader epilog function. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 */
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int i;
	struct si_ps_exports exp = {};

	si_init_function_info(&fninfo);

	/* Declare input SGPRs. */
	ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);

	/* Declare input VGPRs: 4 per written color buffer, plus one each
	 * for Z, stencil, and sample mask if written. */
	unsigned required_num_params =
		     fninfo.num_sgpr_params +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Pad up to the minimum sample-mask location so the epilog's VGPR
	 * layout always covers it. */
	required_num_params = MAX2(required_num_params,
				   fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	while (fninfo.num_params < required_num_params)
		add_arg(&fninfo, ARG_VGPR, ctx->f32);

	/* Create the function. */
	si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
	/* Disable elimination of unused inputs. */
	si_llvm_add_attribute(ctx->main_fn,
			      "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = fninfo.num_sgpr_params;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. Only relevant when there is no Z/
	 * stencil/samplemask export, which would otherwise come last. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* Otherwise: the highest MRT with a non-zero export
			 * format among the written colors. */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Export each written color; VGPRs are consumed 4 at a time in
	 * ascending MRT order. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    fninfo.num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);

	/* The hw requires at least one export; emit a null export if
	 * nothing else was exported. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
}
7346
7347 /**
7348 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7349 */
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Looks up (or builds and caches) the prolog and epilog parts matching this
 * shader's key in the screen-wide part caches, then patches
 * config.spi_ps_input_ena so the register state matches what the selected
 * parts actually consume.
 *
 * \return false on compilation failure of either part.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, tm, debug,
					   si_build_ps_prolog_function,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog.  Unlike the prolog, an epilog is always needed
	 * (it performs the color/depth exports). */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, tm, debug,
				   si_build_ps_epilog_function,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled, because the
	 * prolog reads the fixed-point position to look up the stipple
	 * pattern. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 *
	 * Each of the four blocks below rewrites one interpolation mode:
	 * it clears the two modes the prolog is forced to override and
	 * enables the replacement, keeping exactly one mode per
	 * perspective/linear pair requested by the prolog key. */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POS_W_FLOAT requires that one of the perspective weights is enabled.
	 * (0xf covers the PERSP_SAMPLE/CENTER/CENTROID/PULL_MODEL bits.) */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled.
	 * (0x7f covers all PERSP_* and LINEAR_* enable bits.) */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7443
7444 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7445 unsigned *lds_size)
7446 {
7447 /* SPI barrier management bug:
7448 * Make sure we have at least 4k of LDS in use to avoid the bug.
7449 * It applies to workgroup sizes of more than one wavefront.
7450 */
7451 if (sscreen->b.family == CHIP_BONAIRE ||
7452 sscreen->b.family == CHIP_KABINI ||
7453 sscreen->b.family == CHIP_MULLINS)
7454 *lds_size = MAX2(*lds_size, 8);
7455 }
7456
7457 static void si_fix_resource_usage(struct si_screen *sscreen,
7458 struct si_shader *shader)
7459 {
7460 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7461
7462 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7463
7464 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7465 si_get_max_workgroup_size(shader) > 64) {
7466 si_multiwave_lds_size_workaround(sscreen,
7467 &shader->config.lds_size);
7468 }
7469 }
7470
/**
 * Create a complete, uploadable shader variant.
 *
 * Either compiles the whole shader monolithically, or assembles it from the
 * pre-compiled main part plus selected prolog/epilog parts, then merges
 * register usage, dumps the result for debugging, and uploads the binary.
 *
 * \return 0 on success, negative on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return -1;

		/* Copy the compiled TGSI shader data over.  The binary is
		 * shared with (and owned by) the main part; is_binary_shared
		 * tells si_shader_destroy not to free it. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			/* TES has no prolog/epilog parts. */
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the final allocation must be
		 * the maximum over all parts, since they execute as one
		 * hardware shader. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			/* gfx9 merged shaders: also fold in the previous
			 * stage's register/spill/scratch usage. */
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7607
7608 void si_shader_destroy(struct si_shader *shader)
7609 {
7610 if (shader->scratch_bo)
7611 r600_resource_reference(&shader->scratch_bo, NULL);
7612
7613 r600_resource_reference(&shader->bo, NULL);
7614
7615 if (!shader->is_binary_shared)
7616 radeon_shader_binary_clean(&shader->binary);
7617
7618 free(shader->shader_log);
7619 }