radeonsi: store shader function arguments in a structure
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49
/* Names of the SCRATCH_RSRC_DWORD0 / SCRATCH_RSRC_DWORD1 symbols, kept as
 * strings so they can be referenced by name. */
static const char *scratch_rsrc_dword0_symbol = "SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol = "SCRATCH_RSRC_DWORD1";
55
56 struct si_shader_output_values
57 {
58 LLVMValueRef values[4];
59 unsigned semantic_name;
60 unsigned semantic_index;
61 ubyte vertex_stream[4];
62 };
63
64 /**
65 * Used to collect types and other info about arguments of the LLVM function
66 * before the function is created.
67 */
68 struct si_function_info {
69 LLVMTypeRef types[100];
70 unsigned num_sgpr_params;
71 unsigned num_params;
72 };
73
/* Register file a function argument is placed in. */
enum si_arg_regfile {
	ARG_SGPR, /* scalar register argument */
	ARG_VGPR  /* vector register argument */
};
78
79 static void si_init_shader_ctx(struct si_shader_context *ctx,
80 struct si_screen *sscreen,
81 LLVMTargetMachineRef tm);
82
83 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
84 struct lp_build_tgsi_context *bld_base,
85 struct lp_build_emit_data *emit_data);
86
87 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
88 FILE *f);
89
90 static unsigned llvm_get_type_size(LLVMTypeRef type);
91
92 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
93 union si_shader_part_key *key);
94 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
95 union si_shader_part_key *key);
96 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
97 union si_shader_part_key *key);
98 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
99 union si_shader_part_key *key);
100
/* The sample mask input of the PS epilog is ideally passed as v13, its
 * usual location, so that the shader does not need an extra v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
105
/* Address space numbers (presumably LLVM address spaces - confirm). */
enum {
	CONST_ADDR_SPACE = 2, /* constant memory */
	LOCAL_ADDR_SPACE = 3, /* local memory */
};
110
111 static bool is_merged_shader(struct si_shader *shader)
112 {
113 if (shader->selector->screen->b.chip_class <= VI)
114 return false;
115
116 return shader->key.as_ls ||
117 shader->key.as_es ||
118 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
119 shader->selector->type == PIPE_SHADER_GEOMETRY;
120 }
121
122 static void si_init_function_info(struct si_function_info *fninfo)
123 {
124 fninfo->num_params = 0;
125 fninfo->num_sgpr_params = 0;
126 }
127
128 static unsigned add_arg(struct si_function_info *fninfo,
129 enum si_arg_regfile regfile, LLVMTypeRef type)
130 {
131 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
132
133 unsigned idx = fninfo->num_params++;
134 assert(idx < ARRAY_SIZE(fninfo->types));
135
136 if (regfile == ARG_SGPR)
137 fninfo->num_sgpr_params = fninfo->num_params;
138
139 fninfo->types[idx] = type;
140 return idx;
141 }
142
143 static void add_arg_checked(struct si_function_info *fninfo,
144 enum si_arg_regfile regfile, LLVMTypeRef type,
145 unsigned idx)
146 {
147 MAYBE_UNUSED unsigned actual = add_arg(fninfo, regfile, type);
148 assert(actual == idx);
149 }
150
151 /**
152 * Returns a unique index for a per-patch semantic name and index. The index
153 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
154 * can be calculated.
155 */
156 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
157 {
158 switch (semantic_name) {
159 case TGSI_SEMANTIC_TESSOUTER:
160 return 0;
161 case TGSI_SEMANTIC_TESSINNER:
162 return 1;
163 case TGSI_SEMANTIC_PATCH:
164 assert(index < 30);
165 return 2 + index;
166
167 default:
168 assert(!"invalid semantic name");
169 return 0;
170 }
171 }
172
173 /**
174 * Returns a unique index for a semantic name and index. The index must be
175 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
176 * calculated.
177 */
178 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
179 {
180 switch (semantic_name) {
181 case TGSI_SEMANTIC_POSITION:
182 return 0;
183 case TGSI_SEMANTIC_GENERIC:
184 /* Since some shader stages use the the highest used IO index
185 * to determine the size to allocate for inputs/outputs
186 * (in LDS, tess and GS rings). GENERIC should be placed right
187 * after POSITION to make that size as small as possible.
188 */
189 if (index < SI_MAX_IO_GENERIC)
190 return 1 + index;
191
192 assert(!"invalid generic index");
193 return 0;
194 case TGSI_SEMANTIC_PSIZE:
195 return SI_MAX_IO_GENERIC + 1;
196 case TGSI_SEMANTIC_CLIPDIST:
197 assert(index <= 1);
198 return SI_MAX_IO_GENERIC + 2 + index;
199 case TGSI_SEMANTIC_FOG:
200 return SI_MAX_IO_GENERIC + 4;
201 case TGSI_SEMANTIC_LAYER:
202 return SI_MAX_IO_GENERIC + 5;
203 case TGSI_SEMANTIC_VIEWPORT_INDEX:
204 return SI_MAX_IO_GENERIC + 6;
205 case TGSI_SEMANTIC_PRIMID:
206 return SI_MAX_IO_GENERIC + 7;
207 case TGSI_SEMANTIC_COLOR: /* these alias */
208 case TGSI_SEMANTIC_BCOLOR:
209 assert(index < 2);
210 return SI_MAX_IO_GENERIC + 8 + index;
211 case TGSI_SEMANTIC_TEXCOORD:
212 assert(index < 8);
213 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
214 return SI_MAX_IO_GENERIC + 10 + index;
215 default:
216 assert(!"invalid semantic name");
217 return 0;
218 }
219 }
220
221 /**
222 * Helper function that builds an LLVM IR PHI node and immediately adds
223 * incoming edges.
224 */
225 static LLVMValueRef
226 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
227 unsigned count_incoming, LLVMValueRef *values,
228 LLVMBasicBlockRef *blocks)
229 {
230 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
231 LLVMAddIncoming(phi, values, blocks, count_incoming);
232 return phi;
233 }
234
235 /**
236 * Get the value of a shader input parameter and extract a bitfield.
237 */
238 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
239 unsigned param, unsigned rshift,
240 unsigned bitwidth)
241 {
242 struct gallivm_state *gallivm = &ctx->gallivm;
243 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
244 param);
245
246 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
247 value = bitcast(&ctx->bld_base,
248 TGSI_TYPE_UNSIGNED, value);
249
250 if (rshift)
251 value = LLVMBuildLShr(gallivm->builder, value,
252 LLVMConstInt(ctx->i32, rshift, 0), "");
253
254 if (rshift + bitwidth < 32) {
255 unsigned mask = (1 << bitwidth) - 1;
256 value = LLVMBuildAnd(gallivm->builder, value,
257 LLVMConstInt(ctx->i32, mask, 0), "");
258 }
259
260 return value;
261 }
262
263 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
264 {
265 switch (ctx->type) {
266 case PIPE_SHADER_TESS_CTRL:
267 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
268
269 case PIPE_SHADER_TESS_EVAL:
270 return LLVMGetParam(ctx->main_fn,
271 ctx->param_tes_rel_patch_id);
272
273 default:
274 assert(0);
275 return NULL;
276 }
277 }
278
279 /* Tessellation shaders pass outputs to the next shader using LDS.
280 *
281 * LS outputs = TCS inputs
282 * TCS outputs = TES inputs
283 *
284 * The LDS layout is:
285 * - TCS inputs for patch 0
286 * - TCS inputs for patch 1
287 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
288 * - ...
289 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
290 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
291 * - TCS outputs for patch 1
292 * - Per-patch TCS outputs for patch 1
293 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
294 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
295 * - ...
296 *
297 * All three shaders VS(LS), TCS, TES share the same LDS space.
298 */
299
300 static LLVMValueRef
301 get_tcs_in_patch_stride(struct si_shader_context *ctx)
302 {
303 return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
304 }
305
306 static LLVMValueRef
307 get_tcs_out_patch_stride(struct si_shader_context *ctx)
308 {
309 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
310 }
311
312 static LLVMValueRef
313 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
314 {
315 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
316 unpack_param(ctx,
317 ctx->param_tcs_out_lds_offsets,
318 0, 16),
319 4);
320 }
321
322 static LLVMValueRef
323 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
324 {
325 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
326 unpack_param(ctx,
327 ctx->param_tcs_out_lds_offsets,
328 16, 16),
329 4);
330 }
331
332 static LLVMValueRef
333 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
334 {
335 struct gallivm_state *gallivm = &ctx->gallivm;
336 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
337 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
338
339 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
340 }
341
342 static LLVMValueRef
343 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
344 {
345 struct gallivm_state *gallivm = &ctx->gallivm;
346 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
347 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
348 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
349
350 return LLVMBuildAdd(gallivm->builder, patch0_offset,
351 LLVMBuildMul(gallivm->builder, patch_stride,
352 rel_patch_id, ""),
353 "");
354 }
355
356 static LLVMValueRef
357 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
358 {
359 struct gallivm_state *gallivm = &ctx->gallivm;
360 LLVMValueRef patch0_patch_data_offset =
361 get_tcs_out_patch0_patch_data_offset(ctx);
362 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
363 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
364
365 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
366 LLVMBuildMul(gallivm->builder, patch_stride,
367 rel_patch_id, ""),
368 "");
369 }
370
371 static LLVMValueRef get_instance_index_for_fetch(
372 struct si_shader_context *ctx,
373 unsigned param_start_instance, LLVMValueRef divisor)
374 {
375 struct gallivm_state *gallivm = &ctx->gallivm;
376
377 LLVMValueRef result = LLVMGetParam(ctx->main_fn,
378 ctx->param_instance_id);
379
380 /* The division must be done before START_INSTANCE is added. */
381 if (divisor != ctx->i32_1)
382 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
383
384 return LLVMBuildAdd(gallivm->builder, result,
385 LLVMGetParam(ctx->main_fn, param_start_instance), "");
386 }
387
388 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
389 * to float. */
390 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
391 LLVMValueRef vec4,
392 unsigned double_index)
393 {
394 LLVMBuilderRef builder = ctx->gallivm.builder;
395 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
396 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
397 LLVMVectorType(f64, 2), "");
398 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
399 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
400 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
401 }
402
403 static void declare_input_vs(
404 struct si_shader_context *ctx,
405 unsigned input_index,
406 const struct tgsi_full_declaration *decl,
407 LLVMValueRef out[4])
408 {
409 struct gallivm_state *gallivm = &ctx->gallivm;
410
411 unsigned chan;
412 unsigned fix_fetch;
413 unsigned num_fetches;
414 unsigned fetch_stride;
415
416 LLVMValueRef t_list_ptr;
417 LLVMValueRef t_offset;
418 LLVMValueRef t_list;
419 LLVMValueRef vertex_index;
420 LLVMValueRef input[3];
421
422 /* Load the T list */
423 t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
424
425 t_offset = LLVMConstInt(ctx->i32, input_index, 0);
426
427 t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
428
429 vertex_index = LLVMGetParam(ctx->main_fn,
430 ctx->param_vertex_index0 +
431 input_index);
432
433 fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
434
435 /* Do multiple loads for special formats. */
436 switch (fix_fetch) {
437 case SI_FIX_FETCH_RGB_64_FLOAT:
438 num_fetches = 3; /* 3 2-dword loads */
439 fetch_stride = 8;
440 break;
441 case SI_FIX_FETCH_RGBA_64_FLOAT:
442 num_fetches = 2; /* 2 4-dword loads */
443 fetch_stride = 16;
444 break;
445 case SI_FIX_FETCH_RGB_8:
446 case SI_FIX_FETCH_RGB_8_INT:
447 num_fetches = 3;
448 fetch_stride = 1;
449 break;
450 case SI_FIX_FETCH_RGB_16:
451 case SI_FIX_FETCH_RGB_16_INT:
452 num_fetches = 3;
453 fetch_stride = 2;
454 break;
455 default:
456 num_fetches = 1;
457 fetch_stride = 0;
458 }
459
460 for (unsigned i = 0; i < num_fetches; i++) {
461 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
462
463 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
464 vertex_index, voffset,
465 true);
466 }
467
468 /* Break up the vec4 into individual components */
469 for (chan = 0; chan < 4; chan++) {
470 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
471 out[chan] = LLVMBuildExtractElement(gallivm->builder,
472 input[0], llvm_chan, "");
473 }
474
475 switch (fix_fetch) {
476 case SI_FIX_FETCH_A2_SNORM:
477 case SI_FIX_FETCH_A2_SSCALED:
478 case SI_FIX_FETCH_A2_SINT: {
479 /* The hardware returns an unsigned value; convert it to a
480 * signed one.
481 */
482 LLVMValueRef tmp = out[3];
483 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
484
485 /* First, recover the sign-extended signed integer value. */
486 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
487 tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
488 else
489 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");
490
491 /* For the integer-like cases, do a natural sign extension.
492 *
493 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
494 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
495 * exponent.
496 */
497 tmp = LLVMBuildShl(gallivm->builder, tmp,
498 fix_fetch == SI_FIX_FETCH_A2_SNORM ?
499 LLVMConstInt(ctx->i32, 7, 0) : c30, "");
500 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");
501
502 /* Convert back to the right type. */
503 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
504 LLVMValueRef clamp;
505 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
506 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
507 clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
508 tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
509 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
510 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
511 }
512
513 out[3] = tmp;
514 break;
515 }
516 case SI_FIX_FETCH_RGBA_32_UNORM:
517 case SI_FIX_FETCH_RGBX_32_UNORM:
518 for (chan = 0; chan < 4; chan++) {
519 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
520 ctx->i32, "");
521 out[chan] = LLVMBuildUIToFP(gallivm->builder,
522 out[chan], ctx->f32, "");
523 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
524 LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
525 }
526 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
527 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
528 out[3] = LLVMConstReal(ctx->f32, 1);
529 break;
530 case SI_FIX_FETCH_RGBA_32_SNORM:
531 case SI_FIX_FETCH_RGBX_32_SNORM:
532 case SI_FIX_FETCH_RGBA_32_FIXED:
533 case SI_FIX_FETCH_RGBX_32_FIXED: {
534 double scale;
535 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
536 scale = 1.0 / 0x10000;
537 else
538 scale = 1.0 / INT_MAX;
539
540 for (chan = 0; chan < 4; chan++) {
541 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
542 ctx->i32, "");
543 out[chan] = LLVMBuildSIToFP(gallivm->builder,
544 out[chan], ctx->f32, "");
545 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
546 LLVMConstReal(ctx->f32, scale), "");
547 }
548 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
549 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
550 fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
551 out[3] = LLVMConstReal(ctx->f32, 1);
552 break;
553 }
554 case SI_FIX_FETCH_RGBA_32_USCALED:
555 for (chan = 0; chan < 4; chan++) {
556 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
557 ctx->i32, "");
558 out[chan] = LLVMBuildUIToFP(gallivm->builder,
559 out[chan], ctx->f32, "");
560 }
561 break;
562 case SI_FIX_FETCH_RGBA_32_SSCALED:
563 for (chan = 0; chan < 4; chan++) {
564 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
565 ctx->i32, "");
566 out[chan] = LLVMBuildSIToFP(gallivm->builder,
567 out[chan], ctx->f32, "");
568 }
569 break;
570 case SI_FIX_FETCH_RG_64_FLOAT:
571 for (chan = 0; chan < 2; chan++)
572 out[chan] = extract_double_to_float(ctx, input[0], chan);
573
574 out[2] = LLVMConstReal(ctx->f32, 0);
575 out[3] = LLVMConstReal(ctx->f32, 1);
576 break;
577 case SI_FIX_FETCH_RGB_64_FLOAT:
578 for (chan = 0; chan < 3; chan++)
579 out[chan] = extract_double_to_float(ctx, input[chan], 0);
580
581 out[3] = LLVMConstReal(ctx->f32, 1);
582 break;
583 case SI_FIX_FETCH_RGBA_64_FLOAT:
584 for (chan = 0; chan < 4; chan++) {
585 out[chan] = extract_double_to_float(ctx, input[chan / 2],
586 chan % 2);
587 }
588 break;
589 case SI_FIX_FETCH_RGB_8:
590 case SI_FIX_FETCH_RGB_8_INT:
591 case SI_FIX_FETCH_RGB_16:
592 case SI_FIX_FETCH_RGB_16_INT:
593 for (chan = 0; chan < 3; chan++) {
594 out[chan] = LLVMBuildExtractElement(gallivm->builder,
595 input[chan],
596 ctx->i32_0, "");
597 }
598 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
599 fix_fetch == SI_FIX_FETCH_RGB_16) {
600 out[3] = LLVMConstReal(ctx->f32, 1);
601 } else {
602 out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
603 ctx->f32, "");
604 }
605 break;
606 }
607 }
608
609 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
610 unsigned swizzle)
611 {
612 struct si_shader_context *ctx = si_shader_context(bld_base);
613
614 if (swizzle > 0)
615 return ctx->i32_0;
616
617 switch (ctx->type) {
618 case PIPE_SHADER_VERTEX:
619 return LLVMGetParam(ctx->main_fn,
620 ctx->param_vs_prim_id);
621 case PIPE_SHADER_TESS_CTRL:
622 return LLVMGetParam(ctx->main_fn,
623 ctx->param_tcs_patch_id);
624 case PIPE_SHADER_TESS_EVAL:
625 return LLVMGetParam(ctx->main_fn,
626 ctx->param_tes_patch_id);
627 case PIPE_SHADER_GEOMETRY:
628 return LLVMGetParam(ctx->main_fn,
629 ctx->param_gs_prim_id);
630 default:
631 assert(0);
632 return ctx->i32_0;
633 }
634 }
635
636 /**
637 * Return the value of tgsi_ind_register for indexing.
638 * This is the indirect index with the constant offset added to it.
639 */
640 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
641 const struct tgsi_ind_register *ind,
642 int rel_index)
643 {
644 struct gallivm_state *gallivm = &ctx->gallivm;
645 LLVMValueRef result;
646
647 result = ctx->addrs[ind->Index][ind->Swizzle];
648 result = LLVMBuildLoad(gallivm->builder, result, "");
649 result = LLVMBuildAdd(gallivm->builder, result,
650 LLVMConstInt(ctx->i32, rel_index, 0), "");
651 return result;
652 }
653
654 /**
655 * Like get_indirect_index, but restricts the return value to a (possibly
656 * undefined) value inside [0..num).
657 */
658 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
659 const struct tgsi_ind_register *ind,
660 int rel_index, unsigned num)
661 {
662 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
663
664 return si_llvm_bound_index(ctx, result, num);
665 }
666
667
668 /**
669 * Calculate a dword address given an input or output register and a stride.
670 */
671 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
672 const struct tgsi_full_dst_register *dst,
673 const struct tgsi_full_src_register *src,
674 LLVMValueRef vertex_dw_stride,
675 LLVMValueRef base_addr)
676 {
677 struct gallivm_state *gallivm = &ctx->gallivm;
678 struct tgsi_shader_info *info = &ctx->shader->selector->info;
679 ubyte *name, *index, *array_first;
680 int first, param;
681 struct tgsi_full_dst_register reg;
682
683 /* Set the register description. The address computation is the same
684 * for sources and destinations. */
685 if (src) {
686 reg.Register.File = src->Register.File;
687 reg.Register.Index = src->Register.Index;
688 reg.Register.Indirect = src->Register.Indirect;
689 reg.Register.Dimension = src->Register.Dimension;
690 reg.Indirect = src->Indirect;
691 reg.Dimension = src->Dimension;
692 reg.DimIndirect = src->DimIndirect;
693 } else
694 reg = *dst;
695
696 /* If the register is 2-dimensional (e.g. an array of vertices
697 * in a primitive), calculate the base address of the vertex. */
698 if (reg.Register.Dimension) {
699 LLVMValueRef index;
700
701 if (reg.Dimension.Indirect)
702 index = get_indirect_index(ctx, &reg.DimIndirect,
703 reg.Dimension.Index);
704 else
705 index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
706
707 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
708 LLVMBuildMul(gallivm->builder, index,
709 vertex_dw_stride, ""), "");
710 }
711
712 /* Get information about the register. */
713 if (reg.Register.File == TGSI_FILE_INPUT) {
714 name = info->input_semantic_name;
715 index = info->input_semantic_index;
716 array_first = info->input_array_first;
717 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
718 name = info->output_semantic_name;
719 index = info->output_semantic_index;
720 array_first = info->output_array_first;
721 } else {
722 assert(0);
723 return NULL;
724 }
725
726 if (reg.Register.Indirect) {
727 /* Add the relative address of the element. */
728 LLVMValueRef ind_index;
729
730 if (reg.Indirect.ArrayID)
731 first = array_first[reg.Indirect.ArrayID];
732 else
733 first = reg.Register.Index;
734
735 ind_index = get_indirect_index(ctx, &reg.Indirect,
736 reg.Register.Index - first);
737
738 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
739 LLVMBuildMul(gallivm->builder, ind_index,
740 LLVMConstInt(ctx->i32, 4, 0), ""), "");
741
742 param = reg.Register.Dimension ?
743 si_shader_io_get_unique_index(name[first], index[first]) :
744 si_shader_io_get_unique_index_patch(name[first], index[first]);
745 } else {
746 param = reg.Register.Dimension ?
747 si_shader_io_get_unique_index(name[reg.Register.Index],
748 index[reg.Register.Index]) :
749 si_shader_io_get_unique_index_patch(name[reg.Register.Index],
750 index[reg.Register.Index]);
751 }
752
753 /* Add the base address of the element. */
754 return LLVMBuildAdd(gallivm->builder, base_addr,
755 LLVMConstInt(ctx->i32, param * 4, 0), "");
756 }
757
758 /* The offchip buffer layout for TCS->TES is
759 *
760 * - attribute 0 of patch 0 vertex 0
761 * - attribute 0 of patch 0 vertex 1
762 * - attribute 0 of patch 0 vertex 2
763 * ...
764 * - attribute 0 of patch 1 vertex 0
765 * - attribute 0 of patch 1 vertex 1
766 * ...
767 * - attribute 1 of patch 0 vertex 0
768 * - attribute 1 of patch 0 vertex 1
769 * ...
770 * - per patch attribute 0 of patch 0
771 * - per patch attribute 0 of patch 1
772 * ...
773 *
774 * Note that every attribute has 4 components.
775 */
776 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
777 LLVMValueRef rel_patch_id,
778 LLVMValueRef vertex_index,
779 LLVMValueRef param_index)
780 {
781 struct gallivm_state *gallivm = &ctx->gallivm;
782 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
783 LLVMValueRef param_stride, constant16;
784
785 vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
786 num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
787 total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
788 num_patches, "");
789
790 constant16 = LLVMConstInt(ctx->i32, 16, 0);
791 if (vertex_index) {
792 base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
793 vertices_per_patch, "");
794
795 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
796 vertex_index, "");
797
798 param_stride = total_vertices;
799 } else {
800 base_addr = rel_patch_id;
801 param_stride = num_patches;
802 }
803
804 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
805 LLVMBuildMul(gallivm->builder, param_index,
806 param_stride, ""), "");
807
808 base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
809
810 if (!vertex_index) {
811 LLVMValueRef patch_data_offset =
812 unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
813
814 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
815 patch_data_offset, "");
816 }
817 return base_addr;
818 }
819
820 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
821 struct si_shader_context *ctx,
822 const struct tgsi_full_dst_register *dst,
823 const struct tgsi_full_src_register *src)
824 {
825 struct gallivm_state *gallivm = &ctx->gallivm;
826 struct tgsi_shader_info *info = &ctx->shader->selector->info;
827 ubyte *name, *index, *array_first;
828 struct tgsi_full_src_register reg;
829 LLVMValueRef vertex_index = NULL;
830 LLVMValueRef param_index = NULL;
831 unsigned param_index_base, param_base;
832
833 reg = src ? *src : tgsi_full_src_register_from_dst(dst);
834
835 if (reg.Register.Dimension) {
836
837 if (reg.Dimension.Indirect)
838 vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
839 reg.Dimension.Index);
840 else
841 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
842 }
843
844 /* Get information about the register. */
845 if (reg.Register.File == TGSI_FILE_INPUT) {
846 name = info->input_semantic_name;
847 index = info->input_semantic_index;
848 array_first = info->input_array_first;
849 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
850 name = info->output_semantic_name;
851 index = info->output_semantic_index;
852 array_first = info->output_array_first;
853 } else {
854 assert(0);
855 return NULL;
856 }
857
858 if (reg.Register.Indirect) {
859 if (reg.Indirect.ArrayID)
860 param_base = array_first[reg.Indirect.ArrayID];
861 else
862 param_base = reg.Register.Index;
863
864 param_index = get_indirect_index(ctx, &reg.Indirect,
865 reg.Register.Index - param_base);
866
867 } else {
868 param_base = reg.Register.Index;
869 param_index = ctx->i32_0;
870 }
871
872 param_index_base = reg.Register.Dimension ?
873 si_shader_io_get_unique_index(name[param_base], index[param_base]) :
874 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);
875
876 param_index = LLVMBuildAdd(gallivm->builder, param_index,
877 LLVMConstInt(ctx->i32, param_index_base, 0),
878 "");
879
880 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
881 vertex_index, param_index);
882 }
883
884 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
885 enum tgsi_opcode_type type, unsigned swizzle,
886 LLVMValueRef buffer, LLVMValueRef offset,
887 LLVMValueRef base, bool can_speculate)
888 {
889 struct si_shader_context *ctx = si_shader_context(bld_base);
890 struct gallivm_state *gallivm = &ctx->gallivm;
891 LLVMValueRef value, value2;
892 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
893 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
894
895 if (swizzle == ~0) {
896 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
897 0, 1, 0, can_speculate, false);
898
899 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
900 }
901
902 if (!tgsi_type_is_64bit(type)) {
903 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
904 0, 1, 0, can_speculate, false);
905
906 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
907 return LLVMBuildExtractElement(gallivm->builder, value,
908 LLVMConstInt(ctx->i32, swizzle, 0), "");
909 }
910
911 value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
912 swizzle * 4, 1, 0, can_speculate, false);
913
914 value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
915 swizzle * 4 + 4, 1, 0, can_speculate, false);
916
917 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
918 }
919
920 /**
921 * Load from LDS.
922 *
923 * \param type output value type
924 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
925 * \param dw_addr address in dwords
926 */
927 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
928 enum tgsi_opcode_type type, unsigned swizzle,
929 LLVMValueRef dw_addr)
930 {
931 struct si_shader_context *ctx = si_shader_context(bld_base);
932 struct gallivm_state *gallivm = &ctx->gallivm;
933 LLVMValueRef value;
934
935 if (swizzle == ~0) {
936 LLVMValueRef values[TGSI_NUM_CHANNELS];
937
938 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
939 values[chan] = lds_load(bld_base, type, chan, dw_addr);
940
941 return lp_build_gather_values(gallivm, values,
942 TGSI_NUM_CHANNELS);
943 }
944
945 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
946 LLVMConstInt(ctx->i32, swizzle, 0));
947
948 value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
949 if (tgsi_type_is_64bit(type)) {
950 LLVMValueRef value2;
951 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
952 ctx->i32_1);
953 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
954 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
955 }
956
957 return LLVMBuildBitCast(gallivm->builder, value,
958 tgsi2llvmtype(bld_base, type), "");
959 }
960
961 /**
962 * Store to LDS.
963 *
964 * \param swizzle offset (typically 0..3)
965 * \param dw_addr address in dwords
966 * \param value value to store
967 */
968 static void lds_store(struct lp_build_tgsi_context *bld_base,
969 unsigned dw_offset_imm, LLVMValueRef dw_addr,
970 LLVMValueRef value)
971 {
972 struct si_shader_context *ctx = si_shader_context(bld_base);
973 struct gallivm_state *gallivm = &ctx->gallivm;
974
975 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
976 LLVMConstInt(ctx->i32, dw_offset_imm, 0));
977
978 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
979 ac_build_indexed_store(&ctx->ac, ctx->lds,
980 dw_addr, value);
981 }
982
983 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
984 unsigned param)
985 {
986 LLVMBuilderRef builder = ctx->gallivm.builder;
987
988 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
989 addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
990 addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
991
992 uint64_t desc2 = 0xffffffff;
993 uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
994 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
995 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
996 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
997 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
998 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
999 LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
1000
1001 LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
1002 desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
1003 desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
1004 return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
1005 }
1006
1007 static LLVMValueRef fetch_input_tcs(
1008 struct lp_build_tgsi_context *bld_base,
1009 const struct tgsi_full_src_register *reg,
1010 enum tgsi_opcode_type type, unsigned swizzle)
1011 {
1012 struct si_shader_context *ctx = si_shader_context(bld_base);
1013 LLVMValueRef dw_addr, stride;
1014
1015 stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
1016 dw_addr = get_tcs_in_current_patch_offset(ctx);
1017 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1018
1019 return lds_load(bld_base, type, swizzle, dw_addr);
1020 }
1021
1022 static LLVMValueRef fetch_output_tcs(
1023 struct lp_build_tgsi_context *bld_base,
1024 const struct tgsi_full_src_register *reg,
1025 enum tgsi_opcode_type type, unsigned swizzle)
1026 {
1027 struct si_shader_context *ctx = si_shader_context(bld_base);
1028 LLVMValueRef dw_addr, stride;
1029
1030 if (reg->Register.Dimension) {
1031 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1032 dw_addr = get_tcs_out_current_patch_offset(ctx);
1033 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1034 } else {
1035 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1036 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1037 }
1038
1039 return lds_load(bld_base, type, swizzle, dw_addr);
1040 }
1041
1042 static LLVMValueRef fetch_input_tes(
1043 struct lp_build_tgsi_context *bld_base,
1044 const struct tgsi_full_src_register *reg,
1045 enum tgsi_opcode_type type, unsigned swizzle)
1046 {
1047 struct si_shader_context *ctx = si_shader_context(bld_base);
1048 LLVMValueRef buffer, base, addr;
1049
1050 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1051
1052 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1053 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1054
1055 return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
1056 }
1057
1058 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1059 const struct tgsi_full_instruction *inst,
1060 const struct tgsi_opcode_info *info,
1061 LLVMValueRef dst[4])
1062 {
1063 struct si_shader_context *ctx = si_shader_context(bld_base);
1064 struct gallivm_state *gallivm = &ctx->gallivm;
1065 const struct tgsi_full_dst_register *reg = &inst->Dst[0];
1066 const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1067 unsigned chan_index;
1068 LLVMValueRef dw_addr, stride;
1069 LLVMValueRef buffer, base, buf_addr;
1070 LLVMValueRef values[4];
1071 bool skip_lds_store;
1072 bool is_tess_factor = false;
1073
1074 /* Only handle per-patch and per-vertex outputs here.
1075 * Vectors will be lowered to scalars and this function will be called again.
1076 */
1077 if (reg->Register.File != TGSI_FILE_OUTPUT ||
1078 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1079 si_llvm_emit_store(bld_base, inst, info, dst);
1080 return;
1081 }
1082
1083 if (reg->Register.Dimension) {
1084 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1085 dw_addr = get_tcs_out_current_patch_offset(ctx);
1086 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1087 skip_lds_store = !sh_info->reads_pervertex_outputs;
1088 } else {
1089 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1090 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1091 skip_lds_store = !sh_info->reads_perpatch_outputs;
1092
1093 if (!reg->Register.Indirect) {
1094 int name = sh_info->output_semantic_name[reg->Register.Index];
1095
1096 /* Always write tess factors into LDS for the TCS epilog. */
1097 if (name == TGSI_SEMANTIC_TESSINNER ||
1098 name == TGSI_SEMANTIC_TESSOUTER) {
1099 skip_lds_store = false;
1100 is_tess_factor = true;
1101 }
1102 }
1103 }
1104
1105 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1106
1107 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1108 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1109
1110
1111 TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1112 LLVMValueRef value = dst[chan_index];
1113
1114 if (inst->Instruction.Saturate)
1115 value = ac_build_clamp(&ctx->ac, value);
1116
1117 /* Skip LDS stores if there is no LDS read of this output. */
1118 if (!skip_lds_store)
1119 lds_store(bld_base, chan_index, dw_addr, value);
1120
1121 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1122 values[chan_index] = value;
1123
1124 if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
1125 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1126 buf_addr, base,
1127 4 * chan_index, 1, 0, true, false);
1128 }
1129 }
1130
1131 if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
1132 LLVMValueRef value = lp_build_gather_values(gallivm,
1133 values, 4);
1134 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1135 base, 0, 1, 0, true, false);
1136 }
1137 }
1138
1139 static LLVMValueRef fetch_input_gs(
1140 struct lp_build_tgsi_context *bld_base,
1141 const struct tgsi_full_src_register *reg,
1142 enum tgsi_opcode_type type,
1143 unsigned swizzle)
1144 {
1145 struct si_shader_context *ctx = si_shader_context(bld_base);
1146 struct si_shader *shader = ctx->shader;
1147 struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1148 struct gallivm_state *gallivm = &ctx->gallivm;
1149 LLVMValueRef vtx_offset, soffset;
1150 struct tgsi_shader_info *info = &shader->selector->info;
1151 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1152 unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1153 unsigned param;
1154 LLVMValueRef value;
1155
1156 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1157 return get_primitive_id(bld_base, swizzle);
1158
1159 if (!reg->Register.Dimension)
1160 return NULL;
1161
1162 param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1163
1164 /* GFX9 has the ESGS ring in LDS. */
1165 if (ctx->screen->b.chip_class >= GFX9) {
1166 unsigned index = reg->Dimension.Index;
1167
1168 switch (index / 2) {
1169 case 0:
1170 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
1171 index % 2 ? 16 : 0, 16);
1172 break;
1173 case 1:
1174 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
1175 index % 2 ? 16 : 0, 16);
1176 break;
1177 case 2:
1178 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
1179 index % 2 ? 16 : 0, 16);
1180 break;
1181 default:
1182 assert(0);
1183 return NULL;
1184 }
1185
1186 vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
1187 LLVMConstInt(ctx->i32, param * 4, 0), "");
1188 return lds_load(bld_base, type, swizzle, vtx_offset);
1189 }
1190
1191 /* GFX6: input load from the ESGS ring in memory. */
1192 if (swizzle == ~0) {
1193 LLVMValueRef values[TGSI_NUM_CHANNELS];
1194 unsigned chan;
1195 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1196 values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1197 }
1198 return lp_build_gather_values(gallivm, values,
1199 TGSI_NUM_CHANNELS);
1200 }
1201
1202 /* Get the vertex offset parameter on GFX6. */
1203 unsigned vtx_offset_param = reg->Dimension.Index;
1204 if (vtx_offset_param < 2) {
1205 vtx_offset_param += ctx->param_gs_vtx0_offset;
1206 } else {
1207 assert(vtx_offset_param < 6);
1208 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
1209 }
1210 vtx_offset = lp_build_mul_imm(uint,
1211 LLVMGetParam(ctx->main_fn,
1212 vtx_offset_param),
1213 4);
1214
1215 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1216
1217 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1218 vtx_offset, soffset, 0, 1, 0, true, false);
1219 if (tgsi_type_is_64bit(type)) {
1220 LLVMValueRef value2;
1221 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1222
1223 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1224 ctx->i32_0, vtx_offset, soffset,
1225 0, 1, 0, true, false);
1226 return si_llvm_emit_fetch_64bit(bld_base, type,
1227 value, value2);
1228 }
1229 return LLVMBuildBitCast(gallivm->builder,
1230 value,
1231 tgsi2llvmtype(bld_base, type), "");
1232 }
1233
1234 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1235 {
1236 switch (interpolate) {
1237 case TGSI_INTERPOLATE_CONSTANT:
1238 return 0;
1239
1240 case TGSI_INTERPOLATE_LINEAR:
1241 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1242 return SI_PARAM_LINEAR_SAMPLE;
1243 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1244 return SI_PARAM_LINEAR_CENTROID;
1245 else
1246 return SI_PARAM_LINEAR_CENTER;
1247 break;
1248 case TGSI_INTERPOLATE_COLOR:
1249 case TGSI_INTERPOLATE_PERSPECTIVE:
1250 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1251 return SI_PARAM_PERSP_SAMPLE;
1252 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1253 return SI_PARAM_PERSP_CENTROID;
1254 else
1255 return SI_PARAM_PERSP_CENTER;
1256 break;
1257 default:
1258 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1259 return -1;
1260 }
1261 }
1262
1263 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1264 unsigned attr_index, unsigned chan,
1265 LLVMValueRef prim_mask,
1266 LLVMValueRef i, LLVMValueRef j)
1267 {
1268 if (i || j) {
1269 return ac_build_fs_interp(&ctx->ac,
1270 LLVMConstInt(ctx->i32, chan, 0),
1271 LLVMConstInt(ctx->i32, attr_index, 0),
1272 prim_mask, i, j);
1273 }
1274 return ac_build_fs_interp_mov(&ctx->ac,
1275 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1276 LLVMConstInt(ctx->i32, chan, 0),
1277 LLVMConstInt(ctx->i32, attr_index, 0),
1278 prim_mask);
1279 }
1280
1281 /**
1282 * Interpolate a fragment shader input.
1283 *
1284 * @param ctx context
1285 * @param input_index index of the input in hardware
1286 * @param semantic_name TGSI_SEMANTIC_*
1287 * @param semantic_index semantic index
1288 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1289 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1290 * @param interp_param interpolation weights (i,j)
1291 * @param prim_mask SI_PARAM_PRIM_MASK
1292 * @param face SI_PARAM_FRONT_FACE
1293 * @param result the return value (4 components)
1294 */
1295 static void interp_fs_input(struct si_shader_context *ctx,
1296 unsigned input_index,
1297 unsigned semantic_name,
1298 unsigned semantic_index,
1299 unsigned num_interp_inputs,
1300 unsigned colors_read_mask,
1301 LLVMValueRef interp_param,
1302 LLVMValueRef prim_mask,
1303 LLVMValueRef face,
1304 LLVMValueRef result[4])
1305 {
1306 struct gallivm_state *gallivm = &ctx->gallivm;
1307 LLVMValueRef i = NULL, j = NULL;
1308 unsigned chan;
1309
1310 /* fs.constant returns the param from the middle vertex, so it's not
1311 * really useful for flat shading. It's meant to be used for custom
1312 * interpolation (but the intrinsic can't fetch from the other two
1313 * vertices).
1314 *
1315 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1316 * to do the right thing. The only reason we use fs.constant is that
1317 * fs.interp cannot be used on integers, because they can be equal
1318 * to NaN.
1319 *
1320 * When interp is false we will use fs.constant or for newer llvm,
1321 * amdgcn.interp.mov.
1322 */
1323 bool interp = interp_param != NULL;
1324
1325 if (interp) {
1326 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1327 LLVMVectorType(ctx->f32, 2), "");
1328
1329 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1330 ctx->i32_0, "");
1331 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1332 ctx->i32_1, "");
1333 }
1334
1335 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1336 ctx->shader->key.part.ps.prolog.color_two_side) {
1337 LLVMValueRef is_face_positive;
1338
1339 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1340 * otherwise it's at offset "num_inputs".
1341 */
1342 unsigned back_attr_offset = num_interp_inputs;
1343 if (semantic_index == 1 && colors_read_mask & 0xf)
1344 back_attr_offset += 1;
1345
1346 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1347 face, ctx->i32_0, "");
1348
1349 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1350 LLVMValueRef front, back;
1351
1352 front = si_build_fs_interp(ctx,
1353 input_index, chan,
1354 prim_mask, i, j);
1355 back = si_build_fs_interp(ctx,
1356 back_attr_offset, chan,
1357 prim_mask, i, j);
1358
1359 result[chan] = LLVMBuildSelect(gallivm->builder,
1360 is_face_positive,
1361 front,
1362 back,
1363 "");
1364 }
1365 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1366 result[0] = si_build_fs_interp(ctx, input_index,
1367 0, prim_mask, i, j);
1368 result[1] =
1369 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1370 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1371 } else {
1372 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1373 result[chan] = si_build_fs_interp(ctx,
1374 input_index, chan,
1375 prim_mask, i, j);
1376 }
1377 }
1378 }
1379
1380 static void declare_input_fs(
1381 struct si_shader_context *ctx,
1382 unsigned input_index,
1383 const struct tgsi_full_declaration *decl,
1384 LLVMValueRef out[4])
1385 {
1386 struct lp_build_context *base = &ctx->bld_base.base;
1387 struct si_shader *shader = ctx->shader;
1388 LLVMValueRef main_fn = ctx->main_fn;
1389 LLVMValueRef interp_param = NULL;
1390 int interp_param_idx;
1391
1392 /* Get colors from input VGPRs (set by the prolog). */
1393 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1394 unsigned i = decl->Semantic.Index;
1395 unsigned colors_read = shader->selector->info.colors_read;
1396 unsigned mask = colors_read >> (i * 4);
1397 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1398 (i ? util_bitcount(colors_read & 0xf) : 0);
1399
1400 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1401 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1402 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1403 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1404 return;
1405 }
1406
1407 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1408 decl->Interp.Location);
1409 if (interp_param_idx == -1)
1410 return;
1411 else if (interp_param_idx) {
1412 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1413 }
1414
1415 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1416 decl->Semantic.Index, 0, /* this param is unused */
1417 shader->selector->info.colors_read, interp_param,
1418 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1419 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1420 &out[0]);
1421 }
1422
1423 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1424 {
1425 return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1426 }
1427
1428
1429 /**
1430 * Load a dword from a constant buffer.
1431 */
1432 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1433 LLVMValueRef resource,
1434 LLVMValueRef offset)
1435 {
1436 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1437 0, 0, 0, true, true);
1438 }
1439
1440 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1441 {
1442 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1443 struct gallivm_state *gallivm = &ctx->gallivm;
1444 LLVMBuilderRef builder = gallivm->builder;
1445 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1446 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1447 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1448
1449 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1450 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1451 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1452
1453 LLVMValueRef pos[4] = {
1454 buffer_load_const(ctx, resource, offset0),
1455 buffer_load_const(ctx, resource, offset1),
1456 LLVMConstReal(ctx->f32, 0),
1457 LLVMConstReal(ctx->f32, 0)
1458 };
1459
1460 return lp_build_gather_values(gallivm, pos, 4);
1461 }
1462
1463 static void declare_system_value(struct si_shader_context *ctx,
1464 unsigned index,
1465 const struct tgsi_full_declaration *decl)
1466 {
1467 struct lp_build_context *bld = &ctx->bld_base.base;
1468 struct gallivm_state *gallivm = &ctx->gallivm;
1469 LLVMValueRef value = 0;
1470
1471 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1472
1473 switch (decl->Semantic.Name) {
1474 case TGSI_SEMANTIC_INSTANCEID:
1475 value = LLVMGetParam(ctx->main_fn,
1476 ctx->param_instance_id);
1477 break;
1478
1479 case TGSI_SEMANTIC_VERTEXID:
1480 value = LLVMBuildAdd(gallivm->builder,
1481 LLVMGetParam(ctx->main_fn,
1482 ctx->param_vertex_id),
1483 LLVMGetParam(ctx->main_fn,
1484 ctx->param_base_vertex), "");
1485 break;
1486
1487 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1488 /* Unused. Clarify the meaning in indexed vs. non-indexed
1489 * draws if this is ever used again. */
1490 assert(false);
1491 break;
1492
1493 case TGSI_SEMANTIC_BASEVERTEX:
1494 {
1495 /* For non-indexed draws, the base vertex set by the driver
1496 * (for direct draws) or the CP (for indirect draws) is the
1497 * first vertex ID, but GLSL expects 0 to be returned.
1498 */
1499 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1500 LLVMValueRef indexed;
1501
1502 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1503 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1504
1505 value = LLVMBuildSelect(gallivm->builder, indexed,
1506 LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1507 ctx->i32_0, "");
1508 break;
1509 }
1510
1511 case TGSI_SEMANTIC_BASEINSTANCE:
1512 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1513 break;
1514
1515 case TGSI_SEMANTIC_DRAWID:
1516 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1517 break;
1518
1519 case TGSI_SEMANTIC_INVOCATIONID:
1520 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1521 value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1522 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1523 value = LLVMGetParam(ctx->main_fn,
1524 ctx->param_gs_instance_id);
1525 else
1526 assert(!"INVOCATIONID not implemented");
1527 break;
1528
1529 case TGSI_SEMANTIC_POSITION:
1530 {
1531 LLVMValueRef pos[4] = {
1532 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1533 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1534 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1535 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1536 LLVMGetParam(ctx->main_fn,
1537 SI_PARAM_POS_W_FLOAT)),
1538 };
1539 value = lp_build_gather_values(gallivm, pos, 4);
1540 break;
1541 }
1542
1543 case TGSI_SEMANTIC_FACE:
1544 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1545 break;
1546
1547 case TGSI_SEMANTIC_SAMPLEID:
1548 value = get_sample_id(ctx);
1549 break;
1550
1551 case TGSI_SEMANTIC_SAMPLEPOS: {
1552 LLVMValueRef pos[4] = {
1553 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1554 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1555 LLVMConstReal(ctx->f32, 0),
1556 LLVMConstReal(ctx->f32, 0)
1557 };
1558 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1559 TGSI_OPCODE_FRC, pos[0]);
1560 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1561 TGSI_OPCODE_FRC, pos[1]);
1562 value = lp_build_gather_values(gallivm, pos, 4);
1563 break;
1564 }
1565
1566 case TGSI_SEMANTIC_SAMPLEMASK:
1567 /* This can only occur with the OpenGL Core profile, which
1568 * doesn't support smoothing.
1569 */
1570 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1571 break;
1572
1573 case TGSI_SEMANTIC_TESSCOORD:
1574 {
1575 LLVMValueRef coord[4] = {
1576 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1577 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1578 bld->zero,
1579 bld->zero
1580 };
1581
1582 /* For triangles, the vector should be (u, v, 1-u-v). */
1583 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1584 PIPE_PRIM_TRIANGLES)
1585 coord[2] = lp_build_sub(bld, bld->one,
1586 lp_build_add(bld, coord[0], coord[1]));
1587
1588 value = lp_build_gather_values(gallivm, coord, 4);
1589 break;
1590 }
1591
1592 case TGSI_SEMANTIC_VERTICESIN:
1593 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1594 value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1595 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1596 value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
1597 else
1598 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1599 break;
1600
1601 case TGSI_SEMANTIC_TESSINNER:
1602 case TGSI_SEMANTIC_TESSOUTER:
1603 {
1604 LLVMValueRef buffer, base, addr;
1605 int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);
1606
1607 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1608
1609 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1610 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1611 LLVMConstInt(ctx->i32, param, 0));
1612
1613 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1614 ~0, buffer, base, addr, true);
1615
1616 break;
1617 }
1618
1619 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1620 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1621 {
1622 LLVMValueRef buf, slot, val[4];
1623 int i, offset;
1624
1625 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1626 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1627 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1628 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1629
1630 for (i = 0; i < 4; i++)
1631 val[i] = buffer_load_const(ctx, buf,
1632 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1633 value = lp_build_gather_values(gallivm, val, 4);
1634 break;
1635 }
1636
1637 case TGSI_SEMANTIC_PRIMID:
1638 value = get_primitive_id(&ctx->bld_base, 0);
1639 break;
1640
1641 case TGSI_SEMANTIC_GRID_SIZE:
1642 value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
1643 break;
1644
1645 case TGSI_SEMANTIC_BLOCK_SIZE:
1646 {
1647 LLVMValueRef values[3];
1648 unsigned i;
1649 unsigned *properties = ctx->shader->selector->info.properties;
1650
1651 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1652 unsigned sizes[3] = {
1653 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1654 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1655 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1656 };
1657
1658 for (i = 0; i < 3; ++i)
1659 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1660
1661 value = lp_build_gather_values(gallivm, values, 3);
1662 } else {
1663 value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1664 }
1665 break;
1666 }
1667
1668 case TGSI_SEMANTIC_BLOCK_ID:
1669 {
1670 LLVMValueRef values[3];
1671
1672 for (int i = 0; i < 3; i++) {
1673 values[i] = ctx->i32_0;
1674 if (ctx->param_block_id[i] >= 0) {
1675 values[i] = LLVMGetParam(ctx->main_fn,
1676 ctx->param_block_id[i]);
1677 }
1678 }
1679 value = lp_build_gather_values(gallivm, values, 3);
1680 break;
1681 }
1682
1683 case TGSI_SEMANTIC_THREAD_ID:
1684 value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
1685 break;
1686
1687 case TGSI_SEMANTIC_HELPER_INVOCATION:
1688 value = lp_build_intrinsic(gallivm->builder,
1689 "llvm.amdgcn.ps.live",
1690 ctx->i1, NULL, 0,
1691 LP_FUNC_ATTR_READNONE);
1692 value = LLVMBuildNot(gallivm->builder, value, "");
1693 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1694 break;
1695
1696 case TGSI_SEMANTIC_SUBGROUP_SIZE:
1697 value = LLVMConstInt(ctx->i32, 64, 0);
1698 break;
1699
1700 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1701 value = ac_get_thread_id(&ctx->ac);
1702 break;
1703
1704 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1705 {
1706 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1707 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1708 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1709 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1710 break;
1711 }
1712
1713 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1714 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1715 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1716 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1717 {
1718 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1719 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1720 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1721 /* All bits set except LSB */
1722 value = LLVMConstInt(ctx->i64, -2, 0);
1723 } else {
1724 /* All bits set */
1725 value = LLVMConstInt(ctx->i64, -1, 0);
1726 }
1727 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1728 value = LLVMBuildShl(gallivm->builder, value, id, "");
1729 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1730 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1731 value = LLVMBuildNot(gallivm->builder, value, "");
1732 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1733 break;
1734 }
1735
1736 default:
1737 assert(!"unknown system value");
1738 return;
1739 }
1740
1741 ctx->system_values[index] = value;
1742 }
1743
1744 static void declare_compute_memory(struct si_shader_context *ctx,
1745 const struct tgsi_full_declaration *decl)
1746 {
1747 struct si_shader_selector *sel = ctx->shader->selector;
1748 struct gallivm_state *gallivm = &ctx->gallivm;
1749
1750 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1751 LLVMValueRef var;
1752
1753 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1754 assert(decl->Range.First == decl->Range.Last);
1755 assert(!ctx->shared_memory);
1756
1757 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1758 LLVMArrayType(ctx->i8, sel->local_size),
1759 "compute_lds",
1760 LOCAL_ADDR_SPACE);
1761 LLVMSetAlignment(var, 4);
1762
1763 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1764 }
1765
1766 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1767 {
1768 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1769 ctx->param_const_and_shader_buffers);
1770
1771 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1772 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1773 }
1774
1775 static LLVMValueRef fetch_constant(
1776 struct lp_build_tgsi_context *bld_base,
1777 const struct tgsi_full_src_register *reg,
1778 enum tgsi_opcode_type type,
1779 unsigned swizzle)
1780 {
1781 struct si_shader_context *ctx = si_shader_context(bld_base);
1782 struct lp_build_context *base = &bld_base->base;
1783 const struct tgsi_ind_register *ireg = &reg->Indirect;
1784 unsigned buf, idx;
1785
1786 LLVMValueRef addr, bufp;
1787 LLVMValueRef result;
1788
1789 if (swizzle == LP_CHAN_ALL) {
1790 unsigned chan;
1791 LLVMValueRef values[4];
1792 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1793 values[chan] = fetch_constant(bld_base, reg, type, chan);
1794
1795 return lp_build_gather_values(&ctx->gallivm, values, 4);
1796 }
1797
1798 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1799 idx = reg->Register.Index * 4 + swizzle;
1800
1801 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1802 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1803 LLVMValueRef index;
1804 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
1805 reg->Dimension.Index,
1806 ctx->num_const_buffers);
1807 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1808 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1809 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1810 } else
1811 bufp = load_const_buffer_desc(ctx, buf);
1812
1813 if (reg->Register.Indirect) {
1814 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1815 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1816 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1817 addr = lp_build_add(&bld_base->uint_bld, addr,
1818 LLVMConstInt(ctx->i32, idx * 4, 0));
1819 } else {
1820 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1821 }
1822
1823 result = buffer_load_const(ctx, bufp, addr);
1824
1825 if (!tgsi_type_is_64bit(type))
1826 result = bitcast(bld_base, type, result);
1827 else {
1828 LLVMValueRef addr2, result2;
1829
1830 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1831 LLVMConstInt(ctx->i32, 4, 0));
1832 result2 = buffer_load_const(ctx, bufp, addr2);
1833
1834 result = si_llvm_emit_fetch_64bit(bld_base, type,
1835 result, result2);
1836 }
1837 return result;
1838 }
1839
1840 /* Upper 16 bits must be zero. */
1841 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1842 LLVMValueRef val[2])
1843 {
1844 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1845 LLVMBuildShl(ctx->gallivm.builder, val[1],
1846 LLVMConstInt(ctx->i32, 16, 0),
1847 ""), "");
1848 }
1849
1850 /* Upper 16 bits are ignored and will be dropped. */
1851 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1852 LLVMValueRef val[2])
1853 {
1854 LLVMValueRef v[2] = {
1855 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1856 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1857 val[1],
1858 };
1859 return si_llvm_pack_two_int16(ctx, v);
1860 }
1861
1862 /* Initialize arguments for the shader export intrinsic */
1863 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1864 LLVMValueRef *values,
1865 unsigned target,
1866 struct ac_export_args *args)
1867 {
1868 struct si_shader_context *ctx = si_shader_context(bld_base);
1869 struct lp_build_context *base = &bld_base->base;
1870 LLVMBuilderRef builder = ctx->gallivm.builder;
1871 LLVMValueRef val[4];
1872 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1873 unsigned chan;
1874 bool is_int8, is_int10;
1875
1876 /* Default is 0xf. Adjusted below depending on the format. */
1877 args->enabled_channels = 0xf; /* writemask */
1878
1879 /* Specify whether the EXEC mask represents the valid mask */
1880 args->valid_mask = 0;
1881
1882 /* Specify whether this is the last export */
1883 args->done = 0;
1884
1885 /* Specify the target we are exporting */
1886 args->target = target;
1887
1888 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1889 const struct si_shader_key *key = &ctx->shader->key;
1890 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1891 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1892
1893 assert(cbuf >= 0 && cbuf < 8);
1894 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1895 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1896 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1897 }
1898
1899 args->compr = false;
1900 args->out[0] = base->undef;
1901 args->out[1] = base->undef;
1902 args->out[2] = base->undef;
1903 args->out[3] = base->undef;
1904
1905 switch (spi_shader_col_format) {
1906 case V_028714_SPI_SHADER_ZERO:
1907 args->enabled_channels = 0; /* writemask */
1908 args->target = V_008DFC_SQ_EXP_NULL;
1909 break;
1910
1911 case V_028714_SPI_SHADER_32_R:
1912 args->enabled_channels = 1; /* writemask */
1913 args->out[0] = values[0];
1914 break;
1915
1916 case V_028714_SPI_SHADER_32_GR:
1917 args->enabled_channels = 0x3; /* writemask */
1918 args->out[0] = values[0];
1919 args->out[1] = values[1];
1920 break;
1921
1922 case V_028714_SPI_SHADER_32_AR:
1923 args->enabled_channels = 0x9; /* writemask */
1924 args->out[0] = values[0];
1925 args->out[3] = values[3];
1926 break;
1927
1928 case V_028714_SPI_SHADER_FP16_ABGR:
1929 args->compr = 1; /* COMPR flag */
1930
1931 for (chan = 0; chan < 2; chan++) {
1932 LLVMValueRef pack_args[2] = {
1933 values[2 * chan],
1934 values[2 * chan + 1]
1935 };
1936 LLVMValueRef packed;
1937
1938 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1939 args->out[chan] =
1940 LLVMBuildBitCast(ctx->gallivm.builder,
1941 packed, ctx->f32, "");
1942 }
1943 break;
1944
1945 case V_028714_SPI_SHADER_UNORM16_ABGR:
1946 for (chan = 0; chan < 4; chan++) {
1947 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1948 val[chan] = LLVMBuildFMul(builder, val[chan],
1949 LLVMConstReal(ctx->f32, 65535), "");
1950 val[chan] = LLVMBuildFAdd(builder, val[chan],
1951 LLVMConstReal(ctx->f32, 0.5), "");
1952 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1953 ctx->i32, "");
1954 }
1955
1956 args->compr = 1; /* COMPR flag */
1957 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1958 si_llvm_pack_two_int16(ctx, val));
1959 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1960 si_llvm_pack_two_int16(ctx, val+2));
1961 break;
1962
1963 case V_028714_SPI_SHADER_SNORM16_ABGR:
1964 for (chan = 0; chan < 4; chan++) {
1965 /* Clamp between [-1, 1]. */
1966 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1967 values[chan],
1968 LLVMConstReal(ctx->f32, 1));
1969 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1970 val[chan],
1971 LLVMConstReal(ctx->f32, -1));
1972 /* Convert to a signed integer in [-32767, 32767]. */
1973 val[chan] = LLVMBuildFMul(builder, val[chan],
1974 LLVMConstReal(ctx->f32, 32767), "");
1975 /* If positive, add 0.5, else add -0.5. */
1976 val[chan] = LLVMBuildFAdd(builder, val[chan],
1977 LLVMBuildSelect(builder,
1978 LLVMBuildFCmp(builder, LLVMRealOGE,
1979 val[chan], base->zero, ""),
1980 LLVMConstReal(ctx->f32, 0.5),
1981 LLVMConstReal(ctx->f32, -0.5), ""), "");
1982 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1983 }
1984
1985 args->compr = 1; /* COMPR flag */
1986 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1987 si_llvm_pack_two_int32_as_int16(ctx, val));
1988 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1989 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1990 break;
1991
1992 case V_028714_SPI_SHADER_UINT16_ABGR: {
1993 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1994 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1995 LLVMValueRef max_alpha =
1996 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1997
1998 /* Clamp. */
1999 for (chan = 0; chan < 4; chan++) {
2000 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2001 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2002 val[chan],
2003 chan == 3 ? max_alpha : max_rgb);
2004 }
2005
2006 args->compr = 1; /* COMPR flag */
2007 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2008 si_llvm_pack_two_int16(ctx, val));
2009 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2010 si_llvm_pack_two_int16(ctx, val+2));
2011 break;
2012 }
2013
2014 case V_028714_SPI_SHADER_SINT16_ABGR: {
2015 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2016 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2017 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2018 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2019 LLVMValueRef max_alpha =
2020 !is_int10 ? max_rgb : ctx->i32_1;
2021 LLVMValueRef min_alpha =
2022 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2023
2024 /* Clamp. */
2025 for (chan = 0; chan < 4; chan++) {
2026 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2027 val[chan] = lp_build_emit_llvm_binary(bld_base,
2028 TGSI_OPCODE_IMIN,
2029 val[chan], chan == 3 ? max_alpha : max_rgb);
2030 val[chan] = lp_build_emit_llvm_binary(bld_base,
2031 TGSI_OPCODE_IMAX,
2032 val[chan], chan == 3 ? min_alpha : min_rgb);
2033 }
2034
2035 args->compr = 1; /* COMPR flag */
2036 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2037 si_llvm_pack_two_int32_as_int16(ctx, val));
2038 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2039 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2040 break;
2041 }
2042
2043 case V_028714_SPI_SHADER_32_ABGR:
2044 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2045 break;
2046 }
2047 }
2048
2049 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2050 LLVMValueRef alpha)
2051 {
2052 struct si_shader_context *ctx = si_shader_context(bld_base);
2053
2054 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2055 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2056 SI_PARAM_ALPHA_REF);
2057
2058 LLVMValueRef alpha_pass =
2059 lp_build_cmp(&bld_base->base,
2060 ctx->shader->key.part.ps.epilog.alpha_func,
2061 alpha, alpha_ref);
2062 LLVMValueRef arg =
2063 lp_build_select(&bld_base->base,
2064 alpha_pass,
2065 LLVMConstReal(ctx->f32, 1.0f),
2066 LLVMConstReal(ctx->f32, -1.0f));
2067
2068 ac_build_kill(&ctx->ac, arg);
2069 } else {
2070 ac_build_kill(&ctx->ac, NULL);
2071 }
2072 }
2073
2074 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2075 LLVMValueRef alpha,
2076 unsigned samplemask_param)
2077 {
2078 struct si_shader_context *ctx = si_shader_context(bld_base);
2079 struct gallivm_state *gallivm = &ctx->gallivm;
2080 LLVMValueRef coverage;
2081
2082 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2083 coverage = LLVMGetParam(ctx->main_fn,
2084 samplemask_param);
2085 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2086
2087 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2088 ctx->i32,
2089 &coverage, 1, LP_FUNC_ATTR_READNONE);
2090
2091 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2092 ctx->f32, "");
2093
2094 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2095 LLVMConstReal(ctx->f32,
2096 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2097
2098 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2099 }
2100
2101 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2102 struct ac_export_args *pos, LLVMValueRef *out_elts)
2103 {
2104 struct si_shader_context *ctx = si_shader_context(bld_base);
2105 struct lp_build_context *base = &bld_base->base;
2106 unsigned reg_index;
2107 unsigned chan;
2108 unsigned const_chan;
2109 LLVMValueRef base_elt;
2110 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2111 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2112 SI_VS_CONST_CLIP_PLANES, 0);
2113 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2114
2115 for (reg_index = 0; reg_index < 2; reg_index ++) {
2116 struct ac_export_args *args = &pos[2 + reg_index];
2117
2118 args->out[0] =
2119 args->out[1] =
2120 args->out[2] =
2121 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2122
2123 /* Compute dot products of position and user clip plane vectors */
2124 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2125 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2126 LLVMValueRef addr =
2127 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2128 const_chan) * 4, 0);
2129 base_elt = buffer_load_const(ctx, const_resource,
2130 addr);
2131 args->out[chan] =
2132 lp_build_add(base, args->out[chan],
2133 lp_build_mul(base, base_elt,
2134 out_elts[const_chan]));
2135 }
2136 }
2137
2138 args->enabled_channels = 0xf;
2139 args->valid_mask = 0;
2140 args->done = 0;
2141 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2142 args->compr = 0;
2143 }
2144 }
2145
2146 static void si_dump_streamout(struct pipe_stream_output_info *so)
2147 {
2148 unsigned i;
2149
2150 if (so->num_outputs)
2151 fprintf(stderr, "STREAMOUT\n");
2152
2153 for (i = 0; i < so->num_outputs; i++) {
2154 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2155 so->output[i].start_component;
2156 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2157 i, so->output[i].output_buffer,
2158 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2159 so->output[i].register_index,
2160 mask & 1 ? "x" : "",
2161 mask & 2 ? "y" : "",
2162 mask & 4 ? "z" : "",
2163 mask & 8 ? "w" : "");
2164 }
2165 }
2166
2167 static void emit_streamout_output(struct si_shader_context *ctx,
2168 LLVMValueRef const *so_buffers,
2169 LLVMValueRef const *so_write_offsets,
2170 struct pipe_stream_output *stream_out,
2171 struct si_shader_output_values *shader_out)
2172 {
2173 struct gallivm_state *gallivm = &ctx->gallivm;
2174 LLVMBuilderRef builder = gallivm->builder;
2175 unsigned buf_idx = stream_out->output_buffer;
2176 unsigned start = stream_out->start_component;
2177 unsigned num_comps = stream_out->num_components;
2178 LLVMValueRef out[4];
2179
2180 assert(num_comps && num_comps <= 4);
2181 if (!num_comps || num_comps > 4)
2182 return;
2183
2184 /* Load the output as int. */
2185 for (int j = 0; j < num_comps; j++) {
2186 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2187
2188 out[j] = LLVMBuildBitCast(builder,
2189 shader_out->values[start + j],
2190 ctx->i32, "");
2191 }
2192
2193 /* Pack the output. */
2194 LLVMValueRef vdata = NULL;
2195
2196 switch (num_comps) {
2197 case 1: /* as i32 */
2198 vdata = out[0];
2199 break;
2200 case 2: /* as v2i32 */
2201 case 3: /* as v4i32 (aligned to 4) */
2202 case 4: /* as v4i32 */
2203 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2204 for (int j = 0; j < num_comps; j++) {
2205 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2206 LLVMConstInt(ctx->i32, j, 0), "");
2207 }
2208 break;
2209 }
2210
2211 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2212 vdata, num_comps,
2213 so_write_offsets[buf_idx],
2214 ctx->i32_0,
2215 stream_out->dst_offset * 4, 1, 1, true, false);
2216 }
2217
2218 /**
2219 * Write streamout data to buffers for vertex stream @p stream (different
2220 * vertex streams can occur for GS copy shaders).
2221 */
2222 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2223 struct si_shader_output_values *outputs,
2224 unsigned noutput, unsigned stream)
2225 {
2226 struct si_shader_selector *sel = ctx->shader->selector;
2227 struct pipe_stream_output_info *so = &sel->so;
2228 struct gallivm_state *gallivm = &ctx->gallivm;
2229 LLVMBuilderRef builder = gallivm->builder;
2230 int i;
2231 struct lp_build_if_state if_ctx;
2232
2233 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2234 LLVMValueRef so_vtx_count =
2235 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2236
2237 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2238
2239 /* can_emit = tid < so_vtx_count; */
2240 LLVMValueRef can_emit =
2241 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2242
2243 /* Emit the streamout code conditionally. This actually avoids
2244 * out-of-bounds buffer access. The hw tells us via the SGPR
2245 * (so_vtx_count) which threads are allowed to emit streamout data. */
2246 lp_build_if(&if_ctx, gallivm, can_emit);
2247 {
2248 /* The buffer offset is computed as follows:
2249 * ByteOffset = streamout_offset[buffer_id]*4 +
2250 * (streamout_write_index + thread_id)*stride[buffer_id] +
2251 * attrib_offset
2252 */
2253
2254 LLVMValueRef so_write_index =
2255 LLVMGetParam(ctx->main_fn,
2256 ctx->param_streamout_write_index);
2257
2258 /* Compute (streamout_write_index + thread_id). */
2259 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2260
2261 /* Load the descriptor and compute the write offset for each
2262 * enabled buffer. */
2263 LLVMValueRef so_write_offset[4] = {};
2264 LLVMValueRef so_buffers[4];
2265 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2266 ctx->param_rw_buffers);
2267
2268 for (i = 0; i < 4; i++) {
2269 if (!so->stride[i])
2270 continue;
2271
2272 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2273 SI_VS_STREAMOUT_BUF0 + i, 0);
2274
2275 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2276
2277 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2278 ctx->param_streamout_offset[i]);
2279 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2280
2281 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2282 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2283 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2284 }
2285
2286 /* Write streamout data. */
2287 for (i = 0; i < so->num_outputs; i++) {
2288 unsigned reg = so->output[i].register_index;
2289
2290 if (reg >= noutput)
2291 continue;
2292
2293 if (stream != so->output[i].stream)
2294 continue;
2295
2296 emit_streamout_output(ctx, so_buffers, so_write_offset,
2297 &so->output[i], &outputs[reg]);
2298 }
2299 }
2300 lp_build_endif(&if_ctx);
2301 }
2302
2303 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2304 LLVMValueRef *values)
2305 {
2306 struct ac_export_args args;
2307
2308 si_llvm_init_export_args(&ctx->bld_base, values,
2309 V_008DFC_SQ_EXP_PARAM + index, &args);
2310 ac_build_export(&ctx->ac, &args);
2311 }
2312
2313 static void si_build_param_exports(struct si_shader_context *ctx,
2314 struct si_shader_output_values *outputs,
2315 unsigned noutput)
2316 {
2317 struct si_shader *shader = ctx->shader;
2318 unsigned param_count = 0;
2319
2320 for (unsigned i = 0; i < noutput; i++) {
2321 unsigned semantic_name = outputs[i].semantic_name;
2322 unsigned semantic_index = outputs[i].semantic_index;
2323
2324 if (outputs[i].vertex_stream[0] != 0 &&
2325 outputs[i].vertex_stream[1] != 0 &&
2326 outputs[i].vertex_stream[2] != 0 &&
2327 outputs[i].vertex_stream[3] != 0)
2328 continue;
2329
2330 switch (semantic_name) {
2331 case TGSI_SEMANTIC_LAYER:
2332 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2333 case TGSI_SEMANTIC_CLIPDIST:
2334 case TGSI_SEMANTIC_COLOR:
2335 case TGSI_SEMANTIC_BCOLOR:
2336 case TGSI_SEMANTIC_PRIMID:
2337 case TGSI_SEMANTIC_FOG:
2338 case TGSI_SEMANTIC_TEXCOORD:
2339 case TGSI_SEMANTIC_GENERIC:
2340 break;
2341 default:
2342 continue;
2343 }
2344
2345 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2346 semantic_index < SI_MAX_IO_GENERIC) &&
2347 shader->key.opt.kill_outputs &
2348 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2349 continue;
2350
2351 si_export_param(ctx, param_count, outputs[i].values);
2352
2353 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2354 shader->info.vs_output_param_offset[i] = param_count++;
2355 }
2356
2357 shader->info.nr_param_exports = param_count;
2358 }
2359
2360 /* Generate export instructions for hardware VS shader stage */
2361 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2362 struct si_shader_output_values *outputs,
2363 unsigned noutput)
2364 {
2365 struct si_shader_context *ctx = si_shader_context(bld_base);
2366 struct si_shader *shader = ctx->shader;
2367 struct lp_build_context *base = &bld_base->base;
2368 struct ac_export_args pos_args[4] = {};
2369 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2370 unsigned pos_idx;
2371 int i;
2372
2373 /* Build position exports. */
2374 for (i = 0; i < noutput; i++) {
2375 switch (outputs[i].semantic_name) {
2376 case TGSI_SEMANTIC_POSITION:
2377 si_llvm_init_export_args(bld_base, outputs[i].values,
2378 V_008DFC_SQ_EXP_POS, &pos_args[0]);
2379 break;
2380 case TGSI_SEMANTIC_PSIZE:
2381 psize_value = outputs[i].values[0];
2382 break;
2383 case TGSI_SEMANTIC_LAYER:
2384 layer_value = outputs[i].values[0];
2385 break;
2386 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2387 viewport_index_value = outputs[i].values[0];
2388 break;
2389 case TGSI_SEMANTIC_EDGEFLAG:
2390 edgeflag_value = outputs[i].values[0];
2391 break;
2392 case TGSI_SEMANTIC_CLIPDIST:
2393 if (!shader->key.opt.clip_disable) {
2394 unsigned index = 2 + outputs[i].semantic_index;
2395 si_llvm_init_export_args(bld_base, outputs[i].values,
2396 V_008DFC_SQ_EXP_POS + index,
2397 &pos_args[index]);
2398 }
2399 break;
2400 case TGSI_SEMANTIC_CLIPVERTEX:
2401 if (!shader->key.opt.clip_disable) {
2402 si_llvm_emit_clipvertex(bld_base, pos_args,
2403 outputs[i].values);
2404 }
2405 break;
2406 }
2407 }
2408
2409 /* We need to add the position output manually if it's missing. */
2410 if (!pos_args[0].out[0]) {
2411 pos_args[0].enabled_channels = 0xf; /* writemask */
2412 pos_args[0].valid_mask = 0; /* EXEC mask */
2413 pos_args[0].done = 0; /* last export? */
2414 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2415 pos_args[0].compr = 0; /* COMPR flag */
2416 pos_args[0].out[0] = base->zero; /* X */
2417 pos_args[0].out[1] = base->zero; /* Y */
2418 pos_args[0].out[2] = base->zero; /* Z */
2419 pos_args[0].out[3] = base->one; /* W */
2420 }
2421
2422 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2423 if (shader->selector->info.writes_psize ||
2424 shader->selector->info.writes_edgeflag ||
2425 shader->selector->info.writes_viewport_index ||
2426 shader->selector->info.writes_layer) {
2427 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2428 (shader->selector->info.writes_edgeflag << 1) |
2429 (shader->selector->info.writes_layer << 2);
2430
2431 pos_args[1].valid_mask = 0; /* EXEC mask */
2432 pos_args[1].done = 0; /* last export? */
2433 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2434 pos_args[1].compr = 0; /* COMPR flag */
2435 pos_args[1].out[0] = base->zero; /* X */
2436 pos_args[1].out[1] = base->zero; /* Y */
2437 pos_args[1].out[2] = base->zero; /* Z */
2438 pos_args[1].out[3] = base->zero; /* W */
2439
2440 if (shader->selector->info.writes_psize)
2441 pos_args[1].out[0] = psize_value;
2442
2443 if (shader->selector->info.writes_edgeflag) {
2444 /* The output is a float, but the hw expects an integer
2445 * with the first bit containing the edge flag. */
2446 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2447 edgeflag_value,
2448 ctx->i32, "");
2449 edgeflag_value = lp_build_min(&bld_base->int_bld,
2450 edgeflag_value,
2451 ctx->i32_1);
2452
2453 /* The LLVM intrinsic expects a float. */
2454 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2455 edgeflag_value,
2456 ctx->f32, "");
2457 }
2458
2459 if (ctx->screen->b.chip_class >= GFX9) {
2460 /* GFX9 has the layer in out.z[10:0] and the viewport
2461 * index in out.z[19:16].
2462 */
2463 if (shader->selector->info.writes_layer)
2464 pos_args[1].out[2] = layer_value;
2465
2466 if (shader->selector->info.writes_viewport_index) {
2467 LLVMValueRef v = viewport_index_value;
2468
2469 v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
2470 v = LLVMBuildShl(ctx->gallivm.builder, v,
2471 LLVMConstInt(ctx->i32, 16, 0), "");
2472 v = LLVMBuildOr(ctx->gallivm.builder, v,
2473 bitcast(bld_base, TGSI_TYPE_UNSIGNED,
2474 pos_args[1].out[2]), "");
2475 pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
2476 pos_args[1].enabled_channels |= 1 << 2;
2477 }
2478 } else {
2479 if (shader->selector->info.writes_layer)
2480 pos_args[1].out[2] = layer_value;
2481
2482 if (shader->selector->info.writes_viewport_index) {
2483 pos_args[1].out[3] = viewport_index_value;
2484 pos_args[1].enabled_channels |= 1 << 3;
2485 }
2486 }
2487 }
2488
2489 for (i = 0; i < 4; i++)
2490 if (pos_args[i].out[0])
2491 shader->info.nr_pos_exports++;
2492
2493 pos_idx = 0;
2494 for (i = 0; i < 4; i++) {
2495 if (!pos_args[i].out[0])
2496 continue;
2497
2498 /* Specify the target we are exporting */
2499 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2500
2501 if (pos_idx == shader->info.nr_pos_exports)
2502 /* Specify that this is the last export */
2503 pos_args[i].done = 1;
2504
2505 ac_build_export(&ctx->ac, &pos_args[i]);
2506 }
2507
2508 /* Build parameter exports. */
2509 si_build_param_exports(ctx, outputs, noutput);
2510 }
2511
2512 /**
2513 * Forward all outputs from the vertex shader to the TES. This is only used
2514 * for the fixed function TCS.
2515 */
2516 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2517 {
2518 struct si_shader_context *ctx = si_shader_context(bld_base);
2519 struct gallivm_state *gallivm = &ctx->gallivm;
2520 LLVMValueRef invocation_id, buffer, buffer_offset;
2521 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2522 uint64_t inputs;
2523
2524 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2525 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2526 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2527
2528 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2529 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2530 lds_vertex_stride, "");
2531 lds_base = get_tcs_in_current_patch_offset(ctx);
2532 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2533
2534 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2535 while (inputs) {
2536 unsigned i = u_bit_scan64(&inputs);
2537
2538 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2539 LLVMConstInt(ctx->i32, 4 * i, 0),
2540 "");
2541
2542 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2543 get_rel_patch_id(ctx),
2544 invocation_id,
2545 LLVMConstInt(ctx->i32, i, 0));
2546
2547 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2548 lds_ptr);
2549
2550 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2551 buffer_offset, 0, 1, 0, true, false);
2552 }
2553 }
2554
2555 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2556 LLVMValueRef rel_patch_id,
2557 LLVMValueRef invocation_id,
2558 LLVMValueRef tcs_out_current_patch_data_offset)
2559 {
2560 struct si_shader_context *ctx = si_shader_context(bld_base);
2561 struct gallivm_state *gallivm = &ctx->gallivm;
2562 struct si_shader *shader = ctx->shader;
2563 unsigned tess_inner_index, tess_outer_index;
2564 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2565 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2566 unsigned stride, outer_comps, inner_comps, i, offset;
2567 struct lp_build_if_state if_ctx, inner_if_ctx;
2568
2569 si_llvm_emit_barrier(NULL, bld_base, NULL);
2570
2571 /* Do this only for invocation 0, because the tess levels are per-patch,
2572 * not per-vertex.
2573 *
2574 * This can't jump, because invocation 0 executes this. It should
2575 * at least mask out the loads and stores for other invocations.
2576 */
2577 lp_build_if(&if_ctx, gallivm,
2578 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2579 invocation_id, ctx->i32_0, ""));
2580
2581 /* Determine the layout of one tess factor element in the buffer. */
2582 switch (shader->key.part.tcs.epilog.prim_mode) {
2583 case PIPE_PRIM_LINES:
2584 stride = 2; /* 2 dwords, 1 vec2 store */
2585 outer_comps = 2;
2586 inner_comps = 0;
2587 break;
2588 case PIPE_PRIM_TRIANGLES:
2589 stride = 4; /* 4 dwords, 1 vec4 store */
2590 outer_comps = 3;
2591 inner_comps = 1;
2592 break;
2593 case PIPE_PRIM_QUADS:
2594 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2595 outer_comps = 4;
2596 inner_comps = 2;
2597 break;
2598 default:
2599 assert(0);
2600 return;
2601 }
2602
2603 /* Load tess_inner and tess_outer from LDS.
2604 * Any invocation can write them, so we can't get them from a temporary.
2605 */
2606 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
2607 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
2608
2609 lds_base = tcs_out_current_patch_data_offset;
2610 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2611 LLVMConstInt(ctx->i32,
2612 tess_inner_index * 4, 0), "");
2613 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2614 LLVMConstInt(ctx->i32,
2615 tess_outer_index * 4, 0), "");
2616
2617 for (i = 0; i < 4; i++) {
2618 inner[i] = LLVMGetUndef(ctx->i32);
2619 outer[i] = LLVMGetUndef(ctx->i32);
2620 }
2621
2622 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2623 /* For isolines, the hardware expects tess factors in the
2624 * reverse order from what GLSL / TGSI specify.
2625 */
2626 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2627 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2628 } else {
2629 for (i = 0; i < outer_comps; i++) {
2630 outer[i] = out[i] =
2631 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2632 }
2633 for (i = 0; i < inner_comps; i++) {
2634 inner[i] = out[outer_comps+i] =
2635 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2636 }
2637 }
2638
2639 /* Convert the outputs to vectors for stores. */
2640 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2641 vec1 = NULL;
2642
2643 if (stride > 4)
2644 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2645
2646 /* Get the buffer. */
2647 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2648
2649 /* Get the offset. */
2650 tf_base = LLVMGetParam(ctx->main_fn,
2651 ctx->param_tcs_factor_offset);
2652 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2653 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2654
2655 lp_build_if(&inner_if_ctx, gallivm,
2656 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2657 rel_patch_id, ctx->i32_0, ""));
2658
2659 /* Store the dynamic HS control word. */
2660 offset = 0;
2661 if (ctx->screen->b.chip_class <= VI) {
2662 ac_build_buffer_store_dword(&ctx->ac, buffer,
2663 LLVMConstInt(ctx->i32, 0x80000000, 0),
2664 1, ctx->i32_0, tf_base,
2665 offset, 1, 0, true, false);
2666 offset += 4;
2667 }
2668
2669 lp_build_endif(&inner_if_ctx);
2670
2671 /* Store the tessellation factors. */
2672 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2673 MIN2(stride, 4), byteoffset, tf_base,
2674 offset, 1, 0, true, false);
2675 offset += 16;
2676 if (vec1)
2677 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2678 stride - 4, byteoffset, tf_base,
2679 offset, 1, 0, true, false);
2680
2681 /* Store the tess factors into the offchip buffer if TES reads them. */
2682 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2683 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2684 LLVMValueRef tf_inner_offset;
2685 unsigned param_outer, param_inner;
2686
2687 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2688 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2689
2690 param_outer = si_shader_io_get_unique_index_patch(
2691 TGSI_SEMANTIC_TESSOUTER, 0);
2692 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2693 LLVMConstInt(ctx->i32, param_outer, 0));
2694
2695 outer_vec = lp_build_gather_values(gallivm, outer,
2696 util_next_power_of_two(outer_comps));
2697
2698 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2699 outer_comps, tf_outer_offset,
2700 base, 0, 1, 0, true, false);
2701 if (inner_comps) {
2702 param_inner = si_shader_io_get_unique_index_patch(
2703 TGSI_SEMANTIC_TESSINNER, 0);
2704 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2705 LLVMConstInt(ctx->i32, param_inner, 0));
2706
2707 inner_vec = inner_comps == 1 ? inner[0] :
2708 lp_build_gather_values(gallivm, inner, inner_comps);
2709 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2710 inner_comps, tf_inner_offset,
2711 base, 0, 1, 0, true, false);
2712 }
2713 }
2714
2715 lp_build_endif(&if_ctx);
2716 }
2717
2718 static LLVMValueRef
2719 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2720 unsigned param, unsigned return_index)
2721 {
2722 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2723 LLVMGetParam(ctx->main_fn, param),
2724 return_index, "");
2725 }
2726
2727 static LLVMValueRef
2728 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2729 unsigned param, unsigned return_index)
2730 {
2731 LLVMBuilderRef builder = ctx->gallivm.builder;
2732 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2733
2734 return LLVMBuildInsertValue(builder, ret,
2735 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2736 return_index, "");
2737 }
2738
2739 static LLVMValueRef
2740 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2741 unsigned param, unsigned return_index)
2742 {
2743 LLVMBuilderRef builder = ctx->gallivm.builder;
2744 LLVMValueRef ptr, lo, hi;
2745
2746 ptr = LLVMGetParam(ctx->main_fn, param);
2747 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2748 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2749 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2750 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2751 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2752 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2753 }
2754
2755 /* This only writes the tessellation factor levels. */
2756 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2757 {
2758 struct si_shader_context *ctx = si_shader_context(bld_base);
2759 LLVMBuilderRef builder = ctx->gallivm.builder;
2760 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2761
2762 si_copy_tcs_inputs(bld_base);
2763
2764 rel_patch_id = get_rel_patch_id(ctx);
2765 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2766 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2767
2768 if (ctx->screen->b.chip_class >= GFX9) {
2769 LLVMBasicBlockRef blocks[2] = {
2770 LLVMGetInsertBlock(builder),
2771 ctx->merged_wrap_if_state.entry_block
2772 };
2773 LLVMValueRef values[2];
2774
2775 lp_build_endif(&ctx->merged_wrap_if_state);
2776
2777 values[0] = rel_patch_id;
2778 values[1] = LLVMGetUndef(ctx->i32);
2779 rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2780
2781 values[0] = tf_lds_offset;
2782 values[1] = LLVMGetUndef(ctx->i32);
2783 tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2784
2785 values[0] = invocation_id;
2786 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
2787 invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2788 }
2789
2790 /* Return epilog parameters from this function. */
2791 LLVMValueRef ret = ctx->return_value;
2792 unsigned vgpr;
2793
2794 if (ctx->screen->b.chip_class >= GFX9) {
2795 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2796 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2797 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2798 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2799 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2800 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2801 /* Tess offchip and tess factor offsets are at the beginning. */
2802 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2803 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2804 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
2805 } else {
2806 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2807 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2808 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2809 GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2810 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2811 GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
2812 /* Tess offchip and tess factor offsets are after user SGPRs. */
2813 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
2814 GFX6_TCS_NUM_USER_SGPR);
2815 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
2816 GFX6_TCS_NUM_USER_SGPR + 1);
2817 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2818 }
2819
2820 /* VGPRs */
2821 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2822 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2823 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2824
2825 /* Leave a hole corresponding to the two input VGPRs. This ensures that
2826 * the invocation_id output does not alias the param_tcs_rel_ids input,
2827 * which saves a V_MOV on gfx9.
2828 */
2829 vgpr += 2;
2830
2831 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2832 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2833 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2834 ctx->return_value = ret;
2835 }
2836
2837 /* Pass TCS inputs from LS to TCS on GFX9. */
2838 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2839 {
2840 LLVMValueRef ret = ctx->return_value;
2841
2842 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2843 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2844 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2845 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2846 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2847
2848 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2849 8 + SI_SGPR_VS_STATE_BITS);
2850 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2851 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2852 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2853 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2854 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2855 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2856 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2857 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2858 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2859 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2860
2861 unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
2862 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2863 8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
2864 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2865 8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);
2866
2867 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2868 ret = si_insert_input_ret_float(ctx, ret,
2869 ctx->param_tcs_patch_id, vgpr++);
2870 ret = si_insert_input_ret_float(ctx, ret,
2871 ctx->param_tcs_rel_ids, vgpr++);
2872 ctx->return_value = ret;
2873 }
2874
2875 /* Pass GS inputs from ES to GS on GFX9. */
2876 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2877 {
2878 LLVMValueRef ret = ctx->return_value;
2879
2880 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2881 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2882 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2883
2884 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2885
2886 unsigned desc_param = ctx->param_vs_state_bits + 1;
2887 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2888 8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
2889 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2890 8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);
2891
2892 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2893 for (unsigned i = 0; i < 5; i++) {
2894 unsigned param = ctx->param_gs_vtx01_offset + i;
2895 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2896 }
2897 ctx->return_value = ret;
2898 }
2899
2900 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2901 {
2902 struct si_shader_context *ctx = si_shader_context(bld_base);
2903 struct si_shader *shader = ctx->shader;
2904 struct tgsi_shader_info *info = &shader->selector->info;
2905 struct gallivm_state *gallivm = &ctx->gallivm;
2906 unsigned i, chan;
2907 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2908 ctx->param_rel_auto_id);
2909 LLVMValueRef vertex_dw_stride =
2910 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2911 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2912 vertex_dw_stride, "");
2913
2914 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2915 * its inputs from it. */
2916 for (i = 0; i < info->num_outputs; i++) {
2917 LLVMValueRef *out_ptr = ctx->outputs[i];
2918 unsigned name = info->output_semantic_name[i];
2919 unsigned index = info->output_semantic_index[i];
2920
2921 /* The ARB_shader_viewport_layer_array spec contains the
2922 * following issue:
2923 *
2924 * 2) What happens if gl_ViewportIndex or gl_Layer is
2925 * written in the vertex shader and a geometry shader is
2926 * present?
2927 *
2928 * RESOLVED: The value written by the last vertex processing
2929 * stage is used. If the last vertex processing stage
2930 * (vertex, tessellation evaluation or geometry) does not
2931 * statically assign to gl_ViewportIndex or gl_Layer, index
2932 * or layer zero is assumed.
2933 *
2934 * So writes to those outputs in VS-as-LS are simply ignored.
2935 */
2936 if (name == TGSI_SEMANTIC_LAYER ||
2937 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2938 continue;
2939
2940 int param = si_shader_io_get_unique_index(name, index);
2941 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2942 LLVMConstInt(ctx->i32, param * 4, 0), "");
2943
2944 for (chan = 0; chan < 4; chan++) {
2945 lds_store(bld_base, chan, dw_addr,
2946 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2947 }
2948 }
2949
2950 if (ctx->screen->b.chip_class >= GFX9)
2951 si_set_ls_return_value_for_tcs(ctx);
2952 }
2953
2954 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2955 {
2956 struct si_shader_context *ctx = si_shader_context(bld_base);
2957 struct gallivm_state *gallivm = &ctx->gallivm;
2958 struct si_shader *es = ctx->shader;
2959 struct tgsi_shader_info *info = &es->selector->info;
2960 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2961 ctx->param_es2gs_offset);
2962 LLVMValueRef lds_base = NULL;
2963 unsigned chan;
2964 int i;
2965
2966 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2967 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2968 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
2969 LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
2970 vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
2971 LLVMBuildMul(gallivm->builder, wave_idx,
2972 LLVMConstInt(ctx->i32, 64, false), ""), "");
2973 lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
2974 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2975 }
2976
2977 for (i = 0; i < info->num_outputs; i++) {
2978 LLVMValueRef *out_ptr = ctx->outputs[i];
2979 int param;
2980
2981 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2982 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2983 continue;
2984
2985 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2986 info->output_semantic_index[i]);
2987
2988 for (chan = 0; chan < 4; chan++) {
2989 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2990 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2991
2992 /* GFX9 has the ESGS ring in LDS. */
2993 if (ctx->screen->b.chip_class >= GFX9) {
2994 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
2995 continue;
2996 }
2997
2998 ac_build_buffer_store_dword(&ctx->ac,
2999 ctx->esgs_ring,
3000 out_val, 1, NULL, soffset,
3001 (4 * param + chan) * 4,
3002 1, 1, true, true);
3003 }
3004 }
3005
3006 if (ctx->screen->b.chip_class >= GFX9)
3007 si_set_es_return_value_for_gs(ctx);
3008 }
3009
3010 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3011 {
3012 if (ctx->screen->b.chip_class >= GFX9)
3013 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3014 else
3015 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3016 }
3017
3018 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
3019 {
3020 struct si_shader_context *ctx = si_shader_context(bld_base);
3021
3022 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3023 si_get_gs_wave_id(ctx));
3024
3025 if (ctx->screen->b.chip_class >= GFX9)
3026 lp_build_endif(&ctx->merged_wrap_if_state);
3027 }
3028
3029 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
3030 {
3031 struct si_shader_context *ctx = si_shader_context(bld_base);
3032 struct gallivm_state *gallivm = &ctx->gallivm;
3033 struct tgsi_shader_info *info = &ctx->shader->selector->info;
3034 struct si_shader_output_values *outputs = NULL;
3035 int i,j;
3036
3037 assert(!ctx->shader->is_gs_copy_shader);
3038
3039 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3040
3041 /* Vertex color clamping.
3042 *
3043 * This uses a state constant loaded in a user data SGPR and
3044 * an IF statement is added that clamps all colors if the constant
3045 * is true.
3046 */
3047 if (ctx->type == PIPE_SHADER_VERTEX) {
3048 struct lp_build_if_state if_ctx;
3049 LLVMValueRef cond = NULL;
3050 LLVMValueRef addr, val;
3051
3052 for (i = 0; i < info->num_outputs; i++) {
3053 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3054 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3055 continue;
3056
3057 /* We've found a color. */
3058 if (!cond) {
3059 /* The state is in the first bit of the user SGPR. */
3060 cond = LLVMGetParam(ctx->main_fn,
3061 ctx->param_vs_state_bits);
3062 cond = LLVMBuildTrunc(gallivm->builder, cond,
3063 ctx->i1, "");
3064 lp_build_if(&if_ctx, gallivm, cond);
3065 }
3066
3067 for (j = 0; j < 4; j++) {
3068 addr = ctx->outputs[i][j];
3069 val = LLVMBuildLoad(gallivm->builder, addr, "");
3070 val = ac_build_clamp(&ctx->ac, val);
3071 LLVMBuildStore(gallivm->builder, val, addr);
3072 }
3073 }
3074
3075 if (cond)
3076 lp_build_endif(&if_ctx);
3077 }
3078
3079 for (i = 0; i < info->num_outputs; i++) {
3080 outputs[i].semantic_name = info->output_semantic_name[i];
3081 outputs[i].semantic_index = info->output_semantic_index[i];
3082
3083 for (j = 0; j < 4; j++) {
3084 outputs[i].values[j] =
3085 LLVMBuildLoad(gallivm->builder,
3086 ctx->outputs[i][j],
3087 "");
3088 outputs[i].vertex_stream[j] =
3089 (info->output_streams[i] >> (2 * j)) & 3;
3090 }
3091 }
3092
3093 if (ctx->shader->selector->so.num_outputs)
3094 si_llvm_emit_streamout(ctx, outputs, i, 0);
3095
3096 /* Export PrimitiveID. */
3097 if (ctx->shader->key.mono.u.vs_export_prim_id) {
3098 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3099 outputs[i].semantic_index = 0;
3100 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
3101 get_primitive_id(bld_base, 0));
3102 for (j = 1; j < 4; j++)
3103 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3104
3105 memset(outputs[i].vertex_stream, 0,
3106 sizeof(outputs[i].vertex_stream));
3107 i++;
3108 }
3109
3110 si_llvm_export_vs(bld_base, outputs, i);
3111 FREE(outputs);
3112 }
3113
/* List of pixel-shader export arguments.  The si_export_mrt_* helpers in
 * this file append entries with args[num++]; si_emit_ps_exports() then
 * emits args[0..num-1] in order.
 */
3114 struct si_ps_exports {
3115 unsigned num; /* number of valid entries in args[] */
3116 struct ac_export_args args[10]; /* queued exports; capacity is fixed at 10 */
3117 };
3118
3119 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3120 bool writes_samplemask)
3121 {
3122 if (writes_z) {
3123 /* Z needs 32 bits. */
3124 if (writes_samplemask)
3125 return V_028710_SPI_SHADER_32_ABGR;
3126 else if (writes_stencil)
3127 return V_028710_SPI_SHADER_32_GR;
3128 else
3129 return V_028710_SPI_SHADER_32_R;
3130 } else if (writes_stencil || writes_samplemask) {
3131 /* Both stencil and sample mask need only 16 bits. */
3132 return V_028710_SPI_SHADER_UINT16_ABGR;
3133 } else {
3134 return V_028710_SPI_SHADER_ZERO;
3135 }
3136 }
3137
3138 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3139 LLVMValueRef depth, LLVMValueRef stencil,
3140 LLVMValueRef samplemask, struct si_ps_exports *exp)
3141 {
3142 struct si_shader_context *ctx = si_shader_context(bld_base);
3143 struct lp_build_context *base = &bld_base->base;
3144 struct ac_export_args args;
3145 unsigned mask = 0;
3146 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3147 stencil != NULL,
3148 samplemask != NULL);
3149
3150 assert(depth || stencil || samplemask);
3151
3152 args.valid_mask = 1; /* whether the EXEC mask is valid */
3153 args.done = 1; /* DONE bit */
3154
3155 /* Specify the target we are exporting */
3156 args.target = V_008DFC_SQ_EXP_MRTZ;
3157
3158 args.compr = 0; /* COMP flag */
3159 args.out[0] = base->undef; /* R, depth */
3160 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3161 args.out[2] = base->undef; /* B, sample mask */
3162 args.out[3] = base->undef; /* A, alpha to mask */
3163
3164 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3165 assert(!depth);
3166 args.compr = 1; /* COMPR flag */
3167
3168 if (stencil) {
3169 /* Stencil should be in X[23:16]. */
3170 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3171 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3172 LLVMConstInt(ctx->i32, 16, 0), "");
3173 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3174 mask |= 0x3;
3175 }
3176 if (samplemask) {
3177 /* SampleMask should be in Y[15:0]. */
3178 args.out[1] = samplemask;
3179 mask |= 0xc;
3180 }
3181 } else {
3182 if (depth) {
3183 args.out[0] = depth;
3184 mask |= 0x1;
3185 }
3186 if (stencil) {
3187 args.out[1] = stencil;
3188 mask |= 0x2;
3189 }
3190 if (samplemask) {
3191 args.out[2] = samplemask;
3192 mask |= 0x4;
3193 }
3194 }
3195
3196 /* SI (except OLAND and HAINAN) has a bug that it only looks
3197 * at the X writemask component. */
3198 if (ctx->screen->b.chip_class == SI &&
3199 ctx->screen->b.family != CHIP_OLAND &&
3200 ctx->screen->b.family != CHIP_HAINAN)
3201 mask |= 0x1;
3202
3203 /* Specify which components to enable */
3204 args.enabled_channels = mask;
3205
3206 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3207 }
3208
3209 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3210 LLVMValueRef *color, unsigned index,
3211 unsigned samplemask_param,
3212 bool is_last, struct si_ps_exports *exp)
3213 {
3214 struct si_shader_context *ctx = si_shader_context(bld_base);
3215 struct lp_build_context *base = &bld_base->base;
3216 int i;
3217
3218 /* Clamp color */
3219 if (ctx->shader->key.part.ps.epilog.clamp_color)
3220 for (i = 0; i < 4; i++)
3221 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3222
3223 /* Alpha to one */
3224 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3225 color[3] = base->one;
3226
3227 /* Alpha test */
3228 if (index == 0 &&
3229 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3230 si_alpha_test(bld_base, color[3]);
3231
3232 /* Line & polygon smoothing */
3233 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3234 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3235 samplemask_param);
3236
3237 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3238 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3239 struct ac_export_args args[8];
3240 int c, last = -1;
3241
3242 /* Get the export arguments, also find out what the last one is. */
3243 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3244 si_llvm_init_export_args(bld_base, color,
3245 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3246 if (args[c].enabled_channels)
3247 last = c;
3248 }
3249
3250 /* Emit all exports. */
3251 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3252 if (is_last && last == c) {
3253 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3254 args[c].done = 1; /* DONE bit */
3255 } else if (!args[c].enabled_channels)
3256 continue; /* unnecessary NULL export */
3257
3258 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3259 }
3260 } else {
3261 struct ac_export_args args;
3262
3263 /* Export */
3264 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3265 &args);
3266 if (is_last) {
3267 args.valid_mask = 1; /* whether the EXEC mask is valid */
3268 args.done = 1; /* DONE bit */
3269 } else if (!args.enabled_channels)
3270 return; /* unnecessary NULL export */
3271
3272 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3273 }
3274 }
3275
3276 static void si_emit_ps_exports(struct si_shader_context *ctx,
3277 struct si_ps_exports *exp)
3278 {
3279 for (unsigned i = 0; i < exp->num; i++)
3280 ac_build_export(&ctx->ac, &exp->args[i]);
3281 }
3282
3283 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3284 {
3285 struct si_shader_context *ctx = si_shader_context(bld_base);
3286 struct lp_build_context *base = &bld_base->base;
3287 struct ac_export_args args;
3288
3289 args.enabled_channels = 0x0; /* enabled channels */
3290 args.valid_mask = 1; /* whether the EXEC mask is valid */
3291 args.done = 1; /* DONE bit */
3292 args.target = V_008DFC_SQ_EXP_NULL;
3293 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3294 args.out[0] = base->undef; /* R */
3295 args.out[1] = base->undef; /* G */
3296 args.out[2] = base->undef; /* B */
3297 args.out[3] = base->undef; /* A */
3298
3299 ac_build_export(&ctx->ac, &args);
3300 }
3301
3302 /**
3303 * Return PS outputs in this order:
3304 *
3305 * v[0:3] = color0.xyzw
3306 * v[4:7] = color1.xyzw
3307 * ...
3308 * vN+0 = Depth
3309 * vN+1 = Stencil
3310 * vN+2 = SampleMask
3311 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3312 *
3313 * The alpha-ref SGPR is returned via its original location.
3314 */
3315 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3316 {
3317 struct si_shader_context *ctx = si_shader_context(bld_base);
3318 struct si_shader *shader = ctx->shader;
3319 struct tgsi_shader_info *info = &shader->selector->info;
3320 LLVMBuilderRef builder = ctx->gallivm.builder;
3321 unsigned i, j, first_vgpr, vgpr;
3322
3323 LLVMValueRef color[8][4] = {};
3324 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3325 LLVMValueRef ret;
3326
3327 if (ctx->postponed_kill)
3328 ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3329
3330 /* Read the output values. */
3331 for (i = 0; i < info->num_outputs; i++) {
3332 unsigned semantic_name = info->output_semantic_name[i];
3333 unsigned semantic_index = info->output_semantic_index[i];
3334
3335 switch (semantic_name) {
3336 case TGSI_SEMANTIC_COLOR:
3337 assert(semantic_index < 8);
3338 for (j = 0; j < 4; j++) {
3339 LLVMValueRef ptr = ctx->outputs[i][j];
3340 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3341 color[semantic_index][j] = result;
3342 }
3343 break;
3344 case TGSI_SEMANTIC_POSITION:
3345 depth = LLVMBuildLoad(builder,
3346 ctx->outputs[i][2], "");
3347 break;
3348 case TGSI_SEMANTIC_STENCIL:
3349 stencil = LLVMBuildLoad(builder,
3350 ctx->outputs[i][1], "");
3351 break;
3352 case TGSI_SEMANTIC_SAMPLEMASK:
3353 samplemask = LLVMBuildLoad(builder,
3354 ctx->outputs[i][0], "");
3355 break;
3356 default:
3357 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3358 semantic_name);
3359 }
3360 }
3361
3362 /* Fill the return structure. */
3363 ret = ctx->return_value;
3364
3365 /* Set SGPRs. */
3366 ret = LLVMBuildInsertValue(builder, ret,
3367 bitcast(bld_base, TGSI_TYPE_SIGNED,
3368 LLVMGetParam(ctx->main_fn,
3369 SI_PARAM_ALPHA_REF)),
3370 SI_SGPR_ALPHA_REF, "");
3371
3372 /* Set VGPRs */
3373 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3374 for (i = 0; i < ARRAY_SIZE(color); i++) {
3375 if (!color[i][0])
3376 continue;
3377
3378 for (j = 0; j < 4; j++)
3379 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3380 }
3381 if (depth)
3382 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3383 if (stencil)
3384 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3385 if (samplemask)
3386 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3387
3388 /* Add the input sample mask for smoothing at the end. */
3389 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3390 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3391 ret = LLVMBuildInsertValue(builder, ret,
3392 LLVMGetParam(ctx->main_fn,
3393 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3394
3395 ctx->return_value = ret;
3396 }
3397
3398 /* Prevent optimizations (at least of memory accesses) across the current
3399 * point in the program by emitting empty inline assembly that is marked as
3400 * having side effects.
3401 *
3402 * Optionally, a value can be passed through the inline assembly to prevent
3403 * LLVM from hoisting calls to ReadNone functions.
3404 */
3405 static void emit_optimization_barrier(struct si_shader_context *ctx,
3406 LLVMValueRef *pvgpr)
3407 {
3408 static int counter = 0;
3409
3410 LLVMBuilderRef builder = ctx->gallivm.builder;
3411 char code[16];
3412
3413 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3414
3415 if (!pvgpr) {
3416 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3417 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3418 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3419 } else {
3420 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3421 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3422 LLVMValueRef vgpr = *pvgpr;
3423 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3424 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3425 LLVMValueRef vgpr0;
3426
3427 assert(vgpr_size % 4 == 0);
3428
3429 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3430 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3431 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3432 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3433 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3434
3435 *pvgpr = vgpr;
3436 }
3437 }
3438
3439 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3440 {
3441 struct gallivm_state *gallivm = &ctx->gallivm;
3442 LLVMBuilderRef builder = gallivm->builder;
3443 LLVMValueRef args[1] = {
3444 LLVMConstInt(ctx->i32, simm16, 0)
3445 };
3446 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3447 ctx->voidt, args, 1, 0);
3448 }
3449
3450 static void membar_emit(
3451 const struct lp_build_tgsi_action *action,
3452 struct lp_build_tgsi_context *bld_base,
3453 struct lp_build_emit_data *emit_data)
3454 {
3455 struct si_shader_context *ctx = si_shader_context(bld_base);
3456 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3457 unsigned flags = LLVMConstIntGetZExtValue(src0);
3458 unsigned waitcnt = NOOP_WAITCNT;
3459
3460 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3461 waitcnt &= VM_CNT & LGKM_CNT;
3462
3463 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3464 TGSI_MEMBAR_SHADER_BUFFER |
3465 TGSI_MEMBAR_SHADER_IMAGE))
3466 waitcnt &= VM_CNT;
3467
3468 if (flags & TGSI_MEMBAR_SHARED)
3469 waitcnt &= LGKM_CNT;
3470
3471 if (waitcnt != NOOP_WAITCNT)
3472 si_emit_waitcnt(ctx, waitcnt);
3473 }
3474
3475 static void clock_emit(
3476 const struct lp_build_tgsi_action *action,
3477 struct lp_build_tgsi_context *bld_base,
3478 struct lp_build_emit_data *emit_data)
3479 {
3480 struct si_shader_context *ctx = si_shader_context(bld_base);
3481 struct gallivm_state *gallivm = &ctx->gallivm;
3482 LLVMValueRef tmp;
3483
3484 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3485 ctx->i64, NULL, 0, 0);
3486 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3487
3488 emit_data->output[0] =
3489 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3490 emit_data->output[1] =
3491 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3492 }
3493
3494 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3495 {
3496 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3497 CONST_ADDR_SPACE);
3498 }
3499
3500 static void si_llvm_emit_ddxy(
3501 const struct lp_build_tgsi_action *action,
3502 struct lp_build_tgsi_context *bld_base,
3503 struct lp_build_emit_data *emit_data)
3504 {
3505 struct si_shader_context *ctx = si_shader_context(bld_base);
3506 struct gallivm_state *gallivm = &ctx->gallivm;
3507 unsigned opcode = emit_data->info->opcode;
3508 LLVMValueRef val;
3509 int idx;
3510 unsigned mask;
3511
3512 if (opcode == TGSI_OPCODE_DDX_FINE)
3513 mask = AC_TID_MASK_LEFT;
3514 else if (opcode == TGSI_OPCODE_DDY_FINE)
3515 mask = AC_TID_MASK_TOP;
3516 else
3517 mask = AC_TID_MASK_TOP_LEFT;
3518
3519 /* for DDX we want to next X pixel, DDY next Y pixel. */
3520 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3521
3522 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
3523 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
3524 mask, idx, ctx->lds, val);
3525 emit_data->output[emit_data->chan] = val;
3526 }
3527
3528 /*
3529 * this takes an I,J coordinate pair,
3530 * and works out the X and Y derivatives.
3531 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3532 */
3533 static LLVMValueRef si_llvm_emit_ddxy_interp(
3534 struct lp_build_tgsi_context *bld_base,
3535 LLVMValueRef interp_ij)
3536 {
3537 struct si_shader_context *ctx = si_shader_context(bld_base);
3538 struct gallivm_state *gallivm = &ctx->gallivm;
3539 LLVMValueRef result[4], a;
3540 unsigned i;
3541
3542 for (i = 0; i < 2; i++) {
3543 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3544 LLVMConstInt(ctx->i32, i, 0), "");
3545 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3546 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3547 }
3548
3549 return lp_build_gather_values(gallivm, result, 4);
3550 }
3551
3552 static void interp_fetch_args(
3553 struct lp_build_tgsi_context *bld_base,
3554 struct lp_build_emit_data *emit_data)
3555 {
3556 struct si_shader_context *ctx = si_shader_context(bld_base);
3557 struct gallivm_state *gallivm = &ctx->gallivm;
3558 const struct tgsi_full_instruction *inst = emit_data->inst;
3559
3560 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3561 /* offset is in second src, first two channels */
3562 emit_data->args[0] = lp_build_emit_fetch(bld_base,
3563 emit_data->inst, 1,
3564 TGSI_CHAN_X);
3565 emit_data->args[1] = lp_build_emit_fetch(bld_base,
3566 emit_data->inst, 1,
3567 TGSI_CHAN_Y);
3568 emit_data->arg_count = 2;
3569 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3570 LLVMValueRef sample_position;
3571 LLVMValueRef sample_id;
3572 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3573
3574 /* fetch sample ID, then fetch its sample position,
3575 * and place into first two channels.
3576 */
3577 sample_id = lp_build_emit_fetch(bld_base,
3578 emit_data->inst, 1, TGSI_CHAN_X);
3579 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
3580 ctx->i32, "");
3581 sample_position = load_sample_position(ctx, sample_id);
3582
3583 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
3584 sample_position,
3585 ctx->i32_0, "");
3586
3587 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
3588 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
3589 sample_position,
3590 ctx->i32_1, "");
3591 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
3592 emit_data->arg_count = 2;
3593 }
3594 }
3595
3596 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3597 struct lp_build_tgsi_context *bld_base,
3598 struct lp_build_emit_data *emit_data)
3599 {
3600 struct si_shader_context *ctx = si_shader_context(bld_base);
3601 struct si_shader *shader = ctx->shader;
3602 struct gallivm_state *gallivm = &ctx->gallivm;
3603 const struct tgsi_shader_info *info = &shader->selector->info;
3604 LLVMValueRef interp_param;
3605 const struct tgsi_full_instruction *inst = emit_data->inst;
3606 const struct tgsi_full_src_register *input = &inst->Src[0];
3607 int input_base, input_array_size;
3608 int chan;
3609 int i;
3610 LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
3611 LLVMValueRef array_idx;
3612 int interp_param_idx;
3613 unsigned interp;
3614 unsigned location;
3615
3616 assert(input->Register.File == TGSI_FILE_INPUT);
3617
3618 if (input->Register.Indirect) {
3619 unsigned array_id = input->Indirect.ArrayID;
3620
3621 if (array_id) {
3622 input_base = info->input_array_first[array_id];
3623 input_array_size = info->input_array_last[array_id] - input_base + 1;
3624 } else {
3625 input_base = inst->Src[0].Register.Index;
3626 input_array_size = info->num_inputs - input_base;
3627 }
3628
3629 array_idx = get_indirect_index(ctx, &input->Indirect,
3630 input->Register.Index - input_base);
3631 } else {
3632 input_base = inst->Src[0].Register.Index;
3633 input_array_size = 1;
3634 array_idx = ctx->i32_0;
3635 }
3636
3637 interp = shader->selector->info.input_interpolate[input_base];
3638
3639 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3640 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
3641 location = TGSI_INTERPOLATE_LOC_CENTER;
3642 else
3643 location = TGSI_INTERPOLATE_LOC_CENTROID;
3644
3645 interp_param_idx = lookup_interp_param_index(interp, location);
3646 if (interp_param_idx == -1)
3647 return;
3648 else if (interp_param_idx)
3649 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
3650 else
3651 interp_param = NULL;
3652
3653 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3654 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3655 LLVMValueRef ij_out[2];
3656 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
3657
3658 /*
3659 * take the I then J parameters, and the DDX/Y for it, and
3660 * calculate the IJ inputs for the interpolator.
3661 * temp1 = ddx * offset/sample.x + I;
3662 * interp_param.I = ddy * offset/sample.y + temp1;
3663 * temp1 = ddx * offset/sample.x + J;
3664 * interp_param.J = ddy * offset/sample.y + temp1;
3665 */
3666 for (i = 0; i < 2; i++) {
3667 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
3668 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
3669 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
3670 ddxy_out, ix_ll, "");
3671 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
3672 ddxy_out, iy_ll, "");
3673 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
3674 interp_param, ix_ll, "");
3675 LLVMValueRef temp1, temp2;
3676
3677 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
3678 ctx->f32, "");
3679
3680 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
3681
3682 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
3683
3684 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
3685
3686 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
3687 }
3688 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
3689 }
3690
3691 if (interp_param) {
3692 interp_param = LLVMBuildBitCast(gallivm->builder,
3693 interp_param, LLVMVectorType(ctx->f32, 2), "");
3694 }
3695
3696 for (chan = 0; chan < 4; chan++) {
3697 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
3698 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
3699
3700 for (unsigned idx = 0; idx < input_array_size; ++idx) {
3701 LLVMValueRef v, i = NULL, j = NULL;
3702
3703 if (interp_param) {
3704 interp_param = LLVMBuildBitCast(gallivm->builder,
3705 interp_param, LLVMVectorType(ctx->f32, 2), "");
3706 i = LLVMBuildExtractElement(
3707 gallivm->builder, interp_param, ctx->i32_0, "");
3708 j = LLVMBuildExtractElement(
3709 gallivm->builder, interp_param, ctx->i32_1, "");
3710 }
3711 v = si_build_fs_interp(ctx, input_base + idx, schan,
3712 prim_mask, i, j);
3713
3714 gather = LLVMBuildInsertElement(gallivm->builder,
3715 gather, v, LLVMConstInt(ctx->i32, idx, false), "");
3716 }
3717
3718 emit_data->output[chan] = LLVMBuildExtractElement(
3719 gallivm->builder, gather, array_idx, "");
3720 }
3721 }
3722
3723 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3724 LLVMValueRef value)
3725 {
3726 struct gallivm_state *gallivm = &ctx->gallivm;
3727 LLVMValueRef args[3] = {
3728 value,
3729 ctx->i32_0,
3730 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3731 };
3732
3733 /* We currently have no other way to prevent LLVM from lifting the icmp
3734 * calls to a dominating basic block.
3735 */
3736 emit_optimization_barrier(ctx, &args[0]);
3737
3738 if (LLVMTypeOf(args[0]) != ctx->i32)
3739 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3740
3741 return lp_build_intrinsic(gallivm->builder,
3742 "llvm.amdgcn.icmp.i32",
3743 ctx->i64, args, 3,
3744 LP_FUNC_ATTR_NOUNWIND |
3745 LP_FUNC_ATTR_READNONE |
3746 LP_FUNC_ATTR_CONVERGENT);
3747 }
3748
3749 static void vote_all_emit(
3750 const struct lp_build_tgsi_action *action,
3751 struct lp_build_tgsi_context *bld_base,
3752 struct lp_build_emit_data *emit_data)
3753 {
3754 struct si_shader_context *ctx = si_shader_context(bld_base);
3755 struct gallivm_state *gallivm = &ctx->gallivm;
3756 LLVMValueRef active_set, vote_set;
3757 LLVMValueRef tmp;
3758
3759 active_set = si_emit_ballot(ctx, ctx->i32_1);
3760 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3761
3762 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3763 emit_data->output[emit_data->chan] =
3764 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3765 }
3766
3767 static void vote_any_emit(
3768 const struct lp_build_tgsi_action *action,
3769 struct lp_build_tgsi_context *bld_base,
3770 struct lp_build_emit_data *emit_data)
3771 {
3772 struct si_shader_context *ctx = si_shader_context(bld_base);
3773 struct gallivm_state *gallivm = &ctx->gallivm;
3774 LLVMValueRef vote_set;
3775 LLVMValueRef tmp;
3776
3777 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3778
3779 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3780 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3781 emit_data->output[emit_data->chan] =
3782 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3783 }
3784
3785 static void vote_eq_emit(
3786 const struct lp_build_tgsi_action *action,
3787 struct lp_build_tgsi_context *bld_base,
3788 struct lp_build_emit_data *emit_data)
3789 {
3790 struct si_shader_context *ctx = si_shader_context(bld_base);
3791 struct gallivm_state *gallivm = &ctx->gallivm;
3792 LLVMValueRef active_set, vote_set;
3793 LLVMValueRef all, none, tmp;
3794
3795 active_set = si_emit_ballot(ctx, ctx->i32_1);
3796 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3797
3798 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3799 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3800 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3801 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3802 emit_data->output[emit_data->chan] =
3803 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3804 }
3805
3806 static void ballot_emit(
3807 const struct lp_build_tgsi_action *action,
3808 struct lp_build_tgsi_context *bld_base,
3809 struct lp_build_emit_data *emit_data)
3810 {
3811 struct si_shader_context *ctx = si_shader_context(bld_base);
3812 LLVMBuilderRef builder = ctx->gallivm.builder;
3813 LLVMValueRef tmp;
3814
3815 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3816 tmp = si_emit_ballot(ctx, tmp);
3817 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3818
3819 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3820 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3821 }
3822
3823 static void read_invoc_fetch_args(
3824 struct lp_build_tgsi_context *bld_base,
3825 struct lp_build_emit_data *emit_data)
3826 {
3827 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
3828 0, emit_data->src_chan);
3829
3830 /* Always read the source invocation (= lane) from the X channel. */
3831 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
3832 1, TGSI_CHAN_X);
3833 emit_data->arg_count = 2;
3834 }
3835
3836 static void read_lane_emit(
3837 const struct lp_build_tgsi_action *action,
3838 struct lp_build_tgsi_context *bld_base,
3839 struct lp_build_emit_data *emit_data)
3840 {
3841 struct si_shader_context *ctx = si_shader_context(bld_base);
3842 LLVMBuilderRef builder = ctx->gallivm.builder;
3843
3844 /* We currently have no other way to prevent LLVM from lifting the icmp
3845 * calls to a dominating basic block.
3846 */
3847 emit_optimization_barrier(ctx, &emit_data->args[0]);
3848
3849 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3850 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3851 ctx->i32, "");
3852 }
3853
3854 emit_data->output[emit_data->chan] =
3855 ac_build_intrinsic(&ctx->ac, action->intr_name,
3856 ctx->i32, emit_data->args, emit_data->arg_count,
3857 AC_FUNC_ATTR_READNONE |
3858 AC_FUNC_ATTR_CONVERGENT);
3859 }
3860
3861 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3862 struct lp_build_emit_data *emit_data)
3863 {
3864 struct si_shader_context *ctx = si_shader_context(bld_base);
3865 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3866 LLVMValueRef imm;
3867 unsigned stream;
3868
3869 assert(src0.File == TGSI_FILE_IMMEDIATE);
3870
3871 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
3872 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
3873 return stream;
3874 }
3875
3876 /* Emit one vertex from the geometry shader */
3877 static void si_llvm_emit_vertex(
3878 const struct lp_build_tgsi_action *action,
3879 struct lp_build_tgsi_context *bld_base,
3880 struct lp_build_emit_data *emit_data)
3881 {
3882 struct si_shader_context *ctx = si_shader_context(bld_base);
3883 struct lp_build_context *uint = &bld_base->uint_bld;
3884 struct si_shader *shader = ctx->shader;
3885 struct tgsi_shader_info *info = &shader->selector->info;
3886 struct gallivm_state *gallivm = &ctx->gallivm;
3887 struct lp_build_if_state if_state;
3888 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3889 ctx->param_gs2vs_offset);
3890 LLVMValueRef gs_next_vertex;
3891 LLVMValueRef can_emit, kill;
3892 unsigned chan, offset;
3893 int i;
3894 unsigned stream;
3895
3896 stream = si_llvm_get_stream(bld_base, emit_data);
3897
3898 /* Write vertex attribute values to GSVS ring */
3899 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
3900 ctx->gs_next_vertex[stream],
3901 "");
3902
3903 /* If this thread has already emitted the declared maximum number of
3904 * vertices, skip the write: excessive vertex emissions are not
3905 * supposed to have any effect.
3906 *
3907 * If the shader has no writes to memory, kill it instead. This skips
3908 * further memory loads and may allow LLVM to skip to the end
3909 * altogether.
3910 */
3911 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
3912 LLVMConstInt(ctx->i32,
3913 shader->selector->gs_max_out_vertices, 0), "");
3914
3915 bool use_kill = !info->writes_memory;
3916 if (use_kill) {
3917 kill = lp_build_select(&bld_base->base, can_emit,
3918 LLVMConstReal(ctx->f32, 1.0f),
3919 LLVMConstReal(ctx->f32, -1.0f));
3920
3921 ac_build_kill(&ctx->ac, kill);
3922 } else {
3923 lp_build_if(&if_state, gallivm, can_emit);
3924 }
3925
3926 offset = 0;
3927 for (i = 0; i < info->num_outputs; i++) {
3928 LLVMValueRef *out_ptr = ctx->outputs[i];
3929
3930 for (chan = 0; chan < 4; chan++) {
3931 if (!(info->output_usagemask[i] & (1 << chan)) ||
3932 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
3933 continue;
3934
3935 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
3936 LLVMValueRef voffset =
3937 LLVMConstInt(ctx->i32, offset *
3938 shader->selector->gs_max_out_vertices, 0);
3939 offset++;
3940
3941 voffset = lp_build_add(uint, voffset, gs_next_vertex);
3942 voffset = lp_build_mul_imm(uint, voffset, 4);
3943
3944 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
3945
3946 ac_build_buffer_store_dword(&ctx->ac,
3947 ctx->gsvs_ring[stream],
3948 out_val, 1,
3949 voffset, soffset, 0,
3950 1, 1, true, true);
3951 }
3952 }
3953
3954 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
3955 ctx->i32_1);
3956
3957 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
3958
3959 /* Signal vertex emission */
3960 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
3961 si_get_gs_wave_id(ctx));
3962 if (!use_kill)
3963 lp_build_endif(&if_state);
3964 }
3965
3966 /* Cut one primitive from the geometry shader */
3967 static void si_llvm_emit_primitive(
3968 const struct lp_build_tgsi_action *action,
3969 struct lp_build_tgsi_context *bld_base,
3970 struct lp_build_emit_data *emit_data)
3971 {
3972 struct si_shader_context *ctx = si_shader_context(bld_base);
3973 unsigned stream;
3974
3975 /* Signal primitive cut */
3976 stream = si_llvm_get_stream(bld_base, emit_data);
3977 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
3978 si_get_gs_wave_id(ctx));
3979 }
3980
3981 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
3982 struct lp_build_tgsi_context *bld_base,
3983 struct lp_build_emit_data *emit_data)
3984 {
3985 struct si_shader_context *ctx = si_shader_context(bld_base);
3986 struct gallivm_state *gallivm = &ctx->gallivm;
3987
3988 /* SI only (thanks to a hw bug workaround):
3989 * The real barrier instruction isn’t needed, because an entire patch
3990 * always fits into a single wave.
3991 */
3992 if (ctx->screen->b.chip_class == SI &&
3993 ctx->type == PIPE_SHADER_TESS_CTRL) {
3994 si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
3995 return;
3996 }
3997
3998 lp_build_intrinsic(gallivm->builder,
3999 "llvm.amdgcn.s.barrier",
4000 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
4001 }
4002
4003 static const struct lp_build_tgsi_action interp_action = {
4004 .fetch_args = interp_fetch_args,
4005 .emit = build_interp_intrinsic,
4006 };
4007
4008 static void si_create_function(struct si_shader_context *ctx,
4009 const char *name,
4010 LLVMTypeRef *returns, unsigned num_returns,
4011 struct si_function_info *fninfo,
4012 unsigned max_workgroup_size)
4013 {
4014 int i;
4015
4016 si_llvm_create_func(ctx, name, returns, num_returns,
4017 fninfo->types, fninfo->num_params);
4018 ctx->return_value = LLVMGetUndef(ctx->return_type);
4019
4020 for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4021 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4022
4023 /* The combination of:
4024 * - ByVal
4025 * - dereferenceable
4026 * - invariant.load
4027 * allows the optimization passes to move loads and reduces
4028 * SGPR spilling significantly.
4029 */
4030 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4031 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
4032 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
4033 ac_add_attr_dereferenceable(P, UINT64_MAX);
4034 } else
4035 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
4036 }
4037
4038 if (max_workgroup_size) {
4039 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
4040 max_workgroup_size);
4041 }
4042 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4043 "no-signed-zeros-fp-math",
4044 "true");
4045
4046 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
4047 /* These were copied from some LLVM test. */
4048 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4049 "less-precise-fpmad",
4050 "true");
4051 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4052 "no-infs-fp-math",
4053 "true");
4054 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4055 "no-nans-fp-math",
4056 "true");
4057 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4058 "unsafe-fp-math",
4059 "true");
4060 }
4061 }
4062
4063 static void declare_streamout_params(struct si_shader_context *ctx,
4064 struct pipe_stream_output_info *so,
4065 struct si_function_info *fninfo)
4066 {
4067 int i;
4068
4069 /* Streamout SGPRs. */
4070 if (so->num_outputs) {
4071 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4072 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4073 else
4074 ctx->param_streamout_config = fninfo->num_params - 1;
4075
4076 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4077 }
4078 /* A streamout buffer offset is loaded if the stride is non-zero. */
4079 for (i = 0; i < 4; i++) {
4080 if (!so->stride[i])
4081 continue;
4082
4083 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4084 }
4085 }
4086
4087 static unsigned llvm_get_type_size(LLVMTypeRef type)
4088 {
4089 LLVMTypeKind kind = LLVMGetTypeKind(type);
4090
4091 switch (kind) {
4092 case LLVMIntegerTypeKind:
4093 return LLVMGetIntTypeWidth(type) / 8;
4094 case LLVMFloatTypeKind:
4095 return 4;
4096 case LLVMPointerTypeKind:
4097 return 8;
4098 case LLVMVectorTypeKind:
4099 return LLVMGetVectorSize(type) *
4100 llvm_get_type_size(LLVMGetElementType(type));
4101 case LLVMArrayTypeKind:
4102 return LLVMGetArrayLength(type) *
4103 llvm_get_type_size(LLVMGetElementType(type));
4104 default:
4105 assert(0);
4106 return 0;
4107 }
4108 }
4109
4110 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4111 {
4112 struct gallivm_state *gallivm = &ctx->gallivm;
4113
4114 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4115 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4116 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4117 "lds");
4118 }
4119
4120 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4121 {
4122 switch (shader->selector->type) {
4123 case PIPE_SHADER_TESS_CTRL:
4124 /* Return this so that LLVM doesn't remove s_barrier
4125 * instructions on chips where we use s_barrier. */
4126 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4127
4128 case PIPE_SHADER_GEOMETRY:
4129 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4130
4131 case PIPE_SHADER_COMPUTE:
4132 break; /* see below */
4133
4134 default:
4135 return 0;
4136 }
4137
4138 const unsigned *properties = shader->selector->info.properties;
4139 unsigned max_work_group_size =
4140 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4141 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4142 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4143
4144 if (!max_work_group_size) {
4145 /* This is a variable group size compute shader,
4146 * compile it for the maximum possible group size.
4147 */
4148 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4149 }
4150 return max_work_group_size;
4151 }
4152
4153 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4154 struct si_function_info *fninfo,
4155 bool assign_params)
4156 {
4157 unsigned const_and_shader_buffers =
4158 add_arg(fninfo, ARG_SGPR,
4159 si_const_array(ctx->v4i32,
4160 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
4161 unsigned samplers_and_images =
4162 add_arg(fninfo, ARG_SGPR,
4163 si_const_array(ctx->v8i32,
4164 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4165
4166 if (assign_params) {
4167 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4168 ctx->param_samplers_and_images = samplers_and_images;
4169 }
4170 }
4171
4172 static void declare_default_desc_pointers(struct si_shader_context *ctx,
4173 struct si_function_info *fninfo)
4174 {
4175 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4176 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4177 declare_per_stage_desc_pointers(ctx, fninfo, true);
4178 }
4179
4180 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4181 struct si_function_info *fninfo)
4182 {
4183 ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
4184 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
4185 ctx->param_base_vertex = add_arg(fninfo, ARG_SGPR, ctx->i32);
4186 ctx->param_start_instance = add_arg(fninfo, ARG_SGPR, ctx->i32);
4187 ctx->param_draw_id = add_arg(fninfo, ARG_SGPR, ctx->i32);
4188 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4189 }
4190
4191 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4192 struct si_function_info *fninfo,
4193 unsigned *num_prolog_vgprs)
4194 {
4195 struct si_shader *shader = ctx->shader;
4196
4197 ctx->param_vertex_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4198 if (shader->key.as_ls) {
4199 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4200 ctx->param_instance_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4201 } else {
4202 ctx->param_instance_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4203 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4204 }
4205 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4206
4207 if (!shader->is_gs_copy_shader) {
4208 /* Vertex load indices. */
4209 ctx->param_vertex_index0 = fninfo->num_params;
4210 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4211 add_arg(fninfo, ARG_VGPR, ctx->i32);
4212 *num_prolog_vgprs += shader->selector->info.num_inputs;
4213 }
4214 }
4215
4216 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4217 struct si_function_info *fninfo)
4218 {
4219 ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4220 ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4221 ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4222 ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4223 }
4224
4225 enum {
4226 /* Convenient merged shader definitions. */
4227 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4228 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4229 };
4230
4231 static void create_function(struct si_shader_context *ctx)
4232 {
4233 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
4234 struct gallivm_state *gallivm = &ctx->gallivm;
4235 struct si_shader *shader = ctx->shader;
4236 struct si_function_info fninfo;
4237 LLVMTypeRef returns[16+32*4];
4238 unsigned i, num_return_sgprs;
4239 unsigned num_returns = 0;
4240 unsigned num_prolog_vgprs = 0;
4241 unsigned type = ctx->type;
4242
4243 si_init_function_info(&fninfo);
4244
4245 /* Set MERGED shaders. */
4246 if (ctx->screen->b.chip_class >= GFX9) {
4247 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4248 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4249 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4250 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4251 }
4252
4253 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4254
4255 switch (type) {
4256 case PIPE_SHADER_VERTEX:
4257 declare_default_desc_pointers(ctx, &fninfo);
4258 declare_vs_specific_input_sgprs(ctx, &fninfo);
4259
4260 if (shader->key.as_es) {
4261 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4262 } else if (shader->key.as_ls) {
4263 /* no extra parameters */
4264 } else {
4265 if (shader->is_gs_copy_shader) {
4266 fninfo.num_params = ctx->param_rw_buffers + 1;
4267 fninfo.num_sgpr_params = fninfo.num_params;
4268 }
4269
4270 /* The locations of the other parameters are assigned dynamically. */
4271 declare_streamout_params(ctx, &shader->selector->so,
4272 &fninfo);
4273 }
4274
4275 /* VGPRs */
4276 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4277 break;
4278
4279 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4280 declare_default_desc_pointers(ctx, &fninfo);
4281 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4282 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4283 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4284 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4285 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4286 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4287 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4288 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4289
4290 /* VGPRs */
4291 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4292 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4293
4294 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4295 * placed after the user SGPRs.
4296 */
4297 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4298 returns[num_returns++] = ctx->i32; /* SGPRs */
4299 for (i = 0; i < 5; i++)
4300 returns[num_returns++] = ctx->f32; /* VGPRs */
4301 break;
4302
4303 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4304 /* Merged stages have 8 system SGPRs at the beginning. */
4305 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4306 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4307 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4308 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4309 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4310 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4311 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4312 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4313
4314 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4315 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4316 declare_per_stage_desc_pointers(ctx, &fninfo,
4317 ctx->type == PIPE_SHADER_VERTEX);
4318 declare_vs_specific_input_sgprs(ctx, &fninfo);
4319
4320 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4321 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4322 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4323 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4324 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4325 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4326
4327 declare_per_stage_desc_pointers(ctx, &fninfo,
4328 ctx->type == PIPE_SHADER_TESS_CTRL);
4329
4330 /* VGPRs (first TCS, then VS) */
4331 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4332 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4333
4334 if (ctx->type == PIPE_SHADER_VERTEX) {
4335 declare_vs_input_vgprs(ctx, &fninfo,
4336 &num_prolog_vgprs);
4337
4338 /* LS return values are inputs to the TCS main shader part. */
4339 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4340 returns[num_returns++] = ctx->i32; /* SGPRs */
4341 for (i = 0; i < 2; i++)
4342 returns[num_returns++] = ctx->f32; /* VGPRs */
4343 } else {
4344 /* TCS return values are inputs to the TCS epilog.
4345 *
4346 * param_tcs_offchip_offset, param_tcs_factor_offset,
4347 * param_tcs_offchip_layout, and param_rw_buffers
4348 * should be passed to the epilog.
4349 */
4350 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4351 returns[num_returns++] = ctx->i32; /* SGPRs */
4352 for (i = 0; i < 5; i++)
4353 returns[num_returns++] = ctx->f32; /* VGPRs */
4354 }
4355 break;
4356
4357 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4358 /* Merged stages have 8 system SGPRs at the beginning. */
4359 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4360 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4361 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4362 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4363 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4364 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4365 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4366 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4367
4368 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4369 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4370 declare_per_stage_desc_pointers(ctx, &fninfo,
4371 (ctx->type == PIPE_SHADER_VERTEX ||
4372 ctx->type == PIPE_SHADER_TESS_EVAL));
4373 if (ctx->type == PIPE_SHADER_VERTEX) {
4374 declare_vs_specific_input_sgprs(ctx, &fninfo);
4375 } else {
4376 /* TESS_EVAL (and also GEOMETRY):
4377 * Declare as many input SGPRs as the VS has. */
4378 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4379 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4380 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4381 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4382 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4383 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4384 }
4385
4386 declare_per_stage_desc_pointers(ctx, &fninfo,
4387 ctx->type == PIPE_SHADER_GEOMETRY);
4388
4389 /* VGPRs (first GS, then VS/TES) */
4390 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4391 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4392 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4393 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4394 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4395
4396 if (ctx->type == PIPE_SHADER_VERTEX) {
4397 declare_vs_input_vgprs(ctx, &fninfo,
4398 &num_prolog_vgprs);
4399 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4400 declare_tes_input_vgprs(ctx, &fninfo);
4401 }
4402
4403 if (ctx->type == PIPE_SHADER_VERTEX ||
4404 ctx->type == PIPE_SHADER_TESS_EVAL) {
4405 /* ES return values are inputs to GS. */
4406 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4407 returns[num_returns++] = ctx->i32; /* SGPRs */
4408 for (i = 0; i < 5; i++)
4409 returns[num_returns++] = ctx->f32; /* VGPRs */
4410 }
4411 break;
4412
4413 case PIPE_SHADER_TESS_EVAL:
4414 declare_default_desc_pointers(ctx, &fninfo);
4415 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4416 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4417
4418 if (shader->key.as_es) {
4419 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4420 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4421 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4422 } else {
4423 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4424 declare_streamout_params(ctx, &shader->selector->so,
4425 &fninfo);
4426 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4427 }
4428
4429 /* VGPRs */
4430 declare_tes_input_vgprs(ctx, &fninfo);
4431 break;
4432
4433 case PIPE_SHADER_GEOMETRY:
4434 declare_default_desc_pointers(ctx, &fninfo);
4435 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4436 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4437
4438 /* VGPRs */
4439 ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4440 ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4441 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4442 ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4443 ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4444 ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4445 ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4446 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4447 break;
4448
4449 case PIPE_SHADER_FRAGMENT:
4450 declare_default_desc_pointers(ctx, &fninfo);
4451 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4452 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4453
4454 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4455 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4456 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4457 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4458 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4459 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4460 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4461 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4462 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_X_FLOAT);
4463 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_Y_FLOAT);
4464 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_Z_FLOAT);
4465 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_W_FLOAT);
4466 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_FRONT_FACE);
4467 shader->info.face_vgpr_index = 20;
4468 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_ANCILLARY);
4469 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_SAMPLE_COVERAGE);
4470 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4471
4472 /* Color inputs from the prolog. */
4473 if (shader->selector->info.colors_read) {
4474 unsigned num_color_elements =
4475 util_bitcount(shader->selector->info.colors_read);
4476
4477 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4478 for (i = 0; i < num_color_elements; i++)
4479 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4480
4481 num_prolog_vgprs += num_color_elements;
4482 }
4483
4484 /* Outputs for the epilog. */
4485 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4486 num_returns =
4487 num_return_sgprs +
4488 util_bitcount(shader->selector->info.colors_written) * 4 +
4489 shader->selector->info.writes_z +
4490 shader->selector->info.writes_stencil +
4491 shader->selector->info.writes_samplemask +
4492 1 /* SampleMaskIn */;
4493
4494 num_returns = MAX2(num_returns,
4495 num_return_sgprs +
4496 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4497
4498 for (i = 0; i < num_return_sgprs; i++)
4499 returns[i] = ctx->i32;
4500 for (; i < num_returns; i++)
4501 returns[i] = ctx->f32;
4502 break;
4503
4504 case PIPE_SHADER_COMPUTE:
4505 declare_default_desc_pointers(ctx, &fninfo);
4506 if (shader->selector->info.uses_grid_size)
4507 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4508 if (shader->selector->info.uses_block_size)
4509 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4510
4511 for (i = 0; i < 3; i++) {
4512 ctx->param_block_id[i] = -1;
4513 if (shader->selector->info.uses_block_id[i])
4514 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4515 }
4516
4517 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4518 break;
4519 default:
4520 assert(0 && "unimplemented shader");
4521 return;
4522 }
4523
4524 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4525 si_get_max_workgroup_size(shader));
4526
4527 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4528 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4529 ctx->separate_prolog) {
4530 si_llvm_add_attribute(ctx->main_fn,
4531 "InitialPSInputAddr",
4532 S_0286D0_PERSP_SAMPLE_ENA(1) |
4533 S_0286D0_PERSP_CENTER_ENA(1) |
4534 S_0286D0_PERSP_CENTROID_ENA(1) |
4535 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4536 S_0286D0_LINEAR_CENTER_ENA(1) |
4537 S_0286D0_LINEAR_CENTROID_ENA(1) |
4538 S_0286D0_FRONT_FACE_ENA(1) |
4539 S_0286D0_POS_FIXED_PT_ENA(1));
4540 }
4541
4542 shader->info.num_input_sgprs = 0;
4543 shader->info.num_input_vgprs = 0;
4544
4545 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4546 shader->info.num_input_sgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4547
4548 for (; i < fninfo.num_params; ++i)
4549 shader->info.num_input_vgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4550
4551 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4552 shader->info.num_input_vgprs -= num_prolog_vgprs;
4553
4554 if (!ctx->screen->has_ds_bpermute &&
4555 bld_base->info &&
4556 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
4557 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
4558 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
4559 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
4560 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
4561 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
4562 ctx->lds =
4563 LLVMAddGlobalInAddressSpace(gallivm->module,
4564 LLVMArrayType(ctx->i32, 64),
4565 "ddxy_lds",
4566 LOCAL_ADDR_SPACE);
4567
4568 if (shader->key.as_ls ||
4569 ctx->type == PIPE_SHADER_TESS_CTRL ||
4570 /* GFX9 has the ESGS ring buffer in LDS. */
4571 (ctx->screen->b.chip_class >= GFX9 &&
4572 (shader->key.as_es ||
4573 ctx->type == PIPE_SHADER_GEOMETRY)))
4574 declare_lds_as_pointer(ctx);
4575 }
4576
/**
 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
 * for later use.
 *
 * All descriptors are read from the buffer list passed in the
 * ctx->param_rw_buffers function argument:
 * - ctx->esgs_ring is loaded only on chips up to VI, for shaders compiled
 *   as ES and for geometry shaders.
 * - For the GS copy shader, the GSVS descriptor is stored unmodified in
 *   ctx->gsvs_ring[0].
 * - For geometry shaders, a patched copy of the GSVS descriptor is stored
 *   in ctx->gsvs_ring[stream] for every stream that has output components;
 *   the entries of the other streams are not written here.
 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
					    ctx->param_rw_buffers);

	if (ctx->screen->b.chip_class <= VI &&
	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
		/* The slot of the ESGS ring depends on which side of the
		 * ring (GS or ES) this shader is on.
		 */
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							   : SI_ES_RING_ESGS;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);

		ctx->esgs_ring =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	}

	if (ctx->shader->is_gs_copy_shader) {
		/* The copy shader uses the GSVS descriptor as loaded. */
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);

		ctx->gsvs_ring[0] =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
		const struct si_shader_selector *sel = ctx->shader->selector;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
		LLVMValueRef base_ring;

		base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

		/* The conceptual layout of the GSVS ring is
		 *   v0c0 .. vLc0 v0c1 .. vLc1 ..
		 * but the real memory layout is swizzled across
		 * threads:
		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
		 *   t16v0c0 ..
		 * Override the buffer descriptor accordingly.
		 */
		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
		/* Running offset of the current stream from the base address
		 * of the ring; advanced only for streams that are used.
		 */
		uint64_t stream_offset = 0;

		for (unsigned stream = 0; stream < 4; ++stream) {
			unsigned num_components;
			unsigned stride;
			unsigned num_records;
			LLVMValueRef ring, tmp;

			num_components = sel->info.num_stream_output_components[stream];
			if (!num_components)
				continue;

			/* Presumably 4 bytes per component (the descriptor
			 * below uses a 32-bit data format) - TODO confirm.
			 */
			stride = 4 * num_components * sel->gs_max_out_vertices;

			/* Limit on the stride field for <= CIK. */
			assert(stride < (1 << 14));

			num_records = 64;

			/* View the descriptor as 2 x i64 so that the stream
			 * offset can be added to element 0 with a single
			 * 64-bit add.
			 */
			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
			tmp = LLVMBuildAdd(builder, tmp,
					   LLVMConstInt(ctx->i64,
							stream_offset, 0), "");
			/* The next used stream starts stride * 64 bytes
			 * further; 64 matches num_records above.
			 */
			stream_offset += stride * 64;

			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
			/* Back to 4 x i32 to patch the individual dwords. */
			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
			/* Dword 1: OR in the stride and enable swizzling. */
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
			tmp = LLVMBuildOr(builder, tmp,
				LLVMConstInt(ctx->i32,
					     S_008F04_STRIDE(stride) |
					     S_008F04_SWIZZLE_ENABLE(1), 0), "");
			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
			/* Dword 2: replaced by num_records. */
			ring = LLVMBuildInsertElement(builder, ring,
					LLVMConstInt(ctx->i32, num_records, 0),
					LLVMConstInt(ctx->i32, 2, 0), "");
			/* Dword 3: replaced by a constant format/swizzle word. */
			ring = LLVMBuildInsertElement(builder, ring,
				LLVMConstInt(ctx->i32,
					     S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
					     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
					     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
					     S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
					     S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
					     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
					     S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
					     S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
					     S_008F0C_ADD_TID_ENABLE(1),
					     0),
				LLVMConstInt(ctx->i32, 3, 0), "");

			ctx->gsvs_ring[stream] = ring;
		}
	}
}
4676
4677 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4678 LLVMValueRef param_rw_buffers,
4679 unsigned param_pos_fixed_pt)
4680 {
4681 struct gallivm_state *gallivm = &ctx->gallivm;
4682 LLVMBuilderRef builder = gallivm->builder;
4683 LLVMValueRef slot, desc, offset, row, bit, address[2];
4684
4685 /* Use the fixed-point gl_FragCoord input.
4686 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4687 * per coordinate to get the repeating effect.
4688 */
4689 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4690 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4691
4692 /* Load the buffer descriptor. */
4693 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4694 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
4695
4696 /* The stipple pattern is 32x32, each row has 32 bits. */
4697 offset = LLVMBuildMul(builder, address[1],
4698 LLVMConstInt(ctx->i32, 4, 0), "");
4699 row = buffer_load_const(ctx, desc, offset);
4700 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
4701 bit = LLVMBuildLShr(builder, row, address[0], "");
4702 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4703
4704 /* The intrinsic kills the thread if arg < 0. */
4705 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
4706 LLVMConstReal(ctx->f32, -1), "");
4707 ac_build_kill(&ctx->ac, bit);
4708 }
4709
4710 void si_shader_binary_read_config(struct ac_shader_binary *binary,
4711 struct si_shader_config *conf,
4712 unsigned symbol_offset)
4713 {
4714 unsigned i;
4715 const unsigned char *config =
4716 ac_shader_binary_config_start(binary, symbol_offset);
4717 bool really_needs_scratch = false;
4718
4719 /* LLVM adds SGPR spills to the scratch size.
4720 * Find out if we really need the scratch buffer.
4721 */
4722 for (i = 0; i < binary->reloc_count; i++) {
4723 const struct ac_shader_reloc *reloc = &binary->relocs[i];
4724
4725 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
4726 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4727 really_needs_scratch = true;
4728 break;
4729 }
4730 }
4731
4732 /* XXX: We may be able to emit some of these values directly rather than
4733 * extracting fields to be emitted later.
4734 */
4735
4736 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
4737 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
4738 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
4739 switch (reg) {
4740 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
4741 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
4742 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
4743 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
4744 case R_00B848_COMPUTE_PGM_RSRC1:
4745 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
4746 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
4747 conf->float_mode = G_00B028_FLOAT_MODE(value);
4748 conf->rsrc1 = value;
4749 break;
4750 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
4751 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
4752 break;
4753 case R_00B84C_COMPUTE_PGM_RSRC2:
4754 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
4755 conf->rsrc2 = value;
4756 break;
4757 case R_0286CC_SPI_PS_INPUT_ENA:
4758 conf->spi_ps_input_ena = value;
4759 break;
4760 case R_0286D0_SPI_PS_INPUT_ADDR:
4761 conf->spi_ps_input_addr = value;
4762 break;
4763 case R_0286E8_SPI_TMPRING_SIZE:
4764 case R_00B860_COMPUTE_TMPRING_SIZE:
4765 /* WAVESIZE is in units of 256 dwords. */
4766 if (really_needs_scratch)
4767 conf->scratch_bytes_per_wave =
4768 G_00B860_WAVESIZE(value) * 256 * 4;
4769 break;
4770 case 0x4: /* SPILLED_SGPRS */
4771 conf->spilled_sgprs = value;
4772 break;
4773 case 0x8: /* SPILLED_VGPRS */
4774 conf->spilled_vgprs = value;
4775 break;
4776 default:
4777 {
4778 static bool printed;
4779
4780 if (!printed) {
4781 fprintf(stderr, "Warning: LLVM emitted unknown "
4782 "config register: 0x%x\n", reg);
4783 printed = true;
4784 }
4785 }
4786 break;
4787 }
4788 }
4789
4790 if (!conf->spi_ps_input_addr)
4791 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
4792 }
4793
4794 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4795 uint64_t scratch_va)
4796 {
4797 unsigned i;
4798 uint32_t scratch_rsrc_dword0 = scratch_va;
4799 uint32_t scratch_rsrc_dword1 =
4800 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
4801
4802 /* Enable scratch coalescing. */
4803 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
4804
4805 for (i = 0 ; i < shader->binary.reloc_count; i++) {
4806 const struct ac_shader_reloc *reloc =
4807 &shader->binary.relocs[i];
4808 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
4809 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4810 &scratch_rsrc_dword0, 4);
4811 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4812 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4813 &scratch_rsrc_dword1, 4);
4814 }
4815 }
4816 }
4817
4818 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4819 {
4820 unsigned size = shader->binary.code_size;
4821
4822 if (shader->prolog)
4823 size += shader->prolog->binary.code_size;
4824 if (shader->previous_stage)
4825 size += shader->previous_stage->binary.code_size;
4826 if (shader->prolog2)
4827 size += shader->prolog2->binary.code_size;
4828 if (shader->epilog)
4829 size += shader->epilog->binary.code_size;
4830 return size;
4831 }
4832
4833 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4834 {
4835 const struct ac_shader_binary *prolog =
4836 shader->prolog ? &shader->prolog->binary : NULL;
4837 const struct ac_shader_binary *previous_stage =
4838 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4839 const struct ac_shader_binary *prolog2 =
4840 shader->prolog2 ? &shader->prolog2->binary : NULL;
4841 const struct ac_shader_binary *epilog =
4842 shader->epilog ? &shader->epilog->binary : NULL;
4843 const struct ac_shader_binary *mainb = &shader->binary;
4844 unsigned bo_size = si_get_shader_binary_size(shader) +
4845 (!epilog ? mainb->rodata_size : 0);
4846 unsigned char *ptr;
4847
4848 assert(!prolog || !prolog->rodata_size);
4849 assert(!previous_stage || !previous_stage->rodata_size);
4850 assert(!prolog2 || !prolog2->rodata_size);
4851 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4852 !mainb->rodata_size);
4853 assert(!epilog || !epilog->rodata_size);
4854
4855 r600_resource_reference(&shader->bo, NULL);
4856 shader->bo = (struct r600_resource*)
4857 pipe_buffer_create(&sscreen->b.b, 0,
4858 PIPE_USAGE_IMMUTABLE,
4859 align(bo_size, SI_CPDMA_ALIGNMENT));
4860 if (!shader->bo)
4861 return -ENOMEM;
4862
4863 /* Upload. */
4864 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4865 PIPE_TRANSFER_READ_WRITE |
4866 PIPE_TRANSFER_UNSYNCHRONIZED);
4867
4868 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4869 * endian-independent. */
4870 if (prolog) {
4871 memcpy(ptr, prolog->code, prolog->code_size);
4872 ptr += prolog->code_size;
4873 }
4874 if (previous_stage) {
4875 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4876 ptr += previous_stage->code_size;
4877 }
4878 if (prolog2) {
4879 memcpy(ptr, prolog2->code, prolog2->code_size);
4880 ptr += prolog2->code_size;
4881 }
4882
4883 memcpy(ptr, mainb->code, mainb->code_size);
4884 ptr += mainb->code_size;
4885
4886 if (epilog)
4887 memcpy(ptr, epilog->code, epilog->code_size);
4888 else if (mainb->rodata_size > 0)
4889 memcpy(ptr, mainb->rodata, mainb->rodata_size);
4890
4891 sscreen->b.ws->buffer_unmap(shader->bo->buf);
4892 return 0;
4893 }
4894
4895 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
4896 struct pipe_debug_callback *debug,
4897 const char *name, FILE *file)
4898 {
4899 char *line, *p;
4900 unsigned i, count;
4901
4902 if (binary->disasm_string) {
4903 fprintf(file, "Shader %s disassembly:\n", name);
4904 fprintf(file, "%s", binary->disasm_string);
4905
4906 if (debug && debug->debug_message) {
4907 /* Very long debug messages are cut off, so send the
4908 * disassembly one line at a time. This causes more
4909 * overhead, but on the plus side it simplifies
4910 * parsing of resulting logs.
4911 */
4912 pipe_debug_message(debug, SHADER_INFO,
4913 "Shader Disassembly Begin");
4914
4915 line = binary->disasm_string;
4916 while (*line) {
4917 p = util_strchrnul(line, '\n');
4918 count = p - line;
4919
4920 if (count) {
4921 pipe_debug_message(debug, SHADER_INFO,
4922 "%.*s", count, line);
4923 }
4924
4925 if (!*p)
4926 break;
4927 line = p + 1;
4928 }
4929
4930 pipe_debug_message(debug, SHADER_INFO,
4931 "Shader Disassembly End");
4932 }
4933 } else {
4934 fprintf(file, "Shader %s binary:\n", name);
4935 for (i = 0; i < binary->code_size; i += 4) {
4936 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
4937 binary->code[i + 3], binary->code[i + 2],
4938 binary->code[i + 1], binary->code[i]);
4939 }
4940 }
4941 }
4942
4943 static void si_shader_dump_stats(struct si_screen *sscreen,
4944 const struct si_shader *shader,
4945 struct pipe_debug_callback *debug,
4946 unsigned processor,
4947 FILE *file,
4948 bool check_debug_option)
4949 {
4950 const struct si_shader_config *conf = &shader->config;
4951 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
4952 unsigned code_size = si_get_shader_binary_size(shader);
4953 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
4954 unsigned lds_per_wave = 0;
4955 unsigned max_simd_waves = 10;
4956
4957 /* Compute LDS usage for PS. */
4958 switch (processor) {
4959 case PIPE_SHADER_FRAGMENT:
4960 /* The minimum usage per wave is (num_inputs * 48). The maximum
4961 * usage is (num_inputs * 48 * 16).
4962 * We can get anything in between and it varies between waves.
4963 *
4964 * The 48 bytes per input for a single primitive is equal to
4965 * 4 bytes/component * 4 components/input * 3 points.
4966 *
4967 * Other stages don't know the size at compile time or don't
4968 * allocate LDS per wave, but instead they do it per thread group.
4969 */
4970 lds_per_wave = conf->lds_size * lds_increment +
4971 align(num_inputs * 48, lds_increment);
4972 break;
4973 case PIPE_SHADER_COMPUTE:
4974 if (shader->selector) {
4975 unsigned max_workgroup_size =
4976 si_get_max_workgroup_size(shader);
4977 lds_per_wave = (conf->lds_size * lds_increment) /
4978 DIV_ROUND_UP(max_workgroup_size, 64);
4979 }
4980 break;
4981 }
4982
4983 /* Compute the per-SIMD wave counts. */
4984 if (conf->num_sgprs) {
4985 if (sscreen->b.chip_class >= VI)
4986 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
4987 else
4988 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
4989 }
4990
4991 if (conf->num_vgprs)
4992 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
4993
4994 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
4995 * 16KB makes some SIMDs unoccupied). */
4996 if (lds_per_wave)
4997 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
4998
4999 if (!check_debug_option ||
5000 r600_can_dump_shader(&sscreen->b, processor)) {
5001 if (processor == PIPE_SHADER_FRAGMENT) {
5002 fprintf(file, "*** SHADER CONFIG ***\n"
5003 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5004 "SPI_PS_INPUT_ENA = 0x%04x\n",
5005 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5006 }
5007
5008 fprintf(file, "*** SHADER STATS ***\n"
5009 "SGPRS: %d\n"
5010 "VGPRS: %d\n"
5011 "Spilled SGPRs: %d\n"
5012 "Spilled VGPRs: %d\n"
5013 "Private memory VGPRs: %d\n"
5014 "Code Size: %d bytes\n"
5015 "LDS: %d blocks\n"
5016 "Scratch: %d bytes per wave\n"
5017 "Max Waves: %d\n"
5018 "********************\n\n\n",
5019 conf->num_sgprs, conf->num_vgprs,
5020 conf->spilled_sgprs, conf->spilled_vgprs,
5021 conf->private_mem_vgprs, code_size,
5022 conf->lds_size, conf->scratch_bytes_per_wave,
5023 max_simd_waves);
5024 }
5025
5026 pipe_debug_message(debug, SHADER_INFO,
5027 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5028 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5029 "Spilled VGPRs: %d PrivMem VGPRs: %d",
5030 conf->num_sgprs, conf->num_vgprs, code_size,
5031 conf->lds_size, conf->scratch_bytes_per_wave,
5032 max_simd_waves, conf->spilled_sgprs,
5033 conf->spilled_vgprs, conf->private_mem_vgprs);
5034 }
5035
5036 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5037 {
5038 switch (processor) {
5039 case PIPE_SHADER_VERTEX:
5040 if (shader->key.as_es)
5041 return "Vertex Shader as ES";
5042 else if (shader->key.as_ls)
5043 return "Vertex Shader as LS";
5044 else
5045 return "Vertex Shader as VS";
5046 case PIPE_SHADER_TESS_CTRL:
5047 return "Tessellation Control Shader";
5048 case PIPE_SHADER_TESS_EVAL:
5049 if (shader->key.as_es)
5050 return "Tessellation Evaluation Shader as ES";
5051 else
5052 return "Tessellation Evaluation Shader as VS";
5053 case PIPE_SHADER_GEOMETRY:
5054 if (shader->is_gs_copy_shader)
5055 return "GS Copy Shader as VS";
5056 else
5057 return "Geometry Shader";
5058 case PIPE_SHADER_FRAGMENT:
5059 return "Pixel Shader";
5060 case PIPE_SHADER_COMPUTE:
5061 return "Compute Shader";
5062 default:
5063 return "Unknown Shader";
5064 }
5065 }
5066
5067 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5068 struct pipe_debug_callback *debug, unsigned processor,
5069 FILE *file, bool check_debug_option)
5070 {
5071 if (!check_debug_option ||
5072 r600_can_dump_shader(&sscreen->b, processor))
5073 si_dump_shader_key(processor, shader, file);
5074
5075 if (!check_debug_option && shader->binary.llvm_ir_string) {
5076 if (shader->previous_stage &&
5077 shader->previous_stage->binary.llvm_ir_string) {
5078 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5079 si_get_shader_name(shader, processor));
5080 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5081 }
5082
5083 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5084 si_get_shader_name(shader, processor));
5085 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5086 }
5087
5088 if (!check_debug_option ||
5089 (r600_can_dump_shader(&sscreen->b, processor) &&
5090 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5091 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5092
5093 if (shader->prolog)
5094 si_shader_dump_disassembly(&shader->prolog->binary,
5095 debug, "prolog", file);
5096 if (shader->previous_stage)
5097 si_shader_dump_disassembly(&shader->previous_stage->binary,
5098 debug, "previous stage", file);
5099 if (shader->prolog2)
5100 si_shader_dump_disassembly(&shader->prolog2->binary,
5101 debug, "prolog2", file);
5102
5103 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5104
5105 if (shader->epilog)
5106 si_shader_dump_disassembly(&shader->epilog->binary,
5107 debug, "epilog", file);
5108 fprintf(file, "\n");
5109 }
5110
5111 si_shader_dump_stats(sscreen, shader, debug, processor, file,
5112 check_debug_option);
5113 }
5114
5115 static int si_compile_llvm(struct si_screen *sscreen,
5116 struct ac_shader_binary *binary,
5117 struct si_shader_config *conf,
5118 LLVMTargetMachineRef tm,
5119 LLVMModuleRef mod,
5120 struct pipe_debug_callback *debug,
5121 unsigned processor,
5122 const char *name)
5123 {
5124 int r = 0;
5125 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5126
5127 if (r600_can_dump_shader(&sscreen->b, processor)) {
5128 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5129
5130 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5131 fprintf(stderr, "%s LLVM IR:\n\n", name);
5132 ac_dump_module(mod);
5133 fprintf(stderr, "\n");
5134 }
5135 }
5136
5137 if (sscreen->record_llvm_ir) {
5138 char *ir = LLVMPrintModuleToString(mod);
5139 binary->llvm_ir_string = strdup(ir);
5140 LLVMDisposeMessage(ir);
5141 }
5142
5143 if (!si_replace_shader(count, binary)) {
5144 r = si_llvm_compile(mod, binary, tm, debug);
5145 if (r)
5146 return r;
5147 }
5148
5149 si_shader_binary_read_config(binary, conf, 0);
5150
5151 /* Enable 64-bit and 16-bit denormals, because there is no performance
5152 * cost.
5153 *
5154 * If denormals are enabled, all floating-point output modifiers are
5155 * ignored.
5156 *
5157 * Don't enable denormals for 32-bit floats, because:
5158 * - Floating-point output modifiers would be ignored by the hw.
5159 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5160 * have to stop using those.
5161 * - SI & CI would be very slow.
5162 */
5163 conf->float_mode |= V_00B028_FP_64_DENORMS;
5164
5165 FREE(binary->config);
5166 FREE(binary->global_symbol_offsets);
5167 binary->config = NULL;
5168 binary->global_symbol_offsets = NULL;
5169
5170 /* Some shaders can't have rodata because their binaries can be
5171 * concatenated.
5172 */
5173 if (binary->rodata_size &&
5174 (processor == PIPE_SHADER_VERTEX ||
5175 processor == PIPE_SHADER_TESS_CTRL ||
5176 processor == PIPE_SHADER_TESS_EVAL ||
5177 processor == PIPE_SHADER_FRAGMENT)) {
5178 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5179 return -EINVAL;
5180 }
5181
5182 return r;
5183 }
5184
5185 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5186 {
5187 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5188 LLVMBuildRetVoid(ctx->gallivm.builder);
5189 else
5190 LLVMBuildRet(ctx->gallivm.builder, ret);
5191 }
5192
/**
 * Generate code for the hardware VS shader stage to go with a geometry shader
 *
 * The generated shader reads the outputs of the geometry shader back from
 * the GSVS ring, performs streamout if the geometry shader has streamout
 * outputs, and exports the outputs of stream 0.
 *
 * \param sscreen      screen
 * \param tm           target machine used for compilation
 * \param gs_selector  selector of the geometry shader
 * \param debug        debug callback passed to the compiler and the dump
 *
 * \return the new shader, or NULL if an allocation, the compilation or the
 *         upload failed.
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	struct gallivm_state *gallivm = &ctx.gallivm;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	/* Temporary array with one entry per GS output; freed before
	 * returning.
	 */
	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}


	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The function is built as a vertex shader; the type is switched
	 * to geometry only for dumping further below.
	 */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = gallivm->builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Offset of this vertex in the ring: vertex ID * 4. */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
						    ctx.param_vertex_id), 4);

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	/* Without streamout outputs, only stream 0 is processed. */
	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* output_streams holds one 2-bit stream index per channel. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	/* Switch on the stream ID. Each handled stream gets its own block
	 * in front of "end"; all other values branch to "end" directly.
	 */
	end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		/* Nothing is written to this stream. */
		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams other than 0 only matter for streamout. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		/* offset counts the channels of this stream loaded so far. */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Channels that are unused or belong to
				 * another stream stay undefined.
				 */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is exported. */
		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	/* NOTE(review): only the si_shader structure itself is freed on
	 * failure; anything still owned by shader->binary or shader->bo at
	 * this point doesn't appear to be released - confirm.
	 */
	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
5343
5344 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5345 const struct si_vs_prolog_bits *prolog,
5346 const char *prefix, FILE *f)
5347 {
5348 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5349 prefix, prolog->instance_divisor_is_one);
5350 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5351 prefix, prolog->instance_divisor_is_fetched);
5352
5353 fprintf(f, " mono.vs.fix_fetch = {");
5354 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5355 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5356 fprintf(f, "}\n");
5357 }
5358
/**
 * Print the fields of a shader key that are relevant for the given stage.
 *
 * \param processor  shader stage (PIPE_SHADER_*) selecting which fields
 *                   are printed; an unknown stage triggers an assertion
 * \param shader     shader whose key is printed
 * \param f          output stream
 */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f)
{
	const struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ls = %u\n", key->as_ls);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* The LS prolog bits are printed on GFX9 and later only. */
		if (shader->selector->screen->b.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* Nothing stage-specific is printed for the GS copy shader. */
		if (shader->is_gs_copy_shader)
			break;

		/* The VS prolog bits are printed on GFX9 and later, and only
		 * when the ES part is a vertex shader.
		 */
		if (shader->selector->screen->b.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		/* No key fields are printed for compute shaders. */
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* The "opt" fields are printed for GS, TES and VS keys that have
	 * neither as_es nor as_ls set.
	 */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
5438
5439 static void si_init_shader_ctx(struct si_shader_context *ctx,
5440 struct si_screen *sscreen,
5441 LLVMTargetMachineRef tm)
5442 {
5443 struct lp_build_tgsi_context *bld_base;
5444
5445 si_llvm_context_init(ctx, sscreen, tm);
5446
5447 bld_base = &ctx->bld_base;
5448 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5449
5450 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5451 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5452 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5453
5454 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5455
5456 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5457
5458 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5459 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5460 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5461 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5462
5463 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5464 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5465 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5466 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5467 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5468 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5469 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5470 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
5471 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5472
5473 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
5474 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5475 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5476 }
5477
5478 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5479 {
5480 struct si_shader *shader = ctx->shader;
5481 struct tgsi_shader_info *info = &shader->selector->info;
5482
5483 if ((ctx->type != PIPE_SHADER_VERTEX &&
5484 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5485 shader->key.as_ls ||
5486 shader->key.as_es)
5487 return;
5488
5489 ac_optimize_vs_outputs(&ctx->ac,
5490 ctx->main_fn,
5491 shader->info.vs_output_param_offset,
5492 info->num_outputs,
5493 &shader->info.nr_param_exports);
5494 }
5495
5496 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5497 {
5498 ctx->shader->config.private_mem_vgprs = 0;
5499
5500 /* Process all LLVM instructions. */
5501 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5502 while (bb) {
5503 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5504
5505 while (next) {
5506 LLVMValueRef inst = next;
5507 next = LLVMGetNextInstruction(next);
5508
5509 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5510 continue;
5511
5512 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5513 /* No idea why LLVM aligns allocas to 4 elements. */
5514 unsigned alignment = LLVMGetAlignment(inst);
5515 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
5516 ctx->shader->config.private_mem_vgprs += dw_size;
5517 }
5518 bb = LLVMGetNextBasicBlock(bb);
5519 }
5520 }
5521
5522 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5523 {
5524 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5525 lp_build_intrinsic(ctx->gallivm.builder,
5526 "llvm.amdgcn.init.exec", ctx->voidt,
5527 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5528 }
5529
5530 static void si_init_exec_from_input(struct si_shader_context *ctx,
5531 unsigned param, unsigned bitoffset)
5532 {
5533 LLVMValueRef args[] = {
5534 LLVMGetParam(ctx->main_fn, param),
5535 LLVMConstInt(ctx->i32, bitoffset, 0),
5536 };
5537 lp_build_intrinsic(ctx->gallivm.builder,
5538 "llvm.amdgcn.init.exec.from.input",
5539 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5540 }
5541
5542 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
5543 bool is_monolithic)
5544 {
5545 struct si_shader *shader = ctx->shader;
5546 struct si_shader_selector *sel = shader->selector;
5547 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5548
5549 switch (ctx->type) {
5550 case PIPE_SHADER_VERTEX:
5551 ctx->load_input = declare_input_vs;
5552 if (shader->key.as_ls)
5553 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
5554 else if (shader->key.as_es)
5555 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5556 else
5557 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
5558 break;
5559 case PIPE_SHADER_TESS_CTRL:
5560 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5561 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5562 bld_base->emit_store = store_output_tcs;
5563 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
5564 break;
5565 case PIPE_SHADER_TESS_EVAL:
5566 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5567 if (shader->key.as_es)
5568 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5569 else
5570 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
5571 break;
5572 case PIPE_SHADER_GEOMETRY:
5573 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
5574 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
5575 break;
5576 case PIPE_SHADER_FRAGMENT:
5577 ctx->load_input = declare_input_fs;
5578 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
5579 break;
5580 case PIPE_SHADER_COMPUTE:
5581 ctx->declare_memory_region = declare_compute_memory;
5582 break;
5583 default:
5584 assert(!"Unsupported shader type");
5585 return false;
5586 }
5587
5588 create_function(ctx);
5589 preload_ring_buffers(ctx);
5590
5591 /* For GFX9 merged shaders:
5592 * - Set EXEC for the first shader. If the prolog is present, set
5593 * EXEC there instead.
5594 * - Add a barrier before the second shader.
5595 * - In the second shader, reset EXEC to ~0 and wrap the main part in
5596 * an if-statement. This is required for correctness in geometry
5597 * shaders, to ensure that empty GS waves do not send GS_EMIT and
5598 * GS_CUT messages.
5599 *
5600 * For monolithic merged shaders, the first shader is wrapped in an
5601 * if-block together with its prolog in si_build_wrapper_function.
5602 */
5603 if (ctx->screen->b.chip_class >= GFX9) {
5604 if (!is_monolithic &&
5605 sel->info.num_instructions > 1 && /* not empty shader */
5606 (shader->key.as_es || shader->key.as_ls) &&
5607 (ctx->type == PIPE_SHADER_TESS_EVAL ||
5608 (ctx->type == PIPE_SHADER_VERTEX &&
5609 !sel->vs_needs_prolog))) {
5610 si_init_exec_from_input(ctx,
5611 ctx->param_merged_wave_info, 0);
5612 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5613 ctx->type == PIPE_SHADER_GEOMETRY) {
5614 if (!is_monolithic)
5615 si_init_exec_full_mask(ctx);
5616
5617 /* The barrier must execute for all shaders in a
5618 * threadgroup.
5619 */
5620 si_llvm_emit_barrier(NULL, bld_base, NULL);
5621
5622 LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
5623 LLVMValueRef ena =
5624 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
5625 ac_get_thread_id(&ctx->ac), num_threads, "");
5626 lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
5627 }
5628 }
5629
5630 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5631 int i;
5632 for (i = 0; i < 4; i++) {
5633 ctx->gs_next_vertex[i] =
5634 lp_build_alloca(&ctx->gallivm,
5635 ctx->i32, "");
5636 }
5637 }
5638
5639 if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
5640 ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
5641 /* This is initialized to 0.0 = not kill. */
5642 ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
5643 }
5644
5645 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
5646 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
5647 return false;
5648 }
5649
5650 si_llvm_build_ret(ctx, ctx->return_value);
5651 return true;
5652 }
5653
5654 /**
5655 * Compute the VS prolog key, which contains all the information needed to
5656 * build the VS prolog function, and set shader->info bits where needed.
5657 *
5658 * \param info Shader info of the vertex shader.
5659 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5660 * \param prolog_key Key of the VS prolog
5661 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5662 * \param key Output shader part key.
5663 */
5664 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5665 unsigned num_input_sgprs,
5666 const struct si_vs_prolog_bits *prolog_key,
5667 struct si_shader *shader_out,
5668 union si_shader_part_key *key)
5669 {
5670 memset(key, 0, sizeof(*key));
5671 key->vs_prolog.states = *prolog_key;
5672 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5673 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5674 key->vs_prolog.as_ls = shader_out->key.as_ls;
5675
5676 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5677 key->vs_prolog.as_ls = 1;
5678 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5679 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5680 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5681 }
5682
5683 /* Enable loading the InstanceID VGPR. */
5684 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5685
5686 if ((key->vs_prolog.states.instance_divisor_is_one |
5687 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5688 shader_out->info.uses_instanceid = true;
5689 }
5690
5691 /**
5692 * Compute the PS prolog key, which contains all the information needed to
5693 * build the PS prolog function, and set related bits in shader->config.
5694 */
5695 static void si_get_ps_prolog_key(struct si_shader *shader,
5696 union si_shader_part_key *key,
5697 bool separate_prolog)
5698 {
5699 struct tgsi_shader_info *info = &shader->selector->info;
5700
5701 memset(key, 0, sizeof(*key));
5702 key->ps_prolog.states = shader->key.part.ps.prolog;
5703 key->ps_prolog.colors_read = info->colors_read;
5704 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
5705 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
5706 key->ps_prolog.wqm = info->uses_derivatives &&
5707 (key->ps_prolog.colors_read ||
5708 key->ps_prolog.states.force_persp_sample_interp ||
5709 key->ps_prolog.states.force_linear_sample_interp ||
5710 key->ps_prolog.states.force_persp_center_interp ||
5711 key->ps_prolog.states.force_linear_center_interp ||
5712 key->ps_prolog.states.bc_optimize_for_persp ||
5713 key->ps_prolog.states.bc_optimize_for_linear);
5714
5715 if (info->colors_read) {
5716 unsigned *color = shader->selector->color_attr_index;
5717
5718 if (shader->key.part.ps.prolog.color_two_side) {
5719 /* BCOLORs are stored after the last input. */
5720 key->ps_prolog.num_interp_inputs = info->num_inputs;
5721 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
5722 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
5723 }
5724
5725 for (unsigned i = 0; i < 2; i++) {
5726 unsigned interp = info->input_interpolate[color[i]];
5727 unsigned location = info->input_interpolate_loc[color[i]];
5728
5729 if (!(info->colors_read & (0xf << i*4)))
5730 continue;
5731
5732 key->ps_prolog.color_attr_index[i] = color[i];
5733
5734 if (shader->key.part.ps.prolog.flatshade_colors &&
5735 interp == TGSI_INTERPOLATE_COLOR)
5736 interp = TGSI_INTERPOLATE_CONSTANT;
5737
5738 switch (interp) {
5739 case TGSI_INTERPOLATE_CONSTANT:
5740 key->ps_prolog.color_interp_vgpr_index[i] = -1;
5741 break;
5742 case TGSI_INTERPOLATE_PERSPECTIVE:
5743 case TGSI_INTERPOLATE_COLOR:
5744 /* Force the interpolation location for colors here. */
5745 if (shader->key.part.ps.prolog.force_persp_sample_interp)
5746 location = TGSI_INTERPOLATE_LOC_SAMPLE;
5747 if (shader->key.part.ps.prolog.force_persp_center_interp)
5748 location = TGSI_INTERPOLATE_LOC_CENTER;
5749
5750 switch (location) {
5751 case TGSI_INTERPOLATE_LOC_SAMPLE:
5752 key->ps_prolog.color_interp_vgpr_index[i] = 0;
5753 shader->config.spi_ps_input_ena |=
5754 S_0286CC_PERSP_SAMPLE_ENA(1);
5755 break;
5756 case TGSI_INTERPOLATE_LOC_CENTER:
5757 key->ps_prolog.color_interp_vgpr_index[i] = 2;
5758 shader->config.spi_ps_input_ena |=
5759 S_0286CC_PERSP_CENTER_ENA(1);
5760 break;
5761 case TGSI_INTERPOLATE_LOC_CENTROID:
5762 key->ps_prolog.color_interp_vgpr_index[i] = 4;
5763 shader->config.spi_ps_input_ena |=
5764 S_0286CC_PERSP_CENTROID_ENA(1);
5765 break;
5766 default:
5767 assert(0);
5768 }
5769 break;
5770 case TGSI_INTERPOLATE_LINEAR:
5771 /* Force the interpolation location for colors here. */
5772 if (shader->key.part.ps.prolog.force_linear_sample_interp)
5773 location = TGSI_INTERPOLATE_LOC_SAMPLE;
5774 if (shader->key.part.ps.prolog.force_linear_center_interp)
5775 location = TGSI_INTERPOLATE_LOC_CENTER;
5776
5777 /* The VGPR assignment for non-monolithic shaders
5778 * works because InitialPSInputAddr is set on the
5779 * main shader and PERSP_PULL_MODEL is never used.
5780 */
5781 switch (location) {
5782 case TGSI_INTERPOLATE_LOC_SAMPLE:
5783 key->ps_prolog.color_interp_vgpr_index[i] =
5784 separate_prolog ? 6 : 9;
5785 shader->config.spi_ps_input_ena |=
5786 S_0286CC_LINEAR_SAMPLE_ENA(1);
5787 break;
5788 case TGSI_INTERPOLATE_LOC_CENTER:
5789 key->ps_prolog.color_interp_vgpr_index[i] =
5790 separate_prolog ? 8 : 11;
5791 shader->config.spi_ps_input_ena |=
5792 S_0286CC_LINEAR_CENTER_ENA(1);
5793 break;
5794 case TGSI_INTERPOLATE_LOC_CENTROID:
5795 key->ps_prolog.color_interp_vgpr_index[i] =
5796 separate_prolog ? 10 : 13;
5797 shader->config.spi_ps_input_ena |=
5798 S_0286CC_LINEAR_CENTROID_ENA(1);
5799 break;
5800 default:
5801 assert(0);
5802 }
5803 break;
5804 default:
5805 assert(0);
5806 }
5807 }
5808 }
5809 }
5810
5811 /**
5812 * Check whether a PS prolog is required based on the key.
5813 */
5814 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5815 {
5816 return key->ps_prolog.colors_read ||
5817 key->ps_prolog.states.force_persp_sample_interp ||
5818 key->ps_prolog.states.force_linear_sample_interp ||
5819 key->ps_prolog.states.force_persp_center_interp ||
5820 key->ps_prolog.states.force_linear_center_interp ||
5821 key->ps_prolog.states.bc_optimize_for_persp ||
5822 key->ps_prolog.states.bc_optimize_for_linear ||
5823 key->ps_prolog.states.poly_stipple;
5824 }
5825
5826 /**
5827 * Compute the PS epilog key, which contains all the information needed to
5828 * build the PS epilog function.
5829 */
5830 static void si_get_ps_epilog_key(struct si_shader *shader,
5831 union si_shader_part_key *key)
5832 {
5833 struct tgsi_shader_info *info = &shader->selector->info;
5834 memset(key, 0, sizeof(*key));
5835 key->ps_epilog.colors_written = info->colors_written;
5836 key->ps_epilog.writes_z = info->writes_z;
5837 key->ps_epilog.writes_stencil = info->writes_stencil;
5838 key->ps_epilog.writes_samplemask = info->writes_samplemask;
5839 key->ps_epilog.states = shader->key.part.ps.epilog;
5840 }
5841
5842 /**
5843 * Build the GS prolog function. Rotate the input vertices for triangle strips
5844 * with adjacency.
5845 */
5846 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
5847 union si_shader_part_key *key)
5848 {
5849 unsigned num_sgprs, num_vgprs;
5850 struct gallivm_state *gallivm = &ctx->gallivm;
5851 struct si_function_info fninfo;
5852 LLVMBuilderRef builder = gallivm->builder;
5853 LLVMTypeRef returns[48];
5854 LLVMValueRef func, ret;
5855
5856 si_init_function_info(&fninfo);
5857
5858 if (ctx->screen->b.chip_class >= GFX9) {
5859 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
5860 num_vgprs = 5; /* ES inputs are not needed by GS */
5861 } else {
5862 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
5863 num_vgprs = 8;
5864 }
5865
5866 for (unsigned i = 0; i < num_sgprs; ++i) {
5867 add_arg(&fninfo, ARG_SGPR, ctx->i32);
5868 returns[i] = ctx->i32;
5869 }
5870
5871 for (unsigned i = 0; i < num_vgprs; ++i) {
5872 add_arg(&fninfo, ARG_VGPR, ctx->i32);
5873 returns[num_sgprs + i] = ctx->f32;
5874 }
5875
5876 /* Create the function. */
5877 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
5878 &fninfo, 0);
5879 func = ctx->main_fn;
5880
5881 /* Set the full EXEC mask for the prolog, because we are only fiddling
5882 * with registers here. The main shader part will set the correct EXEC
5883 * mask.
5884 */
5885 if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
5886 si_init_exec_full_mask(ctx);
5887
5888 /* Copy inputs to outputs. This should be no-op, as the registers match,
5889 * but it will prevent the compiler from overwriting them unintentionally.
5890 */
5891 ret = ctx->return_value;
5892 for (unsigned i = 0; i < num_sgprs; i++) {
5893 LLVMValueRef p = LLVMGetParam(func, i);
5894 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
5895 }
5896 for (unsigned i = 0; i < num_vgprs; i++) {
5897 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
5898 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
5899 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
5900 }
5901
5902 if (key->gs_prolog.states.tri_strip_adj_fix) {
5903 /* Remap the input vertices for every other primitive. */
5904 const unsigned gfx6_vtx_params[6] = {
5905 num_sgprs,
5906 num_sgprs + 1,
5907 num_sgprs + 3,
5908 num_sgprs + 4,
5909 num_sgprs + 5,
5910 num_sgprs + 6
5911 };
5912 const unsigned gfx9_vtx_params[3] = {
5913 num_sgprs,
5914 num_sgprs + 1,
5915 num_sgprs + 4,
5916 };
5917 LLVMValueRef vtx_in[6], vtx_out[6];
5918 LLVMValueRef prim_id, rotate;
5919
5920 if (ctx->screen->b.chip_class >= GFX9) {
5921 for (unsigned i = 0; i < 3; i++) {
5922 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
5923 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
5924 }
5925 } else {
5926 for (unsigned i = 0; i < 6; i++)
5927 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
5928 }
5929
5930 prim_id = LLVMGetParam(func, num_sgprs + 2);
5931 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
5932
5933 for (unsigned i = 0; i < 6; ++i) {
5934 LLVMValueRef base, rotated;
5935 base = vtx_in[i];
5936 rotated = vtx_in[(i + 4) % 6];
5937 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
5938 }
5939
5940 if (ctx->screen->b.chip_class >= GFX9) {
5941 for (unsigned i = 0; i < 3; i++) {
5942 LLVMValueRef hi, out;
5943
5944 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
5945 LLVMConstInt(ctx->i32, 16, 0), "");
5946 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
5947 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
5948 ret = LLVMBuildInsertValue(builder, ret, out,
5949 gfx9_vtx_params[i], "");
5950 }
5951 } else {
5952 for (unsigned i = 0; i < 6; i++) {
5953 LLVMValueRef out;
5954
5955 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
5956 ret = LLVMBuildInsertValue(builder, ret, out,
5957 gfx6_vtx_params[i], "");
5958 }
5959 }
5960 }
5961
5962 LLVMBuildRet(builder, ret);
5963 }
5964
5965 /**
5966 * Given a list of shader part functions, build a wrapper function that
5967 * runs them in sequence to form a monolithic shader.
5968 */
5969 static void si_build_wrapper_function(struct si_shader_context *ctx,
5970 LLVMValueRef *parts,
5971 unsigned num_parts,
5972 unsigned main_part,
5973 unsigned next_shader_first_part)
5974 {
5975 struct gallivm_state *gallivm = &ctx->gallivm;
5976 LLVMBuilderRef builder = ctx->gallivm.builder;
5977 /* PS epilog has one arg per color component; gfx9 merged shader
5978 * prologs need to forward 32 user SGPRs.
5979 */
5980 struct si_function_info fninfo;
5981 LLVMValueRef initial[64], out[64];
5982 LLVMTypeRef function_type;
5983 unsigned num_first_params;
5984 unsigned num_out, initial_num_out;
5985 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
5986 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
5987 unsigned num_sgprs, num_vgprs;
5988 unsigned gprs;
5989 struct lp_build_if_state if_state;
5990
5991 si_init_function_info(&fninfo);
5992
5993 for (unsigned i = 0; i < num_parts; ++i) {
5994 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
5995 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
5996 }
5997
5998 /* The parameters of the wrapper function correspond to those of the
5999 * first part in terms of SGPRs and VGPRs, but we use the types of the
6000 * main part to get the right types. This is relevant for the
6001 * dereferenceable attribute on descriptor table pointers.
6002 */
6003 num_sgprs = 0;
6004 num_vgprs = 0;
6005
6006 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6007 num_first_params = LLVMCountParamTypes(function_type);
6008
6009 for (unsigned i = 0; i < num_first_params; ++i) {
6010 LLVMValueRef param = LLVMGetParam(parts[0], i);
6011
6012 if (ac_is_sgpr_param(param)) {
6013 assert(num_vgprs == 0);
6014 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
6015 } else {
6016 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
6017 }
6018 }
6019
6020 gprs = 0;
6021 while (gprs < num_sgprs + num_vgprs) {
6022 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6023 LLVMTypeRef type = LLVMTypeOf(param);
6024 unsigned size = llvm_get_type_size(type) / 4;
6025
6026 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6027
6028 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6029 assert(gprs + size <= num_sgprs + num_vgprs &&
6030 (gprs >= num_sgprs || gprs + size <= num_sgprs));
6031
6032 gprs += size;
6033 }
6034
6035 si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6036 si_get_max_workgroup_size(ctx->shader));
6037
6038 if (is_merged_shader(ctx->shader))
6039 si_init_exec_full_mask(ctx);
6040
6041 /* Record the arguments of the function as if they were an output of
6042 * a previous part.
6043 */
6044 num_out = 0;
6045 num_out_sgpr = 0;
6046
6047 for (unsigned i = 0; i < fninfo.num_params; ++i) {
6048 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6049 LLVMTypeRef param_type = LLVMTypeOf(param);
6050 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6051 unsigned size = llvm_get_type_size(param_type) / 4;
6052
6053 if (size == 1) {
6054 if (param_type != out_type)
6055 param = LLVMBuildBitCast(builder, param, out_type, "");
6056 out[num_out++] = param;
6057 } else {
6058 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6059
6060 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6061 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6062 param_type = ctx->i64;
6063 }
6064
6065 if (param_type != vector_type)
6066 param = LLVMBuildBitCast(builder, param, vector_type, "");
6067
6068 for (unsigned j = 0; j < size; ++j)
6069 out[num_out++] = LLVMBuildExtractElement(
6070 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6071 }
6072
6073 if (i < fninfo.num_sgpr_params)
6074 num_out_sgpr = num_out;
6075 }
6076
6077 memcpy(initial, out, sizeof(out));
6078 initial_num_out = num_out;
6079 initial_num_out_sgpr = num_out_sgpr;
6080
6081 /* Now chain the parts. */
6082 for (unsigned part = 0; part < num_parts; ++part) {
6083 LLVMValueRef in[48];
6084 LLVMValueRef ret;
6085 LLVMTypeRef ret_type;
6086 unsigned out_idx = 0;
6087 unsigned num_params = LLVMCountParams(parts[part]);
6088
6089 /* Merged shaders are executed conditionally depending
6090 * on the number of enabled threads passed in the input SGPRs. */
6091 if (is_merged_shader(ctx->shader) && part == 0) {
6092 LLVMValueRef ena, count = initial[3];
6093
6094 count = LLVMBuildAnd(builder, count,
6095 LLVMConstInt(ctx->i32, 0x7f, 0), "");
6096 ena = LLVMBuildICmp(builder, LLVMIntULT,
6097 ac_get_thread_id(&ctx->ac), count, "");
6098 lp_build_if(&if_state, &ctx->gallivm, ena);
6099 }
6100
6101 /* Derive arguments for the next part from outputs of the
6102 * previous one.
6103 */
6104 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6105 LLVMValueRef param;
6106 LLVMTypeRef param_type;
6107 bool is_sgpr;
6108 unsigned param_size;
6109 LLVMValueRef arg = NULL;
6110
6111 param = LLVMGetParam(parts[part], param_idx);
6112 param_type = LLVMTypeOf(param);
6113 param_size = llvm_get_type_size(param_type) / 4;
6114 is_sgpr = ac_is_sgpr_param(param);
6115
6116 if (is_sgpr) {
6117 #if HAVE_LLVM < 0x0400
6118 LLVMRemoveAttribute(param, LLVMByValAttribute);
6119 #else
6120 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
6121 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
6122 #endif
6123 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
6124 }
6125
6126 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6127 assert(is_sgpr || out_idx >= num_out_sgpr);
6128
6129 if (param_size == 1)
6130 arg = out[out_idx];
6131 else
6132 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
6133
6134 if (LLVMTypeOf(arg) != param_type) {
6135 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6136 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6137 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6138 } else {
6139 arg = LLVMBuildBitCast(builder, arg, param_type, "");
6140 }
6141 }
6142
6143 in[param_idx] = arg;
6144 out_idx += param_size;
6145 }
6146
6147 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6148
6149 if (is_merged_shader(ctx->shader) &&
6150 part + 1 == next_shader_first_part) {
6151 lp_build_endif(&if_state);
6152
6153 /* The second half of the merged shader should use
6154 * the inputs from the toplevel (wrapper) function,
6155 * not the return value from the last call.
6156 *
6157 * That's because the last call was executed condi-
6158 * tionally, so we can't consume it in the main
6159 * block.
6160 */
6161 memcpy(out, initial, sizeof(initial));
6162 num_out = initial_num_out;
6163 num_out_sgpr = initial_num_out_sgpr;
6164 continue;
6165 }
6166
6167 /* Extract the returned GPRs. */
6168 ret_type = LLVMTypeOf(ret);
6169 num_out = 0;
6170 num_out_sgpr = 0;
6171
6172 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6173 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6174
6175 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6176
6177 for (unsigned i = 0; i < ret_size; ++i) {
6178 LLVMValueRef val =
6179 LLVMBuildExtractValue(builder, ret, i, "");
6180
6181 assert(num_out < ARRAY_SIZE(out));
6182 out[num_out++] = val;
6183
6184 if (LLVMTypeOf(val) == ctx->i32) {
6185 assert(num_out_sgpr + 1 == num_out);
6186 num_out_sgpr = num_out;
6187 }
6188 }
6189 }
6190 }
6191
6192 LLVMBuildRetVoid(builder);
6193 }
6194
6195 int si_compile_tgsi_shader(struct si_screen *sscreen,
6196 LLVMTargetMachineRef tm,
6197 struct si_shader *shader,
6198 bool is_monolithic,
6199 struct pipe_debug_callback *debug)
6200 {
6201 struct si_shader_selector *sel = shader->selector;
6202 struct si_shader_context ctx;
6203 int r = -1;
6204
6205 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6206 * conversion fails. */
6207 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6208 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6209 tgsi_dump(sel->tokens, 0);
6210 si_dump_streamout(&sel->so);
6211 }
6212
6213 si_init_shader_ctx(&ctx, sscreen, tm);
6214 si_llvm_context_set_tgsi(&ctx, shader);
6215 ctx.separate_prolog = !is_monolithic;
6216
6217 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6218 sizeof(shader->info.vs_output_param_offset));
6219
6220 shader->info.uses_instanceid = sel->info.uses_instanceid;
6221
6222 ctx.load_system_value = declare_system_value;
6223
6224 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6225 si_llvm_dispose(&ctx);
6226 return -1;
6227 }
6228
6229 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6230 LLVMValueRef parts[2];
6231 bool need_prolog = sel->vs_needs_prolog;
6232
6233 parts[1] = ctx.main_fn;
6234
6235 if (need_prolog) {
6236 union si_shader_part_key prolog_key;
6237 si_get_vs_prolog_key(&sel->info,
6238 shader->info.num_input_sgprs,
6239 &shader->key.part.vs.prolog,
6240 shader, &prolog_key);
6241 si_build_vs_prolog_function(&ctx, &prolog_key);
6242 parts[0] = ctx.main_fn;
6243 }
6244
6245 si_build_wrapper_function(&ctx, parts + !need_prolog,
6246 1 + need_prolog, need_prolog, 0);
6247 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6248 if (sscreen->b.chip_class >= GFX9) {
6249 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6250 LLVMValueRef parts[4];
6251
6252 /* TCS main part */
6253 parts[2] = ctx.main_fn;
6254
6255 /* TCS epilog */
6256 union si_shader_part_key tcs_epilog_key;
6257 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6258 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6259 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6260 parts[3] = ctx.main_fn;
6261
6262 /* VS prolog */
6263 if (ls->vs_needs_prolog) {
6264 union si_shader_part_key vs_prolog_key;
6265 si_get_vs_prolog_key(&ls->info,
6266 shader->info.num_input_sgprs,
6267 &shader->key.part.tcs.ls_prolog,
6268 shader, &vs_prolog_key);
6269 vs_prolog_key.vs_prolog.is_monolithic = true;
6270 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6271 parts[0] = ctx.main_fn;
6272 }
6273
6274 /* VS as LS main part */
6275 struct si_shader shader_ls = {};
6276 shader_ls.selector = ls;
6277 shader_ls.key.as_ls = 1;
6278 shader_ls.key.mono = shader->key.mono;
6279 shader_ls.key.opt = shader->key.opt;
6280 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6281
6282 if (!si_compile_tgsi_main(&ctx, true)) {
6283 si_llvm_dispose(&ctx);
6284 return -1;
6285 }
6286 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6287 parts[1] = ctx.main_fn;
6288
6289 /* Reset the shader context. */
6290 ctx.shader = shader;
6291 ctx.type = PIPE_SHADER_TESS_CTRL;
6292
6293 si_build_wrapper_function(&ctx,
6294 parts + !ls->vs_needs_prolog,
6295 4 - !ls->vs_needs_prolog, 0,
6296 ls->vs_needs_prolog ? 2 : 1);
6297 } else {
6298 LLVMValueRef parts[2];
6299 union si_shader_part_key epilog_key;
6300
6301 parts[0] = ctx.main_fn;
6302
6303 memset(&epilog_key, 0, sizeof(epilog_key));
6304 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6305 si_build_tcs_epilog_function(&ctx, &epilog_key);
6306 parts[1] = ctx.main_fn;
6307
6308 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6309 }
6310 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6311 if (ctx.screen->b.chip_class >= GFX9) {
6312 struct si_shader_selector *es = shader->key.part.gs.es;
6313 LLVMValueRef es_prolog = NULL;
6314 LLVMValueRef es_main = NULL;
6315 LLVMValueRef gs_prolog = NULL;
6316 LLVMValueRef gs_main = ctx.main_fn;
6317
6318 /* GS prolog */
6319 union si_shader_part_key gs_prolog_key;
6320 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6321 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6322 gs_prolog_key.gs_prolog.is_monolithic = true;
6323 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6324 gs_prolog = ctx.main_fn;
6325
6326 /* ES prolog */
6327 if (es->vs_needs_prolog) {
6328 union si_shader_part_key vs_prolog_key;
6329 si_get_vs_prolog_key(&es->info,
6330 shader->info.num_input_sgprs,
6331 &shader->key.part.tcs.ls_prolog,
6332 shader, &vs_prolog_key);
6333 vs_prolog_key.vs_prolog.is_monolithic = true;
6334 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6335 es_prolog = ctx.main_fn;
6336 }
6337
6338 /* ES main part */
6339 struct si_shader shader_es = {};
6340 shader_es.selector = es;
6341 shader_es.key.as_es = 1;
6342 shader_es.key.mono = shader->key.mono;
6343 shader_es.key.opt = shader->key.opt;
6344 si_llvm_context_set_tgsi(&ctx, &shader_es);
6345
6346 if (!si_compile_tgsi_main(&ctx, true)) {
6347 si_llvm_dispose(&ctx);
6348 return -1;
6349 }
6350 shader->info.uses_instanceid |= es->info.uses_instanceid;
6351 es_main = ctx.main_fn;
6352
6353 /* Reset the shader context. */
6354 ctx.shader = shader;
6355 ctx.type = PIPE_SHADER_GEOMETRY;
6356
6357 /* Prepare the array of shader parts. */
6358 LLVMValueRef parts[4];
6359 unsigned num_parts = 0, main_part, next_first_part;
6360
6361 if (es_prolog)
6362 parts[num_parts++] = es_prolog;
6363
6364 parts[main_part = num_parts++] = es_main;
6365 parts[next_first_part = num_parts++] = gs_prolog;
6366 parts[num_parts++] = gs_main;
6367
6368 si_build_wrapper_function(&ctx, parts, num_parts,
6369 main_part, next_first_part);
6370 } else {
6371 LLVMValueRef parts[2];
6372 union si_shader_part_key prolog_key;
6373
6374 parts[1] = ctx.main_fn;
6375
6376 memset(&prolog_key, 0, sizeof(prolog_key));
6377 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6378 si_build_gs_prolog_function(&ctx, &prolog_key);
6379 parts[0] = ctx.main_fn;
6380
6381 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6382 }
6383 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6384 LLVMValueRef parts[3];
6385 union si_shader_part_key prolog_key;
6386 union si_shader_part_key epilog_key;
6387 bool need_prolog;
6388
6389 si_get_ps_prolog_key(shader, &prolog_key, false);
6390 need_prolog = si_need_ps_prolog(&prolog_key);
6391
6392 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6393
6394 if (need_prolog) {
6395 si_build_ps_prolog_function(&ctx, &prolog_key);
6396 parts[0] = ctx.main_fn;
6397 }
6398
6399 si_get_ps_epilog_key(shader, &epilog_key);
6400 si_build_ps_epilog_function(&ctx, &epilog_key);
6401 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6402
6403 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6404 need_prolog ? 1 : 0, 0);
6405 }
6406
6407 si_llvm_optimize_module(&ctx);
6408
6409 /* Post-optimization transformations and analysis. */
6410 si_optimize_vs_outputs(&ctx);
6411
6412 if ((debug && debug->debug_message) ||
6413 r600_can_dump_shader(&sscreen->b, ctx.type))
6414 si_count_scratch_private_memory(&ctx);
6415
6416 /* Compile to bytecode. */
6417 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6418 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6419 si_llvm_dispose(&ctx);
6420 if (r) {
6421 fprintf(stderr, "LLVM failed to compile shader\n");
6422 return r;
6423 }
6424
6425 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6426 * LLVM 3.9svn has this bug.
6427 */
6428 if (sel->type == PIPE_SHADER_COMPUTE) {
6429 unsigned wave_size = 64;
6430 unsigned max_vgprs = 256;
6431 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6432 unsigned max_sgprs_per_wave = 128;
6433 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6434 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6435 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6436
6437 max_vgprs = max_vgprs / min_waves_per_simd;
6438 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6439
6440 if (shader->config.num_sgprs > max_sgprs ||
6441 shader->config.num_vgprs > max_vgprs) {
6442 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6443 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6444 shader->config.num_sgprs, shader->config.num_vgprs,
6445 max_sgprs, max_vgprs);
6446
6447 /* Just terminate the process, because dependent
6448 * shaders can hang due to bad input data, but use
6449 * the env var to allow shader-db to work.
6450 */
6451 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6452 abort();
6453 }
6454 }
6455
6456 /* Add the scratch offset to input SGPRs. */
6457 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6458 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6459
6460 /* Calculate the number of fragment input VGPRs. */
6461 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6462 shader->info.num_input_vgprs = 0;
6463 shader->info.face_vgpr_index = -1;
6464
6465 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6466 shader->info.num_input_vgprs += 2;
6467 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6468 shader->info.num_input_vgprs += 2;
6469 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6470 shader->info.num_input_vgprs += 2;
6471 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6472 shader->info.num_input_vgprs += 3;
6473 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6474 shader->info.num_input_vgprs += 2;
6475 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6476 shader->info.num_input_vgprs += 2;
6477 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6478 shader->info.num_input_vgprs += 2;
6479 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6480 shader->info.num_input_vgprs += 1;
6481 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6482 shader->info.num_input_vgprs += 1;
6483 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6484 shader->info.num_input_vgprs += 1;
6485 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6486 shader->info.num_input_vgprs += 1;
6487 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6488 shader->info.num_input_vgprs += 1;
6489 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6490 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6491 shader->info.num_input_vgprs += 1;
6492 }
6493 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6494 shader->info.num_input_vgprs += 1;
6495 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6496 shader->info.num_input_vgprs += 1;
6497 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6498 shader->info.num_input_vgprs += 1;
6499 }
6500
6501 return 0;
6502 }
6503
6504 /**
6505 * Create, compile and return a shader part (prolog or epilog).
6506 *
6507 * \param sscreen screen
6508 * \param list list of shader parts of the same category
6509 * \param type shader type
6510 * \param key shader part key
6511 * \param prolog whether the part being requested is a prolog
6512 * \param tm LLVM target machine
6513 * \param debug debug callback
6514 * \param build the callback responsible for building the main function
6515 * \return non-NULL on success
6516 */
6517 static struct si_shader_part *
6518 si_get_shader_part(struct si_screen *sscreen,
6519 struct si_shader_part **list,
6520 enum pipe_shader_type type,
6521 bool prolog,
6522 union si_shader_part_key *key,
6523 LLVMTargetMachineRef tm,
6524 struct pipe_debug_callback *debug,
6525 void (*build)(struct si_shader_context *,
6526 union si_shader_part_key *),
6527 const char *name)
6528 {
6529 struct si_shader_part *result;
6530
6531 mtx_lock(&sscreen->shader_parts_mutex);
6532
6533 /* Find existing. */
6534 for (result = *list; result; result = result->next) {
6535 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6536 mtx_unlock(&sscreen->shader_parts_mutex);
6537 return result;
6538 }
6539 }
6540
6541 /* Compile a new one. */
6542 result = CALLOC_STRUCT(si_shader_part);
6543 result->key = *key;
6544
6545 struct si_shader shader = {};
6546 struct si_shader_context ctx;
6547 struct gallivm_state *gallivm = &ctx.gallivm;
6548
6549 si_init_shader_ctx(&ctx, sscreen, tm);
6550 ctx.shader = &shader;
6551 ctx.type = type;
6552
6553 switch (type) {
6554 case PIPE_SHADER_VERTEX:
6555 break;
6556 case PIPE_SHADER_TESS_CTRL:
6557 assert(!prolog);
6558 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6559 break;
6560 case PIPE_SHADER_GEOMETRY:
6561 assert(prolog);
6562 break;
6563 case PIPE_SHADER_FRAGMENT:
6564 if (prolog)
6565 shader.key.part.ps.prolog = key->ps_prolog.states;
6566 else
6567 shader.key.part.ps.epilog = key->ps_epilog.states;
6568 break;
6569 default:
6570 unreachable("bad shader part");
6571 }
6572
6573 build(&ctx, key);
6574
6575 /* Compile. */
6576 si_llvm_optimize_module(&ctx);
6577
6578 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6579 gallivm->module, debug, ctx.type, name)) {
6580 FREE(result);
6581 result = NULL;
6582 goto out;
6583 }
6584
6585 result->next = *list;
6586 *list = result;
6587
6588 out:
6589 si_llvm_dispose(&ctx);
6590 mtx_unlock(&sscreen->shader_parts_mutex);
6591 return result;
6592 }
6593
6594 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6595 {
6596 struct gallivm_state *gallivm = &ctx->gallivm;
6597 LLVMValueRef ptr[2], list;
6598
6599 /* Get the pointer to rw buffers. */
6600 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6601 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6602 list = lp_build_gather_values(gallivm, ptr, 2);
6603 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6604 list = LLVMBuildIntToPtr(gallivm->builder, list,
6605 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6606 return list;
6607 }
6608
6609 /**
6610 * Build the vertex shader prolog function.
6611 *
6612 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6613 * All inputs are returned unmodified. The vertex load indices are
6614 * stored after them, which will be used by the API VS for fetching inputs.
6615 *
6616 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6617 * input_v0,
6618 * input_v1,
6619 * input_v2,
6620 * input_v3,
6621 * (VertexID + BaseVertex),
6622 * (InstanceID + StartInstance),
6623 * (InstanceID / 2 + StartInstance)
6624 */
6625 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6626 union si_shader_part_key *key)
6627 {
6628 struct gallivm_state *gallivm = &ctx->gallivm;
6629 struct si_function_info fninfo;
6630 LLVMTypeRef *returns;
6631 LLVMValueRef ret, func;
6632 int num_returns, i;
6633 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
6634 key->vs_prolog.num_merged_next_stage_vgprs;
6635 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6636 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6637 num_input_vgprs;
6638 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6639
6640 ctx->param_vertex_id = first_vs_vgpr;
6641 ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
6642
6643 si_init_function_info(&fninfo);
6644
6645 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6646 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
6647 sizeof(LLVMTypeRef));
6648 num_returns = 0;
6649
6650 /* Declare input and output SGPRs. */
6651 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6652 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6653 returns[num_returns++] = ctx->i32;
6654 }
6655
6656 /* Preloaded VGPRs (outputs must be floats) */
6657 for (i = 0; i < num_input_vgprs; i++) {
6658 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6659 returns[num_returns++] = ctx->f32;
6660 }
6661
6662 /* Vertex load indices. */
6663 for (i = 0; i <= key->vs_prolog.last_input; i++)
6664 returns[num_returns++] = ctx->f32;
6665
6666 /* Create the function. */
6667 si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
6668 func = ctx->main_fn;
6669
6670 if (key->vs_prolog.num_merged_next_stage_vgprs &&
6671 !key->vs_prolog.is_monolithic)
6672 si_init_exec_from_input(ctx, 3, 0);
6673
6674 /* Copy inputs to outputs. This should be no-op, as the registers match,
6675 * but it will prevent the compiler from overwriting them unintentionally.
6676 */
6677 ret = ctx->return_value;
6678 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6679 LLVMValueRef p = LLVMGetParam(func, i);
6680 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6681 }
6682 for (; i < fninfo.num_params; i++) {
6683 LLVMValueRef p = LLVMGetParam(func, i);
6684 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
6685 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6686 }
6687
6688 /* Compute vertex load indices from instance divisors. */
6689 LLVMValueRef instance_divisor_constbuf = NULL;
6690
6691 if (key->vs_prolog.states.instance_divisor_is_fetched) {
6692 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6693 LLVMValueRef buf_index =
6694 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6695 instance_divisor_constbuf =
6696 ac_build_indexed_load_const(&ctx->ac, list, buf_index);
6697 }
6698
6699 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6700 bool divisor_is_one =
6701 key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6702 bool divisor_is_fetched =
6703 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6704 LLVMValueRef index;
6705
6706 if (divisor_is_one || divisor_is_fetched) {
6707 LLVMValueRef divisor = ctx->i32_1;
6708
6709 if (divisor_is_fetched) {
6710 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
6711 LLVMConstInt(ctx->i32, i * 4, 0));
6712 divisor = LLVMBuildBitCast(gallivm->builder, divisor,
6713 ctx->i32, "");
6714 }
6715
6716 /* InstanceID / Divisor + StartInstance */
6717 index = get_instance_index_for_fetch(ctx,
6718 user_sgpr_base +
6719 SI_SGPR_START_INSTANCE,
6720 divisor);
6721 } else {
6722 /* VertexID + BaseVertex */
6723 index = LLVMBuildAdd(gallivm->builder,
6724 LLVMGetParam(func, ctx->param_vertex_id),
6725 LLVMGetParam(func, user_sgpr_base +
6726 SI_SGPR_BASE_VERTEX), "");
6727 }
6728
6729 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
6730 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6731 fninfo.num_params + i, "");
6732 }
6733
6734 si_llvm_build_ret(ctx, ret);
6735 }
6736
6737 static bool si_get_vs_prolog(struct si_screen *sscreen,
6738 LLVMTargetMachineRef tm,
6739 struct si_shader *shader,
6740 struct pipe_debug_callback *debug,
6741 struct si_shader *main_part,
6742 const struct si_vs_prolog_bits *key)
6743 {
6744 struct si_shader_selector *vs = main_part->selector;
6745
6746 /* The prolog is a no-op if there are no inputs. */
6747 if (!vs->vs_needs_prolog)
6748 return true;
6749
6750 /* Get the prolog. */
6751 union si_shader_part_key prolog_key;
6752 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6753 key, shader, &prolog_key);
6754
6755 shader->prolog =
6756 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6757 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6758 debug, si_build_vs_prolog_function,
6759 "Vertex Shader Prolog");
6760 return shader->prolog != NULL;
6761 }
6762
6763 /**
6764 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6765 */
6766 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6767 LLVMTargetMachineRef tm,
6768 struct si_shader *shader,
6769 struct pipe_debug_callback *debug)
6770 {
6771 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6772 &shader->key.part.vs.prolog);
6773 }
6774
6775 /**
6776 * Compile the TCS epilog function. This writes tesselation factors to memory
6777 * based on the output primitive type of the tesselator (determined by TES).
6778 */
6779 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
6780 union si_shader_part_key *key)
6781 {
6782 struct gallivm_state *gallivm = &ctx->gallivm;
6783 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6784 struct si_function_info fninfo;
6785 LLVMValueRef func;
6786
6787 si_init_function_info(&fninfo);
6788
6789 if (ctx->screen->b.chip_class >= GFX9) {
6790 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6791 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6792 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
6793 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6794 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6795 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6796 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6797 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6798 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6799 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6800 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6801 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6802 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6803 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6804 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6805 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6806 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6807 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6808 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6809 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6810 } else {
6811 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6812 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6813 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6814 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6815 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6816 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6817 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6818 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6819 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6820 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6821 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6822 }
6823
6824 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
6825 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
6826 unsigned tess_factors_idx =
6827 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
6828 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
6829 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
6830
6831 /* Create the function. */
6832 si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
6833 ctx->screen->b.chip_class >= CIK ? 128 : 64);
6834 declare_lds_as_pointer(ctx);
6835 func = ctx->main_fn;
6836
6837 si_write_tess_factors(bld_base,
6838 LLVMGetParam(func, tess_factors_idx),
6839 LLVMGetParam(func, tess_factors_idx + 1),
6840 LLVMGetParam(func, tess_factors_idx + 2));
6841
6842 LLVMBuildRetVoid(gallivm->builder);
6843 }
6844
6845 /**
6846 * Select and compile (or reuse) TCS parts (epilog).
6847 */
6848 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6849 LLVMTargetMachineRef tm,
6850 struct si_shader *shader,
6851 struct pipe_debug_callback *debug)
6852 {
6853 if (sscreen->b.chip_class >= GFX9) {
6854 struct si_shader *ls_main_part =
6855 shader->key.part.tcs.ls->main_shader_part_ls;
6856
6857 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
6858 &shader->key.part.tcs.ls_prolog))
6859 return false;
6860
6861 shader->previous_stage = ls_main_part;
6862 }
6863
6864 /* Get the epilog. */
6865 union si_shader_part_key epilog_key;
6866 memset(&epilog_key, 0, sizeof(epilog_key));
6867 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6868
6869 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6870 PIPE_SHADER_TESS_CTRL, false,
6871 &epilog_key, tm, debug,
6872 si_build_tcs_epilog_function,
6873 "Tessellation Control Shader Epilog");
6874 return shader->epilog != NULL;
6875 }
6876
6877 /**
6878 * Select and compile (or reuse) GS parts (prolog).
6879 */
6880 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
6881 LLVMTargetMachineRef tm,
6882 struct si_shader *shader,
6883 struct pipe_debug_callback *debug)
6884 {
6885 if (sscreen->b.chip_class >= GFX9) {
6886 struct si_shader *es_main_part =
6887 shader->key.part.gs.es->main_shader_part_es;
6888
6889 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
6890 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
6891 &shader->key.part.gs.vs_prolog))
6892 return false;
6893
6894 shader->previous_stage = es_main_part;
6895 }
6896
6897 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
6898 return true;
6899
6900 union si_shader_part_key prolog_key;
6901 memset(&prolog_key, 0, sizeof(prolog_key));
6902 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6903
6904 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
6905 PIPE_SHADER_GEOMETRY, true,
6906 &prolog_key, tm, debug,
6907 si_build_gs_prolog_function,
6908 "Geometry Shader Prolog");
6909 return shader->prolog2 != NULL;
6910 }
6911
6912 /**
6913 * Build the pixel shader prolog function. This handles:
6914 * - two-side color selection and interpolation
6915 * - overriding interpolation parameters for the API PS
6916 * - polygon stippling
6917 *
6918 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
6919 * overriden by other states. (e.g. per-sample interpolation)
6920 * Interpolated colors are stored after the preloaded VGPRs.
6921 */
6922 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
6923 union si_shader_part_key *key)
6924 {
6925 struct gallivm_state *gallivm = &ctx->gallivm;
6926 struct si_function_info fninfo;
6927 LLVMValueRef ret, func;
6928 int num_returns, i, num_color_channels;
6929
6930 assert(si_need_ps_prolog(key));
6931
6932 si_init_function_info(&fninfo);
6933
6934 /* Declare inputs. */
6935 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
6936 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6937
6938 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
6939 add_arg(&fninfo, ARG_VGPR, ctx->f32);
6940
6941 /* Declare outputs (same as inputs + add colors if needed) */
6942 num_returns = fninfo.num_params;
6943 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
6944 for (i = 0; i < num_color_channels; i++)
6945 fninfo.types[num_returns++] = ctx->f32;
6946
6947 /* Create the function. */
6948 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
6949 &fninfo, 0);
6950 func = ctx->main_fn;
6951
6952 /* Copy inputs to outputs. This should be no-op, as the registers match,
6953 * but it will prevent the compiler from overwriting them unintentionally.
6954 */
6955 ret = ctx->return_value;
6956 for (i = 0; i < fninfo.num_params; i++) {
6957 LLVMValueRef p = LLVMGetParam(func, i);
6958 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6959 }
6960
6961 /* Polygon stippling. */
6962 if (key->ps_prolog.states.poly_stipple) {
6963 /* POS_FIXED_PT is always last. */
6964 unsigned pos = key->ps_prolog.num_input_sgprs +
6965 key->ps_prolog.num_input_vgprs - 1;
6966 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6967
6968 si_llvm_emit_polygon_stipple(ctx, list, pos);
6969 }
6970
6971 if (key->ps_prolog.states.bc_optimize_for_persp ||
6972 key->ps_prolog.states.bc_optimize_for_linear) {
6973 unsigned i, base = key->ps_prolog.num_input_sgprs;
6974 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
6975
6976 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
6977 * The hw doesn't compute CENTROID if the whole wave only
6978 * contains fully-covered quads.
6979 *
6980 * PRIM_MASK is after user SGPRs.
6981 */
6982 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
6983 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
6984 LLVMConstInt(ctx->i32, 31, 0), "");
6985 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
6986 ctx->i1, "");
6987
6988 if (key->ps_prolog.states.bc_optimize_for_persp) {
6989 /* Read PERSP_CENTER. */
6990 for (i = 0; i < 2; i++)
6991 center[i] = LLVMGetParam(func, base + 2 + i);
6992 /* Read PERSP_CENTROID. */
6993 for (i = 0; i < 2; i++)
6994 centroid[i] = LLVMGetParam(func, base + 4 + i);
6995 /* Select PERSP_CENTROID. */
6996 for (i = 0; i < 2; i++) {
6997 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
6998 center[i], centroid[i], "");
6999 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7000 tmp, base + 4 + i, "");
7001 }
7002 }
7003 if (key->ps_prolog.states.bc_optimize_for_linear) {
7004 /* Read LINEAR_CENTER. */
7005 for (i = 0; i < 2; i++)
7006 center[i] = LLVMGetParam(func, base + 8 + i);
7007 /* Read LINEAR_CENTROID. */
7008 for (i = 0; i < 2; i++)
7009 centroid[i] = LLVMGetParam(func, base + 10 + i);
7010 /* Select LINEAR_CENTROID. */
7011 for (i = 0; i < 2; i++) {
7012 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7013 center[i], centroid[i], "");
7014 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7015 tmp, base + 10 + i, "");
7016 }
7017 }
7018 }
7019
7020 /* Force per-sample interpolation. */
7021 if (key->ps_prolog.states.force_persp_sample_interp) {
7022 unsigned i, base = key->ps_prolog.num_input_sgprs;
7023 LLVMValueRef persp_sample[2];
7024
7025 /* Read PERSP_SAMPLE. */
7026 for (i = 0; i < 2; i++)
7027 persp_sample[i] = LLVMGetParam(func, base + i);
7028 /* Overwrite PERSP_CENTER. */
7029 for (i = 0; i < 2; i++)
7030 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7031 persp_sample[i], base + 2 + i, "");
7032 /* Overwrite PERSP_CENTROID. */
7033 for (i = 0; i < 2; i++)
7034 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7035 persp_sample[i], base + 4 + i, "");
7036 }
7037 if (key->ps_prolog.states.force_linear_sample_interp) {
7038 unsigned i, base = key->ps_prolog.num_input_sgprs;
7039 LLVMValueRef linear_sample[2];
7040
7041 /* Read LINEAR_SAMPLE. */
7042 for (i = 0; i < 2; i++)
7043 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7044 /* Overwrite LINEAR_CENTER. */
7045 for (i = 0; i < 2; i++)
7046 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7047 linear_sample[i], base + 8 + i, "");
7048 /* Overwrite LINEAR_CENTROID. */
7049 for (i = 0; i < 2; i++)
7050 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7051 linear_sample[i], base + 10 + i, "");
7052 }
7053
7054 /* Force center interpolation. */
7055 if (key->ps_prolog.states.force_persp_center_interp) {
7056 unsigned i, base = key->ps_prolog.num_input_sgprs;
7057 LLVMValueRef persp_center[2];
7058
7059 /* Read PERSP_CENTER. */
7060 for (i = 0; i < 2; i++)
7061 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7062 /* Overwrite PERSP_SAMPLE. */
7063 for (i = 0; i < 2; i++)
7064 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7065 persp_center[i], base + i, "");
7066 /* Overwrite PERSP_CENTROID. */
7067 for (i = 0; i < 2; i++)
7068 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7069 persp_center[i], base + 4 + i, "");
7070 }
7071 if (key->ps_prolog.states.force_linear_center_interp) {
7072 unsigned i, base = key->ps_prolog.num_input_sgprs;
7073 LLVMValueRef linear_center[2];
7074
7075 /* Read LINEAR_CENTER. */
7076 for (i = 0; i < 2; i++)
7077 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7078 /* Overwrite LINEAR_SAMPLE. */
7079 for (i = 0; i < 2; i++)
7080 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7081 linear_center[i], base + 6 + i, "");
7082 /* Overwrite LINEAR_CENTROID. */
7083 for (i = 0; i < 2; i++)
7084 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7085 linear_center[i], base + 10 + i, "");
7086 }
7087
7088 /* Interpolate colors. */
7089 unsigned color_out_idx = 0;
7090 for (i = 0; i < 2; i++) {
7091 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7092 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7093 key->ps_prolog.face_vgpr_index;
7094 LLVMValueRef interp[2], color[4];
7095 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7096
7097 if (!writemask)
7098 continue;
7099
7100 /* If the interpolation qualifier is not CONSTANT (-1). */
7101 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7102 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7103 key->ps_prolog.color_interp_vgpr_index[i];
7104
7105 /* Get the (i,j) updated by bc_optimize handling. */
7106 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7107 interp_vgpr, "");
7108 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7109 interp_vgpr + 1, "");
7110 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7111 }
7112
7113 /* Use the absolute location of the input. */
7114 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7115
7116 if (key->ps_prolog.states.color_two_side) {
7117 face = LLVMGetParam(func, face_vgpr);
7118 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
7119 }
7120
7121 interp_fs_input(ctx,
7122 key->ps_prolog.color_attr_index[i],
7123 TGSI_SEMANTIC_COLOR, i,
7124 key->ps_prolog.num_interp_inputs,
7125 key->ps_prolog.colors_read, interp_ij,
7126 prim_mask, face, color);
7127
7128 while (writemask) {
7129 unsigned chan = u_bit_scan(&writemask);
7130 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7131 fninfo.num_params + color_out_idx++, "");
7132 }
7133 }
7134
7135 /* Tell LLVM to insert WQM instruction sequence when needed. */
7136 if (key->ps_prolog.wqm) {
7137 LLVMAddTargetDependentFunctionAttr(func,
7138 "amdgpu-ps-wqm-outputs", "");
7139 }
7140
7141 si_llvm_build_ret(ctx, ret);
7142 }
7143
7144 /**
7145 * Build the pixel shader epilog function. This handles everything that must be
7146 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7147 */
7148 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7149 union si_shader_part_key *key)
7150 {
7151 struct gallivm_state *gallivm = &ctx->gallivm;
7152 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7153 struct si_function_info fninfo;
7154 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7155 int i;
7156 struct si_ps_exports exp = {};
7157
7158 si_init_function_info(&fninfo);
7159
7160 /* Declare input SGPRs. */
7161 ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7162 ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7163 ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7164 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7165
7166 /* Declare input VGPRs. */
7167 unsigned required_num_params =
7168 fninfo.num_sgpr_params +
7169 util_bitcount(key->ps_epilog.colors_written) * 4 +
7170 key->ps_epilog.writes_z +
7171 key->ps_epilog.writes_stencil +
7172 key->ps_epilog.writes_samplemask;
7173
7174 required_num_params = MAX2(required_num_params,
7175 fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7176
7177 while (fninfo.num_params < required_num_params)
7178 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7179
7180 /* Create the function. */
7181 si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7182 /* Disable elimination of unused inputs. */
7183 si_llvm_add_attribute(ctx->main_fn,
7184 "InitialPSInputAddr", 0xffffff);
7185
7186 /* Process colors. */
7187 unsigned vgpr = fninfo.num_sgpr_params;
7188 unsigned colors_written = key->ps_epilog.colors_written;
7189 int last_color_export = -1;
7190
7191 /* Find the last color export. */
7192 if (!key->ps_epilog.writes_z &&
7193 !key->ps_epilog.writes_stencil &&
7194 !key->ps_epilog.writes_samplemask) {
7195 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7196
7197 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7198 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7199 /* Just set this if any of the colorbuffers are enabled. */
7200 if (spi_format &
7201 ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7202 last_color_export = 0;
7203 } else {
7204 for (i = 0; i < 8; i++)
7205 if (colors_written & (1 << i) &&
7206 (spi_format >> (i * 4)) & 0xf)
7207 last_color_export = i;
7208 }
7209 }
7210
7211 while (colors_written) {
7212 LLVMValueRef color[4];
7213 int mrt = u_bit_scan(&colors_written);
7214
7215 for (i = 0; i < 4; i++)
7216 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7217
7218 si_export_mrt_color(bld_base, color, mrt,
7219 fninfo.num_params - 1,
7220 mrt == last_color_export, &exp);
7221 }
7222
7223 /* Process depth, stencil, samplemask. */
7224 if (key->ps_epilog.writes_z)
7225 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7226 if (key->ps_epilog.writes_stencil)
7227 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7228 if (key->ps_epilog.writes_samplemask)
7229 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7230
7231 if (depth || stencil || samplemask)
7232 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7233 else if (last_color_export == -1)
7234 si_export_null(bld_base);
7235
7236 if (exp.num)
7237 si_emit_ps_exports(ctx, &exp);
7238
7239 /* Compile. */
7240 LLVMBuildRetVoid(gallivm->builder);
7241 }
7242
7243 /**
7244 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7245 */
7246 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7247 LLVMTargetMachineRef tm,
7248 struct si_shader *shader,
7249 struct pipe_debug_callback *debug)
7250 {
7251 union si_shader_part_key prolog_key;
7252 union si_shader_part_key epilog_key;
7253
7254 /* Get the prolog. */
7255 si_get_ps_prolog_key(shader, &prolog_key, true);
7256
7257 /* The prolog is a no-op if these aren't set. */
7258 if (si_need_ps_prolog(&prolog_key)) {
7259 shader->prolog =
7260 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7261 PIPE_SHADER_FRAGMENT, true,
7262 &prolog_key, tm, debug,
7263 si_build_ps_prolog_function,
7264 "Fragment Shader Prolog");
7265 if (!shader->prolog)
7266 return false;
7267 }
7268
7269 /* Get the epilog. */
7270 si_get_ps_epilog_key(shader, &epilog_key);
7271
7272 shader->epilog =
7273 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7274 PIPE_SHADER_FRAGMENT, false,
7275 &epilog_key, tm, debug,
7276 si_build_ps_epilog_function,
7277 "Fragment Shader Epilog");
7278 if (!shader->epilog)
7279 return false;
7280
7281 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7282 if (shader->key.part.ps.prolog.poly_stipple) {
7283 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7284 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7285 }
7286
7287 /* Set up the enable bits for per-sample shading if needed. */
7288 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7289 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7290 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7291 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7292 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7293 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7294 }
7295 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7296 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7297 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7298 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7299 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7300 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7301 }
7302 if (shader->key.part.ps.prolog.force_persp_center_interp &&
7303 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7304 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7305 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7306 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7307 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7308 }
7309 if (shader->key.part.ps.prolog.force_linear_center_interp &&
7310 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7311 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7312 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7313 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7314 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7315 }
7316
7317 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7318 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7319 !(shader->config.spi_ps_input_ena & 0xf)) {
7320 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7321 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7322 }
7323
7324 /* At least one pair of interpolation weights must be enabled. */
7325 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7326 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7327 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7328 }
7329
7330 /* The sample mask input is always enabled, because the API shader always
7331 * passes it through to the epilog. Disable it here if it's unused.
7332 */
7333 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7334 !shader->selector->info.reads_samplemask)
7335 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7336
7337 return true;
7338 }
7339
7340 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7341 unsigned *lds_size)
7342 {
7343 /* SPI barrier management bug:
7344 * Make sure we have at least 4k of LDS in use to avoid the bug.
7345 * It applies to workgroup sizes of more than one wavefront.
7346 */
7347 if (sscreen->b.family == CHIP_BONAIRE ||
7348 sscreen->b.family == CHIP_KABINI ||
7349 sscreen->b.family == CHIP_MULLINS)
7350 *lds_size = MAX2(*lds_size, 8);
7351 }
7352
7353 static void si_fix_resource_usage(struct si_screen *sscreen,
7354 struct si_shader *shader)
7355 {
7356 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7357
7358 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7359
7360 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7361 si_get_max_workgroup_size(shader) > 64) {
7362 si_multiwave_lds_size_workaround(sscreen,
7363 &shader->config.lds_size);
7364 }
7365 }
7366
7367 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7368 struct si_shader *shader,
7369 struct pipe_debug_callback *debug)
7370 {
7371 struct si_shader_selector *sel = shader->selector;
7372 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7373 int r;
7374
7375 /* LS, ES, VS are compiled on demand if the main part hasn't been
7376 * compiled for that stage.
7377 *
7378 * Vertex shaders are compiled on demand when a vertex fetch
7379 * workaround must be applied.
7380 */
7381 if (shader->is_monolithic) {
7382 /* Monolithic shader (compiled as a whole, has many variants,
7383 * may take a long time to compile).
7384 */
7385 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7386 if (r)
7387 return r;
7388 } else {
7389 /* The shader consists of several parts:
7390 *
7391 * - the middle part is the user shader, it has 1 variant only
7392 * and it was compiled during the creation of the shader
7393 * selector
7394 * - the prolog part is inserted at the beginning
7395 * - the epilog part is inserted at the end
7396 *
7397 * The prolog and epilog have many (but simple) variants.
7398 *
7399 * Starting with gfx9, geometry and tessellation control
7400 * shaders also contain the prolog and user shader parts of
7401 * the previous shader stage.
7402 */
7403
7404 if (!mainp)
7405 return -1;
7406
7407 /* Copy the compiled TGSI shader data over. */
7408 shader->is_binary_shared = true;
7409 shader->binary = mainp->binary;
7410 shader->config = mainp->config;
7411 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7412 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7413 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7414 memcpy(shader->info.vs_output_param_offset,
7415 mainp->info.vs_output_param_offset,
7416 sizeof(mainp->info.vs_output_param_offset));
7417 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7418 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7419 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7420
7421 /* Select prologs and/or epilogs. */
7422 switch (sel->type) {
7423 case PIPE_SHADER_VERTEX:
7424 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7425 return -1;
7426 break;
7427 case PIPE_SHADER_TESS_CTRL:
7428 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7429 return -1;
7430 break;
7431 case PIPE_SHADER_TESS_EVAL:
7432 break;
7433 case PIPE_SHADER_GEOMETRY:
7434 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
7435 return -1;
7436 break;
7437 case PIPE_SHADER_FRAGMENT:
7438 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7439 return -1;
7440
7441 /* Make sure we have at least as many VGPRs as there
7442 * are allocated inputs.
7443 */
7444 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7445 shader->info.num_input_vgprs);
7446 break;
7447 }
7448
7449 /* Update SGPR and VGPR counts. */
7450 if (shader->prolog) {
7451 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7452 shader->prolog->config.num_sgprs);
7453 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7454 shader->prolog->config.num_vgprs);
7455 }
7456 if (shader->previous_stage) {
7457 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7458 shader->previous_stage->config.num_sgprs);
7459 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7460 shader->previous_stage->config.num_vgprs);
7461 shader->config.spilled_sgprs =
7462 MAX2(shader->config.spilled_sgprs,
7463 shader->previous_stage->config.spilled_sgprs);
7464 shader->config.spilled_vgprs =
7465 MAX2(shader->config.spilled_vgprs,
7466 shader->previous_stage->config.spilled_vgprs);
7467 shader->config.private_mem_vgprs =
7468 MAX2(shader->config.private_mem_vgprs,
7469 shader->previous_stage->config.private_mem_vgprs);
7470 shader->config.scratch_bytes_per_wave =
7471 MAX2(shader->config.scratch_bytes_per_wave,
7472 shader->previous_stage->config.scratch_bytes_per_wave);
7473 shader->info.uses_instanceid |=
7474 shader->previous_stage->info.uses_instanceid;
7475 }
7476 if (shader->prolog2) {
7477 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7478 shader->prolog2->config.num_sgprs);
7479 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7480 shader->prolog2->config.num_vgprs);
7481 }
7482 if (shader->epilog) {
7483 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7484 shader->epilog->config.num_sgprs);
7485 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7486 shader->epilog->config.num_vgprs);
7487 }
7488 }
7489
7490 si_fix_resource_usage(sscreen, shader);
7491 si_shader_dump(sscreen, shader, debug, sel->info.processor,
7492 stderr, true);
7493
7494 /* Upload. */
7495 r = si_shader_binary_upload(sscreen, shader);
7496 if (r) {
7497 fprintf(stderr, "LLVM failed to upload shader\n");
7498 return r;
7499 }
7500
7501 return 0;
7502 }
7503
7504 void si_shader_destroy(struct si_shader *shader)
7505 {
7506 if (shader->scratch_bo)
7507 r600_resource_reference(&shader->scratch_bo, NULL);
7508
7509 r600_resource_reference(&shader->bo, NULL);
7510
7511 if (!shader->is_binary_shared)
7512 radeon_shader_binary_clean(&shader->binary);
7513
7514 free(shader->shader_log);
7515 }