radeonsi: dump NIR instead of TGSI when appropriate
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49 #include "compiler/nir/nir.h"
50
/* Symbol name strings for scratch resource dwords 0 and 1.
 * NOTE(review): their users are outside this excerpt - presumably they are
 * matched against symbol names in the compiled binary; confirm. */
static const char *scratch_rsrc_dword0_symbol = "SCRATCH_RSRC_DWORD0";
static const char *scratch_rsrc_dword1_symbol = "SCRATCH_RSRC_DWORD1";
56
57 struct si_shader_output_values
58 {
59 LLVMValueRef values[4];
60 unsigned semantic_name;
61 unsigned semantic_index;
62 ubyte vertex_stream[4];
63 };
64
65 /**
66 * Used to collect types and other info about arguments of the LLVM function
67 * before the function is created.
68 */
69 struct si_function_info {
70 LLVMTypeRef types[100];
71 LLVMValueRef *assign[100];
72 unsigned num_sgpr_params;
73 unsigned num_params;
74 };
75
/* Register file an LLVM function argument is placed in. */
enum si_arg_regfile {
	ARG_SGPR, /* scalar register argument */
	ARG_VGPR  /* vector register argument */
};
80
81 static void si_init_shader_ctx(struct si_shader_context *ctx,
82 struct si_screen *sscreen,
83 LLVMTargetMachineRef tm);
84
85 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
86 struct lp_build_tgsi_context *bld_base,
87 struct lp_build_emit_data *emit_data);
88
89 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
90 FILE *f);
91
92 static unsigned llvm_get_type_size(LLVMTypeRef type);
93
94 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
95 union si_shader_part_key *key);
96 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
97 union si_shader_part_key *key);
98 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
99 union si_shader_part_key *key);
100 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
101 union si_shader_part_key *key);
102
/* The sample mask input of the PS epilog should ideally be passed as v13,
 * its usual location, so that the shader does not need an extra v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
107
/* Numeric address space identifiers.
 * NOTE(review): presumably LLVM address-space numbers - confirm. */
enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};
112
113 static bool is_merged_shader(struct si_shader *shader)
114 {
115 if (shader->selector->screen->b.chip_class <= VI)
116 return false;
117
118 return shader->key.as_ls ||
119 shader->key.as_es ||
120 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
121 shader->selector->type == PIPE_SHADER_GEOMETRY;
122 }
123
124 static void si_init_function_info(struct si_function_info *fninfo)
125 {
126 fninfo->num_params = 0;
127 fninfo->num_sgpr_params = 0;
128 }
129
130 static unsigned add_arg_assign(struct si_function_info *fninfo,
131 enum si_arg_regfile regfile, LLVMTypeRef type,
132 LLVMValueRef *assign)
133 {
134 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
135
136 unsigned idx = fninfo->num_params++;
137 assert(idx < ARRAY_SIZE(fninfo->types));
138
139 if (regfile == ARG_SGPR)
140 fninfo->num_sgpr_params = fninfo->num_params;
141
142 fninfo->types[idx] = type;
143 fninfo->assign[idx] = assign;
144 return idx;
145 }
146
147 static unsigned add_arg(struct si_function_info *fninfo,
148 enum si_arg_regfile regfile, LLVMTypeRef type)
149 {
150 return add_arg_assign(fninfo, regfile, type, NULL);
151 }
152
153 static void add_arg_checked(struct si_function_info *fninfo,
154 enum si_arg_regfile regfile, LLVMTypeRef type,
155 unsigned idx)
156 {
157 MAYBE_UNUSED unsigned actual = add_arg(fninfo, regfile, type);
158 assert(actual == idx);
159 }
160
161 /**
162 * Returns a unique index for a per-patch semantic name and index. The index
163 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
164 * can be calculated.
165 */
166 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
167 {
168 switch (semantic_name) {
169 case TGSI_SEMANTIC_TESSOUTER:
170 return 0;
171 case TGSI_SEMANTIC_TESSINNER:
172 return 1;
173 case TGSI_SEMANTIC_PATCH:
174 assert(index < 30);
175 return 2 + index;
176
177 default:
178 assert(!"invalid semantic name");
179 return 0;
180 }
181 }
182
183 /**
184 * Returns a unique index for a semantic name and index. The index must be
185 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
186 * calculated.
187 */
188 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
189 {
190 switch (semantic_name) {
191 case TGSI_SEMANTIC_POSITION:
192 return 0;
193 case TGSI_SEMANTIC_GENERIC:
194 /* Since some shader stages use the the highest used IO index
195 * to determine the size to allocate for inputs/outputs
196 * (in LDS, tess and GS rings). GENERIC should be placed right
197 * after POSITION to make that size as small as possible.
198 */
199 if (index < SI_MAX_IO_GENERIC)
200 return 1 + index;
201
202 assert(!"invalid generic index");
203 return 0;
204 case TGSI_SEMANTIC_PSIZE:
205 return SI_MAX_IO_GENERIC + 1;
206 case TGSI_SEMANTIC_CLIPDIST:
207 assert(index <= 1);
208 return SI_MAX_IO_GENERIC + 2 + index;
209 case TGSI_SEMANTIC_FOG:
210 return SI_MAX_IO_GENERIC + 4;
211 case TGSI_SEMANTIC_LAYER:
212 return SI_MAX_IO_GENERIC + 5;
213 case TGSI_SEMANTIC_VIEWPORT_INDEX:
214 return SI_MAX_IO_GENERIC + 6;
215 case TGSI_SEMANTIC_PRIMID:
216 return SI_MAX_IO_GENERIC + 7;
217 case TGSI_SEMANTIC_COLOR: /* these alias */
218 case TGSI_SEMANTIC_BCOLOR:
219 assert(index < 2);
220 return SI_MAX_IO_GENERIC + 8 + index;
221 case TGSI_SEMANTIC_TEXCOORD:
222 assert(index < 8);
223 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
224 return SI_MAX_IO_GENERIC + 10 + index;
225 default:
226 assert(!"invalid semantic name");
227 return 0;
228 }
229 }
230
231 /**
232 * Helper function that builds an LLVM IR PHI node and immediately adds
233 * incoming edges.
234 */
235 static LLVMValueRef
236 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
237 unsigned count_incoming, LLVMValueRef *values,
238 LLVMBasicBlockRef *blocks)
239 {
240 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
241 LLVMAddIncoming(phi, values, blocks, count_incoming);
242 return phi;
243 }
244
245 /**
246 * Get the value of a shader input parameter and extract a bitfield.
247 */
248 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
249 unsigned param, unsigned rshift,
250 unsigned bitwidth)
251 {
252 struct gallivm_state *gallivm = &ctx->gallivm;
253 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
254 param);
255
256 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
257 value = bitcast(&ctx->bld_base,
258 TGSI_TYPE_UNSIGNED, value);
259
260 if (rshift)
261 value = LLVMBuildLShr(gallivm->builder, value,
262 LLVMConstInt(ctx->i32, rshift, 0), "");
263
264 if (rshift + bitwidth < 32) {
265 unsigned mask = (1 << bitwidth) - 1;
266 value = LLVMBuildAnd(gallivm->builder, value,
267 LLVMConstInt(ctx->i32, mask, 0), "");
268 }
269
270 return value;
271 }
272
273 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
274 {
275 switch (ctx->type) {
276 case PIPE_SHADER_TESS_CTRL:
277 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
278
279 case PIPE_SHADER_TESS_EVAL:
280 return LLVMGetParam(ctx->main_fn,
281 ctx->param_tes_rel_patch_id);
282
283 default:
284 assert(0);
285 return NULL;
286 }
287 }
288
289 /* Tessellation shaders pass outputs to the next shader using LDS.
290 *
291 * LS outputs = TCS inputs
292 * TCS outputs = TES inputs
293 *
294 * The LDS layout is:
295 * - TCS inputs for patch 0
296 * - TCS inputs for patch 1
297 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
298 * - ...
299 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
300 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
301 * - TCS outputs for patch 1
302 * - Per-patch TCS outputs for patch 1
303 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
304 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
305 * - ...
306 *
307 * All three shaders VS(LS), TCS, TES share the same LDS space.
308 */
309
310 static LLVMValueRef
311 get_tcs_in_patch_stride(struct si_shader_context *ctx)
312 {
313 return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
314 }
315
316 static LLVMValueRef
317 get_tcs_out_patch_stride(struct si_shader_context *ctx)
318 {
319 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
320 }
321
322 static LLVMValueRef
323 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
324 {
325 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
326 unpack_param(ctx,
327 ctx->param_tcs_out_lds_offsets,
328 0, 16),
329 4);
330 }
331
332 static LLVMValueRef
333 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
334 {
335 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
336 unpack_param(ctx,
337 ctx->param_tcs_out_lds_offsets,
338 16, 16),
339 4);
340 }
341
342 static LLVMValueRef
343 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
344 {
345 struct gallivm_state *gallivm = &ctx->gallivm;
346 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
347 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
348
349 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
350 }
351
352 static LLVMValueRef
353 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
354 {
355 struct gallivm_state *gallivm = &ctx->gallivm;
356 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
357 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
358 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
359
360 return LLVMBuildAdd(gallivm->builder, patch0_offset,
361 LLVMBuildMul(gallivm->builder, patch_stride,
362 rel_patch_id, ""),
363 "");
364 }
365
366 static LLVMValueRef
367 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
368 {
369 struct gallivm_state *gallivm = &ctx->gallivm;
370 LLVMValueRef patch0_patch_data_offset =
371 get_tcs_out_patch0_patch_data_offset(ctx);
372 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
373 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
374
375 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
376 LLVMBuildMul(gallivm->builder, patch_stride,
377 rel_patch_id, ""),
378 "");
379 }
380
381 static LLVMValueRef get_instance_index_for_fetch(
382 struct si_shader_context *ctx,
383 unsigned param_start_instance, LLVMValueRef divisor)
384 {
385 struct gallivm_state *gallivm = &ctx->gallivm;
386
387 LLVMValueRef result = ctx->abi.instance_id;
388
389 /* The division must be done before START_INSTANCE is added. */
390 if (divisor != ctx->i32_1)
391 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
392
393 return LLVMBuildAdd(gallivm->builder, result,
394 LLVMGetParam(ctx->main_fn, param_start_instance), "");
395 }
396
397 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
398 * to float. */
399 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
400 LLVMValueRef vec4,
401 unsigned double_index)
402 {
403 LLVMBuilderRef builder = ctx->gallivm.builder;
404 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
405 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
406 LLVMVectorType(f64, 2), "");
407 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
408 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
409 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
410 }
411
412 static void declare_input_vs(
413 struct si_shader_context *ctx,
414 unsigned input_index,
415 const struct tgsi_full_declaration *decl,
416 LLVMValueRef out[4])
417 {
418 struct gallivm_state *gallivm = &ctx->gallivm;
419
420 unsigned chan;
421 unsigned fix_fetch;
422 unsigned num_fetches;
423 unsigned fetch_stride;
424
425 LLVMValueRef t_list_ptr;
426 LLVMValueRef t_offset;
427 LLVMValueRef t_list;
428 LLVMValueRef vertex_index;
429 LLVMValueRef input[3];
430
431 /* Load the T list */
432 t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
433
434 t_offset = LLVMConstInt(ctx->i32, input_index, 0);
435
436 t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
437
438 vertex_index = LLVMGetParam(ctx->main_fn,
439 ctx->param_vertex_index0 +
440 input_index);
441
442 fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
443
444 /* Do multiple loads for special formats. */
445 switch (fix_fetch) {
446 case SI_FIX_FETCH_RGB_64_FLOAT:
447 num_fetches = 3; /* 3 2-dword loads */
448 fetch_stride = 8;
449 break;
450 case SI_FIX_FETCH_RGBA_64_FLOAT:
451 num_fetches = 2; /* 2 4-dword loads */
452 fetch_stride = 16;
453 break;
454 case SI_FIX_FETCH_RGB_8:
455 case SI_FIX_FETCH_RGB_8_INT:
456 num_fetches = 3;
457 fetch_stride = 1;
458 break;
459 case SI_FIX_FETCH_RGB_16:
460 case SI_FIX_FETCH_RGB_16_INT:
461 num_fetches = 3;
462 fetch_stride = 2;
463 break;
464 default:
465 num_fetches = 1;
466 fetch_stride = 0;
467 }
468
469 for (unsigned i = 0; i < num_fetches; i++) {
470 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
471
472 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
473 vertex_index, voffset,
474 true);
475 }
476
477 /* Break up the vec4 into individual components */
478 for (chan = 0; chan < 4; chan++) {
479 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
480 out[chan] = LLVMBuildExtractElement(gallivm->builder,
481 input[0], llvm_chan, "");
482 }
483
484 switch (fix_fetch) {
485 case SI_FIX_FETCH_A2_SNORM:
486 case SI_FIX_FETCH_A2_SSCALED:
487 case SI_FIX_FETCH_A2_SINT: {
488 /* The hardware returns an unsigned value; convert it to a
489 * signed one.
490 */
491 LLVMValueRef tmp = out[3];
492 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
493
494 /* First, recover the sign-extended signed integer value. */
495 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
496 tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
497 else
498 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");
499
500 /* For the integer-like cases, do a natural sign extension.
501 *
502 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
503 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
504 * exponent.
505 */
506 tmp = LLVMBuildShl(gallivm->builder, tmp,
507 fix_fetch == SI_FIX_FETCH_A2_SNORM ?
508 LLVMConstInt(ctx->i32, 7, 0) : c30, "");
509 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");
510
511 /* Convert back to the right type. */
512 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
513 LLVMValueRef clamp;
514 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
515 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
516 clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
517 tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
518 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
519 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
520 }
521
522 out[3] = tmp;
523 break;
524 }
525 case SI_FIX_FETCH_RGBA_32_UNORM:
526 case SI_FIX_FETCH_RGBX_32_UNORM:
527 for (chan = 0; chan < 4; chan++) {
528 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
529 ctx->i32, "");
530 out[chan] = LLVMBuildUIToFP(gallivm->builder,
531 out[chan], ctx->f32, "");
532 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
533 LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
534 }
535 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
536 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
537 out[3] = LLVMConstReal(ctx->f32, 1);
538 break;
539 case SI_FIX_FETCH_RGBA_32_SNORM:
540 case SI_FIX_FETCH_RGBX_32_SNORM:
541 case SI_FIX_FETCH_RGBA_32_FIXED:
542 case SI_FIX_FETCH_RGBX_32_FIXED: {
543 double scale;
544 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
545 scale = 1.0 / 0x10000;
546 else
547 scale = 1.0 / INT_MAX;
548
549 for (chan = 0; chan < 4; chan++) {
550 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
551 ctx->i32, "");
552 out[chan] = LLVMBuildSIToFP(gallivm->builder,
553 out[chan], ctx->f32, "");
554 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
555 LLVMConstReal(ctx->f32, scale), "");
556 }
557 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
558 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
559 fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
560 out[3] = LLVMConstReal(ctx->f32, 1);
561 break;
562 }
563 case SI_FIX_FETCH_RGBA_32_USCALED:
564 for (chan = 0; chan < 4; chan++) {
565 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
566 ctx->i32, "");
567 out[chan] = LLVMBuildUIToFP(gallivm->builder,
568 out[chan], ctx->f32, "");
569 }
570 break;
571 case SI_FIX_FETCH_RGBA_32_SSCALED:
572 for (chan = 0; chan < 4; chan++) {
573 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
574 ctx->i32, "");
575 out[chan] = LLVMBuildSIToFP(gallivm->builder,
576 out[chan], ctx->f32, "");
577 }
578 break;
579 case SI_FIX_FETCH_RG_64_FLOAT:
580 for (chan = 0; chan < 2; chan++)
581 out[chan] = extract_double_to_float(ctx, input[0], chan);
582
583 out[2] = LLVMConstReal(ctx->f32, 0);
584 out[3] = LLVMConstReal(ctx->f32, 1);
585 break;
586 case SI_FIX_FETCH_RGB_64_FLOAT:
587 for (chan = 0; chan < 3; chan++)
588 out[chan] = extract_double_to_float(ctx, input[chan], 0);
589
590 out[3] = LLVMConstReal(ctx->f32, 1);
591 break;
592 case SI_FIX_FETCH_RGBA_64_FLOAT:
593 for (chan = 0; chan < 4; chan++) {
594 out[chan] = extract_double_to_float(ctx, input[chan / 2],
595 chan % 2);
596 }
597 break;
598 case SI_FIX_FETCH_RGB_8:
599 case SI_FIX_FETCH_RGB_8_INT:
600 case SI_FIX_FETCH_RGB_16:
601 case SI_FIX_FETCH_RGB_16_INT:
602 for (chan = 0; chan < 3; chan++) {
603 out[chan] = LLVMBuildExtractElement(gallivm->builder,
604 input[chan],
605 ctx->i32_0, "");
606 }
607 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
608 fix_fetch == SI_FIX_FETCH_RGB_16) {
609 out[3] = LLVMConstReal(ctx->f32, 1);
610 } else {
611 out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
612 ctx->f32, "");
613 }
614 break;
615 }
616 }
617
618 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
619 unsigned swizzle)
620 {
621 struct si_shader_context *ctx = si_shader_context(bld_base);
622
623 if (swizzle > 0)
624 return ctx->i32_0;
625
626 switch (ctx->type) {
627 case PIPE_SHADER_VERTEX:
628 return LLVMGetParam(ctx->main_fn,
629 ctx->param_vs_prim_id);
630 case PIPE_SHADER_TESS_CTRL:
631 return LLVMGetParam(ctx->main_fn,
632 ctx->param_tcs_patch_id);
633 case PIPE_SHADER_TESS_EVAL:
634 return LLVMGetParam(ctx->main_fn,
635 ctx->param_tes_patch_id);
636 case PIPE_SHADER_GEOMETRY:
637 return LLVMGetParam(ctx->main_fn,
638 ctx->param_gs_prim_id);
639 default:
640 assert(0);
641 return ctx->i32_0;
642 }
643 }
644
645 /**
646 * Return the value of tgsi_ind_register for indexing.
647 * This is the indirect index with the constant offset added to it.
648 */
649 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
650 const struct tgsi_ind_register *ind,
651 int rel_index)
652 {
653 struct gallivm_state *gallivm = &ctx->gallivm;
654 LLVMValueRef result;
655
656 result = ctx->addrs[ind->Index][ind->Swizzle];
657 result = LLVMBuildLoad(gallivm->builder, result, "");
658 result = LLVMBuildAdd(gallivm->builder, result,
659 LLVMConstInt(ctx->i32, rel_index, 0), "");
660 return result;
661 }
662
663 /**
664 * Like get_indirect_index, but restricts the return value to a (possibly
665 * undefined) value inside [0..num).
666 */
667 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
668 const struct tgsi_ind_register *ind,
669 int rel_index, unsigned num)
670 {
671 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
672
673 return si_llvm_bound_index(ctx, result, num);
674 }
675
676
677 /**
678 * Calculate a dword address given an input or output register and a stride.
679 */
680 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
681 const struct tgsi_full_dst_register *dst,
682 const struct tgsi_full_src_register *src,
683 LLVMValueRef vertex_dw_stride,
684 LLVMValueRef base_addr)
685 {
686 struct gallivm_state *gallivm = &ctx->gallivm;
687 struct tgsi_shader_info *info = &ctx->shader->selector->info;
688 ubyte *name, *index, *array_first;
689 int first, param;
690 struct tgsi_full_dst_register reg;
691
692 /* Set the register description. The address computation is the same
693 * for sources and destinations. */
694 if (src) {
695 reg.Register.File = src->Register.File;
696 reg.Register.Index = src->Register.Index;
697 reg.Register.Indirect = src->Register.Indirect;
698 reg.Register.Dimension = src->Register.Dimension;
699 reg.Indirect = src->Indirect;
700 reg.Dimension = src->Dimension;
701 reg.DimIndirect = src->DimIndirect;
702 } else
703 reg = *dst;
704
705 /* If the register is 2-dimensional (e.g. an array of vertices
706 * in a primitive), calculate the base address of the vertex. */
707 if (reg.Register.Dimension) {
708 LLVMValueRef index;
709
710 if (reg.Dimension.Indirect)
711 index = get_indirect_index(ctx, &reg.DimIndirect,
712 reg.Dimension.Index);
713 else
714 index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
715
716 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
717 LLVMBuildMul(gallivm->builder, index,
718 vertex_dw_stride, ""), "");
719 }
720
721 /* Get information about the register. */
722 if (reg.Register.File == TGSI_FILE_INPUT) {
723 name = info->input_semantic_name;
724 index = info->input_semantic_index;
725 array_first = info->input_array_first;
726 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
727 name = info->output_semantic_name;
728 index = info->output_semantic_index;
729 array_first = info->output_array_first;
730 } else {
731 assert(0);
732 return NULL;
733 }
734
735 if (reg.Register.Indirect) {
736 /* Add the relative address of the element. */
737 LLVMValueRef ind_index;
738
739 if (reg.Indirect.ArrayID)
740 first = array_first[reg.Indirect.ArrayID];
741 else
742 first = reg.Register.Index;
743
744 ind_index = get_indirect_index(ctx, &reg.Indirect,
745 reg.Register.Index - first);
746
747 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
748 LLVMBuildMul(gallivm->builder, ind_index,
749 LLVMConstInt(ctx->i32, 4, 0), ""), "");
750
751 param = reg.Register.Dimension ?
752 si_shader_io_get_unique_index(name[first], index[first]) :
753 si_shader_io_get_unique_index_patch(name[first], index[first]);
754 } else {
755 param = reg.Register.Dimension ?
756 si_shader_io_get_unique_index(name[reg.Register.Index],
757 index[reg.Register.Index]) :
758 si_shader_io_get_unique_index_patch(name[reg.Register.Index],
759 index[reg.Register.Index]);
760 }
761
762 /* Add the base address of the element. */
763 return LLVMBuildAdd(gallivm->builder, base_addr,
764 LLVMConstInt(ctx->i32, param * 4, 0), "");
765 }
766
767 /* The offchip buffer layout for TCS->TES is
768 *
769 * - attribute 0 of patch 0 vertex 0
770 * - attribute 0 of patch 0 vertex 1
771 * - attribute 0 of patch 0 vertex 2
772 * ...
773 * - attribute 0 of patch 1 vertex 0
774 * - attribute 0 of patch 1 vertex 1
775 * ...
776 * - attribute 1 of patch 0 vertex 0
777 * - attribute 1 of patch 0 vertex 1
778 * ...
779 * - per patch attribute 0 of patch 0
780 * - per patch attribute 0 of patch 1
781 * ...
782 *
783 * Note that every attribute has 4 components.
784 */
785 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
786 LLVMValueRef rel_patch_id,
787 LLVMValueRef vertex_index,
788 LLVMValueRef param_index)
789 {
790 struct gallivm_state *gallivm = &ctx->gallivm;
791 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
792 LLVMValueRef param_stride, constant16;
793
794 vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
795 num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
796 total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
797 num_patches, "");
798
799 constant16 = LLVMConstInt(ctx->i32, 16, 0);
800 if (vertex_index) {
801 base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
802 vertices_per_patch, "");
803
804 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
805 vertex_index, "");
806
807 param_stride = total_vertices;
808 } else {
809 base_addr = rel_patch_id;
810 param_stride = num_patches;
811 }
812
813 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
814 LLVMBuildMul(gallivm->builder, param_index,
815 param_stride, ""), "");
816
817 base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
818
819 if (!vertex_index) {
820 LLVMValueRef patch_data_offset =
821 unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
822
823 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
824 patch_data_offset, "");
825 }
826 return base_addr;
827 }
828
829 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
830 struct si_shader_context *ctx,
831 const struct tgsi_full_dst_register *dst,
832 const struct tgsi_full_src_register *src)
833 {
834 struct gallivm_state *gallivm = &ctx->gallivm;
835 struct tgsi_shader_info *info = &ctx->shader->selector->info;
836 ubyte *name, *index, *array_first;
837 struct tgsi_full_src_register reg;
838 LLVMValueRef vertex_index = NULL;
839 LLVMValueRef param_index = NULL;
840 unsigned param_index_base, param_base;
841
842 reg = src ? *src : tgsi_full_src_register_from_dst(dst);
843
844 if (reg.Register.Dimension) {
845
846 if (reg.Dimension.Indirect)
847 vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
848 reg.Dimension.Index);
849 else
850 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
851 }
852
853 /* Get information about the register. */
854 if (reg.Register.File == TGSI_FILE_INPUT) {
855 name = info->input_semantic_name;
856 index = info->input_semantic_index;
857 array_first = info->input_array_first;
858 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
859 name = info->output_semantic_name;
860 index = info->output_semantic_index;
861 array_first = info->output_array_first;
862 } else {
863 assert(0);
864 return NULL;
865 }
866
867 if (reg.Register.Indirect) {
868 if (reg.Indirect.ArrayID)
869 param_base = array_first[reg.Indirect.ArrayID];
870 else
871 param_base = reg.Register.Index;
872
873 param_index = get_indirect_index(ctx, &reg.Indirect,
874 reg.Register.Index - param_base);
875
876 } else {
877 param_base = reg.Register.Index;
878 param_index = ctx->i32_0;
879 }
880
881 param_index_base = reg.Register.Dimension ?
882 si_shader_io_get_unique_index(name[param_base], index[param_base]) :
883 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);
884
885 param_index = LLVMBuildAdd(gallivm->builder, param_index,
886 LLVMConstInt(ctx->i32, param_index_base, 0),
887 "");
888
889 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
890 vertex_index, param_index);
891 }
892
893 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
894 enum tgsi_opcode_type type, unsigned swizzle,
895 LLVMValueRef buffer, LLVMValueRef offset,
896 LLVMValueRef base, bool can_speculate)
897 {
898 struct si_shader_context *ctx = si_shader_context(bld_base);
899 struct gallivm_state *gallivm = &ctx->gallivm;
900 LLVMValueRef value, value2;
901 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
902 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
903
904 if (swizzle == ~0) {
905 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
906 0, 1, 0, can_speculate, false);
907
908 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
909 }
910
911 if (!tgsi_type_is_64bit(type)) {
912 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
913 0, 1, 0, can_speculate, false);
914
915 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
916 return LLVMBuildExtractElement(gallivm->builder, value,
917 LLVMConstInt(ctx->i32, swizzle, 0), "");
918 }
919
920 value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
921 swizzle * 4, 1, 0, can_speculate, false);
922
923 value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
924 swizzle * 4 + 4, 1, 0, can_speculate, false);
925
926 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
927 }
928
929 /**
930 * Load from LDS.
931 *
932 * \param type output value type
933 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
934 * \param dw_addr address in dwords
935 */
936 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
937 enum tgsi_opcode_type type, unsigned swizzle,
938 LLVMValueRef dw_addr)
939 {
940 struct si_shader_context *ctx = si_shader_context(bld_base);
941 struct gallivm_state *gallivm = &ctx->gallivm;
942 LLVMValueRef value;
943
944 if (swizzle == ~0) {
945 LLVMValueRef values[TGSI_NUM_CHANNELS];
946
947 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
948 values[chan] = lds_load(bld_base, type, chan, dw_addr);
949
950 return lp_build_gather_values(gallivm, values,
951 TGSI_NUM_CHANNELS);
952 }
953
954 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
955 LLVMConstInt(ctx->i32, swizzle, 0));
956
957 value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
958 if (tgsi_type_is_64bit(type)) {
959 LLVMValueRef value2;
960 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
961 ctx->i32_1);
962 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
963 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
964 }
965
966 return LLVMBuildBitCast(gallivm->builder, value,
967 tgsi2llvmtype(bld_base, type), "");
968 }
969
970 /**
971 * Store to LDS.
972 *
973 * \param swizzle offset (typically 0..3)
974 * \param dw_addr address in dwords
975 * \param value value to store
976 */
977 static void lds_store(struct lp_build_tgsi_context *bld_base,
978 unsigned dw_offset_imm, LLVMValueRef dw_addr,
979 LLVMValueRef value)
980 {
981 struct si_shader_context *ctx = si_shader_context(bld_base);
982 struct gallivm_state *gallivm = &ctx->gallivm;
983
984 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
985 LLVMConstInt(ctx->i32, dw_offset_imm, 0));
986
987 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
988 ac_build_indexed_store(&ctx->ac, ctx->lds,
989 dw_addr, value);
990 }
991
992 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
993 unsigned param)
994 {
995 LLVMBuilderRef builder = ctx->gallivm.builder;
996
997 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
998 addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
999 addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
1000
1001 uint64_t desc2 = 0xffffffff;
1002 uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1003 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1004 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1005 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1006 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1007 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1008 LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
1009
1010 LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
1011 desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
1012 desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
1013 return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
1014 }
1015
1016 static LLVMValueRef fetch_input_tcs(
1017 struct lp_build_tgsi_context *bld_base,
1018 const struct tgsi_full_src_register *reg,
1019 enum tgsi_opcode_type type, unsigned swizzle)
1020 {
1021 struct si_shader_context *ctx = si_shader_context(bld_base);
1022 LLVMValueRef dw_addr, stride;
1023
1024 stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
1025 dw_addr = get_tcs_in_current_patch_offset(ctx);
1026 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1027
1028 return lds_load(bld_base, type, swizzle, dw_addr);
1029 }
1030
1031 static LLVMValueRef fetch_output_tcs(
1032 struct lp_build_tgsi_context *bld_base,
1033 const struct tgsi_full_src_register *reg,
1034 enum tgsi_opcode_type type, unsigned swizzle)
1035 {
1036 struct si_shader_context *ctx = si_shader_context(bld_base);
1037 LLVMValueRef dw_addr, stride;
1038
1039 if (reg->Register.Dimension) {
1040 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1041 dw_addr = get_tcs_out_current_patch_offset(ctx);
1042 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1043 } else {
1044 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1045 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1046 }
1047
1048 return lds_load(bld_base, type, swizzle, dw_addr);
1049 }
1050
1051 static LLVMValueRef fetch_input_tes(
1052 struct lp_build_tgsi_context *bld_base,
1053 const struct tgsi_full_src_register *reg,
1054 enum tgsi_opcode_type type, unsigned swizzle)
1055 {
1056 struct si_shader_context *ctx = si_shader_context(bld_base);
1057 LLVMValueRef buffer, base, addr;
1058
1059 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1060
1061 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1062 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1063
1064 return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
1065 }
1066
1067 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1068 const struct tgsi_full_instruction *inst,
1069 const struct tgsi_opcode_info *info,
1070 LLVMValueRef dst[4])
1071 {
1072 struct si_shader_context *ctx = si_shader_context(bld_base);
1073 struct gallivm_state *gallivm = &ctx->gallivm;
1074 const struct tgsi_full_dst_register *reg = &inst->Dst[0];
1075 const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1076 unsigned chan_index;
1077 LLVMValueRef dw_addr, stride;
1078 LLVMValueRef buffer, base, buf_addr;
1079 LLVMValueRef values[4];
1080 bool skip_lds_store;
1081 bool is_tess_factor = false;
1082
1083 /* Only handle per-patch and per-vertex outputs here.
1084 * Vectors will be lowered to scalars and this function will be called again.
1085 */
1086 if (reg->Register.File != TGSI_FILE_OUTPUT ||
1087 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1088 si_llvm_emit_store(bld_base, inst, info, dst);
1089 return;
1090 }
1091
1092 if (reg->Register.Dimension) {
1093 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1094 dw_addr = get_tcs_out_current_patch_offset(ctx);
1095 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1096 skip_lds_store = !sh_info->reads_pervertex_outputs;
1097 } else {
1098 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1099 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1100 skip_lds_store = !sh_info->reads_perpatch_outputs;
1101
1102 if (!reg->Register.Indirect) {
1103 int name = sh_info->output_semantic_name[reg->Register.Index];
1104
1105 /* Always write tess factors into LDS for the TCS epilog. */
1106 if (name == TGSI_SEMANTIC_TESSINNER ||
1107 name == TGSI_SEMANTIC_TESSOUTER) {
1108 skip_lds_store = false;
1109 is_tess_factor = true;
1110 }
1111 }
1112 }
1113
1114 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1115
1116 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1117 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1118
1119
1120 TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1121 LLVMValueRef value = dst[chan_index];
1122
1123 if (inst->Instruction.Saturate)
1124 value = ac_build_clamp(&ctx->ac, value);
1125
1126 /* Skip LDS stores if there is no LDS read of this output. */
1127 if (!skip_lds_store)
1128 lds_store(bld_base, chan_index, dw_addr, value);
1129
1130 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1131 values[chan_index] = value;
1132
1133 if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
1134 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1135 buf_addr, base,
1136 4 * chan_index, 1, 0, true, false);
1137 }
1138 }
1139
1140 if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
1141 LLVMValueRef value = lp_build_gather_values(gallivm,
1142 values, 4);
1143 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1144 base, 0, 1, 0, true, false);
1145 }
1146 }
1147
1148 static LLVMValueRef fetch_input_gs(
1149 struct lp_build_tgsi_context *bld_base,
1150 const struct tgsi_full_src_register *reg,
1151 enum tgsi_opcode_type type,
1152 unsigned swizzle)
1153 {
1154 struct si_shader_context *ctx = si_shader_context(bld_base);
1155 struct si_shader *shader = ctx->shader;
1156 struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1157 struct gallivm_state *gallivm = &ctx->gallivm;
1158 LLVMValueRef vtx_offset, soffset;
1159 struct tgsi_shader_info *info = &shader->selector->info;
1160 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1161 unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1162 unsigned param;
1163 LLVMValueRef value;
1164
1165 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1166 return get_primitive_id(bld_base, swizzle);
1167
1168 if (!reg->Register.Dimension)
1169 return NULL;
1170
1171 param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1172
1173 /* GFX9 has the ESGS ring in LDS. */
1174 if (ctx->screen->b.chip_class >= GFX9) {
1175 unsigned index = reg->Dimension.Index;
1176
1177 switch (index / 2) {
1178 case 0:
1179 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
1180 index % 2 ? 16 : 0, 16);
1181 break;
1182 case 1:
1183 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
1184 index % 2 ? 16 : 0, 16);
1185 break;
1186 case 2:
1187 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
1188 index % 2 ? 16 : 0, 16);
1189 break;
1190 default:
1191 assert(0);
1192 return NULL;
1193 }
1194
1195 vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
1196 LLVMConstInt(ctx->i32, param * 4, 0), "");
1197 return lds_load(bld_base, type, swizzle, vtx_offset);
1198 }
1199
1200 /* GFX6: input load from the ESGS ring in memory. */
1201 if (swizzle == ~0) {
1202 LLVMValueRef values[TGSI_NUM_CHANNELS];
1203 unsigned chan;
1204 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1205 values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1206 }
1207 return lp_build_gather_values(gallivm, values,
1208 TGSI_NUM_CHANNELS);
1209 }
1210
1211 /* Get the vertex offset parameter on GFX6. */
1212 unsigned vtx_offset_param = reg->Dimension.Index;
1213 if (vtx_offset_param < 2) {
1214 vtx_offset_param += ctx->param_gs_vtx0_offset;
1215 } else {
1216 assert(vtx_offset_param < 6);
1217 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
1218 }
1219 vtx_offset = lp_build_mul_imm(uint,
1220 LLVMGetParam(ctx->main_fn,
1221 vtx_offset_param),
1222 4);
1223
1224 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1225
1226 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1227 vtx_offset, soffset, 0, 1, 0, true, false);
1228 if (tgsi_type_is_64bit(type)) {
1229 LLVMValueRef value2;
1230 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1231
1232 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1233 ctx->i32_0, vtx_offset, soffset,
1234 0, 1, 0, true, false);
1235 return si_llvm_emit_fetch_64bit(bld_base, type,
1236 value, value2);
1237 }
1238 return LLVMBuildBitCast(gallivm->builder,
1239 value,
1240 tgsi2llvmtype(bld_base, type), "");
1241 }
1242
1243 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1244 {
1245 switch (interpolate) {
1246 case TGSI_INTERPOLATE_CONSTANT:
1247 return 0;
1248
1249 case TGSI_INTERPOLATE_LINEAR:
1250 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1251 return SI_PARAM_LINEAR_SAMPLE;
1252 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1253 return SI_PARAM_LINEAR_CENTROID;
1254 else
1255 return SI_PARAM_LINEAR_CENTER;
1256 break;
1257 case TGSI_INTERPOLATE_COLOR:
1258 case TGSI_INTERPOLATE_PERSPECTIVE:
1259 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1260 return SI_PARAM_PERSP_SAMPLE;
1261 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1262 return SI_PARAM_PERSP_CENTROID;
1263 else
1264 return SI_PARAM_PERSP_CENTER;
1265 break;
1266 default:
1267 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1268 return -1;
1269 }
1270 }
1271
1272 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1273 unsigned attr_index, unsigned chan,
1274 LLVMValueRef prim_mask,
1275 LLVMValueRef i, LLVMValueRef j)
1276 {
1277 if (i || j) {
1278 return ac_build_fs_interp(&ctx->ac,
1279 LLVMConstInt(ctx->i32, chan, 0),
1280 LLVMConstInt(ctx->i32, attr_index, 0),
1281 prim_mask, i, j);
1282 }
1283 return ac_build_fs_interp_mov(&ctx->ac,
1284 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1285 LLVMConstInt(ctx->i32, chan, 0),
1286 LLVMConstInt(ctx->i32, attr_index, 0),
1287 prim_mask);
1288 }
1289
1290 /**
1291 * Interpolate a fragment shader input.
1292 *
1293 * @param ctx context
1294 * @param input_index index of the input in hardware
1295 * @param semantic_name TGSI_SEMANTIC_*
1296 * @param semantic_index semantic index
1297 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1298 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1299 * @param interp_param interpolation weights (i,j)
1300 * @param prim_mask SI_PARAM_PRIM_MASK
1301 * @param face SI_PARAM_FRONT_FACE
1302 * @param result the return value (4 components)
1303 */
1304 static void interp_fs_input(struct si_shader_context *ctx,
1305 unsigned input_index,
1306 unsigned semantic_name,
1307 unsigned semantic_index,
1308 unsigned num_interp_inputs,
1309 unsigned colors_read_mask,
1310 LLVMValueRef interp_param,
1311 LLVMValueRef prim_mask,
1312 LLVMValueRef face,
1313 LLVMValueRef result[4])
1314 {
1315 struct gallivm_state *gallivm = &ctx->gallivm;
1316 LLVMValueRef i = NULL, j = NULL;
1317 unsigned chan;
1318
1319 /* fs.constant returns the param from the middle vertex, so it's not
1320 * really useful for flat shading. It's meant to be used for custom
1321 * interpolation (but the intrinsic can't fetch from the other two
1322 * vertices).
1323 *
1324 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1325 * to do the right thing. The only reason we use fs.constant is that
1326 * fs.interp cannot be used on integers, because they can be equal
1327 * to NaN.
1328 *
1329 * When interp is false we will use fs.constant or for newer llvm,
1330 * amdgcn.interp.mov.
1331 */
1332 bool interp = interp_param != NULL;
1333
1334 if (interp) {
1335 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1336 LLVMVectorType(ctx->f32, 2), "");
1337
1338 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1339 ctx->i32_0, "");
1340 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1341 ctx->i32_1, "");
1342 }
1343
1344 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1345 ctx->shader->key.part.ps.prolog.color_two_side) {
1346 LLVMValueRef is_face_positive;
1347
1348 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1349 * otherwise it's at offset "num_inputs".
1350 */
1351 unsigned back_attr_offset = num_interp_inputs;
1352 if (semantic_index == 1 && colors_read_mask & 0xf)
1353 back_attr_offset += 1;
1354
1355 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1356 face, ctx->i32_0, "");
1357
1358 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1359 LLVMValueRef front, back;
1360
1361 front = si_build_fs_interp(ctx,
1362 input_index, chan,
1363 prim_mask, i, j);
1364 back = si_build_fs_interp(ctx,
1365 back_attr_offset, chan,
1366 prim_mask, i, j);
1367
1368 result[chan] = LLVMBuildSelect(gallivm->builder,
1369 is_face_positive,
1370 front,
1371 back,
1372 "");
1373 }
1374 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1375 result[0] = si_build_fs_interp(ctx, input_index,
1376 0, prim_mask, i, j);
1377 result[1] =
1378 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1379 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1380 } else {
1381 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1382 result[chan] = si_build_fs_interp(ctx,
1383 input_index, chan,
1384 prim_mask, i, j);
1385 }
1386 }
1387 }
1388
1389 static void declare_input_fs(
1390 struct si_shader_context *ctx,
1391 unsigned input_index,
1392 const struct tgsi_full_declaration *decl,
1393 LLVMValueRef out[4])
1394 {
1395 struct lp_build_context *base = &ctx->bld_base.base;
1396 struct si_shader *shader = ctx->shader;
1397 LLVMValueRef main_fn = ctx->main_fn;
1398 LLVMValueRef interp_param = NULL;
1399 int interp_param_idx;
1400
1401 /* Get colors from input VGPRs (set by the prolog). */
1402 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1403 unsigned i = decl->Semantic.Index;
1404 unsigned colors_read = shader->selector->info.colors_read;
1405 unsigned mask = colors_read >> (i * 4);
1406 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1407 (i ? util_bitcount(colors_read & 0xf) : 0);
1408
1409 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1410 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1411 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1412 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1413 return;
1414 }
1415
1416 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1417 decl->Interp.Location);
1418 if (interp_param_idx == -1)
1419 return;
1420 else if (interp_param_idx) {
1421 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1422 }
1423
1424 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1425 decl->Semantic.Index, 0, /* this param is unused */
1426 shader->selector->info.colors_read, interp_param,
1427 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1428 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1429 &out[0]);
1430 }
1431
1432 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1433 {
1434 return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1435 }
1436
1437
1438 /**
1439 * Load a dword from a constant buffer.
1440 */
1441 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1442 LLVMValueRef resource,
1443 LLVMValueRef offset)
1444 {
1445 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1446 0, 0, 0, true, true);
1447 }
1448
1449 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1450 {
1451 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1452 struct gallivm_state *gallivm = &ctx->gallivm;
1453 LLVMBuilderRef builder = gallivm->builder;
1454 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1455 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1456 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1457
1458 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1459 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1460 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1461
1462 LLVMValueRef pos[4] = {
1463 buffer_load_const(ctx, resource, offset0),
1464 buffer_load_const(ctx, resource, offset1),
1465 LLVMConstReal(ctx->f32, 0),
1466 LLVMConstReal(ctx->f32, 0)
1467 };
1468
1469 return lp_build_gather_values(gallivm, pos, 4);
1470 }
1471
1472 static void declare_system_value(struct si_shader_context *ctx,
1473 unsigned index,
1474 const struct tgsi_full_declaration *decl)
1475 {
1476 struct lp_build_context *bld = &ctx->bld_base.base;
1477 struct gallivm_state *gallivm = &ctx->gallivm;
1478 LLVMValueRef value = 0;
1479
1480 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1481
1482 switch (decl->Semantic.Name) {
1483 case TGSI_SEMANTIC_INSTANCEID:
1484 value = ctx->abi.instance_id;
1485 break;
1486
1487 case TGSI_SEMANTIC_VERTEXID:
1488 value = LLVMBuildAdd(gallivm->builder,
1489 ctx->abi.vertex_id,
1490 ctx->abi.base_vertex, "");
1491 break;
1492
1493 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1494 /* Unused. Clarify the meaning in indexed vs. non-indexed
1495 * draws if this is ever used again. */
1496 assert(false);
1497 break;
1498
1499 case TGSI_SEMANTIC_BASEVERTEX:
1500 {
1501 /* For non-indexed draws, the base vertex set by the driver
1502 * (for direct draws) or the CP (for indirect draws) is the
1503 * first vertex ID, but GLSL expects 0 to be returned.
1504 */
1505 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1506 LLVMValueRef indexed;
1507
1508 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1509 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1510
1511 value = LLVMBuildSelect(gallivm->builder, indexed,
1512 ctx->abi.base_vertex, ctx->i32_0, "");
1513 break;
1514 }
1515
1516 case TGSI_SEMANTIC_BASEINSTANCE:
1517 value = ctx->abi.start_instance;
1518 break;
1519
1520 case TGSI_SEMANTIC_DRAWID:
1521 value = ctx->abi.draw_id;
1522 break;
1523
1524 case TGSI_SEMANTIC_INVOCATIONID:
1525 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1526 value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1527 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1528 value = LLVMGetParam(ctx->main_fn,
1529 ctx->param_gs_instance_id);
1530 else
1531 assert(!"INVOCATIONID not implemented");
1532 break;
1533
1534 case TGSI_SEMANTIC_POSITION:
1535 {
1536 LLVMValueRef pos[4] = {
1537 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1538 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1539 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1540 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1541 LLVMGetParam(ctx->main_fn,
1542 SI_PARAM_POS_W_FLOAT)),
1543 };
1544 value = lp_build_gather_values(gallivm, pos, 4);
1545 break;
1546 }
1547
1548 case TGSI_SEMANTIC_FACE:
1549 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1550 break;
1551
1552 case TGSI_SEMANTIC_SAMPLEID:
1553 value = get_sample_id(ctx);
1554 break;
1555
1556 case TGSI_SEMANTIC_SAMPLEPOS: {
1557 LLVMValueRef pos[4] = {
1558 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1559 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1560 LLVMConstReal(ctx->f32, 0),
1561 LLVMConstReal(ctx->f32, 0)
1562 };
1563 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1564 TGSI_OPCODE_FRC, pos[0]);
1565 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1566 TGSI_OPCODE_FRC, pos[1]);
1567 value = lp_build_gather_values(gallivm, pos, 4);
1568 break;
1569 }
1570
1571 case TGSI_SEMANTIC_SAMPLEMASK:
1572 /* This can only occur with the OpenGL Core profile, which
1573 * doesn't support smoothing.
1574 */
1575 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1576 break;
1577
1578 case TGSI_SEMANTIC_TESSCOORD:
1579 {
1580 LLVMValueRef coord[4] = {
1581 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1582 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1583 bld->zero,
1584 bld->zero
1585 };
1586
1587 /* For triangles, the vector should be (u, v, 1-u-v). */
1588 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1589 PIPE_PRIM_TRIANGLES)
1590 coord[2] = lp_build_sub(bld, bld->one,
1591 lp_build_add(bld, coord[0], coord[1]));
1592
1593 value = lp_build_gather_values(gallivm, coord, 4);
1594 break;
1595 }
1596
1597 case TGSI_SEMANTIC_VERTICESIN:
1598 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1599 value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1600 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1601 value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
1602 else
1603 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1604 break;
1605
1606 case TGSI_SEMANTIC_TESSINNER:
1607 case TGSI_SEMANTIC_TESSOUTER:
1608 {
1609 LLVMValueRef buffer, base, addr;
1610 int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);
1611
1612 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1613
1614 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1615 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1616 LLVMConstInt(ctx->i32, param, 0));
1617
1618 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1619 ~0, buffer, base, addr, true);
1620
1621 break;
1622 }
1623
1624 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1625 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1626 {
1627 LLVMValueRef buf, slot, val[4];
1628 int i, offset;
1629
1630 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1631 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1632 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1633 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1634
1635 for (i = 0; i < 4; i++)
1636 val[i] = buffer_load_const(ctx, buf,
1637 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1638 value = lp_build_gather_values(gallivm, val, 4);
1639 break;
1640 }
1641
1642 case TGSI_SEMANTIC_PRIMID:
1643 value = get_primitive_id(&ctx->bld_base, 0);
1644 break;
1645
1646 case TGSI_SEMANTIC_GRID_SIZE:
1647 value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
1648 break;
1649
1650 case TGSI_SEMANTIC_BLOCK_SIZE:
1651 {
1652 LLVMValueRef values[3];
1653 unsigned i;
1654 unsigned *properties = ctx->shader->selector->info.properties;
1655
1656 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1657 unsigned sizes[3] = {
1658 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1659 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1660 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1661 };
1662
1663 for (i = 0; i < 3; ++i)
1664 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1665
1666 value = lp_build_gather_values(gallivm, values, 3);
1667 } else {
1668 value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1669 }
1670 break;
1671 }
1672
1673 case TGSI_SEMANTIC_BLOCK_ID:
1674 {
1675 LLVMValueRef values[3];
1676
1677 for (int i = 0; i < 3; i++) {
1678 values[i] = ctx->i32_0;
1679 if (ctx->param_block_id[i] >= 0) {
1680 values[i] = LLVMGetParam(ctx->main_fn,
1681 ctx->param_block_id[i]);
1682 }
1683 }
1684 value = lp_build_gather_values(gallivm, values, 3);
1685 break;
1686 }
1687
1688 case TGSI_SEMANTIC_THREAD_ID:
1689 value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
1690 break;
1691
1692 case TGSI_SEMANTIC_HELPER_INVOCATION:
1693 value = lp_build_intrinsic(gallivm->builder,
1694 "llvm.amdgcn.ps.live",
1695 ctx->i1, NULL, 0,
1696 LP_FUNC_ATTR_READNONE);
1697 value = LLVMBuildNot(gallivm->builder, value, "");
1698 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1699 break;
1700
1701 case TGSI_SEMANTIC_SUBGROUP_SIZE:
1702 value = LLVMConstInt(ctx->i32, 64, 0);
1703 break;
1704
1705 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1706 value = ac_get_thread_id(&ctx->ac);
1707 break;
1708
1709 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1710 {
1711 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1712 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1713 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1714 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1715 break;
1716 }
1717
1718 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1719 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1720 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1721 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1722 {
1723 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1724 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1725 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1726 /* All bits set except LSB */
1727 value = LLVMConstInt(ctx->i64, -2, 0);
1728 } else {
1729 /* All bits set */
1730 value = LLVMConstInt(ctx->i64, -1, 0);
1731 }
1732 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1733 value = LLVMBuildShl(gallivm->builder, value, id, "");
1734 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1735 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1736 value = LLVMBuildNot(gallivm->builder, value, "");
1737 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1738 break;
1739 }
1740
1741 default:
1742 assert(!"unknown system value");
1743 return;
1744 }
1745
1746 ctx->system_values[index] = value;
1747 }
1748
1749 static void declare_compute_memory(struct si_shader_context *ctx,
1750 const struct tgsi_full_declaration *decl)
1751 {
1752 struct si_shader_selector *sel = ctx->shader->selector;
1753 struct gallivm_state *gallivm = &ctx->gallivm;
1754
1755 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1756 LLVMValueRef var;
1757
1758 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1759 assert(decl->Range.First == decl->Range.Last);
1760 assert(!ctx->shared_memory);
1761
1762 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1763 LLVMArrayType(ctx->i8, sel->local_size),
1764 "compute_lds",
1765 LOCAL_ADDR_SPACE);
1766 LLVMSetAlignment(var, 4);
1767
1768 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1769 }
1770
1771 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1772 {
1773 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1774 ctx->param_const_and_shader_buffers);
1775
1776 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1777 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1778 }
1779
1780 static LLVMValueRef fetch_constant(
1781 struct lp_build_tgsi_context *bld_base,
1782 const struct tgsi_full_src_register *reg,
1783 enum tgsi_opcode_type type,
1784 unsigned swizzle)
1785 {
1786 struct si_shader_context *ctx = si_shader_context(bld_base);
1787 struct lp_build_context *base = &bld_base->base;
1788 const struct tgsi_ind_register *ireg = &reg->Indirect;
1789 unsigned buf, idx;
1790
1791 LLVMValueRef addr, bufp;
1792 LLVMValueRef result;
1793
1794 if (swizzle == LP_CHAN_ALL) {
1795 unsigned chan;
1796 LLVMValueRef values[4];
1797 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1798 values[chan] = fetch_constant(bld_base, reg, type, chan);
1799
1800 return lp_build_gather_values(&ctx->gallivm, values, 4);
1801 }
1802
1803 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1804 idx = reg->Register.Index * 4 + swizzle;
1805
1806 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1807 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1808 LLVMValueRef index;
1809 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
1810 reg->Dimension.Index,
1811 ctx->num_const_buffers);
1812 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1813 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1814 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1815 } else
1816 bufp = load_const_buffer_desc(ctx, buf);
1817
1818 if (reg->Register.Indirect) {
1819 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1820 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1821 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1822 addr = lp_build_add(&bld_base->uint_bld, addr,
1823 LLVMConstInt(ctx->i32, idx * 4, 0));
1824 } else {
1825 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1826 }
1827
1828 result = buffer_load_const(ctx, bufp, addr);
1829
1830 if (!tgsi_type_is_64bit(type))
1831 result = bitcast(bld_base, type, result);
1832 else {
1833 LLVMValueRef addr2, result2;
1834
1835 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1836 LLVMConstInt(ctx->i32, 4, 0));
1837 result2 = buffer_load_const(ctx, bufp, addr2);
1838
1839 result = si_llvm_emit_fetch_64bit(bld_base, type,
1840 result, result2);
1841 }
1842 return result;
1843 }
1844
1845 /* Upper 16 bits must be zero. */
1846 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1847 LLVMValueRef val[2])
1848 {
1849 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1850 LLVMBuildShl(ctx->gallivm.builder, val[1],
1851 LLVMConstInt(ctx->i32, 16, 0),
1852 ""), "");
1853 }
1854
1855 /* Upper 16 bits are ignored and will be dropped. */
1856 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1857 LLVMValueRef val[2])
1858 {
1859 LLVMValueRef v[2] = {
1860 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1861 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1862 val[1],
1863 };
1864 return si_llvm_pack_two_int16(ctx, v);
1865 }
1866
1867 /* Initialize arguments for the shader export intrinsic */
1868 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1869 LLVMValueRef *values,
1870 unsigned target,
1871 struct ac_export_args *args)
1872 {
1873 struct si_shader_context *ctx = si_shader_context(bld_base);
1874 struct lp_build_context *base = &bld_base->base;
1875 LLVMBuilderRef builder = ctx->gallivm.builder;
1876 LLVMValueRef val[4];
1877 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1878 unsigned chan;
1879 bool is_int8, is_int10;
1880
1881 /* Default is 0xf. Adjusted below depending on the format. */
1882 args->enabled_channels = 0xf; /* writemask */
1883
1884 /* Specify whether the EXEC mask represents the valid mask */
1885 args->valid_mask = 0;
1886
1887 /* Specify whether this is the last export */
1888 args->done = 0;
1889
1890 /* Specify the target we are exporting */
1891 args->target = target;
1892
1893 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1894 const struct si_shader_key *key = &ctx->shader->key;
1895 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1896 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1897
1898 assert(cbuf >= 0 && cbuf < 8);
1899 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1900 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1901 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1902 }
1903
1904 args->compr = false;
1905 args->out[0] = base->undef;
1906 args->out[1] = base->undef;
1907 args->out[2] = base->undef;
1908 args->out[3] = base->undef;
1909
1910 switch (spi_shader_col_format) {
1911 case V_028714_SPI_SHADER_ZERO:
1912 args->enabled_channels = 0; /* writemask */
1913 args->target = V_008DFC_SQ_EXP_NULL;
1914 break;
1915
1916 case V_028714_SPI_SHADER_32_R:
1917 args->enabled_channels = 1; /* writemask */
1918 args->out[0] = values[0];
1919 break;
1920
1921 case V_028714_SPI_SHADER_32_GR:
1922 args->enabled_channels = 0x3; /* writemask */
1923 args->out[0] = values[0];
1924 args->out[1] = values[1];
1925 break;
1926
1927 case V_028714_SPI_SHADER_32_AR:
1928 args->enabled_channels = 0x9; /* writemask */
1929 args->out[0] = values[0];
1930 args->out[3] = values[3];
1931 break;
1932
1933 case V_028714_SPI_SHADER_FP16_ABGR:
1934 args->compr = 1; /* COMPR flag */
1935
1936 for (chan = 0; chan < 2; chan++) {
1937 LLVMValueRef pack_args[2] = {
1938 values[2 * chan],
1939 values[2 * chan + 1]
1940 };
1941 LLVMValueRef packed;
1942
1943 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1944 args->out[chan] =
1945 LLVMBuildBitCast(ctx->gallivm.builder,
1946 packed, ctx->f32, "");
1947 }
1948 break;
1949
1950 case V_028714_SPI_SHADER_UNORM16_ABGR:
1951 for (chan = 0; chan < 4; chan++) {
1952 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1953 val[chan] = LLVMBuildFMul(builder, val[chan],
1954 LLVMConstReal(ctx->f32, 65535), "");
1955 val[chan] = LLVMBuildFAdd(builder, val[chan],
1956 LLVMConstReal(ctx->f32, 0.5), "");
1957 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1958 ctx->i32, "");
1959 }
1960
1961 args->compr = 1; /* COMPR flag */
1962 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1963 si_llvm_pack_two_int16(ctx, val));
1964 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1965 si_llvm_pack_two_int16(ctx, val+2));
1966 break;
1967
1968 case V_028714_SPI_SHADER_SNORM16_ABGR:
1969 for (chan = 0; chan < 4; chan++) {
1970 /* Clamp between [-1, 1]. */
1971 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1972 values[chan],
1973 LLVMConstReal(ctx->f32, 1));
1974 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1975 val[chan],
1976 LLVMConstReal(ctx->f32, -1));
1977 /* Convert to a signed integer in [-32767, 32767]. */
1978 val[chan] = LLVMBuildFMul(builder, val[chan],
1979 LLVMConstReal(ctx->f32, 32767), "");
1980 /* If positive, add 0.5, else add -0.5. */
1981 val[chan] = LLVMBuildFAdd(builder, val[chan],
1982 LLVMBuildSelect(builder,
1983 LLVMBuildFCmp(builder, LLVMRealOGE,
1984 val[chan], base->zero, ""),
1985 LLVMConstReal(ctx->f32, 0.5),
1986 LLVMConstReal(ctx->f32, -0.5), ""), "");
1987 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1988 }
1989
1990 args->compr = 1; /* COMPR flag */
1991 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1992 si_llvm_pack_two_int32_as_int16(ctx, val));
1993 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1994 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1995 break;
1996
1997 case V_028714_SPI_SHADER_UINT16_ABGR: {
1998 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1999 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2000 LLVMValueRef max_alpha =
2001 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2002
2003 /* Clamp. */
2004 for (chan = 0; chan < 4; chan++) {
2005 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2006 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2007 val[chan],
2008 chan == 3 ? max_alpha : max_rgb);
2009 }
2010
2011 args->compr = 1; /* COMPR flag */
2012 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2013 si_llvm_pack_two_int16(ctx, val));
2014 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2015 si_llvm_pack_two_int16(ctx, val+2));
2016 break;
2017 }
2018
2019 case V_028714_SPI_SHADER_SINT16_ABGR: {
2020 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2021 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2022 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2023 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2024 LLVMValueRef max_alpha =
2025 !is_int10 ? max_rgb : ctx->i32_1;
2026 LLVMValueRef min_alpha =
2027 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2028
2029 /* Clamp. */
2030 for (chan = 0; chan < 4; chan++) {
2031 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2032 val[chan] = lp_build_emit_llvm_binary(bld_base,
2033 TGSI_OPCODE_IMIN,
2034 val[chan], chan == 3 ? max_alpha : max_rgb);
2035 val[chan] = lp_build_emit_llvm_binary(bld_base,
2036 TGSI_OPCODE_IMAX,
2037 val[chan], chan == 3 ? min_alpha : min_rgb);
2038 }
2039
2040 args->compr = 1; /* COMPR flag */
2041 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2042 si_llvm_pack_two_int32_as_int16(ctx, val));
2043 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2044 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2045 break;
2046 }
2047
2048 case V_028714_SPI_SHADER_32_ABGR:
2049 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2050 break;
2051 }
2052 }
2053
2054 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2055 LLVMValueRef alpha)
2056 {
2057 struct si_shader_context *ctx = si_shader_context(bld_base);
2058
2059 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2060 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2061 SI_PARAM_ALPHA_REF);
2062
2063 LLVMValueRef alpha_pass =
2064 lp_build_cmp(&bld_base->base,
2065 ctx->shader->key.part.ps.epilog.alpha_func,
2066 alpha, alpha_ref);
2067 LLVMValueRef arg =
2068 lp_build_select(&bld_base->base,
2069 alpha_pass,
2070 LLVMConstReal(ctx->f32, 1.0f),
2071 LLVMConstReal(ctx->f32, -1.0f));
2072
2073 ac_build_kill(&ctx->ac, arg);
2074 } else {
2075 ac_build_kill(&ctx->ac, NULL);
2076 }
2077 }
2078
2079 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2080 LLVMValueRef alpha,
2081 unsigned samplemask_param)
2082 {
2083 struct si_shader_context *ctx = si_shader_context(bld_base);
2084 struct gallivm_state *gallivm = &ctx->gallivm;
2085 LLVMValueRef coverage;
2086
2087 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2088 coverage = LLVMGetParam(ctx->main_fn,
2089 samplemask_param);
2090 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2091
2092 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2093 ctx->i32,
2094 &coverage, 1, LP_FUNC_ATTR_READNONE);
2095
2096 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2097 ctx->f32, "");
2098
2099 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2100 LLVMConstReal(ctx->f32,
2101 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2102
2103 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2104 }
2105
2106 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2107 struct ac_export_args *pos, LLVMValueRef *out_elts)
2108 {
2109 struct si_shader_context *ctx = si_shader_context(bld_base);
2110 struct lp_build_context *base = &bld_base->base;
2111 unsigned reg_index;
2112 unsigned chan;
2113 unsigned const_chan;
2114 LLVMValueRef base_elt;
2115 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2116 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2117 SI_VS_CONST_CLIP_PLANES, 0);
2118 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2119
2120 for (reg_index = 0; reg_index < 2; reg_index ++) {
2121 struct ac_export_args *args = &pos[2 + reg_index];
2122
2123 args->out[0] =
2124 args->out[1] =
2125 args->out[2] =
2126 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2127
2128 /* Compute dot products of position and user clip plane vectors */
2129 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2130 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2131 LLVMValueRef addr =
2132 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2133 const_chan) * 4, 0);
2134 base_elt = buffer_load_const(ctx, const_resource,
2135 addr);
2136 args->out[chan] =
2137 lp_build_add(base, args->out[chan],
2138 lp_build_mul(base, base_elt,
2139 out_elts[const_chan]));
2140 }
2141 }
2142
2143 args->enabled_channels = 0xf;
2144 args->valid_mask = 0;
2145 args->done = 0;
2146 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2147 args->compr = 0;
2148 }
2149 }
2150
2151 static void si_dump_streamout(struct pipe_stream_output_info *so)
2152 {
2153 unsigned i;
2154
2155 if (so->num_outputs)
2156 fprintf(stderr, "STREAMOUT\n");
2157
2158 for (i = 0; i < so->num_outputs; i++) {
2159 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2160 so->output[i].start_component;
2161 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2162 i, so->output[i].output_buffer,
2163 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2164 so->output[i].register_index,
2165 mask & 1 ? "x" : "",
2166 mask & 2 ? "y" : "",
2167 mask & 4 ? "z" : "",
2168 mask & 8 ? "w" : "");
2169 }
2170 }
2171
2172 static void emit_streamout_output(struct si_shader_context *ctx,
2173 LLVMValueRef const *so_buffers,
2174 LLVMValueRef const *so_write_offsets,
2175 struct pipe_stream_output *stream_out,
2176 struct si_shader_output_values *shader_out)
2177 {
2178 struct gallivm_state *gallivm = &ctx->gallivm;
2179 LLVMBuilderRef builder = gallivm->builder;
2180 unsigned buf_idx = stream_out->output_buffer;
2181 unsigned start = stream_out->start_component;
2182 unsigned num_comps = stream_out->num_components;
2183 LLVMValueRef out[4];
2184
2185 assert(num_comps && num_comps <= 4);
2186 if (!num_comps || num_comps > 4)
2187 return;
2188
2189 /* Load the output as int. */
2190 for (int j = 0; j < num_comps; j++) {
2191 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2192
2193 out[j] = LLVMBuildBitCast(builder,
2194 shader_out->values[start + j],
2195 ctx->i32, "");
2196 }
2197
2198 /* Pack the output. */
2199 LLVMValueRef vdata = NULL;
2200
2201 switch (num_comps) {
2202 case 1: /* as i32 */
2203 vdata = out[0];
2204 break;
2205 case 2: /* as v2i32 */
2206 case 3: /* as v4i32 (aligned to 4) */
2207 case 4: /* as v4i32 */
2208 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2209 for (int j = 0; j < num_comps; j++) {
2210 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2211 LLVMConstInt(ctx->i32, j, 0), "");
2212 }
2213 break;
2214 }
2215
2216 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2217 vdata, num_comps,
2218 so_write_offsets[buf_idx],
2219 ctx->i32_0,
2220 stream_out->dst_offset * 4, 1, 1, true, false);
2221 }
2222
2223 /**
2224 * Write streamout data to buffers for vertex stream @p stream (different
2225 * vertex streams can occur for GS copy shaders).
2226 */
2227 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2228 struct si_shader_output_values *outputs,
2229 unsigned noutput, unsigned stream)
2230 {
2231 struct si_shader_selector *sel = ctx->shader->selector;
2232 struct pipe_stream_output_info *so = &sel->so;
2233 struct gallivm_state *gallivm = &ctx->gallivm;
2234 LLVMBuilderRef builder = gallivm->builder;
2235 int i;
2236 struct lp_build_if_state if_ctx;
2237
2238 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2239 LLVMValueRef so_vtx_count =
2240 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2241
2242 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2243
2244 /* can_emit = tid < so_vtx_count; */
2245 LLVMValueRef can_emit =
2246 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2247
2248 /* Emit the streamout code conditionally. This actually avoids
2249 * out-of-bounds buffer access. The hw tells us via the SGPR
2250 * (so_vtx_count) which threads are allowed to emit streamout data. */
2251 lp_build_if(&if_ctx, gallivm, can_emit);
2252 {
2253 /* The buffer offset is computed as follows:
2254 * ByteOffset = streamout_offset[buffer_id]*4 +
2255 * (streamout_write_index + thread_id)*stride[buffer_id] +
2256 * attrib_offset
2257 */
2258
2259 LLVMValueRef so_write_index =
2260 LLVMGetParam(ctx->main_fn,
2261 ctx->param_streamout_write_index);
2262
2263 /* Compute (streamout_write_index + thread_id). */
2264 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2265
2266 /* Load the descriptor and compute the write offset for each
2267 * enabled buffer. */
2268 LLVMValueRef so_write_offset[4] = {};
2269 LLVMValueRef so_buffers[4];
2270 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2271 ctx->param_rw_buffers);
2272
2273 for (i = 0; i < 4; i++) {
2274 if (!so->stride[i])
2275 continue;
2276
2277 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2278 SI_VS_STREAMOUT_BUF0 + i, 0);
2279
2280 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2281
2282 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2283 ctx->param_streamout_offset[i]);
2284 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2285
2286 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2287 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2288 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2289 }
2290
2291 /* Write streamout data. */
2292 for (i = 0; i < so->num_outputs; i++) {
2293 unsigned reg = so->output[i].register_index;
2294
2295 if (reg >= noutput)
2296 continue;
2297
2298 if (stream != so->output[i].stream)
2299 continue;
2300
2301 emit_streamout_output(ctx, so_buffers, so_write_offset,
2302 &so->output[i], &outputs[reg]);
2303 }
2304 }
2305 lp_build_endif(&if_ctx);
2306 }
2307
2308 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2309 LLVMValueRef *values)
2310 {
2311 struct ac_export_args args;
2312
2313 si_llvm_init_export_args(&ctx->bld_base, values,
2314 V_008DFC_SQ_EXP_PARAM + index, &args);
2315 ac_build_export(&ctx->ac, &args);
2316 }
2317
2318 static void si_build_param_exports(struct si_shader_context *ctx,
2319 struct si_shader_output_values *outputs,
2320 unsigned noutput)
2321 {
2322 struct si_shader *shader = ctx->shader;
2323 unsigned param_count = 0;
2324
2325 for (unsigned i = 0; i < noutput; i++) {
2326 unsigned semantic_name = outputs[i].semantic_name;
2327 unsigned semantic_index = outputs[i].semantic_index;
2328
2329 if (outputs[i].vertex_stream[0] != 0 &&
2330 outputs[i].vertex_stream[1] != 0 &&
2331 outputs[i].vertex_stream[2] != 0 &&
2332 outputs[i].vertex_stream[3] != 0)
2333 continue;
2334
2335 switch (semantic_name) {
2336 case TGSI_SEMANTIC_LAYER:
2337 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2338 case TGSI_SEMANTIC_CLIPDIST:
2339 case TGSI_SEMANTIC_COLOR:
2340 case TGSI_SEMANTIC_BCOLOR:
2341 case TGSI_SEMANTIC_PRIMID:
2342 case TGSI_SEMANTIC_FOG:
2343 case TGSI_SEMANTIC_TEXCOORD:
2344 case TGSI_SEMANTIC_GENERIC:
2345 break;
2346 default:
2347 continue;
2348 }
2349
2350 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2351 semantic_index < SI_MAX_IO_GENERIC) &&
2352 shader->key.opt.kill_outputs &
2353 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2354 continue;
2355
2356 si_export_param(ctx, param_count, outputs[i].values);
2357
2358 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2359 shader->info.vs_output_param_offset[i] = param_count++;
2360 }
2361
2362 shader->info.nr_param_exports = param_count;
2363 }
2364
2365 /* Generate export instructions for hardware VS shader stage */
2366 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2367 struct si_shader_output_values *outputs,
2368 unsigned noutput)
2369 {
2370 struct si_shader_context *ctx = si_shader_context(bld_base);
2371 struct si_shader *shader = ctx->shader;
2372 struct lp_build_context *base = &bld_base->base;
2373 struct ac_export_args pos_args[4] = {};
2374 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2375 unsigned pos_idx;
2376 int i;
2377
2378 /* Build position exports. */
2379 for (i = 0; i < noutput; i++) {
2380 switch (outputs[i].semantic_name) {
2381 case TGSI_SEMANTIC_POSITION:
2382 si_llvm_init_export_args(bld_base, outputs[i].values,
2383 V_008DFC_SQ_EXP_POS, &pos_args[0]);
2384 break;
2385 case TGSI_SEMANTIC_PSIZE:
2386 psize_value = outputs[i].values[0];
2387 break;
2388 case TGSI_SEMANTIC_LAYER:
2389 layer_value = outputs[i].values[0];
2390 break;
2391 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2392 viewport_index_value = outputs[i].values[0];
2393 break;
2394 case TGSI_SEMANTIC_EDGEFLAG:
2395 edgeflag_value = outputs[i].values[0];
2396 break;
2397 case TGSI_SEMANTIC_CLIPDIST:
2398 if (!shader->key.opt.clip_disable) {
2399 unsigned index = 2 + outputs[i].semantic_index;
2400 si_llvm_init_export_args(bld_base, outputs[i].values,
2401 V_008DFC_SQ_EXP_POS + index,
2402 &pos_args[index]);
2403 }
2404 break;
2405 case TGSI_SEMANTIC_CLIPVERTEX:
2406 if (!shader->key.opt.clip_disable) {
2407 si_llvm_emit_clipvertex(bld_base, pos_args,
2408 outputs[i].values);
2409 }
2410 break;
2411 }
2412 }
2413
2414 /* We need to add the position output manually if it's missing. */
2415 if (!pos_args[0].out[0]) {
2416 pos_args[0].enabled_channels = 0xf; /* writemask */
2417 pos_args[0].valid_mask = 0; /* EXEC mask */
2418 pos_args[0].done = 0; /* last export? */
2419 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2420 pos_args[0].compr = 0; /* COMPR flag */
2421 pos_args[0].out[0] = base->zero; /* X */
2422 pos_args[0].out[1] = base->zero; /* Y */
2423 pos_args[0].out[2] = base->zero; /* Z */
2424 pos_args[0].out[3] = base->one; /* W */
2425 }
2426
2427 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2428 if (shader->selector->info.writes_psize ||
2429 shader->selector->info.writes_edgeflag ||
2430 shader->selector->info.writes_viewport_index ||
2431 shader->selector->info.writes_layer) {
2432 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2433 (shader->selector->info.writes_edgeflag << 1) |
2434 (shader->selector->info.writes_layer << 2);
2435
2436 pos_args[1].valid_mask = 0; /* EXEC mask */
2437 pos_args[1].done = 0; /* last export? */
2438 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2439 pos_args[1].compr = 0; /* COMPR flag */
2440 pos_args[1].out[0] = base->zero; /* X */
2441 pos_args[1].out[1] = base->zero; /* Y */
2442 pos_args[1].out[2] = base->zero; /* Z */
2443 pos_args[1].out[3] = base->zero; /* W */
2444
2445 if (shader->selector->info.writes_psize)
2446 pos_args[1].out[0] = psize_value;
2447
2448 if (shader->selector->info.writes_edgeflag) {
2449 /* The output is a float, but the hw expects an integer
2450 * with the first bit containing the edge flag. */
2451 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2452 edgeflag_value,
2453 ctx->i32, "");
2454 edgeflag_value = lp_build_min(&bld_base->int_bld,
2455 edgeflag_value,
2456 ctx->i32_1);
2457
2458 /* The LLVM intrinsic expects a float. */
2459 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2460 edgeflag_value,
2461 ctx->f32, "");
2462 }
2463
2464 if (ctx->screen->b.chip_class >= GFX9) {
2465 /* GFX9 has the layer in out.z[10:0] and the viewport
2466 * index in out.z[19:16].
2467 */
2468 if (shader->selector->info.writes_layer)
2469 pos_args[1].out[2] = layer_value;
2470
2471 if (shader->selector->info.writes_viewport_index) {
2472 LLVMValueRef v = viewport_index_value;
2473
2474 v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
2475 v = LLVMBuildShl(ctx->gallivm.builder, v,
2476 LLVMConstInt(ctx->i32, 16, 0), "");
2477 v = LLVMBuildOr(ctx->gallivm.builder, v,
2478 bitcast(bld_base, TGSI_TYPE_UNSIGNED,
2479 pos_args[1].out[2]), "");
2480 pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
2481 pos_args[1].enabled_channels |= 1 << 2;
2482 }
2483 } else {
2484 if (shader->selector->info.writes_layer)
2485 pos_args[1].out[2] = layer_value;
2486
2487 if (shader->selector->info.writes_viewport_index) {
2488 pos_args[1].out[3] = viewport_index_value;
2489 pos_args[1].enabled_channels |= 1 << 3;
2490 }
2491 }
2492 }
2493
2494 for (i = 0; i < 4; i++)
2495 if (pos_args[i].out[0])
2496 shader->info.nr_pos_exports++;
2497
2498 pos_idx = 0;
2499 for (i = 0; i < 4; i++) {
2500 if (!pos_args[i].out[0])
2501 continue;
2502
2503 /* Specify the target we are exporting */
2504 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2505
2506 if (pos_idx == shader->info.nr_pos_exports)
2507 /* Specify that this is the last export */
2508 pos_args[i].done = 1;
2509
2510 ac_build_export(&ctx->ac, &pos_args[i]);
2511 }
2512
2513 /* Build parameter exports. */
2514 si_build_param_exports(ctx, outputs, noutput);
2515 }
2516
2517 /**
2518 * Forward all outputs from the vertex shader to the TES. This is only used
2519 * for the fixed function TCS.
2520 */
2521 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2522 {
2523 struct si_shader_context *ctx = si_shader_context(bld_base);
2524 struct gallivm_state *gallivm = &ctx->gallivm;
2525 LLVMValueRef invocation_id, buffer, buffer_offset;
2526 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2527 uint64_t inputs;
2528
2529 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2530 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2531 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2532
2533 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2534 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2535 lds_vertex_stride, "");
2536 lds_base = get_tcs_in_current_patch_offset(ctx);
2537 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2538
2539 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2540 while (inputs) {
2541 unsigned i = u_bit_scan64(&inputs);
2542
2543 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2544 LLVMConstInt(ctx->i32, 4 * i, 0),
2545 "");
2546
2547 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2548 get_rel_patch_id(ctx),
2549 invocation_id,
2550 LLVMConstInt(ctx->i32, i, 0));
2551
2552 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2553 lds_ptr);
2554
2555 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2556 buffer_offset, 0, 1, 0, true, false);
2557 }
2558 }
2559
2560 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2561 LLVMValueRef rel_patch_id,
2562 LLVMValueRef invocation_id,
2563 LLVMValueRef tcs_out_current_patch_data_offset)
2564 {
2565 struct si_shader_context *ctx = si_shader_context(bld_base);
2566 struct gallivm_state *gallivm = &ctx->gallivm;
2567 struct si_shader *shader = ctx->shader;
2568 unsigned tess_inner_index, tess_outer_index;
2569 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2570 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2571 unsigned stride, outer_comps, inner_comps, i, offset;
2572 struct lp_build_if_state if_ctx, inner_if_ctx;
2573
2574 si_llvm_emit_barrier(NULL, bld_base, NULL);
2575
2576 /* Do this only for invocation 0, because the tess levels are per-patch,
2577 * not per-vertex.
2578 *
2579 * This can't jump, because invocation 0 executes this. It should
2580 * at least mask out the loads and stores for other invocations.
2581 */
2582 lp_build_if(&if_ctx, gallivm,
2583 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2584 invocation_id, ctx->i32_0, ""));
2585
2586 /* Determine the layout of one tess factor element in the buffer. */
2587 switch (shader->key.part.tcs.epilog.prim_mode) {
2588 case PIPE_PRIM_LINES:
2589 stride = 2; /* 2 dwords, 1 vec2 store */
2590 outer_comps = 2;
2591 inner_comps = 0;
2592 break;
2593 case PIPE_PRIM_TRIANGLES:
2594 stride = 4; /* 4 dwords, 1 vec4 store */
2595 outer_comps = 3;
2596 inner_comps = 1;
2597 break;
2598 case PIPE_PRIM_QUADS:
2599 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2600 outer_comps = 4;
2601 inner_comps = 2;
2602 break;
2603 default:
2604 assert(0);
2605 return;
2606 }
2607
2608 /* Load tess_inner and tess_outer from LDS.
2609 * Any invocation can write them, so we can't get them from a temporary.
2610 */
2611 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
2612 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
2613
2614 lds_base = tcs_out_current_patch_data_offset;
2615 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2616 LLVMConstInt(ctx->i32,
2617 tess_inner_index * 4, 0), "");
2618 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2619 LLVMConstInt(ctx->i32,
2620 tess_outer_index * 4, 0), "");
2621
2622 for (i = 0; i < 4; i++) {
2623 inner[i] = LLVMGetUndef(ctx->i32);
2624 outer[i] = LLVMGetUndef(ctx->i32);
2625 }
2626
2627 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2628 /* For isolines, the hardware expects tess factors in the
2629 * reverse order from what GLSL / TGSI specify.
2630 */
2631 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2632 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2633 } else {
2634 for (i = 0; i < outer_comps; i++) {
2635 outer[i] = out[i] =
2636 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2637 }
2638 for (i = 0; i < inner_comps; i++) {
2639 inner[i] = out[outer_comps+i] =
2640 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2641 }
2642 }
2643
2644 /* Convert the outputs to vectors for stores. */
2645 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2646 vec1 = NULL;
2647
2648 if (stride > 4)
2649 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2650
2651 /* Get the buffer. */
2652 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2653
2654 /* Get the offset. */
2655 tf_base = LLVMGetParam(ctx->main_fn,
2656 ctx->param_tcs_factor_offset);
2657 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2658 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2659
2660 lp_build_if(&inner_if_ctx, gallivm,
2661 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2662 rel_patch_id, ctx->i32_0, ""));
2663
2664 /* Store the dynamic HS control word. */
2665 offset = 0;
2666 if (ctx->screen->b.chip_class <= VI) {
2667 ac_build_buffer_store_dword(&ctx->ac, buffer,
2668 LLVMConstInt(ctx->i32, 0x80000000, 0),
2669 1, ctx->i32_0, tf_base,
2670 offset, 1, 0, true, false);
2671 offset += 4;
2672 }
2673
2674 lp_build_endif(&inner_if_ctx);
2675
2676 /* Store the tessellation factors. */
2677 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2678 MIN2(stride, 4), byteoffset, tf_base,
2679 offset, 1, 0, true, false);
2680 offset += 16;
2681 if (vec1)
2682 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2683 stride - 4, byteoffset, tf_base,
2684 offset, 1, 0, true, false);
2685
2686 /* Store the tess factors into the offchip buffer if TES reads them. */
2687 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2688 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2689 LLVMValueRef tf_inner_offset;
2690 unsigned param_outer, param_inner;
2691
2692 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2693 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2694
2695 param_outer = si_shader_io_get_unique_index_patch(
2696 TGSI_SEMANTIC_TESSOUTER, 0);
2697 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2698 LLVMConstInt(ctx->i32, param_outer, 0));
2699
2700 outer_vec = lp_build_gather_values(gallivm, outer,
2701 util_next_power_of_two(outer_comps));
2702
2703 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2704 outer_comps, tf_outer_offset,
2705 base, 0, 1, 0, true, false);
2706 if (inner_comps) {
2707 param_inner = si_shader_io_get_unique_index_patch(
2708 TGSI_SEMANTIC_TESSINNER, 0);
2709 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2710 LLVMConstInt(ctx->i32, param_inner, 0));
2711
2712 inner_vec = inner_comps == 1 ? inner[0] :
2713 lp_build_gather_values(gallivm, inner, inner_comps);
2714 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2715 inner_comps, tf_inner_offset,
2716 base, 0, 1, 0, true, false);
2717 }
2718 }
2719
2720 lp_build_endif(&if_ctx);
2721 }
2722
2723 static LLVMValueRef
2724 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2725 unsigned param, unsigned return_index)
2726 {
2727 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2728 LLVMGetParam(ctx->main_fn, param),
2729 return_index, "");
2730 }
2731
2732 static LLVMValueRef
2733 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2734 unsigned param, unsigned return_index)
2735 {
2736 LLVMBuilderRef builder = ctx->gallivm.builder;
2737 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2738
2739 return LLVMBuildInsertValue(builder, ret,
2740 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2741 return_index, "");
2742 }
2743
2744 static LLVMValueRef
2745 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2746 unsigned param, unsigned return_index)
2747 {
2748 LLVMBuilderRef builder = ctx->gallivm.builder;
2749 LLVMValueRef ptr, lo, hi;
2750
2751 ptr = LLVMGetParam(ctx->main_fn, param);
2752 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2753 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2754 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2755 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2756 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2757 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2758 }
2759
2760 /* This only writes the tessellation factor levels. */
2761 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2762 {
2763 struct si_shader_context *ctx = si_shader_context(bld_base);
2764 LLVMBuilderRef builder = ctx->gallivm.builder;
2765 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2766
2767 si_copy_tcs_inputs(bld_base);
2768
2769 rel_patch_id = get_rel_patch_id(ctx);
2770 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2771 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2772
2773 if (ctx->screen->b.chip_class >= GFX9) {
2774 LLVMBasicBlockRef blocks[2] = {
2775 LLVMGetInsertBlock(builder),
2776 ctx->merged_wrap_if_state.entry_block
2777 };
2778 LLVMValueRef values[2];
2779
2780 lp_build_endif(&ctx->merged_wrap_if_state);
2781
2782 values[0] = rel_patch_id;
2783 values[1] = LLVMGetUndef(ctx->i32);
2784 rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2785
2786 values[0] = tf_lds_offset;
2787 values[1] = LLVMGetUndef(ctx->i32);
2788 tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2789
2790 values[0] = invocation_id;
2791 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
2792 invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2793 }
2794
2795 /* Return epilog parameters from this function. */
2796 LLVMValueRef ret = ctx->return_value;
2797 unsigned vgpr;
2798
2799 if (ctx->screen->b.chip_class >= GFX9) {
2800 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2801 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2802 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2803 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2804 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2805 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2806 /* Tess offchip and tess factor offsets are at the beginning. */
2807 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2808 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2809 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
2810 } else {
2811 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2812 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2813 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2814 GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2815 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2816 GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
2817 /* Tess offchip and tess factor offsets are after user SGPRs. */
2818 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
2819 GFX6_TCS_NUM_USER_SGPR);
2820 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
2821 GFX6_TCS_NUM_USER_SGPR + 1);
2822 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2823 }
2824
2825 /* VGPRs */
2826 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2827 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2828 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2829
2830 /* Leave a hole corresponding to the two input VGPRs. This ensures that
2831 * the invocation_id output does not alias the param_tcs_rel_ids input,
2832 * which saves a V_MOV on gfx9.
2833 */
2834 vgpr += 2;
2835
2836 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2837 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2838 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2839 ctx->return_value = ret;
2840 }
2841
2842 /* Pass TCS inputs from LS to TCS on GFX9. */
2843 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2844 {
2845 LLVMValueRef ret = ctx->return_value;
2846
2847 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2848 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2849 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2850 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2851 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2852
2853 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2854 8 + SI_SGPR_VS_STATE_BITS);
2855 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2856 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2857 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2858 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2859 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2860 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2861 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2862 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2863 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2864 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2865
2866 unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
2867 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2868 8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
2869 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2870 8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);
2871
2872 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2873 ret = si_insert_input_ret_float(ctx, ret,
2874 ctx->param_tcs_patch_id, vgpr++);
2875 ret = si_insert_input_ret_float(ctx, ret,
2876 ctx->param_tcs_rel_ids, vgpr++);
2877 ctx->return_value = ret;
2878 }
2879
2880 /* Pass GS inputs from ES to GS on GFX9. */
2881 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2882 {
2883 LLVMValueRef ret = ctx->return_value;
2884
2885 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2886 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2887 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2888
2889 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2890
2891 unsigned desc_param = ctx->param_vs_state_bits + 1;
2892 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2893 8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
2894 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2895 8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);
2896
2897 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2898 for (unsigned i = 0; i < 5; i++) {
2899 unsigned param = ctx->param_gs_vtx01_offset + i;
2900 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2901 }
2902 ctx->return_value = ret;
2903 }
2904
2905 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2906 {
2907 struct si_shader_context *ctx = si_shader_context(bld_base);
2908 struct si_shader *shader = ctx->shader;
2909 struct tgsi_shader_info *info = &shader->selector->info;
2910 struct gallivm_state *gallivm = &ctx->gallivm;
2911 unsigned i, chan;
2912 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2913 ctx->param_rel_auto_id);
2914 LLVMValueRef vertex_dw_stride =
2915 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2916 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2917 vertex_dw_stride, "");
2918
2919 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2920 * its inputs from it. */
2921 for (i = 0; i < info->num_outputs; i++) {
2922 LLVMValueRef *out_ptr = ctx->outputs[i];
2923 unsigned name = info->output_semantic_name[i];
2924 unsigned index = info->output_semantic_index[i];
2925
2926 /* The ARB_shader_viewport_layer_array spec contains the
2927 * following issue:
2928 *
2929 * 2) What happens if gl_ViewportIndex or gl_Layer is
2930 * written in the vertex shader and a geometry shader is
2931 * present?
2932 *
2933 * RESOLVED: The value written by the last vertex processing
2934 * stage is used. If the last vertex processing stage
2935 * (vertex, tessellation evaluation or geometry) does not
2936 * statically assign to gl_ViewportIndex or gl_Layer, index
2937 * or layer zero is assumed.
2938 *
2939 * So writes to those outputs in VS-as-LS are simply ignored.
2940 */
2941 if (name == TGSI_SEMANTIC_LAYER ||
2942 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2943 continue;
2944
2945 int param = si_shader_io_get_unique_index(name, index);
2946 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2947 LLVMConstInt(ctx->i32, param * 4, 0), "");
2948
2949 for (chan = 0; chan < 4; chan++) {
2950 lds_store(bld_base, chan, dw_addr,
2951 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2952 }
2953 }
2954
2955 if (ctx->screen->b.chip_class >= GFX9)
2956 si_set_ls_return_value_for_tcs(ctx);
2957 }
2958
2959 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2960 {
2961 struct si_shader_context *ctx = si_shader_context(bld_base);
2962 struct gallivm_state *gallivm = &ctx->gallivm;
2963 struct si_shader *es = ctx->shader;
2964 struct tgsi_shader_info *info = &es->selector->info;
2965 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2966 ctx->param_es2gs_offset);
2967 LLVMValueRef lds_base = NULL;
2968 unsigned chan;
2969 int i;
2970
2971 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2972 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2973 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
2974 LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
2975 vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
2976 LLVMBuildMul(gallivm->builder, wave_idx,
2977 LLVMConstInt(ctx->i32, 64, false), ""), "");
2978 lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
2979 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2980 }
2981
2982 for (i = 0; i < info->num_outputs; i++) {
2983 LLVMValueRef *out_ptr = ctx->outputs[i];
2984 int param;
2985
2986 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2987 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2988 continue;
2989
2990 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2991 info->output_semantic_index[i]);
2992
2993 for (chan = 0; chan < 4; chan++) {
2994 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2995 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2996
2997 /* GFX9 has the ESGS ring in LDS. */
2998 if (ctx->screen->b.chip_class >= GFX9) {
2999 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
3000 continue;
3001 }
3002
3003 ac_build_buffer_store_dword(&ctx->ac,
3004 ctx->esgs_ring,
3005 out_val, 1, NULL, soffset,
3006 (4 * param + chan) * 4,
3007 1, 1, true, true);
3008 }
3009 }
3010
3011 if (ctx->screen->b.chip_class >= GFX9)
3012 si_set_es_return_value_for_gs(ctx);
3013 }
3014
3015 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3016 {
3017 if (ctx->screen->b.chip_class >= GFX9)
3018 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3019 else
3020 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3021 }
3022
3023 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
3024 {
3025 struct si_shader_context *ctx = si_shader_context(bld_base);
3026
3027 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3028 si_get_gs_wave_id(ctx));
3029
3030 if (ctx->screen->b.chip_class >= GFX9)
3031 lp_build_endif(&ctx->merged_wrap_if_state);
3032 }
3033
3034 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
3035 {
3036 struct si_shader_context *ctx = si_shader_context(bld_base);
3037 struct gallivm_state *gallivm = &ctx->gallivm;
3038 struct tgsi_shader_info *info = &ctx->shader->selector->info;
3039 struct si_shader_output_values *outputs = NULL;
3040 int i,j;
3041
3042 assert(!ctx->shader->is_gs_copy_shader);
3043
3044 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3045
3046 /* Vertex color clamping.
3047 *
3048 * This uses a state constant loaded in a user data SGPR and
3049 * an IF statement is added that clamps all colors if the constant
3050 * is true.
3051 */
3052 if (ctx->type == PIPE_SHADER_VERTEX) {
3053 struct lp_build_if_state if_ctx;
3054 LLVMValueRef cond = NULL;
3055 LLVMValueRef addr, val;
3056
3057 for (i = 0; i < info->num_outputs; i++) {
3058 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3059 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3060 continue;
3061
3062 /* We've found a color. */
3063 if (!cond) {
3064 /* The state is in the first bit of the user SGPR. */
3065 cond = LLVMGetParam(ctx->main_fn,
3066 ctx->param_vs_state_bits);
3067 cond = LLVMBuildTrunc(gallivm->builder, cond,
3068 ctx->i1, "");
3069 lp_build_if(&if_ctx, gallivm, cond);
3070 }
3071
3072 for (j = 0; j < 4; j++) {
3073 addr = ctx->outputs[i][j];
3074 val = LLVMBuildLoad(gallivm->builder, addr, "");
3075 val = ac_build_clamp(&ctx->ac, val);
3076 LLVMBuildStore(gallivm->builder, val, addr);
3077 }
3078 }
3079
3080 if (cond)
3081 lp_build_endif(&if_ctx);
3082 }
3083
3084 for (i = 0; i < info->num_outputs; i++) {
3085 outputs[i].semantic_name = info->output_semantic_name[i];
3086 outputs[i].semantic_index = info->output_semantic_index[i];
3087
3088 for (j = 0; j < 4; j++) {
3089 outputs[i].values[j] =
3090 LLVMBuildLoad(gallivm->builder,
3091 ctx->outputs[i][j],
3092 "");
3093 outputs[i].vertex_stream[j] =
3094 (info->output_streams[i] >> (2 * j)) & 3;
3095 }
3096 }
3097
3098 if (ctx->shader->selector->so.num_outputs)
3099 si_llvm_emit_streamout(ctx, outputs, i, 0);
3100
3101 /* Export PrimitiveID. */
3102 if (ctx->shader->key.mono.u.vs_export_prim_id) {
3103 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3104 outputs[i].semantic_index = 0;
3105 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
3106 get_primitive_id(bld_base, 0));
3107 for (j = 1; j < 4; j++)
3108 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3109
3110 memset(outputs[i].vertex_stream, 0,
3111 sizeof(outputs[i].vertex_stream));
3112 i++;
3113 }
3114
3115 si_llvm_export_vs(bld_base, outputs, i);
3116 FREE(outputs);
3117 }
3118
3119 struct si_ps_exports {
3120 unsigned num;
3121 struct ac_export_args args[10];
3122 };
3123
3124 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3125 bool writes_samplemask)
3126 {
3127 if (writes_z) {
3128 /* Z needs 32 bits. */
3129 if (writes_samplemask)
3130 return V_028710_SPI_SHADER_32_ABGR;
3131 else if (writes_stencil)
3132 return V_028710_SPI_SHADER_32_GR;
3133 else
3134 return V_028710_SPI_SHADER_32_R;
3135 } else if (writes_stencil || writes_samplemask) {
3136 /* Both stencil and sample mask need only 16 bits. */
3137 return V_028710_SPI_SHADER_UINT16_ABGR;
3138 } else {
3139 return V_028710_SPI_SHADER_ZERO;
3140 }
3141 }
3142
3143 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3144 LLVMValueRef depth, LLVMValueRef stencil,
3145 LLVMValueRef samplemask, struct si_ps_exports *exp)
3146 {
3147 struct si_shader_context *ctx = si_shader_context(bld_base);
3148 struct lp_build_context *base = &bld_base->base;
3149 struct ac_export_args args;
3150 unsigned mask = 0;
3151 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3152 stencil != NULL,
3153 samplemask != NULL);
3154
3155 assert(depth || stencil || samplemask);
3156
3157 args.valid_mask = 1; /* whether the EXEC mask is valid */
3158 args.done = 1; /* DONE bit */
3159
3160 /* Specify the target we are exporting */
3161 args.target = V_008DFC_SQ_EXP_MRTZ;
3162
3163 args.compr = 0; /* COMP flag */
3164 args.out[0] = base->undef; /* R, depth */
3165 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3166 args.out[2] = base->undef; /* B, sample mask */
3167 args.out[3] = base->undef; /* A, alpha to mask */
3168
3169 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3170 assert(!depth);
3171 args.compr = 1; /* COMPR flag */
3172
3173 if (stencil) {
3174 /* Stencil should be in X[23:16]. */
3175 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3176 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3177 LLVMConstInt(ctx->i32, 16, 0), "");
3178 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3179 mask |= 0x3;
3180 }
3181 if (samplemask) {
3182 /* SampleMask should be in Y[15:0]. */
3183 args.out[1] = samplemask;
3184 mask |= 0xc;
3185 }
3186 } else {
3187 if (depth) {
3188 args.out[0] = depth;
3189 mask |= 0x1;
3190 }
3191 if (stencil) {
3192 args.out[1] = stencil;
3193 mask |= 0x2;
3194 }
3195 if (samplemask) {
3196 args.out[2] = samplemask;
3197 mask |= 0x4;
3198 }
3199 }
3200
3201 /* SI (except OLAND and HAINAN) has a bug that it only looks
3202 * at the X writemask component. */
3203 if (ctx->screen->b.chip_class == SI &&
3204 ctx->screen->b.family != CHIP_OLAND &&
3205 ctx->screen->b.family != CHIP_HAINAN)
3206 mask |= 0x1;
3207
3208 /* Specify which components to enable */
3209 args.enabled_channels = mask;
3210
3211 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3212 }
3213
3214 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3215 LLVMValueRef *color, unsigned index,
3216 unsigned samplemask_param,
3217 bool is_last, struct si_ps_exports *exp)
3218 {
3219 struct si_shader_context *ctx = si_shader_context(bld_base);
3220 struct lp_build_context *base = &bld_base->base;
3221 int i;
3222
3223 /* Clamp color */
3224 if (ctx->shader->key.part.ps.epilog.clamp_color)
3225 for (i = 0; i < 4; i++)
3226 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3227
3228 /* Alpha to one */
3229 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3230 color[3] = base->one;
3231
3232 /* Alpha test */
3233 if (index == 0 &&
3234 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3235 si_alpha_test(bld_base, color[3]);
3236
3237 /* Line & polygon smoothing */
3238 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3239 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3240 samplemask_param);
3241
3242 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3243 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3244 struct ac_export_args args[8];
3245 int c, last = -1;
3246
3247 /* Get the export arguments, also find out what the last one is. */
3248 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3249 si_llvm_init_export_args(bld_base, color,
3250 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3251 if (args[c].enabled_channels)
3252 last = c;
3253 }
3254
3255 /* Emit all exports. */
3256 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3257 if (is_last && last == c) {
3258 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3259 args[c].done = 1; /* DONE bit */
3260 } else if (!args[c].enabled_channels)
3261 continue; /* unnecessary NULL export */
3262
3263 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3264 }
3265 } else {
3266 struct ac_export_args args;
3267
3268 /* Export */
3269 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3270 &args);
3271 if (is_last) {
3272 args.valid_mask = 1; /* whether the EXEC mask is valid */
3273 args.done = 1; /* DONE bit */
3274 } else if (!args.enabled_channels)
3275 return; /* unnecessary NULL export */
3276
3277 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3278 }
3279 }
3280
3281 static void si_emit_ps_exports(struct si_shader_context *ctx,
3282 struct si_ps_exports *exp)
3283 {
3284 for (unsigned i = 0; i < exp->num; i++)
3285 ac_build_export(&ctx->ac, &exp->args[i]);
3286 }
3287
3288 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3289 {
3290 struct si_shader_context *ctx = si_shader_context(bld_base);
3291 struct lp_build_context *base = &bld_base->base;
3292 struct ac_export_args args;
3293
3294 args.enabled_channels = 0x0; /* enabled channels */
3295 args.valid_mask = 1; /* whether the EXEC mask is valid */
3296 args.done = 1; /* DONE bit */
3297 args.target = V_008DFC_SQ_EXP_NULL;
3298 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3299 args.out[0] = base->undef; /* R */
3300 args.out[1] = base->undef; /* G */
3301 args.out[2] = base->undef; /* B */
3302 args.out[3] = base->undef; /* A */
3303
3304 ac_build_export(&ctx->ac, &args);
3305 }
3306
3307 /**
3308 * Return PS outputs in this order:
3309 *
3310 * v[0:3] = color0.xyzw
3311 * v[4:7] = color1.xyzw
3312 * ...
3313 * vN+0 = Depth
3314 * vN+1 = Stencil
3315 * vN+2 = SampleMask
3316 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3317 *
3318 * The alpha-ref SGPR is returned via its original location.
3319 */
3320 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3321 {
3322 struct si_shader_context *ctx = si_shader_context(bld_base);
3323 struct si_shader *shader = ctx->shader;
3324 struct tgsi_shader_info *info = &shader->selector->info;
3325 LLVMBuilderRef builder = ctx->gallivm.builder;
3326 unsigned i, j, first_vgpr, vgpr;
3327
3328 LLVMValueRef color[8][4] = {};
3329 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3330 LLVMValueRef ret;
3331
3332 if (ctx->postponed_kill)
3333 ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3334
3335 /* Read the output values. */
3336 for (i = 0; i < info->num_outputs; i++) {
3337 unsigned semantic_name = info->output_semantic_name[i];
3338 unsigned semantic_index = info->output_semantic_index[i];
3339
3340 switch (semantic_name) {
3341 case TGSI_SEMANTIC_COLOR:
3342 assert(semantic_index < 8);
3343 for (j = 0; j < 4; j++) {
3344 LLVMValueRef ptr = ctx->outputs[i][j];
3345 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3346 color[semantic_index][j] = result;
3347 }
3348 break;
3349 case TGSI_SEMANTIC_POSITION:
3350 depth = LLVMBuildLoad(builder,
3351 ctx->outputs[i][2], "");
3352 break;
3353 case TGSI_SEMANTIC_STENCIL:
3354 stencil = LLVMBuildLoad(builder,
3355 ctx->outputs[i][1], "");
3356 break;
3357 case TGSI_SEMANTIC_SAMPLEMASK:
3358 samplemask = LLVMBuildLoad(builder,
3359 ctx->outputs[i][0], "");
3360 break;
3361 default:
3362 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3363 semantic_name);
3364 }
3365 }
3366
3367 /* Fill the return structure. */
3368 ret = ctx->return_value;
3369
3370 /* Set SGPRs. */
3371 ret = LLVMBuildInsertValue(builder, ret,
3372 bitcast(bld_base, TGSI_TYPE_SIGNED,
3373 LLVMGetParam(ctx->main_fn,
3374 SI_PARAM_ALPHA_REF)),
3375 SI_SGPR_ALPHA_REF, "");
3376
3377 /* Set VGPRs */
3378 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3379 for (i = 0; i < ARRAY_SIZE(color); i++) {
3380 if (!color[i][0])
3381 continue;
3382
3383 for (j = 0; j < 4; j++)
3384 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3385 }
3386 if (depth)
3387 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3388 if (stencil)
3389 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3390 if (samplemask)
3391 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3392
3393 /* Add the input sample mask for smoothing at the end. */
3394 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3395 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3396 ret = LLVMBuildInsertValue(builder, ret,
3397 LLVMGetParam(ctx->main_fn,
3398 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3399
3400 ctx->return_value = ret;
3401 }
3402
3403 /* Prevent optimizations (at least of memory accesses) across the current
3404 * point in the program by emitting empty inline assembly that is marked as
3405 * having side effects.
3406 *
3407 * Optionally, a value can be passed through the inline assembly to prevent
3408 * LLVM from hoisting calls to ReadNone functions.
3409 */
3410 static void emit_optimization_barrier(struct si_shader_context *ctx,
3411 LLVMValueRef *pvgpr)
3412 {
3413 static int counter = 0;
3414
3415 LLVMBuilderRef builder = ctx->gallivm.builder;
3416 char code[16];
3417
3418 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3419
3420 if (!pvgpr) {
3421 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3422 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3423 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3424 } else {
3425 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3426 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3427 LLVMValueRef vgpr = *pvgpr;
3428 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3429 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3430 LLVMValueRef vgpr0;
3431
3432 assert(vgpr_size % 4 == 0);
3433
3434 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3435 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3436 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3437 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3438 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3439
3440 *pvgpr = vgpr;
3441 }
3442 }
3443
3444 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3445 {
3446 struct gallivm_state *gallivm = &ctx->gallivm;
3447 LLVMBuilderRef builder = gallivm->builder;
3448 LLVMValueRef args[1] = {
3449 LLVMConstInt(ctx->i32, simm16, 0)
3450 };
3451 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3452 ctx->voidt, args, 1, 0);
3453 }
3454
3455 static void membar_emit(
3456 const struct lp_build_tgsi_action *action,
3457 struct lp_build_tgsi_context *bld_base,
3458 struct lp_build_emit_data *emit_data)
3459 {
3460 struct si_shader_context *ctx = si_shader_context(bld_base);
3461 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3462 unsigned flags = LLVMConstIntGetZExtValue(src0);
3463 unsigned waitcnt = NOOP_WAITCNT;
3464
3465 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3466 waitcnt &= VM_CNT & LGKM_CNT;
3467
3468 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3469 TGSI_MEMBAR_SHADER_BUFFER |
3470 TGSI_MEMBAR_SHADER_IMAGE))
3471 waitcnt &= VM_CNT;
3472
3473 if (flags & TGSI_MEMBAR_SHARED)
3474 waitcnt &= LGKM_CNT;
3475
3476 if (waitcnt != NOOP_WAITCNT)
3477 si_emit_waitcnt(ctx, waitcnt);
3478 }
3479
3480 static void clock_emit(
3481 const struct lp_build_tgsi_action *action,
3482 struct lp_build_tgsi_context *bld_base,
3483 struct lp_build_emit_data *emit_data)
3484 {
3485 struct si_shader_context *ctx = si_shader_context(bld_base);
3486 struct gallivm_state *gallivm = &ctx->gallivm;
3487 LLVMValueRef tmp;
3488
3489 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3490 ctx->i64, NULL, 0, 0);
3491 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3492
3493 emit_data->output[0] =
3494 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3495 emit_data->output[1] =
3496 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3497 }
3498
3499 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3500 {
3501 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3502 CONST_ADDR_SPACE);
3503 }
3504
3505 static void si_llvm_emit_ddxy(
3506 const struct lp_build_tgsi_action *action,
3507 struct lp_build_tgsi_context *bld_base,
3508 struct lp_build_emit_data *emit_data)
3509 {
3510 struct si_shader_context *ctx = si_shader_context(bld_base);
3511 struct gallivm_state *gallivm = &ctx->gallivm;
3512 unsigned opcode = emit_data->info->opcode;
3513 LLVMValueRef val;
3514 int idx;
3515 unsigned mask;
3516
3517 if (opcode == TGSI_OPCODE_DDX_FINE)
3518 mask = AC_TID_MASK_LEFT;
3519 else if (opcode == TGSI_OPCODE_DDY_FINE)
3520 mask = AC_TID_MASK_TOP;
3521 else
3522 mask = AC_TID_MASK_TOP_LEFT;
3523
3524 /* for DDX we want to next X pixel, DDY next Y pixel. */
3525 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3526
3527 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
3528 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
3529 mask, idx, ctx->lds, val);
3530 emit_data->output[emit_data->chan] = val;
3531 }
3532
3533 /*
3534 * this takes an I,J coordinate pair,
3535 * and works out the X and Y derivatives.
3536 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3537 */
3538 static LLVMValueRef si_llvm_emit_ddxy_interp(
3539 struct lp_build_tgsi_context *bld_base,
3540 LLVMValueRef interp_ij)
3541 {
3542 struct si_shader_context *ctx = si_shader_context(bld_base);
3543 struct gallivm_state *gallivm = &ctx->gallivm;
3544 LLVMValueRef result[4], a;
3545 unsigned i;
3546
3547 for (i = 0; i < 2; i++) {
3548 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3549 LLVMConstInt(ctx->i32, i, 0), "");
3550 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3551 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3552 }
3553
3554 return lp_build_gather_values(gallivm, result, 4);
3555 }
3556
3557 static void interp_fetch_args(
3558 struct lp_build_tgsi_context *bld_base,
3559 struct lp_build_emit_data *emit_data)
3560 {
3561 struct si_shader_context *ctx = si_shader_context(bld_base);
3562 struct gallivm_state *gallivm = &ctx->gallivm;
3563 const struct tgsi_full_instruction *inst = emit_data->inst;
3564
3565 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3566 /* offset is in second src, first two channels */
3567 emit_data->args[0] = lp_build_emit_fetch(bld_base,
3568 emit_data->inst, 1,
3569 TGSI_CHAN_X);
3570 emit_data->args[1] = lp_build_emit_fetch(bld_base,
3571 emit_data->inst, 1,
3572 TGSI_CHAN_Y);
3573 emit_data->arg_count = 2;
3574 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3575 LLVMValueRef sample_position;
3576 LLVMValueRef sample_id;
3577 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3578
3579 /* fetch sample ID, then fetch its sample position,
3580 * and place into first two channels.
3581 */
3582 sample_id = lp_build_emit_fetch(bld_base,
3583 emit_data->inst, 1, TGSI_CHAN_X);
3584 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
3585 ctx->i32, "");
3586 sample_position = load_sample_position(ctx, sample_id);
3587
3588 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
3589 sample_position,
3590 ctx->i32_0, "");
3591
3592 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
3593 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
3594 sample_position,
3595 ctx->i32_1, "");
3596 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
3597 emit_data->arg_count = 2;
3598 }
3599 }
3600
3601 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3602 struct lp_build_tgsi_context *bld_base,
3603 struct lp_build_emit_data *emit_data)
3604 {
3605 struct si_shader_context *ctx = si_shader_context(bld_base);
3606 struct si_shader *shader = ctx->shader;
3607 struct gallivm_state *gallivm = &ctx->gallivm;
3608 const struct tgsi_shader_info *info = &shader->selector->info;
3609 LLVMValueRef interp_param;
3610 const struct tgsi_full_instruction *inst = emit_data->inst;
3611 const struct tgsi_full_src_register *input = &inst->Src[0];
3612 int input_base, input_array_size;
3613 int chan;
3614 int i;
3615 LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
3616 LLVMValueRef array_idx;
3617 int interp_param_idx;
3618 unsigned interp;
3619 unsigned location;
3620
3621 assert(input->Register.File == TGSI_FILE_INPUT);
3622
3623 if (input->Register.Indirect) {
3624 unsigned array_id = input->Indirect.ArrayID;
3625
3626 if (array_id) {
3627 input_base = info->input_array_first[array_id];
3628 input_array_size = info->input_array_last[array_id] - input_base + 1;
3629 } else {
3630 input_base = inst->Src[0].Register.Index;
3631 input_array_size = info->num_inputs - input_base;
3632 }
3633
3634 array_idx = get_indirect_index(ctx, &input->Indirect,
3635 input->Register.Index - input_base);
3636 } else {
3637 input_base = inst->Src[0].Register.Index;
3638 input_array_size = 1;
3639 array_idx = ctx->i32_0;
3640 }
3641
3642 interp = shader->selector->info.input_interpolate[input_base];
3643
3644 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3645 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
3646 location = TGSI_INTERPOLATE_LOC_CENTER;
3647 else
3648 location = TGSI_INTERPOLATE_LOC_CENTROID;
3649
3650 interp_param_idx = lookup_interp_param_index(interp, location);
3651 if (interp_param_idx == -1)
3652 return;
3653 else if (interp_param_idx)
3654 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
3655 else
3656 interp_param = NULL;
3657
3658 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3659 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3660 LLVMValueRef ij_out[2];
3661 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
3662
3663 /*
3664 * take the I then J parameters, and the DDX/Y for it, and
3665 * calculate the IJ inputs for the interpolator.
3666 * temp1 = ddx * offset/sample.x + I;
3667 * interp_param.I = ddy * offset/sample.y + temp1;
3668 * temp1 = ddx * offset/sample.x + J;
3669 * interp_param.J = ddy * offset/sample.y + temp1;
3670 */
3671 for (i = 0; i < 2; i++) {
3672 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
3673 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
3674 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
3675 ddxy_out, ix_ll, "");
3676 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
3677 ddxy_out, iy_ll, "");
3678 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
3679 interp_param, ix_ll, "");
3680 LLVMValueRef temp1, temp2;
3681
3682 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
3683 ctx->f32, "");
3684
3685 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
3686
3687 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
3688
3689 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
3690
3691 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
3692 }
3693 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
3694 }
3695
3696 if (interp_param) {
3697 interp_param = LLVMBuildBitCast(gallivm->builder,
3698 interp_param, LLVMVectorType(ctx->f32, 2), "");
3699 }
3700
3701 for (chan = 0; chan < 4; chan++) {
3702 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
3703 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
3704
3705 for (unsigned idx = 0; idx < input_array_size; ++idx) {
3706 LLVMValueRef v, i = NULL, j = NULL;
3707
3708 if (interp_param) {
3709 interp_param = LLVMBuildBitCast(gallivm->builder,
3710 interp_param, LLVMVectorType(ctx->f32, 2), "");
3711 i = LLVMBuildExtractElement(
3712 gallivm->builder, interp_param, ctx->i32_0, "");
3713 j = LLVMBuildExtractElement(
3714 gallivm->builder, interp_param, ctx->i32_1, "");
3715 }
3716 v = si_build_fs_interp(ctx, input_base + idx, schan,
3717 prim_mask, i, j);
3718
3719 gather = LLVMBuildInsertElement(gallivm->builder,
3720 gather, v, LLVMConstInt(ctx->i32, idx, false), "");
3721 }
3722
3723 emit_data->output[chan] = LLVMBuildExtractElement(
3724 gallivm->builder, gather, array_idx, "");
3725 }
3726 }
3727
3728 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3729 LLVMValueRef value)
3730 {
3731 struct gallivm_state *gallivm = &ctx->gallivm;
3732 LLVMValueRef args[3] = {
3733 value,
3734 ctx->i32_0,
3735 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3736 };
3737
3738 /* We currently have no other way to prevent LLVM from lifting the icmp
3739 * calls to a dominating basic block.
3740 */
3741 emit_optimization_barrier(ctx, &args[0]);
3742
3743 if (LLVMTypeOf(args[0]) != ctx->i32)
3744 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3745
3746 return lp_build_intrinsic(gallivm->builder,
3747 "llvm.amdgcn.icmp.i32",
3748 ctx->i64, args, 3,
3749 LP_FUNC_ATTR_NOUNWIND |
3750 LP_FUNC_ATTR_READNONE |
3751 LP_FUNC_ATTR_CONVERGENT);
3752 }
3753
3754 static void vote_all_emit(
3755 const struct lp_build_tgsi_action *action,
3756 struct lp_build_tgsi_context *bld_base,
3757 struct lp_build_emit_data *emit_data)
3758 {
3759 struct si_shader_context *ctx = si_shader_context(bld_base);
3760 struct gallivm_state *gallivm = &ctx->gallivm;
3761 LLVMValueRef active_set, vote_set;
3762 LLVMValueRef tmp;
3763
3764 active_set = si_emit_ballot(ctx, ctx->i32_1);
3765 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3766
3767 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3768 emit_data->output[emit_data->chan] =
3769 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3770 }
3771
3772 static void vote_any_emit(
3773 const struct lp_build_tgsi_action *action,
3774 struct lp_build_tgsi_context *bld_base,
3775 struct lp_build_emit_data *emit_data)
3776 {
3777 struct si_shader_context *ctx = si_shader_context(bld_base);
3778 struct gallivm_state *gallivm = &ctx->gallivm;
3779 LLVMValueRef vote_set;
3780 LLVMValueRef tmp;
3781
3782 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3783
3784 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3785 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3786 emit_data->output[emit_data->chan] =
3787 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3788 }
3789
3790 static void vote_eq_emit(
3791 const struct lp_build_tgsi_action *action,
3792 struct lp_build_tgsi_context *bld_base,
3793 struct lp_build_emit_data *emit_data)
3794 {
3795 struct si_shader_context *ctx = si_shader_context(bld_base);
3796 struct gallivm_state *gallivm = &ctx->gallivm;
3797 LLVMValueRef active_set, vote_set;
3798 LLVMValueRef all, none, tmp;
3799
3800 active_set = si_emit_ballot(ctx, ctx->i32_1);
3801 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3802
3803 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3804 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3805 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3806 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3807 emit_data->output[emit_data->chan] =
3808 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3809 }
3810
3811 static void ballot_emit(
3812 const struct lp_build_tgsi_action *action,
3813 struct lp_build_tgsi_context *bld_base,
3814 struct lp_build_emit_data *emit_data)
3815 {
3816 struct si_shader_context *ctx = si_shader_context(bld_base);
3817 LLVMBuilderRef builder = ctx->gallivm.builder;
3818 LLVMValueRef tmp;
3819
3820 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3821 tmp = si_emit_ballot(ctx, tmp);
3822 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3823
3824 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3825 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3826 }
3827
3828 static void read_invoc_fetch_args(
3829 struct lp_build_tgsi_context *bld_base,
3830 struct lp_build_emit_data *emit_data)
3831 {
3832 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
3833 0, emit_data->src_chan);
3834
3835 /* Always read the source invocation (= lane) from the X channel. */
3836 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
3837 1, TGSI_CHAN_X);
3838 emit_data->arg_count = 2;
3839 }
3840
3841 static void read_lane_emit(
3842 const struct lp_build_tgsi_action *action,
3843 struct lp_build_tgsi_context *bld_base,
3844 struct lp_build_emit_data *emit_data)
3845 {
3846 struct si_shader_context *ctx = si_shader_context(bld_base);
3847 LLVMBuilderRef builder = ctx->gallivm.builder;
3848
3849 /* We currently have no other way to prevent LLVM from lifting the icmp
3850 * calls to a dominating basic block.
3851 */
3852 emit_optimization_barrier(ctx, &emit_data->args[0]);
3853
3854 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3855 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3856 ctx->i32, "");
3857 }
3858
3859 emit_data->output[emit_data->chan] =
3860 ac_build_intrinsic(&ctx->ac, action->intr_name,
3861 ctx->i32, emit_data->args, emit_data->arg_count,
3862 AC_FUNC_ATTR_READNONE |
3863 AC_FUNC_ATTR_CONVERGENT);
3864 }
3865
3866 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3867 struct lp_build_emit_data *emit_data)
3868 {
3869 struct si_shader_context *ctx = si_shader_context(bld_base);
3870 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3871 LLVMValueRef imm;
3872 unsigned stream;
3873
3874 assert(src0.File == TGSI_FILE_IMMEDIATE);
3875
3876 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
3877 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
3878 return stream;
3879 }
3880
3881 /* Emit one vertex from the geometry shader */
3882 static void si_llvm_emit_vertex(
3883 const struct lp_build_tgsi_action *action,
3884 struct lp_build_tgsi_context *bld_base,
3885 struct lp_build_emit_data *emit_data)
3886 {
3887 struct si_shader_context *ctx = si_shader_context(bld_base);
3888 struct lp_build_context *uint = &bld_base->uint_bld;
3889 struct si_shader *shader = ctx->shader;
3890 struct tgsi_shader_info *info = &shader->selector->info;
3891 struct gallivm_state *gallivm = &ctx->gallivm;
3892 struct lp_build_if_state if_state;
3893 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3894 ctx->param_gs2vs_offset);
3895 LLVMValueRef gs_next_vertex;
3896 LLVMValueRef can_emit, kill;
3897 unsigned chan, offset;
3898 int i;
3899 unsigned stream;
3900
3901 stream = si_llvm_get_stream(bld_base, emit_data);
3902
3903 /* Write vertex attribute values to GSVS ring */
3904 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
3905 ctx->gs_next_vertex[stream],
3906 "");
3907
3908 /* If this thread has already emitted the declared maximum number of
3909 * vertices, skip the write: excessive vertex emissions are not
3910 * supposed to have any effect.
3911 *
3912 * If the shader has no writes to memory, kill it instead. This skips
3913 * further memory loads and may allow LLVM to skip to the end
3914 * altogether.
3915 */
3916 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
3917 LLVMConstInt(ctx->i32,
3918 shader->selector->gs_max_out_vertices, 0), "");
3919
3920 bool use_kill = !info->writes_memory;
3921 if (use_kill) {
3922 kill = lp_build_select(&bld_base->base, can_emit,
3923 LLVMConstReal(ctx->f32, 1.0f),
3924 LLVMConstReal(ctx->f32, -1.0f));
3925
3926 ac_build_kill(&ctx->ac, kill);
3927 } else {
3928 lp_build_if(&if_state, gallivm, can_emit);
3929 }
3930
3931 offset = 0;
3932 for (i = 0; i < info->num_outputs; i++) {
3933 LLVMValueRef *out_ptr = ctx->outputs[i];
3934
3935 for (chan = 0; chan < 4; chan++) {
3936 if (!(info->output_usagemask[i] & (1 << chan)) ||
3937 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
3938 continue;
3939
3940 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
3941 LLVMValueRef voffset =
3942 LLVMConstInt(ctx->i32, offset *
3943 shader->selector->gs_max_out_vertices, 0);
3944 offset++;
3945
3946 voffset = lp_build_add(uint, voffset, gs_next_vertex);
3947 voffset = lp_build_mul_imm(uint, voffset, 4);
3948
3949 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
3950
3951 ac_build_buffer_store_dword(&ctx->ac,
3952 ctx->gsvs_ring[stream],
3953 out_val, 1,
3954 voffset, soffset, 0,
3955 1, 1, true, true);
3956 }
3957 }
3958
3959 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
3960 ctx->i32_1);
3961
3962 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
3963
3964 /* Signal vertex emission */
3965 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
3966 si_get_gs_wave_id(ctx));
3967 if (!use_kill)
3968 lp_build_endif(&if_state);
3969 }
3970
3971 /* Cut one primitive from the geometry shader */
3972 static void si_llvm_emit_primitive(
3973 const struct lp_build_tgsi_action *action,
3974 struct lp_build_tgsi_context *bld_base,
3975 struct lp_build_emit_data *emit_data)
3976 {
3977 struct si_shader_context *ctx = si_shader_context(bld_base);
3978 unsigned stream;
3979
3980 /* Signal primitive cut */
3981 stream = si_llvm_get_stream(bld_base, emit_data);
3982 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
3983 si_get_gs_wave_id(ctx));
3984 }
3985
3986 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
3987 struct lp_build_tgsi_context *bld_base,
3988 struct lp_build_emit_data *emit_data)
3989 {
3990 struct si_shader_context *ctx = si_shader_context(bld_base);
3991 struct gallivm_state *gallivm = &ctx->gallivm;
3992
3993 /* SI only (thanks to a hw bug workaround):
3994 * The real barrier instruction isn’t needed, because an entire patch
3995 * always fits into a single wave.
3996 */
3997 if (ctx->screen->b.chip_class == SI &&
3998 ctx->type == PIPE_SHADER_TESS_CTRL) {
3999 si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
4000 return;
4001 }
4002
4003 lp_build_intrinsic(gallivm->builder,
4004 "llvm.amdgcn.s.barrier",
4005 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
4006 }
4007
4008 static const struct lp_build_tgsi_action interp_action = {
4009 .fetch_args = interp_fetch_args,
4010 .emit = build_interp_intrinsic,
4011 };
4012
4013 static void si_create_function(struct si_shader_context *ctx,
4014 const char *name,
4015 LLVMTypeRef *returns, unsigned num_returns,
4016 struct si_function_info *fninfo,
4017 unsigned max_workgroup_size)
4018 {
4019 int i;
4020
4021 si_llvm_create_func(ctx, name, returns, num_returns,
4022 fninfo->types, fninfo->num_params);
4023 ctx->return_value = LLVMGetUndef(ctx->return_type);
4024
4025 for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4026 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4027
4028 /* The combination of:
4029 * - ByVal
4030 * - dereferenceable
4031 * - invariant.load
4032 * allows the optimization passes to move loads and reduces
4033 * SGPR spilling significantly.
4034 */
4035 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4036 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
4037 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
4038 ac_add_attr_dereferenceable(P, UINT64_MAX);
4039 } else
4040 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
4041 }
4042
4043 for (i = 0; i < fninfo->num_params; ++i) {
4044 if (fninfo->assign[i])
4045 *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
4046 }
4047
4048 if (max_workgroup_size) {
4049 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
4050 max_workgroup_size);
4051 }
4052 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4053 "no-signed-zeros-fp-math",
4054 "true");
4055
4056 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
4057 /* These were copied from some LLVM test. */
4058 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4059 "less-precise-fpmad",
4060 "true");
4061 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4062 "no-infs-fp-math",
4063 "true");
4064 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4065 "no-nans-fp-math",
4066 "true");
4067 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4068 "unsafe-fp-math",
4069 "true");
4070 }
4071 }
4072
4073 static void declare_streamout_params(struct si_shader_context *ctx,
4074 struct pipe_stream_output_info *so,
4075 struct si_function_info *fninfo)
4076 {
4077 int i;
4078
4079 /* Streamout SGPRs. */
4080 if (so->num_outputs) {
4081 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4082 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4083 else
4084 ctx->param_streamout_config = fninfo->num_params - 1;
4085
4086 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4087 }
4088 /* A streamout buffer offset is loaded if the stride is non-zero. */
4089 for (i = 0; i < 4; i++) {
4090 if (!so->stride[i])
4091 continue;
4092
4093 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4094 }
4095 }
4096
4097 static unsigned llvm_get_type_size(LLVMTypeRef type)
4098 {
4099 LLVMTypeKind kind = LLVMGetTypeKind(type);
4100
4101 switch (kind) {
4102 case LLVMIntegerTypeKind:
4103 return LLVMGetIntTypeWidth(type) / 8;
4104 case LLVMFloatTypeKind:
4105 return 4;
4106 case LLVMPointerTypeKind:
4107 return 8;
4108 case LLVMVectorTypeKind:
4109 return LLVMGetVectorSize(type) *
4110 llvm_get_type_size(LLVMGetElementType(type));
4111 case LLVMArrayTypeKind:
4112 return LLVMGetArrayLength(type) *
4113 llvm_get_type_size(LLVMGetElementType(type));
4114 default:
4115 assert(0);
4116 return 0;
4117 }
4118 }
4119
4120 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4121 {
4122 struct gallivm_state *gallivm = &ctx->gallivm;
4123
4124 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4125 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4126 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4127 "lds");
4128 }
4129
4130 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4131 {
4132 switch (shader->selector->type) {
4133 case PIPE_SHADER_TESS_CTRL:
4134 /* Return this so that LLVM doesn't remove s_barrier
4135 * instructions on chips where we use s_barrier. */
4136 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4137
4138 case PIPE_SHADER_GEOMETRY:
4139 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4140
4141 case PIPE_SHADER_COMPUTE:
4142 break; /* see below */
4143
4144 default:
4145 return 0;
4146 }
4147
4148 const unsigned *properties = shader->selector->info.properties;
4149 unsigned max_work_group_size =
4150 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4151 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4152 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4153
4154 if (!max_work_group_size) {
4155 /* This is a variable group size compute shader,
4156 * compile it for the maximum possible group size.
4157 */
4158 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4159 }
4160 return max_work_group_size;
4161 }
4162
4163 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4164 struct si_function_info *fninfo,
4165 bool assign_params)
4166 {
4167 unsigned const_and_shader_buffers =
4168 add_arg(fninfo, ARG_SGPR,
4169 si_const_array(ctx->v4i32,
4170 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
4171 unsigned samplers_and_images =
4172 add_arg(fninfo, ARG_SGPR,
4173 si_const_array(ctx->v8i32,
4174 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4175
4176 if (assign_params) {
4177 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4178 ctx->param_samplers_and_images = samplers_and_images;
4179 }
4180 }
4181
4182 static void declare_default_desc_pointers(struct si_shader_context *ctx,
4183 struct si_function_info *fninfo)
4184 {
4185 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4186 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4187 declare_per_stage_desc_pointers(ctx, fninfo, true);
4188 }
4189
4190 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4191 struct si_function_info *fninfo)
4192 {
4193 ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
4194 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
4195 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4196 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4197 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4198 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4199 }
4200
4201 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4202 struct si_function_info *fninfo,
4203 unsigned *num_prolog_vgprs)
4204 {
4205 struct si_shader *shader = ctx->shader;
4206
4207 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4208 if (shader->key.as_ls) {
4209 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4210 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4211 } else {
4212 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4213 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4214 }
4215 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4216
4217 if (!shader->is_gs_copy_shader) {
4218 /* Vertex load indices. */
4219 ctx->param_vertex_index0 = fninfo->num_params;
4220 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4221 add_arg(fninfo, ARG_VGPR, ctx->i32);
4222 *num_prolog_vgprs += shader->selector->info.num_inputs;
4223 }
4224 }
4225
4226 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4227 struct si_function_info *fninfo)
4228 {
4229 ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4230 ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4231 ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4232 ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4233 }
4234
4235 enum {
4236 /* Convenient merged shader definitions. */
4237 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4238 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4239 };
4240
4241 static void create_function(struct si_shader_context *ctx)
4242 {
4243 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
4244 struct gallivm_state *gallivm = &ctx->gallivm;
4245 struct si_shader *shader = ctx->shader;
4246 struct si_function_info fninfo;
4247 LLVMTypeRef returns[16+32*4];
4248 unsigned i, num_return_sgprs;
4249 unsigned num_returns = 0;
4250 unsigned num_prolog_vgprs = 0;
4251 unsigned type = ctx->type;
4252
4253 si_init_function_info(&fninfo);
4254
4255 /* Set MERGED shaders. */
4256 if (ctx->screen->b.chip_class >= GFX9) {
4257 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4258 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4259 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4260 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4261 }
4262
4263 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4264
4265 switch (type) {
4266 case PIPE_SHADER_VERTEX:
4267 declare_default_desc_pointers(ctx, &fninfo);
4268 declare_vs_specific_input_sgprs(ctx, &fninfo);
4269
4270 if (shader->key.as_es) {
4271 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4272 } else if (shader->key.as_ls) {
4273 /* no extra parameters */
4274 } else {
4275 if (shader->is_gs_copy_shader) {
4276 fninfo.num_params = ctx->param_rw_buffers + 1;
4277 fninfo.num_sgpr_params = fninfo.num_params;
4278 }
4279
4280 /* The locations of the other parameters are assigned dynamically. */
4281 declare_streamout_params(ctx, &shader->selector->so,
4282 &fninfo);
4283 }
4284
4285 /* VGPRs */
4286 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4287 break;
4288
4289 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4290 declare_default_desc_pointers(ctx, &fninfo);
4291 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4292 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4293 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4294 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4295 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4296 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4297 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4298 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4299
4300 /* VGPRs */
4301 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4302 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4303
4304 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4305 * placed after the user SGPRs.
4306 */
4307 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4308 returns[num_returns++] = ctx->i32; /* SGPRs */
4309 for (i = 0; i < 5; i++)
4310 returns[num_returns++] = ctx->f32; /* VGPRs */
4311 break;
4312
4313 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4314 /* Merged stages have 8 system SGPRs at the beginning. */
4315 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4316 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4317 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4318 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4319 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4320 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4321 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4322 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4323
4324 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4325 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4326 declare_per_stage_desc_pointers(ctx, &fninfo,
4327 ctx->type == PIPE_SHADER_VERTEX);
4328 declare_vs_specific_input_sgprs(ctx, &fninfo);
4329
4330 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4331 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4332 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4333 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4334 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4335 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4336
4337 declare_per_stage_desc_pointers(ctx, &fninfo,
4338 ctx->type == PIPE_SHADER_TESS_CTRL);
4339
4340 /* VGPRs (first TCS, then VS) */
4341 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4342 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4343
4344 if (ctx->type == PIPE_SHADER_VERTEX) {
4345 declare_vs_input_vgprs(ctx, &fninfo,
4346 &num_prolog_vgprs);
4347
4348 /* LS return values are inputs to the TCS main shader part. */
4349 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4350 returns[num_returns++] = ctx->i32; /* SGPRs */
4351 for (i = 0; i < 2; i++)
4352 returns[num_returns++] = ctx->f32; /* VGPRs */
4353 } else {
4354 /* TCS return values are inputs to the TCS epilog.
4355 *
4356 * param_tcs_offchip_offset, param_tcs_factor_offset,
4357 * param_tcs_offchip_layout, and param_rw_buffers
4358 * should be passed to the epilog.
4359 */
4360 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4361 returns[num_returns++] = ctx->i32; /* SGPRs */
4362 for (i = 0; i < 5; i++)
4363 returns[num_returns++] = ctx->f32; /* VGPRs */
4364 }
4365 break;
4366
4367 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4368 /* Merged stages have 8 system SGPRs at the beginning. */
4369 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4370 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4371 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4372 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4373 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4374 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4375 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4376 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4377
4378 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4379 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4380 declare_per_stage_desc_pointers(ctx, &fninfo,
4381 (ctx->type == PIPE_SHADER_VERTEX ||
4382 ctx->type == PIPE_SHADER_TESS_EVAL));
4383 if (ctx->type == PIPE_SHADER_VERTEX) {
4384 declare_vs_specific_input_sgprs(ctx, &fninfo);
4385 } else {
4386 /* TESS_EVAL (and also GEOMETRY):
4387 * Declare as many input SGPRs as the VS has. */
4388 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4389 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4390 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4391 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4392 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4393 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4394 }
4395
4396 declare_per_stage_desc_pointers(ctx, &fninfo,
4397 ctx->type == PIPE_SHADER_GEOMETRY);
4398
4399 /* VGPRs (first GS, then VS/TES) */
4400 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4401 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4402 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4403 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4404 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4405
4406 if (ctx->type == PIPE_SHADER_VERTEX) {
4407 declare_vs_input_vgprs(ctx, &fninfo,
4408 &num_prolog_vgprs);
4409 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4410 declare_tes_input_vgprs(ctx, &fninfo);
4411 }
4412
4413 if (ctx->type == PIPE_SHADER_VERTEX ||
4414 ctx->type == PIPE_SHADER_TESS_EVAL) {
4415 /* ES return values are inputs to GS. */
4416 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4417 returns[num_returns++] = ctx->i32; /* SGPRs */
4418 for (i = 0; i < 5; i++)
4419 returns[num_returns++] = ctx->f32; /* VGPRs */
4420 }
4421 break;
4422
4423 case PIPE_SHADER_TESS_EVAL:
4424 declare_default_desc_pointers(ctx, &fninfo);
4425 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4426 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4427
4428 if (shader->key.as_es) {
4429 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4430 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4431 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4432 } else {
4433 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4434 declare_streamout_params(ctx, &shader->selector->so,
4435 &fninfo);
4436 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4437 }
4438
4439 /* VGPRs */
4440 declare_tes_input_vgprs(ctx, &fninfo);
4441 break;
4442
4443 case PIPE_SHADER_GEOMETRY:
4444 declare_default_desc_pointers(ctx, &fninfo);
4445 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4446 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4447
4448 /* VGPRs */
4449 ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4450 ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4451 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4452 ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4453 ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4454 ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4455 ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4456 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4457 break;
4458
4459 case PIPE_SHADER_FRAGMENT:
4460 declare_default_desc_pointers(ctx, &fninfo);
4461 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4462 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4463
4464 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4465 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4466 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4467 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4468 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4469 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4470 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4471 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4472 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_X_FLOAT);
4473 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_Y_FLOAT);
4474 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_Z_FLOAT);
4475 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_POS_W_FLOAT);
4476 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_FRONT_FACE);
4477 shader->info.face_vgpr_index = 20;
4478 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_ANCILLARY);
4479 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_SAMPLE_COVERAGE);
4480 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4481
4482 /* Color inputs from the prolog. */
4483 if (shader->selector->info.colors_read) {
4484 unsigned num_color_elements =
4485 util_bitcount(shader->selector->info.colors_read);
4486
4487 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4488 for (i = 0; i < num_color_elements; i++)
4489 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4490
4491 num_prolog_vgprs += num_color_elements;
4492 }
4493
4494 /* Outputs for the epilog. */
4495 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4496 num_returns =
4497 num_return_sgprs +
4498 util_bitcount(shader->selector->info.colors_written) * 4 +
4499 shader->selector->info.writes_z +
4500 shader->selector->info.writes_stencil +
4501 shader->selector->info.writes_samplemask +
4502 1 /* SampleMaskIn */;
4503
4504 num_returns = MAX2(num_returns,
4505 num_return_sgprs +
4506 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4507
4508 for (i = 0; i < num_return_sgprs; i++)
4509 returns[i] = ctx->i32;
4510 for (; i < num_returns; i++)
4511 returns[i] = ctx->f32;
4512 break;
4513
4514 case PIPE_SHADER_COMPUTE:
4515 declare_default_desc_pointers(ctx, &fninfo);
4516 if (shader->selector->info.uses_grid_size)
4517 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4518 if (shader->selector->info.uses_block_size)
4519 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4520
4521 for (i = 0; i < 3; i++) {
4522 ctx->param_block_id[i] = -1;
4523 if (shader->selector->info.uses_block_id[i])
4524 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4525 }
4526
4527 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4528 break;
4529 default:
4530 assert(0 && "unimplemented shader");
4531 return;
4532 }
4533
4534 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4535 si_get_max_workgroup_size(shader));
4536
4537 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4538 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4539 ctx->separate_prolog) {
4540 si_llvm_add_attribute(ctx->main_fn,
4541 "InitialPSInputAddr",
4542 S_0286D0_PERSP_SAMPLE_ENA(1) |
4543 S_0286D0_PERSP_CENTER_ENA(1) |
4544 S_0286D0_PERSP_CENTROID_ENA(1) |
4545 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4546 S_0286D0_LINEAR_CENTER_ENA(1) |
4547 S_0286D0_LINEAR_CENTROID_ENA(1) |
4548 S_0286D0_FRONT_FACE_ENA(1) |
4549 S_0286D0_POS_FIXED_PT_ENA(1));
4550 }
4551
4552 shader->info.num_input_sgprs = 0;
4553 shader->info.num_input_vgprs = 0;
4554
4555 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4556 shader->info.num_input_sgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4557
4558 for (; i < fninfo.num_params; ++i)
4559 shader->info.num_input_vgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4560
4561 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4562 shader->info.num_input_vgprs -= num_prolog_vgprs;
4563
4564 if (!ctx->screen->has_ds_bpermute &&
4565 bld_base->info &&
4566 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
4567 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
4568 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
4569 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
4570 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
4571 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
4572 ctx->lds =
4573 LLVMAddGlobalInAddressSpace(gallivm->module,
4574 LLVMArrayType(ctx->i32, 64),
4575 "ddxy_lds",
4576 LOCAL_ADDR_SPACE);
4577
4578 if (shader->key.as_ls ||
4579 ctx->type == PIPE_SHADER_TESS_CTRL ||
4580 /* GFX9 has the ESGS ring buffer in LDS. */
4581 (ctx->screen->b.chip_class >= GFX9 &&
4582 (shader->key.as_es ||
4583 ctx->type == PIPE_SHADER_GEOMETRY)))
4584 declare_lds_as_pointer(ctx);
4585 }
4586
4587 /**
4588 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4589 * for later use.
4590 */
4591 static void preload_ring_buffers(struct si_shader_context *ctx)
4592 {
4593 struct gallivm_state *gallivm = &ctx->gallivm;
4594 LLVMBuilderRef builder = gallivm->builder;
4595
4596 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4597 ctx->param_rw_buffers);
4598
4599 if (ctx->screen->b.chip_class <= VI &&
4600 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4601 unsigned ring =
4602 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4603 : SI_ES_RING_ESGS;
4604 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4605
4606 ctx->esgs_ring =
4607 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4608 }
4609
4610 if (ctx->shader->is_gs_copy_shader) {
4611 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4612
4613 ctx->gsvs_ring[0] =
4614 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4615 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4616 const struct si_shader_selector *sel = ctx->shader->selector;
4617 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4618 LLVMValueRef base_ring;
4619
4620 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4621
4622 /* The conceptual layout of the GSVS ring is
4623 * v0c0 .. vLv0 v0c1 .. vLc1 ..
4624 * but the real memory layout is swizzled across
4625 * threads:
4626 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4627 * t16v0c0 ..
4628 * Override the buffer descriptor accordingly.
4629 */
4630 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4631 uint64_t stream_offset = 0;
4632
4633 for (unsigned stream = 0; stream < 4; ++stream) {
4634 unsigned num_components;
4635 unsigned stride;
4636 unsigned num_records;
4637 LLVMValueRef ring, tmp;
4638
4639 num_components = sel->info.num_stream_output_components[stream];
4640 if (!num_components)
4641 continue;
4642
4643 stride = 4 * num_components * sel->gs_max_out_vertices;
4644
4645 /* Limit on the stride field for <= CIK. */
4646 assert(stride < (1 << 14));
4647
4648 num_records = 64;
4649
4650 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
4651 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
4652 tmp = LLVMBuildAdd(builder, tmp,
4653 LLVMConstInt(ctx->i64,
4654 stream_offset, 0), "");
4655 stream_offset += stride * 64;
4656
4657 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
4658 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
4659 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
4660 tmp = LLVMBuildOr(builder, tmp,
4661 LLVMConstInt(ctx->i32,
4662 S_008F04_STRIDE(stride) |
4663 S_008F04_SWIZZLE_ENABLE(1), 0), "");
4664 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
4665 ring = LLVMBuildInsertElement(builder, ring,
4666 LLVMConstInt(ctx->i32, num_records, 0),
4667 LLVMConstInt(ctx->i32, 2, 0), "");
4668 ring = LLVMBuildInsertElement(builder, ring,
4669 LLVMConstInt(ctx->i32,
4670 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4671 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4672 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4673 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
4674 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4675 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
4676 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
4677 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
4678 S_008F0C_ADD_TID_ENABLE(1),
4679 0),
4680 LLVMConstInt(ctx->i32, 3, 0), "");
4681
4682 ctx->gsvs_ring[stream] = ring;
4683 }
4684 }
4685 }
4686
4687 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4688 LLVMValueRef param_rw_buffers,
4689 unsigned param_pos_fixed_pt)
4690 {
4691 struct gallivm_state *gallivm = &ctx->gallivm;
4692 LLVMBuilderRef builder = gallivm->builder;
4693 LLVMValueRef slot, desc, offset, row, bit, address[2];
4694
4695 /* Use the fixed-point gl_FragCoord input.
4696 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4697 * per coordinate to get the repeating effect.
4698 */
4699 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4700 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4701
4702 /* Load the buffer descriptor. */
4703 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4704 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
4705
4706 /* The stipple pattern is 32x32, each row has 32 bits. */
4707 offset = LLVMBuildMul(builder, address[1],
4708 LLVMConstInt(ctx->i32, 4, 0), "");
4709 row = buffer_load_const(ctx, desc, offset);
4710 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
4711 bit = LLVMBuildLShr(builder, row, address[0], "");
4712 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4713
4714 /* The intrinsic kills the thread if arg < 0. */
4715 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
4716 LLVMConstReal(ctx->f32, -1), "");
4717 ac_build_kill(&ctx->ac, bit);
4718 }
4719
4720 void si_shader_binary_read_config(struct ac_shader_binary *binary,
4721 struct si_shader_config *conf,
4722 unsigned symbol_offset)
4723 {
4724 unsigned i;
4725 const unsigned char *config =
4726 ac_shader_binary_config_start(binary, symbol_offset);
4727 bool really_needs_scratch = false;
4728
4729 /* LLVM adds SGPR spills to the scratch size.
4730 * Find out if we really need the scratch buffer.
4731 */
4732 for (i = 0; i < binary->reloc_count; i++) {
4733 const struct ac_shader_reloc *reloc = &binary->relocs[i];
4734
4735 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
4736 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4737 really_needs_scratch = true;
4738 break;
4739 }
4740 }
4741
4742 /* XXX: We may be able to emit some of these values directly rather than
4743 * extracting fields to be emitted later.
4744 */
4745
4746 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
4747 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
4748 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
4749 switch (reg) {
4750 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
4751 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
4752 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
4753 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
4754 case R_00B848_COMPUTE_PGM_RSRC1:
4755 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
4756 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
4757 conf->float_mode = G_00B028_FLOAT_MODE(value);
4758 conf->rsrc1 = value;
4759 break;
4760 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
4761 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
4762 break;
4763 case R_00B84C_COMPUTE_PGM_RSRC2:
4764 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
4765 conf->rsrc2 = value;
4766 break;
4767 case R_0286CC_SPI_PS_INPUT_ENA:
4768 conf->spi_ps_input_ena = value;
4769 break;
4770 case R_0286D0_SPI_PS_INPUT_ADDR:
4771 conf->spi_ps_input_addr = value;
4772 break;
4773 case R_0286E8_SPI_TMPRING_SIZE:
4774 case R_00B860_COMPUTE_TMPRING_SIZE:
4775 /* WAVESIZE is in units of 256 dwords. */
4776 if (really_needs_scratch)
4777 conf->scratch_bytes_per_wave =
4778 G_00B860_WAVESIZE(value) * 256 * 4;
4779 break;
4780 case 0x4: /* SPILLED_SGPRS */
4781 conf->spilled_sgprs = value;
4782 break;
4783 case 0x8: /* SPILLED_VGPRS */
4784 conf->spilled_vgprs = value;
4785 break;
4786 default:
4787 {
4788 static bool printed;
4789
4790 if (!printed) {
4791 fprintf(stderr, "Warning: LLVM emitted unknown "
4792 "config register: 0x%x\n", reg);
4793 printed = true;
4794 }
4795 }
4796 break;
4797 }
4798 }
4799
4800 if (!conf->spi_ps_input_addr)
4801 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
4802 }
4803
4804 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4805 uint64_t scratch_va)
4806 {
4807 unsigned i;
4808 uint32_t scratch_rsrc_dword0 = scratch_va;
4809 uint32_t scratch_rsrc_dword1 =
4810 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
4811
4812 /* Enable scratch coalescing. */
4813 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
4814
4815 for (i = 0 ; i < shader->binary.reloc_count; i++) {
4816 const struct ac_shader_reloc *reloc =
4817 &shader->binary.relocs[i];
4818 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
4819 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4820 &scratch_rsrc_dword0, 4);
4821 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4822 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4823 &scratch_rsrc_dword1, 4);
4824 }
4825 }
4826 }
4827
4828 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4829 {
4830 unsigned size = shader->binary.code_size;
4831
4832 if (shader->prolog)
4833 size += shader->prolog->binary.code_size;
4834 if (shader->previous_stage)
4835 size += shader->previous_stage->binary.code_size;
4836 if (shader->prolog2)
4837 size += shader->prolog2->binary.code_size;
4838 if (shader->epilog)
4839 size += shader->epilog->binary.code_size;
4840 return size;
4841 }
4842
4843 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4844 {
4845 const struct ac_shader_binary *prolog =
4846 shader->prolog ? &shader->prolog->binary : NULL;
4847 const struct ac_shader_binary *previous_stage =
4848 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4849 const struct ac_shader_binary *prolog2 =
4850 shader->prolog2 ? &shader->prolog2->binary : NULL;
4851 const struct ac_shader_binary *epilog =
4852 shader->epilog ? &shader->epilog->binary : NULL;
4853 const struct ac_shader_binary *mainb = &shader->binary;
4854 unsigned bo_size = si_get_shader_binary_size(shader) +
4855 (!epilog ? mainb->rodata_size : 0);
4856 unsigned char *ptr;
4857
4858 assert(!prolog || !prolog->rodata_size);
4859 assert(!previous_stage || !previous_stage->rodata_size);
4860 assert(!prolog2 || !prolog2->rodata_size);
4861 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4862 !mainb->rodata_size);
4863 assert(!epilog || !epilog->rodata_size);
4864
4865 r600_resource_reference(&shader->bo, NULL);
4866 shader->bo = (struct r600_resource*)
4867 pipe_buffer_create(&sscreen->b.b, 0,
4868 PIPE_USAGE_IMMUTABLE,
4869 align(bo_size, SI_CPDMA_ALIGNMENT));
4870 if (!shader->bo)
4871 return -ENOMEM;
4872
4873 /* Upload. */
4874 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4875 PIPE_TRANSFER_READ_WRITE |
4876 PIPE_TRANSFER_UNSYNCHRONIZED);
4877
4878 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4879 * endian-independent. */
4880 if (prolog) {
4881 memcpy(ptr, prolog->code, prolog->code_size);
4882 ptr += prolog->code_size;
4883 }
4884 if (previous_stage) {
4885 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4886 ptr += previous_stage->code_size;
4887 }
4888 if (prolog2) {
4889 memcpy(ptr, prolog2->code, prolog2->code_size);
4890 ptr += prolog2->code_size;
4891 }
4892
4893 memcpy(ptr, mainb->code, mainb->code_size);
4894 ptr += mainb->code_size;
4895
4896 if (epilog)
4897 memcpy(ptr, epilog->code, epilog->code_size);
4898 else if (mainb->rodata_size > 0)
4899 memcpy(ptr, mainb->rodata, mainb->rodata_size);
4900
4901 sscreen->b.ws->buffer_unmap(shader->bo->buf);
4902 return 0;
4903 }
4904
4905 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
4906 struct pipe_debug_callback *debug,
4907 const char *name, FILE *file)
4908 {
4909 char *line, *p;
4910 unsigned i, count;
4911
4912 if (binary->disasm_string) {
4913 fprintf(file, "Shader %s disassembly:\n", name);
4914 fprintf(file, "%s", binary->disasm_string);
4915
4916 if (debug && debug->debug_message) {
4917 /* Very long debug messages are cut off, so send the
4918 * disassembly one line at a time. This causes more
4919 * overhead, but on the plus side it simplifies
4920 * parsing of resulting logs.
4921 */
4922 pipe_debug_message(debug, SHADER_INFO,
4923 "Shader Disassembly Begin");
4924
4925 line = binary->disasm_string;
4926 while (*line) {
4927 p = util_strchrnul(line, '\n');
4928 count = p - line;
4929
4930 if (count) {
4931 pipe_debug_message(debug, SHADER_INFO,
4932 "%.*s", count, line);
4933 }
4934
4935 if (!*p)
4936 break;
4937 line = p + 1;
4938 }
4939
4940 pipe_debug_message(debug, SHADER_INFO,
4941 "Shader Disassembly End");
4942 }
4943 } else {
4944 fprintf(file, "Shader %s binary:\n", name);
4945 for (i = 0; i < binary->code_size; i += 4) {
4946 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
4947 binary->code[i + 3], binary->code[i + 2],
4948 binary->code[i + 1], binary->code[i]);
4949 }
4950 }
4951 }
4952
4953 static void si_shader_dump_stats(struct si_screen *sscreen,
4954 const struct si_shader *shader,
4955 struct pipe_debug_callback *debug,
4956 unsigned processor,
4957 FILE *file,
4958 bool check_debug_option)
4959 {
4960 const struct si_shader_config *conf = &shader->config;
4961 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
4962 unsigned code_size = si_get_shader_binary_size(shader);
4963 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
4964 unsigned lds_per_wave = 0;
4965 unsigned max_simd_waves = 10;
4966
4967 /* Compute LDS usage for PS. */
4968 switch (processor) {
4969 case PIPE_SHADER_FRAGMENT:
4970 /* The minimum usage per wave is (num_inputs * 48). The maximum
4971 * usage is (num_inputs * 48 * 16).
4972 * We can get anything in between and it varies between waves.
4973 *
4974 * The 48 bytes per input for a single primitive is equal to
4975 * 4 bytes/component * 4 components/input * 3 points.
4976 *
4977 * Other stages don't know the size at compile time or don't
4978 * allocate LDS per wave, but instead they do it per thread group.
4979 */
4980 lds_per_wave = conf->lds_size * lds_increment +
4981 align(num_inputs * 48, lds_increment);
4982 break;
4983 case PIPE_SHADER_COMPUTE:
4984 if (shader->selector) {
4985 unsigned max_workgroup_size =
4986 si_get_max_workgroup_size(shader);
4987 lds_per_wave = (conf->lds_size * lds_increment) /
4988 DIV_ROUND_UP(max_workgroup_size, 64);
4989 }
4990 break;
4991 }
4992
4993 /* Compute the per-SIMD wave counts. */
4994 if (conf->num_sgprs) {
4995 if (sscreen->b.chip_class >= VI)
4996 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
4997 else
4998 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
4999 }
5000
5001 if (conf->num_vgprs)
5002 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5003
5004 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
5005 * 16KB makes some SIMDs unoccupied). */
5006 if (lds_per_wave)
5007 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5008
5009 if (!check_debug_option ||
5010 r600_can_dump_shader(&sscreen->b, processor)) {
5011 if (processor == PIPE_SHADER_FRAGMENT) {
5012 fprintf(file, "*** SHADER CONFIG ***\n"
5013 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5014 "SPI_PS_INPUT_ENA = 0x%04x\n",
5015 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5016 }
5017
5018 fprintf(file, "*** SHADER STATS ***\n"
5019 "SGPRS: %d\n"
5020 "VGPRS: %d\n"
5021 "Spilled SGPRs: %d\n"
5022 "Spilled VGPRs: %d\n"
5023 "Private memory VGPRs: %d\n"
5024 "Code Size: %d bytes\n"
5025 "LDS: %d blocks\n"
5026 "Scratch: %d bytes per wave\n"
5027 "Max Waves: %d\n"
5028 "********************\n\n\n",
5029 conf->num_sgprs, conf->num_vgprs,
5030 conf->spilled_sgprs, conf->spilled_vgprs,
5031 conf->private_mem_vgprs, code_size,
5032 conf->lds_size, conf->scratch_bytes_per_wave,
5033 max_simd_waves);
5034 }
5035
5036 pipe_debug_message(debug, SHADER_INFO,
5037 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5038 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5039 "Spilled VGPRs: %d PrivMem VGPRs: %d",
5040 conf->num_sgprs, conf->num_vgprs, code_size,
5041 conf->lds_size, conf->scratch_bytes_per_wave,
5042 max_simd_waves, conf->spilled_sgprs,
5043 conf->spilled_vgprs, conf->private_mem_vgprs);
5044 }
5045
5046 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5047 {
5048 switch (processor) {
5049 case PIPE_SHADER_VERTEX:
5050 if (shader->key.as_es)
5051 return "Vertex Shader as ES";
5052 else if (shader->key.as_ls)
5053 return "Vertex Shader as LS";
5054 else
5055 return "Vertex Shader as VS";
5056 case PIPE_SHADER_TESS_CTRL:
5057 return "Tessellation Control Shader";
5058 case PIPE_SHADER_TESS_EVAL:
5059 if (shader->key.as_es)
5060 return "Tessellation Evaluation Shader as ES";
5061 else
5062 return "Tessellation Evaluation Shader as VS";
5063 case PIPE_SHADER_GEOMETRY:
5064 if (shader->is_gs_copy_shader)
5065 return "GS Copy Shader as VS";
5066 else
5067 return "Geometry Shader";
5068 case PIPE_SHADER_FRAGMENT:
5069 return "Pixel Shader";
5070 case PIPE_SHADER_COMPUTE:
5071 return "Compute Shader";
5072 default:
5073 return "Unknown Shader";
5074 }
5075 }
5076
5077 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5078 struct pipe_debug_callback *debug, unsigned processor,
5079 FILE *file, bool check_debug_option)
5080 {
5081 if (!check_debug_option ||
5082 r600_can_dump_shader(&sscreen->b, processor))
5083 si_dump_shader_key(processor, shader, file);
5084
5085 if (!check_debug_option && shader->binary.llvm_ir_string) {
5086 if (shader->previous_stage &&
5087 shader->previous_stage->binary.llvm_ir_string) {
5088 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5089 si_get_shader_name(shader, processor));
5090 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5091 }
5092
5093 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5094 si_get_shader_name(shader, processor));
5095 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5096 }
5097
5098 if (!check_debug_option ||
5099 (r600_can_dump_shader(&sscreen->b, processor) &&
5100 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5101 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5102
5103 if (shader->prolog)
5104 si_shader_dump_disassembly(&shader->prolog->binary,
5105 debug, "prolog", file);
5106 if (shader->previous_stage)
5107 si_shader_dump_disassembly(&shader->previous_stage->binary,
5108 debug, "previous stage", file);
5109 if (shader->prolog2)
5110 si_shader_dump_disassembly(&shader->prolog2->binary,
5111 debug, "prolog2", file);
5112
5113 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5114
5115 if (shader->epilog)
5116 si_shader_dump_disassembly(&shader->epilog->binary,
5117 debug, "epilog", file);
5118 fprintf(file, "\n");
5119 }
5120
5121 si_shader_dump_stats(sscreen, shader, debug, processor, file,
5122 check_debug_option);
5123 }
5124
5125 static int si_compile_llvm(struct si_screen *sscreen,
5126 struct ac_shader_binary *binary,
5127 struct si_shader_config *conf,
5128 LLVMTargetMachineRef tm,
5129 LLVMModuleRef mod,
5130 struct pipe_debug_callback *debug,
5131 unsigned processor,
5132 const char *name)
5133 {
5134 int r = 0;
5135 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5136
5137 if (r600_can_dump_shader(&sscreen->b, processor)) {
5138 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5139
5140 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5141 fprintf(stderr, "%s LLVM IR:\n\n", name);
5142 ac_dump_module(mod);
5143 fprintf(stderr, "\n");
5144 }
5145 }
5146
5147 if (sscreen->record_llvm_ir) {
5148 char *ir = LLVMPrintModuleToString(mod);
5149 binary->llvm_ir_string = strdup(ir);
5150 LLVMDisposeMessage(ir);
5151 }
5152
5153 if (!si_replace_shader(count, binary)) {
5154 r = si_llvm_compile(mod, binary, tm, debug);
5155 if (r)
5156 return r;
5157 }
5158
5159 si_shader_binary_read_config(binary, conf, 0);
5160
5161 /* Enable 64-bit and 16-bit denormals, because there is no performance
5162 * cost.
5163 *
5164 * If denormals are enabled, all floating-point output modifiers are
5165 * ignored.
5166 *
5167 * Don't enable denormals for 32-bit floats, because:
5168 * - Floating-point output modifiers would be ignored by the hw.
5169 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5170 * have to stop using those.
5171 * - SI & CI would be very slow.
5172 */
5173 conf->float_mode |= V_00B028_FP_64_DENORMS;
5174
5175 FREE(binary->config);
5176 FREE(binary->global_symbol_offsets);
5177 binary->config = NULL;
5178 binary->global_symbol_offsets = NULL;
5179
5180 /* Some shaders can't have rodata because their binaries can be
5181 * concatenated.
5182 */
5183 if (binary->rodata_size &&
5184 (processor == PIPE_SHADER_VERTEX ||
5185 processor == PIPE_SHADER_TESS_CTRL ||
5186 processor == PIPE_SHADER_TESS_EVAL ||
5187 processor == PIPE_SHADER_FRAGMENT)) {
5188 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5189 return -EINVAL;
5190 }
5191
5192 return r;
5193 }
5194
5195 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5196 {
5197 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5198 LLVMBuildRetVoid(ctx->gallivm.builder);
5199 else
5200 LLVMBuildRet(ctx->gallivm.builder, ret);
5201 }
5202
5203 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5204 struct si_shader *
5205 si_generate_gs_copy_shader(struct si_screen *sscreen,
5206 LLVMTargetMachineRef tm,
5207 struct si_shader_selector *gs_selector,
5208 struct pipe_debug_callback *debug)
5209 {
5210 struct si_shader_context ctx;
5211 struct si_shader *shader;
5212 struct gallivm_state *gallivm = &ctx.gallivm;
5213 LLVMBuilderRef builder;
5214 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
5215 struct lp_build_context *uint = &bld_base->uint_bld;
5216 struct si_shader_output_values *outputs;
5217 struct tgsi_shader_info *gsinfo = &gs_selector->info;
5218 int i, r;
5219
5220 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5221
5222 if (!outputs)
5223 return NULL;
5224
5225 shader = CALLOC_STRUCT(si_shader);
5226 if (!shader) {
5227 FREE(outputs);
5228 return NULL;
5229 }
5230
5231
5232 shader->selector = gs_selector;
5233 shader->is_gs_copy_shader = true;
5234
5235 si_init_shader_ctx(&ctx, sscreen, tm);
5236 ctx.shader = shader;
5237 ctx.type = PIPE_SHADER_VERTEX;
5238
5239 builder = gallivm->builder;
5240
5241 create_function(&ctx);
5242 preload_ring_buffers(&ctx);
5243
5244 LLVMValueRef voffset =
5245 lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);
5246
5247 /* Fetch the vertex stream ID.*/
5248 LLVMValueRef stream_id;
5249
5250 if (gs_selector->so.num_outputs)
5251 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
5252 else
5253 stream_id = ctx.i32_0;
5254
5255 /* Fill in output information. */
5256 for (i = 0; i < gsinfo->num_outputs; ++i) {
5257 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
5258 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
5259
5260 for (int chan = 0; chan < 4; chan++) {
5261 outputs[i].vertex_stream[chan] =
5262 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
5263 }
5264 }
5265
5266 LLVMBasicBlockRef end_bb;
5267 LLVMValueRef switch_inst;
5268
5269 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
5270 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
5271
5272 for (int stream = 0; stream < 4; stream++) {
5273 LLVMBasicBlockRef bb;
5274 unsigned offset;
5275
5276 if (!gsinfo->num_stream_output_components[stream])
5277 continue;
5278
5279 if (stream > 0 && !gs_selector->so.num_outputs)
5280 continue;
5281
5282 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
5283 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
5284 LLVMPositionBuilderAtEnd(builder, bb);
5285
5286 /* Fetch vertex data from GSVS ring */
5287 offset = 0;
5288 for (i = 0; i < gsinfo->num_outputs; ++i) {
5289 for (unsigned chan = 0; chan < 4; chan++) {
5290 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
5291 outputs[i].vertex_stream[chan] != stream) {
5292 outputs[i].values[chan] = ctx.bld_base.base.undef;
5293 continue;
5294 }
5295
5296 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
5297 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
5298 offset++;
5299
5300 outputs[i].values[chan] =
5301 ac_build_buffer_load(&ctx.ac,
5302 ctx.gsvs_ring[0], 1,
5303 ctx.i32_0, voffset,
5304 soffset, 0, 1, 1,
5305 true, false);
5306 }
5307 }
5308
5309 /* Streamout and exports. */
5310 if (gs_selector->so.num_outputs) {
5311 si_llvm_emit_streamout(&ctx, outputs,
5312 gsinfo->num_outputs,
5313 stream);
5314 }
5315
5316 if (stream == 0)
5317 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
5318
5319 LLVMBuildBr(builder, end_bb);
5320 }
5321
5322 LLVMPositionBuilderAtEnd(builder, end_bb);
5323
5324 LLVMBuildRetVoid(gallivm->builder);
5325
5326 ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
5327 si_llvm_optimize_module(&ctx);
5328
5329 r = si_compile_llvm(sscreen, &ctx.shader->binary,
5330 &ctx.shader->config, ctx.tm,
5331 ctx.gallivm.module,
5332 debug, PIPE_SHADER_GEOMETRY,
5333 "GS Copy Shader");
5334 if (!r) {
5335 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
5336 fprintf(stderr, "GS Copy Shader:\n");
5337 si_shader_dump(sscreen, ctx.shader, debug,
5338 PIPE_SHADER_GEOMETRY, stderr, true);
5339 r = si_shader_binary_upload(sscreen, ctx.shader);
5340 }
5341
5342 si_llvm_dispose(&ctx);
5343
5344 FREE(outputs);
5345
5346 if (r != 0) {
5347 FREE(shader);
5348 shader = NULL;
5349 }
5350 return shader;
5351 }
5352
5353 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5354 const struct si_vs_prolog_bits *prolog,
5355 const char *prefix, FILE *f)
5356 {
5357 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5358 prefix, prolog->instance_divisor_is_one);
5359 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5360 prefix, prolog->instance_divisor_is_fetched);
5361
5362 fprintf(f, " mono.vs.fix_fetch = {");
5363 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5364 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5365 fprintf(f, "}\n");
5366 }
5367
5368 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5369 FILE *f)
5370 {
5371 const struct si_shader_key *key = &shader->key;
5372
5373 fprintf(f, "SHADER KEY\n");
5374
5375 switch (processor) {
5376 case PIPE_SHADER_VERTEX:
5377 si_dump_shader_key_vs(key, &key->part.vs.prolog,
5378 "part.vs.prolog", f);
5379 fprintf(f, " as_es = %u\n", key->as_es);
5380 fprintf(f, " as_ls = %u\n", key->as_ls);
5381 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
5382 key->mono.u.vs_export_prim_id);
5383 break;
5384
5385 case PIPE_SHADER_TESS_CTRL:
5386 if (shader->selector->screen->b.chip_class >= GFX9) {
5387 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5388 "part.tcs.ls_prolog", f);
5389 }
5390 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5391 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5392 break;
5393
5394 case PIPE_SHADER_TESS_EVAL:
5395 fprintf(f, " as_es = %u\n", key->as_es);
5396 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
5397 key->mono.u.vs_export_prim_id);
5398 break;
5399
5400 case PIPE_SHADER_GEOMETRY:
5401 if (shader->is_gs_copy_shader)
5402 break;
5403
5404 if (shader->selector->screen->b.chip_class >= GFX9 &&
5405 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5406 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5407 "part.gs.vs_prolog", f);
5408 }
5409 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5410 break;
5411
5412 case PIPE_SHADER_COMPUTE:
5413 break;
5414
5415 case PIPE_SHADER_FRAGMENT:
5416 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5417 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5418 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5419 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5420 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5421 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5422 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5423 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5424 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5425 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5426 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5427 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5428 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5429 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5430 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5431 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5432 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5433 break;
5434
5435 default:
5436 assert(0);
5437 }
5438
5439 if ((processor == PIPE_SHADER_GEOMETRY ||
5440 processor == PIPE_SHADER_TESS_EVAL ||
5441 processor == PIPE_SHADER_VERTEX) &&
5442 !key->as_es && !key->as_ls) {
5443 fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5444 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
5445 }
5446 }
5447
5448 static void si_init_shader_ctx(struct si_shader_context *ctx,
5449 struct si_screen *sscreen,
5450 LLVMTargetMachineRef tm)
5451 {
5452 struct lp_build_tgsi_context *bld_base;
5453
5454 si_llvm_context_init(ctx, sscreen, tm);
5455
5456 bld_base = &ctx->bld_base;
5457 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5458
5459 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5460 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5461 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5462
5463 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5464
5465 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5466
5467 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5468 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5469 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5470 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5471
5472 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5473 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5474 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5475 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5476 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5477 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5478 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5479 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
5480 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5481
5482 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
5483 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5484 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5485 }
5486
5487 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5488 {
5489 struct si_shader *shader = ctx->shader;
5490 struct tgsi_shader_info *info = &shader->selector->info;
5491
5492 if ((ctx->type != PIPE_SHADER_VERTEX &&
5493 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5494 shader->key.as_ls ||
5495 shader->key.as_es)
5496 return;
5497
5498 ac_optimize_vs_outputs(&ctx->ac,
5499 ctx->main_fn,
5500 shader->info.vs_output_param_offset,
5501 info->num_outputs,
5502 &shader->info.nr_param_exports);
5503 }
5504
5505 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5506 {
5507 ctx->shader->config.private_mem_vgprs = 0;
5508
5509 /* Process all LLVM instructions. */
5510 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5511 while (bb) {
5512 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5513
5514 while (next) {
5515 LLVMValueRef inst = next;
5516 next = LLVMGetNextInstruction(next);
5517
5518 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5519 continue;
5520
5521 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5522 /* No idea why LLVM aligns allocas to 4 elements. */
5523 unsigned alignment = LLVMGetAlignment(inst);
5524 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
5525 ctx->shader->config.private_mem_vgprs += dw_size;
5526 }
5527 bb = LLVMGetNextBasicBlock(bb);
5528 }
5529 }
5530
5531 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5532 {
5533 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5534 lp_build_intrinsic(ctx->gallivm.builder,
5535 "llvm.amdgcn.init.exec", ctx->voidt,
5536 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5537 }
5538
5539 static void si_init_exec_from_input(struct si_shader_context *ctx,
5540 unsigned param, unsigned bitoffset)
5541 {
5542 LLVMValueRef args[] = {
5543 LLVMGetParam(ctx->main_fn, param),
5544 LLVMConstInt(ctx->i32, bitoffset, 0),
5545 };
5546 lp_build_intrinsic(ctx->gallivm.builder,
5547 "llvm.amdgcn.init.exec.from.input",
5548 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5549 }
5550
5551 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
5552 bool is_monolithic)
5553 {
5554 struct si_shader *shader = ctx->shader;
5555 struct si_shader_selector *sel = shader->selector;
5556 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5557
5558 // TODO clean all this up!
5559 switch (ctx->type) {
5560 case PIPE_SHADER_VERTEX:
5561 ctx->load_input = declare_input_vs;
5562 if (shader->key.as_ls)
5563 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
5564 else if (shader->key.as_es)
5565 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5566 else
5567 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
5568 break;
5569 case PIPE_SHADER_TESS_CTRL:
5570 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5571 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5572 bld_base->emit_store = store_output_tcs;
5573 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
5574 break;
5575 case PIPE_SHADER_TESS_EVAL:
5576 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5577 if (shader->key.as_es)
5578 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5579 else
5580 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
5581 break;
5582 case PIPE_SHADER_GEOMETRY:
5583 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
5584 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
5585 break;
5586 case PIPE_SHADER_FRAGMENT:
5587 ctx->load_input = declare_input_fs;
5588 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
5589 break;
5590 case PIPE_SHADER_COMPUTE:
5591 ctx->declare_memory_region = declare_compute_memory;
5592 break;
5593 default:
5594 assert(!"Unsupported shader type");
5595 return false;
5596 }
5597
5598 create_function(ctx);
5599 preload_ring_buffers(ctx);
5600
5601 /* For GFX9 merged shaders:
5602 * - Set EXEC for the first shader. If the prolog is present, set
5603 * EXEC there instead.
5604 * - Add a barrier before the second shader.
5605 * - In the second shader, reset EXEC to ~0 and wrap the main part in
5606 * an if-statement. This is required for correctness in geometry
5607 * shaders, to ensure that empty GS waves do not send GS_EMIT and
5608 * GS_CUT messages.
5609 *
5610 * For monolithic merged shaders, the first shader is wrapped in an
5611 * if-block together with its prolog in si_build_wrapper_function.
5612 */
5613 if (ctx->screen->b.chip_class >= GFX9) {
5614 if (!is_monolithic &&
5615 sel->info.num_instructions > 1 && /* not empty shader */
5616 (shader->key.as_es || shader->key.as_ls) &&
5617 (ctx->type == PIPE_SHADER_TESS_EVAL ||
5618 (ctx->type == PIPE_SHADER_VERTEX &&
5619 !sel->vs_needs_prolog))) {
5620 si_init_exec_from_input(ctx,
5621 ctx->param_merged_wave_info, 0);
5622 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5623 ctx->type == PIPE_SHADER_GEOMETRY) {
5624 if (!is_monolithic)
5625 si_init_exec_full_mask(ctx);
5626
5627 /* The barrier must execute for all shaders in a
5628 * threadgroup.
5629 */
5630 si_llvm_emit_barrier(NULL, bld_base, NULL);
5631
5632 LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
5633 LLVMValueRef ena =
5634 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
5635 ac_get_thread_id(&ctx->ac), num_threads, "");
5636 lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
5637 }
5638 }
5639
5640 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5641 int i;
5642 for (i = 0; i < 4; i++) {
5643 ctx->gs_next_vertex[i] =
5644 lp_build_alloca(&ctx->gallivm,
5645 ctx->i32, "");
5646 }
5647 }
5648
5649 if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
5650 ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
5651 /* This is initialized to 0.0 = not kill. */
5652 ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
5653 }
5654
5655 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
5656 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
5657 return false;
5658 }
5659
5660 si_llvm_build_ret(ctx, ctx->return_value);
5661 return true;
5662 }
5663
5664 /**
5665 * Compute the VS prolog key, which contains all the information needed to
5666 * build the VS prolog function, and set shader->info bits where needed.
5667 *
5668 * \param info Shader info of the vertex shader.
5669 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5670 * \param prolog_key Key of the VS prolog
5671 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5672 * \param key Output shader part key.
5673 */
5674 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5675 unsigned num_input_sgprs,
5676 const struct si_vs_prolog_bits *prolog_key,
5677 struct si_shader *shader_out,
5678 union si_shader_part_key *key)
5679 {
5680 memset(key, 0, sizeof(*key));
5681 key->vs_prolog.states = *prolog_key;
5682 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5683 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5684 key->vs_prolog.as_ls = shader_out->key.as_ls;
5685
5686 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5687 key->vs_prolog.as_ls = 1;
5688 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5689 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5690 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5691 }
5692
5693 /* Enable loading the InstanceID VGPR. */
5694 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5695
5696 if ((key->vs_prolog.states.instance_divisor_is_one |
5697 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5698 shader_out->info.uses_instanceid = true;
5699 }
5700
5701 /**
5702 * Compute the PS prolog key, which contains all the information needed to
5703 * build the PS prolog function, and set related bits in shader->config.
5704 */
5705 static void si_get_ps_prolog_key(struct si_shader *shader,
5706 union si_shader_part_key *key,
5707 bool separate_prolog)
5708 {
5709 struct tgsi_shader_info *info = &shader->selector->info;
5710
5711 memset(key, 0, sizeof(*key));
5712 key->ps_prolog.states = shader->key.part.ps.prolog;
5713 key->ps_prolog.colors_read = info->colors_read;
5714 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
5715 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
5716 key->ps_prolog.wqm = info->uses_derivatives &&
5717 (key->ps_prolog.colors_read ||
5718 key->ps_prolog.states.force_persp_sample_interp ||
5719 key->ps_prolog.states.force_linear_sample_interp ||
5720 key->ps_prolog.states.force_persp_center_interp ||
5721 key->ps_prolog.states.force_linear_center_interp ||
5722 key->ps_prolog.states.bc_optimize_for_persp ||
5723 key->ps_prolog.states.bc_optimize_for_linear);
5724
5725 if (info->colors_read) {
5726 unsigned *color = shader->selector->color_attr_index;
5727
5728 if (shader->key.part.ps.prolog.color_two_side) {
5729 /* BCOLORs are stored after the last input. */
5730 key->ps_prolog.num_interp_inputs = info->num_inputs;
5731 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
5732 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
5733 }
5734
5735 for (unsigned i = 0; i < 2; i++) {
5736 unsigned interp = info->input_interpolate[color[i]];
5737 unsigned location = info->input_interpolate_loc[color[i]];
5738
5739 if (!(info->colors_read & (0xf << i*4)))
5740 continue;
5741
5742 key->ps_prolog.color_attr_index[i] = color[i];
5743
5744 if (shader->key.part.ps.prolog.flatshade_colors &&
5745 interp == TGSI_INTERPOLATE_COLOR)
5746 interp = TGSI_INTERPOLATE_CONSTANT;
5747
5748 switch (interp) {
5749 case TGSI_INTERPOLATE_CONSTANT:
5750 key->ps_prolog.color_interp_vgpr_index[i] = -1;
5751 break;
5752 case TGSI_INTERPOLATE_PERSPECTIVE:
5753 case TGSI_INTERPOLATE_COLOR:
5754 /* Force the interpolation location for colors here. */
5755 if (shader->key.part.ps.prolog.force_persp_sample_interp)
5756 location = TGSI_INTERPOLATE_LOC_SAMPLE;
5757 if (shader->key.part.ps.prolog.force_persp_center_interp)
5758 location = TGSI_INTERPOLATE_LOC_CENTER;
5759
5760 switch (location) {
5761 case TGSI_INTERPOLATE_LOC_SAMPLE:
5762 key->ps_prolog.color_interp_vgpr_index[i] = 0;
5763 shader->config.spi_ps_input_ena |=
5764 S_0286CC_PERSP_SAMPLE_ENA(1);
5765 break;
5766 case TGSI_INTERPOLATE_LOC_CENTER:
5767 key->ps_prolog.color_interp_vgpr_index[i] = 2;
5768 shader->config.spi_ps_input_ena |=
5769 S_0286CC_PERSP_CENTER_ENA(1);
5770 break;
5771 case TGSI_INTERPOLATE_LOC_CENTROID:
5772 key->ps_prolog.color_interp_vgpr_index[i] = 4;
5773 shader->config.spi_ps_input_ena |=
5774 S_0286CC_PERSP_CENTROID_ENA(1);
5775 break;
5776 default:
5777 assert(0);
5778 }
5779 break;
5780 case TGSI_INTERPOLATE_LINEAR:
5781 /* Force the interpolation location for colors here. */
5782 if (shader->key.part.ps.prolog.force_linear_sample_interp)
5783 location = TGSI_INTERPOLATE_LOC_SAMPLE;
5784 if (shader->key.part.ps.prolog.force_linear_center_interp)
5785 location = TGSI_INTERPOLATE_LOC_CENTER;
5786
5787 /* The VGPR assignment for non-monolithic shaders
5788 * works because InitialPSInputAddr is set on the
5789 * main shader and PERSP_PULL_MODEL is never used.
5790 */
5791 switch (location) {
5792 case TGSI_INTERPOLATE_LOC_SAMPLE:
5793 key->ps_prolog.color_interp_vgpr_index[i] =
5794 separate_prolog ? 6 : 9;
5795 shader->config.spi_ps_input_ena |=
5796 S_0286CC_LINEAR_SAMPLE_ENA(1);
5797 break;
5798 case TGSI_INTERPOLATE_LOC_CENTER:
5799 key->ps_prolog.color_interp_vgpr_index[i] =
5800 separate_prolog ? 8 : 11;
5801 shader->config.spi_ps_input_ena |=
5802 S_0286CC_LINEAR_CENTER_ENA(1);
5803 break;
5804 case TGSI_INTERPOLATE_LOC_CENTROID:
5805 key->ps_prolog.color_interp_vgpr_index[i] =
5806 separate_prolog ? 10 : 13;
5807 shader->config.spi_ps_input_ena |=
5808 S_0286CC_LINEAR_CENTROID_ENA(1);
5809 break;
5810 default:
5811 assert(0);
5812 }
5813 break;
5814 default:
5815 assert(0);
5816 }
5817 }
5818 }
5819 }
5820
5821 /**
5822 * Check whether a PS prolog is required based on the key.
5823 */
5824 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5825 {
5826 return key->ps_prolog.colors_read ||
5827 key->ps_prolog.states.force_persp_sample_interp ||
5828 key->ps_prolog.states.force_linear_sample_interp ||
5829 key->ps_prolog.states.force_persp_center_interp ||
5830 key->ps_prolog.states.force_linear_center_interp ||
5831 key->ps_prolog.states.bc_optimize_for_persp ||
5832 key->ps_prolog.states.bc_optimize_for_linear ||
5833 key->ps_prolog.states.poly_stipple;
5834 }
5835
5836 /**
5837 * Compute the PS epilog key, which contains all the information needed to
5838 * build the PS epilog function.
5839 */
5840 static void si_get_ps_epilog_key(struct si_shader *shader,
5841 union si_shader_part_key *key)
5842 {
5843 struct tgsi_shader_info *info = &shader->selector->info;
5844 memset(key, 0, sizeof(*key));
5845 key->ps_epilog.colors_written = info->colors_written;
5846 key->ps_epilog.writes_z = info->writes_z;
5847 key->ps_epilog.writes_stencil = info->writes_stencil;
5848 key->ps_epilog.writes_samplemask = info->writes_samplemask;
5849 key->ps_epilog.states = shader->key.part.ps.epilog;
5850 }
5851
5852 /**
5853 * Build the GS prolog function. Rotate the input vertices for triangle strips
5854 * with adjacency.
5855 */
5856 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
5857 union si_shader_part_key *key)
5858 {
5859 unsigned num_sgprs, num_vgprs;
5860 struct gallivm_state *gallivm = &ctx->gallivm;
5861 struct si_function_info fninfo;
5862 LLVMBuilderRef builder = gallivm->builder;
5863 LLVMTypeRef returns[48];
5864 LLVMValueRef func, ret;
5865
5866 si_init_function_info(&fninfo);
5867
5868 if (ctx->screen->b.chip_class >= GFX9) {
5869 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
5870 num_vgprs = 5; /* ES inputs are not needed by GS */
5871 } else {
5872 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
5873 num_vgprs = 8;
5874 }
5875
5876 for (unsigned i = 0; i < num_sgprs; ++i) {
5877 add_arg(&fninfo, ARG_SGPR, ctx->i32);
5878 returns[i] = ctx->i32;
5879 }
5880
5881 for (unsigned i = 0; i < num_vgprs; ++i) {
5882 add_arg(&fninfo, ARG_VGPR, ctx->i32);
5883 returns[num_sgprs + i] = ctx->f32;
5884 }
5885
5886 /* Create the function. */
5887 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
5888 &fninfo, 0);
5889 func = ctx->main_fn;
5890
5891 /* Set the full EXEC mask for the prolog, because we are only fiddling
5892 * with registers here. The main shader part will set the correct EXEC
5893 * mask.
5894 */
5895 if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
5896 si_init_exec_full_mask(ctx);
5897
5898 /* Copy inputs to outputs. This should be no-op, as the registers match,
5899 * but it will prevent the compiler from overwriting them unintentionally.
5900 */
5901 ret = ctx->return_value;
5902 for (unsigned i = 0; i < num_sgprs; i++) {
5903 LLVMValueRef p = LLVMGetParam(func, i);
5904 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
5905 }
5906 for (unsigned i = 0; i < num_vgprs; i++) {
5907 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
5908 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
5909 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
5910 }
5911
5912 if (key->gs_prolog.states.tri_strip_adj_fix) {
5913 /* Remap the input vertices for every other primitive. */
5914 const unsigned gfx6_vtx_params[6] = {
5915 num_sgprs,
5916 num_sgprs + 1,
5917 num_sgprs + 3,
5918 num_sgprs + 4,
5919 num_sgprs + 5,
5920 num_sgprs + 6
5921 };
5922 const unsigned gfx9_vtx_params[3] = {
5923 num_sgprs,
5924 num_sgprs + 1,
5925 num_sgprs + 4,
5926 };
5927 LLVMValueRef vtx_in[6], vtx_out[6];
5928 LLVMValueRef prim_id, rotate;
5929
5930 if (ctx->screen->b.chip_class >= GFX9) {
5931 for (unsigned i = 0; i < 3; i++) {
5932 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
5933 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
5934 }
5935 } else {
5936 for (unsigned i = 0; i < 6; i++)
5937 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
5938 }
5939
5940 prim_id = LLVMGetParam(func, num_sgprs + 2);
5941 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
5942
5943 for (unsigned i = 0; i < 6; ++i) {
5944 LLVMValueRef base, rotated;
5945 base = vtx_in[i];
5946 rotated = vtx_in[(i + 4) % 6];
5947 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
5948 }
5949
5950 if (ctx->screen->b.chip_class >= GFX9) {
5951 for (unsigned i = 0; i < 3; i++) {
5952 LLVMValueRef hi, out;
5953
5954 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
5955 LLVMConstInt(ctx->i32, 16, 0), "");
5956 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
5957 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
5958 ret = LLVMBuildInsertValue(builder, ret, out,
5959 gfx9_vtx_params[i], "");
5960 }
5961 } else {
5962 for (unsigned i = 0; i < 6; i++) {
5963 LLVMValueRef out;
5964
5965 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
5966 ret = LLVMBuildInsertValue(builder, ret, out,
5967 gfx6_vtx_params[i], "");
5968 }
5969 }
5970 }
5971
5972 LLVMBuildRet(builder, ret);
5973 }
5974
5975 /**
5976 * Given a list of shader part functions, build a wrapper function that
5977 * runs them in sequence to form a monolithic shader.
5978 */
5979 static void si_build_wrapper_function(struct si_shader_context *ctx,
5980 LLVMValueRef *parts,
5981 unsigned num_parts,
5982 unsigned main_part,
5983 unsigned next_shader_first_part)
5984 {
5985 struct gallivm_state *gallivm = &ctx->gallivm;
5986 LLVMBuilderRef builder = ctx->gallivm.builder;
5987 /* PS epilog has one arg per color component; gfx9 merged shader
5988 * prologs need to forward 32 user SGPRs.
5989 */
5990 struct si_function_info fninfo;
5991 LLVMValueRef initial[64], out[64];
5992 LLVMTypeRef function_type;
5993 unsigned num_first_params;
5994 unsigned num_out, initial_num_out;
5995 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
5996 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
5997 unsigned num_sgprs, num_vgprs;
5998 unsigned gprs;
5999 struct lp_build_if_state if_state;
6000
6001 si_init_function_info(&fninfo);
6002
6003 for (unsigned i = 0; i < num_parts; ++i) {
6004 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
6005 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6006 }
6007
6008 /* The parameters of the wrapper function correspond to those of the
6009 * first part in terms of SGPRs and VGPRs, but we use the types of the
6010 * main part to get the right types. This is relevant for the
6011 * dereferenceable attribute on descriptor table pointers.
6012 */
6013 num_sgprs = 0;
6014 num_vgprs = 0;
6015
6016 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6017 num_first_params = LLVMCountParamTypes(function_type);
6018
6019 for (unsigned i = 0; i < num_first_params; ++i) {
6020 LLVMValueRef param = LLVMGetParam(parts[0], i);
6021
6022 if (ac_is_sgpr_param(param)) {
6023 assert(num_vgprs == 0);
6024 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
6025 } else {
6026 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
6027 }
6028 }
6029
6030 gprs = 0;
6031 while (gprs < num_sgprs + num_vgprs) {
6032 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6033 LLVMTypeRef type = LLVMTypeOf(param);
6034 unsigned size = llvm_get_type_size(type) / 4;
6035
6036 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6037
6038 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6039 assert(gprs + size <= num_sgprs + num_vgprs &&
6040 (gprs >= num_sgprs || gprs + size <= num_sgprs));
6041
6042 gprs += size;
6043 }
6044
6045 si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6046 si_get_max_workgroup_size(ctx->shader));
6047
6048 if (is_merged_shader(ctx->shader))
6049 si_init_exec_full_mask(ctx);
6050
6051 /* Record the arguments of the function as if they were an output of
6052 * a previous part.
6053 */
6054 num_out = 0;
6055 num_out_sgpr = 0;
6056
6057 for (unsigned i = 0; i < fninfo.num_params; ++i) {
6058 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6059 LLVMTypeRef param_type = LLVMTypeOf(param);
6060 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6061 unsigned size = llvm_get_type_size(param_type) / 4;
6062
6063 if (size == 1) {
6064 if (param_type != out_type)
6065 param = LLVMBuildBitCast(builder, param, out_type, "");
6066 out[num_out++] = param;
6067 } else {
6068 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6069
6070 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6071 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6072 param_type = ctx->i64;
6073 }
6074
6075 if (param_type != vector_type)
6076 param = LLVMBuildBitCast(builder, param, vector_type, "");
6077
6078 for (unsigned j = 0; j < size; ++j)
6079 out[num_out++] = LLVMBuildExtractElement(
6080 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6081 }
6082
6083 if (i < fninfo.num_sgpr_params)
6084 num_out_sgpr = num_out;
6085 }
6086
6087 memcpy(initial, out, sizeof(out));
6088 initial_num_out = num_out;
6089 initial_num_out_sgpr = num_out_sgpr;
6090
6091 /* Now chain the parts. */
6092 for (unsigned part = 0; part < num_parts; ++part) {
6093 LLVMValueRef in[48];
6094 LLVMValueRef ret;
6095 LLVMTypeRef ret_type;
6096 unsigned out_idx = 0;
6097 unsigned num_params = LLVMCountParams(parts[part]);
6098
6099 /* Merged shaders are executed conditionally depending
6100 * on the number of enabled threads passed in the input SGPRs. */
6101 if (is_merged_shader(ctx->shader) && part == 0) {
6102 LLVMValueRef ena, count = initial[3];
6103
6104 count = LLVMBuildAnd(builder, count,
6105 LLVMConstInt(ctx->i32, 0x7f, 0), "");
6106 ena = LLVMBuildICmp(builder, LLVMIntULT,
6107 ac_get_thread_id(&ctx->ac), count, "");
6108 lp_build_if(&if_state, &ctx->gallivm, ena);
6109 }
6110
6111 /* Derive arguments for the next part from outputs of the
6112 * previous one.
6113 */
6114 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6115 LLVMValueRef param;
6116 LLVMTypeRef param_type;
6117 bool is_sgpr;
6118 unsigned param_size;
6119 LLVMValueRef arg = NULL;
6120
6121 param = LLVMGetParam(parts[part], param_idx);
6122 param_type = LLVMTypeOf(param);
6123 param_size = llvm_get_type_size(param_type) / 4;
6124 is_sgpr = ac_is_sgpr_param(param);
6125
6126 if (is_sgpr) {
6127 #if HAVE_LLVM < 0x0400
6128 LLVMRemoveAttribute(param, LLVMByValAttribute);
6129 #else
6130 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
6131 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
6132 #endif
6133 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
6134 }
6135
6136 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6137 assert(is_sgpr || out_idx >= num_out_sgpr);
6138
6139 if (param_size == 1)
6140 arg = out[out_idx];
6141 else
6142 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
6143
6144 if (LLVMTypeOf(arg) != param_type) {
6145 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6146 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6147 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6148 } else {
6149 arg = LLVMBuildBitCast(builder, arg, param_type, "");
6150 }
6151 }
6152
6153 in[param_idx] = arg;
6154 out_idx += param_size;
6155 }
6156
6157 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6158
6159 if (is_merged_shader(ctx->shader) &&
6160 part + 1 == next_shader_first_part) {
6161 lp_build_endif(&if_state);
6162
6163 /* The second half of the merged shader should use
6164 * the inputs from the toplevel (wrapper) function,
6165 * not the return value from the last call.
6166 *
6167 * That's because the last call was executed condi-
6168 * tionally, so we can't consume it in the main
6169 * block.
6170 */
6171 memcpy(out, initial, sizeof(initial));
6172 num_out = initial_num_out;
6173 num_out_sgpr = initial_num_out_sgpr;
6174 continue;
6175 }
6176
6177 /* Extract the returned GPRs. */
6178 ret_type = LLVMTypeOf(ret);
6179 num_out = 0;
6180 num_out_sgpr = 0;
6181
6182 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6183 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6184
6185 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6186
6187 for (unsigned i = 0; i < ret_size; ++i) {
6188 LLVMValueRef val =
6189 LLVMBuildExtractValue(builder, ret, i, "");
6190
6191 assert(num_out < ARRAY_SIZE(out));
6192 out[num_out++] = val;
6193
6194 if (LLVMTypeOf(val) == ctx->i32) {
6195 assert(num_out_sgpr + 1 == num_out);
6196 num_out_sgpr = num_out;
6197 }
6198 }
6199 }
6200 }
6201
6202 LLVMBuildRetVoid(builder);
6203 }
6204
6205 int si_compile_tgsi_shader(struct si_screen *sscreen,
6206 LLVMTargetMachineRef tm,
6207 struct si_shader *shader,
6208 bool is_monolithic,
6209 struct pipe_debug_callback *debug)
6210 {
6211 struct si_shader_selector *sel = shader->selector;
6212 struct si_shader_context ctx;
6213 int r = -1;
6214
6215 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6216 * conversion fails. */
6217 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6218 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6219 if (sel->tokens)
6220 tgsi_dump(sel->tokens, 0);
6221 else
6222 nir_print_shader(sel->nir, stderr);
6223 si_dump_streamout(&sel->so);
6224 }
6225
6226 si_init_shader_ctx(&ctx, sscreen, tm);
6227 si_llvm_context_set_tgsi(&ctx, shader);
6228 ctx.separate_prolog = !is_monolithic;
6229
6230 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6231 sizeof(shader->info.vs_output_param_offset));
6232
6233 shader->info.uses_instanceid = sel->info.uses_instanceid;
6234
6235 ctx.load_system_value = declare_system_value;
6236
6237 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6238 si_llvm_dispose(&ctx);
6239 return -1;
6240 }
6241
6242 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6243 LLVMValueRef parts[2];
6244 bool need_prolog = sel->vs_needs_prolog;
6245
6246 parts[1] = ctx.main_fn;
6247
6248 if (need_prolog) {
6249 union si_shader_part_key prolog_key;
6250 si_get_vs_prolog_key(&sel->info,
6251 shader->info.num_input_sgprs,
6252 &shader->key.part.vs.prolog,
6253 shader, &prolog_key);
6254 si_build_vs_prolog_function(&ctx, &prolog_key);
6255 parts[0] = ctx.main_fn;
6256 }
6257
6258 si_build_wrapper_function(&ctx, parts + !need_prolog,
6259 1 + need_prolog, need_prolog, 0);
6260 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6261 if (sscreen->b.chip_class >= GFX9) {
6262 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6263 LLVMValueRef parts[4];
6264
6265 /* TCS main part */
6266 parts[2] = ctx.main_fn;
6267
6268 /* TCS epilog */
6269 union si_shader_part_key tcs_epilog_key;
6270 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6271 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6272 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6273 parts[3] = ctx.main_fn;
6274
6275 /* VS prolog */
6276 if (ls->vs_needs_prolog) {
6277 union si_shader_part_key vs_prolog_key;
6278 si_get_vs_prolog_key(&ls->info,
6279 shader->info.num_input_sgprs,
6280 &shader->key.part.tcs.ls_prolog,
6281 shader, &vs_prolog_key);
6282 vs_prolog_key.vs_prolog.is_monolithic = true;
6283 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6284 parts[0] = ctx.main_fn;
6285 }
6286
6287 /* VS as LS main part */
6288 struct si_shader shader_ls = {};
6289 shader_ls.selector = ls;
6290 shader_ls.key.as_ls = 1;
6291 shader_ls.key.mono = shader->key.mono;
6292 shader_ls.key.opt = shader->key.opt;
6293 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6294
6295 if (!si_compile_tgsi_main(&ctx, true)) {
6296 si_llvm_dispose(&ctx);
6297 return -1;
6298 }
6299 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6300 parts[1] = ctx.main_fn;
6301
6302 /* Reset the shader context. */
6303 ctx.shader = shader;
6304 ctx.type = PIPE_SHADER_TESS_CTRL;
6305
6306 si_build_wrapper_function(&ctx,
6307 parts + !ls->vs_needs_prolog,
6308 4 - !ls->vs_needs_prolog, 0,
6309 ls->vs_needs_prolog ? 2 : 1);
6310 } else {
6311 LLVMValueRef parts[2];
6312 union si_shader_part_key epilog_key;
6313
6314 parts[0] = ctx.main_fn;
6315
6316 memset(&epilog_key, 0, sizeof(epilog_key));
6317 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6318 si_build_tcs_epilog_function(&ctx, &epilog_key);
6319 parts[1] = ctx.main_fn;
6320
6321 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6322 }
6323 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6324 if (ctx.screen->b.chip_class >= GFX9) {
6325 struct si_shader_selector *es = shader->key.part.gs.es;
6326 LLVMValueRef es_prolog = NULL;
6327 LLVMValueRef es_main = NULL;
6328 LLVMValueRef gs_prolog = NULL;
6329 LLVMValueRef gs_main = ctx.main_fn;
6330
6331 /* GS prolog */
6332 union si_shader_part_key gs_prolog_key;
6333 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6334 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6335 gs_prolog_key.gs_prolog.is_monolithic = true;
6336 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6337 gs_prolog = ctx.main_fn;
6338
6339 /* ES prolog */
6340 if (es->vs_needs_prolog) {
6341 union si_shader_part_key vs_prolog_key;
6342 si_get_vs_prolog_key(&es->info,
6343 shader->info.num_input_sgprs,
6344 &shader->key.part.tcs.ls_prolog,
6345 shader, &vs_prolog_key);
6346 vs_prolog_key.vs_prolog.is_monolithic = true;
6347 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6348 es_prolog = ctx.main_fn;
6349 }
6350
6351 /* ES main part */
6352 struct si_shader shader_es = {};
6353 shader_es.selector = es;
6354 shader_es.key.as_es = 1;
6355 shader_es.key.mono = shader->key.mono;
6356 shader_es.key.opt = shader->key.opt;
6357 si_llvm_context_set_tgsi(&ctx, &shader_es);
6358
6359 if (!si_compile_tgsi_main(&ctx, true)) {
6360 si_llvm_dispose(&ctx);
6361 return -1;
6362 }
6363 shader->info.uses_instanceid |= es->info.uses_instanceid;
6364 es_main = ctx.main_fn;
6365
6366 /* Reset the shader context. */
6367 ctx.shader = shader;
6368 ctx.type = PIPE_SHADER_GEOMETRY;
6369
6370 /* Prepare the array of shader parts. */
6371 LLVMValueRef parts[4];
6372 unsigned num_parts = 0, main_part, next_first_part;
6373
6374 if (es_prolog)
6375 parts[num_parts++] = es_prolog;
6376
6377 parts[main_part = num_parts++] = es_main;
6378 parts[next_first_part = num_parts++] = gs_prolog;
6379 parts[num_parts++] = gs_main;
6380
6381 si_build_wrapper_function(&ctx, parts, num_parts,
6382 main_part, next_first_part);
6383 } else {
6384 LLVMValueRef parts[2];
6385 union si_shader_part_key prolog_key;
6386
6387 parts[1] = ctx.main_fn;
6388
6389 memset(&prolog_key, 0, sizeof(prolog_key));
6390 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6391 si_build_gs_prolog_function(&ctx, &prolog_key);
6392 parts[0] = ctx.main_fn;
6393
6394 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6395 }
6396 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6397 LLVMValueRef parts[3];
6398 union si_shader_part_key prolog_key;
6399 union si_shader_part_key epilog_key;
6400 bool need_prolog;
6401
6402 si_get_ps_prolog_key(shader, &prolog_key, false);
6403 need_prolog = si_need_ps_prolog(&prolog_key);
6404
6405 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6406
6407 if (need_prolog) {
6408 si_build_ps_prolog_function(&ctx, &prolog_key);
6409 parts[0] = ctx.main_fn;
6410 }
6411
6412 si_get_ps_epilog_key(shader, &epilog_key);
6413 si_build_ps_epilog_function(&ctx, &epilog_key);
6414 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6415
6416 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6417 need_prolog ? 1 : 0, 0);
6418 }
6419
6420 si_llvm_optimize_module(&ctx);
6421
6422 /* Post-optimization transformations and analysis. */
6423 si_optimize_vs_outputs(&ctx);
6424
6425 if ((debug && debug->debug_message) ||
6426 r600_can_dump_shader(&sscreen->b, ctx.type))
6427 si_count_scratch_private_memory(&ctx);
6428
6429 /* Compile to bytecode. */
6430 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6431 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6432 si_llvm_dispose(&ctx);
6433 if (r) {
6434 fprintf(stderr, "LLVM failed to compile shader\n");
6435 return r;
6436 }
6437
6438 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6439 * LLVM 3.9svn has this bug.
6440 */
6441 if (sel->type == PIPE_SHADER_COMPUTE) {
6442 unsigned wave_size = 64;
6443 unsigned max_vgprs = 256;
6444 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6445 unsigned max_sgprs_per_wave = 128;
6446 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6447 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6448 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6449
6450 max_vgprs = max_vgprs / min_waves_per_simd;
6451 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6452
6453 if (shader->config.num_sgprs > max_sgprs ||
6454 shader->config.num_vgprs > max_vgprs) {
6455 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6456 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6457 shader->config.num_sgprs, shader->config.num_vgprs,
6458 max_sgprs, max_vgprs);
6459
6460 /* Just terminate the process, because dependent
6461 * shaders can hang due to bad input data, but use
6462 * the env var to allow shader-db to work.
6463 */
6464 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6465 abort();
6466 }
6467 }
6468
6469 /* Add the scratch offset to input SGPRs. */
6470 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6471 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6472
6473 /* Calculate the number of fragment input VGPRs. */
6474 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6475 shader->info.num_input_vgprs = 0;
6476 shader->info.face_vgpr_index = -1;
6477
6478 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6479 shader->info.num_input_vgprs += 2;
6480 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6481 shader->info.num_input_vgprs += 2;
6482 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6483 shader->info.num_input_vgprs += 2;
6484 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6485 shader->info.num_input_vgprs += 3;
6486 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6487 shader->info.num_input_vgprs += 2;
6488 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6489 shader->info.num_input_vgprs += 2;
6490 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6491 shader->info.num_input_vgprs += 2;
6492 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6493 shader->info.num_input_vgprs += 1;
6494 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6495 shader->info.num_input_vgprs += 1;
6496 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6497 shader->info.num_input_vgprs += 1;
6498 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6499 shader->info.num_input_vgprs += 1;
6500 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6501 shader->info.num_input_vgprs += 1;
6502 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6503 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6504 shader->info.num_input_vgprs += 1;
6505 }
6506 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6507 shader->info.num_input_vgprs += 1;
6508 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6509 shader->info.num_input_vgprs += 1;
6510 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6511 shader->info.num_input_vgprs += 1;
6512 }
6513
6514 return 0;
6515 }
6516
6517 /**
6518 * Create, compile and return a shader part (prolog or epilog).
6519 *
6520 * \param sscreen screen
6521 * \param list list of shader parts of the same category
6522 * \param type shader type
6523 * \param key shader part key
6524 * \param prolog whether the part being requested is a prolog
6525 * \param tm LLVM target machine
6526 * \param debug debug callback
6527 * \param build the callback responsible for building the main function
6528 * \return non-NULL on success
6529 */
6530 static struct si_shader_part *
6531 si_get_shader_part(struct si_screen *sscreen,
6532 struct si_shader_part **list,
6533 enum pipe_shader_type type,
6534 bool prolog,
6535 union si_shader_part_key *key,
6536 LLVMTargetMachineRef tm,
6537 struct pipe_debug_callback *debug,
6538 void (*build)(struct si_shader_context *,
6539 union si_shader_part_key *),
6540 const char *name)
6541 {
6542 struct si_shader_part *result;
6543
6544 mtx_lock(&sscreen->shader_parts_mutex);
6545
6546 /* Find existing. */
6547 for (result = *list; result; result = result->next) {
6548 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6549 mtx_unlock(&sscreen->shader_parts_mutex);
6550 return result;
6551 }
6552 }
6553
6554 /* Compile a new one. */
6555 result = CALLOC_STRUCT(si_shader_part);
6556 result->key = *key;
6557
6558 struct si_shader shader = {};
6559 struct si_shader_context ctx;
6560 struct gallivm_state *gallivm = &ctx.gallivm;
6561
6562 si_init_shader_ctx(&ctx, sscreen, tm);
6563 ctx.shader = &shader;
6564 ctx.type = type;
6565
6566 switch (type) {
6567 case PIPE_SHADER_VERTEX:
6568 break;
6569 case PIPE_SHADER_TESS_CTRL:
6570 assert(!prolog);
6571 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6572 break;
6573 case PIPE_SHADER_GEOMETRY:
6574 assert(prolog);
6575 break;
6576 case PIPE_SHADER_FRAGMENT:
6577 if (prolog)
6578 shader.key.part.ps.prolog = key->ps_prolog.states;
6579 else
6580 shader.key.part.ps.epilog = key->ps_epilog.states;
6581 break;
6582 default:
6583 unreachable("bad shader part");
6584 }
6585
6586 build(&ctx, key);
6587
6588 /* Compile. */
6589 si_llvm_optimize_module(&ctx);
6590
6591 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6592 gallivm->module, debug, ctx.type, name)) {
6593 FREE(result);
6594 result = NULL;
6595 goto out;
6596 }
6597
6598 result->next = *list;
6599 *list = result;
6600
6601 out:
6602 si_llvm_dispose(&ctx);
6603 mtx_unlock(&sscreen->shader_parts_mutex);
6604 return result;
6605 }
6606
6607 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6608 {
6609 struct gallivm_state *gallivm = &ctx->gallivm;
6610 LLVMValueRef ptr[2], list;
6611
6612 /* Get the pointer to rw buffers. */
6613 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6614 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6615 list = lp_build_gather_values(gallivm, ptr, 2);
6616 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6617 list = LLVMBuildIntToPtr(gallivm->builder, list,
6618 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6619 return list;
6620 }
6621
6622 /**
6623 * Build the vertex shader prolog function.
6624 *
6625 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6626 * All inputs are returned unmodified. The vertex load indices are
6627 * stored after them, which will be used by the API VS for fetching inputs.
6628 *
6629 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6630 * input_v0,
6631 * input_v1,
6632 * input_v2,
6633 * input_v3,
6634 * (VertexID + BaseVertex),
6635 * (InstanceID + StartInstance),
6636 * (InstanceID / 2 + StartInstance)
6637 */
6638 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6639 union si_shader_part_key *key)
6640 {
6641 struct gallivm_state *gallivm = &ctx->gallivm;
6642 struct si_function_info fninfo;
6643 LLVMTypeRef *returns;
6644 LLVMValueRef ret, func;
6645 int num_returns, i;
6646 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
6647 key->vs_prolog.num_merged_next_stage_vgprs;
6648 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6649 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6650 num_input_vgprs;
6651 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6652
6653 si_init_function_info(&fninfo);
6654
6655 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6656 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
6657 sizeof(LLVMTypeRef));
6658 num_returns = 0;
6659
6660 /* Declare input and output SGPRs. */
6661 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6662 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6663 returns[num_returns++] = ctx->i32;
6664 }
6665
6666 /* Preloaded VGPRs (outputs must be floats) */
6667 for (i = 0; i < num_input_vgprs; i++) {
6668 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6669 returns[num_returns++] = ctx->f32;
6670 }
6671
6672 fninfo.assign[first_vs_vgpr] = &ctx->abi.vertex_id;
6673 fninfo.assign[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)] = &ctx->abi.instance_id;
6674
6675 /* Vertex load indices. */
6676 for (i = 0; i <= key->vs_prolog.last_input; i++)
6677 returns[num_returns++] = ctx->f32;
6678
6679 /* Create the function. */
6680 si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
6681 func = ctx->main_fn;
6682
6683 if (key->vs_prolog.num_merged_next_stage_vgprs &&
6684 !key->vs_prolog.is_monolithic)
6685 si_init_exec_from_input(ctx, 3, 0);
6686
6687 /* Copy inputs to outputs. This should be no-op, as the registers match,
6688 * but it will prevent the compiler from overwriting them unintentionally.
6689 */
6690 ret = ctx->return_value;
6691 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6692 LLVMValueRef p = LLVMGetParam(func, i);
6693 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6694 }
6695 for (; i < fninfo.num_params; i++) {
6696 LLVMValueRef p = LLVMGetParam(func, i);
6697 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
6698 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6699 }
6700
6701 /* Compute vertex load indices from instance divisors. */
6702 LLVMValueRef instance_divisor_constbuf = NULL;
6703
6704 if (key->vs_prolog.states.instance_divisor_is_fetched) {
6705 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6706 LLVMValueRef buf_index =
6707 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6708 instance_divisor_constbuf =
6709 ac_build_indexed_load_const(&ctx->ac, list, buf_index);
6710 }
6711
6712 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6713 bool divisor_is_one =
6714 key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6715 bool divisor_is_fetched =
6716 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6717 LLVMValueRef index;
6718
6719 if (divisor_is_one || divisor_is_fetched) {
6720 LLVMValueRef divisor = ctx->i32_1;
6721
6722 if (divisor_is_fetched) {
6723 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
6724 LLVMConstInt(ctx->i32, i * 4, 0));
6725 divisor = LLVMBuildBitCast(gallivm->builder, divisor,
6726 ctx->i32, "");
6727 }
6728
6729 /* InstanceID / Divisor + StartInstance */
6730 index = get_instance_index_for_fetch(ctx,
6731 user_sgpr_base +
6732 SI_SGPR_START_INSTANCE,
6733 divisor);
6734 } else {
6735 /* VertexID + BaseVertex */
6736 index = LLVMBuildAdd(gallivm->builder,
6737 ctx->abi.vertex_id,
6738 LLVMGetParam(func, user_sgpr_base +
6739 SI_SGPR_BASE_VERTEX), "");
6740 }
6741
6742 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
6743 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6744 fninfo.num_params + i, "");
6745 }
6746
6747 si_llvm_build_ret(ctx, ret);
6748 }
6749
6750 static bool si_get_vs_prolog(struct si_screen *sscreen,
6751 LLVMTargetMachineRef tm,
6752 struct si_shader *shader,
6753 struct pipe_debug_callback *debug,
6754 struct si_shader *main_part,
6755 const struct si_vs_prolog_bits *key)
6756 {
6757 struct si_shader_selector *vs = main_part->selector;
6758
6759 /* The prolog is a no-op if there are no inputs. */
6760 if (!vs->vs_needs_prolog)
6761 return true;
6762
6763 /* Get the prolog. */
6764 union si_shader_part_key prolog_key;
6765 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6766 key, shader, &prolog_key);
6767
6768 shader->prolog =
6769 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6770 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6771 debug, si_build_vs_prolog_function,
6772 "Vertex Shader Prolog");
6773 return shader->prolog != NULL;
6774 }
6775
6776 /**
6777 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6778 */
6779 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6780 LLVMTargetMachineRef tm,
6781 struct si_shader *shader,
6782 struct pipe_debug_callback *debug)
6783 {
6784 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6785 &shader->key.part.vs.prolog);
6786 }
6787
6788 /**
6789 * Compile the TCS epilog function. This writes tesselation factors to memory
6790 * based on the output primitive type of the tesselator (determined by TES).
6791 */
6792 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
6793 union si_shader_part_key *key)
6794 {
6795 struct gallivm_state *gallivm = &ctx->gallivm;
6796 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6797 struct si_function_info fninfo;
6798 LLVMValueRef func;
6799
6800 si_init_function_info(&fninfo);
6801
6802 if (ctx->screen->b.chip_class >= GFX9) {
6803 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6804 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6805 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
6806 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6807 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6808 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6809 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6810 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6811 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6812 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6813 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6814 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6815 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6816 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6817 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6818 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6819 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6820 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6821 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6822 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6823 } else {
6824 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6825 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6826 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6827 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6828 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6829 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6830 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6831 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6832 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6833 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6834 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6835 }
6836
6837 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
6838 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
6839 unsigned tess_factors_idx =
6840 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
6841 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
6842 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
6843
6844 /* Create the function. */
6845 si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
6846 ctx->screen->b.chip_class >= CIK ? 128 : 64);
6847 declare_lds_as_pointer(ctx);
6848 func = ctx->main_fn;
6849
6850 si_write_tess_factors(bld_base,
6851 LLVMGetParam(func, tess_factors_idx),
6852 LLVMGetParam(func, tess_factors_idx + 1),
6853 LLVMGetParam(func, tess_factors_idx + 2));
6854
6855 LLVMBuildRetVoid(gallivm->builder);
6856 }
6857
6858 /**
6859 * Select and compile (or reuse) TCS parts (epilog).
6860 */
6861 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6862 LLVMTargetMachineRef tm,
6863 struct si_shader *shader,
6864 struct pipe_debug_callback *debug)
6865 {
6866 if (sscreen->b.chip_class >= GFX9) {
6867 struct si_shader *ls_main_part =
6868 shader->key.part.tcs.ls->main_shader_part_ls;
6869
6870 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
6871 &shader->key.part.tcs.ls_prolog))
6872 return false;
6873
6874 shader->previous_stage = ls_main_part;
6875 }
6876
6877 /* Get the epilog. */
6878 union si_shader_part_key epilog_key;
6879 memset(&epilog_key, 0, sizeof(epilog_key));
6880 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6881
6882 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6883 PIPE_SHADER_TESS_CTRL, false,
6884 &epilog_key, tm, debug,
6885 si_build_tcs_epilog_function,
6886 "Tessellation Control Shader Epilog");
6887 return shader->epilog != NULL;
6888 }
6889
6890 /**
6891 * Select and compile (or reuse) GS parts (prolog).
6892 */
6893 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
6894 LLVMTargetMachineRef tm,
6895 struct si_shader *shader,
6896 struct pipe_debug_callback *debug)
6897 {
6898 if (sscreen->b.chip_class >= GFX9) {
6899 struct si_shader *es_main_part =
6900 shader->key.part.gs.es->main_shader_part_es;
6901
6902 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
6903 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
6904 &shader->key.part.gs.vs_prolog))
6905 return false;
6906
6907 shader->previous_stage = es_main_part;
6908 }
6909
6910 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
6911 return true;
6912
6913 union si_shader_part_key prolog_key;
6914 memset(&prolog_key, 0, sizeof(prolog_key));
6915 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6916
6917 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
6918 PIPE_SHADER_GEOMETRY, true,
6919 &prolog_key, tm, debug,
6920 si_build_gs_prolog_function,
6921 "Geometry Shader Prolog");
6922 return shader->prolog2 != NULL;
6923 }
6924
6925 /**
6926 * Build the pixel shader prolog function. This handles:
6927 * - two-side color selection and interpolation
6928 * - overriding interpolation parameters for the API PS
6929 * - polygon stippling
6930 *
6931 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
6932 * overriden by other states. (e.g. per-sample interpolation)
6933 * Interpolated colors are stored after the preloaded VGPRs.
6934 */
6935 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
6936 union si_shader_part_key *key)
6937 {
6938 struct gallivm_state *gallivm = &ctx->gallivm;
6939 struct si_function_info fninfo;
6940 LLVMValueRef ret, func;
6941 int num_returns, i, num_color_channels;
6942
6943 assert(si_need_ps_prolog(key));
6944
6945 si_init_function_info(&fninfo);
6946
6947 /* Declare inputs. */
6948 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
6949 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6950
6951 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
6952 add_arg(&fninfo, ARG_VGPR, ctx->f32);
6953
6954 /* Declare outputs (same as inputs + add colors if needed) */
6955 num_returns = fninfo.num_params;
6956 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
6957 for (i = 0; i < num_color_channels; i++)
6958 fninfo.types[num_returns++] = ctx->f32;
6959
6960 /* Create the function. */
6961 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
6962 &fninfo, 0);
6963 func = ctx->main_fn;
6964
6965 /* Copy inputs to outputs. This should be no-op, as the registers match,
6966 * but it will prevent the compiler from overwriting them unintentionally.
6967 */
6968 ret = ctx->return_value;
6969 for (i = 0; i < fninfo.num_params; i++) {
6970 LLVMValueRef p = LLVMGetParam(func, i);
6971 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6972 }
6973
6974 /* Polygon stippling. */
6975 if (key->ps_prolog.states.poly_stipple) {
6976 /* POS_FIXED_PT is always last. */
6977 unsigned pos = key->ps_prolog.num_input_sgprs +
6978 key->ps_prolog.num_input_vgprs - 1;
6979 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6980
6981 si_llvm_emit_polygon_stipple(ctx, list, pos);
6982 }
6983
6984 if (key->ps_prolog.states.bc_optimize_for_persp ||
6985 key->ps_prolog.states.bc_optimize_for_linear) {
6986 unsigned i, base = key->ps_prolog.num_input_sgprs;
6987 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
6988
6989 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
6990 * The hw doesn't compute CENTROID if the whole wave only
6991 * contains fully-covered quads.
6992 *
6993 * PRIM_MASK is after user SGPRs.
6994 */
6995 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
6996 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
6997 LLVMConstInt(ctx->i32, 31, 0), "");
6998 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
6999 ctx->i1, "");
7000
7001 if (key->ps_prolog.states.bc_optimize_for_persp) {
7002 /* Read PERSP_CENTER. */
7003 for (i = 0; i < 2; i++)
7004 center[i] = LLVMGetParam(func, base + 2 + i);
7005 /* Read PERSP_CENTROID. */
7006 for (i = 0; i < 2; i++)
7007 centroid[i] = LLVMGetParam(func, base + 4 + i);
7008 /* Select PERSP_CENTROID. */
7009 for (i = 0; i < 2; i++) {
7010 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7011 center[i], centroid[i], "");
7012 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7013 tmp, base + 4 + i, "");
7014 }
7015 }
7016 if (key->ps_prolog.states.bc_optimize_for_linear) {
7017 /* Read LINEAR_CENTER. */
7018 for (i = 0; i < 2; i++)
7019 center[i] = LLVMGetParam(func, base + 8 + i);
7020 /* Read LINEAR_CENTROID. */
7021 for (i = 0; i < 2; i++)
7022 centroid[i] = LLVMGetParam(func, base + 10 + i);
7023 /* Select LINEAR_CENTROID. */
7024 for (i = 0; i < 2; i++) {
7025 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7026 center[i], centroid[i], "");
7027 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7028 tmp, base + 10 + i, "");
7029 }
7030 }
7031 }
7032
7033 /* Force per-sample interpolation. */
7034 if (key->ps_prolog.states.force_persp_sample_interp) {
7035 unsigned i, base = key->ps_prolog.num_input_sgprs;
7036 LLVMValueRef persp_sample[2];
7037
7038 /* Read PERSP_SAMPLE. */
7039 for (i = 0; i < 2; i++)
7040 persp_sample[i] = LLVMGetParam(func, base + i);
7041 /* Overwrite PERSP_CENTER. */
7042 for (i = 0; i < 2; i++)
7043 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7044 persp_sample[i], base + 2 + i, "");
7045 /* Overwrite PERSP_CENTROID. */
7046 for (i = 0; i < 2; i++)
7047 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7048 persp_sample[i], base + 4 + i, "");
7049 }
7050 if (key->ps_prolog.states.force_linear_sample_interp) {
7051 unsigned i, base = key->ps_prolog.num_input_sgprs;
7052 LLVMValueRef linear_sample[2];
7053
7054 /* Read LINEAR_SAMPLE. */
7055 for (i = 0; i < 2; i++)
7056 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7057 /* Overwrite LINEAR_CENTER. */
7058 for (i = 0; i < 2; i++)
7059 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7060 linear_sample[i], base + 8 + i, "");
7061 /* Overwrite LINEAR_CENTROID. */
7062 for (i = 0; i < 2; i++)
7063 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7064 linear_sample[i], base + 10 + i, "");
7065 }
7066
7067 /* Force center interpolation. */
7068 if (key->ps_prolog.states.force_persp_center_interp) {
7069 unsigned i, base = key->ps_prolog.num_input_sgprs;
7070 LLVMValueRef persp_center[2];
7071
7072 /* Read PERSP_CENTER. */
7073 for (i = 0; i < 2; i++)
7074 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7075 /* Overwrite PERSP_SAMPLE. */
7076 for (i = 0; i < 2; i++)
7077 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7078 persp_center[i], base + i, "");
7079 /* Overwrite PERSP_CENTROID. */
7080 for (i = 0; i < 2; i++)
7081 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7082 persp_center[i], base + 4 + i, "");
7083 }
7084 if (key->ps_prolog.states.force_linear_center_interp) {
7085 unsigned i, base = key->ps_prolog.num_input_sgprs;
7086 LLVMValueRef linear_center[2];
7087
7088 /* Read LINEAR_CENTER. */
7089 for (i = 0; i < 2; i++)
7090 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7091 /* Overwrite LINEAR_SAMPLE. */
7092 for (i = 0; i < 2; i++)
7093 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7094 linear_center[i], base + 6 + i, "");
7095 /* Overwrite LINEAR_CENTROID. */
7096 for (i = 0; i < 2; i++)
7097 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7098 linear_center[i], base + 10 + i, "");
7099 }
7100
7101 /* Interpolate colors. */
7102 unsigned color_out_idx = 0;
7103 for (i = 0; i < 2; i++) {
7104 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7105 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7106 key->ps_prolog.face_vgpr_index;
7107 LLVMValueRef interp[2], color[4];
7108 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7109
7110 if (!writemask)
7111 continue;
7112
7113 /* If the interpolation qualifier is not CONSTANT (-1). */
7114 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7115 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7116 key->ps_prolog.color_interp_vgpr_index[i];
7117
7118 /* Get the (i,j) updated by bc_optimize handling. */
7119 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7120 interp_vgpr, "");
7121 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7122 interp_vgpr + 1, "");
7123 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7124 }
7125
7126 /* Use the absolute location of the input. */
7127 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7128
7129 if (key->ps_prolog.states.color_two_side) {
7130 face = LLVMGetParam(func, face_vgpr);
7131 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
7132 }
7133
7134 interp_fs_input(ctx,
7135 key->ps_prolog.color_attr_index[i],
7136 TGSI_SEMANTIC_COLOR, i,
7137 key->ps_prolog.num_interp_inputs,
7138 key->ps_prolog.colors_read, interp_ij,
7139 prim_mask, face, color);
7140
7141 while (writemask) {
7142 unsigned chan = u_bit_scan(&writemask);
7143 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7144 fninfo.num_params + color_out_idx++, "");
7145 }
7146 }
7147
7148 /* Tell LLVM to insert WQM instruction sequence when needed. */
7149 if (key->ps_prolog.wqm) {
7150 LLVMAddTargetDependentFunctionAttr(func,
7151 "amdgpu-ps-wqm-outputs", "");
7152 }
7153
7154 si_llvm_build_ret(ctx, ret);
7155 }
7156
7157 /**
7158 * Build the pixel shader epilog function. This handles everything that must be
7159 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7160 */
7161 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7162 union si_shader_part_key *key)
7163 {
7164 struct gallivm_state *gallivm = &ctx->gallivm;
7165 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7166 struct si_function_info fninfo;
7167 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7168 int i;
7169 struct si_ps_exports exp = {};
7170
7171 si_init_function_info(&fninfo);
7172
7173 /* Declare input SGPRs. */
7174 ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7175 ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7176 ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7177 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7178
7179 /* Declare input VGPRs. */
7180 unsigned required_num_params =
7181 fninfo.num_sgpr_params +
7182 util_bitcount(key->ps_epilog.colors_written) * 4 +
7183 key->ps_epilog.writes_z +
7184 key->ps_epilog.writes_stencil +
7185 key->ps_epilog.writes_samplemask;
7186
7187 required_num_params = MAX2(required_num_params,
7188 fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7189
7190 while (fninfo.num_params < required_num_params)
7191 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7192
7193 /* Create the function. */
7194 si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7195 /* Disable elimination of unused inputs. */
7196 si_llvm_add_attribute(ctx->main_fn,
7197 "InitialPSInputAddr", 0xffffff);
7198
7199 /* Process colors. */
7200 unsigned vgpr = fninfo.num_sgpr_params;
7201 unsigned colors_written = key->ps_epilog.colors_written;
7202 int last_color_export = -1;
7203
7204 /* Find the last color export. */
7205 if (!key->ps_epilog.writes_z &&
7206 !key->ps_epilog.writes_stencil &&
7207 !key->ps_epilog.writes_samplemask) {
7208 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7209
7210 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7211 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7212 /* Just set this if any of the colorbuffers are enabled. */
7213 if (spi_format &
7214 ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7215 last_color_export = 0;
7216 } else {
7217 for (i = 0; i < 8; i++)
7218 if (colors_written & (1 << i) &&
7219 (spi_format >> (i * 4)) & 0xf)
7220 last_color_export = i;
7221 }
7222 }
7223
7224 while (colors_written) {
7225 LLVMValueRef color[4];
7226 int mrt = u_bit_scan(&colors_written);
7227
7228 for (i = 0; i < 4; i++)
7229 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7230
7231 si_export_mrt_color(bld_base, color, mrt,
7232 fninfo.num_params - 1,
7233 mrt == last_color_export, &exp);
7234 }
7235
7236 /* Process depth, stencil, samplemask. */
7237 if (key->ps_epilog.writes_z)
7238 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7239 if (key->ps_epilog.writes_stencil)
7240 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7241 if (key->ps_epilog.writes_samplemask)
7242 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7243
7244 if (depth || stencil || samplemask)
7245 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7246 else if (last_color_export == -1)
7247 si_export_null(bld_base);
7248
7249 if (exp.num)
7250 si_emit_ps_exports(ctx, &exp);
7251
7252 /* Compile. */
7253 LLVMBuildRetVoid(gallivm->builder);
7254 }
7255
7256 /**
7257 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7258 */
7259 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7260 LLVMTargetMachineRef tm,
7261 struct si_shader *shader,
7262 struct pipe_debug_callback *debug)
7263 {
7264 union si_shader_part_key prolog_key;
7265 union si_shader_part_key epilog_key;
7266
7267 /* Get the prolog. */
7268 si_get_ps_prolog_key(shader, &prolog_key, true);
7269
7270 /* The prolog is a no-op if these aren't set. */
7271 if (si_need_ps_prolog(&prolog_key)) {
7272 shader->prolog =
7273 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7274 PIPE_SHADER_FRAGMENT, true,
7275 &prolog_key, tm, debug,
7276 si_build_ps_prolog_function,
7277 "Fragment Shader Prolog");
7278 if (!shader->prolog)
7279 return false;
7280 }
7281
7282 /* Get the epilog. */
7283 si_get_ps_epilog_key(shader, &epilog_key);
7284
7285 shader->epilog =
7286 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7287 PIPE_SHADER_FRAGMENT, false,
7288 &epilog_key, tm, debug,
7289 si_build_ps_epilog_function,
7290 "Fragment Shader Epilog");
7291 if (!shader->epilog)
7292 return false;
7293
7294 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7295 if (shader->key.part.ps.prolog.poly_stipple) {
7296 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7297 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7298 }
7299
7300 /* Set up the enable bits for per-sample shading if needed. */
7301 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7302 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7303 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7304 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7305 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7306 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7307 }
7308 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7309 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7310 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7311 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7312 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7313 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7314 }
7315 if (shader->key.part.ps.prolog.force_persp_center_interp &&
7316 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7317 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7318 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7319 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7320 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7321 }
7322 if (shader->key.part.ps.prolog.force_linear_center_interp &&
7323 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7324 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7325 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7326 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7327 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7328 }
7329
7330 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7331 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7332 !(shader->config.spi_ps_input_ena & 0xf)) {
7333 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7334 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7335 }
7336
7337 /* At least one pair of interpolation weights must be enabled. */
7338 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7339 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7340 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7341 }
7342
7343 /* The sample mask input is always enabled, because the API shader always
7344 * passes it through to the epilog. Disable it here if it's unused.
7345 */
7346 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7347 !shader->selector->info.reads_samplemask)
7348 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7349
7350 return true;
7351 }
7352
7353 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7354 unsigned *lds_size)
7355 {
7356 /* SPI barrier management bug:
7357 * Make sure we have at least 4k of LDS in use to avoid the bug.
7358 * It applies to workgroup sizes of more than one wavefront.
7359 */
7360 if (sscreen->b.family == CHIP_BONAIRE ||
7361 sscreen->b.family == CHIP_KABINI ||
7362 sscreen->b.family == CHIP_MULLINS)
7363 *lds_size = MAX2(*lds_size, 8);
7364 }
7365
7366 static void si_fix_resource_usage(struct si_screen *sscreen,
7367 struct si_shader *shader)
7368 {
7369 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7370
7371 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7372
7373 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7374 si_get_max_workgroup_size(shader) > 64) {
7375 si_multiwave_lds_size_workaround(sscreen,
7376 &shader->config.lds_size);
7377 }
7378 }
7379
7380 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7381 struct si_shader *shader,
7382 struct pipe_debug_callback *debug)
7383 {
7384 struct si_shader_selector *sel = shader->selector;
7385 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7386 int r;
7387
7388 /* LS, ES, VS are compiled on demand if the main part hasn't been
7389 * compiled for that stage.
7390 *
7391 * Vertex shaders are compiled on demand when a vertex fetch
7392 * workaround must be applied.
7393 */
7394 if (shader->is_monolithic) {
7395 /* Monolithic shader (compiled as a whole, has many variants,
7396 * may take a long time to compile).
7397 */
7398 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7399 if (r)
7400 return r;
7401 } else {
7402 /* The shader consists of several parts:
7403 *
7404 * - the middle part is the user shader, it has 1 variant only
7405 * and it was compiled during the creation of the shader
7406 * selector
7407 * - the prolog part is inserted at the beginning
7408 * - the epilog part is inserted at the end
7409 *
7410 * The prolog and epilog have many (but simple) variants.
7411 *
7412 * Starting with gfx9, geometry and tessellation control
7413 * shaders also contain the prolog and user shader parts of
7414 * the previous shader stage.
7415 */
7416
7417 if (!mainp)
7418 return -1;
7419
7420 /* Copy the compiled TGSI shader data over. */
7421 shader->is_binary_shared = true;
7422 shader->binary = mainp->binary;
7423 shader->config = mainp->config;
7424 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7425 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7426 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7427 memcpy(shader->info.vs_output_param_offset,
7428 mainp->info.vs_output_param_offset,
7429 sizeof(mainp->info.vs_output_param_offset));
7430 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7431 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7432 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7433
7434 /* Select prologs and/or epilogs. */
7435 switch (sel->type) {
7436 case PIPE_SHADER_VERTEX:
7437 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7438 return -1;
7439 break;
7440 case PIPE_SHADER_TESS_CTRL:
7441 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7442 return -1;
7443 break;
7444 case PIPE_SHADER_TESS_EVAL:
7445 break;
7446 case PIPE_SHADER_GEOMETRY:
7447 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
7448 return -1;
7449 break;
7450 case PIPE_SHADER_FRAGMENT:
7451 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7452 return -1;
7453
7454 /* Make sure we have at least as many VGPRs as there
7455 * are allocated inputs.
7456 */
7457 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7458 shader->info.num_input_vgprs);
7459 break;
7460 }
7461
7462 /* Update SGPR and VGPR counts. */
7463 if (shader->prolog) {
7464 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7465 shader->prolog->config.num_sgprs);
7466 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7467 shader->prolog->config.num_vgprs);
7468 }
7469 if (shader->previous_stage) {
7470 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7471 shader->previous_stage->config.num_sgprs);
7472 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7473 shader->previous_stage->config.num_vgprs);
7474 shader->config.spilled_sgprs =
7475 MAX2(shader->config.spilled_sgprs,
7476 shader->previous_stage->config.spilled_sgprs);
7477 shader->config.spilled_vgprs =
7478 MAX2(shader->config.spilled_vgprs,
7479 shader->previous_stage->config.spilled_vgprs);
7480 shader->config.private_mem_vgprs =
7481 MAX2(shader->config.private_mem_vgprs,
7482 shader->previous_stage->config.private_mem_vgprs);
7483 shader->config.scratch_bytes_per_wave =
7484 MAX2(shader->config.scratch_bytes_per_wave,
7485 shader->previous_stage->config.scratch_bytes_per_wave);
7486 shader->info.uses_instanceid |=
7487 shader->previous_stage->info.uses_instanceid;
7488 }
7489 if (shader->prolog2) {
7490 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7491 shader->prolog2->config.num_sgprs);
7492 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7493 shader->prolog2->config.num_vgprs);
7494 }
7495 if (shader->epilog) {
7496 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7497 shader->epilog->config.num_sgprs);
7498 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7499 shader->epilog->config.num_vgprs);
7500 }
7501 }
7502
7503 si_fix_resource_usage(sscreen, shader);
7504 si_shader_dump(sscreen, shader, debug, sel->info.processor,
7505 stderr, true);
7506
7507 /* Upload. */
7508 r = si_shader_binary_upload(sscreen, shader);
7509 if (r) {
7510 fprintf(stderr, "LLVM failed to upload shader\n");
7511 return r;
7512 }
7513
7514 return 0;
7515 }
7516
7517 void si_shader_destroy(struct si_shader *shader)
7518 {
7519 if (shader->scratch_bo)
7520 r600_resource_reference(&shader->scratch_bo, NULL);
7521
7522 r600_resource_reference(&shader->bo, NULL);
7523
7524 if (!shader->is_binary_shared)
7525 radeon_shader_binary_clean(&shader->binary);
7526
7527 free(shader->shader_log);
7528 }