radeonsi: don't read the LS output vertex stride from an SGPR in LS
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49 #include "compiler/nir/nir.h"
50
/* Names of the relocation symbols for the two scratch-buffer resource
 * descriptor dwords; presumably patched into the shader binary at upload
 * time — confirm against the binary-upload code. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
56
/* One shader output gathered for export: its TGSI semantic and the four
 * per-component LLVM values. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one value per component (x, y, z, w) */
	unsigned semantic_name;	/* TGSI_SEMANTIC_* */
	unsigned semantic_index;
	/* Per-component stream index; presumably the GS vertex stream each
	 * component is written to — confirm against the GS export code. */
	ubyte vertex_stream[4];
};
64
/**
 * Used to collect types and other info about arguments of the LLVM function
 * before the function is created.
 */
struct si_function_info {
	LLVMTypeRef types[100];		/* type of each declared argument */
	/* Optional destination to store each argument's LLVM value into
	 * once the function exists; NULL entries are skipped. */
	LLVMValueRef *assign[100];
	unsigned num_sgpr_params;	/* SGPR args always precede VGPR args */
	unsigned num_params;		/* total number of declared args */
};
75
/* Register file a function argument is passed in: scalar or vector GPRs. */
enum si_arg_regfile {
	ARG_SGPR,
	ARG_VGPR
};
80
/* Forward declarations of helpers defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* LLVM address spaces used when building pointers for this target. */
enum {
	CONST_ADDR_SPACE = 2,	/* read-only/constant memory */
	LOCAL_ADDR_SPACE = 3,	/* LDS (workgroup-local) memory */
};
112
113 static bool is_merged_shader(struct si_shader *shader)
114 {
115 if (shader->selector->screen->b.chip_class <= VI)
116 return false;
117
118 return shader->key.as_ls ||
119 shader->key.as_es ||
120 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
121 shader->selector->type == PIPE_SHADER_GEOMETRY;
122 }
123
124 static void si_init_function_info(struct si_function_info *fninfo)
125 {
126 fninfo->num_params = 0;
127 fninfo->num_sgpr_params = 0;
128 }
129
130 static unsigned add_arg_assign(struct si_function_info *fninfo,
131 enum si_arg_regfile regfile, LLVMTypeRef type,
132 LLVMValueRef *assign)
133 {
134 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
135
136 unsigned idx = fninfo->num_params++;
137 assert(idx < ARRAY_SIZE(fninfo->types));
138
139 if (regfile == ARG_SGPR)
140 fninfo->num_sgpr_params = fninfo->num_params;
141
142 fninfo->types[idx] = type;
143 fninfo->assign[idx] = assign;
144 return idx;
145 }
146
/* Append a function argument without an assignment destination. */
static unsigned add_arg(struct si_function_info *fninfo,
			enum si_arg_regfile regfile, LLVMTypeRef type)
{
	return add_arg_assign(fninfo, regfile, type, NULL);
}
152
/* Like add_arg_assign, but assert that the argument lands at the expected
 * index \p idx (used when a fixed ABI layout must be maintained). */
static void add_arg_assign_checked(struct si_function_info *fninfo,
				   enum si_arg_regfile regfile, LLVMTypeRef type,
				   LLVMValueRef *assign, unsigned idx)
{
	MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
	assert(actual == idx);
}
160
/* Like add_arg, but assert that the argument lands at the expected index. */
static void add_arg_checked(struct si_function_info *fninfo,
			    enum si_arg_regfile regfile, LLVMTypeRef type,
			    unsigned idx)
{
	add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
}
167
168 /**
169 * Returns a unique index for a per-patch semantic name and index. The index
170 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
171 * can be calculated.
172 */
173 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
174 {
175 switch (semantic_name) {
176 case TGSI_SEMANTIC_TESSOUTER:
177 return 0;
178 case TGSI_SEMANTIC_TESSINNER:
179 return 1;
180 case TGSI_SEMANTIC_PATCH:
181 assert(index < 30);
182 return 2 + index;
183
184 default:
185 assert(!"invalid semantic name");
186 return 0;
187 }
188 }
189
190 /**
191 * Returns a unique index for a semantic name and index. The index must be
192 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
193 * calculated.
194 */
195 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
196 {
197 switch (semantic_name) {
198 case TGSI_SEMANTIC_POSITION:
199 return 0;
200 case TGSI_SEMANTIC_GENERIC:
201 /* Since some shader stages use the the highest used IO index
202 * to determine the size to allocate for inputs/outputs
203 * (in LDS, tess and GS rings). GENERIC should be placed right
204 * after POSITION to make that size as small as possible.
205 */
206 if (index < SI_MAX_IO_GENERIC)
207 return 1 + index;
208
209 assert(!"invalid generic index");
210 return 0;
211 case TGSI_SEMANTIC_PSIZE:
212 return SI_MAX_IO_GENERIC + 1;
213 case TGSI_SEMANTIC_CLIPDIST:
214 assert(index <= 1);
215 return SI_MAX_IO_GENERIC + 2 + index;
216 case TGSI_SEMANTIC_FOG:
217 return SI_MAX_IO_GENERIC + 4;
218 case TGSI_SEMANTIC_LAYER:
219 return SI_MAX_IO_GENERIC + 5;
220 case TGSI_SEMANTIC_VIEWPORT_INDEX:
221 return SI_MAX_IO_GENERIC + 6;
222 case TGSI_SEMANTIC_PRIMID:
223 return SI_MAX_IO_GENERIC + 7;
224 case TGSI_SEMANTIC_COLOR: /* these alias */
225 case TGSI_SEMANTIC_BCOLOR:
226 assert(index < 2);
227 return SI_MAX_IO_GENERIC + 8 + index;
228 case TGSI_SEMANTIC_TEXCOORD:
229 assert(index < 8);
230 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
231 return SI_MAX_IO_GENERIC + 10 + index;
232 default:
233 assert(!"invalid semantic name");
234 return 0;
235 }
236 }
237
238 /**
239 * Helper function that builds an LLVM IR PHI node and immediately adds
240 * incoming edges.
241 */
242 static LLVMValueRef
243 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
244 unsigned count_incoming, LLVMValueRef *values,
245 LLVMBasicBlockRef *blocks)
246 {
247 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
248 LLVMAddIncoming(phi, values, blocks, count_incoming);
249 return phi;
250 }
251
252 /**
253 * Get the value of a shader input parameter and extract a bitfield.
254 */
255 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
256 unsigned param, unsigned rshift,
257 unsigned bitwidth)
258 {
259 struct gallivm_state *gallivm = &ctx->gallivm;
260 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
261 param);
262
263 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
264 value = bitcast(&ctx->bld_base,
265 TGSI_TYPE_UNSIGNED, value);
266
267 if (rshift)
268 value = LLVMBuildLShr(gallivm->builder, value,
269 LLVMConstInt(ctx->i32, rshift, 0), "");
270
271 if (rshift + bitwidth < 32) {
272 unsigned mask = (1 << bitwidth) - 1;
273 value = LLVMBuildAnd(gallivm->builder, value,
274 LLVMConstInt(ctx->i32, mask, 0), "");
275 }
276
277 return value;
278 }
279
280 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
281 {
282 switch (ctx->type) {
283 case PIPE_SHADER_TESS_CTRL:
284 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
285
286 case PIPE_SHADER_TESS_EVAL:
287 return LLVMGetParam(ctx->main_fn,
288 ctx->param_tes_rel_patch_id);
289
290 default:
291 assert(0);
292 return NULL;
293 }
294 }
295
296 /* Tessellation shaders pass outputs to the next shader using LDS.
297 *
298 * LS outputs = TCS inputs
299 * TCS outputs = TES inputs
300 *
301 * The LDS layout is:
302 * - TCS inputs for patch 0
303 * - TCS inputs for patch 1
304 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
305 * - ...
306 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
307 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
308 * - TCS outputs for patch 1
309 * - Per-patch TCS outputs for patch 1
310 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
311 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
312 * - ...
313 *
314 * All three shaders VS(LS), TCS, TES share the same LDS space.
315 */
316
/* LDS stride of one patch worth of TCS inputs, taken from
 * vs_state_bits[20:8]. Presumably in dwords — confirm against the state
 * packing code. */
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}
322
/* LDS stride of one patch worth of TCS outputs, taken from
 * tcs_out_lds_layout[12:0]. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}
328
/* LDS offset of patch 0's TCS outputs. The packed value in
 * tcs_out_lds_offsets[15:0] is multiplied by 4; presumably it is stored
 * in 4-dword units — confirm against the state packing code. */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}
338
/* LDS offset of patch 0's per-patch TCS outputs, from
 * tcs_out_lds_offsets[31:16], scaled by 4 like get_tcs_out_patch0_offset. */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}
348
349 static LLVMValueRef
350 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
351 {
352 struct gallivm_state *gallivm = &ctx->gallivm;
353 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
354 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
355
356 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
357 }
358
359 static LLVMValueRef
360 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
361 {
362 struct gallivm_state *gallivm = &ctx->gallivm;
363 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
364 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
365 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
366
367 return LLVMBuildAdd(gallivm->builder, patch0_offset,
368 LLVMBuildMul(gallivm->builder, patch_stride,
369 rel_patch_id, ""),
370 "");
371 }
372
/* LDS offset of the current patch's per-patch TCS outputs:
 * patch0_patch_data_offset + rel_patch_id * output_patch_stride. */
static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}
387
/* Number of vertices per output patch. Constant-folded when a real TCS
 * declares TGSI_PROPERTY_TCS_VERTICES_OUT; otherwise read at runtime from
 * tcs_offchip_layout[11:6]. */
static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
{
	unsigned tcs_out_vertices =
		ctx->shader->selector ?
		ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;

	/* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
	if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
		return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);

	return unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
}
400
/* Dword stride of one vertex in the LS->HS (TCS input) LDS layout.
 *
 * In the VS (running as LS) the stride is derived directly from the number
 * of outputs the shader itself writes, so it does not have to be read from
 * an SGPR. The TCS doesn't know the LS at compile time, so it reads the
 * stride from vs_state_bits[31:24]. */
static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
{
	unsigned stride;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		/* 4 dwords (one vec4) per written output slot. */
		stride = util_last_bit64(ctx->shader->selector->outputs_written);
		return LLVMConstInt(ctx->i32, stride * 4, 0);

	case PIPE_SHADER_TESS_CTRL:
		return unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);

	default:
		assert(0);
		return NULL;
	}
}
418
419 static LLVMValueRef get_instance_index_for_fetch(
420 struct si_shader_context *ctx,
421 unsigned param_start_instance, LLVMValueRef divisor)
422 {
423 struct gallivm_state *gallivm = &ctx->gallivm;
424
425 LLVMValueRef result = ctx->abi.instance_id;
426
427 /* The division must be done before START_INSTANCE is added. */
428 if (divisor != ctx->i32_1)
429 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
430
431 return LLVMBuildAdd(gallivm->builder, result,
432 LLVMGetParam(ctx->main_fn, param_start_instance), "");
433 }
434
435 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
436 * to float. */
437 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
438 LLVMValueRef vec4,
439 unsigned double_index)
440 {
441 LLVMBuilderRef builder = ctx->gallivm.builder;
442 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
443 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
444 LLVMVectorType(f64, 2), "");
445 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
446 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
447 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
448 }
449
/**
 * Load one vertex attribute for the VS and apply any format workarounds.
 *
 * Fetches the attribute via its vertex-buffer descriptor and writes the
 * four components to \p out. Formats the hardware cannot fetch natively
 * (selected by key.mono.vs_fix_fetch) are loaded as multiple fetches
 * and/or fixed up in software below.
 *
 * \param input_index  index of the vertex attribute
 * \param out          receives the four components (x, y, z, w)
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list (the vertex-buffer descriptor for this attribute). */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;	/* one byte per channel */
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;	/* one 16-bit word per channel */
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components. Only input[0] is
	 * used here; the multi-fetch cases overwrite out[] in the switch
	 * below. */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	/* Apply the per-format software fixups. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			/* Clamp to [-1, ...]; -2/3 maps below -1 otherwise. */
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;	/* 16.16 fixed point */
		else
			scale = 1.0 / INT_MAX;	/* signed normalization */

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two doubles per 4-dword fetch. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* One fetch per channel; each result's X component is the
		 * channel value. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			/* Integer formats: alpha = integer 1 (bitcast to f32). */
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}
654
/* TGSI input-declaration hook for the VS: the declaration itself carries no
 * extra information here, so just load the attribute. */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_vs(ctx, input_index, out);
}
663
664 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
665 unsigned swizzle)
666 {
667 if (swizzle > 0)
668 return ctx->i32_0;
669
670 switch (ctx->type) {
671 case PIPE_SHADER_VERTEX:
672 return LLVMGetParam(ctx->main_fn,
673 ctx->param_vs_prim_id);
674 case PIPE_SHADER_TESS_CTRL:
675 return LLVMGetParam(ctx->main_fn,
676 ctx->param_tcs_patch_id);
677 case PIPE_SHADER_TESS_EVAL:
678 return LLVMGetParam(ctx->main_fn,
679 ctx->param_tes_patch_id);
680 case PIPE_SHADER_GEOMETRY:
681 return LLVMGetParam(ctx->main_fn,
682 ctx->param_gs_prim_id);
683 default:
684 assert(0);
685 return ctx->i32_0;
686 }
687 }
688
689 /**
690 * Return the value of tgsi_ind_register for indexing.
691 * This is the indirect index with the constant offset added to it.
692 */
693 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
694 const struct tgsi_ind_register *ind,
695 int rel_index)
696 {
697 struct gallivm_state *gallivm = &ctx->gallivm;
698 LLVMValueRef result;
699
700 result = ctx->addrs[ind->Index][ind->Swizzle];
701 result = LLVMBuildLoad(gallivm->builder, result, "");
702 result = LLVMBuildAdd(gallivm->builder, result,
703 LLVMConstInt(ctx->i32, rel_index, 0), "");
704 return result;
705 }
706
/**
 * Like si_get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
					   const struct tgsi_ind_register *ind,
					   int rel_index, unsigned num)
{
	LLVMValueRef result = si_get_indirect_index(ctx, ind, rel_index);

	return si_llvm_bound_index(ctx, result, num);
}
719

/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst               destination register (used when \p src is NULL)
 * \param src               source register, or NULL to use \p dst
 * \param vertex_dw_stride  dword stride of one vertex (for 2D registers)
 * \param base_addr         starting dword address to offset from
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		/* NOTE(review): this local shadows the ubyte *index below;
		 * harmless here, but worth renaming in a follow-up. */
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = si_get_indirect_index(ctx, &reg.DimIndirect,
						      reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* Indirect accesses are relative to the start of their
		 * declared array, if any. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = si_get_indirect_index(ctx, &reg.Indirect,
						  reg.Register.Index - first);

		/* 4 dwords (one vec4) per IO slot. */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[first], index[first]) :
			si_shader_io_get_unique_index_patch(name[first], index[first]);
	} else {
		/* Per-vertex (2D) registers use the per-vertex slot map;
		 * 1D registers use the per-patch map. */
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]) :
			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
							    index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
810
811 /* The offchip buffer layout for TCS->TES is
812 *
813 * - attribute 0 of patch 0 vertex 0
814 * - attribute 0 of patch 0 vertex 1
815 * - attribute 0 of patch 0 vertex 2
816 * ...
817 * - attribute 0 of patch 1 vertex 0
818 * - attribute 0 of patch 1 vertex 1
819 * ...
820 * - attribute 1 of patch 0 vertex 0
821 * - attribute 1 of patch 0 vertex 1
822 * ...
823 * - per patch attribute 0 of patch 0
824 * - per patch attribute 0 of patch 1
825 * ...
826 *
827 * Note that every attribute has 4 components.
828 */
/**
 * Compute a byte address into the TCS->TES offchip buffer (layout described
 * in the comment above).
 *
 * \param rel_patch_id  relative patch index
 * \param vertex_index  vertex within the patch, or NULL for per-patch data
 * \param param_index   IO slot index (each slot is one 16-byte vec4)
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = get_num_tcs_out_vertices(ctx);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		/* Per-vertex attribute: index by patch*verts_per_patch+vertex;
		 * consecutive values of one attribute are total_vertices apart. */
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attribute: index by patch; consecutive values of
		 * one attribute are num_patches apart. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* Each attribute slot is a 16-byte vec4. */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data starts after all per-vertex data; the start
		 * offset is packed in tcs_offchip_layout[31:12]. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
872
/* Compute the TCS->TES offchip buffer address for a TGSI register:
 * resolves the register's vertex index and IO slot, then defers to
 * get_tcs_tes_buffer_address. Exactly one of \p dst / \p src is used. */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
				struct si_shader_context *ctx,
				const struct tgsi_full_dst_register *dst,
				const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2D registers address a specific vertex within the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
							     reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect accesses are relative to their declared array. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = si_get_indirect_index(ctx, &reg.Indirect,
						    reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	/* Per-vertex (2D) registers use the per-vertex slot map; 1D registers
	 * use the per-patch map. */
	param_index_base = reg.Register.Dimension ?
		si_shader_io_get_unique_index(name[param_base], index[param_base]) :
		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}
936
/**
 * Load one channel or a whole vec4 from a memory buffer.
 *
 * \param type          TGSI type of the result
 * \param swizzle       channel to load (0..3), or ~0 to load a full vec4
 * \param buffer        buffer resource descriptor
 * \param offset        offset into the buffer (units per ac_build_buffer_load;
 *                      presumably bytes — confirm against ac_llvm_build)
 * \param base          base offset added by the hardware
 * \param can_speculate whether the load may be speculated/reordered
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool can_speculate)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	/* ~0 requests all four channels at once. */
	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* Load the vec4 and extract the requested channel. */
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	/* 64-bit types: load the two consecutive dwords of the channel
	 * separately and merge them. */
	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, can_speculate, false);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, can_speculate, false);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
972
/**
 * Load from LDS.
 *
 * \param type    output value type
 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	/* ~0: load all channels recursively and gather them into a vector. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values occupy two consecutive dwords. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
1013
1014 /**
1015 * Store to LDS.
1016 *
1017 * \param swizzle offset (typically 0..3)
1018 * \param dw_addr address in dwords
1019 * \param value value to store
1020 */
1021 static void lds_store(struct lp_build_tgsi_context *bld_base,
1022 unsigned dw_offset_imm, LLVMValueRef dw_addr,
1023 LLVMValueRef value)
1024 {
1025 struct si_shader_context *ctx = si_shader_context(bld_base);
1026 struct gallivm_state *gallivm = &ctx->gallivm;
1027
1028 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1029 LLVMConstInt(ctx->i32, dw_offset_imm, 0));
1030
1031 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1032 ac_build_indexed_store(&ctx->ac, ctx->lds,
1033 dw_addr, value);
1034 }
1035
/* Build a v4i32 buffer resource descriptor from a 64K-aligned address.
 *
 * The shader receives only the upper 16 bits of the address in an SGPR
 * (param); the low 16 bits are implicitly zero — presumably the buffer
 * is 64K-aligned (TODO confirm against the code that sets the param).
 * Dwords 2-3 describe an effectively unbounded (0xffffffff records)
 * buffer of 32-bit floats.
 */
static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
					   unsigned param)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;

	/* Reconstruct the full 64-bit address: addr = param << 16. */
	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
	addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
	addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");

	uint64_t desc2 = 0xffffffff; /* dword2: num_records = max */
	uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
	LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);

	/* Assemble the descriptor as two i64 lanes, then view it as v4i32. */
	LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
	desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
	desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
	return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
}
1059
1060 static LLVMValueRef fetch_input_tcs(
1061 struct lp_build_tgsi_context *bld_base,
1062 const struct tgsi_full_src_register *reg,
1063 enum tgsi_opcode_type type, unsigned swizzle)
1064 {
1065 struct si_shader_context *ctx = si_shader_context(bld_base);
1066 LLVMValueRef dw_addr, stride;
1067
1068 stride = get_tcs_in_vertex_dw_stride(ctx);
1069 dw_addr = get_tcs_in_current_patch_offset(ctx);
1070 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1071
1072 return lds_load(bld_base, type, swizzle, dw_addr);
1073 }
1074
/* Fetch a TCS output as a TGSI source operand; TCS outputs live in LDS. */
static LLVMValueRef fetch_output_tcs(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_src_register *reg,
		enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		/* Per-vertex output: needs the output vertex dword stride.
		 * NOTE(review): stride assumed to be packed in bits [13..20]
		 * of tcs_out_lds_layout — confirm against the code that
		 * packs that SGPR. */
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		/* Per-patch output: no vertex index, no stride. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}
1094
1095 static LLVMValueRef fetch_input_tes(
1096 struct lp_build_tgsi_context *bld_base,
1097 const struct tgsi_full_src_register *reg,
1098 enum tgsi_opcode_type type, unsigned swizzle)
1099 {
1100 struct si_shader_context *ctx = si_shader_context(bld_base);
1101 LLVMValueRef buffer, base, addr;
1102
1103 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1104
1105 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1106 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1107
1108 return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
1109 }
1110
/* Store a TCS output to LDS (for reads by the TCS itself or its epilog)
 * and to the off-chip TCS->TES buffer (for reads by TES). */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		/* Per-vertex output.  NOTE(review): stride assumed packed in
		 * bits [13..20] of tcs_out_lds_layout — confirm. */
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled channel with its own
		 * dword store.  Tess factors go through the epilog instead. */
		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	/* Full writemask: combine all four channels into one vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}
1191
/* Fetch a GS input: per-vertex ES outputs read from the ESGS ring,
 * which lives in LDS on GFX9 and in memory on older chips. */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* The primitive ID is not stored in the ring; synthesize it. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(ctx, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		/* Two 16-bit vertex offsets are packed per 32-bit SGPR. */
		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit types: fetch the second dword of the pair too. */
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true, false);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1286
1287 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1288 {
1289 switch (interpolate) {
1290 case TGSI_INTERPOLATE_CONSTANT:
1291 return 0;
1292
1293 case TGSI_INTERPOLATE_LINEAR:
1294 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1295 return SI_PARAM_LINEAR_SAMPLE;
1296 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1297 return SI_PARAM_LINEAR_CENTROID;
1298 else
1299 return SI_PARAM_LINEAR_CENTER;
1300 break;
1301 case TGSI_INTERPOLATE_COLOR:
1302 case TGSI_INTERPOLATE_PERSPECTIVE:
1303 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1304 return SI_PARAM_PERSP_SAMPLE;
1305 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1306 return SI_PARAM_PERSP_CENTROID;
1307 else
1308 return SI_PARAM_PERSP_CENTER;
1309 break;
1310 default:
1311 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1312 return -1;
1313 }
1314 }
1315
1316 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1317 unsigned attr_index, unsigned chan,
1318 LLVMValueRef prim_mask,
1319 LLVMValueRef i, LLVMValueRef j)
1320 {
1321 if (i || j) {
1322 return ac_build_fs_interp(&ctx->ac,
1323 LLVMConstInt(ctx->i32, chan, 0),
1324 LLVMConstInt(ctx->i32, attr_index, 0),
1325 prim_mask, i, j);
1326 }
1327 return ac_build_fs_interp_mov(&ctx->ac,
1328 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1329 LLVMConstInt(ctx->i32, chan, 0),
1330 LLVMConstInt(ctx->i32, attr_index, 0),
1331 prim_mask);
1332 }
1333
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j)
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef i = NULL, j = NULL;
	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	if (interp) {
		/* Split the (i, j) weights out of the 2-element vector. */
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		LLVMValueRef is_face_positive;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		/* Interpolate both colors and select per-channel based on
		 * the facedness of the primitive. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef front, back;

			front = si_build_fs_interp(ctx,
						   input_index, chan,
						   prim_mask, i, j);
			back = si_build_fs_interp(ctx,
						  back_attr_offset, chan,
						  prim_mask, i, j);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Only the fog coordinate (x) is interpolated;
		 * y, z, w are fixed to (0, 0, 1). */
		result[0] = si_build_fs_interp(ctx, input_index,
					       0, prim_mask, i, j);
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			result[chan] = si_build_fs_interp(ctx,
							  input_index, chan,
							  prim_mask, i, j);
		}
	}
}
1432
/* Load one fragment shader input into out[0..3], either directly from
 * the color VGPRs set up by the PS prolog or by interpolating it. */
void si_llvm_load_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;
	enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
	unsigned semantic_index = info->input_semantic_index[input_index];
	enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
	enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];

	/* Get colors from input VGPRs (set by the prolog). */
	if (semantic_name == TGSI_SEMANTIC_COLOR) {
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (semantic_index * 4);
		/* COLOR1's VGPRs follow however many COLOR0 components
		 * were actually read. */
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (semantic_index ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
	if (interp_param_idx == -1)
		return; /* unhandled interpolation mode */
	else if (interp_param_idx) {
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, semantic_name,
			semantic_index, 0, /* this param is unused */
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}
1477
/* TGSI input-declaration callback for fragment shaders; the actual
 * loading/interpolation is done by si_llvm_load_input_fs. */
static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_fs(ctx, input_index, out);
}
1486
/* Return the sample ID: bits [8..11] of the ancillary PS input VGPR. */
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}
1491
1492
/**
 * Load a dword from a constant buffer.
 *
 * The load is emitted as speculatable and invariant (constant buffers
 * are read-only), which lets LLVM hoist and CSE it freely.
 */
static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
				      LLVMValueRef resource,
				      LLVMValueRef offset)
{
	return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
				    0, 0, 0, true, true);
}
1503
/* Load the (x, y) position of the given sample from the driver's
 * sample-positions constant buffer; z and w are returned as 0. */
static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
{
	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	/* Look up the sample-positions buffer in the RW-buffers list. */
	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(ctx, resource, offset0),
		buffer_load_const(ctx, resource, offset1),
		LLVMConstReal(ctx->f32, 0),
		LLVMConstReal(ctx->f32, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}
1526
/* Emit the value of a TGSI system-value declaration and cache it in
 * ctx->system_values[index] for later instruction fetches. */
void si_load_system_value(struct si_shader_context *ctx,
			  unsigned index,
			  const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = ctx->abi.instance_id;
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* The hardware vertex ID is relative; add the base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     ctx->abi.vertex_id,
				     ctx->abi.base_vertex, "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		/* Bit 1 of the VS state bits says whether the draw is indexed. */
		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					ctx->abi.base_vertex, ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = ctx->abi.start_instance;
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = ctx->abi.draw_id;
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord; w is delivered as 1/w, so take its reciprocal. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = ctx->abi.front_face;
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The sample position within the pixel is the fractional
		 * part of the fixed-point fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = get_num_tcs_out_vertices(ctx);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess factors are read back from the off-chip TCS buffer. */
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dword offset 0, inner levels at offset 4. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(ctx, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		/* A fixed block size declared in the shader becomes constants;
		 * otherwise it's read from a user SGPR. */
		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		LLVMValueRef values[3];

		/* Components whose parameter was not declared default to 0. */
		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* A helper invocation is a lane that is not "live". */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* eq_mask = 1 << lane_id, returned as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		/* ge = ~0 << id, gt = ~1 << id, le = ~gt, lt = ~ge. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1803
1804 void si_declare_compute_memory(struct si_shader_context *ctx,
1805 const struct tgsi_full_declaration *decl)
1806 {
1807 struct si_shader_selector *sel = ctx->shader->selector;
1808 struct gallivm_state *gallivm = &ctx->gallivm;
1809
1810 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1811 LLVMValueRef var;
1812
1813 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1814 assert(decl->Range.First == decl->Range.Last);
1815 assert(!ctx->shared_memory);
1816
1817 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1818 LLVMArrayType(ctx->i8, sel->local_size),
1819 "compute_lds",
1820 LOCAL_ADDR_SPACE);
1821 LLVMSetAlignment(var, 4);
1822
1823 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1824 }
1825
1826 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1827 {
1828 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1829 ctx->param_const_and_shader_buffers);
1830
1831 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1832 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1833 }
1834
1835 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
1836 {
1837 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1838 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1839
1840 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
1841 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1842 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1843
1844 return ac_build_indexed_load_const(&ctx->ac, ptr, index);
1845 }
1846
1847 static LLVMValueRef
1848 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
1849 {
1850 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1851 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
1852 ctx->param_const_and_shader_buffers);
1853
1854 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
1855 index = LLVMBuildSub(ctx->gallivm.builder,
1856 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
1857 index, "");
1858
1859 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
1860 }
1861
/* Fetch one channel (or a whole vec4 when swizzle == LP_CHAN_ALL) of a
 * TGSI CONST-file source operand from its constant buffer. */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	/* Whole-vector fetch: gather all four channels recursively. */
	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	assert(reg->Register.Dimension);
	buf = reg->Dimension.Index;
	idx = reg->Register.Index * 4 + swizzle;

	if (reg->Dimension.Indirect) {
		/* Indirect constant-buffer index: compute the descriptor slot
		 * at run time; const buffers follow the shader buffers in the
		 * combined list. */
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
		LLVMValueRef index;
		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
						      reg->Dimension.Index,
						      ctx->num_const_buffers);
		index = LLVMBuildAdd(ctx->gallivm.builder, index,
				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		/* Indirect register index: byte addr = areg * 16 + idx * 4. */
		addr = ctx->addrs[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
				    LLVMConstInt(ctx->i32, idx * 4, 0));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		/* 64-bit type: also load the adjacent dword and combine. */
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = si_llvm_emit_fetch_64bit(bld_base, type,
						  result, result2);
	}
	return result;
}
1927
1928 /* Upper 16 bits must be zero. */
1929 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1930 LLVMValueRef val[2])
1931 {
1932 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1933 LLVMBuildShl(ctx->gallivm.builder, val[1],
1934 LLVMConstInt(ctx->i32, 16, 0),
1935 ""), "");
1936 }
1937
1938 /* Upper 16 bits are ignored and will be dropped. */
1939 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1940 LLVMValueRef val[2])
1941 {
1942 LLVMValueRef v[2] = {
1943 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1944 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1945 val[1],
1946 };
1947 return si_llvm_pack_two_int16(ctx, v);
1948 }
1949
1950 /* Initialize arguments for the shader export intrinsic */
1951 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1952 LLVMValueRef *values,
1953 unsigned target,
1954 struct ac_export_args *args)
1955 {
1956 struct si_shader_context *ctx = si_shader_context(bld_base);
1957 struct lp_build_context *base = &bld_base->base;
1958 LLVMBuilderRef builder = ctx->gallivm.builder;
1959 LLVMValueRef val[4];
1960 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1961 unsigned chan;
1962 bool is_int8, is_int10;
1963
1964 /* Default is 0xf. Adjusted below depending on the format. */
1965 args->enabled_channels = 0xf; /* writemask */
1966
1967 /* Specify whether the EXEC mask represents the valid mask */
1968 args->valid_mask = 0;
1969
1970 /* Specify whether this is the last export */
1971 args->done = 0;
1972
1973 /* Specify the target we are exporting */
1974 args->target = target;
1975
1976 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1977 const struct si_shader_key *key = &ctx->shader->key;
1978 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1979 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1980
1981 assert(cbuf >= 0 && cbuf < 8);
1982 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1983 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1984 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1985 }
1986
1987 args->compr = false;
1988 args->out[0] = base->undef;
1989 args->out[1] = base->undef;
1990 args->out[2] = base->undef;
1991 args->out[3] = base->undef;
1992
1993 switch (spi_shader_col_format) {
1994 case V_028714_SPI_SHADER_ZERO:
1995 args->enabled_channels = 0; /* writemask */
1996 args->target = V_008DFC_SQ_EXP_NULL;
1997 break;
1998
1999 case V_028714_SPI_SHADER_32_R:
2000 args->enabled_channels = 1; /* writemask */
2001 args->out[0] = values[0];
2002 break;
2003
2004 case V_028714_SPI_SHADER_32_GR:
2005 args->enabled_channels = 0x3; /* writemask */
2006 args->out[0] = values[0];
2007 args->out[1] = values[1];
2008 break;
2009
2010 case V_028714_SPI_SHADER_32_AR:
2011 args->enabled_channels = 0x9; /* writemask */
2012 args->out[0] = values[0];
2013 args->out[3] = values[3];
2014 break;
2015
2016 case V_028714_SPI_SHADER_FP16_ABGR:
2017 args->compr = 1; /* COMPR flag */
2018
2019 for (chan = 0; chan < 2; chan++) {
2020 LLVMValueRef pack_args[2] = {
2021 values[2 * chan],
2022 values[2 * chan + 1]
2023 };
2024 LLVMValueRef packed;
2025
2026 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
2027 args->out[chan] =
2028 LLVMBuildBitCast(ctx->gallivm.builder,
2029 packed, ctx->f32, "");
2030 }
2031 break;
2032
2033 case V_028714_SPI_SHADER_UNORM16_ABGR:
2034 for (chan = 0; chan < 4; chan++) {
2035 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
2036 val[chan] = LLVMBuildFMul(builder, val[chan],
2037 LLVMConstReal(ctx->f32, 65535), "");
2038 val[chan] = LLVMBuildFAdd(builder, val[chan],
2039 LLVMConstReal(ctx->f32, 0.5), "");
2040 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2041 ctx->i32, "");
2042 }
2043
2044 args->compr = 1; /* COMPR flag */
2045 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2046 si_llvm_pack_two_int16(ctx, val));
2047 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2048 si_llvm_pack_two_int16(ctx, val+2));
2049 break;
2050
2051 case V_028714_SPI_SHADER_SNORM16_ABGR:
2052 for (chan = 0; chan < 4; chan++) {
2053 /* Clamp between [-1, 1]. */
2054 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2055 values[chan],
2056 LLVMConstReal(ctx->f32, 1));
2057 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2058 val[chan],
2059 LLVMConstReal(ctx->f32, -1));
2060 /* Convert to a signed integer in [-32767, 32767]. */
2061 val[chan] = LLVMBuildFMul(builder, val[chan],
2062 LLVMConstReal(ctx->f32, 32767), "");
2063 /* If positive, add 0.5, else add -0.5. */
2064 val[chan] = LLVMBuildFAdd(builder, val[chan],
2065 LLVMBuildSelect(builder,
2066 LLVMBuildFCmp(builder, LLVMRealOGE,
2067 val[chan], base->zero, ""),
2068 LLVMConstReal(ctx->f32, 0.5),
2069 LLVMConstReal(ctx->f32, -0.5), ""), "");
2070 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2071 }
2072
2073 args->compr = 1; /* COMPR flag */
2074 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2075 si_llvm_pack_two_int32_as_int16(ctx, val));
2076 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2077 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2078 break;
2079
2080 case V_028714_SPI_SHADER_UINT16_ABGR: {
2081 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2082 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2083 LLVMValueRef max_alpha =
2084 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2085
2086 /* Clamp. */
2087 for (chan = 0; chan < 4; chan++) {
2088 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2089 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2090 val[chan],
2091 chan == 3 ? max_alpha : max_rgb);
2092 }
2093
2094 args->compr = 1; /* COMPR flag */
2095 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2096 si_llvm_pack_two_int16(ctx, val));
2097 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2098 si_llvm_pack_two_int16(ctx, val+2));
2099 break;
2100 }
2101
2102 case V_028714_SPI_SHADER_SINT16_ABGR: {
2103 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2104 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2105 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2106 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2107 LLVMValueRef max_alpha =
2108 !is_int10 ? max_rgb : ctx->i32_1;
2109 LLVMValueRef min_alpha =
2110 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2111
2112 /* Clamp. */
2113 for (chan = 0; chan < 4; chan++) {
2114 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2115 val[chan] = lp_build_emit_llvm_binary(bld_base,
2116 TGSI_OPCODE_IMIN,
2117 val[chan], chan == 3 ? max_alpha : max_rgb);
2118 val[chan] = lp_build_emit_llvm_binary(bld_base,
2119 TGSI_OPCODE_IMAX,
2120 val[chan], chan == 3 ? min_alpha : min_rgb);
2121 }
2122
2123 args->compr = 1; /* COMPR flag */
2124 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2125 si_llvm_pack_two_int32_as_int16(ctx, val));
2126 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2127 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2128 break;
2129 }
2130
2131 case V_028714_SPI_SHADER_32_ABGR:
2132 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2133 break;
2134 }
2135 }
2136
2137 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2138 LLVMValueRef alpha)
2139 {
2140 struct si_shader_context *ctx = si_shader_context(bld_base);
2141
2142 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2143 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2144 SI_PARAM_ALPHA_REF);
2145
2146 LLVMValueRef alpha_pass =
2147 lp_build_cmp(&bld_base->base,
2148 ctx->shader->key.part.ps.epilog.alpha_func,
2149 alpha, alpha_ref);
2150 LLVMValueRef arg =
2151 lp_build_select(&bld_base->base,
2152 alpha_pass,
2153 LLVMConstReal(ctx->f32, 1.0f),
2154 LLVMConstReal(ctx->f32, -1.0f));
2155
2156 ac_build_kill(&ctx->ac, arg);
2157 } else {
2158 ac_build_kill(&ctx->ac, NULL);
2159 }
2160 }
2161
2162 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2163 LLVMValueRef alpha,
2164 unsigned samplemask_param)
2165 {
2166 struct si_shader_context *ctx = si_shader_context(bld_base);
2167 struct gallivm_state *gallivm = &ctx->gallivm;
2168 LLVMValueRef coverage;
2169
2170 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2171 coverage = LLVMGetParam(ctx->main_fn,
2172 samplemask_param);
2173 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2174
2175 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2176 ctx->i32,
2177 &coverage, 1, LP_FUNC_ATTR_READNONE);
2178
2179 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2180 ctx->f32, "");
2181
2182 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2183 LLVMConstReal(ctx->f32,
2184 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2185
2186 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2187 }
2188
/* Compute clip-distance exports (POS+2 and POS+3) from a CLIPVERTEX output:
 * each clip distance is the dot product of the clip vertex with one of the
 * user clip planes in the SI_VS_CONST_CLIP_PLANES constant buffer.
 *
 * \param pos       position export array; entries [2] and [3] are filled here
 * \param out_elts  the 4 components of the CLIPVERTEX output
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);

	/* Two exports of 4 channels each cover up to 8 clip distances. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of plane[reg_index*4+chan].comp[const_chan]
				 * in the constant buffer (4 bytes per float). */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = buffer_load_const(ctx, const_resource,
							     addr);
				args->out[chan] =
					lp_build_add(base, args->out[chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
2233
2234 static void si_dump_streamout(struct pipe_stream_output_info *so)
2235 {
2236 unsigned i;
2237
2238 if (so->num_outputs)
2239 fprintf(stderr, "STREAMOUT\n");
2240
2241 for (i = 0; i < so->num_outputs; i++) {
2242 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2243 so->output[i].start_component;
2244 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2245 i, so->output[i].output_buffer,
2246 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2247 so->output[i].register_index,
2248 mask & 1 ? "x" : "",
2249 mask & 2 ? "y" : "",
2250 mask & 4 ? "z" : "",
2251 mask & 8 ? "w" : "");
2252 }
2253 }
2254
2255 static void emit_streamout_output(struct si_shader_context *ctx,
2256 LLVMValueRef const *so_buffers,
2257 LLVMValueRef const *so_write_offsets,
2258 struct pipe_stream_output *stream_out,
2259 struct si_shader_output_values *shader_out)
2260 {
2261 struct gallivm_state *gallivm = &ctx->gallivm;
2262 LLVMBuilderRef builder = gallivm->builder;
2263 unsigned buf_idx = stream_out->output_buffer;
2264 unsigned start = stream_out->start_component;
2265 unsigned num_comps = stream_out->num_components;
2266 LLVMValueRef out[4];
2267
2268 assert(num_comps && num_comps <= 4);
2269 if (!num_comps || num_comps > 4)
2270 return;
2271
2272 /* Load the output as int. */
2273 for (int j = 0; j < num_comps; j++) {
2274 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2275
2276 out[j] = LLVMBuildBitCast(builder,
2277 shader_out->values[start + j],
2278 ctx->i32, "");
2279 }
2280
2281 /* Pack the output. */
2282 LLVMValueRef vdata = NULL;
2283
2284 switch (num_comps) {
2285 case 1: /* as i32 */
2286 vdata = out[0];
2287 break;
2288 case 2: /* as v2i32 */
2289 case 3: /* as v4i32 (aligned to 4) */
2290 case 4: /* as v4i32 */
2291 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2292 for (int j = 0; j < num_comps; j++) {
2293 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2294 LLVMConstInt(ctx->i32, j, 0), "");
2295 }
2296 break;
2297 }
2298
2299 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2300 vdata, num_comps,
2301 so_write_offsets[buf_idx],
2302 ctx->i32_0,
2303 stream_out->dst_offset * 4, 1, 1, true, false);
2304 }
2305
2306 /**
2307 * Write streamout data to buffers for vertex stream @p stream (different
2308 * vertex streams can occur for GS copy shaders).
2309 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput, unsigned stream)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct pipe_stream_output_info *so = &sel->so;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	/* One thread corresponds to one vertex being streamed out. */
	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
                 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4];
		LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
						    ctx->param_rw_buffers);

		for (i = 0; i < 4; i++) {
			/* A stride of 0 means the buffer is unused. */
			if (!so->stride[i])
				continue;

			LLVMValueRef offset = LLVMConstInt(ctx->i32,
							   SI_VS_STREAMOUT_BUF0 + i, 0);

			so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			/* stride is in dwords too, hence the *4. */
			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned reg = so->output[i].register_index;

			if (reg >= noutput)
				continue;

			/* Only emit outputs belonging to the requested stream. */
			if (stream != so->output[i].stream)
				continue;

			emit_streamout_output(ctx, so_buffers, so_write_offset,
					      &so->output[i], &outputs[reg]);
		}
	}
	lp_build_endif(&if_ctx);
}
2390
2391 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2392 LLVMValueRef *values)
2393 {
2394 struct ac_export_args args;
2395
2396 si_llvm_init_export_args(&ctx->bld_base, values,
2397 V_008DFC_SQ_EXP_PARAM + index, &args);
2398 ac_build_export(&ctx->ac, &args);
2399 }
2400
2401 static void si_build_param_exports(struct si_shader_context *ctx,
2402 struct si_shader_output_values *outputs,
2403 unsigned noutput)
2404 {
2405 struct si_shader *shader = ctx->shader;
2406 unsigned param_count = 0;
2407
2408 for (unsigned i = 0; i < noutput; i++) {
2409 unsigned semantic_name = outputs[i].semantic_name;
2410 unsigned semantic_index = outputs[i].semantic_index;
2411
2412 if (outputs[i].vertex_stream[0] != 0 &&
2413 outputs[i].vertex_stream[1] != 0 &&
2414 outputs[i].vertex_stream[2] != 0 &&
2415 outputs[i].vertex_stream[3] != 0)
2416 continue;
2417
2418 switch (semantic_name) {
2419 case TGSI_SEMANTIC_LAYER:
2420 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2421 case TGSI_SEMANTIC_CLIPDIST:
2422 case TGSI_SEMANTIC_COLOR:
2423 case TGSI_SEMANTIC_BCOLOR:
2424 case TGSI_SEMANTIC_PRIMID:
2425 case TGSI_SEMANTIC_FOG:
2426 case TGSI_SEMANTIC_TEXCOORD:
2427 case TGSI_SEMANTIC_GENERIC:
2428 break;
2429 default:
2430 continue;
2431 }
2432
2433 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2434 semantic_index < SI_MAX_IO_GENERIC) &&
2435 shader->key.opt.kill_outputs &
2436 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2437 continue;
2438
2439 si_export_param(ctx, param_count, outputs[i].values);
2440
2441 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2442 shader->info.vs_output_param_offset[i] = param_count++;
2443 }
2444
2445 shader->info.nr_param_exports = param_count;
2446 }
2447
2448 /* Generate export instructions for hardware VS shader stage */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	/* pos_args[0] = position, [1] = misc vector, [2..3] = clip distances */
	struct ac_export_args pos_args[4] = {};
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned pos_idx;
	int i;

	/* Build position exports. */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			si_llvm_init_export_args(bld_base, outputs[i].values,
						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (!shader->key.opt.clip_disable) {
				/* Clip distances go to POS+2 / POS+3. */
				unsigned index = 2 + outputs[i].semantic_index;
				si_llvm_init_export_args(bld_base, outputs[i].values,
							 V_008DFC_SQ_EXP_POS + index,
							 &pos_args[index]);
			}
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (!shader->key.opt.clip_disable) {
				si_llvm_emit_clipvertex(bld_base, pos_args,
							outputs[i].values);
			}
			break;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = base->zero; /* X */
		pos_args[0].out[1] = base->zero; /* Y */
		pos_args[0].out[2] = base->zero; /* Z */
		pos_args[0].out[3] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (shader->selector->info.writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2);

		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = base->zero; /* X */
		pos_args[1].out[1] = base->zero; /* Y */
		pos_args[1].out[2] = base->zero; /* Z */
		pos_args[1].out[3] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = ac_build_umin(&ctx->ac,
						       edgeflag_value,
						       ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
							      edgeflag_value,
							      ctx->f32, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* GFX9 has the layer in out.z[10:0] and the viewport
			 * index in out.z[19:16].
			 */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				LLVMValueRef v = viewport_index_value;

				/* OR the viewport index into bits [19:16] of Z. */
				v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
				v = LLVMBuildShl(ctx->gallivm.builder, v,
						 LLVMConstInt(ctx->i32, 16, 0), "");
				v = LLVMBuildOr(ctx->gallivm.builder, v,
						bitcast(bld_base, TGSI_TYPE_UNSIGNED,
							pos_args[1].out[2]), "");
				pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
				pos_args[1].enabled_channels |= 1 << 2;
			}
		} else {
			/* Pre-GFX9: layer in Z, viewport index in W. */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports. */
	si_build_param_exports(ctx, outputs, noutput);
}
2599
2600 /**
2601 * Forward all outputs from the vertex shader to the TES. This is only used
2602 * for the fixed function TCS.
2603 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef invocation_id, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	/* Invocation id = bits [12:8] of tcs_rel_ids. */
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

	/* LDS address of this invocation's vertex inputs:
	 * current patch base + invocation_id * per-vertex stride. */
	lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Copy each input named by the key's bitmask from LDS to the
	 * off-chip TCS/TES buffer. */
	inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		/* Each input occupies 4 dwords in LDS. */
		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                    LLVMConstInt(ctx->i32, 4 * i, 0),
		                     "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
					      get_rel_patch_id(ctx),
		                              invocation_id,
		                              LLVMConstInt(ctx->i32, i, 0));

		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
					    buffer_offset, 0, 1, 0, true, false);
	}
}
2642
/* Read the tess levels from LDS and write them to the tess-factor ring
 * buffer (and, if TES reads them, to the off-chip buffer as well).
 * Only invocation 0 of each patch does the work.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, ctx->i32_0, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);

	/* LDS addresses: patch data base + 4 dwords per patch output. */
	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_inner_index * 4, 0), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_outer_index * 4, 0), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->i32);
		outer[i] = LLVMGetUndef(ctx->i32);
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
		outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* More than 4 dwords (quads) needs a second store. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       ctx->param_tcs_factor_offset);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");

	/* Only the first patch writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, ctx->i32_0, ""));

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->b.chip_class <= VI) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->i32, 0x80000000, 0),
					    1, ctx->i32_0, tf_base,
					    offset, 1, 0, true, false);
		offset += 4;
	}

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, 1, 0, true, false);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, 1, 0, true, false);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index_patch(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_outer, 0));

		/* Gather sizes must be a power of two. */
		outer_vec = lp_build_gather_values(gallivm, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index_patch(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				    lp_build_gather_values(gallivm, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, 1, 0, true, false);
		}
	}

	lp_build_endif(&if_ctx);
}
2805
2806 static LLVMValueRef
2807 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2808 unsigned param, unsigned return_index)
2809 {
2810 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2811 LLVMGetParam(ctx->main_fn, param),
2812 return_index, "");
2813 }
2814
2815 static LLVMValueRef
2816 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2817 unsigned param, unsigned return_index)
2818 {
2819 LLVMBuilderRef builder = ctx->gallivm.builder;
2820 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2821
2822 return LLVMBuildInsertValue(builder, ret,
2823 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2824 return_index, "");
2825 }
2826
2827 static LLVMValueRef
2828 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2829 unsigned param, unsigned return_index)
2830 {
2831 LLVMBuilderRef builder = ctx->gallivm.builder;
2832 LLVMValueRef ptr, lo, hi;
2833
2834 ptr = LLVMGetParam(ctx->main_fn, param);
2835 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2836 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2837 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2838 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2839 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2840 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2841 }
2842
/* This only writes the tessellation factor levels.
 *
 * The actual TF stores happen in the TCS epilog shader part; this function
 * computes the values the epilog needs (rel_patch_id, invocation_id,
 * tf_lds_offset) and passes them, together with the required SGPRs,
 * through ctx->return_value.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	/* Invocation ID lives in bits [8..12] of tcs_rel_ids. */
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* On GFX9 the TCS body is wrapped in an "if" (merged
		 * LS+HS shader). Close it here and merge the values from
		 * the then-block and the entry block with phis, so threads
		 * that skipped the body still have defined values.
		 */
		LLVMBasicBlockRef blocks[2] = {
			LLVMGetInsertBlock(builder),
			ctx->merged_wrap_if_state.entry_block
		};
		LLVMValueRef values[2];

		lp_build_endif(&ctx->merged_wrap_if_state);

		values[0] = rel_patch_id;
		values[1] = LLVMGetUndef(ctx->i32);
		rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = tf_lds_offset;
		values[1] = LLVMGetUndef(ctx->i32);
		tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = invocation_id;
		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
		invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
	}

	/* Return epilog parameters from this function. */
	LLVMValueRef ret = ctx->return_value;
	unsigned vgpr;

	if (ctx->screen->b.chip_class >= GFX9) {
		/* GFX9: user SGPRs of the epilog start at slot 8. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
		vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
	} else {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
					  GFX6_TCS_NUM_USER_SGPR);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
					  GFX6_TCS_NUM_USER_SGPR + 1);
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs */
	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

	/* Leave a hole corresponding to the two input VGPRs. This ensures that
	 * the invocation_id output does not alias the param_tcs_rel_ids input,
	 * which saves a V_MOV on gfx9.
	 */
	vgpr += 2;

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	ctx->return_value = ret;
}
2924
/* Pass TCS inputs from LS to TCS on GFX9.
 *
 * On GFX9 the LS and HS stages run as one merged shader. The LS part
 * forwards its SGPR and VGPR inputs through the function return value
 * so the HS part receives them in the expected registers. Slot indices
 * below are positions in the return struct: slots 0-7 are system SGPRs,
 * user SGPRs start at 8, VGPRs follow the user SGPRs.
 */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* System SGPRs (descriptor pointers and merged-shader scalars). */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret,
					   ctx->param_bindless_samplers_and_images,
					   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

	/* User SGPRs consumed by the HS part. */
	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
				  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
				  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);

	/* TCS descriptor pointers follow tcs_factor_addr_base64k by two
	 * parameter slots (each pointer is passed as 2 x i32). */
	unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);

	/* VGPRs: patch ID and rel_ids, returned as floats. */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_patch_id, vgpr++);
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_rel_ids, vgpr++);
	ctx->return_value = ret;
}
2965
/* Pass GS inputs from ES to GS on GFX9.
 *
 * Same scheme as si_set_ls_return_value_for_tcs: in the merged ES+GS
 * shader, the ES part forwards its inputs through the return value so
 * the GS part finds them in the expected registers (slots 0-7 are system
 * SGPRs, user SGPRs start at 8, VGPRs follow).
 */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* System SGPRs. Note: slot 4 is intentionally left untouched here. */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);

	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret,
					   ctx->param_bindless_samplers_and_images,
					   8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

	/* GS descriptor pointers follow vs_state_bits by one parameter slot. */
	unsigned desc_param = ctx->param_vs_state_bits + 1;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);

	/* VGPRs: the five GS vertex/primitive input registers starting at
	 * gs_vtx01_offset, returned as floats. */
	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
	for (unsigned i = 0; i < 5; i++) {
		unsigned param = ctx->param_gs_vtx01_offset + i;
		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
	}
	ctx->return_value = ret;
}
2993
/* LS (VS before tessellation) epilogue: store all vertex outputs to LDS
 * so the TCS can read them as its per-vertex inputs. */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex output stride in dwords; computed by the helper rather
	 * than read from an input register. */
	LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
	/* LDS address (in dwords) where this vertex's outputs start. */
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		if (name == TGSI_SEMANTIC_LAYER ||
		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
			continue;

		/* Each output slot occupies 4 dwords (one vec4). */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					LLVMConstInt(ctx->i32, param * 4, 0), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}

	/* GFX9 merges LS and HS; forward the LS inputs to the HS part. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}
3046
/* ES (VS/TES before GS) epilogue: write all outputs to the ESGS ring so
 * the GS can read them. On GFX9 the ring lives in LDS; on SI-VI it is a
 * memory buffer addressed with es2gs_offset. */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	LLVMValueRef lds_base = NULL;
	unsigned chan;
	int i;

	if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
		/* Global vertex index = wave index * 64 + lane index.
		 * Lane id fits in the low 6 bits, so OR works as ADD here.
		 */
		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
		LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
		vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
					 LLVMBuildMul(gallivm->builder, wave_idx,
						      LLVMConstInt(ctx->i32, 64, false), ""), "");
		lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
	}

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		int param;

		/* See the ARB_shader_viewport_layer_array note in
		 * si_llvm_emit_ls_epilogue: these are ignored when a GS
		 * follows. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
						      info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* GFX9 has the ESGS ring in LDS. */
			if (ctx->screen->b.chip_class >= GFX9) {
				lds_store(bld_base, param * 4 + chan, lds_base, out_val);
				continue;
			}

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->esgs_ring,
						    out_val, 1, NULL, soffset,
						    (4 * param + chan) * 4,
						    1, 1, true, true);
		}
	}

	/* GFX9 merges ES and GS; forward the ES inputs to the GS part. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_es_return_value_for_gs(ctx);
}
3102
3103 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3104 {
3105 if (ctx->screen->b.chip_class >= GFX9)
3106 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3107 else
3108 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3109 }
3110
3111 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
3112 {
3113 struct si_shader_context *ctx = si_shader_context(bld_base);
3114
3115 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3116 si_get_gs_wave_id(ctx));
3117
3118 if (ctx->screen->b.chip_class >= GFX9)
3119 lp_build_endif(&ctx->merged_wrap_if_state);
3120 }
3121
3122 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
3123 unsigned max_outputs,
3124 LLVMValueRef *addrs)
3125 {
3126 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3127 struct gallivm_state *gallivm = &ctx->gallivm;
3128 struct tgsi_shader_info *info = &ctx->shader->selector->info;
3129 struct si_shader_output_values *outputs = NULL;
3130 int i,j;
3131
3132 assert(!ctx->shader->is_gs_copy_shader);
3133 assert(info->num_outputs <= max_outputs);
3134
3135 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3136
3137 /* Vertex color clamping.
3138 *
3139 * This uses a state constant loaded in a user data SGPR and
3140 * an IF statement is added that clamps all colors if the constant
3141 * is true.
3142 */
3143 if (ctx->type == PIPE_SHADER_VERTEX) {
3144 struct lp_build_if_state if_ctx;
3145 LLVMValueRef cond = NULL;
3146 LLVMValueRef addr, val;
3147
3148 for (i = 0; i < info->num_outputs; i++) {
3149 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3150 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3151 continue;
3152
3153 /* We've found a color. */
3154 if (!cond) {
3155 /* The state is in the first bit of the user SGPR. */
3156 cond = LLVMGetParam(ctx->main_fn,
3157 ctx->param_vs_state_bits);
3158 cond = LLVMBuildTrunc(gallivm->builder, cond,
3159 ctx->i1, "");
3160 lp_build_if(&if_ctx, gallivm, cond);
3161 }
3162
3163 for (j = 0; j < 4; j++) {
3164 addr = addrs[4 * i + j];
3165 val = LLVMBuildLoad(gallivm->builder, addr, "");
3166 val = ac_build_clamp(&ctx->ac, val);
3167 LLVMBuildStore(gallivm->builder, val, addr);
3168 }
3169 }
3170
3171 if (cond)
3172 lp_build_endif(&if_ctx);
3173 }
3174
3175 for (i = 0; i < info->num_outputs; i++) {
3176 outputs[i].semantic_name = info->output_semantic_name[i];
3177 outputs[i].semantic_index = info->output_semantic_index[i];
3178
3179 for (j = 0; j < 4; j++) {
3180 outputs[i].values[j] =
3181 LLVMBuildLoad(gallivm->builder,
3182 addrs[4 * i + j],
3183 "");
3184 outputs[i].vertex_stream[j] =
3185 (info->output_streams[i] >> (2 * j)) & 3;
3186 }
3187 }
3188
3189 if (ctx->shader->selector->so.num_outputs)
3190 si_llvm_emit_streamout(ctx, outputs, i, 0);
3191
3192 /* Export PrimitiveID. */
3193 if (ctx->shader->key.mono.u.vs_export_prim_id) {
3194 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3195 outputs[i].semantic_index = 0;
3196 outputs[i].values[0] = LLVMBuildBitCast(gallivm->builder,
3197 get_primitive_id(ctx, 0), ctx->f32, "");
3198 for (j = 1; j < 4; j++)
3199 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3200
3201 memset(outputs[i].vertex_stream, 0,
3202 sizeof(outputs[i].vertex_stream));
3203 i++;
3204 }
3205
3206 si_llvm_export_vs(&ctx->bld_base, outputs, i);
3207 FREE(outputs);
3208 }
3209
3210 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3211 {
3212 struct si_shader_context *ctx = si_shader_context(bld_base);
3213
3214 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3215 &ctx->outputs[0][0]);
3216 }
3217
/* Pixel-shader export instructions collected during codegen and emitted
 * together by si_emit_ps_exports(). */
struct si_ps_exports {
	unsigned num;			/* number of valid entries in args[] */
	struct ac_export_args args[10];	/* 8 MRT colors + MRTZ, with headroom */
};
3222
3223 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3224 bool writes_samplemask)
3225 {
3226 if (writes_z) {
3227 /* Z needs 32 bits. */
3228 if (writes_samplemask)
3229 return V_028710_SPI_SHADER_32_ABGR;
3230 else if (writes_stencil)
3231 return V_028710_SPI_SHADER_32_GR;
3232 else
3233 return V_028710_SPI_SHADER_32_R;
3234 } else if (writes_stencil || writes_samplemask) {
3235 /* Both stencil and sample mask need only 16 bits. */
3236 return V_028710_SPI_SHADER_UINT16_ABGR;
3237 } else {
3238 return V_028710_SPI_SHADER_ZERO;
3239 }
3240 }
3241
/* Build the MRTZ export carrying depth, stencil and/or sample mask, and
 * append it to \p exp. At least one of the three values must be non-NULL. */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args;
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args.valid_mask = 1; /* whether the EXEC mask is valid */
	args.done = 1; /* DONE bit */

	/* Specify the target we are exporting */
	args.target = V_008DFC_SQ_EXP_MRTZ;

	args.compr = 0; /* COMPR flag */
	args.out[0] = base->undef; /* R, depth */
	args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args.out[2] = base->undef; /* B, sample mask */
	args.out[3] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		/* 16-bit compressed layout: stencil in X, samplemask in Y.
		 * Each 32-bit lane covers two channels, hence the mask values
		 * enable channel pairs (0x3 = XY, 0xc = ZW).
		 */
		assert(!depth);
		args.compr = 1; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args.out[1] = samplemask;
			mask |= 0xc;
		}
	} else {
		/* 32-bit layout: one value per channel (R=Z, G=stencil,
		 * B=sample mask). */
		if (depth) {
			args.out[0] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args.out[1] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args.out[2] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND and HAINAN) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND &&
	    ctx->screen->b.family != CHIP_HAINAN)
		mask |= 0x1;

	/* Specify which components to enable */
	args.enabled_channels = mask;

	memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
3312
/* Apply the key-driven color fixups (clamp, alpha-to-one, alpha test,
 * smoothing) to one MRT color and append its export(s) to \p exp.
 *
 * \param color             4 channel values; may be modified in place
 * \param index             color buffer index (MRT0 + index)
 * \param samplemask_param  function parameter index of SampleMaskIn
 * \param is_last           true for the final color export (sets DONE)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.part.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = ac_build_clamp(&ctx->ac, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test */
	if (index == 0 &&
	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
		struct ac_export_args args[8];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
			if (args[c].enabled_channels)
				last = c;
		}

		/* Emit all exports. Only the last enabled one gets the DONE
		 * and valid_mask bits; disabled ones are skipped entirely. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
				args[c].done = 1; /* DONE bit */
			} else if (!args[c].enabled_channels)
				continue; /* unnecessary NULL export */

			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
		}
	} else {
		struct ac_export_args args;

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 &args);
		if (is_last) {
			args.valid_mask = 1; /* whether the EXEC mask is valid */
			args.done = 1; /* DONE bit */
		} else if (!args.enabled_channels)
			return; /* unnecessary NULL export */

		memcpy(&exp->args[exp->num++], &args, sizeof(args));
	}
}
3379
3380 static void si_emit_ps_exports(struct si_shader_context *ctx,
3381 struct si_ps_exports *exp)
3382 {
3383 for (unsigned i = 0; i < exp->num; i++)
3384 ac_build_export(&ctx->ac, &exp->args[i]);
3385 }
3386
3387 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3388 {
3389 struct si_shader_context *ctx = si_shader_context(bld_base);
3390 struct lp_build_context *base = &bld_base->base;
3391 struct ac_export_args args;
3392
3393 args.enabled_channels = 0x0; /* enabled channels */
3394 args.valid_mask = 1; /* whether the EXEC mask is valid */
3395 args.done = 1; /* DONE bit */
3396 args.target = V_008DFC_SQ_EXP_NULL;
3397 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3398 args.out[0] = base->undef; /* R */
3399 args.out[1] = base->undef; /* G */
3400 args.out[2] = base->undef; /* B */
3401 args.out[3] = base->undef; /* A */
3402
3403 ac_build_export(&ctx->ac, &args);
3404 }
3405
3406 /**
3407 * Return PS outputs in this order:
3408 *
3409 * v[0:3] = color0.xyzw
3410 * v[4:7] = color1.xyzw
3411 * ...
3412 * vN+0 = Depth
3413 * vN+1 = Stencil
3414 * vN+2 = SampleMask
3415 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3416 *
3417 * The alpha-ref SGPR is returned via its original location.
3418 */
3419 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3420 unsigned max_outputs,
3421 LLVMValueRef *addrs)
3422 {
3423 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3424 struct si_shader *shader = ctx->shader;
3425 struct tgsi_shader_info *info = &shader->selector->info;
3426 LLVMBuilderRef builder = ctx->gallivm.builder;
3427 unsigned i, j, first_vgpr, vgpr;
3428
3429 LLVMValueRef color[8][4] = {};
3430 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3431 LLVMValueRef ret;
3432
3433 if (ctx->postponed_kill)
3434 ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3435
3436 /* Read the output values. */
3437 for (i = 0; i < info->num_outputs; i++) {
3438 unsigned semantic_name = info->output_semantic_name[i];
3439 unsigned semantic_index = info->output_semantic_index[i];
3440
3441 switch (semantic_name) {
3442 case TGSI_SEMANTIC_COLOR:
3443 assert(semantic_index < 8);
3444 for (j = 0; j < 4; j++) {
3445 LLVMValueRef ptr = addrs[4 * i + j];
3446 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3447 color[semantic_index][j] = result;
3448 }
3449 break;
3450 case TGSI_SEMANTIC_POSITION:
3451 depth = LLVMBuildLoad(builder,
3452 addrs[4 * i + 2], "");
3453 break;
3454 case TGSI_SEMANTIC_STENCIL:
3455 stencil = LLVMBuildLoad(builder,
3456 addrs[4 * i + 1], "");
3457 break;
3458 case TGSI_SEMANTIC_SAMPLEMASK:
3459 samplemask = LLVMBuildLoad(builder,
3460 addrs[4 * i + 0], "");
3461 break;
3462 default:
3463 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3464 semantic_name);
3465 }
3466 }
3467
3468 /* Fill the return structure. */
3469 ret = ctx->return_value;
3470
3471 /* Set SGPRs. */
3472 ret = LLVMBuildInsertValue(builder, ret,
3473 LLVMBuildBitCast(ctx->ac.builder,
3474 LLVMGetParam(ctx->main_fn,
3475 SI_PARAM_ALPHA_REF),
3476 ctx->i32, ""),
3477 SI_SGPR_ALPHA_REF, "");
3478
3479 /* Set VGPRs */
3480 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3481 for (i = 0; i < ARRAY_SIZE(color); i++) {
3482 if (!color[i][0])
3483 continue;
3484
3485 for (j = 0; j < 4; j++)
3486 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3487 }
3488 if (depth)
3489 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3490 if (stencil)
3491 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3492 if (samplemask)
3493 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3494
3495 /* Add the input sample mask for smoothing at the end. */
3496 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3497 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3498 ret = LLVMBuildInsertValue(builder, ret,
3499 LLVMGetParam(ctx->main_fn,
3500 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3501
3502 ctx->return_value = ret;
3503 }
3504
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
static void emit_optimization_barrier(struct si_shader_context *ctx,
				      LLVMValueRef *pvgpr)
{
	/* Each barrier gets a unique comment so LLVM cannot merge or CSE
	 * identical inline-asm calls. */
	static int counter = 0;

	LLVMBuilderRef builder = ctx->gallivm.builder;
	char code[16];

	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

	if (!pvgpr) {
		/* Pure barrier: void asm with side effects, no operands. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
	} else {
		/* Thread the first dword of *pvgpr through the asm ("=v,0":
		 * output tied to input VGPR) so LLVM sees a data dependency.
		 */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
		LLVMValueRef vgpr = *pvgpr;
		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
		unsigned vgpr_size = llvm_get_type_size(vgpr_type);
		LLVMValueRef vgpr0;

		assert(vgpr_size % 4 == 0);

		/* View the value as a vector of i32, pass element 0 through
		 * the asm, and rebuild the original value around the result. */
		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

		*pvgpr = vgpr;
	}
}
3545
3546 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3547 {
3548 struct gallivm_state *gallivm = &ctx->gallivm;
3549 LLVMBuilderRef builder = gallivm->builder;
3550 LLVMValueRef args[1] = {
3551 LLVMConstInt(ctx->i32, simm16, 0)
3552 };
3553 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3554 ctx->voidt, args, 1, 0);
3555 }
3556
/* TGSI MEMBAR: wait for outstanding memory operations selected by the
 * immediate flag operand. NOTE(review): the waitcnt masks appear to use
 * cleared bits to request a wait (ANDing masks together waits on more
 * counters) — confirm against the GCN s_waitcnt encoding. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	/* The barrier flags arrive as an immediate constant in src0. */
	LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
	unsigned flags = LLVMConstIntGetZExtValue(src0);
	unsigned waitcnt = NOOP_WAITCNT;

	/* Thread-group scope covers both memory (VM) and LDS (LGKM). */
	if (flags & TGSI_MEMBAR_THREAD_GROUP)
		waitcnt &= VM_CNT & LGKM_CNT;

	/* Buffer/image access goes through the VM counter. */
	if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
		     TGSI_MEMBAR_SHADER_BUFFER |
		     TGSI_MEMBAR_SHADER_IMAGE))
		waitcnt &= VM_CNT;

	/* Shared (LDS) access goes through the LGKM counter. */
	if (flags & TGSI_MEMBAR_SHARED)
		waitcnt &= LGKM_CNT;

	/* Skip the instruction entirely if nothing needs waiting on. */
	if (waitcnt != NOOP_WAITCNT)
		si_emit_waitcnt(ctx, waitcnt);
}
3581
3582 static void clock_emit(
3583 const struct lp_build_tgsi_action *action,
3584 struct lp_build_tgsi_context *bld_base,
3585 struct lp_build_emit_data *emit_data)
3586 {
3587 struct si_shader_context *ctx = si_shader_context(bld_base);
3588 struct gallivm_state *gallivm = &ctx->gallivm;
3589 LLVMValueRef tmp;
3590
3591 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3592 ctx->i64, NULL, 0, 0);
3593 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3594
3595 emit_data->output[0] =
3596 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3597 emit_data->output[1] =
3598 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3599 }
3600
3601 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3602 {
3603 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3604 CONST_ADDR_SPACE);
3605 }
3606
3607 static void si_llvm_emit_ddxy(
3608 const struct lp_build_tgsi_action *action,
3609 struct lp_build_tgsi_context *bld_base,
3610 struct lp_build_emit_data *emit_data)
3611 {
3612 struct si_shader_context *ctx = si_shader_context(bld_base);
3613 struct gallivm_state *gallivm = &ctx->gallivm;
3614 unsigned opcode = emit_data->info->opcode;
3615 LLVMValueRef val;
3616 int idx;
3617 unsigned mask;
3618
3619 if (opcode == TGSI_OPCODE_DDX_FINE)
3620 mask = AC_TID_MASK_LEFT;
3621 else if (opcode == TGSI_OPCODE_DDY_FINE)
3622 mask = AC_TID_MASK_TOP;
3623 else
3624 mask = AC_TID_MASK_TOP_LEFT;
3625
3626 /* for DDX we want to next X pixel, DDY next Y pixel. */
3627 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3628
3629 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
3630 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
3631 mask, idx, val);
3632 emit_data->output[emit_data->chan] = val;
3633 }
3634
3635 /*
3636 * this takes an I,J coordinate pair,
3637 * and works out the X and Y derivatives.
3638 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3639 */
3640 static LLVMValueRef si_llvm_emit_ddxy_interp(
3641 struct lp_build_tgsi_context *bld_base,
3642 LLVMValueRef interp_ij)
3643 {
3644 struct si_shader_context *ctx = si_shader_context(bld_base);
3645 struct gallivm_state *gallivm = &ctx->gallivm;
3646 LLVMValueRef result[4], a;
3647 unsigned i;
3648
3649 for (i = 0; i < 2; i++) {
3650 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3651 LLVMConstInt(ctx->i32, i, 0), "");
3652 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3653 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3654 }
3655
3656 return lp_build_gather_values(gallivm, result, 4);
3657 }
3658
/* Fetch the extra operands of INTERP_OFFSET / INTERP_SAMPLE into
 * emit_data->args[0..1] as the (x, y) position to interpolate at,
 * relative to the pixel center. INTERP_CENTROID takes no extra args. */
static void interp_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
		/* offset is in second src, first two channels */
		emit_data->args[0] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_X);
		emit_data->args[1] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_Y);
		emit_data->arg_count = 2;
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef sample_position;
		LLVMValueRef sample_id;
		LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);

		/* fetch sample ID, then fetch its sample position,
		 * and place into first two channels.
		 */
		sample_id = lp_build_emit_fetch(bld_base,
						emit_data->inst, 1, TGSI_CHAN_X);
		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
					     ctx->i32, "");
		sample_position = load_sample_position(ctx, sample_id);

		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_0, "");

		/* Sample positions are in [0,1]; subtract 0.5 to make them
		 * relative to the pixel center. */
		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_1, "");
		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
		emit_data->arg_count = 2;
	}
}
3702
/* Emit the TGSI INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET opcodes.
 *
 * Chooses the I/J interpolation parameters based on the input's
 * interpolation mode and the opcode's location (center for OFFSET/SAMPLE,
 * centroid otherwise), optionally adjusts I/J using the per-pixel
 * derivatives (for OFFSET/SAMPLE, whose offsets were placed in
 * emit_data->args by the fetch_args callback), and interpolates each of
 * the four channels with v_interp_* via si_build_fs_interp().
 *
 * Indirect input indexing is handled by interpolating every element of
 * the input array and selecting the requested one with an extractelement.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *input = &inst->Src[0];
	int input_base, input_array_size;
	int chan;
	int i;
	LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
	LLVMValueRef array_idx;
	int interp_param_idx;
	unsigned interp;
	unsigned location;

	assert(input->Register.File == TGSI_FILE_INPUT);

	if (input->Register.Indirect) {
		unsigned array_id = input->Indirect.ArrayID;

		/* Determine the range of inputs the indirect index can hit:
		 * a declared array if ArrayID is set, otherwise everything
		 * from the base index to the last input. */
		if (array_id) {
			input_base = info->input_array_first[array_id];
			input_array_size = info->input_array_last[array_id] - input_base + 1;
		} else {
			input_base = inst->Src[0].Register.Index;
			input_array_size = info->num_inputs - input_base;
		}

		array_idx = si_get_indirect_index(ctx, &input->Indirect,
						  input->Register.Index - input_base);
	} else {
		input_base = inst->Src[0].Register.Index;
		input_array_size = 1;
		array_idx = ctx->i32_0;
	}

	interp = shader->selector->info.input_interpolate[input_base];

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* -1 means this interpolation mode needs no I/J parameters
	 * (e.g. constant interpolation); 0 means "no parameter". */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	else
		interp_param = NULL;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
		}
		interp_param = lp_build_gather_values(gallivm, ij_out, 2);
	}

	if (interp_param) {
		interp_param = LLVMBuildBitCast(gallivm->builder,
			interp_param, LLVMVectorType(ctx->f32, 2), "");
	}

	for (chan = 0; chan < 4; chan++) {
		/* Interpolate every possible array element for this channel,
		 * then pick the one selected by array_idx. */
		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);

		for (unsigned idx = 0; idx < input_array_size; ++idx) {
			LLVMValueRef v, i = NULL, j = NULL;

			if (interp_param) {
				interp_param = LLVMBuildBitCast(gallivm->builder,
					interp_param, LLVMVectorType(ctx->f32, 2), "");
				i = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_0, "");
				j = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_1, "");
			}
			v = si_build_fs_interp(ctx, input_base + idx, schan,
					       prim_mask, i, j);

			gather = LLVMBuildInsertElement(gallivm->builder,
				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
		}

		emit_data->output[chan] = LLVMBuildExtractElement(
			gallivm->builder, gather, array_idx, "");
	}
}
3829
3830 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3831 LLVMValueRef value)
3832 {
3833 struct gallivm_state *gallivm = &ctx->gallivm;
3834 LLVMValueRef args[3] = {
3835 value,
3836 ctx->i32_0,
3837 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3838 };
3839
3840 /* We currently have no other way to prevent LLVM from lifting the icmp
3841 * calls to a dominating basic block.
3842 */
3843 emit_optimization_barrier(ctx, &args[0]);
3844
3845 if (LLVMTypeOf(args[0]) != ctx->i32)
3846 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3847
3848 return lp_build_intrinsic(gallivm->builder,
3849 "llvm.amdgcn.icmp.i32",
3850 ctx->i64, args, 3,
3851 LP_FUNC_ATTR_NOUNWIND |
3852 LP_FUNC_ATTR_READNONE |
3853 LP_FUNC_ATTR_CONVERGENT);
3854 }
3855
3856 static void vote_all_emit(
3857 const struct lp_build_tgsi_action *action,
3858 struct lp_build_tgsi_context *bld_base,
3859 struct lp_build_emit_data *emit_data)
3860 {
3861 struct si_shader_context *ctx = si_shader_context(bld_base);
3862 struct gallivm_state *gallivm = &ctx->gallivm;
3863 LLVMValueRef active_set, vote_set;
3864 LLVMValueRef tmp;
3865
3866 active_set = si_emit_ballot(ctx, ctx->i32_1);
3867 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3868
3869 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3870 emit_data->output[emit_data->chan] =
3871 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3872 }
3873
3874 static void vote_any_emit(
3875 const struct lp_build_tgsi_action *action,
3876 struct lp_build_tgsi_context *bld_base,
3877 struct lp_build_emit_data *emit_data)
3878 {
3879 struct si_shader_context *ctx = si_shader_context(bld_base);
3880 struct gallivm_state *gallivm = &ctx->gallivm;
3881 LLVMValueRef vote_set;
3882 LLVMValueRef tmp;
3883
3884 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3885
3886 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3887 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3888 emit_data->output[emit_data->chan] =
3889 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3890 }
3891
3892 static void vote_eq_emit(
3893 const struct lp_build_tgsi_action *action,
3894 struct lp_build_tgsi_context *bld_base,
3895 struct lp_build_emit_data *emit_data)
3896 {
3897 struct si_shader_context *ctx = si_shader_context(bld_base);
3898 struct gallivm_state *gallivm = &ctx->gallivm;
3899 LLVMValueRef active_set, vote_set;
3900 LLVMValueRef all, none, tmp;
3901
3902 active_set = si_emit_ballot(ctx, ctx->i32_1);
3903 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3904
3905 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3906 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3907 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3908 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3909 emit_data->output[emit_data->chan] =
3910 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3911 }
3912
3913 static void ballot_emit(
3914 const struct lp_build_tgsi_action *action,
3915 struct lp_build_tgsi_context *bld_base,
3916 struct lp_build_emit_data *emit_data)
3917 {
3918 struct si_shader_context *ctx = si_shader_context(bld_base);
3919 LLVMBuilderRef builder = ctx->gallivm.builder;
3920 LLVMValueRef tmp;
3921
3922 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3923 tmp = si_emit_ballot(ctx, tmp);
3924 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3925
3926 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3927 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3928 }
3929
/* Fetch the operands of READ_INVOC: the value to read (arg 0, per-channel)
 * and the source invocation index (arg 1). */
static void read_invoc_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 0, emit_data->src_chan);

	/* Always read the source invocation (= lane) from the X channel. */
	emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 1, TGSI_CHAN_X);
	emit_data->arg_count = 2;
}
3942
3943 static void read_lane_emit(
3944 const struct lp_build_tgsi_action *action,
3945 struct lp_build_tgsi_context *bld_base,
3946 struct lp_build_emit_data *emit_data)
3947 {
3948 struct si_shader_context *ctx = si_shader_context(bld_base);
3949 LLVMBuilderRef builder = ctx->gallivm.builder;
3950
3951 /* We currently have no other way to prevent LLVM from lifting the icmp
3952 * calls to a dominating basic block.
3953 */
3954 emit_optimization_barrier(ctx, &emit_data->args[0]);
3955
3956 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3957 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3958 ctx->i32, "");
3959 }
3960
3961 emit_data->output[emit_data->chan] =
3962 ac_build_intrinsic(&ctx->ac, action->intr_name,
3963 ctx->i32, emit_data->args, emit_data->arg_count,
3964 AC_FUNC_ATTR_READNONE |
3965 AC_FUNC_ATTR_CONVERGENT);
3966 }
3967
3968 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3969 struct lp_build_emit_data *emit_data)
3970 {
3971 struct si_shader_context *ctx = si_shader_context(bld_base);
3972 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3973 LLVMValueRef imm;
3974 unsigned stream;
3975
3976 assert(src0.File == TGSI_FILE_IMMEDIATE);
3977
3978 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
3979 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
3980 return stream;
3981 }
3982
/* Emit one vertex from the geometry shader:
 * store all outputs belonging to the current vertex stream into the GSVS
 * ring buffer, bump the per-stream vertex counter, and send the
 * GS_OP_EMIT message. */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_if_state if_state;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_gs2vs_offset);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	unsigned chan, offset;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 *
	 * If the shader has no writes to memory, kill it instead. This skips
	 * further memory loads and may allow LLVM to skip to the end
	 * altogether.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->i32,
					      shader->selector->gs_max_out_vertices, 0), "");

	bool use_kill = !info->writes_memory;
	if (use_kill) {
		/* can_emit ? 1.0 : -1.0; a negative value passed to
		 * ac_build_kill terminates the thread. */
		kill = lp_build_select(&bld_base->base, can_emit,
				       LLVMConstReal(ctx->f32, 1.0f),
				       LLVMConstReal(ctx->f32, -1.0f));

		ac_build_kill(&ctx->ac, kill);
	} else {
		lp_build_if(&if_state, gallivm, can_emit);
	}

	offset = 0;
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];

		for (chan = 0; chan < 4; chan++) {
			/* Skip channels that aren't written or that belong
			 * to a different vertex stream. */
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Each attribute component occupies a run of
			 * gs_max_out_vertices dwords in the ring; the current
			 * vertex counter indexes within that run. */
			LLVMValueRef voffset =
				LLVMConstInt(ctx->i32, offset *
					     shader->selector->gs_max_out_vertices, 0);
			offset++;

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, soffset, 0,
						    1, 1, true, true);
		}
	}

	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      ctx->i32_1);

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 si_get_gs_wave_id(ctx));
	if (!use_kill)
		lp_build_endif(&if_state);
}
4072
4073 /* Cut one primitive from the geometry shader */
4074 static void si_llvm_emit_primitive(
4075 const struct lp_build_tgsi_action *action,
4076 struct lp_build_tgsi_context *bld_base,
4077 struct lp_build_emit_data *emit_data)
4078 {
4079 struct si_shader_context *ctx = si_shader_context(bld_base);
4080 unsigned stream;
4081
4082 /* Signal primitive cut */
4083 stream = si_llvm_get_stream(bld_base, emit_data);
4084 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4085 si_get_gs_wave_id(ctx));
4086 }
4087
/* Emit a workgroup barrier (s_barrier), with the SI TCS special case. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	/* SI only (thanks to a hw bug workaround):
	 * The real barrier instruction isn’t needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* NOTE(review): LGKM_CNT & VM_CNT looks like it should be
		 * "|", but presumably the masks encode "wait" as zeroed
		 * counter fields, so AND combines both waits — confirm
		 * against the s_waitcnt encoding used by si_emit_waitcnt. */
		si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
		return;
	}

	lp_build_intrinsic(gallivm->builder,
			   "llvm.amdgcn.s.barrier",
			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
}
4109
/* Shared action for the TGSI INTERP_* opcodes: fetch_args gathers the
 * offset/sample operands, emit builds the interpolation code. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4114
/* Create the LLVM function for the current shader, mark its SGPR
 * parameters, store parameter values into the locations requested via
 * fninfo->assign[], and set target-dependent function attributes.
 *
 * \param returns/num_returns  return value types (used by merged shaders
 *                             whose outputs feed the next shader part)
 * \param fninfo               parameter types and assignment slots
 * \param max_workgroup_size   if non-zero, passed to the backend as the
 *                             "amdgpu-max-work-group-size" attribute
 */
static void si_create_function(struct si_shader_context *ctx,
			       const char *name,
			       LLVMTypeRef *returns, unsigned num_returns,
			       struct si_function_info *fninfo,
			       unsigned max_workgroup_size)
{
	int i;

	si_llvm_create_func(ctx, name, returns, num_returns,
			    fninfo->types, fninfo->num_params);
	ctx->return_value = LLVMGetUndef(ctx->return_type);

	for (i = 0; i < fninfo->num_sgpr_params; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			/* Non-pointer SGPR arguments are passed in registers. */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
	}

	/* Copy parameter values out to the context fields registered with
	 * add_arg_assign(). */
	for (i = 0; i < fninfo->num_params; ++i) {
		if (fninfo->assign[i])
			*fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
	}

	if (max_workgroup_size) {
		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
				      max_workgroup_size);
	}
	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
					   "no-signed-zeros-fp-math",
					   "true");

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
4174
4175 static void declare_streamout_params(struct si_shader_context *ctx,
4176 struct pipe_stream_output_info *so,
4177 struct si_function_info *fninfo)
4178 {
4179 int i;
4180
4181 /* Streamout SGPRs. */
4182 if (so->num_outputs) {
4183 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4184 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4185 else
4186 ctx->param_streamout_config = fninfo->num_params - 1;
4187
4188 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4189 }
4190 /* A streamout buffer offset is loaded if the stride is non-zero. */
4191 for (i = 0; i < 4; i++) {
4192 if (!so->stride[i])
4193 continue;
4194
4195 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4196 }
4197 }
4198
4199 static unsigned llvm_get_type_size(LLVMTypeRef type)
4200 {
4201 LLVMTypeKind kind = LLVMGetTypeKind(type);
4202
4203 switch (kind) {
4204 case LLVMIntegerTypeKind:
4205 return LLVMGetIntTypeWidth(type) / 8;
4206 case LLVMFloatTypeKind:
4207 return 4;
4208 case LLVMPointerTypeKind:
4209 return 8;
4210 case LLVMVectorTypeKind:
4211 return LLVMGetVectorSize(type) *
4212 llvm_get_type_size(LLVMGetElementType(type));
4213 case LLVMArrayTypeKind:
4214 return LLVMGetArrayLength(type) *
4215 llvm_get_type_size(LLVMGetElementType(type));
4216 default:
4217 assert(0);
4218 return 0;
4219 }
4220 }
4221
4222 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4223 {
4224 struct gallivm_state *gallivm = &ctx->gallivm;
4225
4226 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4227 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4228 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4229 "lds");
4230 }
4231
4232 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4233 {
4234 switch (shader->selector->type) {
4235 case PIPE_SHADER_TESS_CTRL:
4236 /* Return this so that LLVM doesn't remove s_barrier
4237 * instructions on chips where we use s_barrier. */
4238 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4239
4240 case PIPE_SHADER_GEOMETRY:
4241 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4242
4243 case PIPE_SHADER_COMPUTE:
4244 break; /* see below */
4245
4246 default:
4247 return 0;
4248 }
4249
4250 const unsigned *properties = shader->selector->info.properties;
4251 unsigned max_work_group_size =
4252 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4253 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4254 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4255
4256 if (!max_work_group_size) {
4257 /* This is a variable group size compute shader,
4258 * compile it for the maximum possible group size.
4259 */
4260 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4261 }
4262 return max_work_group_size;
4263 }
4264
4265 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4266 struct si_function_info *fninfo,
4267 bool assign_params)
4268 {
4269 unsigned const_and_shader_buffers =
4270 add_arg(fninfo, ARG_SGPR,
4271 si_const_array(ctx->v4i32,
4272 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
4273 unsigned samplers_and_images =
4274 add_arg(fninfo, ARG_SGPR,
4275 si_const_array(ctx->v8i32,
4276 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4277
4278 if (assign_params) {
4279 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4280 ctx->param_samplers_and_images = samplers_and_images;
4281 }
4282 }
4283
4284 static void declare_default_desc_pointers(struct si_shader_context *ctx,
4285 struct si_function_info *fninfo)
4286 {
4287 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4288 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4289 ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4290 si_const_array(ctx->v8i32, 0));
4291 declare_per_stage_desc_pointers(ctx, fninfo, true);
4292 }
4293
4294 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4295 struct si_function_info *fninfo)
4296 {
4297 ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
4298 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
4299 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4300 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4301 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4302 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4303 }
4304
4305 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4306 struct si_function_info *fninfo,
4307 unsigned *num_prolog_vgprs)
4308 {
4309 struct si_shader *shader = ctx->shader;
4310
4311 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4312 if (shader->key.as_ls) {
4313 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4314 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4315 } else {
4316 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4317 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4318 }
4319 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4320
4321 if (!shader->is_gs_copy_shader) {
4322 /* Vertex load indices. */
4323 ctx->param_vertex_index0 = fninfo->num_params;
4324 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4325 add_arg(fninfo, ARG_VGPR, ctx->i32);
4326 *num_prolog_vgprs += shader->selector->info.num_inputs;
4327 }
4328 }
4329
/* Declare the TES input VGPRs: the u/v tessellation coordinates, the
 * relative patch ID and the patch ID. */
static void declare_tes_input_vgprs(struct si_shader_context *ctx,
				    struct si_function_info *fninfo)
{
	ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
	ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
	ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
}
4338
enum {
	/* Convenient merged shader definitions.
	 * On GFX9, create_function() maps LS/HS to the first value and
	 * ES/GS to the second; they extend PIPE_SHADER_TYPES so they
	 * don't collide with the real shader types. */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
4344
4345 static void create_function(struct si_shader_context *ctx)
4346 {
4347 struct si_shader *shader = ctx->shader;
4348 struct si_function_info fninfo;
4349 LLVMTypeRef returns[16+32*4];
4350 unsigned i, num_return_sgprs;
4351 unsigned num_returns = 0;
4352 unsigned num_prolog_vgprs = 0;
4353 unsigned type = ctx->type;
4354
4355 si_init_function_info(&fninfo);
4356
4357 /* Set MERGED shaders. */
4358 if (ctx->screen->b.chip_class >= GFX9) {
4359 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4360 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4361 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4362 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4363 }
4364
4365 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4366
4367 switch (type) {
4368 case PIPE_SHADER_VERTEX:
4369 declare_default_desc_pointers(ctx, &fninfo);
4370 declare_vs_specific_input_sgprs(ctx, &fninfo);
4371
4372 if (shader->key.as_es) {
4373 assert(!shader->selector->nir);
4374 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4375 } else if (shader->key.as_ls) {
4376 assert(!shader->selector->nir);
4377 /* no extra parameters */
4378 } else {
4379 if (shader->is_gs_copy_shader) {
4380 fninfo.num_params = ctx->param_rw_buffers + 1;
4381 fninfo.num_sgpr_params = fninfo.num_params;
4382 }
4383
4384 /* The locations of the other parameters are assigned dynamically. */
4385 declare_streamout_params(ctx, &shader->selector->so,
4386 &fninfo);
4387 }
4388
4389 /* VGPRs */
4390 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4391 break;
4392
4393 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4394 declare_default_desc_pointers(ctx, &fninfo);
4395 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4396 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4397 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4398 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4399 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4400 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4401 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4402 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4403
4404 /* VGPRs */
4405 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4406 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4407
4408 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4409 * placed after the user SGPRs.
4410 */
4411 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4412 returns[num_returns++] = ctx->i32; /* SGPRs */
4413 for (i = 0; i < 5; i++)
4414 returns[num_returns++] = ctx->f32; /* VGPRs */
4415 break;
4416
4417 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4418 /* Merged stages have 8 system SGPRs at the beginning. */
4419 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4420 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4421 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4422 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4423 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4424 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4425 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4426 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4427
4428 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4429 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4430
4431 ctx->param_bindless_samplers_and_images =
4432 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
4433
4434 declare_per_stage_desc_pointers(ctx, &fninfo,
4435 ctx->type == PIPE_SHADER_VERTEX);
4436 declare_vs_specific_input_sgprs(ctx, &fninfo);
4437
4438 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4439 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4440 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4441 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4442 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4443 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4444
4445 declare_per_stage_desc_pointers(ctx, &fninfo,
4446 ctx->type == PIPE_SHADER_TESS_CTRL);
4447
4448 /* VGPRs (first TCS, then VS) */
4449 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4450 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4451
4452 if (ctx->type == PIPE_SHADER_VERTEX) {
4453 declare_vs_input_vgprs(ctx, &fninfo,
4454 &num_prolog_vgprs);
4455
4456 /* LS return values are inputs to the TCS main shader part. */
4457 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4458 returns[num_returns++] = ctx->i32; /* SGPRs */
4459 for (i = 0; i < 2; i++)
4460 returns[num_returns++] = ctx->f32; /* VGPRs */
4461 } else {
4462 /* TCS return values are inputs to the TCS epilog.
4463 *
4464 * param_tcs_offchip_offset, param_tcs_factor_offset,
4465 * param_tcs_offchip_layout, and param_rw_buffers
4466 * should be passed to the epilog.
4467 */
4468 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4469 returns[num_returns++] = ctx->i32; /* SGPRs */
4470 for (i = 0; i < 5; i++)
4471 returns[num_returns++] = ctx->f32; /* VGPRs */
4472 }
4473 break;
4474
4475 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4476 /* Merged stages have 8 system SGPRs at the beginning. */
4477 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4478 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4479 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4480 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4481 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4482 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4483 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4484 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4485
4486 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4487 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4488
4489 ctx->param_bindless_samplers_and_images =
4490 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
4491
4492 declare_per_stage_desc_pointers(ctx, &fninfo,
4493 (ctx->type == PIPE_SHADER_VERTEX ||
4494 ctx->type == PIPE_SHADER_TESS_EVAL));
4495 if (ctx->type == PIPE_SHADER_VERTEX) {
4496 declare_vs_specific_input_sgprs(ctx, &fninfo);
4497 } else {
4498 /* TESS_EVAL (and also GEOMETRY):
4499 * Declare as many input SGPRs as the VS has. */
4500 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4501 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4502 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4503 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4504 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4505 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4506 }
4507
4508 declare_per_stage_desc_pointers(ctx, &fninfo,
4509 ctx->type == PIPE_SHADER_GEOMETRY);
4510
4511 /* VGPRs (first GS, then VS/TES) */
4512 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4513 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4514 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4515 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4516 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4517
4518 if (ctx->type == PIPE_SHADER_VERTEX) {
4519 declare_vs_input_vgprs(ctx, &fninfo,
4520 &num_prolog_vgprs);
4521 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4522 declare_tes_input_vgprs(ctx, &fninfo);
4523 }
4524
4525 if (ctx->type == PIPE_SHADER_VERTEX ||
4526 ctx->type == PIPE_SHADER_TESS_EVAL) {
4527 /* ES return values are inputs to GS. */
4528 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4529 returns[num_returns++] = ctx->i32; /* SGPRs */
4530 for (i = 0; i < 5; i++)
4531 returns[num_returns++] = ctx->f32; /* VGPRs */
4532 }
4533 break;
4534
4535 case PIPE_SHADER_TESS_EVAL:
4536 declare_default_desc_pointers(ctx, &fninfo);
4537 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4538 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4539
4540 if (shader->key.as_es) {
4541 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4542 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4543 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4544 } else {
4545 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4546 declare_streamout_params(ctx, &shader->selector->so,
4547 &fninfo);
4548 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4549 }
4550
4551 /* VGPRs */
4552 declare_tes_input_vgprs(ctx, &fninfo);
4553 break;
4554
4555 case PIPE_SHADER_GEOMETRY:
4556 declare_default_desc_pointers(ctx, &fninfo);
4557 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4558 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4559
4560 /* VGPRs */
4561 ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4562 ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4563 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4564 ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4565 ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4566 ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4567 ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4568 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4569 break;
4570
4571 case PIPE_SHADER_FRAGMENT:
4572 declare_default_desc_pointers(ctx, &fninfo);
4573 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4574 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4575
4576 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4577 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4578 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4579 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4580 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4581 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4582 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4583 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4584 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4585 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4586 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4587 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4588 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4589 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4590 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4591 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4592 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4593 &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4594 shader->info.face_vgpr_index = 20;
4595 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4596 &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4597 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4598 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4599 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4600
4601 /* Color inputs from the prolog. */
4602 if (shader->selector->info.colors_read) {
4603 unsigned num_color_elements =
4604 util_bitcount(shader->selector->info.colors_read);
4605
4606 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4607 for (i = 0; i < num_color_elements; i++)
4608 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4609
4610 num_prolog_vgprs += num_color_elements;
4611 }
4612
4613 /* Outputs for the epilog. */
4614 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4615 num_returns =
4616 num_return_sgprs +
4617 util_bitcount(shader->selector->info.colors_written) * 4 +
4618 shader->selector->info.writes_z +
4619 shader->selector->info.writes_stencil +
4620 shader->selector->info.writes_samplemask +
4621 1 /* SampleMaskIn */;
4622
4623 num_returns = MAX2(num_returns,
4624 num_return_sgprs +
4625 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4626
4627 for (i = 0; i < num_return_sgprs; i++)
4628 returns[i] = ctx->i32;
4629 for (; i < num_returns; i++)
4630 returns[i] = ctx->f32;
4631 break;
4632
4633 case PIPE_SHADER_COMPUTE:
4634 declare_default_desc_pointers(ctx, &fninfo);
4635 if (shader->selector->info.uses_grid_size)
4636 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4637 if (shader->selector->info.uses_block_size)
4638 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4639
4640 for (i = 0; i < 3; i++) {
4641 ctx->param_block_id[i] = -1;
4642 if (shader->selector->info.uses_block_id[i])
4643 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4644 }
4645
4646 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4647 break;
4648 default:
4649 assert(0 && "unimplemented shader");
4650 return;
4651 }
4652
4653 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4654 si_get_max_workgroup_size(shader));
4655
4656 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4657 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4658 ctx->separate_prolog) {
4659 si_llvm_add_attribute(ctx->main_fn,
4660 "InitialPSInputAddr",
4661 S_0286D0_PERSP_SAMPLE_ENA(1) |
4662 S_0286D0_PERSP_CENTER_ENA(1) |
4663 S_0286D0_PERSP_CENTROID_ENA(1) |
4664 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4665 S_0286D0_LINEAR_CENTER_ENA(1) |
4666 S_0286D0_LINEAR_CENTROID_ENA(1) |
4667 S_0286D0_FRONT_FACE_ENA(1) |
4668 S_0286D0_POS_FIXED_PT_ENA(1));
4669 }
4670
4671 shader->info.num_input_sgprs = 0;
4672 shader->info.num_input_vgprs = 0;
4673
4674 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4675 shader->info.num_input_sgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4676
4677 for (; i < fninfo.num_params; ++i)
4678 shader->info.num_input_vgprs += llvm_get_type_size(fninfo.types[i]) / 4;
4679
4680 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4681 shader->info.num_input_vgprs -= num_prolog_vgprs;
4682
4683 if (shader->key.as_ls ||
4684 ctx->type == PIPE_SHADER_TESS_CTRL ||
4685 /* GFX9 has the ESGS ring buffer in LDS. */
4686 (ctx->screen->b.chip_class >= GFX9 &&
4687 (shader->key.as_es ||
4688 ctx->type == PIPE_SHADER_GEOMETRY)))
4689 declare_lds_as_pointer(ctx);
4690 }
4691
4692 /**
4693 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4694 * for later use.
4695 */
4696 static void preload_ring_buffers(struct si_shader_context *ctx)
4697 {
4698 struct gallivm_state *gallivm = &ctx->gallivm;
4699 LLVMBuilderRef builder = gallivm->builder;
4700
4701 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4702 ctx->param_rw_buffers);
4703
4704 if (ctx->screen->b.chip_class <= VI &&
4705 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4706 unsigned ring =
4707 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4708 : SI_ES_RING_ESGS;
4709 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4710
4711 ctx->esgs_ring =
4712 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4713 }
4714
4715 if (ctx->shader->is_gs_copy_shader) {
4716 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4717
4718 ctx->gsvs_ring[0] =
4719 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4720 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4721 const struct si_shader_selector *sel = ctx->shader->selector;
4722 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4723 LLVMValueRef base_ring;
4724
4725 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4726
4727 /* The conceptual layout of the GSVS ring is
4728 * v0c0 .. vLv0 v0c1 .. vLc1 ..
4729 * but the real memory layout is swizzled across
4730 * threads:
4731 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4732 * t16v0c0 ..
4733 * Override the buffer descriptor accordingly.
4734 */
4735 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4736 uint64_t stream_offset = 0;
4737
4738 for (unsigned stream = 0; stream < 4; ++stream) {
4739 unsigned num_components;
4740 unsigned stride;
4741 unsigned num_records;
4742 LLVMValueRef ring, tmp;
4743
4744 num_components = sel->info.num_stream_output_components[stream];
4745 if (!num_components)
4746 continue;
4747
4748 stride = 4 * num_components * sel->gs_max_out_vertices;
4749
4750 /* Limit on the stride field for <= CIK. */
4751 assert(stride < (1 << 14));
4752
4753 num_records = 64;
4754
4755 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
4756 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
4757 tmp = LLVMBuildAdd(builder, tmp,
4758 LLVMConstInt(ctx->i64,
4759 stream_offset, 0), "");
4760 stream_offset += stride * 64;
4761
4762 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
4763 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
4764 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
4765 tmp = LLVMBuildOr(builder, tmp,
4766 LLVMConstInt(ctx->i32,
4767 S_008F04_STRIDE(stride) |
4768 S_008F04_SWIZZLE_ENABLE(1), 0), "");
4769 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
4770 ring = LLVMBuildInsertElement(builder, ring,
4771 LLVMConstInt(ctx->i32, num_records, 0),
4772 LLVMConstInt(ctx->i32, 2, 0), "");
4773 ring = LLVMBuildInsertElement(builder, ring,
4774 LLVMConstInt(ctx->i32,
4775 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4776 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4777 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4778 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
4779 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4780 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
4781 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
4782 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
4783 S_008F0C_ADD_TID_ENABLE(1),
4784 0),
4785 LLVMConstInt(ctx->i32, 3, 0), "");
4786
4787 ctx->gsvs_ring[stream] = ring;
4788 }
4789 }
4790 }
4791
/* Emit code that kills the fragment if its window position falls on a zero
 * bit of the 32x32 polygon stipple pattern stored in a constant buffer.
 * Called from the PS prolog when part.ps.prolog.poly_stipple is set.
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	/* X is packed in bits [4:0], Y in bits [20:16] of the fixed-pt VGPR. */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
	desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(ctx, desc, offset);
	row = LLVMBuildBitCast(builder, row, ctx->i32, "");
	/* Select bit X of the row; the fragment lives iff the bit is set. */
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	ac_build_kill(&ctx->ac, bit);
}
4824
/* Parse the config section LLVM emitted into the shader binary and fill in
 * *conf. The section is a list of (register offset, value) dword pairs; we
 * extract register counts, LDS size, scratch size and SPI PS input masks.
 *
 * \param symbol_offset  offset of the symbol whose config block to read
 *                       (a binary can contain several symbols).
 */
void si_shader_binary_read_config(struct ac_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct ac_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* Each entry is 8 bytes: a register offset dword + a value dword. */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* The SGPRS/VGPRS fields are encoded in allocation
			 * granules (8 SGPRs / 4 VGPRs per granule, biased by 1). */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode =  G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn once per process so new LLVM config registers
			 * don't flood stderr. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* Older LLVM doesn't emit SPI_PS_INPUT_ADDR; fall back to the ENA mask. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
4908
4909 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4910 uint64_t scratch_va)
4911 {
4912 unsigned i;
4913 uint32_t scratch_rsrc_dword0 = scratch_va;
4914 uint32_t scratch_rsrc_dword1 =
4915 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
4916
4917 /* Enable scratch coalescing. */
4918 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
4919
4920 for (i = 0 ; i < shader->binary.reloc_count; i++) {
4921 const struct ac_shader_reloc *reloc =
4922 &shader->binary.relocs[i];
4923 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
4924 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4925 &scratch_rsrc_dword0, 4);
4926 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4927 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4928 &scratch_rsrc_dword1, 4);
4929 }
4930 }
4931 }
4932
4933 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4934 {
4935 unsigned size = shader->binary.code_size;
4936
4937 if (shader->prolog)
4938 size += shader->prolog->binary.code_size;
4939 if (shader->previous_stage)
4940 size += shader->previous_stage->binary.code_size;
4941 if (shader->prolog2)
4942 size += shader->prolog2->binary.code_size;
4943 if (shader->epilog)
4944 size += shader->epilog->binary.code_size;
4945 return size;
4946 }
4947
4948 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4949 {
4950 const struct ac_shader_binary *prolog =
4951 shader->prolog ? &shader->prolog->binary : NULL;
4952 const struct ac_shader_binary *previous_stage =
4953 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4954 const struct ac_shader_binary *prolog2 =
4955 shader->prolog2 ? &shader->prolog2->binary : NULL;
4956 const struct ac_shader_binary *epilog =
4957 shader->epilog ? &shader->epilog->binary : NULL;
4958 const struct ac_shader_binary *mainb = &shader->binary;
4959 unsigned bo_size = si_get_shader_binary_size(shader) +
4960 (!epilog ? mainb->rodata_size : 0);
4961 unsigned char *ptr;
4962
4963 assert(!prolog || !prolog->rodata_size);
4964 assert(!previous_stage || !previous_stage->rodata_size);
4965 assert(!prolog2 || !prolog2->rodata_size);
4966 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4967 !mainb->rodata_size);
4968 assert(!epilog || !epilog->rodata_size);
4969
4970 r600_resource_reference(&shader->bo, NULL);
4971 shader->bo = (struct r600_resource*)
4972 pipe_buffer_create(&sscreen->b.b, 0,
4973 PIPE_USAGE_IMMUTABLE,
4974 align(bo_size, SI_CPDMA_ALIGNMENT));
4975 if (!shader->bo)
4976 return -ENOMEM;
4977
4978 /* Upload. */
4979 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4980 PIPE_TRANSFER_READ_WRITE |
4981 PIPE_TRANSFER_UNSYNCHRONIZED);
4982
4983 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4984 * endian-independent. */
4985 if (prolog) {
4986 memcpy(ptr, prolog->code, prolog->code_size);
4987 ptr += prolog->code_size;
4988 }
4989 if (previous_stage) {
4990 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4991 ptr += previous_stage->code_size;
4992 }
4993 if (prolog2) {
4994 memcpy(ptr, prolog2->code, prolog2->code_size);
4995 ptr += prolog2->code_size;
4996 }
4997
4998 memcpy(ptr, mainb->code, mainb->code_size);
4999 ptr += mainb->code_size;
5000
5001 if (epilog)
5002 memcpy(ptr, epilog->code, epilog->code_size);
5003 else if (mainb->rodata_size > 0)
5004 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5005
5006 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5007 return 0;
5008 }
5009
5010 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5011 struct pipe_debug_callback *debug,
5012 const char *name, FILE *file)
5013 {
5014 char *line, *p;
5015 unsigned i, count;
5016
5017 if (binary->disasm_string) {
5018 fprintf(file, "Shader %s disassembly:\n", name);
5019 fprintf(file, "%s", binary->disasm_string);
5020
5021 if (debug && debug->debug_message) {
5022 /* Very long debug messages are cut off, so send the
5023 * disassembly one line at a time. This causes more
5024 * overhead, but on the plus side it simplifies
5025 * parsing of resulting logs.
5026 */
5027 pipe_debug_message(debug, SHADER_INFO,
5028 "Shader Disassembly Begin");
5029
5030 line = binary->disasm_string;
5031 while (*line) {
5032 p = util_strchrnul(line, '\n');
5033 count = p - line;
5034
5035 if (count) {
5036 pipe_debug_message(debug, SHADER_INFO,
5037 "%.*s", count, line);
5038 }
5039
5040 if (!*p)
5041 break;
5042 line = p + 1;
5043 }
5044
5045 pipe_debug_message(debug, SHADER_INFO,
5046 "Shader Disassembly End");
5047 }
5048 } else {
5049 fprintf(file, "Shader %s binary:\n", name);
5050 for (i = 0; i < binary->code_size; i += 4) {
5051 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5052 binary->code[i + 3], binary->code[i + 2],
5053 binary->code[i + 1], binary->code[i]);
5054 }
5055 }
5056 }
5057
/* Print shader resource statistics (register usage, LDS, scratch) and an
 * estimated per-SIMD wave occupancy, both to \p file and via the debug
 * callback.
 *
 * \param check_debug_option  if true, only print when the R600_DEBUG flag
 *                            for this shader stage is enabled.
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 const struct si_shader *shader,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file,
				 bool check_debug_option)
{
	const struct si_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
	unsigned code_size = si_get_shader_binary_size(shader);
	/* LDS allocation granularity: 512 bytes on CIK+, 256 before. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves;

	switch (sscreen->b.family) {
	/* These always have 8 waves: */
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
		max_simd_waves = 8;
		break;
	default:
		max_simd_waves = 10;
	}

	/* Compute LDS usage for PS. */
	switch (processor) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			/* LDS is allocated per thread group; divide it across
			 * the waves of the largest possible group. */
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size, 64);
		}
		break;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* SGPR budget per SIMD: 800 allocatable on VI+, 512 before. */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	/* 256 VGPRs per SIMD lane. */
	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied). */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Private memory VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs,
			conf->private_mem_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Always report the stats to the debug callback, regardless of the
	 * dump options above. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs, conf->private_mem_vgprs);
}
5161
5162 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5163 {
5164 switch (processor) {
5165 case PIPE_SHADER_VERTEX:
5166 if (shader->key.as_es)
5167 return "Vertex Shader as ES";
5168 else if (shader->key.as_ls)
5169 return "Vertex Shader as LS";
5170 else
5171 return "Vertex Shader as VS";
5172 case PIPE_SHADER_TESS_CTRL:
5173 return "Tessellation Control Shader";
5174 case PIPE_SHADER_TESS_EVAL:
5175 if (shader->key.as_es)
5176 return "Tessellation Evaluation Shader as ES";
5177 else
5178 return "Tessellation Evaluation Shader as VS";
5179 case PIPE_SHADER_GEOMETRY:
5180 if (shader->is_gs_copy_shader)
5181 return "GS Copy Shader as VS";
5182 else
5183 return "Geometry Shader";
5184 case PIPE_SHADER_FRAGMENT:
5185 return "Pixel Shader";
5186 case PIPE_SHADER_COMPUTE:
5187 return "Compute Shader";
5188 default:
5189 return "Unknown Shader";
5190 }
5191 }
5192
/* Dump everything about a shader: key, optional LLVM IR, disassembly of all
 * parts, and statistics.
 *
 * \param check_debug_option  if true, each section is gated on the R600_DEBUG
 *                            flags (stage flag, DBG_NO_ASM); if false, print
 *                            unconditionally (used by the ddebug pipe).
 */
void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file, bool check_debug_option)
{
	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, shader, file);

	/* LLVM IR is only recorded when sscreen->record_llvm_ir is set;
	 * it's printed only on the unconditional (ddebug) path. */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		if (shader->previous_stage &&
		    shader->previous_stage->binary.llvm_ir_string) {
			fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
				si_get_shader_name(shader, processor));
			fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
		}

		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		/* Dump the parts in upload order (see si_shader_binary_upload). */
		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(&shader->previous_stage->binary,
						   debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(&shader->prolog2->binary,
						   debug, "prolog2", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, debug, processor, file,
			     check_debug_option);
}
5240
5241 static int si_compile_llvm(struct si_screen *sscreen,
5242 struct ac_shader_binary *binary,
5243 struct si_shader_config *conf,
5244 LLVMTargetMachineRef tm,
5245 LLVMModuleRef mod,
5246 struct pipe_debug_callback *debug,
5247 unsigned processor,
5248 const char *name)
5249 {
5250 int r = 0;
5251 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5252
5253 if (r600_can_dump_shader(&sscreen->b, processor)) {
5254 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5255
5256 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5257 fprintf(stderr, "%s LLVM IR:\n\n", name);
5258 ac_dump_module(mod);
5259 fprintf(stderr, "\n");
5260 }
5261 }
5262
5263 if (sscreen->record_llvm_ir) {
5264 char *ir = LLVMPrintModuleToString(mod);
5265 binary->llvm_ir_string = strdup(ir);
5266 LLVMDisposeMessage(ir);
5267 }
5268
5269 if (!si_replace_shader(count, binary)) {
5270 r = si_llvm_compile(mod, binary, tm, debug);
5271 if (r)
5272 return r;
5273 }
5274
5275 si_shader_binary_read_config(binary, conf, 0);
5276
5277 /* Enable 64-bit and 16-bit denormals, because there is no performance
5278 * cost.
5279 *
5280 * If denormals are enabled, all floating-point output modifiers are
5281 * ignored.
5282 *
5283 * Don't enable denormals for 32-bit floats, because:
5284 * - Floating-point output modifiers would be ignored by the hw.
5285 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5286 * have to stop using those.
5287 * - SI & CI would be very slow.
5288 */
5289 conf->float_mode |= V_00B028_FP_64_DENORMS;
5290
5291 FREE(binary->config);
5292 FREE(binary->global_symbol_offsets);
5293 binary->config = NULL;
5294 binary->global_symbol_offsets = NULL;
5295
5296 /* Some shaders can't have rodata because their binaries can be
5297 * concatenated.
5298 */
5299 if (binary->rodata_size &&
5300 (processor == PIPE_SHADER_VERTEX ||
5301 processor == PIPE_SHADER_TESS_CTRL ||
5302 processor == PIPE_SHADER_TESS_EVAL ||
5303 processor == PIPE_SHADER_FRAGMENT)) {
5304 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5305 return -EINVAL;
5306 }
5307
5308 return r;
5309 }
5310
5311 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5312 {
5313 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5314 LLVMBuildRetVoid(ctx->gallivm.builder);
5315 else
5316 LLVMBuildRet(ctx->gallivm.builder, ret);
5317 }
5318
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The GS copy shader runs as a hardware VS: for each vertex it reads the
 * GS outputs of one stream from the GSVS ring, then performs streamout
 * and (for stream 0) the position/parameter exports. Returns a newly
 * allocated si_shader, or NULL on failure (caller owns the result).
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	/* Only the address of the field is taken here; ctx is initialized
	 * below by si_init_shader_ctx before gallivm is dereferenced. */
	struct gallivm_state *gallivm = &ctx.gallivm;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}


	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* Compile as a VS; the type is switched to GS below only for dumping. */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = gallivm->builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Ring offset of this vertex: VertexID * 4 bytes (one dword/channel,
	 * further strided by soffset per output below). */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);

	/* Fetch the vertex stream ID. */
	LLVMValueRef stream_id;

	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* 2 bits per channel encode which stream it belongs to. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	/* Build one switch case per vertex stream, all falling through
	 * to a common end block. */
	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams > 0 are only useful for transform feedback. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Channels not written, or written to another
				 * stream, are undef for this case block. */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				/* Constant offset of this channel's slice:
				 * slot index * max vertices * 16 threads * 4 bytes. */
				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is rasterized, so only it exports. */
		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
5468
5469 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5470 const struct si_vs_prolog_bits *prolog,
5471 const char *prefix, FILE *f)
5472 {
5473 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5474 prefix, prolog->instance_divisor_is_one);
5475 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5476 prefix, prolog->instance_divisor_is_fetched);
5477 fprintf(f, " %s.ls_vgpr_fix = %u\n",
5478 prefix, prolog->ls_vgpr_fix);
5479
5480 fprintf(f, " mono.vs.fix_fetch = {");
5481 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5482 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5483 fprintf(f, "}\n");
5484 }
5485
/* Print the shader key (variant state) for one shader stage to \p f.
 * Which fields are printed depends on the stage; prolog/epilog bits are
 * grouped by the shader part they configure.
 */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f)
{
	const struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ls = %u\n", key->as_ls);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the TCS is merged with the LS, so it carries the
		 * LS (VS) prolog key as well. */
		if (shader->selector->screen->b.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no meaningful key of its own. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the GS is merged with the ES; print the ES (VS)
		 * prolog key when the ES stage is a vertex shader. */
		if (shader->selector->screen->b.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* Output-elimination keys only apply to the last VS-like stage
	 * before rasterization (not ES/LS variants). */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
5565
5566 static void si_init_shader_ctx(struct si_shader_context *ctx,
5567 struct si_screen *sscreen,
5568 LLVMTargetMachineRef tm)
5569 {
5570 struct lp_build_tgsi_context *bld_base;
5571
5572 ctx->abi.chip_class = sscreen->b.chip_class;
5573
5574 si_llvm_context_init(ctx, sscreen, tm);
5575
5576 bld_base = &ctx->bld_base;
5577 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5578
5579 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5580 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5581 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5582
5583 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5584
5585 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5586
5587 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5588 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5589 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5590 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5591
5592 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5593 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5594 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5595 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5596 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5597 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5598 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5599 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
5600 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5601
5602 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
5603 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5604 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5605 }
5606
5607 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5608 {
5609 struct si_shader *shader = ctx->shader;
5610 struct tgsi_shader_info *info = &shader->selector->info;
5611
5612 if ((ctx->type != PIPE_SHADER_VERTEX &&
5613 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5614 shader->key.as_ls ||
5615 shader->key.as_es)
5616 return;
5617
5618 ac_optimize_vs_outputs(&ctx->ac,
5619 ctx->main_fn,
5620 shader->info.vs_output_param_offset,
5621 info->num_outputs,
5622 &shader->info.nr_param_exports);
5623 }
5624
5625 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5626 {
5627 ctx->shader->config.private_mem_vgprs = 0;
5628
5629 /* Process all LLVM instructions. */
5630 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5631 while (bb) {
5632 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5633
5634 while (next) {
5635 LLVMValueRef inst = next;
5636 next = LLVMGetNextInstruction(next);
5637
5638 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5639 continue;
5640
5641 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5642 /* No idea why LLVM aligns allocas to 4 elements. */
5643 unsigned alignment = LLVMGetAlignment(inst);
5644 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
5645 ctx->shader->config.private_mem_vgprs += dw_size;
5646 }
5647 bb = LLVMGetNextBasicBlock(bb);
5648 }
5649 }
5650
5651 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5652 {
5653 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5654 lp_build_intrinsic(ctx->gallivm.builder,
5655 "llvm.amdgcn.init.exec", ctx->voidt,
5656 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5657 }
5658
5659 static void si_init_exec_from_input(struct si_shader_context *ctx,
5660 unsigned param, unsigned bitoffset)
5661 {
5662 LLVMValueRef args[] = {
5663 LLVMGetParam(ctx->main_fn, param),
5664 LLVMConstInt(ctx->i32, bitoffset, 0),
5665 };
5666 lp_build_intrinsic(ctx->gallivm.builder,
5667 "llvm.amdgcn.init.exec.from.input",
5668 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5669 }
5670
5671 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5672 const struct si_vs_prolog_bits *key)
5673 {
5674 /* VGPR initialization fixup for Vega10 and Raven is always done in the
5675 * VS prolog. */
5676 return sel->vs_needs_prolog || key->ls_vgpr_fix;
5677 }
5678
/**
 * Translate the main part of a shader to LLVM IR.
 *
 * Selects the per-stage input-fetch and epilogue callbacks, creates the
 * LLVM function, preloads ring buffers, emits the GFX9 merged-shader
 * EXEC/barrier preamble, and finally runs the TGSI->LLVM (or NIR->LLVM)
 * translation.
 *
 * \param ctx            shader context; ctx->type and ctx->shader are set
 * \param is_monolithic  true if prolog/epilog parts will be inlined into
 *                       one function by si_build_wrapper_function
 * \return false if the TGSI/NIR translation fails
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 bool is_monolithic)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	// TODO clean all this up!
	/* Hook up stage-specific input fetching and epilogue emission. */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		if (shader->key.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else {
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
			bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		}
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		break;
	case PIPE_SHADER_COMPUTE:
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	ctx->abi.load_ubo = load_ubo;
	ctx->abi.load_ssbo = load_ssbo;

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 */
	if (ctx->screen->b.chip_class >= GFX9) {
		if (!is_monolithic &&
		    sel->info.num_instructions > 1 && /* not empty shader */
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
			/* First half of a merged shader with no prolog:
			 * initialize EXEC from the input SGPR here. */
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY) {
			if (!is_monolithic)
				si_init_exec_full_mask(ctx);

			/* The barrier must execute for all shaders in a
			 * threadgroup.
			 */
			si_llvm_emit_barrier(NULL, bld_base, NULL);

			/* Wrap the second-half main part in an if-statement
			 * so that waves with no live threads skip it. */
			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
			LLVMValueRef ena =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
					      ac_get_thread_id(&ctx->ac), num_threads, "");
			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream counters of emitted vertices. */
		int i;
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(&ctx->gallivm,
						ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
	    ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
		/* This is initialized to 0.0 = not kill. */
		ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
	}

	/* Translate the shader body; a selector carries either TGSI tokens
	 * or a NIR shader, never both. */
	if (sel->tokens) {
		if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
			fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
			return false;
		}
	} else {
		if (!si_nir_build_llvm(ctx, sel->nir)) {
			fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
			return false;
		}
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
5805
5806 /**
5807 * Compute the VS prolog key, which contains all the information needed to
5808 * build the VS prolog function, and set shader->info bits where needed.
5809 *
5810 * \param info Shader info of the vertex shader.
5811 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5812 * \param prolog_key Key of the VS prolog
5813 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5814 * \param key Output shader part key.
5815 */
5816 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5817 unsigned num_input_sgprs,
5818 const struct si_vs_prolog_bits *prolog_key,
5819 struct si_shader *shader_out,
5820 union si_shader_part_key *key)
5821 {
5822 memset(key, 0, sizeof(*key));
5823 key->vs_prolog.states = *prolog_key;
5824 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5825 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5826 key->vs_prolog.as_ls = shader_out->key.as_ls;
5827
5828 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5829 key->vs_prolog.as_ls = 1;
5830 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5831 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5832 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5833 }
5834
5835 /* Enable loading the InstanceID VGPR. */
5836 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5837
5838 if ((key->vs_prolog.states.instance_divisor_is_one |
5839 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5840 shader_out->info.uses_instanceid = true;
5841 }
5842
/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 *
 * \param shader           the pixel shader
 * \param key              output shader part key
 * \param separate_prolog  true if the prolog is compiled as a separate
 *                         shader part (affects color VGPR indices)
 */
static void si_get_ps_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key,
				 bool separate_prolog)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs WQM (whole-quad mode) when derivatives are used
	 * and it performs any interpolation itself. */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Determine which interpolation VGPRs each color input
		 * needs, and enable them in SPI_PS_INPUT_ENA. */
		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* -1 = no interpolation VGPR needed. */
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}
5962
5963 /**
5964 * Check whether a PS prolog is required based on the key.
5965 */
5966 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5967 {
5968 return key->ps_prolog.colors_read ||
5969 key->ps_prolog.states.force_persp_sample_interp ||
5970 key->ps_prolog.states.force_linear_sample_interp ||
5971 key->ps_prolog.states.force_persp_center_interp ||
5972 key->ps_prolog.states.force_linear_center_interp ||
5973 key->ps_prolog.states.bc_optimize_for_persp ||
5974 key->ps_prolog.states.bc_optimize_for_linear ||
5975 key->ps_prolog.states.poly_stipple;
5976 }
5977
5978 /**
5979 * Compute the PS epilog key, which contains all the information needed to
5980 * build the PS epilog function.
5981 */
5982 static void si_get_ps_epilog_key(struct si_shader *shader,
5983 union si_shader_part_key *key)
5984 {
5985 struct tgsi_shader_info *info = &shader->selector->info;
5986 memset(key, 0, sizeof(*key));
5987 key->ps_epilog.colors_written = info->colors_written;
5988 key->ps_epilog.writes_z = info->writes_z;
5989 key->ps_epilog.writes_stencil = info->writes_stencil;
5990 key->ps_epilog.writes_samplemask = info->writes_samplemask;
5991 key->ps_epilog.states = shader->key.part.ps.epilog;
5992 }
5993
/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
 *
 * The prolog forwards all SGPR/VGPR inputs to its return value; when
 * tri_strip_adj_fix is set it additionally remaps the vertex-index VGPRs
 * for every other primitive.
 */
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	unsigned num_sgprs, num_vgprs;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_function_info fninfo;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMTypeRef returns[48];
	LLVMValueRef func, ret;

	si_init_function_info(&fninfo);

	/* Input register layout differs between GFX9 (merged ES+GS) and
	 * older chips. */
	if (ctx->screen->b.chip_class >= GFX9) {
		num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
		num_vgprs = 5; /* ES inputs are not needed by GS */
	} else {
		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
		num_vgprs = 8;
	}

	for (unsigned i = 0; i < num_sgprs; ++i) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[i] = ctx->i32;
	}

	for (unsigned i = 0; i < num_vgprs; ++i) {
		add_arg(&fninfo, ARG_VGPR, ctx->i32);
		returns[num_sgprs + i] = ctx->f32;
	}

	/* Create the function. */
	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
			   &fninfo, 0);
	func = ctx->main_fn;

	/* Set the full EXEC mask for the prolog, because we are only fiddling
	 * with registers here. The main shader part will set the correct EXEC
	 * mask.
	 */
	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
		si_init_exec_full_mask(ctx);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (unsigned i = 0; i < num_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
	}
	for (unsigned i = 0; i < num_vgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
		p = LLVMBuildBitCast(builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
	}

	if (key->gs_prolog.states.tri_strip_adj_fix) {
		/* Remap the input vertices for every other primitive. */
		const unsigned gfx6_vtx_params[6] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 3,
			num_sgprs + 4,
			num_sgprs + 5,
			num_sgprs + 6
		};
		/* On GFX9 two 16-bit vertex indices are packed per VGPR. */
		const unsigned gfx9_vtx_params[3] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 4,
		};
		LLVMValueRef vtx_in[6], vtx_out[6];
		LLVMValueRef prim_id, rotate;

		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
				vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
			}
		} else {
			for (unsigned i = 0; i < 6; i++)
				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
		}

		/* Rotate for odd primitives, selected by the low bit of
		 * the primitive ID. */
		prim_id = LLVMGetParam(func, num_sgprs + 2);
		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");

		for (unsigned i = 0; i < 6; ++i) {
			LLVMValueRef base, rotated;
			base = vtx_in[i];
			rotated = vtx_in[(i + 4) % 6];
			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* Re-pack two 16-bit indices per VGPR. */
			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef hi, out;

				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
						  LLVMConstInt(ctx->i32, 16, 0), "");
				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
				out = LLVMBuildBitCast(builder, out, ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx9_vtx_params[i], "");
			}
		} else {
			for (unsigned i = 0; i < 6; i++) {
				LLVMValueRef out;

				out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx6_vtx_params[i], "");
			}
		}
	}

	LLVMBuildRet(builder, ret);
}
6116
/**
 * Given a list of shader part functions, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 *
 * Each part's return value (a struct of i32 SGPRs followed by f32 VGPRs)
 * is unpacked into a flat GPR array and re-packed into the next part's
 * parameters. For GFX9 merged shaders, the first-shader parts are wrapped
 * in an if-statement keyed on the live-thread count.
 *
 * \param parts                  array of part functions, in execution order
 * \param num_parts              number of entries in \p parts
 * \param main_part              index of the main shader part (its
 *                               parameter types define the wrapper's)
 * \param next_shader_first_part index of the first part of the second
 *                               merged shader (0 if not merged)
 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
				      LLVMValueRef *parts,
				      unsigned num_parts,
				      unsigned main_part,
				      unsigned next_shader_first_part)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	/* PS epilog has one arg per color component; gfx9 merged shader
	 * prologs need to forward 32 user SGPRs.
	 */
	struct si_function_info fninfo;
	LLVMValueRef initial[64], out[64];
	LLVMTypeRef function_type;
	unsigned num_first_params;
	unsigned num_out, initial_num_out;
	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned gprs;
	struct lp_build_if_state if_state;

	si_init_function_info(&fninfo);

	/* All parts are inlined into the wrapper and hidden from other TUs. */
	for (unsigned i = 0; i < num_parts; ++i) {
		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_first_params = LLVMCountParamTypes(function_type);

	/* Count the first part's SGPRs and VGPRs in dword units. */
	for (unsigned i = 0; i < num_first_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0);
			num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}

	/* Declare wrapper parameters using the main part's types, until the
	 * first part's GPR budget is covered. */
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
		LLVMTypeRef type = LLVMTypeOf(param);
		unsigned size = llvm_get_type_size(type) / 4;

		add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
			   si_get_max_workgroup_size(ctx->shader));

	if (is_merged_shader(ctx->shader))
		si_init_exec_full_mask(ctx);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	for (unsigned i = 0; i < fninfo.num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
		unsigned size = llvm_get_type_size(param_type) / 4;

		if (size == 1) {
			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			/* Multi-dword parameters (vectors/pointers) are split
			 * into individual dword values. */
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (i < fninfo.num_sgpr_params)
			num_out_sgpr = num_out;
	}

	/* Keep a copy of the initial inputs for the second merged shader. */
	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[48];
		LLVMValueRef ret;
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;
		unsigned num_params = LLVMCountParams(parts[part]);

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_merged_shader(ctx->shader) && part == 0) {
			LLVMValueRef ena, count = initial[3];

			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			lp_build_if(&if_state, &ctx->gallivm, ena);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = llvm_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
				/* The byval attribute is invalid on non-entry
				 * functions once the part is inlined. */
#if HAVE_LLVM < 0x0400
				LLVMRemoveAttribute(param, LLVMByValAttribute);
#else
				unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
				LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
#endif
				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
			assert(is_sgpr || out_idx >= num_out_sgpr);

			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
					arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");

		if (is_merged_shader(ctx->shader) &&
		    part + 1 == next_shader_first_part) {
			lp_build_endif(&if_state);

			/* The second half of the merged shader should use
			 * the inputs from the toplevel (wrapper) function,
			 * not the return value from the last call.
			 *
			 * That's because the last call was executed condi-
			 * tionally, so we can't consume it in the main
			 * block.
			 */
			memcpy(out, initial, sizeof(initial));
			num_out = initial_num_out;
			num_out_sgpr = initial_num_out_sgpr;
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				assert(num_out < ARRAY_SIZE(out));
				out[num_out++] = val;

				/* i32 return elements are SGPRs; they always
				 * precede the f32 VGPR elements. */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	LLVMBuildRetVoid(builder);
}
6346
6347 int si_compile_tgsi_shader(struct si_screen *sscreen,
6348 LLVMTargetMachineRef tm,
6349 struct si_shader *shader,
6350 bool is_monolithic,
6351 struct pipe_debug_callback *debug)
6352 {
6353 struct si_shader_selector *sel = shader->selector;
6354 struct si_shader_context ctx;
6355 int r = -1;
6356
6357 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6358 * conversion fails. */
6359 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6360 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6361 if (sel->tokens)
6362 tgsi_dump(sel->tokens, 0);
6363 else
6364 nir_print_shader(sel->nir, stderr);
6365 si_dump_streamout(&sel->so);
6366 }
6367
6368 si_init_shader_ctx(&ctx, sscreen, tm);
6369 si_llvm_context_set_tgsi(&ctx, shader);
6370 ctx.separate_prolog = !is_monolithic;
6371
6372 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6373 sizeof(shader->info.vs_output_param_offset));
6374
6375 shader->info.uses_instanceid = sel->info.uses_instanceid;
6376
6377 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6378 si_llvm_dispose(&ctx);
6379 return -1;
6380 }
6381
6382 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6383 LLVMValueRef parts[2];
6384 bool need_prolog = sel->vs_needs_prolog;
6385
6386 parts[1] = ctx.main_fn;
6387
6388 if (need_prolog) {
6389 union si_shader_part_key prolog_key;
6390 si_get_vs_prolog_key(&sel->info,
6391 shader->info.num_input_sgprs,
6392 &shader->key.part.vs.prolog,
6393 shader, &prolog_key);
6394 si_build_vs_prolog_function(&ctx, &prolog_key);
6395 parts[0] = ctx.main_fn;
6396 }
6397
6398 si_build_wrapper_function(&ctx, parts + !need_prolog,
6399 1 + need_prolog, need_prolog, 0);
6400 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6401 if (sscreen->b.chip_class >= GFX9) {
6402 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6403 LLVMValueRef parts[4];
6404 bool vs_needs_prolog =
6405 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6406
6407 /* TCS main part */
6408 parts[2] = ctx.main_fn;
6409
6410 /* TCS epilog */
6411 union si_shader_part_key tcs_epilog_key;
6412 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6413 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6414 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6415 parts[3] = ctx.main_fn;
6416
6417 /* VS prolog */
6418 if (vs_needs_prolog) {
6419 union si_shader_part_key vs_prolog_key;
6420 si_get_vs_prolog_key(&ls->info,
6421 shader->info.num_input_sgprs,
6422 &shader->key.part.tcs.ls_prolog,
6423 shader, &vs_prolog_key);
6424 vs_prolog_key.vs_prolog.is_monolithic = true;
6425 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6426 parts[0] = ctx.main_fn;
6427 }
6428
6429 /* VS as LS main part */
6430 struct si_shader shader_ls = {};
6431 shader_ls.selector = ls;
6432 shader_ls.key.as_ls = 1;
6433 shader_ls.key.mono = shader->key.mono;
6434 shader_ls.key.opt = shader->key.opt;
6435 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6436
6437 if (!si_compile_tgsi_main(&ctx, true)) {
6438 si_llvm_dispose(&ctx);
6439 return -1;
6440 }
6441 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6442 parts[1] = ctx.main_fn;
6443
6444 /* Reset the shader context. */
6445 ctx.shader = shader;
6446 ctx.type = PIPE_SHADER_TESS_CTRL;
6447
6448 si_build_wrapper_function(&ctx,
6449 parts + !vs_needs_prolog,
6450 4 - !vs_needs_prolog, 0,
6451 vs_needs_prolog ? 2 : 1);
6452 } else {
6453 LLVMValueRef parts[2];
6454 union si_shader_part_key epilog_key;
6455
6456 parts[0] = ctx.main_fn;
6457
6458 memset(&epilog_key, 0, sizeof(epilog_key));
6459 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6460 si_build_tcs_epilog_function(&ctx, &epilog_key);
6461 parts[1] = ctx.main_fn;
6462
6463 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6464 }
6465 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6466 if (ctx.screen->b.chip_class >= GFX9) {
6467 struct si_shader_selector *es = shader->key.part.gs.es;
6468 LLVMValueRef es_prolog = NULL;
6469 LLVMValueRef es_main = NULL;
6470 LLVMValueRef gs_prolog = NULL;
6471 LLVMValueRef gs_main = ctx.main_fn;
6472
6473 /* GS prolog */
6474 union si_shader_part_key gs_prolog_key;
6475 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6476 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6477 gs_prolog_key.gs_prolog.is_monolithic = true;
6478 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6479 gs_prolog = ctx.main_fn;
6480
6481 /* ES prolog */
6482 if (es->vs_needs_prolog) {
6483 union si_shader_part_key vs_prolog_key;
6484 si_get_vs_prolog_key(&es->info,
6485 shader->info.num_input_sgprs,
6486 &shader->key.part.tcs.ls_prolog,
6487 shader, &vs_prolog_key);
6488 vs_prolog_key.vs_prolog.is_monolithic = true;
6489 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6490 es_prolog = ctx.main_fn;
6491 }
6492
6493 /* ES main part */
6494 struct si_shader shader_es = {};
6495 shader_es.selector = es;
6496 shader_es.key.as_es = 1;
6497 shader_es.key.mono = shader->key.mono;
6498 shader_es.key.opt = shader->key.opt;
6499 si_llvm_context_set_tgsi(&ctx, &shader_es);
6500
6501 if (!si_compile_tgsi_main(&ctx, true)) {
6502 si_llvm_dispose(&ctx);
6503 return -1;
6504 }
6505 shader->info.uses_instanceid |= es->info.uses_instanceid;
6506 es_main = ctx.main_fn;
6507
6508 /* Reset the shader context. */
6509 ctx.shader = shader;
6510 ctx.type = PIPE_SHADER_GEOMETRY;
6511
6512 /* Prepare the array of shader parts. */
6513 LLVMValueRef parts[4];
6514 unsigned num_parts = 0, main_part, next_first_part;
6515
6516 if (es_prolog)
6517 parts[num_parts++] = es_prolog;
6518
6519 parts[main_part = num_parts++] = es_main;
6520 parts[next_first_part = num_parts++] = gs_prolog;
6521 parts[num_parts++] = gs_main;
6522
6523 si_build_wrapper_function(&ctx, parts, num_parts,
6524 main_part, next_first_part);
6525 } else {
6526 LLVMValueRef parts[2];
6527 union si_shader_part_key prolog_key;
6528
6529 parts[1] = ctx.main_fn;
6530
6531 memset(&prolog_key, 0, sizeof(prolog_key));
6532 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6533 si_build_gs_prolog_function(&ctx, &prolog_key);
6534 parts[0] = ctx.main_fn;
6535
6536 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6537 }
6538 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6539 LLVMValueRef parts[3];
6540 union si_shader_part_key prolog_key;
6541 union si_shader_part_key epilog_key;
6542 bool need_prolog;
6543
6544 si_get_ps_prolog_key(shader, &prolog_key, false);
6545 need_prolog = si_need_ps_prolog(&prolog_key);
6546
6547 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6548
6549 if (need_prolog) {
6550 si_build_ps_prolog_function(&ctx, &prolog_key);
6551 parts[0] = ctx.main_fn;
6552 }
6553
6554 si_get_ps_epilog_key(shader, &epilog_key);
6555 si_build_ps_epilog_function(&ctx, &epilog_key);
6556 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6557
6558 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6559 need_prolog ? 1 : 0, 0);
6560 }
6561
6562 si_llvm_optimize_module(&ctx);
6563
6564 /* Post-optimization transformations and analysis. */
6565 si_optimize_vs_outputs(&ctx);
6566
6567 if ((debug && debug->debug_message) ||
6568 r600_can_dump_shader(&sscreen->b, ctx.type))
6569 si_count_scratch_private_memory(&ctx);
6570
6571 /* Compile to bytecode. */
6572 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6573 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6574 si_llvm_dispose(&ctx);
6575 if (r) {
6576 fprintf(stderr, "LLVM failed to compile shader\n");
6577 return r;
6578 }
6579
6580 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6581 * LLVM 3.9svn has this bug.
6582 */
6583 if (sel->type == PIPE_SHADER_COMPUTE) {
6584 unsigned wave_size = 64;
6585 unsigned max_vgprs = 256;
6586 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6587 unsigned max_sgprs_per_wave = 128;
6588 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6589 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6590 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6591
6592 max_vgprs = max_vgprs / min_waves_per_simd;
6593 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6594
6595 if (shader->config.num_sgprs > max_sgprs ||
6596 shader->config.num_vgprs > max_vgprs) {
6597 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6598 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6599 shader->config.num_sgprs, shader->config.num_vgprs,
6600 max_sgprs, max_vgprs);
6601
6602 /* Just terminate the process, because dependent
6603 * shaders can hang due to bad input data, but use
6604 * the env var to allow shader-db to work.
6605 */
6606 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6607 abort();
6608 }
6609 }
6610
6611 /* Add the scratch offset to input SGPRs. */
6612 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6613 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6614
6615 /* Calculate the number of fragment input VGPRs. */
6616 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6617 shader->info.num_input_vgprs = 0;
6618 shader->info.face_vgpr_index = -1;
6619
6620 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6621 shader->info.num_input_vgprs += 2;
6622 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6623 shader->info.num_input_vgprs += 2;
6624 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6625 shader->info.num_input_vgprs += 2;
6626 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6627 shader->info.num_input_vgprs += 3;
6628 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6629 shader->info.num_input_vgprs += 2;
6630 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6631 shader->info.num_input_vgprs += 2;
6632 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6633 shader->info.num_input_vgprs += 2;
6634 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6635 shader->info.num_input_vgprs += 1;
6636 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6637 shader->info.num_input_vgprs += 1;
6638 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6639 shader->info.num_input_vgprs += 1;
6640 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6641 shader->info.num_input_vgprs += 1;
6642 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6643 shader->info.num_input_vgprs += 1;
6644 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6645 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6646 shader->info.num_input_vgprs += 1;
6647 }
6648 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6649 shader->info.num_input_vgprs += 1;
6650 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6651 shader->info.num_input_vgprs += 1;
6652 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6653 shader->info.num_input_vgprs += 1;
6654 }
6655
6656 return 0;
6657 }
6658
6659 /**
6660 * Create, compile and return a shader part (prolog or epilog).
6661 *
6662 * \param sscreen screen
6663 * \param list list of shader parts of the same category
6664 * \param type shader type
6665 * \param key shader part key
6666 * \param prolog whether the part being requested is a prolog
6667 * \param tm LLVM target machine
6668 * \param debug debug callback
6669 * \param build the callback responsible for building the main function
6670 * \return non-NULL on success
6671 */
6672 static struct si_shader_part *
6673 si_get_shader_part(struct si_screen *sscreen,
6674 struct si_shader_part **list,
6675 enum pipe_shader_type type,
6676 bool prolog,
6677 union si_shader_part_key *key,
6678 LLVMTargetMachineRef tm,
6679 struct pipe_debug_callback *debug,
6680 void (*build)(struct si_shader_context *,
6681 union si_shader_part_key *),
6682 const char *name)
6683 {
6684 struct si_shader_part *result;
6685
6686 mtx_lock(&sscreen->shader_parts_mutex);
6687
6688 /* Find existing. */
6689 for (result = *list; result; result = result->next) {
6690 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6691 mtx_unlock(&sscreen->shader_parts_mutex);
6692 return result;
6693 }
6694 }
6695
6696 /* Compile a new one. */
6697 result = CALLOC_STRUCT(si_shader_part);
6698 result->key = *key;
6699
6700 struct si_shader shader = {};
6701 struct si_shader_context ctx;
6702 struct gallivm_state *gallivm = &ctx.gallivm;
6703
6704 si_init_shader_ctx(&ctx, sscreen, tm);
6705 ctx.shader = &shader;
6706 ctx.type = type;
6707
6708 switch (type) {
6709 case PIPE_SHADER_VERTEX:
6710 break;
6711 case PIPE_SHADER_TESS_CTRL:
6712 assert(!prolog);
6713 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6714 break;
6715 case PIPE_SHADER_GEOMETRY:
6716 assert(prolog);
6717 break;
6718 case PIPE_SHADER_FRAGMENT:
6719 if (prolog)
6720 shader.key.part.ps.prolog = key->ps_prolog.states;
6721 else
6722 shader.key.part.ps.epilog = key->ps_epilog.states;
6723 break;
6724 default:
6725 unreachable("bad shader part");
6726 }
6727
6728 build(&ctx, key);
6729
6730 /* Compile. */
6731 si_llvm_optimize_module(&ctx);
6732
6733 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6734 gallivm->module, debug, ctx.type, name)) {
6735 FREE(result);
6736 result = NULL;
6737 goto out;
6738 }
6739
6740 result->next = *list;
6741 *list = result;
6742
6743 out:
6744 si_llvm_dispose(&ctx);
6745 mtx_unlock(&sscreen->shader_parts_mutex);
6746 return result;
6747 }
6748
6749 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6750 {
6751 struct gallivm_state *gallivm = &ctx->gallivm;
6752 LLVMValueRef ptr[2], list;
6753
6754 /* Get the pointer to rw buffers. */
6755 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6756 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6757 list = lp_build_gather_values(gallivm, ptr, 2);
6758 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6759 list = LLVMBuildIntToPtr(gallivm->builder, list,
6760 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6761 return list;
6762 }
6763
6764 /**
6765 * Build the vertex shader prolog function.
6766 *
6767 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6768 * All inputs are returned unmodified. The vertex load indices are
6769 * stored after them, which will be used by the API VS for fetching inputs.
6770 *
6771 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6772 * input_v0,
6773 * input_v1,
6774 * input_v2,
6775 * input_v3,
6776 * (VertexID + BaseVertex),
6777 * (InstanceID + StartInstance),
6778 * (InstanceID / 2 + StartInstance)
6779 */
6780 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6781 union si_shader_part_key *key)
6782 {
6783 struct gallivm_state *gallivm = &ctx->gallivm;
6784 struct si_function_info fninfo;
6785 LLVMTypeRef *returns;
6786 LLVMValueRef ret, func;
6787 int num_returns, i;
6788 unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
6789 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6790 LLVMValueRef input_vgprs[9];
6791 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6792 num_input_vgprs;
6793 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6794
6795 si_init_function_info(&fninfo);
6796
6797 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6798 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
6799 sizeof(LLVMTypeRef));
6800 num_returns = 0;
6801
6802 /* Declare input and output SGPRs. */
6803 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6804 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6805 returns[num_returns++] = ctx->i32;
6806 }
6807
6808 /* Preloaded VGPRs (outputs must be floats) */
6809 for (i = 0; i < num_input_vgprs; i++) {
6810 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
6811 returns[num_returns++] = ctx->f32;
6812 }
6813
6814 /* Vertex load indices. */
6815 for (i = 0; i <= key->vs_prolog.last_input; i++)
6816 returns[num_returns++] = ctx->f32;
6817
6818 /* Create the function. */
6819 si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
6820 func = ctx->main_fn;
6821
6822 if (key->vs_prolog.num_merged_next_stage_vgprs) {
6823 if (!key->vs_prolog.is_monolithic)
6824 si_init_exec_from_input(ctx, 3, 0);
6825
6826 if (key->vs_prolog.as_ls &&
6827 (ctx->screen->b.family == CHIP_VEGA10 ||
6828 ctx->screen->b.family == CHIP_RAVEN)) {
6829 /* If there are no HS threads, SPI loads the LS VGPRs
6830 * starting at VGPR 0. Shift them back to where they
6831 * belong.
6832 */
6833 LLVMValueRef has_hs_threads =
6834 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
6835 unpack_param(ctx, 3, 8, 8),
6836 ctx->i32_0, "");
6837
6838 for (i = 4; i > 0; --i) {
6839 input_vgprs[i + 1] =
6840 LLVMBuildSelect(gallivm->builder, has_hs_threads,
6841 input_vgprs[i + 1],
6842 input_vgprs[i - 1], "");
6843 }
6844 }
6845 }
6846
6847 ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
6848 ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
6849
6850 /* Copy inputs to outputs. This should be no-op, as the registers match,
6851 * but it will prevent the compiler from overwriting them unintentionally.
6852 */
6853 ret = ctx->return_value;
6854 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6855 LLVMValueRef p = LLVMGetParam(func, i);
6856 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6857 }
6858 for (i = 0; i < num_input_vgprs; i++) {
6859 LLVMValueRef p = input_vgprs[i];
6860 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
6861 ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
6862 key->vs_prolog.num_input_sgprs + i, "");
6863 }
6864
6865 /* Compute vertex load indices from instance divisors. */
6866 LLVMValueRef instance_divisor_constbuf = NULL;
6867
6868 if (key->vs_prolog.states.instance_divisor_is_fetched) {
6869 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6870 LLVMValueRef buf_index =
6871 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6872 instance_divisor_constbuf =
6873 ac_build_indexed_load_const(&ctx->ac, list, buf_index);
6874 }
6875
6876 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6877 bool divisor_is_one =
6878 key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6879 bool divisor_is_fetched =
6880 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6881 LLVMValueRef index;
6882
6883 if (divisor_is_one || divisor_is_fetched) {
6884 LLVMValueRef divisor = ctx->i32_1;
6885
6886 if (divisor_is_fetched) {
6887 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
6888 LLVMConstInt(ctx->i32, i * 4, 0));
6889 divisor = LLVMBuildBitCast(gallivm->builder, divisor,
6890 ctx->i32, "");
6891 }
6892
6893 /* InstanceID / Divisor + StartInstance */
6894 index = get_instance_index_for_fetch(ctx,
6895 user_sgpr_base +
6896 SI_SGPR_START_INSTANCE,
6897 divisor);
6898 } else {
6899 /* VertexID + BaseVertex */
6900 index = LLVMBuildAdd(gallivm->builder,
6901 ctx->abi.vertex_id,
6902 LLVMGetParam(func, user_sgpr_base +
6903 SI_SGPR_BASE_VERTEX), "");
6904 }
6905
6906 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
6907 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6908 fninfo.num_params + i, "");
6909 }
6910
6911 si_llvm_build_ret(ctx, ret);
6912 }
6913
6914 static bool si_get_vs_prolog(struct si_screen *sscreen,
6915 LLVMTargetMachineRef tm,
6916 struct si_shader *shader,
6917 struct pipe_debug_callback *debug,
6918 struct si_shader *main_part,
6919 const struct si_vs_prolog_bits *key)
6920 {
6921 struct si_shader_selector *vs = main_part->selector;
6922
6923 if (!si_vs_needs_prolog(vs, key))
6924 return true;
6925
6926 /* Get the prolog. */
6927 union si_shader_part_key prolog_key;
6928 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6929 key, shader, &prolog_key);
6930
6931 shader->prolog =
6932 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6933 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6934 debug, si_build_vs_prolog_function,
6935 "Vertex Shader Prolog");
6936 return shader->prolog != NULL;
6937 }
6938
6939 /**
6940 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6941 */
6942 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6943 LLVMTargetMachineRef tm,
6944 struct si_shader *shader,
6945 struct pipe_debug_callback *debug)
6946 {
6947 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6948 &shader->key.part.vs.prolog);
6949 }
6950
6951 /**
6952 * Compile the TCS epilog function. This writes tesselation factors to memory
6953 * based on the output primitive type of the tesselator (determined by TES).
6954 */
6955 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
6956 union si_shader_part_key *key)
6957 {
6958 struct gallivm_state *gallivm = &ctx->gallivm;
6959 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6960 struct si_function_info fninfo;
6961 LLVMValueRef func;
6962
6963 si_init_function_info(&fninfo);
6964
6965 if (ctx->screen->b.chip_class >= GFX9) {
6966 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6967 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6968 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
6969 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6970 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6971 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6972 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6973 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6974 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6975 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6976 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6977 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6978 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6979 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6980 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6981 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6982 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6983 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6984 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6985 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6986 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6987 } else {
6988 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6989 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6990 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6991 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6992 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6993 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6994 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6995 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6996 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6997 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6998 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6999 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7000 }
7001
7002 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7003 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7004 unsigned tess_factors_idx =
7005 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
7006 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
7007 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
7008
7009 /* Create the function. */
7010 si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
7011 ctx->screen->b.chip_class >= CIK ? 128 : 64);
7012 declare_lds_as_pointer(ctx);
7013 func = ctx->main_fn;
7014
7015 si_write_tess_factors(bld_base,
7016 LLVMGetParam(func, tess_factors_idx),
7017 LLVMGetParam(func, tess_factors_idx + 1),
7018 LLVMGetParam(func, tess_factors_idx + 2));
7019
7020 LLVMBuildRetVoid(gallivm->builder);
7021 }
7022
7023 /**
7024 * Select and compile (or reuse) TCS parts (epilog).
7025 */
7026 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7027 LLVMTargetMachineRef tm,
7028 struct si_shader *shader,
7029 struct pipe_debug_callback *debug)
7030 {
7031 if (sscreen->b.chip_class >= GFX9) {
7032 struct si_shader *ls_main_part =
7033 shader->key.part.tcs.ls->main_shader_part_ls;
7034
7035 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
7036 &shader->key.part.tcs.ls_prolog))
7037 return false;
7038
7039 shader->previous_stage = ls_main_part;
7040 }
7041
7042 /* Get the epilog. */
7043 union si_shader_part_key epilog_key;
7044 memset(&epilog_key, 0, sizeof(epilog_key));
7045 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7046
7047 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7048 PIPE_SHADER_TESS_CTRL, false,
7049 &epilog_key, tm, debug,
7050 si_build_tcs_epilog_function,
7051 "Tessellation Control Shader Epilog");
7052 return shader->epilog != NULL;
7053 }
7054
7055 /**
7056 * Select and compile (or reuse) GS parts (prolog).
7057 */
7058 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7059 LLVMTargetMachineRef tm,
7060 struct si_shader *shader,
7061 struct pipe_debug_callback *debug)
7062 {
7063 if (sscreen->b.chip_class >= GFX9) {
7064 struct si_shader *es_main_part =
7065 shader->key.part.gs.es->main_shader_part_es;
7066
7067 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7068 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
7069 &shader->key.part.gs.vs_prolog))
7070 return false;
7071
7072 shader->previous_stage = es_main_part;
7073 }
7074
7075 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7076 return true;
7077
7078 union si_shader_part_key prolog_key;
7079 memset(&prolog_key, 0, sizeof(prolog_key));
7080 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7081
7082 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7083 PIPE_SHADER_GEOMETRY, true,
7084 &prolog_key, tm, debug,
7085 si_build_gs_prolog_function,
7086 "Geometry Shader Prolog");
7087 return shader->prolog2 != NULL;
7088 }
7089
7090 /**
7091 * Build the pixel shader prolog function. This handles:
7092 * - two-side color selection and interpolation
7093 * - overriding interpolation parameters for the API PS
7094 * - polygon stippling
7095 *
7096 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7097 * overriden by other states. (e.g. per-sample interpolation)
7098 * Interpolated colors are stored after the preloaded VGPRs.
7099 */
7100 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7101 union si_shader_part_key *key)
7102 {
7103 struct gallivm_state *gallivm = &ctx->gallivm;
7104 struct si_function_info fninfo;
7105 LLVMValueRef ret, func;
7106 int num_returns, i, num_color_channels;
7107
7108 assert(si_need_ps_prolog(key));
7109
7110 si_init_function_info(&fninfo);
7111
7112 /* Declare inputs. */
7113 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7114 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7115
7116 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7117 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7118
7119 /* Declare outputs (same as inputs + add colors if needed) */
7120 num_returns = fninfo.num_params;
7121 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7122 for (i = 0; i < num_color_channels; i++)
7123 fninfo.types[num_returns++] = ctx->f32;
7124
7125 /* Create the function. */
7126 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7127 &fninfo, 0);
7128 func = ctx->main_fn;
7129
7130 /* Copy inputs to outputs. This should be no-op, as the registers match,
7131 * but it will prevent the compiler from overwriting them unintentionally.
7132 */
7133 ret = ctx->return_value;
7134 for (i = 0; i < fninfo.num_params; i++) {
7135 LLVMValueRef p = LLVMGetParam(func, i);
7136 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7137 }
7138
7139 /* Polygon stippling. */
7140 if (key->ps_prolog.states.poly_stipple) {
7141 /* POS_FIXED_PT is always last. */
7142 unsigned pos = key->ps_prolog.num_input_sgprs +
7143 key->ps_prolog.num_input_vgprs - 1;
7144 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7145
7146 si_llvm_emit_polygon_stipple(ctx, list, pos);
7147 }
7148
7149 if (key->ps_prolog.states.bc_optimize_for_persp ||
7150 key->ps_prolog.states.bc_optimize_for_linear) {
7151 unsigned i, base = key->ps_prolog.num_input_sgprs;
7152 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7153
7154 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7155 * The hw doesn't compute CENTROID if the whole wave only
7156 * contains fully-covered quads.
7157 *
7158 * PRIM_MASK is after user SGPRs.
7159 */
7160 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7161 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7162 LLVMConstInt(ctx->i32, 31, 0), "");
7163 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7164 ctx->i1, "");
7165
7166 if (key->ps_prolog.states.bc_optimize_for_persp) {
7167 /* Read PERSP_CENTER. */
7168 for (i = 0; i < 2; i++)
7169 center[i] = LLVMGetParam(func, base + 2 + i);
7170 /* Read PERSP_CENTROID. */
7171 for (i = 0; i < 2; i++)
7172 centroid[i] = LLVMGetParam(func, base + 4 + i);
7173 /* Select PERSP_CENTROID. */
7174 for (i = 0; i < 2; i++) {
7175 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7176 center[i], centroid[i], "");
7177 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7178 tmp, base + 4 + i, "");
7179 }
7180 }
7181 if (key->ps_prolog.states.bc_optimize_for_linear) {
7182 /* Read LINEAR_CENTER. */
7183 for (i = 0; i < 2; i++)
7184 center[i] = LLVMGetParam(func, base + 8 + i);
7185 /* Read LINEAR_CENTROID. */
7186 for (i = 0; i < 2; i++)
7187 centroid[i] = LLVMGetParam(func, base + 10 + i);
7188 /* Select LINEAR_CENTROID. */
7189 for (i = 0; i < 2; i++) {
7190 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7191 center[i], centroid[i], "");
7192 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7193 tmp, base + 10 + i, "");
7194 }
7195 }
7196 }
7197
7198 /* Force per-sample interpolation. */
7199 if (key->ps_prolog.states.force_persp_sample_interp) {
7200 unsigned i, base = key->ps_prolog.num_input_sgprs;
7201 LLVMValueRef persp_sample[2];
7202
7203 /* Read PERSP_SAMPLE. */
7204 for (i = 0; i < 2; i++)
7205 persp_sample[i] = LLVMGetParam(func, base + i);
7206 /* Overwrite PERSP_CENTER. */
7207 for (i = 0; i < 2; i++)
7208 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7209 persp_sample[i], base + 2 + i, "");
7210 /* Overwrite PERSP_CENTROID. */
7211 for (i = 0; i < 2; i++)
7212 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7213 persp_sample[i], base + 4 + i, "");
7214 }
7215 if (key->ps_prolog.states.force_linear_sample_interp) {
7216 unsigned i, base = key->ps_prolog.num_input_sgprs;
7217 LLVMValueRef linear_sample[2];
7218
7219 /* Read LINEAR_SAMPLE. */
7220 for (i = 0; i < 2; i++)
7221 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7222 /* Overwrite LINEAR_CENTER. */
7223 for (i = 0; i < 2; i++)
7224 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7225 linear_sample[i], base + 8 + i, "");
7226 /* Overwrite LINEAR_CENTROID. */
7227 for (i = 0; i < 2; i++)
7228 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7229 linear_sample[i], base + 10 + i, "");
7230 }
7231
7232 /* Force center interpolation. */
7233 if (key->ps_prolog.states.force_persp_center_interp) {
7234 unsigned i, base = key->ps_prolog.num_input_sgprs;
7235 LLVMValueRef persp_center[2];
7236
7237 /* Read PERSP_CENTER. */
7238 for (i = 0; i < 2; i++)
7239 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7240 /* Overwrite PERSP_SAMPLE. */
7241 for (i = 0; i < 2; i++)
7242 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7243 persp_center[i], base + i, "");
7244 /* Overwrite PERSP_CENTROID. */
7245 for (i = 0; i < 2; i++)
7246 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7247 persp_center[i], base + 4 + i, "");
7248 }
7249 if (key->ps_prolog.states.force_linear_center_interp) {
7250 unsigned i, base = key->ps_prolog.num_input_sgprs;
7251 LLVMValueRef linear_center[2];
7252
7253 /* Read LINEAR_CENTER. */
7254 for (i = 0; i < 2; i++)
7255 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7256 /* Overwrite LINEAR_SAMPLE. */
7257 for (i = 0; i < 2; i++)
7258 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7259 linear_center[i], base + 6 + i, "");
7260 /* Overwrite LINEAR_CENTROID. */
7261 for (i = 0; i < 2; i++)
7262 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7263 linear_center[i], base + 10 + i, "");
7264 }
7265
7266 /* Interpolate colors. */
7267 unsigned color_out_idx = 0;
7268 for (i = 0; i < 2; i++) {
7269 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7270 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7271 key->ps_prolog.face_vgpr_index;
7272 LLVMValueRef interp[2], color[4];
7273 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7274
7275 if (!writemask)
7276 continue;
7277
7278 /* If the interpolation qualifier is not CONSTANT (-1). */
7279 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7280 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7281 key->ps_prolog.color_interp_vgpr_index[i];
7282
7283 /* Get the (i,j) updated by bc_optimize handling. */
7284 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7285 interp_vgpr, "");
7286 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7287 interp_vgpr + 1, "");
7288 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7289 }
7290
7291 /* Use the absolute location of the input. */
7292 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7293
7294 if (key->ps_prolog.states.color_two_side) {
7295 face = LLVMGetParam(func, face_vgpr);
7296 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
7297 }
7298
7299 interp_fs_input(ctx,
7300 key->ps_prolog.color_attr_index[i],
7301 TGSI_SEMANTIC_COLOR, i,
7302 key->ps_prolog.num_interp_inputs,
7303 key->ps_prolog.colors_read, interp_ij,
7304 prim_mask, face, color);
7305
7306 while (writemask) {
7307 unsigned chan = u_bit_scan(&writemask);
7308 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7309 fninfo.num_params + color_out_idx++, "");
7310 }
7311 }
7312
7313 /* Tell LLVM to insert WQM instruction sequence when needed. */
7314 if (key->ps_prolog.wqm) {
7315 LLVMAddTargetDependentFunctionAttr(func,
7316 "amdgpu-ps-wqm-outputs", "");
7317 }
7318
7319 si_llvm_build_ret(ctx, ret);
7320 }
7321
7322 /**
7323 * Build the pixel shader epilog function. This handles everything that must be
7324 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7325 */
7326 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7327 union si_shader_part_key *key)
7328 {
7329 struct gallivm_state *gallivm = &ctx->gallivm;
7330 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7331 struct si_function_info fninfo;
7332 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7333 int i;
7334 struct si_ps_exports exp = {};
7335
7336 si_init_function_info(&fninfo);
7337
7338 /* Declare input SGPRs. */
7339 ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7340 ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7341 ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7342 ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7343 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7344
7345 /* Declare input VGPRs. */
7346 unsigned required_num_params =
7347 fninfo.num_sgpr_params +
7348 util_bitcount(key->ps_epilog.colors_written) * 4 +
7349 key->ps_epilog.writes_z +
7350 key->ps_epilog.writes_stencil +
7351 key->ps_epilog.writes_samplemask;
7352
7353 required_num_params = MAX2(required_num_params,
7354 fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7355
7356 while (fninfo.num_params < required_num_params)
7357 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7358
7359 /* Create the function. */
7360 si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7361 /* Disable elimination of unused inputs. */
7362 si_llvm_add_attribute(ctx->main_fn,
7363 "InitialPSInputAddr", 0xffffff);
7364
7365 /* Process colors. */
7366 unsigned vgpr = fninfo.num_sgpr_params;
7367 unsigned colors_written = key->ps_epilog.colors_written;
7368 int last_color_export = -1;
7369
7370 /* Find the last color export. */
7371 if (!key->ps_epilog.writes_z &&
7372 !key->ps_epilog.writes_stencil &&
7373 !key->ps_epilog.writes_samplemask) {
7374 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7375
7376 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7377 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7378 /* Just set this if any of the colorbuffers are enabled. */
7379 if (spi_format &
7380 ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7381 last_color_export = 0;
7382 } else {
7383 for (i = 0; i < 8; i++)
7384 if (colors_written & (1 << i) &&
7385 (spi_format >> (i * 4)) & 0xf)
7386 last_color_export = i;
7387 }
7388 }
7389
7390 while (colors_written) {
7391 LLVMValueRef color[4];
7392 int mrt = u_bit_scan(&colors_written);
7393
7394 for (i = 0; i < 4; i++)
7395 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7396
7397 si_export_mrt_color(bld_base, color, mrt,
7398 fninfo.num_params - 1,
7399 mrt == last_color_export, &exp);
7400 }
7401
7402 /* Process depth, stencil, samplemask. */
7403 if (key->ps_epilog.writes_z)
7404 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7405 if (key->ps_epilog.writes_stencil)
7406 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7407 if (key->ps_epilog.writes_samplemask)
7408 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7409
7410 if (depth || stencil || samplemask)
7411 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7412 else if (last_color_export == -1)
7413 si_export_null(bld_base);
7414
7415 if (exp.num)
7416 si_emit_ps_exports(ctx, &exp);
7417
7418 /* Compile. */
7419 LLVMBuildRetVoid(gallivm->builder);
7420 }
7421
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Also patches up shader->config.spi_ps_input_ena so that the enable bits
 * match what the selected prolog actually reads.
 *
 * \return true on success, false if a part failed to compile.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, tm, debug,
					   si_build_ps_prolog_function,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. The epilog is always required (it does the exports). */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, tm, debug,
				   si_build_ps_epilog_function,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 * Each of the four fixups below replaces the CENTER/CENTROID (or
	 * SAMPLE/CENTROID) enables with the single interpolation mode the
	 * prolog was built to force. */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7518
7519 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7520 unsigned *lds_size)
7521 {
7522 /* SPI barrier management bug:
7523 * Make sure we have at least 4k of LDS in use to avoid the bug.
7524 * It applies to workgroup sizes of more than one wavefront.
7525 */
7526 if (sscreen->b.family == CHIP_BONAIRE ||
7527 sscreen->b.family == CHIP_KABINI ||
7528 sscreen->b.family == CHIP_MULLINS)
7529 *lds_size = MAX2(*lds_size, 8);
7530 }
7531
7532 static void si_fix_resource_usage(struct si_screen *sscreen,
7533 struct si_shader *shader)
7534 {
7535 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7536
7537 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7538
7539 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7540 si_get_max_workgroup_size(shader) > 64) {
7541 si_multiwave_lds_size_workaround(sscreen,
7542 &shader->config.lds_size);
7543 }
7544 }
7545
/**
 * Create a shader variant: either compile it as one monolithic shader or
 * assemble it from a pre-compiled main part plus selected prolog/epilog
 * parts, then dump and upload the binary.
 *
 * \return 0 on success, negative on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return -1;

		/* Copy the compiled TGSI shader data over. The binary is
		 * shared (not duplicated), so mark it to skip cleanup in
		 * si_shader_destroy(). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. TES has no parts to select. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the combined shader must
		 * reserve the maximum registers used by any of its parts. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7682
7683 void si_shader_destroy(struct si_shader *shader)
7684 {
7685 if (shader->scratch_bo)
7686 r600_resource_reference(&shader->scratch_bo, NULL);
7687
7688 r600_resource_reference(&shader->bo, NULL);
7689
7690 if (!shader->is_binary_shared)
7691 radeon_shader_binary_clean(&shader->binary);
7692
7693 free(shader->shader_log);
7694 }