radeonsi/gfx9: always wrap GS and TCS in an if-block (v2)
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49
/* Names of the relocation symbols that are patched with the scratch buffer
 * resource descriptor dwords when the shader binary is uploaded (the
 * scratch address is not known at compile time). */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

/* One shader output ready for export: the four channel values plus the
 * TGSI semantic that identifies the output.  vertex_stream is a
 * per-channel index — presumably the GS vertex stream each channel
 * belongs to; confirm against the users of this struct. */
struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};
63
/* Forward declarations for helpers defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

/* Builders for the standalone prolog/epilog shader parts. */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* LLVM address spaces used by the AMDGPU backend. */
enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};
95
96 static bool is_merged_shader(struct si_shader *shader)
97 {
98 if (shader->selector->screen->b.chip_class <= VI)
99 return false;
100
101 return shader->key.as_ls ||
102 shader->key.as_es ||
103 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
104 shader->selector->type == PIPE_SHADER_GEOMETRY;
105 }
106
107 /**
108 * Returns a unique index for a per-patch semantic name and index. The index
109 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
110 * can be calculated.
111 */
112 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
113 {
114 switch (semantic_name) {
115 case TGSI_SEMANTIC_TESSOUTER:
116 return 0;
117 case TGSI_SEMANTIC_TESSINNER:
118 return 1;
119 case TGSI_SEMANTIC_PATCH:
120 assert(index < 30);
121 return 2 + index;
122
123 default:
124 assert(!"invalid semantic name");
125 return 0;
126 }
127 }
128
129 /**
130 * Returns a unique index for a semantic name and index. The index must be
131 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
132 * calculated.
133 */
134 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
135 {
136 switch (semantic_name) {
137 case TGSI_SEMANTIC_POSITION:
138 return 0;
139 case TGSI_SEMANTIC_GENERIC:
140 /* Since some shader stages use the the highest used IO index
141 * to determine the size to allocate for inputs/outputs
142 * (in LDS, tess and GS rings). GENERIC should be placed right
143 * after POSITION to make that size as small as possible.
144 */
145 if (index < SI_MAX_IO_GENERIC)
146 return 1 + index;
147
148 assert(!"invalid generic index");
149 return 0;
150 case TGSI_SEMANTIC_PSIZE:
151 return SI_MAX_IO_GENERIC + 1;
152 case TGSI_SEMANTIC_CLIPDIST:
153 assert(index <= 1);
154 return SI_MAX_IO_GENERIC + 2 + index;
155 case TGSI_SEMANTIC_FOG:
156 return SI_MAX_IO_GENERIC + 4;
157 case TGSI_SEMANTIC_LAYER:
158 return SI_MAX_IO_GENERIC + 5;
159 case TGSI_SEMANTIC_VIEWPORT_INDEX:
160 return SI_MAX_IO_GENERIC + 6;
161 case TGSI_SEMANTIC_PRIMID:
162 return SI_MAX_IO_GENERIC + 7;
163 case TGSI_SEMANTIC_COLOR: /* these alias */
164 case TGSI_SEMANTIC_BCOLOR:
165 assert(index < 2);
166 return SI_MAX_IO_GENERIC + 8 + index;
167 case TGSI_SEMANTIC_TEXCOORD:
168 assert(index < 8);
169 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
170 return SI_MAX_IO_GENERIC + 10 + index;
171 default:
172 assert(!"invalid semantic name");
173 return 0;
174 }
175 }
176
177 /**
178 * Helper function that builds an LLVM IR PHI node and immediately adds
179 * incoming edges.
180 */
181 static LLVMValueRef
182 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
183 unsigned count_incoming, LLVMValueRef *values,
184 LLVMBasicBlockRef *blocks)
185 {
186 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
187 LLVMAddIncoming(phi, values, blocks, count_incoming);
188 return phi;
189 }
190
191 /**
192 * Get the value of a shader input parameter and extract a bitfield.
193 */
194 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
195 unsigned param, unsigned rshift,
196 unsigned bitwidth)
197 {
198 struct gallivm_state *gallivm = &ctx->gallivm;
199 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
200 param);
201
202 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
203 value = bitcast(&ctx->bld_base,
204 TGSI_TYPE_UNSIGNED, value);
205
206 if (rshift)
207 value = LLVMBuildLShr(gallivm->builder, value,
208 LLVMConstInt(ctx->i32, rshift, 0), "");
209
210 if (rshift + bitwidth < 32) {
211 unsigned mask = (1 << bitwidth) - 1;
212 value = LLVMBuildAnd(gallivm->builder, value,
213 LLVMConstInt(ctx->i32, mask, 0), "");
214 }
215
216 return value;
217 }
218
219 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
220 {
221 switch (ctx->type) {
222 case PIPE_SHADER_TESS_CTRL:
223 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
224
225 case PIPE_SHADER_TESS_EVAL:
226 return LLVMGetParam(ctx->main_fn,
227 ctx->param_tes_rel_patch_id);
228
229 default:
230 assert(0);
231 return NULL;
232 }
233 }
234
235 /* Tessellation shaders pass outputs to the next shader using LDS.
236 *
237 * LS outputs = TCS inputs
238 * TCS outputs = TES inputs
239 *
240 * The LDS layout is:
241 * - TCS inputs for patch 0
242 * - TCS inputs for patch 1
243 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
244 * - ...
245 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
246 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
247 * - TCS outputs for patch 1
248 * - Per-patch TCS outputs for patch 1
249 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
250 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
251 * - ...
252 *
253 * All three shaders VS(LS), TCS, TES share the same LDS space.
254 */
255
256 static LLVMValueRef
257 get_tcs_in_patch_stride(struct si_shader_context *ctx)
258 {
259 return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
260 }
261
262 static LLVMValueRef
263 get_tcs_out_patch_stride(struct si_shader_context *ctx)
264 {
265 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
266 }
267
268 static LLVMValueRef
269 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
270 {
271 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
272 unpack_param(ctx,
273 ctx->param_tcs_out_lds_offsets,
274 0, 16),
275 4);
276 }
277
278 static LLVMValueRef
279 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
280 {
281 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
282 unpack_param(ctx,
283 ctx->param_tcs_out_lds_offsets,
284 16, 16),
285 4);
286 }
287
288 static LLVMValueRef
289 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
290 {
291 struct gallivm_state *gallivm = &ctx->gallivm;
292 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
293 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
294
295 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
296 }
297
298 static LLVMValueRef
299 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
300 {
301 struct gallivm_state *gallivm = &ctx->gallivm;
302 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
303 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
304 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
305
306 return LLVMBuildAdd(gallivm->builder, patch0_offset,
307 LLVMBuildMul(gallivm->builder, patch_stride,
308 rel_patch_id, ""),
309 "");
310 }
311
312 static LLVMValueRef
313 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
314 {
315 struct gallivm_state *gallivm = &ctx->gallivm;
316 LLVMValueRef patch0_patch_data_offset =
317 get_tcs_out_patch0_patch_data_offset(ctx);
318 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
319 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
320
321 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
322 LLVMBuildMul(gallivm->builder, patch_stride,
323 rel_patch_id, ""),
324 "");
325 }
326
327 static LLVMValueRef get_instance_index_for_fetch(
328 struct si_shader_context *ctx,
329 unsigned param_start_instance, LLVMValueRef divisor)
330 {
331 struct gallivm_state *gallivm = &ctx->gallivm;
332
333 LLVMValueRef result = LLVMGetParam(ctx->main_fn,
334 ctx->param_instance_id);
335
336 /* The division must be done before START_INSTANCE is added. */
337 if (divisor != ctx->i32_1)
338 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
339
340 return LLVMBuildAdd(gallivm->builder, result,
341 LLVMGetParam(ctx->main_fn, param_start_instance), "");
342 }
343
344 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
345 * to float. */
346 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
347 LLVMValueRef vec4,
348 unsigned double_index)
349 {
350 LLVMBuilderRef builder = ctx->gallivm.builder;
351 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
352 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
353 LLVMVectorType(f64, 2), "");
354 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
355 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
356 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
357 }
358
/**
 * Emit the code that fetches one vertex attribute for the VS.
 *
 * The attribute is loaded with buffer_load_format.  Vertex formats the
 * hardware cannot fetch natively are handled via the per-element
 * SI_FIX_FETCH_* workarounds from the shader key: some need multiple
 * loads (64-bit and 8/16-bit RGB formats), others need ALU fixups after
 * a single load.
 *
 * \param input_index  index of the vertex element
 * \param decl         TGSI input declaration (not used by this function)
 * \param out          receives the four f32 channels of the attribute
 */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the vertex buffer descriptor for this element. */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	/* The per-element vertex index computed by the VS prolog. */
	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats the hardware can't fetch
	 * in one go. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		/* Normal case: one load covers the whole attribute. */
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components (from the first
	 * load; multi-load formats overwrite these below). */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	/* Post-load fixups for formats the hardware mis-fetches. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			/* SNORM clamps to [-1, 1]; -2/3 * 3/2 would give -1.5
			 * otherwise (only values < -1 need the clamp). */
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		/* Fetched as raw uint; normalize to [0, 1] manually. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		/* Fetched as raw int; scale to float manually.  FIXED is
		 * 16.16 fixed point, SNORM normalizes by INT_MAX. */
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		/* Fetched as raw uint; convert to float without scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		/* Fetched as raw int; convert to float without scaling. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		/* One load holding both doubles; extract and demote each. */
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* Three 2-dword loads, one double each. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two 4-dword loads, two doubles each. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* Three single-component loads; take .x of each. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		/* Alpha is a constant 1: float 1.0 for the normalized
		 * formats, integer 1 bit-pattern for the INT formats. */
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}
564
565 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
566 unsigned swizzle)
567 {
568 struct si_shader_context *ctx = si_shader_context(bld_base);
569
570 if (swizzle > 0)
571 return ctx->i32_0;
572
573 switch (ctx->type) {
574 case PIPE_SHADER_VERTEX:
575 return LLVMGetParam(ctx->main_fn,
576 ctx->param_vs_prim_id);
577 case PIPE_SHADER_TESS_CTRL:
578 return LLVMGetParam(ctx->main_fn,
579 ctx->param_tcs_patch_id);
580 case PIPE_SHADER_TESS_EVAL:
581 return LLVMGetParam(ctx->main_fn,
582 ctx->param_tes_patch_id);
583 case PIPE_SHADER_GEOMETRY:
584 return LLVMGetParam(ctx->main_fn,
585 ctx->param_gs_prim_id);
586 default:
587 assert(0);
588 return ctx->i32_0;
589 }
590 }
591
592 /**
593 * Return the value of tgsi_ind_register for indexing.
594 * This is the indirect index with the constant offset added to it.
595 */
596 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
597 const struct tgsi_ind_register *ind,
598 int rel_index)
599 {
600 struct gallivm_state *gallivm = &ctx->gallivm;
601 LLVMValueRef result;
602
603 result = ctx->addrs[ind->Index][ind->Swizzle];
604 result = LLVMBuildLoad(gallivm->builder, result, "");
605 result = LLVMBuildAdd(gallivm->builder, result,
606 LLVMConstInt(ctx->i32, rel_index, 0), "");
607 return result;
608 }
609
610 /**
611 * Like get_indirect_index, but restricts the return value to a (possibly
612 * undefined) value inside [0..num).
613 */
614 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
615 const struct tgsi_ind_register *ind,
616 int rel_index, unsigned num)
617 {
618 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
619
620 return si_llvm_bound_index(ctx, result, num);
621 }
622
623
624 /**
625 * Calculate a dword address given an input or output register and a stride.
626 */
/**
 * Calculate an LDS dword address for an input or output register.
 *
 * \param dst               destination register, or NULL if \p src is set
 * \param src               source register, or NULL if \p dst is set
 * \param vertex_dw_stride  dword stride between vertices; only used when
 *                          the register is 2-dimensional
 * \param base_addr         dword address the register offsets are added to
 *                          (e.g. the current patch offset in LDS)
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* For indirect array accesses the semantic lookup must use
		 * the array's first element, not the accessed index. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register is 4 dwords wide. */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		/* 2-dimensional registers are per-vertex I/O; 1-dimensional
		 * ones are per-patch. */
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[first], index[first]) :
			si_shader_io_get_unique_index_patch(name[first], index[first]);
	} else {
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]) :
			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
							    index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
713
714 /* The offchip buffer layout for TCS->TES is
715 *
716 * - attribute 0 of patch 0 vertex 0
717 * - attribute 0 of patch 0 vertex 1
718 * - attribute 0 of patch 0 vertex 2
719 * ...
720 * - attribute 0 of patch 1 vertex 0
721 * - attribute 0 of patch 1 vertex 1
722 * ...
723 * - attribute 1 of patch 0 vertex 0
724 * - attribute 1 of patch 0 vertex 1
725 * ...
726 * - per patch attribute 0 of patch 0
727 * - per patch attribute 0 of patch 1
728 * ...
729 *
730 * Note that every attribute has 4 components.
731 */
/**
 * Compute the address of a TCS output / TES input in the off-chip buffer
 * (layout described in the comment above).
 *
 * \param rel_patch_id  relative patch ID
 * \param vertex_index  vertex index within the patch, or NULL for
 *                      per-patch attributes
 * \param param_index   attribute index (unique I/O slot)
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Unpack the layout SGPR (bit positions per the unpack_param calls). */
	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		/* Per-vertex attribute: index by global vertex number;
		 * consecutive attributes are total_vertices apart. */
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attribute: index by patch number; consecutive
		 * attributes are num_patches apart. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* Every attribute occupies 16 bytes (4 dwords). */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data starts after all per-vertex attributes;
		 * the start offset is packed in the layout SGPR. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
775
/**
 * Compute the off-chip buffer address of a TCS output / TES input register
 * by decoding the TGSI register into vertex and attribute indices and
 * forwarding them to get_tcs_tes_buffer_address.
 *
 * Exactly one of \p dst and \p src must be non-NULL.
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
	struct si_shader_context *ctx,
	const struct tgsi_full_dst_register *dst,
	const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	/* Normalize to a source register; the address math is identical. */
	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2-dimensional registers address a specific vertex of the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* For indirect array accesses, the semantic lookup must use
		 * the array's first element; the dynamic part becomes the
		 * runtime param_index. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	/* 2-dimensional registers are per-vertex I/O; 1-dimensional ones
	 * are per-patch. */
	param_index_base = reg.Register.Dimension ?
		si_shader_io_get_unique_index(name[param_base], index[param_base]) :
		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}
839
/**
 * Load a value of the given TGSI type from a buffer resource.
 *
 * \param type     TGSI result type (may be 64-bit)
 * \param swizzle  channel to load (0..3), or ~0 to load the whole vec4
 * \param buffer   buffer resource descriptor
 * \param offset   buffer offset added to \p base (passed through to
 *                 ac_build_buffer_load; exact voffset/soffset split is
 *                 determined there — note the argument order below)
 * \param base     base buffer offset
 * \param can_speculate  whether the load may be speculatively executed
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool can_speculate)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	/* ~0 means "give me all four channels as a vector". */
	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	/* 32-bit scalar: load the vec4 and extract the requested channel. */
	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	/* 64-bit scalar: load the two consecutive dwords separately
	 * and merge them. */
	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, can_speculate, false);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, can_speculate, false);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
875
876 /**
877 * Load from LDS.
878 *
879 * \param type output value type
880 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
881 * \param dw_addr address in dwords
882 */
883 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
884 enum tgsi_opcode_type type, unsigned swizzle,
885 LLVMValueRef dw_addr)
886 {
887 struct si_shader_context *ctx = si_shader_context(bld_base);
888 struct gallivm_state *gallivm = &ctx->gallivm;
889 LLVMValueRef value;
890
891 if (swizzle == ~0) {
892 LLVMValueRef values[TGSI_NUM_CHANNELS];
893
894 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
895 values[chan] = lds_load(bld_base, type, chan, dw_addr);
896
897 return lp_build_gather_values(gallivm, values,
898 TGSI_NUM_CHANNELS);
899 }
900
901 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
902 LLVMConstInt(ctx->i32, swizzle, 0));
903
904 value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
905 if (tgsi_type_is_64bit(type)) {
906 LLVMValueRef value2;
907 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
908 ctx->i32_1);
909 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
910 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
911 }
912
913 return LLVMBuildBitCast(gallivm->builder, value,
914 tgsi2llvmtype(bld_base, type), "");
915 }
916
917 /**
918 * Store to LDS.
919 *
920 * \param swizzle offset (typically 0..3)
921 * \param dw_addr address in dwords
922 * \param value value to store
923 */
924 static void lds_store(struct lp_build_tgsi_context *bld_base,
925 unsigned dw_offset_imm, LLVMValueRef dw_addr,
926 LLVMValueRef value)
927 {
928 struct si_shader_context *ctx = si_shader_context(bld_base);
929 struct gallivm_state *gallivm = &ctx->gallivm;
930
931 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
932 LLVMConstInt(ctx->i32, dw_offset_imm, 0));
933
934 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
935 ac_build_indexed_store(&ctx->ac, ctx->lds,
936 dw_addr, value);
937 }
938
939 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
940 unsigned param)
941 {
942 LLVMBuilderRef builder = ctx->gallivm.builder;
943
944 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
945 addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
946 addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
947
948 uint64_t desc2 = 0xffffffff;
949 uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
950 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
951 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
952 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
953 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
954 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
955 LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
956
957 LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
958 desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
959 desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
960 return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
961 }
962
/**
 * Fetch a TCS input from LDS, where the VS (LS on GFX9) stored its outputs.
 *
 * The per-vertex stride is taken from the VS state bits; the address is
 * computed relative to the current input patch.
 */
static LLVMValueRef fetch_input_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	/* Vertex stride in dwords, packed at bits [24..31] of vs_state_bits. */
	stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	dw_addr = get_tcs_in_current_patch_offset(ctx);
	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);

	return lds_load(bld_base, type, swizzle, dw_addr);
}
977
/**
 * Fetch a TCS output (read-back) from LDS.
 *
 * Per-vertex outputs (reg has a Dimension) use a per-vertex stride within
 * the current output patch; per-patch outputs live in the patch data area
 * and need no stride.
 */
static LLVMValueRef fetch_output_tcs(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_src_register *reg,
		enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		/* Per-vertex output: stride packed at bits [13..20]. */
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		/* Per-patch output: no per-vertex stride. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}
997
/**
 * Fetch a TES input from the off-chip TESS buffer, where the TCS stored
 * its outputs.
 */
static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef buffer, base, addr;

	/* Descriptor for the off-chip tessellation buffer. */
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}
1013
/**
 * Store a TCS output both to LDS (for later TCS reads and the TCS epilog)
 * and to the off-chip buffer (for the TES).
 *
 * LDS stores are skipped when the shader never reads back the output,
 * except for tess factors, which the TCS epilog always reads from LDS.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		/* Per-vertex output: stride packed at bits [13..20]. */
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled channel separately.
		 * Tess factors are only stored by the epilog, never here. */
		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	/* Full writemask: one vec4 store instead of four scalar stores. */
	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}
1094
/**
 * Fetch a GS input.
 *
 * On GFX9 the ESGS ring lives in LDS and the per-vertex offsets are packed
 * two per SGPR; on GFX6-8 inputs are loaded from the ESGS ring buffer in
 * memory using the per-vertex offset VGPRs.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	/* GS inputs are always per-vertex (2D); reject anything else. */
	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		/* Each SGPR packs two 16-bit vertex offsets. */
		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		/* Whole-vector fetch: recurse once per channel and gather. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	/* 256 = ESGS ring stride between attributes (64 threads * 4 bytes). */
	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values need a second dword from the next slot. */
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true, false);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1189
1190 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1191 {
1192 switch (interpolate) {
1193 case TGSI_INTERPOLATE_CONSTANT:
1194 return 0;
1195
1196 case TGSI_INTERPOLATE_LINEAR:
1197 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1198 return SI_PARAM_LINEAR_SAMPLE;
1199 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1200 return SI_PARAM_LINEAR_CENTROID;
1201 else
1202 return SI_PARAM_LINEAR_CENTER;
1203 break;
1204 case TGSI_INTERPOLATE_COLOR:
1205 case TGSI_INTERPOLATE_PERSPECTIVE:
1206 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1207 return SI_PARAM_PERSP_SAMPLE;
1208 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1209 return SI_PARAM_PERSP_CENTROID;
1210 else
1211 return SI_PARAM_PERSP_CENTER;
1212 break;
1213 default:
1214 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1215 return -1;
1216 }
1217 }
1218
/**
 * Interpolate one channel of a PS input attribute.
 *
 * With barycentrics (i, j) this emits a real interpolation; without them
 * it emits interp.mov from P0, which fetches the attribute uninterpolated
 * (used for flat/integer inputs).
 */
static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
				       unsigned attr_index, unsigned chan,
				       LLVMValueRef prim_mask,
				       LLVMValueRef i, LLVMValueRef j)
{
	if (i || j) {
		return ac_build_fs_interp(&ctx->ac,
					  LLVMConstInt(ctx->i32, chan, 0),
					  LLVMConstInt(ctx->i32, attr_index, 0),
					  prim_mask, i, j);
	}
	return ac_build_fs_interp_mov(&ctx->ac,
				      LLVMConstInt(ctx->i32, 2, 0), /* P0 */
				      LLVMConstInt(ctx->i32, chan, 0),
				      LLVMConstInt(ctx->i32, attr_index, 0),
				      prim_mask);
}
1236
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j); NULL for flat shading
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef i = NULL, j = NULL;
	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	if (interp) {
		/* interp_param is a <2 x f32> holding the (i, j) weights. */
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
						ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
						ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		LLVMValueRef is_face_positive;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		/* Interpolate both colors and select per-channel by facing. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef front, back;

			front = si_build_fs_interp(ctx,
						   input_index, chan,
						   prim_mask, i, j);
			back = si_build_fs_interp(ctx,
						  back_attr_offset, chan,
						  prim_mask, i, j);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						is_face_positive,
						front,
						back,
						"");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Fog only uses .x; fill (x, 0, 0, 1). */
		result[0] = si_build_fs_interp(ctx, input_index,
					       0, prim_mask, i, j);
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			result[chan] = si_build_fs_interp(ctx,
							  input_index, chan,
							  prim_mask, i, j);
		}
	}
}
1335
/**
 * Emit the interpolation (or VGPR fetch) for one declared PS input and
 * write the 4 resulting channels to "out".
 *
 * Colors arrive pre-interpolated in input VGPRs set up by the PS prolog;
 * everything else goes through interp_fs_input.
 */
static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (i * 4);
		/* COLOR1 VGPRs follow however many COLOR0 channels were read. */
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, 0, /* this param is unused */
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}
1378
/* Extract the sample ID from bits [8..11] of the ancillary VGPR. */
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}
1383
1384
/**
 * Load a dword from a constant buffer.
 *
 * \param resource  v4i32 buffer descriptor
 * \param offset    byte offset into the buffer
 */
static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
				      LLVMValueRef resource,
				      LLVMValueRef offset)
{
	/* glc=0, slc=0, can_speculate=true, readonly=true. */
	return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
				    0, 0, 0, true, true);
}
1395
/**
 * Load the (x, y) sample position for the given sample ID from the
 * driver-maintained sample positions constant buffer.
 *
 * \return a vec4: (x, y, 0, 0)
 */
static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
{
	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(ctx, resource, offset0),
		buffer_load_const(ctx, resource, offset1),
		LLVMConstReal(ctx->f32, 0),
		LLVMConstReal(ctx->f32, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}
1418
/**
 * Compute the value of a declared system value semantic and store it in
 * ctx->system_values[index] for later fetches.
 *
 * Most values come straight from input SGPRs/VGPRs of the current shader
 * stage; a few are synthesized (positions, tess coords, subgroup masks).
 */
static void declare_system_value(struct si_shader_context *ctx,
				 unsigned index,
				 const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(ctx->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* Hardware delivers a 0-based ID; add the base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_base_vertex), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		/* Bit 1 of vs_state_bits tells whether the draw is indexed. */
		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
					ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			/* Relative patch ID: bits [8..12] of tcs_rel_ids. */
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			/* The input VGPR holds 1/W; invert to get W. */
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Fractional pixel position = per-sample position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess factors are read back from the off-chip buffer,
		 * where the TCS stored them. */
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
		                          LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dwords 0-3, inner levels at dwords 4-7. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&ctx->bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		/* A fixed block size is known at compile time; emit constants
		 * instead of reading the user SGPR. */
		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		LLVMValueRef values[3];

		/* Unused dimensions have no input SGPR (param < 0): use 0. */
		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* ps.live is true for non-helper lanes; invert and
		 * sign-extend to get the TGSI true value (~0). */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		/* Wave size is always 64 on this hardware. */
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* 1 << thread_id, as a 64-bit mask viewed as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		/* GE = -1 << id; GT = -2 << id; LE/LT are their complements. */
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1699
/**
 * Declare the compute shader's shared (LDS) memory as an LLVM global in
 * the local address space and cache an i8* view of it in ctx->shared_memory.
 */
static void declare_compute_memory(struct si_shader_context *ctx,
				   const struct tgsi_full_declaration *decl)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
	LLVMValueRef var;

	/* TGSI emits exactly one SHARED memory declaration per shader. */
	assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
	assert(decl->Range.First == decl->Range.Last);
	assert(!ctx->shared_memory);

	var = LLVMAddGlobalInAddressSpace(gallivm->module,
					  LLVMArrayType(ctx->i8, sel->local_size),
					  "compute_lds",
					  LOCAL_ADDR_SPACE);
	LLVMSetAlignment(var, 4);

	ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
}
1721
1722 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1723 {
1724 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1725 ctx->param_const_and_shader_buffers);
1726
1727 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1728 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1729 }
1730
/**
 * Fetch one channel (or the whole vec4 for LP_CHAN_ALL) of a TGSI constant
 * register, handling 2D (multi-buffer) addressing, indirect buffer indices,
 * indirect register indices, and 64-bit types.
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	if (swizzle == LP_CHAN_ALL) {
		/* Fetch all 4 channels by recursing once per channel. */
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	if (reg->Register.Dimension && reg->Dimension.Indirect) {
		/* Dynamically indexed constant buffer: clamp the index and
		 * load the descriptor from the buffer list. */
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
		LLVMValueRef index;
		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
						      reg->Dimension.Index,
						      ctx->num_const_buffers);
		/* Const buffers follow the shader buffers in the list. */
		index = LLVMBuildAdd(ctx->gallivm.builder, index,
				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		/* Indirect register index: addr_reg * 16 + constant offset. */
		addr = ctx->addrs[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
				    LLVMConstInt(ctx->i32, idx * 4, 0));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		/* 64-bit types: load the second dword and merge. */
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = si_llvm_emit_fetch_64bit(bld_base, type,
						  result, result2);
	}
	return result;
}
1795
1796 /* Upper 16 bits must be zero. */
1797 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1798 LLVMValueRef val[2])
1799 {
1800 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1801 LLVMBuildShl(ctx->gallivm.builder, val[1],
1802 LLVMConstInt(ctx->i32, 16, 0),
1803 ""), "");
1804 }
1805
1806 /* Upper 16 bits are ignored and will be dropped. */
1807 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1808 LLVMValueRef val[2])
1809 {
1810 LLVMValueRef v[2] = {
1811 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1812 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1813 val[1],
1814 };
1815 return si_llvm_pack_two_int16(ctx, v);
1816 }
1817
1818 /* Initialize arguments for the shader export intrinsic */
1819 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1820 LLVMValueRef *values,
1821 unsigned target,
1822 struct ac_export_args *args)
1823 {
1824 struct si_shader_context *ctx = si_shader_context(bld_base);
1825 struct lp_build_context *base = &bld_base->base;
1826 LLVMBuilderRef builder = ctx->gallivm.builder;
1827 LLVMValueRef val[4];
1828 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1829 unsigned chan;
1830 bool is_int8, is_int10;
1831
1832 /* Default is 0xf. Adjusted below depending on the format. */
1833 args->enabled_channels = 0xf; /* writemask */
1834
1835 /* Specify whether the EXEC mask represents the valid mask */
1836 args->valid_mask = 0;
1837
1838 /* Specify whether this is the last export */
1839 args->done = 0;
1840
1841 /* Specify the target we are exporting */
1842 args->target = target;
1843
1844 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1845 const struct si_shader_key *key = &ctx->shader->key;
1846 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1847 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1848
1849 assert(cbuf >= 0 && cbuf < 8);
1850 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1851 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1852 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1853 }
1854
1855 args->compr = false;
1856 args->out[0] = base->undef;
1857 args->out[1] = base->undef;
1858 args->out[2] = base->undef;
1859 args->out[3] = base->undef;
1860
1861 switch (spi_shader_col_format) {
1862 case V_028714_SPI_SHADER_ZERO:
1863 args->enabled_channels = 0; /* writemask */
1864 args->target = V_008DFC_SQ_EXP_NULL;
1865 break;
1866
1867 case V_028714_SPI_SHADER_32_R:
1868 args->enabled_channels = 1; /* writemask */
1869 args->out[0] = values[0];
1870 break;
1871
1872 case V_028714_SPI_SHADER_32_GR:
1873 args->enabled_channels = 0x3; /* writemask */
1874 args->out[0] = values[0];
1875 args->out[1] = values[1];
1876 break;
1877
1878 case V_028714_SPI_SHADER_32_AR:
1879 args->enabled_channels = 0x9; /* writemask */
1880 args->out[0] = values[0];
1881 args->out[3] = values[3];
1882 break;
1883
1884 case V_028714_SPI_SHADER_FP16_ABGR:
1885 args->compr = 1; /* COMPR flag */
1886
1887 for (chan = 0; chan < 2; chan++) {
1888 LLVMValueRef pack_args[2] = {
1889 values[2 * chan],
1890 values[2 * chan + 1]
1891 };
1892 LLVMValueRef packed;
1893
1894 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1895 args->out[chan] =
1896 LLVMBuildBitCast(ctx->gallivm.builder,
1897 packed, ctx->f32, "");
1898 }
1899 break;
1900
1901 case V_028714_SPI_SHADER_UNORM16_ABGR:
1902 for (chan = 0; chan < 4; chan++) {
1903 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1904 val[chan] = LLVMBuildFMul(builder, val[chan],
1905 LLVMConstReal(ctx->f32, 65535), "");
1906 val[chan] = LLVMBuildFAdd(builder, val[chan],
1907 LLVMConstReal(ctx->f32, 0.5), "");
1908 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1909 ctx->i32, "");
1910 }
1911
1912 args->compr = 1; /* COMPR flag */
1913 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1914 si_llvm_pack_two_int16(ctx, val));
1915 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1916 si_llvm_pack_two_int16(ctx, val+2));
1917 break;
1918
1919 case V_028714_SPI_SHADER_SNORM16_ABGR:
1920 for (chan = 0; chan < 4; chan++) {
1921 /* Clamp between [-1, 1]. */
1922 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1923 values[chan],
1924 LLVMConstReal(ctx->f32, 1));
1925 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1926 val[chan],
1927 LLVMConstReal(ctx->f32, -1));
1928 /* Convert to a signed integer in [-32767, 32767]. */
1929 val[chan] = LLVMBuildFMul(builder, val[chan],
1930 LLVMConstReal(ctx->f32, 32767), "");
1931 /* If positive, add 0.5, else add -0.5. */
1932 val[chan] = LLVMBuildFAdd(builder, val[chan],
1933 LLVMBuildSelect(builder,
1934 LLVMBuildFCmp(builder, LLVMRealOGE,
1935 val[chan], base->zero, ""),
1936 LLVMConstReal(ctx->f32, 0.5),
1937 LLVMConstReal(ctx->f32, -0.5), ""), "");
1938 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1939 }
1940
1941 args->compr = 1; /* COMPR flag */
1942 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1943 si_llvm_pack_two_int32_as_int16(ctx, val));
1944 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1945 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1946 break;
1947
1948 case V_028714_SPI_SHADER_UINT16_ABGR: {
1949 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1950 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1951 LLVMValueRef max_alpha =
1952 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1953
1954 /* Clamp. */
1955 for (chan = 0; chan < 4; chan++) {
1956 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1957 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1958 val[chan],
1959 chan == 3 ? max_alpha : max_rgb);
1960 }
1961
1962 args->compr = 1; /* COMPR flag */
1963 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1964 si_llvm_pack_two_int16(ctx, val));
1965 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1966 si_llvm_pack_two_int16(ctx, val+2));
1967 break;
1968 }
1969
1970 case V_028714_SPI_SHADER_SINT16_ABGR: {
1971 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1972 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1973 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1974 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1975 LLVMValueRef max_alpha =
1976 !is_int10 ? max_rgb : ctx->i32_1;
1977 LLVMValueRef min_alpha =
1978 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1979
1980 /* Clamp. */
1981 for (chan = 0; chan < 4; chan++) {
1982 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1983 val[chan] = lp_build_emit_llvm_binary(bld_base,
1984 TGSI_OPCODE_IMIN,
1985 val[chan], chan == 3 ? max_alpha : max_rgb);
1986 val[chan] = lp_build_emit_llvm_binary(bld_base,
1987 TGSI_OPCODE_IMAX,
1988 val[chan], chan == 3 ? min_alpha : min_rgb);
1989 }
1990
1991 args->compr = 1; /* COMPR flag */
1992 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1993 si_llvm_pack_two_int32_as_int16(ctx, val));
1994 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1995 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1996 break;
1997 }
1998
1999 case V_028714_SPI_SHADER_32_ABGR:
2000 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2001 break;
2002 }
2003 }
2004
2005 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2006 LLVMValueRef alpha)
2007 {
2008 struct si_shader_context *ctx = si_shader_context(bld_base);
2009
2010 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2011 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2012 SI_PARAM_ALPHA_REF);
2013
2014 LLVMValueRef alpha_pass =
2015 lp_build_cmp(&bld_base->base,
2016 ctx->shader->key.part.ps.epilog.alpha_func,
2017 alpha, alpha_ref);
2018 LLVMValueRef arg =
2019 lp_build_select(&bld_base->base,
2020 alpha_pass,
2021 LLVMConstReal(ctx->f32, 1.0f),
2022 LLVMConstReal(ctx->f32, -1.0f));
2023
2024 ac_build_kill(&ctx->ac, arg);
2025 } else {
2026 ac_build_kill(&ctx->ac, NULL);
2027 }
2028 }
2029
2030 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2031 LLVMValueRef alpha,
2032 unsigned samplemask_param)
2033 {
2034 struct si_shader_context *ctx = si_shader_context(bld_base);
2035 struct gallivm_state *gallivm = &ctx->gallivm;
2036 LLVMValueRef coverage;
2037
2038 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2039 coverage = LLVMGetParam(ctx->main_fn,
2040 samplemask_param);
2041 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2042
2043 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2044 ctx->i32,
2045 &coverage, 1, LP_FUNC_ATTR_READNONE);
2046
2047 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2048 ctx->f32, "");
2049
2050 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2051 LLVMConstReal(ctx->f32,
2052 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2053
2054 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2055 }
2056
/* Compute user clip-plane distances for a CLIPVERTEX output and fill the
 * two clip-distance position exports (pos[2] and pos[3]).
 *
 * The 8 plane equations (2 exports * 4 channels) are read from the
 * SI_VS_CONST_CLIP_PLANES internal constant buffer; out_elts holds the
 * 4 components of the clip vertex.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		/* Initialize all four distances to 0 so that unwritten
		 * planes export 0. */
		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of component const_chan of
				 * plane (reg_index * 4 + chan) in the
				 * constant buffer. */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = buffer_load_const(ctx, const_resource,
							     addr);
				args->out[chan] =
					lp_build_add(base, args->out[chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Export all 4 channels as POS + 2/3 (clip distances). */
		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
2101
2102 static void si_dump_streamout(struct pipe_stream_output_info *so)
2103 {
2104 unsigned i;
2105
2106 if (so->num_outputs)
2107 fprintf(stderr, "STREAMOUT\n");
2108
2109 for (i = 0; i < so->num_outputs; i++) {
2110 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2111 so->output[i].start_component;
2112 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2113 i, so->output[i].output_buffer,
2114 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2115 so->output[i].register_index,
2116 mask & 1 ? "x" : "",
2117 mask & 2 ? "y" : "",
2118 mask & 4 ? "z" : "",
2119 mask & 8 ? "w" : "");
2120 }
2121 }
2122
2123 static void emit_streamout_output(struct si_shader_context *ctx,
2124 LLVMValueRef const *so_buffers,
2125 LLVMValueRef const *so_write_offsets,
2126 struct pipe_stream_output *stream_out,
2127 struct si_shader_output_values *shader_out)
2128 {
2129 struct gallivm_state *gallivm = &ctx->gallivm;
2130 LLVMBuilderRef builder = gallivm->builder;
2131 unsigned buf_idx = stream_out->output_buffer;
2132 unsigned start = stream_out->start_component;
2133 unsigned num_comps = stream_out->num_components;
2134 LLVMValueRef out[4];
2135
2136 assert(num_comps && num_comps <= 4);
2137 if (!num_comps || num_comps > 4)
2138 return;
2139
2140 /* Load the output as int. */
2141 for (int j = 0; j < num_comps; j++) {
2142 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2143
2144 out[j] = LLVMBuildBitCast(builder,
2145 shader_out->values[start + j],
2146 ctx->i32, "");
2147 }
2148
2149 /* Pack the output. */
2150 LLVMValueRef vdata = NULL;
2151
2152 switch (num_comps) {
2153 case 1: /* as i32 */
2154 vdata = out[0];
2155 break;
2156 case 2: /* as v2i32 */
2157 case 3: /* as v4i32 (aligned to 4) */
2158 case 4: /* as v4i32 */
2159 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2160 for (int j = 0; j < num_comps; j++) {
2161 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2162 LLVMConstInt(ctx->i32, j, 0), "");
2163 }
2164 break;
2165 }
2166
2167 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2168 vdata, num_comps,
2169 so_write_offsets[buf_idx],
2170 ctx->i32_0,
2171 stream_out->dst_offset * 4, 1, 1, true, false);
2172 }
2173
2174 /**
2175 * Write streamout data to buffers for vertex stream @p stream (different
2176 * vertex streams can occur for GS copy shaders).
2177 */
2178 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2179 struct si_shader_output_values *outputs,
2180 unsigned noutput, unsigned stream)
2181 {
2182 struct si_shader_selector *sel = ctx->shader->selector;
2183 struct pipe_stream_output_info *so = &sel->so;
2184 struct gallivm_state *gallivm = &ctx->gallivm;
2185 LLVMBuilderRef builder = gallivm->builder;
2186 int i;
2187 struct lp_build_if_state if_ctx;
2188
2189 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2190 LLVMValueRef so_vtx_count =
2191 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2192
2193 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2194
2195 /* can_emit = tid < so_vtx_count; */
2196 LLVMValueRef can_emit =
2197 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2198
2199 /* Emit the streamout code conditionally. This actually avoids
2200 * out-of-bounds buffer access. The hw tells us via the SGPR
2201 * (so_vtx_count) which threads are allowed to emit streamout data. */
2202 lp_build_if(&if_ctx, gallivm, can_emit);
2203 {
2204 /* The buffer offset is computed as follows:
2205 * ByteOffset = streamout_offset[buffer_id]*4 +
2206 * (streamout_write_index + thread_id)*stride[buffer_id] +
2207 * attrib_offset
2208 */
2209
2210 LLVMValueRef so_write_index =
2211 LLVMGetParam(ctx->main_fn,
2212 ctx->param_streamout_write_index);
2213
2214 /* Compute (streamout_write_index + thread_id). */
2215 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2216
2217 /* Load the descriptor and compute the write offset for each
2218 * enabled buffer. */
2219 LLVMValueRef so_write_offset[4] = {};
2220 LLVMValueRef so_buffers[4];
2221 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2222 ctx->param_rw_buffers);
2223
2224 for (i = 0; i < 4; i++) {
2225 if (!so->stride[i])
2226 continue;
2227
2228 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2229 SI_VS_STREAMOUT_BUF0 + i, 0);
2230
2231 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2232
2233 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2234 ctx->param_streamout_offset[i]);
2235 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2236
2237 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2238 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2239 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2240 }
2241
2242 /* Write streamout data. */
2243 for (i = 0; i < so->num_outputs; i++) {
2244 unsigned reg = so->output[i].register_index;
2245
2246 if (reg >= noutput)
2247 continue;
2248
2249 if (stream != so->output[i].stream)
2250 continue;
2251
2252 emit_streamout_output(ctx, so_buffers, so_write_offset,
2253 &so->output[i], &outputs[reg]);
2254 }
2255 }
2256 lp_build_endif(&if_ctx);
2257 }
2258
2259 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2260 LLVMValueRef *values)
2261 {
2262 struct ac_export_args args;
2263
2264 si_llvm_init_export_args(&ctx->bld_base, values,
2265 V_008DFC_SQ_EXP_PARAM + index, &args);
2266 ac_build_export(&ctx->ac, &args);
2267 }
2268
2269 static void si_build_param_exports(struct si_shader_context *ctx,
2270 struct si_shader_output_values *outputs,
2271 unsigned noutput)
2272 {
2273 struct si_shader *shader = ctx->shader;
2274 unsigned param_count = 0;
2275
2276 for (unsigned i = 0; i < noutput; i++) {
2277 unsigned semantic_name = outputs[i].semantic_name;
2278 unsigned semantic_index = outputs[i].semantic_index;
2279
2280 if (outputs[i].vertex_stream[0] != 0 &&
2281 outputs[i].vertex_stream[1] != 0 &&
2282 outputs[i].vertex_stream[2] != 0 &&
2283 outputs[i].vertex_stream[3] != 0)
2284 continue;
2285
2286 switch (semantic_name) {
2287 case TGSI_SEMANTIC_LAYER:
2288 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2289 case TGSI_SEMANTIC_CLIPDIST:
2290 case TGSI_SEMANTIC_COLOR:
2291 case TGSI_SEMANTIC_BCOLOR:
2292 case TGSI_SEMANTIC_PRIMID:
2293 case TGSI_SEMANTIC_FOG:
2294 case TGSI_SEMANTIC_TEXCOORD:
2295 case TGSI_SEMANTIC_GENERIC:
2296 break;
2297 default:
2298 continue;
2299 }
2300
2301 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2302 semantic_index < SI_MAX_IO_GENERIC) &&
2303 shader->key.opt.kill_outputs &
2304 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2305 continue;
2306
2307 si_export_param(ctx, param_count, outputs[i].values);
2308
2309 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2310 shader->info.vs_output_param_offset[i] = param_count++;
2311 }
2312
2313 shader->info.nr_param_exports = param_count;
2314 }
2315
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits the position exports (POS0..POS3: position, misc vector with
 * psize/edgeflag/layer/viewport, and clip distances), then the parameter
 * exports. Also fills shader->info.nr_pos_exports and, via
 * si_build_param_exports, nr_param_exports.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args pos_args[4] = {};
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned pos_idx;
	int i;

	/* Build position exports. */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			si_llvm_init_export_args(bld_base, outputs[i].values,
						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (!shader->key.opt.clip_disable) {
				/* Clip distances go into POS2/POS3. */
				unsigned index = 2 + outputs[i].semantic_index;
				si_llvm_init_export_args(bld_base, outputs[i].values,
							 V_008DFC_SQ_EXP_POS + index,
							 &pos_args[index]);
			}
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (!shader->key.opt.clip_disable) {
				si_llvm_emit_clipvertex(bld_base, pos_args,
							outputs[i].values);
			}
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = base->zero; /* X */
		pos_args[0].out[1] = base->zero; /* Y */
		pos_args[0].out[2] = base->zero; /* Z */
		pos_args[0].out[3] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (shader->selector->info.writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2);

		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = base->zero; /* X */
		pos_args[1].out[1] = base->zero; /* Y */
		pos_args[1].out[2] = base->zero; /* Z */
		pos_args[1].out[3] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
							      edgeflag_value,
							      ctx->f32, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* GFX9 has the layer in out.z[10:0] and the viewport
			 * index in out.z[19:16].
			 */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				LLVMValueRef v = viewport_index_value;

				/* Shift the viewport index into bits [19:16]
				 * and merge it with the layer value in Z. */
				v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
				v = LLVMBuildShl(ctx->gallivm.builder, v,
						 LLVMConstInt(ctx->i32, 16, 0), "");
				v = LLVMBuildOr(ctx->gallivm.builder, v,
						bitcast(bld_base, TGSI_TYPE_UNSIGNED,
							pos_args[1].out[2]), "");
				pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
				pos_args[1].enabled_channels |= 1 << 2;
			}
		} else {
			/* Pre-GFX9: layer in Z, viewport index in W. */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	/* Count the position exports that were actually filled in. */
	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports. */
	si_build_param_exports(ctx, outputs, noutput);
}
2467
2468 /**
2469 * Forward all outputs from the vertex shader to the TES. This is only used
2470 * for the fixed function TCS.
2471 */
2472 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2473 {
2474 struct si_shader_context *ctx = si_shader_context(bld_base);
2475 struct gallivm_state *gallivm = &ctx->gallivm;
2476 LLVMValueRef invocation_id, buffer, buffer_offset;
2477 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2478 uint64_t inputs;
2479
2480 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2481 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2482 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2483
2484 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2485 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2486 lds_vertex_stride, "");
2487 lds_base = get_tcs_in_current_patch_offset(ctx);
2488 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2489
2490 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2491 while (inputs) {
2492 unsigned i = u_bit_scan64(&inputs);
2493
2494 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2495 LLVMConstInt(ctx->i32, 4 * i, 0),
2496 "");
2497
2498 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2499 get_rel_patch_id(ctx),
2500 invocation_id,
2501 LLVMConstInt(ctx->i32, i, 0));
2502
2503 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2504 lds_ptr);
2505
2506 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2507 buffer_offset, 0, 1, 0, true, false);
2508 }
2509 }
2510
/* Load the tessellation factors from LDS and write them to the tess-factor
 * ring buffer (and, if TES reads them, to the offchip buffer as well).
 *
 * Runs only for invocation 0 of each patch; the surrounding if-block masks
 * out the other invocations. Only writes the tess factor levels.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Wait for all invocations' LDS stores before reading the factors. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, ctx->i32_0, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_inner_index * 4, 0), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_outer_index * 4, 0), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->i32);
		outer[i] = LLVMGetUndef(ctx->i32);
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
		outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       ctx->param_tcs_factor_offset);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");

	/* Only the thread handling the first patch writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, ctx->i32_0, ""));

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->b.chip_class <= VI) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->i32, 0x80000000, 0),
					    1, ctx->i32_0, tf_base,
					    offset, 1, 0, true, false);
		/* Subsequent tess-factor stores start after the control word. */
		offset += 4;
	}

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, 1, 0, true, false);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, 1, 0, true, false);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index_patch(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_outer, 0));

		outer_vec = lp_build_gather_values(gallivm, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index_patch(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				    lp_build_gather_values(gallivm, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, 1, 0, true, false);
		}
	}

	lp_build_endif(&if_ctx);
}
2673
2674 static LLVMValueRef
2675 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2676 unsigned param, unsigned return_index)
2677 {
2678 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2679 LLVMGetParam(ctx->main_fn, param),
2680 return_index, "");
2681 }
2682
2683 static LLVMValueRef
2684 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2685 unsigned param, unsigned return_index)
2686 {
2687 LLVMBuilderRef builder = ctx->gallivm.builder;
2688 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2689
2690 return LLVMBuildInsertValue(builder, ret,
2691 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2692 return_index, "");
2693 }
2694
2695 static LLVMValueRef
2696 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2697 unsigned param, unsigned return_index)
2698 {
2699 LLVMBuilderRef builder = ctx->gallivm.builder;
2700 LLVMValueRef ptr, lo, hi;
2701
2702 ptr = LLVMGetParam(ctx->main_fn, param);
2703 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2704 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2705 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2706 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2707 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2708 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2709 }
2710
/* This only writes the tessellation factor levels.
 *
 * Copies fixed-function TCS inputs, closes the merged-shader if-block on
 * GFX9 (inserting phis so threads outside the block get harmless values),
 * and returns the epilog parameters (SGPRs + rel_patch_id, invocation_id,
 * tf_lds_offset VGPRs) via ctx->return_value.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* On GFX9 the TCS main part ran inside the merged-shader
		 * if-block; end it here and build phis so that threads that
		 * skipped the block still have defined epilog inputs. */
		LLVMBasicBlockRef blocks[2] = {
			LLVMGetInsertBlock(builder),
			ctx->merged_wrap_if_state.entry_block
		};
		LLVMValueRef values[2];

		lp_build_endif(&ctx->merged_wrap_if_state);

		values[0] = rel_patch_id;
		values[1] = LLVMGetUndef(ctx->i32);
		rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = tf_lds_offset;
		values[1] = LLVMGetUndef(ctx->i32);
		tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = invocation_id;
		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
		invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
	}

	/* Return epilog parameters from this function. */
	LLVMValueRef ret = ctx->return_value;
	unsigned vgpr;

	if (ctx->screen->b.chip_class >= GFX9) {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
		vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
	} else {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
					  GFX6_TCS_NUM_USER_SGPR);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
					  GFX6_TCS_NUM_USER_SGPR + 1);
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs */
	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	ctx->return_value = ret;
}
2786
/* Pass TCS inputs from LS to TCS on GFX9.
 *
 * On GFX9 LS and HS are merged, so the LS part hands its SGPR/VGPR inputs
 * to the TCS part through the function return value at fixed indices that
 * the TCS part expects.
 */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* System SGPRs at the fixed merged-shader positions. */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* User SGPRs start at index 8. */
	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
				  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
				  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);

	/* Descriptor pointers, each split into two i32s. */
	unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);

	/* VGPRs follow the user SGPRs. */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_patch_id, vgpr++);
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_rel_ids, vgpr++);
	ctx->return_value = ret;
}
2824
/* Pass GS inputs from ES to GS on GFX9.
 *
 * On GFX9 the ES and GS stages run as one merged hardware shader, so the
 * ES main part forwards the SGPR/VGPR inputs the GS part needs through
 * the function return value instead of exporting.
 */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* SGPRs 0-5 (index 4 is unused here). */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);

	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* Descriptor pointers (passed as two i32s each); GS user SGPRs
	 * start at index 8. */
	unsigned desc_param = ctx->param_vs_state_bits + 1;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);

	/* VGPRs: the 5 GS vertex/primitive input registers, bitcast to float. */
	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
	for (unsigned i = 0; i < 5; i++) {
		unsigned param = ctx->param_gs_vtx01_offset + i;
		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
	}
	ctx->return_value = ret;
}
2849
/* Epilogue for a VS compiled as LS (vertex shader before tessellation):
 * store all vertex outputs to LDS so the TCS can read them as inputs.
 */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride in dwords, packed in vs_state_bits[31:24]. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	/* Base dword address in LDS for this vertex's outputs. */
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		if (name == TGSI_SEMANTIC_LAYER ||
		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
			continue;

		/* Each output occupies 4 dwords at a stage-unique slot. */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					LLVMConstInt(ctx->i32, param * 4, 0), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}

	/* On GFX9 (merged LS-HS), forward inputs to the TCS part. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}
2903
/* Epilogue for a shader compiled as ES (stage before GS): write all
 * outputs to the ESGS ring so the GS can read them. On GFX9 the ring
 * lives in LDS; on older chips it is a memory buffer.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	LLVMValueRef lds_base = NULL;
	unsigned chan;
	int i;

	if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
		/* Compute this thread's base LDS address:
		 * vertex index across waves * per-vertex item size (dwords). */
		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
		LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
		vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
					 LLVMBuildMul(gallivm->builder, wave_idx,
						      LLVMConstInt(ctx->i32, 64, false), ""), "");
		lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
	}

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		int param;

		/* See the ARB_shader_viewport_layer_array note in the LS
		 * epilogue: these are ignored when a later stage exists. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
						      info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* GFX9 has the ESGS ring in LDS. */
			if (ctx->screen->b.chip_class >= GFX9) {
				lds_store(bld_base, param * 4 + chan, lds_base, out_val);
				continue;
			}

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->esgs_ring,
						    out_val, 1, NULL, soffset,
						    (4 * param + chan) * 4,
						    1, 1, true, true);
		}
	}

	/* On GFX9 (merged ES-GS), forward inputs to the GS part. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_es_return_value_for_gs(ctx);
}
2959
2960 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2961 {
2962 if (ctx->screen->b.chip_class >= GFX9)
2963 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2964 else
2965 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2966 }
2967
/* GS epilogue: send the GS_DONE message and, on GFX9, close the if-block
 * that wraps the whole merged GS part (see merged_wrap_if_state). */
static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
			 si_get_gs_wave_id(ctx));

	if (ctx->screen->b.chip_class >= GFX9)
		lp_build_endif(&ctx->merged_wrap_if_state);
}
2978
/* VS epilogue for a hardware VS stage: optionally clamp vertex colors,
 * gather all outputs, emit streamout, optionally export PrimitiveID,
 * then export everything via exp instructions.
 */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->shader->is_gs_copy_shader);

	/* +1 reserves room for the optional PrimitiveID export below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->main_fn,
						    ctx->param_vs_state_bits);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all 4 channels in place. */
			for (j = 0; j < 4; j++) {
				addr = ctx->outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = ac_build_clamp(&ctx->ac, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		/* Only close the IF if we opened it (a color was found). */
		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Load all output values into the outputs array. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		for (j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->outputs[i][j],
					      "");
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	if (ctx->shader->selector->so.num_outputs)
		si_llvm_emit_streamout(ctx, outputs, i, 0);

	/* Export PrimitiveID. */
	if (ctx->shader->key.mono.u.vs_export_prim_id) {
		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
		outputs[i].semantic_index = 0;
		outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
					       get_primitive_id(bld_base, 0));
		/* Only .x is meaningful; pad the rest with zeros. */
		for (j = 1; j < 4; j++)
			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);

		memset(outputs[i].vertex_stream, 0,
		       sizeof(outputs[i].vertex_stream));
		i++;
	}

	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
3063
/* Collected pixel shader export instructions, emitted together at the
 * end of the shader by si_emit_ps_exports(). */
struct si_ps_exports {
	unsigned num;			/* number of valid entries in args[] */
	struct ac_export_args args[10];
};
3068
3069 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3070 bool writes_samplemask)
3071 {
3072 if (writes_z) {
3073 /* Z needs 32 bits. */
3074 if (writes_samplemask)
3075 return V_028710_SPI_SHADER_32_ABGR;
3076 else if (writes_stencil)
3077 return V_028710_SPI_SHADER_32_GR;
3078 else
3079 return V_028710_SPI_SHADER_32_R;
3080 } else if (writes_stencil || writes_samplemask) {
3081 /* Both stencil and sample mask need only 16 bits. */
3082 return V_028710_SPI_SHADER_UINT16_ABGR;
3083 } else {
3084 return V_028710_SPI_SHADER_ZERO;
3085 }
3086 }
3087
/* Build the MRTZ export carrying depth, stencil, and/or sample mask,
 * and append it to 'exp'. The channel layout depends on the chosen
 * SPI_SHADER_Z_FORMAT.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args;
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args.valid_mask = 1; /* whether the EXEC mask is valid */
	args.done = 1; /* DONE bit */

	/* Specify the target we are exporting */
	args.target = V_008DFC_SQ_EXP_MRTZ;

	args.compr = 0; /* COMP flag */
	args.out[0] = base->undef; /* R, depth */
	args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args.out[2] = base->undef; /* B, sample mask */
	args.out[3] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		/* Compressed 16-bit export: no depth in this format. */
		assert(!depth);
		args.compr = 1; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			/* Compressed exports enable channels in pairs. */
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args.out[1] = samplemask;
			mask |= 0xc;
		}
	} else {
		/* Uncompressed: one 32-bit channel per value. */
		if (depth) {
			args.out[0] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args.out[1] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args.out[2] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND and HAINAN) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND &&
	    ctx->screen->b.family != CHIP_HAINAN)
		mask |= 0x1;

	/* Specify which components to enable */
	args.enabled_channels = mask;

	memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
3158
/* Apply the PS epilog color transforms (clamp, alpha-to-one, alpha test,
 * smoothing) to 'color' and append the resulting MRT export(s) to 'exp'.
 * 'index' is the color buffer index; 'is_last' marks the final export,
 * which gets the DONE bit.
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.part.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = ac_build_clamp(&ctx->ac, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only applies to color buffer 0. */
	if (index == 0 &&
	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
		struct ac_export_args args[8];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
			if (args[c].enabled_channels)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
				args[c].done = 1; /* DONE bit */
			} else if (!args[c].enabled_channels)
				continue; /* unnecessary NULL export */

			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
		}
	} else {
		struct ac_export_args args;

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 &args);
		if (is_last) {
			args.valid_mask = 1; /* whether the EXEC mask is valid */
			args.done = 1; /* DONE bit */
		} else if (!args.enabled_channels)
			return; /* unnecessary NULL export */

		memcpy(&exp->args[exp->num++], &args, sizeof(args));
	}
}
3225
3226 static void si_emit_ps_exports(struct si_shader_context *ctx,
3227 struct si_ps_exports *exp)
3228 {
3229 for (unsigned i = 0; i < exp->num; i++)
3230 ac_build_export(&ctx->ac, &exp->args[i]);
3231 }
3232
3233 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3234 {
3235 struct si_shader_context *ctx = si_shader_context(bld_base);
3236 struct lp_build_context *base = &bld_base->base;
3237 struct ac_export_args args;
3238
3239 args.enabled_channels = 0x0; /* enabled channels */
3240 args.valid_mask = 1; /* whether the EXEC mask is valid */
3241 args.done = 1; /* DONE bit */
3242 args.target = V_008DFC_SQ_EXP_NULL;
3243 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3244 args.out[0] = base->undef; /* R */
3245 args.out[1] = base->undef; /* G */
3246 args.out[2] = base->undef; /* B */
3247 args.out[3] = base->undef; /* A */
3248
3249 ac_build_export(&ctx->ac, &args);
3250 }
3251
3252 /**
3253 * Return PS outputs in this order:
3254 *
3255 * v[0:3] = color0.xyzw
3256 * v[4:7] = color1.xyzw
3257 * ...
3258 * vN+0 = Depth
3259 * vN+1 = Stencil
3260 * vN+2 = SampleMask
3261 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3262 *
3263 * The alpha-ref SGPR is returned via its original location.
3264 */
3265 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3266 {
3267 struct si_shader_context *ctx = si_shader_context(bld_base);
3268 struct si_shader *shader = ctx->shader;
3269 struct tgsi_shader_info *info = &shader->selector->info;
3270 LLVMBuilderRef builder = ctx->gallivm.builder;
3271 unsigned i, j, first_vgpr, vgpr;
3272
3273 LLVMValueRef color[8][4] = {};
3274 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3275 LLVMValueRef ret;
3276
3277 if (ctx->postponed_kill)
3278 ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3279
3280 /* Read the output values. */
3281 for (i = 0; i < info->num_outputs; i++) {
3282 unsigned semantic_name = info->output_semantic_name[i];
3283 unsigned semantic_index = info->output_semantic_index[i];
3284
3285 switch (semantic_name) {
3286 case TGSI_SEMANTIC_COLOR:
3287 assert(semantic_index < 8);
3288 for (j = 0; j < 4; j++) {
3289 LLVMValueRef ptr = ctx->outputs[i][j];
3290 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3291 color[semantic_index][j] = result;
3292 }
3293 break;
3294 case TGSI_SEMANTIC_POSITION:
3295 depth = LLVMBuildLoad(builder,
3296 ctx->outputs[i][2], "");
3297 break;
3298 case TGSI_SEMANTIC_STENCIL:
3299 stencil = LLVMBuildLoad(builder,
3300 ctx->outputs[i][1], "");
3301 break;
3302 case TGSI_SEMANTIC_SAMPLEMASK:
3303 samplemask = LLVMBuildLoad(builder,
3304 ctx->outputs[i][0], "");
3305 break;
3306 default:
3307 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3308 semantic_name);
3309 }
3310 }
3311
3312 /* Fill the return structure. */
3313 ret = ctx->return_value;
3314
3315 /* Set SGPRs. */
3316 ret = LLVMBuildInsertValue(builder, ret,
3317 bitcast(bld_base, TGSI_TYPE_SIGNED,
3318 LLVMGetParam(ctx->main_fn,
3319 SI_PARAM_ALPHA_REF)),
3320 SI_SGPR_ALPHA_REF, "");
3321
3322 /* Set VGPRs */
3323 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3324 for (i = 0; i < ARRAY_SIZE(color); i++) {
3325 if (!color[i][0])
3326 continue;
3327
3328 for (j = 0; j < 4; j++)
3329 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3330 }
3331 if (depth)
3332 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3333 if (stencil)
3334 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3335 if (samplemask)
3336 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3337
3338 /* Add the input sample mask for smoothing at the end. */
3339 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3340 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3341 ret = LLVMBuildInsertValue(builder, ret,
3342 LLVMGetParam(ctx->main_fn,
3343 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3344
3345 ctx->return_value = ret;
3346 }
3347
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
static void emit_optimization_barrier(struct si_shader_context *ctx,
				      LLVMValueRef *pvgpr)
{
	/* Unique counter so each barrier's asm string differs and LLVM
	 * cannot merge or deduplicate them. */
	static int counter = 0;

	LLVMBuilderRef builder = ctx->gallivm.builder;
	char code[16];

	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

	if (!pvgpr) {
		/* Pure barrier: void inline asm with side effects. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
	} else {
		/* Pass the first dword of *pvgpr through the asm ("=v,0"
		 * ties output to input) so LLVM must keep the value live
		 * across this point. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
		LLVMValueRef vgpr = *pvgpr;
		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
		unsigned vgpr_size = llvm_get_type_size(vgpr_type);
		LLVMValueRef vgpr0;

		assert(vgpr_size % 4 == 0);

		/* View the value as a vector of i32, route element 0
		 * through the asm, then restore the original type. */
		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

		*pvgpr = vgpr;
	}
}
3388
3389 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3390 {
3391 struct gallivm_state *gallivm = &ctx->gallivm;
3392 LLVMBuilderRef builder = gallivm->builder;
3393 LLVMValueRef args[1] = {
3394 LLVMConstInt(ctx->i32, simm16, 0)
3395 };
3396 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3397 ctx->voidt, args, 1, 0);
3398 }
3399
/* Implement TGSI MEMBAR: translate the barrier flags into an s_waitcnt.
 *
 * 'waitcnt' starts as NOOP_WAITCNT (wait for nothing); ANDing in
 * VM_CNT/LGKM_CNT clears the respective counter field, which makes the
 * instruction wait for that counter to drain. If no flag requires a
 * wait, no instruction is emitted.
 */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	/* The flags operand must be an immediate. */
	LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
	unsigned flags = LLVMConstIntGetZExtValue(src0);
	unsigned waitcnt = NOOP_WAITCNT;

	if (flags & TGSI_MEMBAR_THREAD_GROUP)
		waitcnt &= VM_CNT & LGKM_CNT;

	if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
		     TGSI_MEMBAR_SHADER_BUFFER |
		     TGSI_MEMBAR_SHADER_IMAGE))
		waitcnt &= VM_CNT;

	if (flags & TGSI_MEMBAR_SHARED)
		waitcnt &= LGKM_CNT;

	if (waitcnt != NOOP_WAITCNT)
		si_emit_waitcnt(ctx, waitcnt);
}
3424
/* Implement TGSI CLOCK: read the 64-bit cycle counter via
 * llvm.readcyclecounter and return it split into two 32-bit halves
 * (elements 0 and 1 of the i64 viewed as v2i32).
 */
static void clock_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef tmp;

	tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
				 ctx->i64, NULL, 0, 0);
	tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");

	emit_data->output[0] =
		LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
	emit_data->output[1] =
		LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
}
3443
3444 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3445 {
3446 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3447 CONST_ADDR_SPACE);
3448 }
3449
3450 static void si_llvm_emit_ddxy(
3451 const struct lp_build_tgsi_action *action,
3452 struct lp_build_tgsi_context *bld_base,
3453 struct lp_build_emit_data *emit_data)
3454 {
3455 struct si_shader_context *ctx = si_shader_context(bld_base);
3456 struct gallivm_state *gallivm = &ctx->gallivm;
3457 unsigned opcode = emit_data->info->opcode;
3458 LLVMValueRef val;
3459 int idx;
3460 unsigned mask;
3461
3462 if (opcode == TGSI_OPCODE_DDX_FINE)
3463 mask = AC_TID_MASK_LEFT;
3464 else if (opcode == TGSI_OPCODE_DDY_FINE)
3465 mask = AC_TID_MASK_TOP;
3466 else
3467 mask = AC_TID_MASK_TOP_LEFT;
3468
3469 /* for DDX we want to next X pixel, DDY next Y pixel. */
3470 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3471
3472 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
3473 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
3474 mask, idx, ctx->lds, val);
3475 emit_data->output[emit_data->chan] = val;
3476 }
3477
3478 /*
3479 * this takes an I,J coordinate pair,
3480 * and works out the X and Y derivatives.
3481 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3482 */
3483 static LLVMValueRef si_llvm_emit_ddxy_interp(
3484 struct lp_build_tgsi_context *bld_base,
3485 LLVMValueRef interp_ij)
3486 {
3487 struct si_shader_context *ctx = si_shader_context(bld_base);
3488 struct gallivm_state *gallivm = &ctx->gallivm;
3489 LLVMValueRef result[4], a;
3490 unsigned i;
3491
3492 for (i = 0; i < 2; i++) {
3493 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3494 LLVMConstInt(ctx->i32, i, 0), "");
3495 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3496 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3497 }
3498
3499 return lp_build_gather_values(gallivm, result, 4);
3500 }
3501
/* Fetch the extra operands for INTERP_OFFSET / INTERP_SAMPLE into
 * emit_data->args[0..1] as an (x, y) position relative to the pixel
 * center. INTERP_CENTROID takes no extra operands.
 */
static void interp_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
		/* offset is in second src, first two channels */
		emit_data->args[0] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_X);
		emit_data->args[1] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_Y);
		emit_data->arg_count = 2;
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef sample_position;
		LLVMValueRef sample_id;
		LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);

		/* fetch sample ID, then fetch its sample position,
		 * and place into first two channels.
		 */
		sample_id = lp_build_emit_fetch(bld_base,
						emit_data->inst, 1, TGSI_CHAN_X);
		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
					     ctx->i32, "");
		sample_position = load_sample_position(ctx, sample_id);

		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_0, "");

		/* Sample positions are in [0, 1); subtract 0.5 to make them
		 * offsets relative to the pixel center. */
		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     ctx->i32_1, "");
		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
		emit_data->arg_count = 2;
	}
}
3545
/* Implement the TGSI INTERP_* opcodes: interpolate a fragment shader
 * input at the centroid, at an offset, or at a sample position.
 *
 * For indirectly-indexed inputs, every input in the array is
 * interpolated into a vector and the requested element is selected with
 * an extractelement on the dynamic index.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				   struct lp_build_tgsi_context *bld_base,
				   struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *input = &inst->Src[0];
	int input_base, input_array_size;
	int chan;
	int i;
	LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
	LLVMValueRef array_idx;
	int interp_param_idx;
	unsigned interp;
	unsigned location;

	assert(input->Register.File == TGSI_FILE_INPUT);

	/* Determine the range of inputs that may be addressed. */
	if (input->Register.Indirect) {
		unsigned array_id = input->Indirect.ArrayID;

		if (array_id) {
			input_base = info->input_array_first[array_id];
			input_array_size = info->input_array_last[array_id] - input_base + 1;
		} else {
			input_base = inst->Src[0].Register.Index;
			input_array_size = info->num_inputs - input_base;
		}

		array_idx = get_indirect_index(ctx, &input->Indirect,
					       input->Register.Index - input_base);
	} else {
		input_base = inst->Src[0].Register.Index;
		input_array_size = 1;
		array_idx = ctx->i32_0;
	}

	interp = shader->selector->info.input_interpolate[input_base];

	/* OFFSET and SAMPLE start from the center IJ; CENTROID uses the
	 * centroid IJ directly. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	else
		interp_param = NULL; /* flat interpolation: no IJ needed */

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
		}
		interp_param = lp_build_gather_values(gallivm, ij_out, 2);
	}

	if (interp_param) {
		interp_param = LLVMBuildBitCast(gallivm->builder,
			interp_param, LLVMVectorType(ctx->f32, 2), "");
	}

	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);

		for (unsigned idx = 0; idx < input_array_size; ++idx) {
			LLVMValueRef v, i = NULL, j = NULL;

			if (interp_param) {
				/* NOTE(review): this bitcast repeats the one
				 * just above the loop; it looks redundant but
				 * is harmless — confirm before removing. */
				interp_param = LLVMBuildBitCast(gallivm->builder,
					interp_param, LLVMVectorType(ctx->f32, 2), "");
				i = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_0, "");
				j = LLVMBuildExtractElement(
					gallivm->builder, interp_param, ctx->i32_1, "");
			}
			v = si_build_fs_interp(ctx, input_base + idx, schan,
					       prim_mask, i, j);

			gather = LLVMBuildInsertElement(gallivm->builder,
				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
		}

		emit_data->output[chan] = LLVMBuildExtractElement(
			gallivm->builder, gather, array_idx, "");
	}
}
3672
/* Return an i64 bitmask with one bit per lane, set where 'value' is
 * non-zero in that lane, using llvm.amdgcn.icmp(value, 0, NE).
 */
static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
				   LLVMValueRef value)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef args[3] = {
		value,
		ctx->i32_0,
		LLVMConstInt(ctx->i32, LLVMIntNE, 0)
	};

	/* We currently have no other way to prevent LLVM from lifting the icmp
	 * calls to a dominating basic block.
	 */
	emit_optimization_barrier(ctx, &args[0]);

	/* The intrinsic expects an i32 operand. */
	if (LLVMTypeOf(args[0]) != ctx->i32)
		args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");

	return lp_build_intrinsic(gallivm->builder,
				  "llvm.amdgcn.icmp.i32",
				  ctx->i64, args, 3,
				  LP_FUNC_ATTR_NOUNWIND |
				  LP_FUNC_ATTR_READNONE |
				  LP_FUNC_ATTR_CONVERGENT);
}
3698
/* TGSI VOTE_ALL: true iff every active lane's argument is true.
 * Compares the ballot of the argument against the ballot of all
 * active lanes; result is a sign-extended boolean (0 or ~0) in i32.
 */
static void vote_all_emit(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef active_set, vote_set;
	LLVMValueRef tmp;

	active_set = si_emit_ballot(ctx, ctx->i32_1);
	vote_set = si_emit_ballot(ctx, emit_data->args[0]);

	tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
	emit_data->output[emit_data->chan] =
		LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
}
3716
/* TGSI VOTE_ANY: true iff at least one active lane's argument is true.
 * The ballot of the argument is non-zero in that case; result is a
 * sign-extended boolean (0 or ~0) in i32.
 */
static void vote_any_emit(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vote_set;
	LLVMValueRef tmp;

	vote_set = si_emit_ballot(ctx, emit_data->args[0]);

	tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
			    vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
	emit_data->output[emit_data->chan] =
		LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
}
3734
/* TGSI VOTE_EQ: true iff all active lanes agree on the argument, i.e.
 * either every lane voted true (ballot == active set) or no lane did
 * (ballot == 0); result is a sign-extended boolean (0 or ~0) in i32.
 */
static void vote_eq_emit(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef active_set, vote_set;
	LLVMValueRef all, none, tmp;

	active_set = si_emit_ballot(ctx, ctx->i32_1);
	vote_set = si_emit_ballot(ctx, emit_data->args[0]);

	all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
	none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
			     vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
	tmp = LLVMBuildOr(gallivm->builder, all, none, "");
	emit_data->output[emit_data->chan] =
		LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
}
3755
3756 static void ballot_emit(
3757 const struct lp_build_tgsi_action *action,
3758 struct lp_build_tgsi_context *bld_base,
3759 struct lp_build_emit_data *emit_data)
3760 {
3761 struct si_shader_context *ctx = si_shader_context(bld_base);
3762 LLVMBuilderRef builder = ctx->gallivm.builder;
3763 LLVMValueRef tmp;
3764
3765 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3766 tmp = si_emit_ballot(ctx, tmp);
3767 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3768
3769 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3770 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3771 }
3772
/* Fetch the operands of READ_INVOC: the value to read (from the channel
 * currently being processed) and the source invocation index.
 */
static void read_invoc_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 0, emit_data->src_chan);

	/* Always read the source invocation (= lane) from the X channel. */
	emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
						 1, TGSI_CHAN_X);
	emit_data->arg_count = 2;
}
3785
3786 static void read_lane_emit(
3787 const struct lp_build_tgsi_action *action,
3788 struct lp_build_tgsi_context *bld_base,
3789 struct lp_build_emit_data *emit_data)
3790 {
3791 struct si_shader_context *ctx = si_shader_context(bld_base);
3792 LLVMBuilderRef builder = ctx->gallivm.builder;
3793
3794 /* We currently have no other way to prevent LLVM from lifting the icmp
3795 * calls to a dominating basic block.
3796 */
3797 emit_optimization_barrier(ctx, &emit_data->args[0]);
3798
3799 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3800 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3801 ctx->i32, "");
3802 }
3803
3804 emit_data->output[emit_data->chan] =
3805 ac_build_intrinsic(&ctx->ac, action->intr_name,
3806 ctx->i32, emit_data->args, emit_data->arg_count,
3807 AC_FUNC_ATTR_READNONE |
3808 AC_FUNC_ATTR_CONVERGENT);
3809 }
3810
3811 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3812 struct lp_build_emit_data *emit_data)
3813 {
3814 struct si_shader_context *ctx = si_shader_context(bld_base);
3815 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
3816 LLVMValueRef imm;
3817 unsigned stream;
3818
3819 assert(src0.File == TGSI_FILE_IMMEDIATE);
3820
3821 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
3822 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
3823 return stream;
3824 }
3825
/* Emit one vertex from the geometry shader.
 *
 * Stores all written output channels of the current stream to the GSVS ring
 * buffer, increments the per-stream vertex counter, and sends the EMIT
 * message to the GS hardware.
 */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_if_state if_state;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_gs2vs_offset);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	unsigned chan, offset;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 *
	 * If the shader has no writes to memory, kill it instead. This skips
	 * further memory loads and may allow LLVM to skip to the end
	 * altogether.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->i32,
					      shader->selector->gs_max_out_vertices, 0), "");

	bool use_kill = !info->writes_memory;
	if (use_kill) {
		/* kill > 0 keeps the thread alive, < 0 kills it. */
		kill = lp_build_select(&bld_base->base, can_emit,
				       LLVMConstReal(ctx->f32, 1.0f),
				       LLVMConstReal(ctx->f32, -1.0f));

		ac_build_kill(&ctx->ac, kill);
	} else {
		/* Guard the ring stores with an if-block instead;
		 * closed by lp_build_endif below.
		 */
		lp_build_if(&if_state, gallivm, can_emit);
	}

	offset = 0;
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];

		for (chan = 0; chan < 4; chan++) {
			/* Skip channels that aren't written or that belong
			 * to a different stream. */
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			LLVMValueRef voffset =
				LLVMConstInt(ctx->i32, offset *
					     shader->selector->gs_max_out_vertices, 0);
			offset++;

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, soffset, 0,
						    1, 1, true, true);
		}
	}

	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      ctx->i32_1);

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 si_get_gs_wave_id(ctx));
	if (!use_kill)
		lp_build_endif(&if_state);
}
3915
3916 /* Cut one primitive from the geometry shader */
3917 static void si_llvm_emit_primitive(
3918 const struct lp_build_tgsi_action *action,
3919 struct lp_build_tgsi_context *bld_base,
3920 struct lp_build_emit_data *emit_data)
3921 {
3922 struct si_shader_context *ctx = si_shader_context(bld_base);
3923 unsigned stream;
3924
3925 /* Signal primitive cut */
3926 stream = si_llvm_get_stream(bld_base, emit_data);
3927 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
3928 si_get_gs_wave_id(ctx));
3929 }
3930
/* TGSI BARRIER: synchronize all invocations in the workgroup/patch. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	/* SI only (thanks to a hw bug workaround):
	 * The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* NOTE(review): the masks are combined with '&', presumably so
		 * that both the LGKM and VM counter fields end up zero (wait
		 * for both) while each *_CNT constant leaves the other fields
		 * at their maximum — confirm against the si_emit_waitcnt /
		 * waitcnt-mask definitions.
		 */
		si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
		return;
	}

	lp_build_intrinsic(gallivm->builder,
			   "llvm.amdgcn.s.barrier",
			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
}
3952
/* Shared TGSI action used for the interpolation opcodes; the fetch_args
 * and emit callbacks are defined earlier in this file.
 */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
3957
/**
 * Create the LLVM function for the current shader part and set its
 * parameter and function attributes.
 *
 * \param returns/num_returns     return value types (used to pass values to
 *                                epilogs and merged-shader parts)
 * \param params/num_params       argument types; indices [0, last_sgpr] are
 *                                SGPR arguments
 * \param last_sgpr               index of the last SGPR argument
 * \param max_workgroup_size      if non-zero, communicated to LLVM via the
 *                                "amdgpu-max-work-group-size" attribute
 */
static void si_create_function(struct si_shader_context *ctx,
			       const char *name,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_sgpr, unsigned max_workgroup_size)
{
	int i;

	si_llvm_create_func(ctx, name, returns, num_returns,
			    params, num_params);
	ctx->return_value = LLVMGetUndef(ctx->return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			/* Non-pointer SGPR arguments are passed in registers. */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
	}

	if (max_workgroup_size) {
		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
				      max_workgroup_size);
	}
	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
					   "no-signed-zeros-fp-math",
					   "true");

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
4012
/**
 * Declare the SGPR inputs needed for streamout (transform feedback) and
 * record their parameter indices in \p ctx.
 */
static void declare_streamout_params(struct si_shader_context *ctx,
				     struct pipe_stream_output_info *so,
				     LLVMTypeRef *params, LLVMTypeRef i32,
				     unsigned *num_params)
{
	int i;

	/* Streamout SGPRs. */
	if (so->num_outputs) {
		if (ctx->type != PIPE_SHADER_TESS_EVAL)
			params[ctx->param_streamout_config = (*num_params)++] = i32;
		else
			/* TES: reuse the most recently declared SGPR as the
			 * streamout config. */
			ctx->param_streamout_config = *num_params - 1;

		params[ctx->param_streamout_write_index = (*num_params)++] = i32;
	}
	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (i = 0; i < 4; i++) {
		if (!so->stride[i])
			continue;

		params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
	}
}
4037
4038 static unsigned llvm_get_type_size(LLVMTypeRef type)
4039 {
4040 LLVMTypeKind kind = LLVMGetTypeKind(type);
4041
4042 switch (kind) {
4043 case LLVMIntegerTypeKind:
4044 return LLVMGetIntTypeWidth(type) / 8;
4045 case LLVMFloatTypeKind:
4046 return 4;
4047 case LLVMPointerTypeKind:
4048 return 8;
4049 case LLVMVectorTypeKind:
4050 return LLVMGetVectorSize(type) *
4051 llvm_get_type_size(LLVMGetElementType(type));
4052 case LLVMArrayTypeKind:
4053 return LLVMGetArrayLength(type) *
4054 llvm_get_type_size(LLVMGetElementType(type));
4055 default:
4056 assert(0);
4057 return 0;
4058 }
4059 }
4060
4061 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4062 {
4063 struct gallivm_state *gallivm = &ctx->gallivm;
4064
4065 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4066 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4067 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4068 "lds");
4069 }
4070
4071 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4072 {
4073 switch (shader->selector->type) {
4074 case PIPE_SHADER_TESS_CTRL:
4075 /* Return this so that LLVM doesn't remove s_barrier
4076 * instructions on chips where we use s_barrier. */
4077 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4078
4079 case PIPE_SHADER_GEOMETRY:
4080 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4081
4082 case PIPE_SHADER_COMPUTE:
4083 break; /* see below */
4084
4085 default:
4086 return 0;
4087 }
4088
4089 const unsigned *properties = shader->selector->info.properties;
4090 unsigned max_work_group_size =
4091 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4092 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4093 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4094
4095 if (!max_work_group_size) {
4096 /* This is a variable group size compute shader,
4097 * compile it for the maximum possible group size.
4098 */
4099 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4100 }
4101 return max_work_group_size;
4102 }
4103
4104 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4105 LLVMTypeRef *params,
4106 unsigned *num_params,
4107 bool assign_params)
4108 {
4109 params[(*num_params)++] = si_const_array(ctx->v4i32,
4110 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS);
4111 params[(*num_params)++] = si_const_array(ctx->v8i32,
4112 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2);
4113
4114 if (assign_params) {
4115 ctx->param_const_and_shader_buffers = *num_params - 2;
4116 ctx->param_samplers_and_images = *num_params - 1;
4117 }
4118 }
4119
/* Declare the RW-buffer descriptor array followed by the per-stage
 * descriptor pointers; used by all non-merged shader types.
 */
static void declare_default_desc_pointers(struct si_shader_context *ctx,
					  LLVMTypeRef *params,
					  unsigned *num_params)
{
	params[ctx->param_rw_buffers = (*num_params)++] =
		si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
	declare_per_stage_desc_pointers(ctx, params, num_params, true);
}
4128
/* Declare the VS-only user SGPRs: vertex buffer descriptors, base vertex,
 * start instance, draw id and VS state bits. The declaration order defines
 * the hardware calling convention — do not reorder.
 */
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
					    LLVMTypeRef *params,
					    unsigned *num_params)
{
	params[ctx->param_vertex_buffers = (*num_params)++] =
		si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS);
	params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
	params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
	params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
	params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
}
4140
/* Declare the VS input VGPRs. The VGPR order differs between the LS
 * (as_ls) and non-LS configurations; vertex load indices are appended by
 * the prolog and counted in \p num_prolog_vgprs.
 */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   LLVMTypeRef *params, unsigned *num_params,
				   unsigned *num_prolog_vgprs)
{
	struct si_shader *shader = ctx->shader;

	params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
	if (shader->key.as_ls) {
		params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
		params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
	} else {
		params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
		params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
	}
	params[(*num_params)++] = ctx->i32; /* unused */

	if (!shader->is_gs_copy_shader) {
		/* Vertex load indices. */
		ctx->param_vertex_index0 = (*num_params);
		for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
			params[(*num_params)++] = ctx->i32;
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
4165
/* Declare the TES input VGPRs: the tess coordinates (u, v), the relative
 * patch id and the patch id, in that order.
 */
static void declare_tes_input_vgprs(struct si_shader_context *ctx,
				    LLVMTypeRef *params, unsigned *num_params)
{
	params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
	params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
	params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
	params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
}
4174
enum {
	/* Convenient merged shader definitions.
	 *
	 * Pseudo shader types (beyond PIPE_SHADER_TYPES) used by
	 * create_function() to select the GFX9 merged calling conventions
	 * (LS+HS and ES+GS).
	 */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
4180
/**
 * Declare the main LLVM function for the current shader: all SGPR/VGPR
 * inputs in hardware order, plus the return values used to pass data to
 * epilogs and to the second half of GFX9 merged shaders. Also declares
 * LDS where the shader type needs it.
 */
static void create_function(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef params[100]; /* just make it large enough */
	LLVMTypeRef returns[16+32*4];
	unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
	unsigned num_returns = 0;
	unsigned num_prolog_vgprs = 0;
	unsigned type = ctx->type;

	/* Set MERGED shaders. */
	if (ctx->screen->b.chip_class >= GFX9) {
		if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
			type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
		else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
			type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
	}

	LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);

	switch (type) {
	case PIPE_SHADER_VERTEX:
		declare_default_desc_pointers(ctx, params, &num_params);
		declare_vs_specific_input_sgprs(ctx, params, &num_params);

		if (shader->key.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else if (shader->key.as_ls) {
			/* no extra parameters */
		} else {
			if (shader->is_gs_copy_shader)
				num_params = ctx->param_rw_buffers + 1;

			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}

		last_sgpr = num_params-1;

		/* VGPRs */
		declare_vs_input_vgprs(ctx, params, &num_params,
				       &num_prolog_vgprs);
		break;

	case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
		declare_default_desc_pointers(ctx, params, &num_params);
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
		params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;

		/* param_tcs_offchip_offset and param_tcs_factor_offset are
		 * placed after the user SGPRs.
		 */
		for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
			returns[num_returns++] = ctx->i32; /* SGPRs */
		for (i = 0; i < 3; i++)
			returns[num_returns++] = ctx->f32; /* VGPRs */
		break;

	case SI_SHADER_MERGED_VERTEX_TESSCTRL:
		/* Merged stages have 8 system SGPRs at the beginning. */
		params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
			si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* unused */
		params[num_params++] = ctx->i32; /* unused */

		params[num_params++] = ctx->i32; /* unused */
		params[num_params++] = ctx->i32; /* unused */
		/* Both stages' descriptor sets are declared; only the one
		 * matching ctx->type records its parameter indices. */
		declare_per_stage_desc_pointers(ctx, params, &num_params,
						ctx->type == PIPE_SHADER_VERTEX);
		declare_vs_specific_input_sgprs(ctx, params, &num_params);

		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* unused */

		declare_per_stage_desc_pointers(ctx, params, &num_params,
						ctx->type == PIPE_SHADER_TESS_CTRL);
		last_sgpr = num_params - 1;

		/* VGPRs (first TCS, then VS) */
		params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, params, &num_params,
					       &num_prolog_vgprs);

			/* LS return values are inputs to the TCS main shader part. */
			for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 2; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		} else {
			/* TCS return values are inputs to the TCS epilog.
			 *
			 * param_tcs_offchip_offset, param_tcs_factor_offset,
			 * param_tcs_offchip_layout, and param_rw_buffers
			 * should be passed to the epilog.
			 */
			for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 3; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
		/* Merged stages have 8 system SGPRs at the beginning. */
		params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
			si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
		params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
		params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */

		params[num_params++] = ctx->i32; /* unused */
		params[num_params++] = ctx->i32; /* unused */
		declare_per_stage_desc_pointers(ctx, params, &num_params,
						(ctx->type == PIPE_SHADER_VERTEX ||
						 ctx->type == PIPE_SHADER_TESS_EVAL));
		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_specific_input_sgprs(ctx, params, &num_params);
		} else {
			/* TESS_EVAL (and also GEOMETRY):
			 * Declare as many input SGPRs as the VS has. */
			params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
			params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
			params[num_params++] = ctx->i32; /* unused */
			params[num_params++] = ctx->i32; /* unused */
			params[num_params++] = ctx->i32; /* unused */
			params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
		}

		declare_per_stage_desc_pointers(ctx, params, &num_params,
						ctx->type == PIPE_SHADER_GEOMETRY);
		last_sgpr = num_params - 1;

		/* VGPRs (first GS, then VS/TES) */
		params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, params, &num_params,
					       &num_prolog_vgprs);
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
			declare_tes_input_vgprs(ctx, params, &num_params);
		}

		if (ctx->type == PIPE_SHADER_VERTEX ||
		    ctx->type == PIPE_SHADER_TESS_EVAL) {
			/* ES return values are inputs to GS. */
			for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 5; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		declare_default_desc_pointers(ctx, params, &num_params);
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;

		if (shader->key.as_es) {
			params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
			params[num_params++] = ctx->i32;
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else {
			params[num_params++] = ctx->i32;
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
			params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		}
		last_sgpr = num_params - 1;

		/* VGPRs */
		declare_tes_input_vgprs(ctx, params, &num_params);
		break;

	case PIPE_SHADER_GEOMETRY:
		declare_default_desc_pointers(ctx, params, &num_params);
		params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
		break;

	case PIPE_SHADER_FRAGMENT:
		/* PS uses fixed SI_PARAM_* indices rather than dynamic
		 * num_params assignment. */
		declare_default_desc_pointers(ctx, params, &num_params);
		params[SI_PARAM_ALPHA_REF] = ctx->f32;
		params[SI_PARAM_PRIM_MASK] = ctx->i32;
		last_sgpr = SI_PARAM_PRIM_MASK;
		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
		params[SI_PARAM_FRONT_FACE] = ctx->i32;
		shader->info.face_vgpr_index = 20;
		params[SI_PARAM_ANCILLARY] = ctx->i32;
		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
		num_params = SI_PARAM_POS_FIXED_PT+1;

		/* Color inputs from the prolog. */
		if (shader->selector->info.colors_read) {
			unsigned num_color_elements =
				util_bitcount(shader->selector->info.colors_read);

			assert(num_params + num_color_elements <= ARRAY_SIZE(params));
			for (i = 0; i < num_color_elements; i++)
				params[num_params++] = ctx->f32;

			num_prolog_vgprs += num_color_elements;
		}

		/* Outputs for the epilog. */
		num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
		num_returns =
			num_return_sgprs +
			util_bitcount(shader->selector->info.colors_written) * 4 +
			shader->selector->info.writes_z +
			shader->selector->info.writes_stencil +
			shader->selector->info.writes_samplemask +
			1 /* SampleMaskIn */;

		num_returns = MAX2(num_returns,
				   num_return_sgprs +
				   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

		for (i = 0; i < num_return_sgprs; i++)
			returns[i] = ctx->i32;
		for (; i < num_returns; i++)
			returns[i] = ctx->f32;
		break;

	case PIPE_SHADER_COMPUTE:
		declare_default_desc_pointers(ctx, params, &num_params);
		if (shader->selector->info.uses_grid_size)
			params[ctx->param_grid_size = num_params++] = v3i32;
		if (shader->selector->info.uses_block_size)
			params[ctx->param_block_size = num_params++] = v3i32;

		for (i = 0; i < 3; i++) {
			ctx->param_block_id[i] = -1;
			if (shader->selector->info.uses_block_id[i])
				params[ctx->param_block_id[i] = num_params++] = ctx->i32;
		}
		last_sgpr = num_params - 1;

		params[ctx->param_thread_id = num_params++] = v3i32;
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	assert(num_params <= ARRAY_SIZE(params));

	si_create_function(ctx, "main", returns, num_returns, params,
			   num_params, last_sgpr,
			   si_get_max_workgroup_size(shader));

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT &&
	    ctx->separate_prolog) {
		si_llvm_add_attribute(ctx->main_fn,
				      "InitialPSInputAddr",
				      S_0286D0_PERSP_SAMPLE_ENA(1) |
				      S_0286D0_PERSP_CENTER_ENA(1) |
				      S_0286D0_PERSP_CENTROID_ENA(1) |
				      S_0286D0_LINEAR_SAMPLE_ENA(1) |
				      S_0286D0_LINEAR_CENTER_ENA(1) |
				      S_0286D0_LINEAR_CENTROID_ENA(1) |
				      S_0286D0_FRONT_FACE_ENA(1) |
				      S_0286D0_POS_FIXED_PT_ENA(1));
	}

	shader->info.num_input_sgprs = 0;
	shader->info.num_input_vgprs = 0;

	/* Count the SGPR/VGPR footprint of the declared arguments. */
	for (i = 0; i <= last_sgpr; ++i)
		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;

	for (; i < num_params; ++i)
		shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;

	assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
	shader->info.num_input_vgprs -= num_prolog_vgprs;

	if (!ctx->screen->has_ds_bpermute &&
	    bld_base->info &&
	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
		ctx->lds =
			LLVMAddGlobalInAddressSpace(gallivm->module,
						    LLVMArrayType(ctx->i32, 64),
						    "ddxy_lds",
						    LOCAL_ADDR_SPACE);

	if (shader->key.as_ls ||
	    ctx->type == PIPE_SHADER_TESS_CTRL ||
	    /* GFX9 has the ESGS ring buffer in LDS. */
	    (ctx->screen->b.chip_class >= GFX9 &&
	     (shader->key.as_es ||
	      ctx->type == PIPE_SHADER_GEOMETRY)))
		declare_lds_as_pointer(ctx);
}
4535
4536 /**
4537 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4538 * for later use.
4539 */
4540 static void preload_ring_buffers(struct si_shader_context *ctx)
4541 {
4542 struct gallivm_state *gallivm = &ctx->gallivm;
4543 LLVMBuilderRef builder = gallivm->builder;
4544
4545 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4546 ctx->param_rw_buffers);
4547
4548 if (ctx->screen->b.chip_class <= VI &&
4549 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4550 unsigned ring =
4551 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4552 : SI_ES_RING_ESGS;
4553 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4554
4555 ctx->esgs_ring =
4556 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4557 }
4558
4559 if (ctx->shader->is_gs_copy_shader) {
4560 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4561
4562 ctx->gsvs_ring[0] =
4563 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4564 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4565 const struct si_shader_selector *sel = ctx->shader->selector;
4566 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4567 LLVMValueRef base_ring;
4568
4569 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4570
4571 /* The conceptual layout of the GSVS ring is
4572 * v0c0 .. vLv0 v0c1 .. vLc1 ..
4573 * but the real memory layout is swizzled across
4574 * threads:
4575 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4576 * t16v0c0 ..
4577 * Override the buffer descriptor accordingly.
4578 */
4579 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4580 uint64_t stream_offset = 0;
4581
4582 for (unsigned stream = 0; stream < 4; ++stream) {
4583 unsigned num_components;
4584 unsigned stride;
4585 unsigned num_records;
4586 LLVMValueRef ring, tmp;
4587
4588 num_components = sel->info.num_stream_output_components[stream];
4589 if (!num_components)
4590 continue;
4591
4592 stride = 4 * num_components * sel->gs_max_out_vertices;
4593
4594 /* Limit on the stride field for <= CIK. */
4595 assert(stride < (1 << 14));
4596
4597 num_records = 64;
4598
4599 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
4600 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
4601 tmp = LLVMBuildAdd(builder, tmp,
4602 LLVMConstInt(ctx->i64,
4603 stream_offset, 0), "");
4604 stream_offset += stride * 64;
4605
4606 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
4607 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
4608 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
4609 tmp = LLVMBuildOr(builder, tmp,
4610 LLVMConstInt(ctx->i32,
4611 S_008F04_STRIDE(stride) |
4612 S_008F04_SWIZZLE_ENABLE(1), 0), "");
4613 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
4614 ring = LLVMBuildInsertElement(builder, ring,
4615 LLVMConstInt(ctx->i32, num_records, 0),
4616 LLVMConstInt(ctx->i32, 2, 0), "");
4617 ring = LLVMBuildInsertElement(builder, ring,
4618 LLVMConstInt(ctx->i32,
4619 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4620 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4621 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4622 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
4623 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4624 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
4625 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
4626 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
4627 S_008F0C_ADD_TID_ENABLE(1),
4628 0),
4629 LLVMConstInt(ctx->i32, 3, 0), "");
4630
4631 ctx->gsvs_ring[stream] = ring;
4632 }
4633 }
4634 }
4635
4636 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4637 LLVMValueRef param_rw_buffers,
4638 unsigned param_pos_fixed_pt)
4639 {
4640 struct gallivm_state *gallivm = &ctx->gallivm;
4641 LLVMBuilderRef builder = gallivm->builder;
4642 LLVMValueRef slot, desc, offset, row, bit, address[2];
4643
4644 /* Use the fixed-point gl_FragCoord input.
4645 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4646 * per coordinate to get the repeating effect.
4647 */
4648 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4649 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4650
4651 /* Load the buffer descriptor. */
4652 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4653 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
4654
4655 /* The stipple pattern is 32x32, each row has 32 bits. */
4656 offset = LLVMBuildMul(builder, address[1],
4657 LLVMConstInt(ctx->i32, 4, 0), "");
4658 row = buffer_load_const(ctx, desc, offset);
4659 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
4660 bit = LLVMBuildLShr(builder, row, address[0], "");
4661 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4662
4663 /* The intrinsic kills the thread if arg < 0. */
4664 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
4665 LLVMConstReal(ctx->f32, -1), "");
4666 ac_build_kill(&ctx->ac, bit);
4667 }
4668
/* Parse the register/value pairs that LLVM writes into the binary's config
 * section and decode them into *conf (SGPR/VGPR counts, LDS size, scratch
 * size, SPI PS input masks, spill statistics).
 *
 * \param binary         compiled shader binary that contains the config data
 * \param conf           output: decoded shader configuration
 * \param symbol_offset  offset of the symbol whose config block to read
 *                       (non-zero when several kernels share one binary)
 */
void si_shader_binary_read_config(struct ac_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct ac_shader_reloc *reloc = &binary->relocs[i];

		/* The scratch rsrc relocations are only present when the code
		 * actually references the scratch buffer descriptor. */
		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* The config section is a list of 8-byte little-endian
	 * (register offset, value) pairs. */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* The SGPRS/VGPRS fields are encoded in granules of
			 * 8 SGPRs and 4 VGPRs respectively. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode = G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn only once per process about unknown registers,
			 * to avoid flooding stderr. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* If INPUT_ADDR wasn't present in the config, mirror INPUT_ENA. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
4752
4753 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4754 uint64_t scratch_va)
4755 {
4756 unsigned i;
4757 uint32_t scratch_rsrc_dword0 = scratch_va;
4758 uint32_t scratch_rsrc_dword1 =
4759 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
4760
4761 /* Enable scratch coalescing. */
4762 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
4763
4764 for (i = 0 ; i < shader->binary.reloc_count; i++) {
4765 const struct ac_shader_reloc *reloc =
4766 &shader->binary.relocs[i];
4767 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
4768 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4769 &scratch_rsrc_dword0, 4);
4770 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4771 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4772 &scratch_rsrc_dword1, 4);
4773 }
4774 }
4775 }
4776
4777 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4778 {
4779 unsigned size = shader->binary.code_size;
4780
4781 if (shader->prolog)
4782 size += shader->prolog->binary.code_size;
4783 if (shader->previous_stage)
4784 size += shader->previous_stage->binary.code_size;
4785 if (shader->prolog2)
4786 size += shader->prolog2->binary.code_size;
4787 if (shader->epilog)
4788 size += shader->epilog->binary.code_size;
4789 return size;
4790 }
4791
4792 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4793 {
4794 const struct ac_shader_binary *prolog =
4795 shader->prolog ? &shader->prolog->binary : NULL;
4796 const struct ac_shader_binary *previous_stage =
4797 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4798 const struct ac_shader_binary *prolog2 =
4799 shader->prolog2 ? &shader->prolog2->binary : NULL;
4800 const struct ac_shader_binary *epilog =
4801 shader->epilog ? &shader->epilog->binary : NULL;
4802 const struct ac_shader_binary *mainb = &shader->binary;
4803 unsigned bo_size = si_get_shader_binary_size(shader) +
4804 (!epilog ? mainb->rodata_size : 0);
4805 unsigned char *ptr;
4806
4807 assert(!prolog || !prolog->rodata_size);
4808 assert(!previous_stage || !previous_stage->rodata_size);
4809 assert(!prolog2 || !prolog2->rodata_size);
4810 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4811 !mainb->rodata_size);
4812 assert(!epilog || !epilog->rodata_size);
4813
4814 r600_resource_reference(&shader->bo, NULL);
4815 shader->bo = (struct r600_resource*)
4816 pipe_buffer_create(&sscreen->b.b, 0,
4817 PIPE_USAGE_IMMUTABLE,
4818 align(bo_size, SI_CPDMA_ALIGNMENT));
4819 if (!shader->bo)
4820 return -ENOMEM;
4821
4822 /* Upload. */
4823 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4824 PIPE_TRANSFER_READ_WRITE |
4825 PIPE_TRANSFER_UNSYNCHRONIZED);
4826
4827 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4828 * endian-independent. */
4829 if (prolog) {
4830 memcpy(ptr, prolog->code, prolog->code_size);
4831 ptr += prolog->code_size;
4832 }
4833 if (previous_stage) {
4834 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4835 ptr += previous_stage->code_size;
4836 }
4837 if (prolog2) {
4838 memcpy(ptr, prolog2->code, prolog2->code_size);
4839 ptr += prolog2->code_size;
4840 }
4841
4842 memcpy(ptr, mainb->code, mainb->code_size);
4843 ptr += mainb->code_size;
4844
4845 if (epilog)
4846 memcpy(ptr, epilog->code, epilog->code_size);
4847 else if (mainb->rodata_size > 0)
4848 memcpy(ptr, mainb->rodata, mainb->rodata_size);
4849
4850 sscreen->b.ws->buffer_unmap(shader->bo->buf);
4851 return 0;
4852 }
4853
4854 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
4855 struct pipe_debug_callback *debug,
4856 const char *name, FILE *file)
4857 {
4858 char *line, *p;
4859 unsigned i, count;
4860
4861 if (binary->disasm_string) {
4862 fprintf(file, "Shader %s disassembly:\n", name);
4863 fprintf(file, "%s", binary->disasm_string);
4864
4865 if (debug && debug->debug_message) {
4866 /* Very long debug messages are cut off, so send the
4867 * disassembly one line at a time. This causes more
4868 * overhead, but on the plus side it simplifies
4869 * parsing of resulting logs.
4870 */
4871 pipe_debug_message(debug, SHADER_INFO,
4872 "Shader Disassembly Begin");
4873
4874 line = binary->disasm_string;
4875 while (*line) {
4876 p = util_strchrnul(line, '\n');
4877 count = p - line;
4878
4879 if (count) {
4880 pipe_debug_message(debug, SHADER_INFO,
4881 "%.*s", count, line);
4882 }
4883
4884 if (!*p)
4885 break;
4886 line = p + 1;
4887 }
4888
4889 pipe_debug_message(debug, SHADER_INFO,
4890 "Shader Disassembly End");
4891 }
4892 } else {
4893 fprintf(file, "Shader %s binary:\n", name);
4894 for (i = 0; i < binary->code_size; i += 4) {
4895 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
4896 binary->code[i + 3], binary->code[i + 2],
4897 binary->code[i + 1], binary->code[i]);
4898 }
4899 }
4900 }
4901
/* Print shader resource-usage statistics (register counts, LDS, scratch,
 * estimated wave occupancy) to \p file and forward a one-line summary to the
 * debug callback.
 *
 * \param check_debug_option  if true, only print when dumping is enabled for
 *                            this shader stage via R600_DEBUG
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 const struct si_shader *shader,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file,
				 bool check_debug_option)
{
	const struct si_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
	unsigned code_size = si_get_shader_binary_size(shader);
	/* LDS allocation granularity: 512 bytes on CIK+, 256 before. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves = 10;  /* hardware maximum waves per SIMD */

	/* Compute LDS usage for PS. */
	switch (processor) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			/* Divide the per-threadgroup LDS among the waves of
			 * the group (64 threads per wave). */
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size, 64);
		}
		break;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* VI has a larger SGPR file (800 addressable per SIMD)
		 * than earlier chips (512). */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied). */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Private memory VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs,
			conf->private_mem_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Always send the summary to the debug callback (e.g. for shader-db). */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs, conf->private_mem_vgprs);
}
4994
4995 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
4996 {
4997 switch (processor) {
4998 case PIPE_SHADER_VERTEX:
4999 if (shader->key.as_es)
5000 return "Vertex Shader as ES";
5001 else if (shader->key.as_ls)
5002 return "Vertex Shader as LS";
5003 else
5004 return "Vertex Shader as VS";
5005 case PIPE_SHADER_TESS_CTRL:
5006 return "Tessellation Control Shader";
5007 case PIPE_SHADER_TESS_EVAL:
5008 if (shader->key.as_es)
5009 return "Tessellation Evaluation Shader as ES";
5010 else
5011 return "Tessellation Evaluation Shader as VS";
5012 case PIPE_SHADER_GEOMETRY:
5013 if (shader->is_gs_copy_shader)
5014 return "GS Copy Shader as VS";
5015 else
5016 return "Geometry Shader";
5017 case PIPE_SHADER_FRAGMENT:
5018 return "Pixel Shader";
5019 case PIPE_SHADER_COMPUTE:
5020 return "Compute Shader";
5021 default:
5022 return "Unknown Shader";
5023 }
5024 }
5025
/* Dump everything known about the shader: key, recorded LLVM IR (if any),
 * disassembly of every part in upload order, and resource statistics.
 *
 * \param check_debug_option  if true, respect R600_DEBUG / DBG_NO_ASM to
 *                            decide what to print; if false, print everything
 */
void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file, bool check_debug_option)
{
	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, shader, file);

	/* llvm_ir_string is only non-NULL when IR recording was enabled. */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		/* Print parts in the same order they are uploaded and run. */
		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(&shader->previous_stage->binary,
						   debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(&shader->prolog2->binary,
						   debug, "prolog2", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, debug, processor, file,
			     check_debug_option);
}
5066
/* Run the LLVM backend on \p mod, producing machine code and the decoded
 * shader config in *binary / *conf.
 *
 * Optionally dumps the IR beforehand and records it as a string, and allows
 * replacing the binary from a file for debugging (si_replace_shader).
 *
 * \return 0 on success, non-zero LLVM error or -EINVAL (disallowed rodata).
 */
static int si_compile_llvm(struct si_screen *sscreen,
			   struct ac_shader_binary *binary,
			   struct si_shader_config *conf,
			   LLVMTargetMachineRef tm,
			   LLVMModuleRef mod,
			   struct pipe_debug_callback *debug,
			   unsigned processor,
			   const char *name)
{
	int r = 0;
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		/* Dump the (post-optimization) IR unless suppressed. */
		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			ac_dump_module(mod);
			fprintf(stderr, "\n");
		}
	}

	if (sscreen->record_llvm_ir) {
		/* Keep our own copy; the LLVM string must be disposed. */
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* si_replace_shader may substitute the binary from a file (debug
	 * feature); only compile if it didn't. */
	if (!si_replace_shader(count, binary)) {
		r = si_llvm_compile(mod, binary, tm, debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The raw config data has been decoded above; drop it. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
5136
5137 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5138 {
5139 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5140 LLVMBuildRetVoid(ctx->gallivm.builder);
5141 else
5142 LLVMBuildRet(ctx->gallivm.builder, ret);
5143 }
5144
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The GS copy shader runs as a hardware VS after the GS: it reads the chosen
 * vertex stream back from the GSVS ring, performs streamout if needed, and
 * exports stream 0 as the final vertex output.
 *
 * \return a newly allocated si_shader (caller owns it), or NULL on failure.
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	/* Points at ctx.gallivm; only dereferenced after si_init_shader_ctx. */
	struct gallivm_state *gallivm = &ctx.gallivm;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}


	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The copy shader is compiled as a hardware vertex shader. */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = gallivm->builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Byte offset of this vertex within the GSVS ring (4 bytes/dword). */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
						    ctx.param_vertex_id), 4);

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* output_streams packs 2 bits of stream ID per channel. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	/* Build a switch on stream_id with one case per active stream. */
	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams > 0 only matter for transform feedback. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Channels not written, or belonging to a
				 * different stream, stay undefined. */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 feeds the rasterizer. */
		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
5295
5296 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5297 const struct si_vs_prolog_bits *prolog,
5298 const char *prefix, FILE *f)
5299 {
5300 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5301 prefix, prolog->instance_divisor_is_one);
5302 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5303 prefix, prolog->instance_divisor_is_fetched);
5304
5305 fprintf(f, " mono.vs.fix_fetch = {");
5306 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5307 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5308 fprintf(f, "}\n");
5309 }
5310
/* Print the shader key (all variant-specific compile options) for the given
 * shader stage to \p f, for debugging shader variants.
 */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f)
{
	const struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ls = %u\n", key->as_ls);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the LS prolog is merged into the TCS. */
		if (shader->selector->screen->b.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no interesting key bits. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the ES (here a VS) prolog is merged into the GS. */
		if (shader->selector->screen->b.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* Optimizations only apply when the stage is the last VS-like stage
	 * before rasterization (not ES/LS). */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, "  opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
5390
/* Initialize the shader context and register the radeonsi-specific TGSI
 * opcode/fetch handlers on top of the generic gallivm defaults.
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;

	si_llvm_context_init(ctx, sscreen, tm);

	bld_base = &ctx->bld_base;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes (PS). */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;

	/* Derivative opcodes share one implementation. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Subgroup / cross-lane opcodes. */
	bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
	bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;

	/* GS vertex emission and synchronization. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
}
5429
5430 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5431 {
5432 struct si_shader *shader = ctx->shader;
5433 struct tgsi_shader_info *info = &shader->selector->info;
5434
5435 if ((ctx->type != PIPE_SHADER_VERTEX &&
5436 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5437 shader->key.as_ls ||
5438 shader->key.as_es)
5439 return;
5440
5441 ac_optimize_vs_outputs(&ctx->ac,
5442 ctx->main_fn,
5443 shader->info.vs_output_param_offset,
5444 info->num_outputs,
5445 &shader->info.nr_param_exports);
5446 }
5447
5448 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5449 {
5450 ctx->shader->config.private_mem_vgprs = 0;
5451
5452 /* Process all LLVM instructions. */
5453 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5454 while (bb) {
5455 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5456
5457 while (next) {
5458 LLVMValueRef inst = next;
5459 next = LLVMGetNextInstruction(next);
5460
5461 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5462 continue;
5463
5464 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5465 /* No idea why LLVM aligns allocas to 4 elements. */
5466 unsigned alignment = LLVMGetAlignment(inst);
5467 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
5468 ctx->shader->config.private_mem_vgprs += dw_size;
5469 }
5470 bb = LLVMGetNextBasicBlock(bb);
5471 }
5472 }
5473
5474 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5475 {
5476 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5477 lp_build_intrinsic(ctx->gallivm.builder,
5478 "llvm.amdgcn.init.exec", ctx->voidt,
5479 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5480 }
5481
5482 static void si_init_exec_from_input(struct si_shader_context *ctx,
5483 unsigned param, unsigned bitoffset)
5484 {
5485 LLVMValueRef args[] = {
5486 LLVMGetParam(ctx->main_fn, param),
5487 LLVMConstInt(ctx->i32, bitoffset, 0),
5488 };
5489 lp_build_intrinsic(ctx->gallivm.builder,
5490 "llvm.amdgcn.init.exec.from.input",
5491 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5492 }
5493
/**
 * Translate the selector's TGSI tokens into LLVM IR for the current stage
 * (ctx->type) and build the main shader function.
 *
 * Sets up per-stage input/output callbacks, creates the LLVM function,
 * preloads ring buffers, performs GFX9 merged-shader EXEC handling, then
 * runs the TGSI->LLVM translator and emits the final return.
 *
 * \param ctx           shader context (ctx->shader / ctx->type selected
 *                      by the caller)
 * \param is_monolithic true if prolog/epilog parts are compiled into one
 *                      wrapper function (see si_build_wrapper_function)
 * \return false if the TGSI->LLVM translation failed
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 bool is_monolithic)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	/* Hook up stage-specific input fetch and epilogue emitters. */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		/* The epilogue depends on whether the VS feeds TCS (LS),
		 * GS (ES), or the rasterizer directly. */
		if (shader->key.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		bld_base->emit_epilogue = si_llvm_return_fs_outputs;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 *   EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 *   an if-statement. This is required for correctness in geometry
	 *   shaders, to ensure that empty GS waves do not send GS_EMIT and
	 *   GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 */
	if (ctx->screen->b.chip_class >= GFX9) {
		if (!is_monolithic &&
		    sel->info.num_instructions > 1 && /* not empty shader */
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !sel->vs_needs_prolog))) {
			/* First half of a merged shader with no prolog:
			 * EXEC comes from the merged wave info SGPR. */
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY) {
			/* Second half (TCS/GS): enable all lanes so that
			 * the barrier below executes for the whole wave. */
			if (!is_monolithic)
				si_init_exec_full_mask(ctx);

			/* The barrier must execute for all shaders in a
			 * threadgroup.
			 */
			si_llvm_emit_barrier(NULL, bld_base, NULL);

			/* Bits 8..15 of the merged wave info hold the
			 * second shader's thread count; only those lanes
			 * may run the main part (empty GS waves must not
			 * send GS_EMIT/GS_CUT). The if is closed by the
			 * epilogue via ctx->merged_wrap_if_state. */
			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
			LLVMValueRef ena =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), num_threads, "");
			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
		}
	}

	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		int i;
		/* Per-stream counters of vertices emitted so far. */
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(&ctx->gallivm,
						ctx->i32, "");
		}
	}

	if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
	    ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
		/* This is initialized to 0.0 = not kill. */
		ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
	}

	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
		return false;
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
5605
5606 /**
5607 * Compute the VS prolog key, which contains all the information needed to
5608 * build the VS prolog function, and set shader->info bits where needed.
5609 *
5610 * \param info Shader info of the vertex shader.
5611 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5612 * \param prolog_key Key of the VS prolog
5613 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5614 * \param key Output shader part key.
5615 */
5616 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5617 unsigned num_input_sgprs,
5618 const struct si_vs_prolog_bits *prolog_key,
5619 struct si_shader *shader_out,
5620 union si_shader_part_key *key)
5621 {
5622 memset(key, 0, sizeof(*key));
5623 key->vs_prolog.states = *prolog_key;
5624 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5625 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5626 key->vs_prolog.as_ls = shader_out->key.as_ls;
5627
5628 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5629 key->vs_prolog.as_ls = 1;
5630 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5631 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5632 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5633 }
5634
5635 /* Enable loading the InstanceID VGPR. */
5636 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5637
5638 if ((key->vs_prolog.states.instance_divisor_is_one |
5639 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5640 shader_out->info.uses_instanceid = true;
5641 }
5642
5643 /**
5644 * Compute the PS prolog key, which contains all the information needed to
5645 * build the PS prolog function, and set related bits in shader->config.
5646 */
static void si_get_ps_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key,
				 bool separate_prolog)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs whole-quad mode when derivatives are used and
	 * it emits interpolation or barycentric-optimization code. */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Record, for each of the two color inputs, which input
		 * VGPR pair holds its barycentric coordinates and enable
		 * the matching SPI_PS_INPUT bits. */
		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* -1 = flat: no barycentrics needed. */
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* Perspective barycentrics occupy input
				 * VGPRs 0/1 (sample), 2/3 (center),
				 * 4/5 (centroid). */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}
5762
5763 /**
5764 * Check whether a PS prolog is required based on the key.
5765 */
5766 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5767 {
5768 return key->ps_prolog.colors_read ||
5769 key->ps_prolog.states.force_persp_sample_interp ||
5770 key->ps_prolog.states.force_linear_sample_interp ||
5771 key->ps_prolog.states.force_persp_center_interp ||
5772 key->ps_prolog.states.force_linear_center_interp ||
5773 key->ps_prolog.states.bc_optimize_for_persp ||
5774 key->ps_prolog.states.bc_optimize_for_linear ||
5775 key->ps_prolog.states.poly_stipple;
5776 }
5777
5778 /**
5779 * Compute the PS epilog key, which contains all the information needed to
5780 * build the PS epilog function.
5781 */
5782 static void si_get_ps_epilog_key(struct si_shader *shader,
5783 union si_shader_part_key *key)
5784 {
5785 struct tgsi_shader_info *info = &shader->selector->info;
5786 memset(key, 0, sizeof(*key));
5787 key->ps_epilog.colors_written = info->colors_written;
5788 key->ps_epilog.writes_z = info->writes_z;
5789 key->ps_epilog.writes_stencil = info->writes_stencil;
5790 key->ps_epilog.writes_samplemask = info->writes_samplemask;
5791 key->ps_epilog.states = shader->key.part.ps.epilog;
5792 }
5793
5794 /**
5795 * Build the GS prolog function. Rotate the input vertices for triangle strips
5796 * with adjacency.
5797 */
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	unsigned num_sgprs, num_vgprs;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
	LLVMTypeRef returns[48];
	LLVMValueRef func, ret;

	/* On GFX9 the GS is merged with the ES, so vertex offsets are
	 * packed two-per-SGPR-style into 3 VGPRs instead of 6. */
	if (ctx->screen->b.chip_class >= GFX9) {
		num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
		num_vgprs = 5; /* ES inputs are not needed by GS */
	} else {
		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
		num_vgprs = 8;
	}

	/* All SGPR inputs pass through as i32; VGPRs are returned as f32
	 * (the calling convention for shader-part return values). */
	for (unsigned i = 0; i < num_sgprs; ++i) {
		params[i] = ctx->i32;
		returns[i] = ctx->i32;
	}

	for (unsigned i = 0; i < num_vgprs; ++i) {
		params[num_sgprs + i] = ctx->i32;
		returns[num_sgprs + i] = ctx->f32;
	}

	/* Create the function. */
	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
			   params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
	func = ctx->main_fn;

	/* Set the full EXEC mask for the prolog, because we are only fiddling
	 * with registers here. The main shader part will set the correct EXEC
	 * mask.
	 */
	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
		si_init_exec_full_mask(ctx);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (unsigned i = 0; i < num_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
	}
	for (unsigned i = 0; i < num_vgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
		p = LLVMBuildBitCast(builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
	}

	if (key->gs_prolog.states.tri_strip_adj_fix) {
		/* Remap the input vertices for every other primitive. */
		const unsigned gfx6_vtx_params[6] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 3,
			num_sgprs + 4,
			num_sgprs + 5,
			num_sgprs + 6
		};
		/* GFX9 packs two 16-bit vertex offsets per VGPR. */
		const unsigned gfx9_vtx_params[3] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 4,
		};
		LLVMValueRef vtx_in[6], vtx_out[6];
		LLVMValueRef prim_id, rotate;

		/* Unpack the 6 vertex offsets into vtx_in[0..5]. */
		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
				vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
			}
		} else {
			for (unsigned i = 0; i < 6; i++)
				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
		}

		/* Rotate on odd primitives (low bit of the primitive ID). */
		prim_id = LLVMGetParam(func, num_sgprs + 2);
		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");

		/* Rotate the six vertices by 4 positions when 'rotate'. */
		for (unsigned i = 0; i < 6; ++i) {
			LLVMValueRef base, rotated;
			base = vtx_in[i];
			rotated = vtx_in[(i + 4) % 6];
			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
		}

		/* Write the (possibly rotated) offsets back into the return
		 * value, re-packing pairs into single VGPRs on GFX9. */
		if (ctx->screen->b.chip_class >= GFX9) {
			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef hi, out;

				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
						  LLVMConstInt(ctx->i32, 16, 0), "");
				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
				out = LLVMBuildBitCast(builder, out, ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx9_vtx_params[i], "");
			}
		} else {
			for (unsigned i = 0; i < 6; i++) {
				LLVMValueRef out;

				out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx6_vtx_params[i], "");
			}
		}
	}

	LLVMBuildRet(builder, ret);
}
5914
5915 /**
5916 * Given a list of shader part functions, build a wrapper function that
5917 * runs them in sequence to form a monolithic shader.
5918 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
				      LLVMValueRef *parts,
				      unsigned num_parts,
				      unsigned main_part,
				      unsigned next_shader_first_part)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	/* PS epilog has one arg per color component; gfx9 merged shader
	 * prologs need to forward 32 user SGPRs.
	 */
	LLVMTypeRef param_types[64];
	LLVMValueRef initial[64], out[64];
	LLVMTypeRef function_type;
	unsigned num_params;
	unsigned num_out, initial_num_out;
	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned last_sgpr_param;
	unsigned gprs;
	struct lp_build_if_state if_state;

	/* Force all parts to be inlined into the wrapper and hide them
	 * from the linker. */
	for (unsigned i = 0; i < num_parts; ++i) {
		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	/* Count GPRs (in dword units) used by the first part's signature. */
	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_params = LLVMCountParamTypes(function_type);

	for (unsigned i = 0; i < num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			/* SGPR params must all precede VGPR params. */
			assert(num_vgprs == 0);
			num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}
	assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));

	/* Build the wrapper's parameter list from the MAIN part's types,
	 * walking until the counted number of GPR dwords is covered. */
	num_params = 0;
	last_sgpr_param = 0;
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
		unsigned size;

		param_types[num_params] = LLVMTypeOf(param);
		if (gprs < num_sgprs)
			last_sgpr_param = num_params;
		size = llvm_get_type_size(param_types[num_params]) / 4;
		num_params++;

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		/* A single param must not straddle the SGPR/VGPR boundary. */
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
			   last_sgpr_param,
			   si_get_max_workgroup_size(ctx->shader));

	/* Merged shaders begin with a conditional section; start from a
	 * full EXEC mask so the condition itself runs for all lanes. */
	if (is_merged_shader(ctx->shader))
		si_init_exec_full_mask(ctx);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	/* Flatten every wrapper argument into dword-sized values in out[]:
	 * scalars pass through, vectors/pointers are split per dword. */
	for (unsigned i = 0; i < num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
		unsigned size = llvm_get_type_size(param_type) / 4;

		if (size == 1) {
			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (i <= last_sgpr_param)
			num_out_sgpr = num_out;
	}

	/* Keep a snapshot of the wrapper's own inputs: the second half of a
	 * merged shader must restart from these (see below). */
	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[48];
		LLVMValueRef ret;
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;

		num_params = LLVMCountParams(parts[part]);
		assert(num_params <= ARRAY_SIZE(param_types));

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_merged_shader(ctx->shader) && part == 0) {
			/* initial[3] is the merged wave info SGPR; its low
			 * 7 bits hold the first half's thread count
			 * (presumably matching si_init_exec_from_input's
			 * bit layout — see that function). */
			LLVMValueRef ena, count = initial[3];

			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			lp_build_if(&if_state, &ctx->gallivm, ena);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = llvm_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
				/* byval is invalid once the part is called as
				 * a plain function; mark the arg inreg instead. */
#if HAVE_LLVM < 0x0400
				LLVMRemoveAttribute(param, LLVMByValAttribute);
#else
				unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
				LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
#endif
				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
			assert(is_sgpr || out_idx >= num_out_sgpr);

			/* Re-assemble multi-dword params from consecutive
			 * out[] entries. */
			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
					arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");

		if (is_merged_shader(ctx->shader) &&
		    part + 1 == next_shader_first_part) {
			lp_build_endif(&if_state);

			/* The second half of the merged shader should use
			 * the inputs from the toplevel (wrapper) function,
			 * not the return value from the last call.
			 *
			 * That's because the last call was executed condi-
			 * tionally, so we can't consume it in the main
			 * block.
			 */
			memcpy(out, initial, sizeof(initial));
			num_out = initial_num_out;
			num_out_sgpr = initial_num_out_sgpr;
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				assert(num_out < ARRAY_SIZE(out));
				out[num_out++] = val;

				/* i32 return members are SGPRs, and they must
				 * all come before any f32 (VGPR) members. */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	LLVMBuildRetVoid(builder);
}
6152
/**
 * Compile a TGSI shader into GPU bytecode.
 *
 * Translates the main part, optionally builds prolog/epilog parts and a
 * monolithic wrapper, runs LLVM optimization, compiles to machine code,
 * validates register usage, and fills in shader->info fields.
 *
 * \param sscreen        screen
 * \param tm             LLVM target machine
 * \param shader         shader to compile (shader->selector provides TGSI)
 * \param is_monolithic  compile prologs/epilogs into one function
 * \param debug          debug callback for compiler diagnostics
 * \return 0 on success, negative on failure
 */
int si_compile_tgsi_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader *shader,
			   bool is_monolithic,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	int r = -1;

	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
	 * conversion fails. */
	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
		tgsi_dump(sel->tokens, 0);
		si_dump_streamout(&sel->so);
	}

	si_init_shader_ctx(&ctx, sscreen, tm);
	si_llvm_context_set_tgsi(&ctx, shader);
	ctx.separate_prolog = !is_monolithic;

	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(shader->info.vs_output_param_offset));

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	ctx.load_system_value = declare_system_value;

	/* Build the main function first; the monolithic cases below add
	 * prolog/epilog parts around it. */
	if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
		si_llvm_dispose(&ctx);
		return -1;
	}

	if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
		/* Monolithic VS: optional prolog + main. */
		LLVMValueRef parts[2];
		bool need_prolog = sel->vs_needs_prolog;

		parts[1] = ctx.main_fn;

		if (need_prolog) {
			union si_shader_part_key prolog_key;
			si_get_vs_prolog_key(&sel->info,
					     shader->info.num_input_sgprs,
					     &shader->key.part.vs.prolog,
					     shader, &prolog_key);
			si_build_vs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_build_wrapper_function(&ctx, parts + !need_prolog,
					  1 + need_prolog, need_prolog, 0);
	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
		if (sscreen->b.chip_class >= GFX9) {
			/* GFX9 merged LS+HS: [VS prolog,] VS-as-LS main,
			 * TCS main, TCS epilog. */
			struct si_shader_selector *ls = shader->key.part.tcs.ls;
			LLVMValueRef parts[4];

			/* TCS main part */
			parts[2] = ctx.main_fn;

			/* TCS epilog */
			union si_shader_part_key tcs_epilog_key;
			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
			parts[3] = ctx.main_fn;

			/* VS prolog */
			if (ls->vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&ls->info,
						     shader->info.num_input_sgprs,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				parts[0] = ctx.main_fn;
			}

			/* VS as LS main part */
			struct si_shader shader_ls = {};
			shader_ls.selector = ls;
			shader_ls.key.as_ls = 1;
			shader_ls.key.mono = shader->key.mono;
			shader_ls.key.opt = shader->key.opt;
			si_llvm_context_set_tgsi(&ctx, &shader_ls);

			if (!si_compile_tgsi_main(&ctx, true)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= ls->info.uses_instanceid;
			parts[1] = ctx.main_fn;

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_TESS_CTRL;

			/* next_shader_first_part points at the TCS main so
			 * the LS half is wrapped in the if-block. */
			si_build_wrapper_function(&ctx,
						  parts + !ls->vs_needs_prolog,
						  4 - !ls->vs_needs_prolog, 0,
						  ls->vs_needs_prolog ? 2 : 1);
		} else {
			/* Pre-GFX9 TCS: main + epilog. */
			LLVMValueRef parts[2];
			union si_shader_part_key epilog_key;

			parts[0] = ctx.main_fn;

			memset(&epilog_key, 0, sizeof(epilog_key));
			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_build_tcs_epilog_function(&ctx, &epilog_key);
			parts[1] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
		}
	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
		if (ctx.screen->b.chip_class >= GFX9) {
			/* GFX9 merged ES+GS: [ES prolog,] ES main,
			 * GS prolog, GS main. */
			struct si_shader_selector *es = shader->key.part.gs.es;
			LLVMValueRef es_prolog = NULL;
			LLVMValueRef es_main = NULL;
			LLVMValueRef gs_prolog = NULL;
			LLVMValueRef gs_main = ctx.main_fn;

			/* GS prolog */
			union si_shader_part_key gs_prolog_key;
			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			gs_prolog_key.gs_prolog.is_monolithic = true;
			si_build_gs_prolog_function(&ctx, &gs_prolog_key);
			gs_prolog = ctx.main_fn;

			/* ES prolog */
			if (es->vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				/* NOTE(review): this reuses the TCS LS-prolog
				 * key bits for the ES prolog — presumably the
				 * VS-prolog bit layout is shared between the
				 * tcs and gs key unions; confirm this is
				 * intentional and not a copy-paste of the
				 * LS+HS path above. */
				si_get_vs_prolog_key(&es->info,
						     shader->info.num_input_sgprs,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				es_prolog = ctx.main_fn;
			}

			/* ES main part */
			struct si_shader shader_es = {};
			shader_es.selector = es;
			shader_es.key.as_es = 1;
			shader_es.key.mono = shader->key.mono;
			shader_es.key.opt = shader->key.opt;
			si_llvm_context_set_tgsi(&ctx, &shader_es);

			if (!si_compile_tgsi_main(&ctx, true)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= es->info.uses_instanceid;
			es_main = ctx.main_fn;

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_GEOMETRY;

			/* Prepare the array of shader parts. */
			LLVMValueRef parts[4];
			unsigned num_parts = 0, main_part, next_first_part;

			if (es_prolog)
				parts[num_parts++] = es_prolog;

			parts[main_part = num_parts++] = es_main;
			parts[next_first_part = num_parts++] = gs_prolog;
			parts[num_parts++] = gs_main;

			si_build_wrapper_function(&ctx, parts, num_parts,
						  main_part, next_first_part);
		} else {
			/* Pre-GFX9 GS: prolog + main. */
			LLVMValueRef parts[2];
			union si_shader_part_key prolog_key;

			parts[1] = ctx.main_fn;

			memset(&prolog_key, 0, sizeof(prolog_key));
			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			si_build_gs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
		}
	} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
		/* Monolithic PS: [prolog,] main, epilog. */
		LLVMValueRef parts[3];
		union si_shader_part_key prolog_key;
		union si_shader_part_key epilog_key;
		bool need_prolog;

		si_get_ps_prolog_key(shader, &prolog_key, false);
		need_prolog = si_need_ps_prolog(&prolog_key);

		parts[need_prolog ? 1 : 0] = ctx.main_fn;

		if (need_prolog) {
			si_build_ps_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_get_ps_epilog_key(shader, &epilog_key);
		si_build_ps_epilog_function(&ctx, &epilog_key);
		parts[need_prolog ? 2 : 1] = ctx.main_fn;

		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
					  need_prolog ? 1 : 0, 0);
	}

	si_llvm_optimize_module(&ctx);

	/* Post-optimization transformations and analysis. */
	si_optimize_vs_outputs(&ctx);

	if ((debug && debug->debug_message) ||
	    r600_can_dump_shader(&sscreen->b, ctx.type))
		si_count_scratch_private_memory(&ctx);

	/* Compile to bytecode. */
	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
			    ctx.gallivm.module, debug, ctx.type, "TGSI shader");
	si_llvm_dispose(&ctx);
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		return r;
	}

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned wave_size = 64;
		unsigned max_vgprs = 256;
		unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
		unsigned max_sgprs_per_wave = 128;
		unsigned max_block_threads = si_get_max_workgroup_size(shader);
		unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);

		/* Derive per-wave register limits from the minimum wave
		 * occupancy required to fit the workgroup on one CU. */
		max_vgprs = max_vgprs / min_waves_per_simd;
		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = 0;
		shader->info.face_vgpr_index = -1;

		/* Walk SPI_PS_INPUT_ADDR in hardware VGPR order, summing
		 * the VGPRs each enabled input consumes. */
		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
	}

	return 0;
}
6461
6462 /**
6463 * Create, compile and return a shader part (prolog or epilog).
6464 *
6465 * \param sscreen screen
6466 * \param list list of shader parts of the same category
6467 * \param type shader type
 * \param prolog	whether the part being requested is a prolog
 * \param key		shader part key
6470 * \param tm LLVM target machine
6471 * \param debug debug callback
6472 * \param build the callback responsible for building the main function
6473 * \return non-NULL on success
6474 */
6475 static struct si_shader_part *
6476 si_get_shader_part(struct si_screen *sscreen,
6477 struct si_shader_part **list,
6478 enum pipe_shader_type type,
6479 bool prolog,
6480 union si_shader_part_key *key,
6481 LLVMTargetMachineRef tm,
6482 struct pipe_debug_callback *debug,
6483 void (*build)(struct si_shader_context *,
6484 union si_shader_part_key *),
6485 const char *name)
6486 {
6487 struct si_shader_part *result;
6488
6489 mtx_lock(&sscreen->shader_parts_mutex);
6490
6491 /* Find existing. */
6492 for (result = *list; result; result = result->next) {
6493 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6494 mtx_unlock(&sscreen->shader_parts_mutex);
6495 return result;
6496 }
6497 }
6498
6499 /* Compile a new one. */
6500 result = CALLOC_STRUCT(si_shader_part);
6501 result->key = *key;
6502
6503 struct si_shader shader = {};
6504 struct si_shader_context ctx;
6505 struct gallivm_state *gallivm = &ctx.gallivm;
6506
6507 si_init_shader_ctx(&ctx, sscreen, tm);
6508 ctx.shader = &shader;
6509 ctx.type = type;
6510
6511 switch (type) {
6512 case PIPE_SHADER_VERTEX:
6513 break;
6514 case PIPE_SHADER_TESS_CTRL:
6515 assert(!prolog);
6516 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6517 break;
6518 case PIPE_SHADER_GEOMETRY:
6519 assert(prolog);
6520 break;
6521 case PIPE_SHADER_FRAGMENT:
6522 if (prolog)
6523 shader.key.part.ps.prolog = key->ps_prolog.states;
6524 else
6525 shader.key.part.ps.epilog = key->ps_epilog.states;
6526 break;
6527 default:
6528 unreachable("bad shader part");
6529 }
6530
6531 build(&ctx, key);
6532
6533 /* Compile. */
6534 si_llvm_optimize_module(&ctx);
6535
6536 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6537 gallivm->module, debug, ctx.type, name)) {
6538 FREE(result);
6539 result = NULL;
6540 goto out;
6541 }
6542
6543 result->next = *list;
6544 *list = result;
6545
6546 out:
6547 si_llvm_dispose(&ctx);
6548 mtx_unlock(&sscreen->shader_parts_mutex);
6549 return result;
6550 }
6551
6552 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6553 {
6554 struct gallivm_state *gallivm = &ctx->gallivm;
6555 LLVMValueRef ptr[2], list;
6556
6557 /* Get the pointer to rw buffers. */
6558 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6559 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6560 list = lp_build_gather_values(gallivm, ptr, 2);
6561 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6562 list = LLVMBuildIntToPtr(gallivm->builder, list,
6563 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6564 return list;
6565 }
6566
6567 /**
6568 * Build the vertex shader prolog function.
6569 *
6570 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6571 * All inputs are returned unmodified. The vertex load indices are
6572 * stored after them, which will be used by the API VS for fetching inputs.
6573 *
6574 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6575 * input_v0,
6576 * input_v1,
6577 * input_v2,
6578 * input_v3,
6579 * (VertexID + BaseVertex),
6580 * (InstanceID + StartInstance),
6581 * (InstanceID / 2 + StartInstance)
6582 */
6583 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6584 union si_shader_part_key *key)
6585 {
6586 struct gallivm_state *gallivm = &ctx->gallivm;
6587 LLVMTypeRef *params, *returns;
6588 LLVMValueRef ret, func;
6589 int last_sgpr, num_params, num_returns, i;
6590 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
6591 key->vs_prolog.num_merged_next_stage_vgprs;
6592 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6593 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6594 num_input_vgprs;
6595 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6596
6597 ctx->param_vertex_id = first_vs_vgpr;
6598 ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
6599
6600 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6601 params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
6602 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
6603 sizeof(LLVMTypeRef));
6604 num_params = 0;
6605 num_returns = 0;
6606
6607 /* Declare input and output SGPRs. */
6608 num_params = 0;
6609 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6610 params[num_params++] = ctx->i32;
6611 returns[num_returns++] = ctx->i32;
6612 }
6613 last_sgpr = num_params - 1;
6614
6615 /* Preloaded VGPRs (outputs must be floats) */
6616 for (i = 0; i < num_input_vgprs; i++) {
6617 params[num_params++] = ctx->i32;
6618 returns[num_returns++] = ctx->f32;
6619 }
6620
6621 /* Vertex load indices. */
6622 for (i = 0; i <= key->vs_prolog.last_input; i++)
6623 returns[num_returns++] = ctx->f32;
6624
6625 /* Create the function. */
6626 si_create_function(ctx, "vs_prolog", returns, num_returns, params,
6627 num_params, last_sgpr, 0);
6628 func = ctx->main_fn;
6629
6630 if (key->vs_prolog.num_merged_next_stage_vgprs &&
6631 !key->vs_prolog.is_monolithic)
6632 si_init_exec_from_input(ctx, 3, 0);
6633
6634 /* Copy inputs to outputs. This should be no-op, as the registers match,
6635 * but it will prevent the compiler from overwriting them unintentionally.
6636 */
6637 ret = ctx->return_value;
6638 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6639 LLVMValueRef p = LLVMGetParam(func, i);
6640 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6641 }
6642 for (; i < num_params; i++) {
6643 LLVMValueRef p = LLVMGetParam(func, i);
6644 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
6645 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6646 }
6647
6648 /* Compute vertex load indices from instance divisors. */
6649 LLVMValueRef instance_divisor_constbuf = NULL;
6650
6651 if (key->vs_prolog.states.instance_divisor_is_fetched) {
6652 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6653 LLVMValueRef buf_index =
6654 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6655 instance_divisor_constbuf =
6656 ac_build_indexed_load_const(&ctx->ac, list, buf_index);
6657 }
6658
6659 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6660 bool divisor_is_one =
6661 key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6662 bool divisor_is_fetched =
6663 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6664 LLVMValueRef index;
6665
6666 if (divisor_is_one || divisor_is_fetched) {
6667 LLVMValueRef divisor = ctx->i32_1;
6668
6669 if (divisor_is_fetched) {
6670 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
6671 LLVMConstInt(ctx->i32, i * 4, 0));
6672 divisor = LLVMBuildBitCast(gallivm->builder, divisor,
6673 ctx->i32, "");
6674 }
6675
6676 /* InstanceID / Divisor + StartInstance */
6677 index = get_instance_index_for_fetch(ctx,
6678 user_sgpr_base +
6679 SI_SGPR_START_INSTANCE,
6680 divisor);
6681 } else {
6682 /* VertexID + BaseVertex */
6683 index = LLVMBuildAdd(gallivm->builder,
6684 LLVMGetParam(func, ctx->param_vertex_id),
6685 LLVMGetParam(func, user_sgpr_base +
6686 SI_SGPR_BASE_VERTEX), "");
6687 }
6688
6689 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
6690 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6691 num_params++, "");
6692 }
6693
6694 si_llvm_build_ret(ctx, ret);
6695 }
6696
6697 static bool si_get_vs_prolog(struct si_screen *sscreen,
6698 LLVMTargetMachineRef tm,
6699 struct si_shader *shader,
6700 struct pipe_debug_callback *debug,
6701 struct si_shader *main_part,
6702 const struct si_vs_prolog_bits *key)
6703 {
6704 struct si_shader_selector *vs = main_part->selector;
6705
6706 /* The prolog is a no-op if there are no inputs. */
6707 if (!vs->vs_needs_prolog)
6708 return true;
6709
6710 /* Get the prolog. */
6711 union si_shader_part_key prolog_key;
6712 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6713 key, shader, &prolog_key);
6714
6715 shader->prolog =
6716 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6717 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6718 debug, si_build_vs_prolog_function,
6719 "Vertex Shader Prolog");
6720 return shader->prolog != NULL;
6721 }
6722
6723 /**
6724 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6725 */
6726 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6727 LLVMTargetMachineRef tm,
6728 struct si_shader *shader,
6729 struct pipe_debug_callback *debug)
6730 {
6731 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6732 &shader->key.part.vs.prolog);
6733 }
6734
6735 /**
6736 * Compile the TCS epilog function. This writes tesselation factors to memory
6737 * based on the output primitive type of the tesselator (determined by TES).
6738 */
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	LLVMTypeRef params[32];
	LLVMValueRef func;
	int last_sgpr, num_params = 0;

	/* Declare the input SGPRs. The slot order must exactly mirror the
	 * main TCS function signature so the epilog can be appended to it;
	 * only the slots this epilog reads get their ctx->param_* index
	 * recorded. NOTE(review): the unlabeled slots are presumably the
	 * remaining merged-shader user SGPRs — confirm against the main
	 * TCS function declaration. */
	if (ctx->screen->b.chip_class >= GFX9) {
		params[num_params++] = ctx->i64;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* wave info */
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
	} else {
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
	}
	last_sgpr = num_params - 1;

	/* Input VGPRs follow the SGPRs. */
	params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx->i32; /* invocation ID within the patch */
	params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
	declare_lds_as_pointer(ctx);
	func = ctx->main_fn;

	/* Write the tess factors using the three VGPRs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	LLVMBuildRetVoid(gallivm->builder);
}
6801
6802 /**
6803 * Select and compile (or reuse) TCS parts (epilog).
6804 */
6805 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6806 LLVMTargetMachineRef tm,
6807 struct si_shader *shader,
6808 struct pipe_debug_callback *debug)
6809 {
6810 if (sscreen->b.chip_class >= GFX9) {
6811 struct si_shader *ls_main_part =
6812 shader->key.part.tcs.ls->main_shader_part_ls;
6813
6814 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
6815 &shader->key.part.tcs.ls_prolog))
6816 return false;
6817
6818 shader->previous_stage = ls_main_part;
6819 }
6820
6821 /* Get the epilog. */
6822 union si_shader_part_key epilog_key;
6823 memset(&epilog_key, 0, sizeof(epilog_key));
6824 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6825
6826 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6827 PIPE_SHADER_TESS_CTRL, false,
6828 &epilog_key, tm, debug,
6829 si_build_tcs_epilog_function,
6830 "Tessellation Control Shader Epilog");
6831 return shader->epilog != NULL;
6832 }
6833
6834 /**
6835 * Select and compile (or reuse) GS parts (prolog).
6836 */
6837 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
6838 LLVMTargetMachineRef tm,
6839 struct si_shader *shader,
6840 struct pipe_debug_callback *debug)
6841 {
6842 if (sscreen->b.chip_class >= GFX9) {
6843 struct si_shader *es_main_part =
6844 shader->key.part.gs.es->main_shader_part_es;
6845
6846 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
6847 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
6848 &shader->key.part.gs.vs_prolog))
6849 return false;
6850
6851 shader->previous_stage = es_main_part;
6852 }
6853
6854 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
6855 return true;
6856
6857 union si_shader_part_key prolog_key;
6858 memset(&prolog_key, 0, sizeof(prolog_key));
6859 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6860
6861 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
6862 PIPE_SHADER_GEOMETRY, true,
6863 &prolog_key, tm, debug,
6864 si_build_gs_prolog_function,
6865 "Geometry Shader Prolog");
6866 return shader->prolog2 != NULL;
6867 }
6868
6869 /**
6870 * Build the pixel shader prolog function. This handles:
6871 * - two-side color selection and interpolation
6872 * - overriding interpolation parameters for the API PS
6873 * - polygon stippling
6874 *
6875 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
6876 * overriden by other states. (e.g. per-sample interpolation)
6877 * Interpolated colors are stored after the preloaded VGPRs.
6878 */
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;

	/* The caller guarantees at least one prolog feature is enabled. */
	assert(si_need_ps_prolog(key));

	/* Number of inputs + 8 color elements. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx->i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx->f32;

	/* Declare outputs (same as inputs + add colors if needed) */
	/* Note: "params" doubles as the return-type list — the first
	 * num_params entries are the passed-through inputs, with the
	 * interpolated color channels appended after them. */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_prolog", params, num_returns, params,
			   num_params, last_sgpr, 0);
	func = ctx->main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);

		si_llvm_emit_polygon_stipple(ctx, list, pos);
	}

	if (key->ps_prolog.states.bc_optimize_for_persp ||
	    key->ps_prolog.states.bc_optimize_for_linear) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;

		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 *
		 * PRIM_MASK is after user SGPRs.
		 */
		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
					     ctx->i1, "");

		if (key->ps_prolog.states.bc_optimize_for_persp) {
			/* Read PERSP_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 2 + i);
			/* Read PERSP_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 4 + i);
			/* Select PERSP_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 4 + i, "");
			}
		}
		if (key->ps_prolog.states.bc_optimize_for_linear) {
			/* Read LINEAR_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 8 + i);
			/* Read LINEAR_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 10 + i);
			/* Select LINEAR_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 10 + i, "");
			}
		}
	}

	/* Force per-sample interpolation. */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Force center interpolation. */
	if (key->ps_prolog.states.force_persp_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_center[2];

		/* Read PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			persp_center[i] = LLVMGetParam(func, base + 2 + i);
		/* Overwrite PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_center[2];

		/* Read LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			linear_center[i] = LLVMGetParam(func, base + 8 + i);
		/* Overwrite LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 6 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 10 + i, "");
	}

	/* Interpolate colors. */
	/* Two COLOR semantics at most; writemask is the 4-bit channel mask
	 * of colors_read for this attribute. */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Get the (i,j) updated by bc_optimize handling. */
			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr, "");
			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr + 1, "");
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
		}

		/* color[] is an output; filled for the channels in writemask. */
		interp_fs_input(ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append the interpolated channels after the pass-through
		 * inputs in the return value. */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	si_llvm_build_ret(ctx, ret);
}
7104
7105 /**
7106 * Build the pixel shader epilog function. This handles everything that must be
7107 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7108 */
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	/* Worst case: SGPRs + 8 MRTs * 4 channels + depth/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params = 0, i;
	struct si_ps_exports exp = {};

	/* Declare input SGPRs. */
	params[ctx->param_rw_buffers = num_params++] = ctx->i64;
	params[ctx->param_const_and_shader_buffers = num_params++] = ctx->i64;
	params[ctx->param_samplers_and_images = num_params++] = ctx->i64;
	assert(num_params == SI_PARAM_ALPHA_REF);
	params[SI_PARAM_ALPHA_REF] = ctx->f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs. */
	/* One VGPR per written color channel, plus Z/stencil/samplemask
	 * if the shader writes them. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Keep at least enough VGPRs declared to cover the fixed
	 * samplemask location. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
			   last_sgpr, 0);
	/* Disable elimination of unused inputs. */
	si_llvm_add_attribute(ctx->main_fn,
			      "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Export each written MRT, consuming 4 VGPRs per color. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);

	/* The hw requires at least one export; emit a null export when
	 * nothing else was exported. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
}
7205
7206 /**
7207 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7208 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, tm, debug,
					   si_build_ps_prolog_function,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. */
	/* Unlike the prolog, the epilog is always required (it does the
	 * color/depth exports). */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, tm, debug,
				   si_build_ps_epilog_function,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* The rest patches SPI_PS_INPUT_ENA so the hw loads the VGPRs the
	 * selected prolog/epilog actually consume. */

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed. */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7302
7303 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7304 unsigned *lds_size)
7305 {
7306 /* SPI barrier management bug:
7307 * Make sure we have at least 4k of LDS in use to avoid the bug.
7308 * It applies to workgroup sizes of more than one wavefront.
7309 */
7310 if (sscreen->b.family == CHIP_BONAIRE ||
7311 sscreen->b.family == CHIP_KABINI ||
7312 sscreen->b.family == CHIP_MULLINS)
7313 *lds_size = MAX2(*lds_size, 8);
7314 }
7315
7316 static void si_fix_resource_usage(struct si_screen *sscreen,
7317 struct si_shader *shader)
7318 {
7319 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7320
7321 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7322
7323 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7324 si_get_max_workgroup_size(shader) > 64) {
7325 si_multiwave_lds_size_workaround(sscreen,
7326 &shader->config.lds_size);
7327 }
7328 }
7329
/* Create a complete shader variant: either compile it monolithically, or
 * assemble it from the precompiled main part plus selected prolog/epilog
 * parts; then fix up resource usage, dump it, and upload the binary.
 *
 * \param sscreen  screen owning the shader
 * \param tm       LLVM target machine used for on-demand compilation
 * \param shader   shader variant to create (selector and key already set)
 * \param debug    debug callback for compiler diagnostics
 * \return 0 on success, a negative error code on failure
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	/* Main part for this key; only used on the non-monolithic path. */
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			/* TES selects no prolog/epilog parts. */
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the whole shader needs the
		 * maximum of what each of its parts uses.
		 */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		/* previous_stage/prolog2: NOTE(review) — presumably the first
		 * shader of a merged GFX9 shader and its prolog; confirm
		 * against si_shader_internal.h.
		 */
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7459
7460 void si_shader_destroy(struct si_shader *shader)
7461 {
7462 if (shader->scratch_bo)
7463 r600_resource_reference(&shader->scratch_bo, NULL);
7464
7465 r600_resource_reference(&shader->bo, NULL);
7466
7467 if (!shader->is_binary_shared)
7468 radeon_shader_binary_clean(&shader->binary);
7469
7470 free(shader->shader_log);
7471 }