radeonsi: move llvm_get_type_size() to ac
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49 #include "compiler/nir/nir.h"
50
/* Symbol-name strings for dword 0 and dword 1 of the scratch resource.
 * NOTE(review): how these names are consumed is not visible in this part of
 * the file -- confirm against their users. */
51 static const char *scratch_rsrc_dword0_symbol =
52 "SCRATCH_RSRC_DWORD0";
53
54 static const char *scratch_rsrc_dword1_symbol =
55 "SCRATCH_RSRC_DWORD1";
56
/* Values of one shader output bundled with its semantic. */
57 struct si_shader_output_values
58 {
59 LLVMValueRef values[4]; /* four LLVM values; presumably one per channel -- confirm */
60 unsigned semantic_name; /* semantic name of the output */
61 unsigned semantic_index; /* semantic index of the output */
62 ubyte vertex_stream[4]; /* four stream numbers; presumably one per channel -- confirm */
63 };
64
65 /**
66 * Used to collect types and other info about arguments of the LLVM function
67 * before the function is created.
68 */
69 struct si_function_info {
70 LLVMTypeRef types[100]; /* type of each argument, indexed by argument position */
71 LLVMValueRef *assign[100]; /* per-argument pointer recorded by add_arg_assign(); NULL when added via add_arg() */
72 unsigned num_sgpr_params; /* set to num_params each time an ARG_SGPR argument is added */
73 unsigned num_params; /* number of arguments added so far */
74 };
75
/* Register file of a function argument. add_arg_assign() asserts that an
 * ARG_SGPR argument is only added while all earlier arguments are SGPRs. */
76 enum si_arg_regfile {
77 ARG_SGPR,
78 ARG_VGPR
79 };
80
/* Forward declarations. All of these are static, so their definitions have to
 * be provided by this translation unit. */
81 static void si_init_shader_ctx(struct si_shader_context *ctx,
82 struct si_screen *sscreen,
83 LLVMTargetMachineRef tm);
84
85 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
86 struct lp_build_tgsi_context *bld_base,
87 struct lp_build_emit_data *emit_data);
88
89 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
90 FILE *f);
91
/* Builders for the separately compiled shader parts (prologs and epilogs),
 * each driven by a union si_shader_part_key. */
92 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
93 union si_shader_part_key *key);
94 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
95 union si_shader_part_key *key);
96 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
97 union si_shader_part_key *key);
98 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
99 union si_shader_part_key *key);
100
101 /* Ideally pass the sample mask input to the PS epilog as v13, which
102 * is its usual location, so that the shader doesn't have to add v_mov.
103 */
/* The value 13 is the "v13" location named in the comment above. */
104 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
105
/* Address-space numbers.
 * NOTE(review): presumably the LLVM AMDGPU address spaces for constant and
 * local (LDS) memory -- confirm against the LLVM version in use. */
106 enum {
107 CONST_ADDR_SPACE = 2,
108 LOCAL_ADDR_SPACE = 3,
109 };
110
111 static bool is_merged_shader(struct si_shader *shader)
112 {
113 if (shader->selector->screen->b.chip_class <= VI)
114 return false;
115
116 return shader->key.as_ls ||
117 shader->key.as_es ||
118 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
119 shader->selector->type == PIPE_SHADER_GEOMETRY;
120 }
121
122 static void si_init_function_info(struct si_function_info *fninfo)
123 {
124 fninfo->num_params = 0;
125 fninfo->num_sgpr_params = 0;
126 }
127
128 static unsigned add_arg_assign(struct si_function_info *fninfo,
129 enum si_arg_regfile regfile, LLVMTypeRef type,
130 LLVMValueRef *assign)
131 {
132 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
133
134 unsigned idx = fninfo->num_params++;
135 assert(idx < ARRAY_SIZE(fninfo->types));
136
137 if (regfile == ARG_SGPR)
138 fninfo->num_sgpr_params = fninfo->num_params;
139
140 fninfo->types[idx] = type;
141 fninfo->assign[idx] = assign;
142 return idx;
143 }
144
145 static unsigned add_arg(struct si_function_info *fninfo,
146 enum si_arg_regfile regfile, LLVMTypeRef type)
147 {
148 return add_arg_assign(fninfo, regfile, type, NULL);
149 }
150
151 static void add_arg_assign_checked(struct si_function_info *fninfo,
152 enum si_arg_regfile regfile, LLVMTypeRef type,
153 LLVMValueRef *assign, unsigned idx)
154 {
155 MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
156 assert(actual == idx);
157 }
158
159 static void add_arg_checked(struct si_function_info *fninfo,
160 enum si_arg_regfile regfile, LLVMTypeRef type,
161 unsigned idx)
162 {
163 add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
164 }
165
166 /**
167 * Returns a unique index for a per-patch semantic name and index. The index
168 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
169 * can be calculated.
170 */
171 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
172 {
173 switch (semantic_name) {
174 case TGSI_SEMANTIC_TESSOUTER:
175 return 0;
176 case TGSI_SEMANTIC_TESSINNER:
177 return 1;
178 case TGSI_SEMANTIC_PATCH:
179 assert(index < 30);
180 return 2 + index;
181
182 default:
183 assert(!"invalid semantic name");
184 return 0;
185 }
186 }
187
188 /**
189 * Returns a unique index for a semantic name and index. The index must be
190 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
191 * calculated.
192 */
193 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
194 {
195 switch (semantic_name) {
196 case TGSI_SEMANTIC_POSITION:
197 return 0;
198 case TGSI_SEMANTIC_GENERIC:
199 /* Since some shader stages use the the highest used IO index
200 * to determine the size to allocate for inputs/outputs
201 * (in LDS, tess and GS rings). GENERIC should be placed right
202 * after POSITION to make that size as small as possible.
203 */
204 if (index < SI_MAX_IO_GENERIC)
205 return 1 + index;
206
207 assert(!"invalid generic index");
208 return 0;
209 case TGSI_SEMANTIC_PSIZE:
210 return SI_MAX_IO_GENERIC + 1;
211 case TGSI_SEMANTIC_CLIPDIST:
212 assert(index <= 1);
213 return SI_MAX_IO_GENERIC + 2 + index;
214 case TGSI_SEMANTIC_FOG:
215 return SI_MAX_IO_GENERIC + 4;
216 case TGSI_SEMANTIC_LAYER:
217 return SI_MAX_IO_GENERIC + 5;
218 case TGSI_SEMANTIC_VIEWPORT_INDEX:
219 return SI_MAX_IO_GENERIC + 6;
220 case TGSI_SEMANTIC_PRIMID:
221 return SI_MAX_IO_GENERIC + 7;
222 case TGSI_SEMANTIC_COLOR: /* these alias */
223 case TGSI_SEMANTIC_BCOLOR:
224 assert(index < 2);
225 return SI_MAX_IO_GENERIC + 8 + index;
226 case TGSI_SEMANTIC_TEXCOORD:
227 assert(index < 8);
228 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
229 return SI_MAX_IO_GENERIC + 10 + index;
230 default:
231 assert(!"invalid semantic name");
232 return 0;
233 }
234 }
235
236 /**
237 * Helper function that builds an LLVM IR PHI node and immediately adds
238 * incoming edges.
239 */
240 static LLVMValueRef
241 build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
242 unsigned count_incoming, LLVMValueRef *values,
243 LLVMBasicBlockRef *blocks)
244 {
245 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
246 LLVMAddIncoming(phi, values, blocks, count_incoming);
247 return phi;
248 }
249
250 /**
251 * Get the value of a shader input parameter and extract a bitfield.
252 */
253 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
254 unsigned param, unsigned rshift,
255 unsigned bitwidth)
256 {
257 struct gallivm_state *gallivm = &ctx->gallivm;
258 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
259 param);
260
261 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
262 value = bitcast(&ctx->bld_base,
263 TGSI_TYPE_UNSIGNED, value);
264
265 if (rshift)
266 value = LLVMBuildLShr(gallivm->builder, value,
267 LLVMConstInt(ctx->i32, rshift, 0), "");
268
269 if (rshift + bitwidth < 32) {
270 unsigned mask = (1 << bitwidth) - 1;
271 value = LLVMBuildAnd(gallivm->builder, value,
272 LLVMConstInt(ctx->i32, mask, 0), "");
273 }
274
275 return value;
276 }
277
278 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
279 {
280 switch (ctx->type) {
281 case PIPE_SHADER_TESS_CTRL:
282 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
283
284 case PIPE_SHADER_TESS_EVAL:
285 return LLVMGetParam(ctx->main_fn,
286 ctx->param_tes_rel_patch_id);
287
288 default:
289 assert(0);
290 return NULL;
291 }
292 }
293
294 /* Tessellation shaders pass outputs to the next shader using LDS.
295 *
296 * LS outputs = TCS inputs
297 * TCS outputs = TES inputs
298 *
299 * The LDS layout is:
300 * - TCS inputs for patch 0
301 * - TCS inputs for patch 1
302 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
303 * - ...
304 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
305 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
306 * - TCS outputs for patch 1
307 * - Per-patch TCS outputs for patch 1
308 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
309 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
310 * - ...
311 *
312 * All three shaders VS(LS), TCS, TES share the same LDS space.
313 */
314
315 static LLVMValueRef
316 get_tcs_in_patch_stride(struct si_shader_context *ctx)
317 {
318 return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
319 }
320
321 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
322 {
323 assert(ctx->type == PIPE_SHADER_TESS_CTRL);
324
325 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
326 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
327
328 return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
329 }
330
331 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
332 {
333 unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
334
335 return LLVMConstInt(ctx->i32, stride, 0);
336 }
337
338 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
339 {
340 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
341 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
342
343 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
344 unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
345 unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
346 unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
347 unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
348 num_patch_outputs * 4;
349 return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
350 }
351
352 static LLVMValueRef
353 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
354 {
355 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
356 unpack_param(ctx,
357 ctx->param_tcs_out_lds_offsets,
358 0, 16),
359 4);
360 }
361
362 static LLVMValueRef
363 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
364 {
365 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
366 unpack_param(ctx,
367 ctx->param_tcs_out_lds_offsets,
368 16, 16),
369 4);
370 }
371
372 static LLVMValueRef
373 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
374 {
375 struct gallivm_state *gallivm = &ctx->gallivm;
376 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
377 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
378
379 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
380 }
381
382 static LLVMValueRef
383 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
384 {
385 struct gallivm_state *gallivm = &ctx->gallivm;
386 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
387 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
388 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
389
390 return LLVMBuildAdd(gallivm->builder, patch0_offset,
391 LLVMBuildMul(gallivm->builder, patch_stride,
392 rel_patch_id, ""),
393 "");
394 }
395
396 static LLVMValueRef
397 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
398 {
399 struct gallivm_state *gallivm = &ctx->gallivm;
400 LLVMValueRef patch0_patch_data_offset =
401 get_tcs_out_patch0_patch_data_offset(ctx);
402 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
403 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
404
405 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
406 LLVMBuildMul(gallivm->builder, patch_stride,
407 rel_patch_id, ""),
408 "");
409 }
410
411 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
412 {
413 unsigned tcs_out_vertices =
414 ctx->shader->selector ?
415 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
416
417 /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
418 if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
419 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
420
421 return unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
422 }
423
424 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
425 {
426 unsigned stride;
427
428 switch (ctx->type) {
429 case PIPE_SHADER_VERTEX:
430 stride = util_last_bit64(ctx->shader->selector->outputs_written);
431 return LLVMConstInt(ctx->i32, stride * 4, 0);
432
433 case PIPE_SHADER_TESS_CTRL:
434 if (ctx->screen->b.chip_class >= GFX9 &&
435 ctx->shader->is_monolithic) {
436 stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
437 return LLVMConstInt(ctx->i32, stride * 4, 0);
438 }
439 return unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
440
441 default:
442 assert(0);
443 return NULL;
444 }
445 }
446
447 static LLVMValueRef get_instance_index_for_fetch(
448 struct si_shader_context *ctx,
449 unsigned param_start_instance, LLVMValueRef divisor)
450 {
451 struct gallivm_state *gallivm = &ctx->gallivm;
452
453 LLVMValueRef result = ctx->abi.instance_id;
454
455 /* The division must be done before START_INSTANCE is added. */
456 if (divisor != ctx->i32_1)
457 result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
458
459 return LLVMBuildAdd(gallivm->builder, result,
460 LLVMGetParam(ctx->main_fn, param_start_instance), "");
461 }
462
463 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
464 * to float. */
465 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
466 LLVMValueRef vec4,
467 unsigned double_index)
468 {
469 LLVMBuilderRef builder = ctx->gallivm.builder;
470 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
471 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
472 LLVMVectorType(f64, 2), "");
473 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
474 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
475 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
476 }
477
/**
 * Load one vertex shader input and write its four channels to \p out.
 *
 * The buffer descriptor is read at position \p input_index of the list
 * passed in the vertex_buffers parameter, and the vertex index comes from
 * parameter (param_vertex_index0 + input_index). One or more format loads
 * are issued, then the raw channels are fixed up according to
 * key.mono.vs_fix_fetch[input_index].
 *
 * \param ctx          shader context
 * \param input_index  index of the vertex input
 * \param out          receives the four channel values
 */
478 void si_llvm_load_input_vs(
479 struct si_shader_context *ctx,
480 unsigned input_index,
481 LLVMValueRef out[4])
482 {
483 struct gallivm_state *gallivm = &ctx->gallivm;
484
485 unsigned chan;
486 unsigned fix_fetch;
487 unsigned num_fetches;
488 unsigned fetch_stride;
489
490 LLVMValueRef t_list_ptr;
491 LLVMValueRef t_offset;
492 LLVMValueRef t_list;
493 LLVMValueRef vertex_index;
494 LLVMValueRef input[3]; /* one entry per fetch; num_fetches never exceeds 3 below */
495
496 /* Load the T list */
497 t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
498
499 t_offset = LLVMConstInt(ctx->i32, input_index, 0);
500
501 t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
502
503 vertex_index = LLVMGetParam(ctx->main_fn,
504 ctx->param_vertex_index0 +
505 input_index);
506
507 fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
508
509 /* Do multiple loads for special formats. */
/* fetch_stride is the offset step between consecutive fetches (see voffset below). */
510 switch (fix_fetch) {
511 case SI_FIX_FETCH_RGB_64_FLOAT:
512 num_fetches = 3; /* 3 2-dword loads */
513 fetch_stride = 8;
514 break;
515 case SI_FIX_FETCH_RGBA_64_FLOAT:
516 num_fetches = 2; /* 2 4-dword loads */
517 fetch_stride = 16;
518 break;
519 case SI_FIX_FETCH_RGB_8:
520 case SI_FIX_FETCH_RGB_8_INT:
521 num_fetches = 3;
522 fetch_stride = 1;
523 break;
524 case SI_FIX_FETCH_RGB_16:
525 case SI_FIX_FETCH_RGB_16_INT:
526 num_fetches = 3;
527 fetch_stride = 2;
528 break;
529 default:
/* Every other format is read with a single fetch. */
530 num_fetches = 1;
531 fetch_stride = 0;
532 }
533
534 for (unsigned i = 0; i < num_fetches; i++) {
535 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
536
/* NOTE(review): the meaning of the trailing "true" argument is defined by
 * ac_build_buffer_load_format, which is not visible here -- confirm. */
537 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
538 vertex_index, voffset,
539 true);
540 }
541
542 /* Break up the vec4 into individual components */
543 for (chan = 0; chan < 4; chan++) {
544 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
545 out[chan] = LLVMBuildExtractElement(gallivm->builder,
546 input[0], llvm_chan, "");
547 }
548
/* Per-format fixups. The switch has no default, so formats not listed keep
 * the channels extracted from input[0] above. */
549 switch (fix_fetch) {
550 case SI_FIX_FETCH_A2_SNORM:
551 case SI_FIX_FETCH_A2_SSCALED:
552 case SI_FIX_FETCH_A2_SINT: {
553 /* The hardware returns an unsigned value; convert it to a
554 * signed one.
555 */
556 LLVMValueRef tmp = out[3];
557 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
558
559 /* First, recover the sign-extended signed integer value. */
560 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
561 tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
562 else
563 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");
564
565 /* For the integer-like cases, do a natural sign extension.
566 *
567 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
568 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
569 * exponent.
570 */
571 tmp = LLVMBuildShl(gallivm->builder, tmp,
572 fix_fetch == SI_FIX_FETCH_A2_SNORM ?
573 LLVMConstInt(ctx->i32, 7, 0) : c30, "");
574 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");
575
576 /* Convert back to the right type. */
577 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
/* Clamp the result to -1.0 from below. */
578 LLVMValueRef clamp;
579 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
580 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
581 clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
582 tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
583 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
584 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
585 }
586
/* In the A2_SINT case tmp is stored as the i32 value, without a conversion. */
587 out[3] = tmp;
588 break;
589 }
590 case SI_FIX_FETCH_RGBA_32_UNORM:
591 case SI_FIX_FETCH_RGBX_32_UNORM:
/* Treat each channel as an unsigned 32-bit integer and scale it by 1 / UINT_MAX. */
592 for (chan = 0; chan < 4; chan++) {
593 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
594 ctx->i32, "");
595 out[chan] = LLVMBuildUIToFP(gallivm->builder,
596 out[chan], ctx->f32, "");
597 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
598 LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
599 }
600 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
601 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
602 out[3] = LLVMConstReal(ctx->f32, 1);
603 break;
604 case SI_FIX_FETCH_RGBA_32_SNORM:
605 case SI_FIX_FETCH_RGBX_32_SNORM:
606 case SI_FIX_FETCH_RGBA_32_FIXED:
607 case SI_FIX_FETCH_RGBX_32_FIXED: {
608 double scale;
/* NOTE(review): the >= comparison relies on the ordering of the SI_FIX_FETCH_*
 * enumerators, which is declared outside this file -- confirm. */
609 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
610 scale = 1.0 / 0x10000;
611 else
612 scale = 1.0 / INT_MAX;
613
614 for (chan = 0; chan < 4; chan++) {
615 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
616 ctx->i32, "");
617 out[chan] = LLVMBuildSIToFP(gallivm->builder,
618 out[chan], ctx->f32, "");
619 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
620 LLVMConstReal(ctx->f32, scale), "");
621 }
622 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
623 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
624 fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
625 out[3] = LLVMConstReal(ctx->f32, 1);
626 break;
627 }
628 case SI_FIX_FETCH_RGBA_32_USCALED:
/* Unsigned integer to float, no scaling. */
629 for (chan = 0; chan < 4; chan++) {
630 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
631 ctx->i32, "");
632 out[chan] = LLVMBuildUIToFP(gallivm->builder,
633 out[chan], ctx->f32, "");
634 }
635 break;
636 case SI_FIX_FETCH_RGBA_32_SSCALED:
/* Signed integer to float, no scaling. */
637 for (chan = 0; chan < 4; chan++) {
638 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
639 ctx->i32, "");
640 out[chan] = LLVMBuildSIToFP(gallivm->builder,
641 out[chan], ctx->f32, "");
642 }
643 break;
644 case SI_FIX_FETCH_RG_64_FLOAT:
/* Both doubles come from the single fetch; z and w get the constants 0 and 1. */
645 for (chan = 0; chan < 2; chan++)
646 out[chan] = extract_double_to_float(ctx, input[0], chan);
647
648 out[2] = LLVMConstReal(ctx->f32, 0);
649 out[3] = LLVMConstReal(ctx->f32, 1);
650 break;
651 case SI_FIX_FETCH_RGB_64_FLOAT:
/* One double per fetch; w gets the constant 1. */
652 for (chan = 0; chan < 3; chan++)
653 out[chan] = extract_double_to_float(ctx, input[chan], 0);
654
655 out[3] = LLVMConstReal(ctx->f32, 1);
656 break;
657 case SI_FIX_FETCH_RGBA_64_FLOAT:
/* Two doubles per fetch: fetch 0 supplies x and y, fetch 1 supplies z and w. */
658 for (chan = 0; chan < 4; chan++) {
659 out[chan] = extract_double_to_float(ctx, input[chan / 2],
660 chan % 2);
661 }
662 break;
663 case SI_FIX_FETCH_RGB_8:
664 case SI_FIX_FETCH_RGB_8_INT:
665 case SI_FIX_FETCH_RGB_16:
666 case SI_FIX_FETCH_RGB_16_INT:
/* Each of the three fetches supplies one channel in its element 0. */
667 for (chan = 0; chan < 3; chan++) {
668 out[chan] = LLVMBuildExtractElement(gallivm->builder,
669 input[chan],
670 ctx->i32_0, "");
671 }
/* w is float 1.0 for the non-INT formats and the integer 1 bit pattern for the INT ones. */
672 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
673 fix_fetch == SI_FIX_FETCH_RGB_16) {
674 out[3] = LLVMConstReal(ctx->f32, 1);
675 } else {
676 out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
677 ctx->f32, "");
678 }
679 break;
680 }
681 }
682
683 static void declare_input_vs(
684 struct si_shader_context *ctx,
685 unsigned input_index,
686 const struct tgsi_full_declaration *decl,
687 LLVMValueRef out[4])
688 {
689 si_llvm_load_input_vs(ctx, input_index, out);
690 }
691
692 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
693 unsigned swizzle)
694 {
695 if (swizzle > 0)
696 return ctx->i32_0;
697
698 switch (ctx->type) {
699 case PIPE_SHADER_VERTEX:
700 return LLVMGetParam(ctx->main_fn,
701 ctx->param_vs_prim_id);
702 case PIPE_SHADER_TESS_CTRL:
703 return LLVMGetParam(ctx->main_fn,
704 ctx->param_tcs_patch_id);
705 case PIPE_SHADER_TESS_EVAL:
706 return LLVMGetParam(ctx->main_fn,
707 ctx->param_tes_patch_id);
708 case PIPE_SHADER_GEOMETRY:
709 return LLVMGetParam(ctx->main_fn,
710 ctx->param_gs_prim_id);
711 default:
712 assert(0);
713 return ctx->i32_0;
714 }
715 }
716
717 /**
718 * Return the value of tgsi_ind_register for indexing.
719 * This is the indirect index with the constant offset added to it.
720 */
721 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
722 const struct tgsi_ind_register *ind,
723 int rel_index)
724 {
725 struct gallivm_state *gallivm = &ctx->gallivm;
726 LLVMValueRef result;
727
728 result = ctx->addrs[ind->Index][ind->Swizzle];
729 result = LLVMBuildLoad(gallivm->builder, result, "");
730 result = LLVMBuildAdd(gallivm->builder, result,
731 LLVMConstInt(ctx->i32, rel_index, 0), "");
732 return result;
733 }
734
735 /**
736 * Like si_get_indirect_index, but restricts the return value to a (possibly
737 * undefined) value inside [0..num).
738 */
739 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
740 const struct tgsi_ind_register *ind,
741 int rel_index, unsigned num)
742 {
743 LLVMValueRef result = si_get_indirect_index(ctx, ind, rel_index);
744
745 return si_llvm_bound_index(ctx, result, num);
746 }
747
748
749 /**
750 * Calculate a dword address given an input or output register and a stride.
751 */
752 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
753 const struct tgsi_full_dst_register *dst,
754 const struct tgsi_full_src_register *src,
755 LLVMValueRef vertex_dw_stride,
756 LLVMValueRef base_addr)
757 {
758 struct gallivm_state *gallivm = &ctx->gallivm;
759 struct tgsi_shader_info *info = &ctx->shader->selector->info;
760 ubyte *name, *index, *array_first;
761 int first, param;
762 struct tgsi_full_dst_register reg;
763
764 /* Set the register description. The address computation is the same
765 * for sources and destinations. */
766 if (src) {
767 reg.Register.File = src->Register.File;
768 reg.Register.Index = src->Register.Index;
769 reg.Register.Indirect = src->Register.Indirect;
770 reg.Register.Dimension = src->Register.Dimension;
771 reg.Indirect = src->Indirect;
772 reg.Dimension = src->Dimension;
773 reg.DimIndirect = src->DimIndirect;
774 } else
775 reg = *dst;
776
777 /* If the register is 2-dimensional (e.g. an array of vertices
778 * in a primitive), calculate the base address of the vertex. */
779 if (reg.Register.Dimension) {
780 LLVMValueRef index;
781
782 if (reg.Dimension.Indirect)
783 index = si_get_indirect_index(ctx, &reg.DimIndirect,
784 reg.Dimension.Index);
785 else
786 index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
787
788 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
789 LLVMBuildMul(gallivm->builder, index,
790 vertex_dw_stride, ""), "");
791 }
792
793 /* Get information about the register. */
794 if (reg.Register.File == TGSI_FILE_INPUT) {
795 name = info->input_semantic_name;
796 index = info->input_semantic_index;
797 array_first = info->input_array_first;
798 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
799 name = info->output_semantic_name;
800 index = info->output_semantic_index;
801 array_first = info->output_array_first;
802 } else {
803 assert(0);
804 return NULL;
805 }
806
807 if (reg.Register.Indirect) {
808 /* Add the relative address of the element. */
809 LLVMValueRef ind_index;
810
811 if (reg.Indirect.ArrayID)
812 first = array_first[reg.Indirect.ArrayID];
813 else
814 first = reg.Register.Index;
815
816 ind_index = si_get_indirect_index(ctx, &reg.Indirect,
817 reg.Register.Index - first);
818
819 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
820 LLVMBuildMul(gallivm->builder, ind_index,
821 LLVMConstInt(ctx->i32, 4, 0), ""), "");
822
823 param = reg.Register.Dimension ?
824 si_shader_io_get_unique_index(name[first], index[first]) :
825 si_shader_io_get_unique_index_patch(name[first], index[first]);
826 } else {
827 param = reg.Register.Dimension ?
828 si_shader_io_get_unique_index(name[reg.Register.Index],
829 index[reg.Register.Index]) :
830 si_shader_io_get_unique_index_patch(name[reg.Register.Index],
831 index[reg.Register.Index]);
832 }
833
834 /* Add the base address of the element. */
835 return LLVMBuildAdd(gallivm->builder, base_addr,
836 LLVMConstInt(ctx->i32, param * 4, 0), "");
837 }
838
839 /* The offchip buffer layout for TCS->TES is
840 *
841 * - attribute 0 of patch 0 vertex 0
842 * - attribute 0 of patch 0 vertex 1
843 * - attribute 0 of patch 0 vertex 2
844 * ...
845 * - attribute 0 of patch 1 vertex 0
846 * - attribute 0 of patch 1 vertex 1
847 * ...
848 * - attribute 1 of patch 0 vertex 0
849 * - attribute 1 of patch 0 vertex 1
850 * ...
851 * - per patch attribute 0 of patch 0
852 * - per patch attribute 0 of patch 1
853 * ...
854 *
855 * Note that every attribute has 4 components.
856 */
857 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
858 LLVMValueRef rel_patch_id,
859 LLVMValueRef vertex_index,
860 LLVMValueRef param_index)
861 {
862 struct gallivm_state *gallivm = &ctx->gallivm;
863 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
864 LLVMValueRef param_stride, constant16;
865
866 vertices_per_patch = get_num_tcs_out_vertices(ctx);
867 num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
868 total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
869 num_patches, "");
870
871 constant16 = LLVMConstInt(ctx->i32, 16, 0);
872 if (vertex_index) {
873 base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
874 vertices_per_patch, "");
875
876 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
877 vertex_index, "");
878
879 param_stride = total_vertices;
880 } else {
881 base_addr = rel_patch_id;
882 param_stride = num_patches;
883 }
884
885 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
886 LLVMBuildMul(gallivm->builder, param_index,
887 param_stride, ""), "");
888
889 base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
890
891 if (!vertex_index) {
892 LLVMValueRef patch_data_offset =
893 unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
894
895 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
896 patch_data_offset, "");
897 }
898 return base_addr;
899 }
900
901 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
902 struct si_shader_context *ctx,
903 const struct tgsi_full_dst_register *dst,
904 const struct tgsi_full_src_register *src)
905 {
906 struct gallivm_state *gallivm = &ctx->gallivm;
907 struct tgsi_shader_info *info = &ctx->shader->selector->info;
908 ubyte *name, *index, *array_first;
909 struct tgsi_full_src_register reg;
910 LLVMValueRef vertex_index = NULL;
911 LLVMValueRef param_index = NULL;
912 unsigned param_index_base, param_base;
913
914 reg = src ? *src : tgsi_full_src_register_from_dst(dst);
915
916 if (reg.Register.Dimension) {
917
918 if (reg.Dimension.Indirect)
919 vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
920 reg.Dimension.Index);
921 else
922 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
923 }
924
925 /* Get information about the register. */
926 if (reg.Register.File == TGSI_FILE_INPUT) {
927 name = info->input_semantic_name;
928 index = info->input_semantic_index;
929 array_first = info->input_array_first;
930 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
931 name = info->output_semantic_name;
932 index = info->output_semantic_index;
933 array_first = info->output_array_first;
934 } else {
935 assert(0);
936 return NULL;
937 }
938
939 if (reg.Register.Indirect) {
940 if (reg.Indirect.ArrayID)
941 param_base = array_first[reg.Indirect.ArrayID];
942 else
943 param_base = reg.Register.Index;
944
945 param_index = si_get_indirect_index(ctx, &reg.Indirect,
946 reg.Register.Index - param_base);
947
948 } else {
949 param_base = reg.Register.Index;
950 param_index = ctx->i32_0;
951 }
952
953 param_index_base = reg.Register.Dimension ?
954 si_shader_io_get_unique_index(name[param_base], index[param_base]) :
955 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);
956
957 param_index = LLVMBuildAdd(gallivm->builder, param_index,
958 LLVMConstInt(ctx->i32, param_index_base, 0),
959 "");
960
961 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
962 vertex_index, param_index);
963 }
964
965 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
966 enum tgsi_opcode_type type, unsigned swizzle,
967 LLVMValueRef buffer, LLVMValueRef offset,
968 LLVMValueRef base, bool can_speculate)
969 {
970 struct si_shader_context *ctx = si_shader_context(bld_base);
971 struct gallivm_state *gallivm = &ctx->gallivm;
972 LLVMValueRef value, value2;
973 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
974 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
975
976 if (swizzle == ~0) {
977 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
978 0, 1, 0, can_speculate, false);
979
980 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
981 }
982
983 if (!tgsi_type_is_64bit(type)) {
984 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
985 0, 1, 0, can_speculate, false);
986
987 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
988 return LLVMBuildExtractElement(gallivm->builder, value,
989 LLVMConstInt(ctx->i32, swizzle, 0), "");
990 }
991
992 value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
993 swizzle * 4, 1, 0, can_speculate, false);
994
995 value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
996 swizzle * 4 + 4, 1, 0, can_speculate, false);
997
998 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
999 }
1000
1001 /**
1002 * Load from LDS.
1003 *
1004 * \param type output value type
1005 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
1006 * \param dw_addr address in dwords
1007 */
1008 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1009 enum tgsi_opcode_type type, unsigned swizzle,
1010 LLVMValueRef dw_addr)
1011 {
1012 struct si_shader_context *ctx = si_shader_context(bld_base);
1013 struct gallivm_state *gallivm = &ctx->gallivm;
1014 LLVMValueRef value;
1015
1016 if (swizzle == ~0) {
1017 LLVMValueRef values[TGSI_NUM_CHANNELS];
1018
1019 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1020 values[chan] = lds_load(bld_base, type, chan, dw_addr);
1021
1022 return lp_build_gather_values(gallivm, values,
1023 TGSI_NUM_CHANNELS);
1024 }
1025
1026 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1027 LLVMConstInt(ctx->i32, swizzle, 0));
1028
1029 value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
1030 if (tgsi_type_is_64bit(type)) {
1031 LLVMValueRef value2;
1032 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1033 ctx->i32_1);
1034 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
1035 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1036 }
1037
1038 return LLVMBuildBitCast(gallivm->builder, value,
1039 tgsi2llvmtype(bld_base, type), "");
1040 }
1041
1042 /**
1043 * Store to LDS.
1044 *
1045 * \param swizzle offset (typically 0..3)
1046 * \param dw_addr address in dwords
1047 * \param value value to store
1048 */
1049 static void lds_store(struct lp_build_tgsi_context *bld_base,
1050 unsigned dw_offset_imm, LLVMValueRef dw_addr,
1051 LLVMValueRef value)
1052 {
1053 struct si_shader_context *ctx = si_shader_context(bld_base);
1054 struct gallivm_state *gallivm = &ctx->gallivm;
1055
1056 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1057 LLVMConstInt(ctx->i32, dw_offset_imm, 0));
1058
1059 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1060 ac_build_indexed_store(&ctx->ac, ctx->lds,
1061 dw_addr, value);
1062 }
1063
1064 static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
1065 unsigned param)
1066 {
1067 LLVMBuilderRef builder = ctx->gallivm.builder;
1068
1069 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1070 addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
1071 addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");
1072
1073 uint64_t desc2 = 0xffffffff;
1074 uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1075 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1076 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1077 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1078 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1079 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1080 LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);
1081
1082 LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
1083 desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
1084 desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
1085 return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
1086 }
1087
1088 static LLVMValueRef fetch_input_tcs(
1089 struct lp_build_tgsi_context *bld_base,
1090 const struct tgsi_full_src_register *reg,
1091 enum tgsi_opcode_type type, unsigned swizzle)
1092 {
1093 struct si_shader_context *ctx = si_shader_context(bld_base);
1094 LLVMValueRef dw_addr, stride;
1095
1096 stride = get_tcs_in_vertex_dw_stride(ctx);
1097 dw_addr = get_tcs_in_current_patch_offset(ctx);
1098 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1099
1100 return lds_load(bld_base, type, swizzle, dw_addr);
1101 }
1102
1103 static LLVMValueRef fetch_output_tcs(
1104 struct lp_build_tgsi_context *bld_base,
1105 const struct tgsi_full_src_register *reg,
1106 enum tgsi_opcode_type type, unsigned swizzle)
1107 {
1108 struct si_shader_context *ctx = si_shader_context(bld_base);
1109 LLVMValueRef dw_addr, stride;
1110
1111 if (reg->Register.Dimension) {
1112 stride = get_tcs_out_vertex_dw_stride(ctx);
1113 dw_addr = get_tcs_out_current_patch_offset(ctx);
1114 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1115 } else {
1116 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1117 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1118 }
1119
1120 return lds_load(bld_base, type, swizzle, dw_addr);
1121 }
1122
1123 static LLVMValueRef fetch_input_tes(
1124 struct lp_build_tgsi_context *bld_base,
1125 const struct tgsi_full_src_register *reg,
1126 enum tgsi_opcode_type type, unsigned swizzle)
1127 {
1128 struct si_shader_context *ctx = si_shader_context(bld_base);
1129 LLVMValueRef buffer, base, addr;
1130
1131 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1132
1133 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1134 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1135
1136 return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
1137 }
1138
1139 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1140 const struct tgsi_full_instruction *inst,
1141 const struct tgsi_opcode_info *info,
1142 LLVMValueRef dst[4])
1143 {
1144 struct si_shader_context *ctx = si_shader_context(bld_base);
1145 struct gallivm_state *gallivm = &ctx->gallivm;
1146 const struct tgsi_full_dst_register *reg = &inst->Dst[0];
1147 const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1148 unsigned chan_index;
1149 LLVMValueRef dw_addr, stride;
1150 LLVMValueRef buffer, base, buf_addr;
1151 LLVMValueRef values[4];
1152 bool skip_lds_store;
1153 bool is_tess_factor = false;
1154
1155 /* Only handle per-patch and per-vertex outputs here.
1156 * Vectors will be lowered to scalars and this function will be called again.
1157 */
1158 if (reg->Register.File != TGSI_FILE_OUTPUT ||
1159 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1160 si_llvm_emit_store(bld_base, inst, info, dst);
1161 return;
1162 }
1163
1164 if (reg->Register.Dimension) {
1165 stride = get_tcs_out_vertex_dw_stride(ctx);
1166 dw_addr = get_tcs_out_current_patch_offset(ctx);
1167 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1168 skip_lds_store = !sh_info->reads_pervertex_outputs;
1169 } else {
1170 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1171 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1172 skip_lds_store = !sh_info->reads_perpatch_outputs;
1173
1174 if (!reg->Register.Indirect) {
1175 int name = sh_info->output_semantic_name[reg->Register.Index];
1176
1177 /* Always write tess factors into LDS for the TCS epilog. */
1178 if (name == TGSI_SEMANTIC_TESSINNER ||
1179 name == TGSI_SEMANTIC_TESSOUTER) {
1180 skip_lds_store = false;
1181 is_tess_factor = true;
1182 }
1183 }
1184 }
1185
1186 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1187
1188 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1189 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1190
1191
1192 TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1193 LLVMValueRef value = dst[chan_index];
1194
1195 if (inst->Instruction.Saturate)
1196 value = ac_build_clamp(&ctx->ac, value);
1197
1198 /* Skip LDS stores if there is no LDS read of this output. */
1199 if (!skip_lds_store)
1200 lds_store(bld_base, chan_index, dw_addr, value);
1201
1202 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1203 values[chan_index] = value;
1204
1205 if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
1206 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1207 buf_addr, base,
1208 4 * chan_index, 1, 0, true, false);
1209 }
1210 }
1211
1212 if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
1213 LLVMValueRef value = lp_build_gather_values(gallivm,
1214 values, 4);
1215 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1216 base, 0, 1, 0, true, false);
1217 }
1218 }
1219
1220 static LLVMValueRef fetch_input_gs(
1221 struct lp_build_tgsi_context *bld_base,
1222 const struct tgsi_full_src_register *reg,
1223 enum tgsi_opcode_type type,
1224 unsigned swizzle)
1225 {
1226 struct si_shader_context *ctx = si_shader_context(bld_base);
1227 struct si_shader *shader = ctx->shader;
1228 struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1229 struct gallivm_state *gallivm = &ctx->gallivm;
1230 LLVMValueRef vtx_offset, soffset;
1231 struct tgsi_shader_info *info = &shader->selector->info;
1232 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1233 unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1234 unsigned param;
1235 LLVMValueRef value;
1236
1237 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1238 return get_primitive_id(ctx, swizzle);
1239
1240 if (!reg->Register.Dimension)
1241 return NULL;
1242
1243 param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1244
1245 /* GFX9 has the ESGS ring in LDS. */
1246 if (ctx->screen->b.chip_class >= GFX9) {
1247 unsigned index = reg->Dimension.Index;
1248
1249 switch (index / 2) {
1250 case 0:
1251 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
1252 index % 2 ? 16 : 0, 16);
1253 break;
1254 case 1:
1255 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
1256 index % 2 ? 16 : 0, 16);
1257 break;
1258 case 2:
1259 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
1260 index % 2 ? 16 : 0, 16);
1261 break;
1262 default:
1263 assert(0);
1264 return NULL;
1265 }
1266
1267 vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
1268 LLVMConstInt(ctx->i32, param * 4, 0), "");
1269 return lds_load(bld_base, type, swizzle, vtx_offset);
1270 }
1271
1272 /* GFX6: input load from the ESGS ring in memory. */
1273 if (swizzle == ~0) {
1274 LLVMValueRef values[TGSI_NUM_CHANNELS];
1275 unsigned chan;
1276 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1277 values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1278 }
1279 return lp_build_gather_values(gallivm, values,
1280 TGSI_NUM_CHANNELS);
1281 }
1282
1283 /* Get the vertex offset parameter on GFX6. */
1284 unsigned vtx_offset_param = reg->Dimension.Index;
1285 if (vtx_offset_param < 2) {
1286 vtx_offset_param += ctx->param_gs_vtx0_offset;
1287 } else {
1288 assert(vtx_offset_param < 6);
1289 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
1290 }
1291 vtx_offset = lp_build_mul_imm(uint,
1292 LLVMGetParam(ctx->main_fn,
1293 vtx_offset_param),
1294 4);
1295
1296 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1297
1298 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1299 vtx_offset, soffset, 0, 1, 0, true, false);
1300 if (tgsi_type_is_64bit(type)) {
1301 LLVMValueRef value2;
1302 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1303
1304 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1305 ctx->i32_0, vtx_offset, soffset,
1306 0, 1, 0, true, false);
1307 return si_llvm_emit_fetch_64bit(bld_base, type,
1308 value, value2);
1309 }
1310 return LLVMBuildBitCast(gallivm->builder,
1311 value,
1312 tgsi2llvmtype(bld_base, type), "");
1313 }
1314
1315 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1316 {
1317 switch (interpolate) {
1318 case TGSI_INTERPOLATE_CONSTANT:
1319 return 0;
1320
1321 case TGSI_INTERPOLATE_LINEAR:
1322 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1323 return SI_PARAM_LINEAR_SAMPLE;
1324 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1325 return SI_PARAM_LINEAR_CENTROID;
1326 else
1327 return SI_PARAM_LINEAR_CENTER;
1328 break;
1329 case TGSI_INTERPOLATE_COLOR:
1330 case TGSI_INTERPOLATE_PERSPECTIVE:
1331 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1332 return SI_PARAM_PERSP_SAMPLE;
1333 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1334 return SI_PARAM_PERSP_CENTROID;
1335 else
1336 return SI_PARAM_PERSP_CENTER;
1337 break;
1338 default:
1339 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1340 return -1;
1341 }
1342 }
1343
1344 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1345 unsigned attr_index, unsigned chan,
1346 LLVMValueRef prim_mask,
1347 LLVMValueRef i, LLVMValueRef j)
1348 {
1349 if (i || j) {
1350 return ac_build_fs_interp(&ctx->ac,
1351 LLVMConstInt(ctx->i32, chan, 0),
1352 LLVMConstInt(ctx->i32, attr_index, 0),
1353 prim_mask, i, j);
1354 }
1355 return ac_build_fs_interp_mov(&ctx->ac,
1356 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1357 LLVMConstInt(ctx->i32, chan, 0),
1358 LLVMConstInt(ctx->i32, attr_index, 0),
1359 prim_mask);
1360 }
1361
1362 /**
1363 * Interpolate a fragment shader input.
1364 *
1365 * @param ctx context
1366 * @param input_index index of the input in hardware
1367 * @param semantic_name TGSI_SEMANTIC_*
1368 * @param semantic_index semantic index
1369 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1370 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1371 * @param interp_param interpolation weights (i,j)
1372 * @param prim_mask SI_PARAM_PRIM_MASK
1373 * @param face SI_PARAM_FRONT_FACE
1374 * @param result the return value (4 components)
1375 */
1376 static void interp_fs_input(struct si_shader_context *ctx,
1377 unsigned input_index,
1378 unsigned semantic_name,
1379 unsigned semantic_index,
1380 unsigned num_interp_inputs,
1381 unsigned colors_read_mask,
1382 LLVMValueRef interp_param,
1383 LLVMValueRef prim_mask,
1384 LLVMValueRef face,
1385 LLVMValueRef result[4])
1386 {
1387 struct gallivm_state *gallivm = &ctx->gallivm;
1388 LLVMValueRef i = NULL, j = NULL;
1389 unsigned chan;
1390
1391 /* fs.constant returns the param from the middle vertex, so it's not
1392 * really useful for flat shading. It's meant to be used for custom
1393 * interpolation (but the intrinsic can't fetch from the other two
1394 * vertices).
1395 *
1396 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1397 * to do the right thing. The only reason we use fs.constant is that
1398 * fs.interp cannot be used on integers, because they can be equal
1399 * to NaN.
1400 *
1401 * When interp is false we will use fs.constant or for newer llvm,
1402 * amdgcn.interp.mov.
1403 */
1404 bool interp = interp_param != NULL;
1405
1406 if (interp) {
1407 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1408 LLVMVectorType(ctx->f32, 2), "");
1409
1410 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1411 ctx->i32_0, "");
1412 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1413 ctx->i32_1, "");
1414 }
1415
1416 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1417 ctx->shader->key.part.ps.prolog.color_two_side) {
1418 LLVMValueRef is_face_positive;
1419
1420 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1421 * otherwise it's at offset "num_inputs".
1422 */
1423 unsigned back_attr_offset = num_interp_inputs;
1424 if (semantic_index == 1 && colors_read_mask & 0xf)
1425 back_attr_offset += 1;
1426
1427 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1428 face, ctx->i32_0, "");
1429
1430 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1431 LLVMValueRef front, back;
1432
1433 front = si_build_fs_interp(ctx,
1434 input_index, chan,
1435 prim_mask, i, j);
1436 back = si_build_fs_interp(ctx,
1437 back_attr_offset, chan,
1438 prim_mask, i, j);
1439
1440 result[chan] = LLVMBuildSelect(gallivm->builder,
1441 is_face_positive,
1442 front,
1443 back,
1444 "");
1445 }
1446 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1447 result[0] = si_build_fs_interp(ctx, input_index,
1448 0, prim_mask, i, j);
1449 result[1] =
1450 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1451 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1452 } else {
1453 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1454 result[chan] = si_build_fs_interp(ctx,
1455 input_index, chan,
1456 prim_mask, i, j);
1457 }
1458 }
1459 }
1460
1461 void si_llvm_load_input_fs(
1462 struct si_shader_context *ctx,
1463 unsigned input_index,
1464 LLVMValueRef out[4])
1465 {
1466 struct lp_build_context *base = &ctx->bld_base.base;
1467 struct si_shader *shader = ctx->shader;
1468 struct tgsi_shader_info *info = &shader->selector->info;
1469 LLVMValueRef main_fn = ctx->main_fn;
1470 LLVMValueRef interp_param = NULL;
1471 int interp_param_idx;
1472 enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
1473 unsigned semantic_index = info->input_semantic_index[input_index];
1474 enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
1475 enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
1476
1477 /* Get colors from input VGPRs (set by the prolog). */
1478 if (semantic_name == TGSI_SEMANTIC_COLOR) {
1479 unsigned colors_read = shader->selector->info.colors_read;
1480 unsigned mask = colors_read >> (semantic_index * 4);
1481 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1482 (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
1483
1484 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1485 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1486 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1487 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1488 return;
1489 }
1490
1491 interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
1492 if (interp_param_idx == -1)
1493 return;
1494 else if (interp_param_idx) {
1495 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1496 }
1497
1498 interp_fs_input(ctx, input_index, semantic_name,
1499 semantic_index, 0, /* this param is unused */
1500 shader->selector->info.colors_read, interp_param,
1501 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1502 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1503 &out[0]);
1504 }
1505
1506 static void declare_input_fs(
1507 struct si_shader_context *ctx,
1508 unsigned input_index,
1509 const struct tgsi_full_declaration *decl,
1510 LLVMValueRef out[4])
1511 {
1512 si_llvm_load_input_fs(ctx, input_index, out);
1513 }
1514
1515 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1516 {
1517 return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1518 }
1519
1520
1521 /**
1522 * Load a dword from a constant buffer.
1523 */
1524 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1525 LLVMValueRef resource,
1526 LLVMValueRef offset)
1527 {
1528 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1529 0, 0, 0, true, true);
1530 }
1531
1532 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1533 {
1534 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1535 struct gallivm_state *gallivm = &ctx->gallivm;
1536 LLVMBuilderRef builder = gallivm->builder;
1537 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1538 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1539 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1540
1541 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1542 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1543 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1544
1545 LLVMValueRef pos[4] = {
1546 buffer_load_const(ctx, resource, offset0),
1547 buffer_load_const(ctx, resource, offset1),
1548 LLVMConstReal(ctx->f32, 0),
1549 LLVMConstReal(ctx->f32, 0)
1550 };
1551
1552 return lp_build_gather_values(gallivm, pos, 4);
1553 }
1554
1555 void si_load_system_value(struct si_shader_context *ctx,
1556 unsigned index,
1557 const struct tgsi_full_declaration *decl)
1558 {
1559 struct lp_build_context *bld = &ctx->bld_base.base;
1560 struct gallivm_state *gallivm = &ctx->gallivm;
1561 LLVMValueRef value = 0;
1562
1563 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1564
1565 switch (decl->Semantic.Name) {
1566 case TGSI_SEMANTIC_INSTANCEID:
1567 value = ctx->abi.instance_id;
1568 break;
1569
1570 case TGSI_SEMANTIC_VERTEXID:
1571 value = LLVMBuildAdd(gallivm->builder,
1572 ctx->abi.vertex_id,
1573 ctx->abi.base_vertex, "");
1574 break;
1575
1576 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1577 /* Unused. Clarify the meaning in indexed vs. non-indexed
1578 * draws if this is ever used again. */
1579 assert(false);
1580 break;
1581
1582 case TGSI_SEMANTIC_BASEVERTEX:
1583 {
1584 /* For non-indexed draws, the base vertex set by the driver
1585 * (for direct draws) or the CP (for indirect draws) is the
1586 * first vertex ID, but GLSL expects 0 to be returned.
1587 */
1588 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1589 LLVMValueRef indexed;
1590
1591 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1592 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1593
1594 value = LLVMBuildSelect(gallivm->builder, indexed,
1595 ctx->abi.base_vertex, ctx->i32_0, "");
1596 break;
1597 }
1598
1599 case TGSI_SEMANTIC_BASEINSTANCE:
1600 value = ctx->abi.start_instance;
1601 break;
1602
1603 case TGSI_SEMANTIC_DRAWID:
1604 value = ctx->abi.draw_id;
1605 break;
1606
1607 case TGSI_SEMANTIC_INVOCATIONID:
1608 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1609 value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1610 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1611 value = LLVMGetParam(ctx->main_fn,
1612 ctx->param_gs_instance_id);
1613 else
1614 assert(!"INVOCATIONID not implemented");
1615 break;
1616
1617 case TGSI_SEMANTIC_POSITION:
1618 {
1619 LLVMValueRef pos[4] = {
1620 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1621 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1622 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1623 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1624 LLVMGetParam(ctx->main_fn,
1625 SI_PARAM_POS_W_FLOAT)),
1626 };
1627 value = lp_build_gather_values(gallivm, pos, 4);
1628 break;
1629 }
1630
1631 case TGSI_SEMANTIC_FACE:
1632 value = ctx->abi.front_face;
1633 break;
1634
1635 case TGSI_SEMANTIC_SAMPLEID:
1636 value = get_sample_id(ctx);
1637 break;
1638
1639 case TGSI_SEMANTIC_SAMPLEPOS: {
1640 LLVMValueRef pos[4] = {
1641 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1642 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1643 LLVMConstReal(ctx->f32, 0),
1644 LLVMConstReal(ctx->f32, 0)
1645 };
1646 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1647 TGSI_OPCODE_FRC, pos[0]);
1648 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1649 TGSI_OPCODE_FRC, pos[1]);
1650 value = lp_build_gather_values(gallivm, pos, 4);
1651 break;
1652 }
1653
1654 case TGSI_SEMANTIC_SAMPLEMASK:
1655 /* This can only occur with the OpenGL Core profile, which
1656 * doesn't support smoothing.
1657 */
1658 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1659 break;
1660
1661 case TGSI_SEMANTIC_TESSCOORD:
1662 {
1663 LLVMValueRef coord[4] = {
1664 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1665 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1666 bld->zero,
1667 bld->zero
1668 };
1669
1670 /* For triangles, the vector should be (u, v, 1-u-v). */
1671 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1672 PIPE_PRIM_TRIANGLES)
1673 coord[2] = lp_build_sub(bld, bld->one,
1674 lp_build_add(bld, coord[0], coord[1]));
1675
1676 value = lp_build_gather_values(gallivm, coord, 4);
1677 break;
1678 }
1679
1680 case TGSI_SEMANTIC_VERTICESIN:
1681 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1682 value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1683 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1684 value = get_num_tcs_out_vertices(ctx);
1685 else
1686 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1687 break;
1688
1689 case TGSI_SEMANTIC_TESSINNER:
1690 case TGSI_SEMANTIC_TESSOUTER:
1691 {
1692 LLVMValueRef buffer, base, addr;
1693 int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);
1694
1695 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1696
1697 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1698 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1699 LLVMConstInt(ctx->i32, param, 0));
1700
1701 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1702 ~0, buffer, base, addr, true);
1703
1704 break;
1705 }
1706
1707 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1708 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1709 {
1710 LLVMValueRef buf, slot, val[4];
1711 int i, offset;
1712
1713 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1714 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1715 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1716 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1717
1718 for (i = 0; i < 4; i++)
1719 val[i] = buffer_load_const(ctx, buf,
1720 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1721 value = lp_build_gather_values(gallivm, val, 4);
1722 break;
1723 }
1724
1725 case TGSI_SEMANTIC_PRIMID:
1726 value = get_primitive_id(ctx, 0);
1727 break;
1728
1729 case TGSI_SEMANTIC_GRID_SIZE:
1730 value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
1731 break;
1732
1733 case TGSI_SEMANTIC_BLOCK_SIZE:
1734 {
1735 LLVMValueRef values[3];
1736 unsigned i;
1737 unsigned *properties = ctx->shader->selector->info.properties;
1738
1739 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1740 unsigned sizes[3] = {
1741 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1742 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1743 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1744 };
1745
1746 for (i = 0; i < 3; ++i)
1747 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1748
1749 value = lp_build_gather_values(gallivm, values, 3);
1750 } else {
1751 value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1752 }
1753 break;
1754 }
1755
1756 case TGSI_SEMANTIC_BLOCK_ID:
1757 {
1758 LLVMValueRef values[3];
1759
1760 for (int i = 0; i < 3; i++) {
1761 values[i] = ctx->i32_0;
1762 if (ctx->param_block_id[i] >= 0) {
1763 values[i] = LLVMGetParam(ctx->main_fn,
1764 ctx->param_block_id[i]);
1765 }
1766 }
1767 value = lp_build_gather_values(gallivm, values, 3);
1768 break;
1769 }
1770
1771 case TGSI_SEMANTIC_THREAD_ID:
1772 value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
1773 break;
1774
1775 case TGSI_SEMANTIC_HELPER_INVOCATION:
1776 value = lp_build_intrinsic(gallivm->builder,
1777 "llvm.amdgcn.ps.live",
1778 ctx->i1, NULL, 0,
1779 LP_FUNC_ATTR_READNONE);
1780 value = LLVMBuildNot(gallivm->builder, value, "");
1781 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1782 break;
1783
1784 case TGSI_SEMANTIC_SUBGROUP_SIZE:
1785 value = LLVMConstInt(ctx->i32, 64, 0);
1786 break;
1787
1788 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1789 value = ac_get_thread_id(&ctx->ac);
1790 break;
1791
1792 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1793 {
1794 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1795 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1796 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1797 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1798 break;
1799 }
1800
1801 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1802 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1803 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1804 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1805 {
1806 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1807 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1808 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1809 /* All bits set except LSB */
1810 value = LLVMConstInt(ctx->i64, -2, 0);
1811 } else {
1812 /* All bits set */
1813 value = LLVMConstInt(ctx->i64, -1, 0);
1814 }
1815 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1816 value = LLVMBuildShl(gallivm->builder, value, id, "");
1817 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1818 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1819 value = LLVMBuildNot(gallivm->builder, value, "");
1820 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1821 break;
1822 }
1823
1824 default:
1825 assert(!"unknown system value");
1826 return;
1827 }
1828
1829 ctx->system_values[index] = value;
1830 }
1831
1832 void si_declare_compute_memory(struct si_shader_context *ctx,
1833 const struct tgsi_full_declaration *decl)
1834 {
1835 struct si_shader_selector *sel = ctx->shader->selector;
1836 struct gallivm_state *gallivm = &ctx->gallivm;
1837
1838 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1839 LLVMValueRef var;
1840
1841 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1842 assert(decl->Range.First == decl->Range.Last);
1843 assert(!ctx->shared_memory);
1844
1845 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1846 LLVMArrayType(ctx->i8, sel->local_size),
1847 "compute_lds",
1848 LOCAL_ADDR_SPACE);
1849 LLVMSetAlignment(var, 4);
1850
1851 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1852 }
1853
1854 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1855 {
1856 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1857 ctx->param_const_and_shader_buffers);
1858
1859 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1860 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1861 }
1862
1863 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
1864 {
1865 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1866 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1867
1868 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
1869 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1870 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1871
1872 return ac_build_indexed_load_const(&ctx->ac, ptr, index);
1873 }
1874
1875 static LLVMValueRef
1876 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
1877 {
1878 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1879 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
1880 ctx->param_const_and_shader_buffers);
1881
1882 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
1883 index = LLVMBuildSub(ctx->gallivm.builder,
1884 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
1885 index, "");
1886
1887 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
1888 }
1889
1890 static LLVMValueRef fetch_constant(
1891 struct lp_build_tgsi_context *bld_base,
1892 const struct tgsi_full_src_register *reg,
1893 enum tgsi_opcode_type type,
1894 unsigned swizzle)
1895 {
1896 struct si_shader_context *ctx = si_shader_context(bld_base);
1897 struct lp_build_context *base = &bld_base->base;
1898 const struct tgsi_ind_register *ireg = &reg->Indirect;
1899 unsigned buf, idx;
1900
1901 LLVMValueRef addr, bufp;
1902 LLVMValueRef result;
1903
1904 if (swizzle == LP_CHAN_ALL) {
1905 unsigned chan;
1906 LLVMValueRef values[4];
1907 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1908 values[chan] = fetch_constant(bld_base, reg, type, chan);
1909
1910 return lp_build_gather_values(&ctx->gallivm, values, 4);
1911 }
1912
1913 assert(reg->Register.Dimension);
1914 buf = reg->Dimension.Index;
1915 idx = reg->Register.Index * 4 + swizzle;
1916
1917 if (reg->Dimension.Indirect) {
1918 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1919 LLVMValueRef index;
1920 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
1921 reg->Dimension.Index,
1922 ctx->num_const_buffers);
1923 index = LLVMBuildAdd(ctx->gallivm.builder, index,
1924 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1925 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1926 } else
1927 bufp = load_const_buffer_desc(ctx, buf);
1928
1929 if (reg->Register.Indirect) {
1930 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1931 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1932 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1933 addr = lp_build_add(&bld_base->uint_bld, addr,
1934 LLVMConstInt(ctx->i32, idx * 4, 0));
1935 } else {
1936 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1937 }
1938
1939 result = buffer_load_const(ctx, bufp, addr);
1940
1941 if (!tgsi_type_is_64bit(type))
1942 result = bitcast(bld_base, type, result);
1943 else {
1944 LLVMValueRef addr2, result2;
1945
1946 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1947 LLVMConstInt(ctx->i32, 4, 0));
1948 result2 = buffer_load_const(ctx, bufp, addr2);
1949
1950 result = si_llvm_emit_fetch_64bit(bld_base, type,
1951 result, result2);
1952 }
1953 return result;
1954 }
1955
1956 /* Upper 16 bits must be zero. */
1957 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1958 LLVMValueRef val[2])
1959 {
1960 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1961 LLVMBuildShl(ctx->gallivm.builder, val[1],
1962 LLVMConstInt(ctx->i32, 16, 0),
1963 ""), "");
1964 }
1965
1966 /* Upper 16 bits are ignored and will be dropped. */
1967 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1968 LLVMValueRef val[2])
1969 {
1970 LLVMValueRef v[2] = {
1971 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1972 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1973 val[1],
1974 };
1975 return si_llvm_pack_two_int16(ctx, v);
1976 }
1977
1978 /* Initialize arguments for the shader export intrinsic */
1979 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1980 LLVMValueRef *values,
1981 unsigned target,
1982 struct ac_export_args *args)
1983 {
1984 struct si_shader_context *ctx = si_shader_context(bld_base);
1985 struct lp_build_context *base = &bld_base->base;
1986 LLVMBuilderRef builder = ctx->gallivm.builder;
1987 LLVMValueRef val[4];
1988 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1989 unsigned chan;
1990 bool is_int8, is_int10;
1991
1992 /* Default is 0xf. Adjusted below depending on the format. */
1993 args->enabled_channels = 0xf; /* writemask */
1994
1995 /* Specify whether the EXEC mask represents the valid mask */
1996 args->valid_mask = 0;
1997
1998 /* Specify whether this is the last export */
1999 args->done = 0;
2000
2001 /* Specify the target we are exporting */
2002 args->target = target;
2003
2004 if (ctx->type == PIPE_SHADER_FRAGMENT) {
2005 const struct si_shader_key *key = &ctx->shader->key;
2006 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2007 int cbuf = target - V_008DFC_SQ_EXP_MRT;
2008
2009 assert(cbuf >= 0 && cbuf < 8);
2010 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2011 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2012 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2013 }
2014
2015 args->compr = false;
2016 args->out[0] = base->undef;
2017 args->out[1] = base->undef;
2018 args->out[2] = base->undef;
2019 args->out[3] = base->undef;
2020
2021 switch (spi_shader_col_format) {
2022 case V_028714_SPI_SHADER_ZERO:
2023 args->enabled_channels = 0; /* writemask */
2024 args->target = V_008DFC_SQ_EXP_NULL;
2025 break;
2026
2027 case V_028714_SPI_SHADER_32_R:
2028 args->enabled_channels = 1; /* writemask */
2029 args->out[0] = values[0];
2030 break;
2031
2032 case V_028714_SPI_SHADER_32_GR:
2033 args->enabled_channels = 0x3; /* writemask */
2034 args->out[0] = values[0];
2035 args->out[1] = values[1];
2036 break;
2037
2038 case V_028714_SPI_SHADER_32_AR:
2039 args->enabled_channels = 0x9; /* writemask */
2040 args->out[0] = values[0];
2041 args->out[3] = values[3];
2042 break;
2043
2044 case V_028714_SPI_SHADER_FP16_ABGR:
2045 args->compr = 1; /* COMPR flag */
2046
2047 for (chan = 0; chan < 2; chan++) {
2048 LLVMValueRef pack_args[2] = {
2049 values[2 * chan],
2050 values[2 * chan + 1]
2051 };
2052 LLVMValueRef packed;
2053
2054 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
2055 args->out[chan] =
2056 LLVMBuildBitCast(ctx->gallivm.builder,
2057 packed, ctx->f32, "");
2058 }
2059 break;
2060
2061 case V_028714_SPI_SHADER_UNORM16_ABGR:
2062 for (chan = 0; chan < 4; chan++) {
2063 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
2064 val[chan] = LLVMBuildFMul(builder, val[chan],
2065 LLVMConstReal(ctx->f32, 65535), "");
2066 val[chan] = LLVMBuildFAdd(builder, val[chan],
2067 LLVMConstReal(ctx->f32, 0.5), "");
2068 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2069 ctx->i32, "");
2070 }
2071
2072 args->compr = 1; /* COMPR flag */
2073 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2074 si_llvm_pack_two_int16(ctx, val));
2075 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2076 si_llvm_pack_two_int16(ctx, val+2));
2077 break;
2078
2079 case V_028714_SPI_SHADER_SNORM16_ABGR:
2080 for (chan = 0; chan < 4; chan++) {
2081 /* Clamp between [-1, 1]. */
2082 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2083 values[chan],
2084 LLVMConstReal(ctx->f32, 1));
2085 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2086 val[chan],
2087 LLVMConstReal(ctx->f32, -1));
2088 /* Convert to a signed integer in [-32767, 32767]. */
2089 val[chan] = LLVMBuildFMul(builder, val[chan],
2090 LLVMConstReal(ctx->f32, 32767), "");
2091 /* If positive, add 0.5, else add -0.5. */
2092 val[chan] = LLVMBuildFAdd(builder, val[chan],
2093 LLVMBuildSelect(builder,
2094 LLVMBuildFCmp(builder, LLVMRealOGE,
2095 val[chan], base->zero, ""),
2096 LLVMConstReal(ctx->f32, 0.5),
2097 LLVMConstReal(ctx->f32, -0.5), ""), "");
2098 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2099 }
2100
2101 args->compr = 1; /* COMPR flag */
2102 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2103 si_llvm_pack_two_int32_as_int16(ctx, val));
2104 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2105 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2106 break;
2107
2108 case V_028714_SPI_SHADER_UINT16_ABGR: {
2109 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2110 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2111 LLVMValueRef max_alpha =
2112 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2113
2114 /* Clamp. */
2115 for (chan = 0; chan < 4; chan++) {
2116 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2117 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2118 val[chan],
2119 chan == 3 ? max_alpha : max_rgb);
2120 }
2121
2122 args->compr = 1; /* COMPR flag */
2123 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2124 si_llvm_pack_two_int16(ctx, val));
2125 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2126 si_llvm_pack_two_int16(ctx, val+2));
2127 break;
2128 }
2129
2130 case V_028714_SPI_SHADER_SINT16_ABGR: {
2131 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2132 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2133 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2134 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2135 LLVMValueRef max_alpha =
2136 !is_int10 ? max_rgb : ctx->i32_1;
2137 LLVMValueRef min_alpha =
2138 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2139
2140 /* Clamp. */
2141 for (chan = 0; chan < 4; chan++) {
2142 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2143 val[chan] = lp_build_emit_llvm_binary(bld_base,
2144 TGSI_OPCODE_IMIN,
2145 val[chan], chan == 3 ? max_alpha : max_rgb);
2146 val[chan] = lp_build_emit_llvm_binary(bld_base,
2147 TGSI_OPCODE_IMAX,
2148 val[chan], chan == 3 ? min_alpha : min_rgb);
2149 }
2150
2151 args->compr = 1; /* COMPR flag */
2152 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2153 si_llvm_pack_two_int32_as_int16(ctx, val));
2154 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2155 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2156 break;
2157 }
2158
2159 case V_028714_SPI_SHADER_32_ABGR:
2160 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2161 break;
2162 }
2163 }
2164
2165 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2166 LLVMValueRef alpha)
2167 {
2168 struct si_shader_context *ctx = si_shader_context(bld_base);
2169
2170 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2171 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2172 SI_PARAM_ALPHA_REF);
2173
2174 LLVMValueRef alpha_pass =
2175 lp_build_cmp(&bld_base->base,
2176 ctx->shader->key.part.ps.epilog.alpha_func,
2177 alpha, alpha_ref);
2178 LLVMValueRef arg =
2179 lp_build_select(&bld_base->base,
2180 alpha_pass,
2181 LLVMConstReal(ctx->f32, 1.0f),
2182 LLVMConstReal(ctx->f32, -1.0f));
2183
2184 ac_build_kill(&ctx->ac, arg);
2185 } else {
2186 ac_build_kill(&ctx->ac, NULL);
2187 }
2188 }
2189
2190 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2191 LLVMValueRef alpha,
2192 unsigned samplemask_param)
2193 {
2194 struct si_shader_context *ctx = si_shader_context(bld_base);
2195 struct gallivm_state *gallivm = &ctx->gallivm;
2196 LLVMValueRef coverage;
2197
2198 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2199 coverage = LLVMGetParam(ctx->main_fn,
2200 samplemask_param);
2201 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2202
2203 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2204 ctx->i32,
2205 &coverage, 1, LP_FUNC_ATTR_READNONE);
2206
2207 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2208 ctx->f32, "");
2209
2210 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2211 LLVMConstReal(ctx->f32,
2212 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2213
2214 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2215 }
2216
2217 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2218 struct ac_export_args *pos, LLVMValueRef *out_elts)
2219 {
2220 struct si_shader_context *ctx = si_shader_context(bld_base);
2221 struct lp_build_context *base = &bld_base->base;
2222 unsigned reg_index;
2223 unsigned chan;
2224 unsigned const_chan;
2225 LLVMValueRef base_elt;
2226 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2227 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2228 SI_VS_CONST_CLIP_PLANES, 0);
2229 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2230
2231 for (reg_index = 0; reg_index < 2; reg_index ++) {
2232 struct ac_export_args *args = &pos[2 + reg_index];
2233
2234 args->out[0] =
2235 args->out[1] =
2236 args->out[2] =
2237 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2238
2239 /* Compute dot products of position and user clip plane vectors */
2240 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2241 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2242 LLVMValueRef addr =
2243 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2244 const_chan) * 4, 0);
2245 base_elt = buffer_load_const(ctx, const_resource,
2246 addr);
2247 args->out[chan] =
2248 lp_build_add(base, args->out[chan],
2249 lp_build_mul(base, base_elt,
2250 out_elts[const_chan]));
2251 }
2252 }
2253
2254 args->enabled_channels = 0xf;
2255 args->valid_mask = 0;
2256 args->done = 0;
2257 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2258 args->compr = 0;
2259 }
2260 }
2261
2262 static void si_dump_streamout(struct pipe_stream_output_info *so)
2263 {
2264 unsigned i;
2265
2266 if (so->num_outputs)
2267 fprintf(stderr, "STREAMOUT\n");
2268
2269 for (i = 0; i < so->num_outputs; i++) {
2270 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2271 so->output[i].start_component;
2272 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2273 i, so->output[i].output_buffer,
2274 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2275 so->output[i].register_index,
2276 mask & 1 ? "x" : "",
2277 mask & 2 ? "y" : "",
2278 mask & 4 ? "z" : "",
2279 mask & 8 ? "w" : "");
2280 }
2281 }
2282
2283 static void emit_streamout_output(struct si_shader_context *ctx,
2284 LLVMValueRef const *so_buffers,
2285 LLVMValueRef const *so_write_offsets,
2286 struct pipe_stream_output *stream_out,
2287 struct si_shader_output_values *shader_out)
2288 {
2289 struct gallivm_state *gallivm = &ctx->gallivm;
2290 LLVMBuilderRef builder = gallivm->builder;
2291 unsigned buf_idx = stream_out->output_buffer;
2292 unsigned start = stream_out->start_component;
2293 unsigned num_comps = stream_out->num_components;
2294 LLVMValueRef out[4];
2295
2296 assert(num_comps && num_comps <= 4);
2297 if (!num_comps || num_comps > 4)
2298 return;
2299
2300 /* Load the output as int. */
2301 for (int j = 0; j < num_comps; j++) {
2302 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2303
2304 out[j] = LLVMBuildBitCast(builder,
2305 shader_out->values[start + j],
2306 ctx->i32, "");
2307 }
2308
2309 /* Pack the output. */
2310 LLVMValueRef vdata = NULL;
2311
2312 switch (num_comps) {
2313 case 1: /* as i32 */
2314 vdata = out[0];
2315 break;
2316 case 2: /* as v2i32 */
2317 case 3: /* as v4i32 (aligned to 4) */
2318 case 4: /* as v4i32 */
2319 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2320 for (int j = 0; j < num_comps; j++) {
2321 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2322 LLVMConstInt(ctx->i32, j, 0), "");
2323 }
2324 break;
2325 }
2326
2327 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2328 vdata, num_comps,
2329 so_write_offsets[buf_idx],
2330 ctx->i32_0,
2331 stream_out->dst_offset * 4, 1, 1, true, false);
2332 }
2333
2334 /**
2335 * Write streamout data to buffers for vertex stream @p stream (different
2336 * vertex streams can occur for GS copy shaders).
2337 */
2338 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2339 struct si_shader_output_values *outputs,
2340 unsigned noutput, unsigned stream)
2341 {
2342 struct si_shader_selector *sel = ctx->shader->selector;
2343 struct pipe_stream_output_info *so = &sel->so;
2344 struct gallivm_state *gallivm = &ctx->gallivm;
2345 LLVMBuilderRef builder = gallivm->builder;
2346 int i;
2347 struct lp_build_if_state if_ctx;
2348
2349 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2350 LLVMValueRef so_vtx_count =
2351 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2352
2353 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2354
2355 /* can_emit = tid < so_vtx_count; */
2356 LLVMValueRef can_emit =
2357 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2358
2359 /* Emit the streamout code conditionally. This actually avoids
2360 * out-of-bounds buffer access. The hw tells us via the SGPR
2361 * (so_vtx_count) which threads are allowed to emit streamout data. */
2362 lp_build_if(&if_ctx, gallivm, can_emit);
2363 {
2364 /* The buffer offset is computed as follows:
2365 * ByteOffset = streamout_offset[buffer_id]*4 +
2366 * (streamout_write_index + thread_id)*stride[buffer_id] +
2367 * attrib_offset
2368 */
2369
2370 LLVMValueRef so_write_index =
2371 LLVMGetParam(ctx->main_fn,
2372 ctx->param_streamout_write_index);
2373
2374 /* Compute (streamout_write_index + thread_id). */
2375 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2376
2377 /* Load the descriptor and compute the write offset for each
2378 * enabled buffer. */
2379 LLVMValueRef so_write_offset[4] = {};
2380 LLVMValueRef so_buffers[4];
2381 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2382 ctx->param_rw_buffers);
2383
2384 for (i = 0; i < 4; i++) {
2385 if (!so->stride[i])
2386 continue;
2387
2388 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2389 SI_VS_STREAMOUT_BUF0 + i, 0);
2390
2391 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2392
2393 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2394 ctx->param_streamout_offset[i]);
2395 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2396
2397 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2398 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2399 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2400 }
2401
2402 /* Write streamout data. */
2403 for (i = 0; i < so->num_outputs; i++) {
2404 unsigned reg = so->output[i].register_index;
2405
2406 if (reg >= noutput)
2407 continue;
2408
2409 if (stream != so->output[i].stream)
2410 continue;
2411
2412 emit_streamout_output(ctx, so_buffers, so_write_offset,
2413 &so->output[i], &outputs[reg]);
2414 }
2415 }
2416 lp_build_endif(&if_ctx);
2417 }
2418
2419 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2420 LLVMValueRef *values)
2421 {
2422 struct ac_export_args args;
2423
2424 si_llvm_init_export_args(&ctx->bld_base, values,
2425 V_008DFC_SQ_EXP_PARAM + index, &args);
2426 ac_build_export(&ctx->ac, &args);
2427 }
2428
2429 static void si_build_param_exports(struct si_shader_context *ctx,
2430 struct si_shader_output_values *outputs,
2431 unsigned noutput)
2432 {
2433 struct si_shader *shader = ctx->shader;
2434 unsigned param_count = 0;
2435
2436 for (unsigned i = 0; i < noutput; i++) {
2437 unsigned semantic_name = outputs[i].semantic_name;
2438 unsigned semantic_index = outputs[i].semantic_index;
2439
2440 if (outputs[i].vertex_stream[0] != 0 &&
2441 outputs[i].vertex_stream[1] != 0 &&
2442 outputs[i].vertex_stream[2] != 0 &&
2443 outputs[i].vertex_stream[3] != 0)
2444 continue;
2445
2446 switch (semantic_name) {
2447 case TGSI_SEMANTIC_LAYER:
2448 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2449 case TGSI_SEMANTIC_CLIPDIST:
2450 case TGSI_SEMANTIC_COLOR:
2451 case TGSI_SEMANTIC_BCOLOR:
2452 case TGSI_SEMANTIC_PRIMID:
2453 case TGSI_SEMANTIC_FOG:
2454 case TGSI_SEMANTIC_TEXCOORD:
2455 case TGSI_SEMANTIC_GENERIC:
2456 break;
2457 default:
2458 continue;
2459 }
2460
2461 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2462 semantic_index < SI_MAX_IO_GENERIC) &&
2463 shader->key.opt.kill_outputs &
2464 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2465 continue;
2466
2467 si_export_param(ctx, param_count, outputs[i].values);
2468
2469 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2470 shader->info.vs_output_param_offset[i] = param_count++;
2471 }
2472
2473 shader->info.nr_param_exports = param_count;
2474 }
2475
2476 /* Generate export instructions for hardware VS shader stage */
2477 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2478 struct si_shader_output_values *outputs,
2479 unsigned noutput)
2480 {
2481 struct si_shader_context *ctx = si_shader_context(bld_base);
2482 struct si_shader *shader = ctx->shader;
2483 struct lp_build_context *base = &bld_base->base;
2484 struct ac_export_args pos_args[4] = {};
2485 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2486 unsigned pos_idx;
2487 int i;
2488
2489 /* Build position exports. */
2490 for (i = 0; i < noutput; i++) {
2491 switch (outputs[i].semantic_name) {
2492 case TGSI_SEMANTIC_POSITION:
2493 si_llvm_init_export_args(bld_base, outputs[i].values,
2494 V_008DFC_SQ_EXP_POS, &pos_args[0]);
2495 break;
2496 case TGSI_SEMANTIC_PSIZE:
2497 psize_value = outputs[i].values[0];
2498 break;
2499 case TGSI_SEMANTIC_LAYER:
2500 layer_value = outputs[i].values[0];
2501 break;
2502 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2503 viewport_index_value = outputs[i].values[0];
2504 break;
2505 case TGSI_SEMANTIC_EDGEFLAG:
2506 edgeflag_value = outputs[i].values[0];
2507 break;
2508 case TGSI_SEMANTIC_CLIPDIST:
2509 if (!shader->key.opt.clip_disable) {
2510 unsigned index = 2 + outputs[i].semantic_index;
2511 si_llvm_init_export_args(bld_base, outputs[i].values,
2512 V_008DFC_SQ_EXP_POS + index,
2513 &pos_args[index]);
2514 }
2515 break;
2516 case TGSI_SEMANTIC_CLIPVERTEX:
2517 if (!shader->key.opt.clip_disable) {
2518 si_llvm_emit_clipvertex(bld_base, pos_args,
2519 outputs[i].values);
2520 }
2521 break;
2522 }
2523 }
2524
2525 /* We need to add the position output manually if it's missing. */
2526 if (!pos_args[0].out[0]) {
2527 pos_args[0].enabled_channels = 0xf; /* writemask */
2528 pos_args[0].valid_mask = 0; /* EXEC mask */
2529 pos_args[0].done = 0; /* last export? */
2530 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2531 pos_args[0].compr = 0; /* COMPR flag */
2532 pos_args[0].out[0] = base->zero; /* X */
2533 pos_args[0].out[1] = base->zero; /* Y */
2534 pos_args[0].out[2] = base->zero; /* Z */
2535 pos_args[0].out[3] = base->one; /* W */
2536 }
2537
2538 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2539 if (shader->selector->info.writes_psize ||
2540 shader->selector->info.writes_edgeflag ||
2541 shader->selector->info.writes_viewport_index ||
2542 shader->selector->info.writes_layer) {
2543 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2544 (shader->selector->info.writes_edgeflag << 1) |
2545 (shader->selector->info.writes_layer << 2);
2546
2547 pos_args[1].valid_mask = 0; /* EXEC mask */
2548 pos_args[1].done = 0; /* last export? */
2549 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2550 pos_args[1].compr = 0; /* COMPR flag */
2551 pos_args[1].out[0] = base->zero; /* X */
2552 pos_args[1].out[1] = base->zero; /* Y */
2553 pos_args[1].out[2] = base->zero; /* Z */
2554 pos_args[1].out[3] = base->zero; /* W */
2555
2556 if (shader->selector->info.writes_psize)
2557 pos_args[1].out[0] = psize_value;
2558
2559 if (shader->selector->info.writes_edgeflag) {
2560 /* The output is a float, but the hw expects an integer
2561 * with the first bit containing the edge flag. */
2562 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2563 edgeflag_value,
2564 ctx->i32, "");
2565 edgeflag_value = ac_build_umin(&ctx->ac,
2566 edgeflag_value,
2567 ctx->i32_1);
2568
2569 /* The LLVM intrinsic expects a float. */
2570 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2571 edgeflag_value,
2572 ctx->f32, "");
2573 }
2574
2575 if (ctx->screen->b.chip_class >= GFX9) {
2576 /* GFX9 has the layer in out.z[10:0] and the viewport
2577 * index in out.z[19:16].
2578 */
2579 if (shader->selector->info.writes_layer)
2580 pos_args[1].out[2] = layer_value;
2581
2582 if (shader->selector->info.writes_viewport_index) {
2583 LLVMValueRef v = viewport_index_value;
2584
2585 v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
2586 v = LLVMBuildShl(ctx->gallivm.builder, v,
2587 LLVMConstInt(ctx->i32, 16, 0), "");
2588 v = LLVMBuildOr(ctx->gallivm.builder, v,
2589 bitcast(bld_base, TGSI_TYPE_UNSIGNED,
2590 pos_args[1].out[2]), "");
2591 pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
2592 pos_args[1].enabled_channels |= 1 << 2;
2593 }
2594 } else {
2595 if (shader->selector->info.writes_layer)
2596 pos_args[1].out[2] = layer_value;
2597
2598 if (shader->selector->info.writes_viewport_index) {
2599 pos_args[1].out[3] = viewport_index_value;
2600 pos_args[1].enabled_channels |= 1 << 3;
2601 }
2602 }
2603 }
2604
2605 for (i = 0; i < 4; i++)
2606 if (pos_args[i].out[0])
2607 shader->info.nr_pos_exports++;
2608
2609 pos_idx = 0;
2610 for (i = 0; i < 4; i++) {
2611 if (!pos_args[i].out[0])
2612 continue;
2613
2614 /* Specify the target we are exporting */
2615 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2616
2617 if (pos_idx == shader->info.nr_pos_exports)
2618 /* Specify that this is the last export */
2619 pos_args[i].done = 1;
2620
2621 ac_build_export(&ctx->ac, &pos_args[i]);
2622 }
2623
2624 /* Build parameter exports. */
2625 si_build_param_exports(ctx, outputs, noutput);
2626 }
2627
2628 /**
2629 * Forward all outputs from the vertex shader to the TES. This is only used
2630 * for the fixed function TCS.
2631 */
2632 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2633 {
2634 struct si_shader_context *ctx = si_shader_context(bld_base);
2635 struct gallivm_state *gallivm = &ctx->gallivm;
2636 LLVMValueRef invocation_id, buffer, buffer_offset;
2637 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2638 uint64_t inputs;
2639
2640 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2641 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2642 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2643
2644 lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
2645 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2646 lds_vertex_stride, "");
2647 lds_base = get_tcs_in_current_patch_offset(ctx);
2648 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2649
2650 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2651 while (inputs) {
2652 unsigned i = u_bit_scan64(&inputs);
2653
2654 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2655 LLVMConstInt(ctx->i32, 4 * i, 0),
2656 "");
2657
2658 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2659 get_rel_patch_id(ctx),
2660 invocation_id,
2661 LLVMConstInt(ctx->i32, i, 0));
2662
2663 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2664 lds_ptr);
2665
2666 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2667 buffer_offset, 0, 1, 0, true, false);
2668 }
2669 }
2670
2671 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2672 LLVMValueRef rel_patch_id,
2673 LLVMValueRef invocation_id,
2674 LLVMValueRef tcs_out_current_patch_data_offset)
2675 {
2676 struct si_shader_context *ctx = si_shader_context(bld_base);
2677 struct gallivm_state *gallivm = &ctx->gallivm;
2678 struct si_shader *shader = ctx->shader;
2679 unsigned tess_inner_index, tess_outer_index;
2680 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2681 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2682 unsigned stride, outer_comps, inner_comps, i, offset;
2683 struct lp_build_if_state if_ctx, inner_if_ctx;
2684
2685 si_llvm_emit_barrier(NULL, bld_base, NULL);
2686
2687 /* Do this only for invocation 0, because the tess levels are per-patch,
2688 * not per-vertex.
2689 *
2690 * This can't jump, because invocation 0 executes this. It should
2691 * at least mask out the loads and stores for other invocations.
2692 */
2693 lp_build_if(&if_ctx, gallivm,
2694 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2695 invocation_id, ctx->i32_0, ""));
2696
2697 /* Determine the layout of one tess factor element in the buffer. */
2698 switch (shader->key.part.tcs.epilog.prim_mode) {
2699 case PIPE_PRIM_LINES:
2700 stride = 2; /* 2 dwords, 1 vec2 store */
2701 outer_comps = 2;
2702 inner_comps = 0;
2703 break;
2704 case PIPE_PRIM_TRIANGLES:
2705 stride = 4; /* 4 dwords, 1 vec4 store */
2706 outer_comps = 3;
2707 inner_comps = 1;
2708 break;
2709 case PIPE_PRIM_QUADS:
2710 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2711 outer_comps = 4;
2712 inner_comps = 2;
2713 break;
2714 default:
2715 assert(0);
2716 return;
2717 }
2718
2719 /* Load tess_inner and tess_outer from LDS.
2720 * Any invocation can write them, so we can't get them from a temporary.
2721 */
2722 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
2723 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
2724
2725 lds_base = tcs_out_current_patch_data_offset;
2726 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2727 LLVMConstInt(ctx->i32,
2728 tess_inner_index * 4, 0), "");
2729 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2730 LLVMConstInt(ctx->i32,
2731 tess_outer_index * 4, 0), "");
2732
2733 for (i = 0; i < 4; i++) {
2734 inner[i] = LLVMGetUndef(ctx->i32);
2735 outer[i] = LLVMGetUndef(ctx->i32);
2736 }
2737
2738 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2739 /* For isolines, the hardware expects tess factors in the
2740 * reverse order from what GLSL / TGSI specify.
2741 */
2742 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2743 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2744 } else {
2745 for (i = 0; i < outer_comps; i++) {
2746 outer[i] = out[i] =
2747 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2748 }
2749 for (i = 0; i < inner_comps; i++) {
2750 inner[i] = out[outer_comps+i] =
2751 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2752 }
2753 }
2754
2755 /* Convert the outputs to vectors for stores. */
2756 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2757 vec1 = NULL;
2758
2759 if (stride > 4)
2760 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2761
2762 /* Get the buffer. */
2763 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2764
2765 /* Get the offset. */
2766 tf_base = LLVMGetParam(ctx->main_fn,
2767 ctx->param_tcs_factor_offset);
2768 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2769 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2770
2771 lp_build_if(&inner_if_ctx, gallivm,
2772 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2773 rel_patch_id, ctx->i32_0, ""));
2774
2775 /* Store the dynamic HS control word. */
2776 offset = 0;
2777 if (ctx->screen->b.chip_class <= VI) {
2778 ac_build_buffer_store_dword(&ctx->ac, buffer,
2779 LLVMConstInt(ctx->i32, 0x80000000, 0),
2780 1, ctx->i32_0, tf_base,
2781 offset, 1, 0, true, false);
2782 offset += 4;
2783 }
2784
2785 lp_build_endif(&inner_if_ctx);
2786
2787 /* Store the tessellation factors. */
2788 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2789 MIN2(stride, 4), byteoffset, tf_base,
2790 offset, 1, 0, true, false);
2791 offset += 16;
2792 if (vec1)
2793 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2794 stride - 4, byteoffset, tf_base,
2795 offset, 1, 0, true, false);
2796
2797 /* Store the tess factors into the offchip buffer if TES reads them. */
2798 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2799 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2800 LLVMValueRef tf_inner_offset;
2801 unsigned param_outer, param_inner;
2802
2803 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2804 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2805
2806 param_outer = si_shader_io_get_unique_index_patch(
2807 TGSI_SEMANTIC_TESSOUTER, 0);
2808 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2809 LLVMConstInt(ctx->i32, param_outer, 0));
2810
2811 outer_vec = lp_build_gather_values(gallivm, outer,
2812 util_next_power_of_two(outer_comps));
2813
2814 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2815 outer_comps, tf_outer_offset,
2816 base, 0, 1, 0, true, false);
2817 if (inner_comps) {
2818 param_inner = si_shader_io_get_unique_index_patch(
2819 TGSI_SEMANTIC_TESSINNER, 0);
2820 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2821 LLVMConstInt(ctx->i32, param_inner, 0));
2822
2823 inner_vec = inner_comps == 1 ? inner[0] :
2824 lp_build_gather_values(gallivm, inner, inner_comps);
2825 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2826 inner_comps, tf_inner_offset,
2827 base, 0, 1, 0, true, false);
2828 }
2829 }
2830
2831 lp_build_endif(&if_ctx);
2832 }
2833
2834 static LLVMValueRef
2835 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2836 unsigned param, unsigned return_index)
2837 {
2838 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2839 LLVMGetParam(ctx->main_fn, param),
2840 return_index, "");
2841 }
2842
2843 static LLVMValueRef
2844 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2845 unsigned param, unsigned return_index)
2846 {
2847 LLVMBuilderRef builder = ctx->gallivm.builder;
2848 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2849
2850 return LLVMBuildInsertValue(builder, ret,
2851 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2852 return_index, "");
2853 }
2854
2855 static LLVMValueRef
2856 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2857 unsigned param, unsigned return_index)
2858 {
2859 LLVMBuilderRef builder = ctx->gallivm.builder;
2860 LLVMValueRef ptr, lo, hi;
2861
2862 ptr = LLVMGetParam(ctx->main_fn, param);
2863 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2864 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2865 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2866 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2867 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2868 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2869 }
2870
2871 /* This only writes the tessellation factor levels. */
2872 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2873 {
2874 struct si_shader_context *ctx = si_shader_context(bld_base);
2875 LLVMBuilderRef builder = ctx->gallivm.builder;
2876 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2877
2878 si_copy_tcs_inputs(bld_base);
2879
2880 rel_patch_id = get_rel_patch_id(ctx);
2881 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2882 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2883
2884 if (ctx->screen->b.chip_class >= GFX9) {
2885 LLVMBasicBlockRef blocks[2] = {
2886 LLVMGetInsertBlock(builder),
2887 ctx->merged_wrap_if_state.entry_block
2888 };
2889 LLVMValueRef values[2];
2890
2891 lp_build_endif(&ctx->merged_wrap_if_state);
2892
2893 values[0] = rel_patch_id;
2894 values[1] = LLVMGetUndef(ctx->i32);
2895 rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2896
2897 values[0] = tf_lds_offset;
2898 values[1] = LLVMGetUndef(ctx->i32);
2899 tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2900
2901 values[0] = invocation_id;
2902 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
2903 invocation_id = build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
2904 }
2905
2906 /* Return epilog parameters from this function. */
2907 LLVMValueRef ret = ctx->return_value;
2908 unsigned vgpr;
2909
2910 if (ctx->screen->b.chip_class >= GFX9) {
2911 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2912 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2913 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2914 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2915 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2916 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2917 /* Tess offchip and tess factor offsets are at the beginning. */
2918 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2919 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2920 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
2921 } else {
2922 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2923 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2924 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2925 GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2926 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2927 GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
2928 /* Tess offchip and tess factor offsets are after user SGPRs. */
2929 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
2930 GFX6_TCS_NUM_USER_SGPR);
2931 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
2932 GFX6_TCS_NUM_USER_SGPR + 1);
2933 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2934 }
2935
2936 /* VGPRs */
2937 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2938 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2939 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2940
2941 /* Leave a hole corresponding to the two input VGPRs. This ensures that
2942 * the invocation_id output does not alias the param_tcs_rel_ids input,
2943 * which saves a V_MOV on gfx9.
2944 */
2945 vgpr += 2;
2946
2947 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2948 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2949 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2950 ctx->return_value = ret;
2951 }
2952
2953 /* Pass TCS inputs from LS to TCS on GFX9. */
2954 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2955 {
2956 LLVMValueRef ret = ctx->return_value;
2957
2958 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2959 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2960 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2961 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2962 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2963 ret = si_insert_input_ptr_as_2xi32(ctx, ret,
2964 ctx->param_bindless_samplers_and_images,
2965 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
2966
2967 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2968 8 + SI_SGPR_VS_STATE_BITS);
2969 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2970 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2971 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2972 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2973 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2974 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2975 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2976 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2977 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2978 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2979
2980 unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
2981 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2982 8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
2983 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2984 8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);
2985
2986 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2987 ret = si_insert_input_ret_float(ctx, ret,
2988 ctx->param_tcs_patch_id, vgpr++);
2989 ret = si_insert_input_ret_float(ctx, ret,
2990 ctx->param_tcs_rel_ids, vgpr++);
2991 ctx->return_value = ret;
2992 }
2993
2994 /* Pass GS inputs from ES to GS on GFX9. */
2995 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2996 {
2997 LLVMValueRef ret = ctx->return_value;
2998
2999 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
3000 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
3001 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3002
3003 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3004 ret = si_insert_input_ptr_as_2xi32(ctx, ret,
3005 ctx->param_bindless_samplers_and_images,
3006 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3007
3008 unsigned desc_param = ctx->param_vs_state_bits + 1;
3009 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
3010 8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
3011 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
3012 8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);
3013
3014 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
3015 for (unsigned i = 0; i < 5; i++) {
3016 unsigned param = ctx->param_gs_vtx01_offset + i;
3017 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
3018 }
3019 ctx->return_value = ret;
3020 }
3021
3022 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
3023 {
3024 struct si_shader_context *ctx = si_shader_context(bld_base);
3025 struct si_shader *shader = ctx->shader;
3026 struct tgsi_shader_info *info = &shader->selector->info;
3027 struct gallivm_state *gallivm = &ctx->gallivm;
3028 unsigned i, chan;
3029 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
3030 ctx->param_rel_auto_id);
3031 LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
3032 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
3033 vertex_dw_stride, "");
3034
3035 /* Write outputs to LDS. The next shader (TCS aka HS) will read
3036 * its inputs from it. */
3037 for (i = 0; i < info->num_outputs; i++) {
3038 LLVMValueRef *out_ptr = ctx->outputs[i];
3039 unsigned name = info->output_semantic_name[i];
3040 unsigned index = info->output_semantic_index[i];
3041
3042 /* The ARB_shader_viewport_layer_array spec contains the
3043 * following issue:
3044 *
3045 * 2) What happens if gl_ViewportIndex or gl_Layer is
3046 * written in the vertex shader and a geometry shader is
3047 * present?
3048 *
3049 * RESOLVED: The value written by the last vertex processing
3050 * stage is used. If the last vertex processing stage
3051 * (vertex, tessellation evaluation or geometry) does not
3052 * statically assign to gl_ViewportIndex or gl_Layer, index
3053 * or layer zero is assumed.
3054 *
3055 * So writes to those outputs in VS-as-LS are simply ignored.
3056 */
3057 if (name == TGSI_SEMANTIC_LAYER ||
3058 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
3059 continue;
3060
3061 int param = si_shader_io_get_unique_index(name, index);
3062 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
3063 LLVMConstInt(ctx->i32, param * 4, 0), "");
3064
3065 for (chan = 0; chan < 4; chan++) {
3066 lds_store(bld_base, chan, dw_addr,
3067 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
3068 }
3069 }
3070
3071 if (ctx->screen->b.chip_class >= GFX9)
3072 si_set_ls_return_value_for_tcs(ctx);
3073 }
3074
3075 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
3076 {
3077 struct si_shader_context *ctx = si_shader_context(bld_base);
3078 struct gallivm_state *gallivm = &ctx->gallivm;
3079 struct si_shader *es = ctx->shader;
3080 struct tgsi_shader_info *info = &es->selector->info;
3081 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3082 ctx->param_es2gs_offset);
3083 LLVMValueRef lds_base = NULL;
3084 unsigned chan;
3085 int i;
3086
3087 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
3088 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
3089 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
3090 LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
3091 vertex_idx = LLVMBuildOr(gallivm->builder, vertex_idx,
3092 LLVMBuildMul(gallivm->builder, wave_idx,
3093 LLVMConstInt(ctx->i32, 64, false), ""), "");
3094 lds_base = LLVMBuildMul(gallivm->builder, vertex_idx,
3095 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
3096 }
3097
3098 for (i = 0; i < info->num_outputs; i++) {
3099 LLVMValueRef *out_ptr = ctx->outputs[i];
3100 int param;
3101
3102 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
3103 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
3104 continue;
3105
3106 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
3107 info->output_semantic_index[i]);
3108
3109 for (chan = 0; chan < 4; chan++) {
3110 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
3111 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
3112
3113 /* GFX9 has the ESGS ring in LDS. */
3114 if (ctx->screen->b.chip_class >= GFX9) {
3115 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
3116 continue;
3117 }
3118
3119 ac_build_buffer_store_dword(&ctx->ac,
3120 ctx->esgs_ring,
3121 out_val, 1, NULL, soffset,
3122 (4 * param + chan) * 4,
3123 1, 1, true, true);
3124 }
3125 }
3126
3127 if (ctx->screen->b.chip_class >= GFX9)
3128 si_set_es_return_value_for_gs(ctx);
3129 }
3130
3131 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3132 {
3133 if (ctx->screen->b.chip_class >= GFX9)
3134 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3135 else
3136 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3137 }
3138
3139 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
3140 {
3141 struct si_shader_context *ctx = si_shader_context(bld_base);
3142
3143 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3144 si_get_gs_wave_id(ctx));
3145
3146 if (ctx->screen->b.chip_class >= GFX9)
3147 lp_build_endif(&ctx->merged_wrap_if_state);
3148 }
3149
3150 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
3151 unsigned max_outputs,
3152 LLVMValueRef *addrs)
3153 {
3154 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3155 struct gallivm_state *gallivm = &ctx->gallivm;
3156 struct tgsi_shader_info *info = &ctx->shader->selector->info;
3157 struct si_shader_output_values *outputs = NULL;
3158 int i,j;
3159
3160 assert(!ctx->shader->is_gs_copy_shader);
3161 assert(info->num_outputs <= max_outputs);
3162
3163 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3164
3165 /* Vertex color clamping.
3166 *
3167 * This uses a state constant loaded in a user data SGPR and
3168 * an IF statement is added that clamps all colors if the constant
3169 * is true.
3170 */
3171 if (ctx->type == PIPE_SHADER_VERTEX) {
3172 struct lp_build_if_state if_ctx;
3173 LLVMValueRef cond = NULL;
3174 LLVMValueRef addr, val;
3175
3176 for (i = 0; i < info->num_outputs; i++) {
3177 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3178 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3179 continue;
3180
3181 /* We've found a color. */
3182 if (!cond) {
3183 /* The state is in the first bit of the user SGPR. */
3184 cond = LLVMGetParam(ctx->main_fn,
3185 ctx->param_vs_state_bits);
3186 cond = LLVMBuildTrunc(gallivm->builder, cond,
3187 ctx->i1, "");
3188 lp_build_if(&if_ctx, gallivm, cond);
3189 }
3190
3191 for (j = 0; j < 4; j++) {
3192 addr = addrs[4 * i + j];
3193 val = LLVMBuildLoad(gallivm->builder, addr, "");
3194 val = ac_build_clamp(&ctx->ac, val);
3195 LLVMBuildStore(gallivm->builder, val, addr);
3196 }
3197 }
3198
3199 if (cond)
3200 lp_build_endif(&if_ctx);
3201 }
3202
3203 for (i = 0; i < info->num_outputs; i++) {
3204 outputs[i].semantic_name = info->output_semantic_name[i];
3205 outputs[i].semantic_index = info->output_semantic_index[i];
3206
3207 for (j = 0; j < 4; j++) {
3208 outputs[i].values[j] =
3209 LLVMBuildLoad(gallivm->builder,
3210 addrs[4 * i + j],
3211 "");
3212 outputs[i].vertex_stream[j] =
3213 (info->output_streams[i] >> (2 * j)) & 3;
3214 }
3215 }
3216
3217 if (ctx->shader->selector->so.num_outputs)
3218 si_llvm_emit_streamout(ctx, outputs, i, 0);
3219
3220 /* Export PrimitiveID. */
3221 if (ctx->shader->key.mono.u.vs_export_prim_id) {
3222 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3223 outputs[i].semantic_index = 0;
3224 outputs[i].values[0] = LLVMBuildBitCast(gallivm->builder,
3225 get_primitive_id(ctx, 0), ctx->f32, "");
3226 for (j = 1; j < 4; j++)
3227 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3228
3229 memset(outputs[i].vertex_stream, 0,
3230 sizeof(outputs[i].vertex_stream));
3231 i++;
3232 }
3233
3234 si_llvm_export_vs(&ctx->bld_base, outputs, i);
3235 FREE(outputs);
3236 }
3237
3238 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3239 {
3240 struct si_shader_context *ctx = si_shader_context(bld_base);
3241
3242 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3243 &ctx->outputs[0][0]);
3244 }
3245
3246 struct si_ps_exports {
3247 unsigned num;
3248 struct ac_export_args args[10];
3249 };
3250
3251 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3252 bool writes_samplemask)
3253 {
3254 if (writes_z) {
3255 /* Z needs 32 bits. */
3256 if (writes_samplemask)
3257 return V_028710_SPI_SHADER_32_ABGR;
3258 else if (writes_stencil)
3259 return V_028710_SPI_SHADER_32_GR;
3260 else
3261 return V_028710_SPI_SHADER_32_R;
3262 } else if (writes_stencil || writes_samplemask) {
3263 /* Both stencil and sample mask need only 16 bits. */
3264 return V_028710_SPI_SHADER_UINT16_ABGR;
3265 } else {
3266 return V_028710_SPI_SHADER_ZERO;
3267 }
3268 }
3269
3270 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3271 LLVMValueRef depth, LLVMValueRef stencil,
3272 LLVMValueRef samplemask, struct si_ps_exports *exp)
3273 {
3274 struct si_shader_context *ctx = si_shader_context(bld_base);
3275 struct lp_build_context *base = &bld_base->base;
3276 struct ac_export_args args;
3277 unsigned mask = 0;
3278 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3279 stencil != NULL,
3280 samplemask != NULL);
3281
3282 assert(depth || stencil || samplemask);
3283
3284 args.valid_mask = 1; /* whether the EXEC mask is valid */
3285 args.done = 1; /* DONE bit */
3286
3287 /* Specify the target we are exporting */
3288 args.target = V_008DFC_SQ_EXP_MRTZ;
3289
3290 args.compr = 0; /* COMP flag */
3291 args.out[0] = base->undef; /* R, depth */
3292 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3293 args.out[2] = base->undef; /* B, sample mask */
3294 args.out[3] = base->undef; /* A, alpha to mask */
3295
3296 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3297 assert(!depth);
3298 args.compr = 1; /* COMPR flag */
3299
3300 if (stencil) {
3301 /* Stencil should be in X[23:16]. */
3302 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3303 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3304 LLVMConstInt(ctx->i32, 16, 0), "");
3305 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3306 mask |= 0x3;
3307 }
3308 if (samplemask) {
3309 /* SampleMask should be in Y[15:0]. */
3310 args.out[1] = samplemask;
3311 mask |= 0xc;
3312 }
3313 } else {
3314 if (depth) {
3315 args.out[0] = depth;
3316 mask |= 0x1;
3317 }
3318 if (stencil) {
3319 args.out[1] = stencil;
3320 mask |= 0x2;
3321 }
3322 if (samplemask) {
3323 args.out[2] = samplemask;
3324 mask |= 0x4;
3325 }
3326 }
3327
3328 /* SI (except OLAND and HAINAN) has a bug that it only looks
3329 * at the X writemask component. */
3330 if (ctx->screen->b.chip_class == SI &&
3331 ctx->screen->b.family != CHIP_OLAND &&
3332 ctx->screen->b.family != CHIP_HAINAN)
3333 mask |= 0x1;
3334
3335 /* Specify which components to enable */
3336 args.enabled_channels = mask;
3337
3338 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3339 }
3340
3341 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3342 LLVMValueRef *color, unsigned index,
3343 unsigned samplemask_param,
3344 bool is_last, struct si_ps_exports *exp)
3345 {
3346 struct si_shader_context *ctx = si_shader_context(bld_base);
3347 struct lp_build_context *base = &bld_base->base;
3348 int i;
3349
3350 /* Clamp color */
3351 if (ctx->shader->key.part.ps.epilog.clamp_color)
3352 for (i = 0; i < 4; i++)
3353 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3354
3355 /* Alpha to one */
3356 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3357 color[3] = base->one;
3358
3359 /* Alpha test */
3360 if (index == 0 &&
3361 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3362 si_alpha_test(bld_base, color[3]);
3363
3364 /* Line & polygon smoothing */
3365 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3366 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3367 samplemask_param);
3368
3369 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3370 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3371 struct ac_export_args args[8];
3372 int c, last = -1;
3373
3374 /* Get the export arguments, also find out what the last one is. */
3375 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3376 si_llvm_init_export_args(bld_base, color,
3377 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3378 if (args[c].enabled_channels)
3379 last = c;
3380 }
3381
3382 /* Emit all exports. */
3383 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3384 if (is_last && last == c) {
3385 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3386 args[c].done = 1; /* DONE bit */
3387 } else if (!args[c].enabled_channels)
3388 continue; /* unnecessary NULL export */
3389
3390 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3391 }
3392 } else {
3393 struct ac_export_args args;
3394
3395 /* Export */
3396 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3397 &args);
3398 if (is_last) {
3399 args.valid_mask = 1; /* whether the EXEC mask is valid */
3400 args.done = 1; /* DONE bit */
3401 } else if (!args.enabled_channels)
3402 return; /* unnecessary NULL export */
3403
3404 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3405 }
3406 }
3407
3408 static void si_emit_ps_exports(struct si_shader_context *ctx,
3409 struct si_ps_exports *exp)
3410 {
3411 for (unsigned i = 0; i < exp->num; i++)
3412 ac_build_export(&ctx->ac, &exp->args[i]);
3413 }
3414
3415 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3416 {
3417 struct si_shader_context *ctx = si_shader_context(bld_base);
3418 struct lp_build_context *base = &bld_base->base;
3419 struct ac_export_args args;
3420
3421 args.enabled_channels = 0x0; /* enabled channels */
3422 args.valid_mask = 1; /* whether the EXEC mask is valid */
3423 args.done = 1; /* DONE bit */
3424 args.target = V_008DFC_SQ_EXP_NULL;
3425 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3426 args.out[0] = base->undef; /* R */
3427 args.out[1] = base->undef; /* G */
3428 args.out[2] = base->undef; /* B */
3429 args.out[3] = base->undef; /* A */
3430
3431 ac_build_export(&ctx->ac, &args);
3432 }
3433
3434 /**
3435 * Return PS outputs in this order:
3436 *
3437 * v[0:3] = color0.xyzw
3438 * v[4:7] = color1.xyzw
3439 * ...
3440 * vN+0 = Depth
3441 * vN+1 = Stencil
3442 * vN+2 = SampleMask
3443 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3444 *
3445 * The alpha-ref SGPR is returned via its original location.
3446 */
3447 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3448 unsigned max_outputs,
3449 LLVMValueRef *addrs)
3450 {
3451 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3452 struct si_shader *shader = ctx->shader;
3453 struct tgsi_shader_info *info = &shader->selector->info;
3454 LLVMBuilderRef builder = ctx->gallivm.builder;
3455 unsigned i, j, first_vgpr, vgpr;
3456
3457 LLVMValueRef color[8][4] = {};
3458 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3459 LLVMValueRef ret;
3460
3461 if (ctx->postponed_kill)
3462 ac_build_kill(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3463
3464 /* Read the output values. */
3465 for (i = 0; i < info->num_outputs; i++) {
3466 unsigned semantic_name = info->output_semantic_name[i];
3467 unsigned semantic_index = info->output_semantic_index[i];
3468
3469 switch (semantic_name) {
3470 case TGSI_SEMANTIC_COLOR:
3471 assert(semantic_index < 8);
3472 for (j = 0; j < 4; j++) {
3473 LLVMValueRef ptr = addrs[4 * i + j];
3474 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3475 color[semantic_index][j] = result;
3476 }
3477 break;
3478 case TGSI_SEMANTIC_POSITION:
3479 depth = LLVMBuildLoad(builder,
3480 addrs[4 * i + 2], "");
3481 break;
3482 case TGSI_SEMANTIC_STENCIL:
3483 stencil = LLVMBuildLoad(builder,
3484 addrs[4 * i + 1], "");
3485 break;
3486 case TGSI_SEMANTIC_SAMPLEMASK:
3487 samplemask = LLVMBuildLoad(builder,
3488 addrs[4 * i + 0], "");
3489 break;
3490 default:
3491 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3492 semantic_name);
3493 }
3494 }
3495
3496 /* Fill the return structure. */
3497 ret = ctx->return_value;
3498
3499 /* Set SGPRs. */
3500 ret = LLVMBuildInsertValue(builder, ret,
3501 LLVMBuildBitCast(ctx->ac.builder,
3502 LLVMGetParam(ctx->main_fn,
3503 SI_PARAM_ALPHA_REF),
3504 ctx->i32, ""),
3505 SI_SGPR_ALPHA_REF, "");
3506
3507 /* Set VGPRs */
3508 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3509 for (i = 0; i < ARRAY_SIZE(color); i++) {
3510 if (!color[i][0])
3511 continue;
3512
3513 for (j = 0; j < 4; j++)
3514 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3515 }
3516 if (depth)
3517 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3518 if (stencil)
3519 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3520 if (samplemask)
3521 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3522
3523 /* Add the input sample mask for smoothing at the end. */
3524 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3525 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3526 ret = LLVMBuildInsertValue(builder, ret,
3527 LLVMGetParam(ctx->main_fn,
3528 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3529
3530 ctx->return_value = ret;
3531 }
3532
3533 /* Prevent optimizations (at least of memory accesses) across the current
3534 * point in the program by emitting empty inline assembly that is marked as
3535 * having side effects.
3536 *
3537 * Optionally, a value can be passed through the inline assembly to prevent
3538 * LLVM from hoisting calls to ReadNone functions.
3539 */
3540 static void emit_optimization_barrier(struct si_shader_context *ctx,
3541 LLVMValueRef *pvgpr)
3542 {
3543 static int counter = 0;
3544
3545 LLVMBuilderRef builder = ctx->gallivm.builder;
3546 char code[16];
3547
3548 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3549
3550 if (!pvgpr) {
3551 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3552 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3553 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3554 } else {
3555 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3556 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3557 LLVMValueRef vgpr = *pvgpr;
3558 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3559 unsigned vgpr_size = ac_get_type_size(vgpr_type);
3560 LLVMValueRef vgpr0;
3561
3562 assert(vgpr_size % 4 == 0);
3563
3564 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3565 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3566 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3567 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3568 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3569
3570 *pvgpr = vgpr;
3571 }
3572 }
3573
3574 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3575 {
3576 struct gallivm_state *gallivm = &ctx->gallivm;
3577 LLVMBuilderRef builder = gallivm->builder;
3578 LLVMValueRef args[1] = {
3579 LLVMConstInt(ctx->i32, simm16, 0)
3580 };
3581 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3582 ctx->voidt, args, 1, 0);
3583 }
3584
3585 static void membar_emit(
3586 const struct lp_build_tgsi_action *action,
3587 struct lp_build_tgsi_context *bld_base,
3588 struct lp_build_emit_data *emit_data)
3589 {
3590 struct si_shader_context *ctx = si_shader_context(bld_base);
3591 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3592 unsigned flags = LLVMConstIntGetZExtValue(src0);
3593 unsigned waitcnt = NOOP_WAITCNT;
3594
3595 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3596 waitcnt &= VM_CNT & LGKM_CNT;
3597
3598 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3599 TGSI_MEMBAR_SHADER_BUFFER |
3600 TGSI_MEMBAR_SHADER_IMAGE))
3601 waitcnt &= VM_CNT;
3602
3603 if (flags & TGSI_MEMBAR_SHARED)
3604 waitcnt &= LGKM_CNT;
3605
3606 if (waitcnt != NOOP_WAITCNT)
3607 si_emit_waitcnt(ctx, waitcnt);
3608 }
3609
3610 static void clock_emit(
3611 const struct lp_build_tgsi_action *action,
3612 struct lp_build_tgsi_context *bld_base,
3613 struct lp_build_emit_data *emit_data)
3614 {
3615 struct si_shader_context *ctx = si_shader_context(bld_base);
3616 struct gallivm_state *gallivm = &ctx->gallivm;
3617 LLVMValueRef tmp;
3618
3619 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3620 ctx->i64, NULL, 0, 0);
3621 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3622
3623 emit_data->output[0] =
3624 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3625 emit_data->output[1] =
3626 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3627 }
3628
3629 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3630 {
3631 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3632 CONST_ADDR_SPACE);
3633 }
3634
3635 static void si_llvm_emit_ddxy(
3636 const struct lp_build_tgsi_action *action,
3637 struct lp_build_tgsi_context *bld_base,
3638 struct lp_build_emit_data *emit_data)
3639 {
3640 struct si_shader_context *ctx = si_shader_context(bld_base);
3641 struct gallivm_state *gallivm = &ctx->gallivm;
3642 unsigned opcode = emit_data->info->opcode;
3643 LLVMValueRef val;
3644 int idx;
3645 unsigned mask;
3646
3647 if (opcode == TGSI_OPCODE_DDX_FINE)
3648 mask = AC_TID_MASK_LEFT;
3649 else if (opcode == TGSI_OPCODE_DDY_FINE)
3650 mask = AC_TID_MASK_TOP;
3651 else
3652 mask = AC_TID_MASK_TOP_LEFT;
3653
3654 /* for DDX we want to next X pixel, DDY next Y pixel. */
3655 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3656
3657 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
3658 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
3659 mask, idx, val);
3660 emit_data->output[emit_data->chan] = val;
3661 }
3662
3663 /*
3664 * this takes an I,J coordinate pair,
3665 * and works out the X and Y derivatives.
3666 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3667 */
3668 static LLVMValueRef si_llvm_emit_ddxy_interp(
3669 struct lp_build_tgsi_context *bld_base,
3670 LLVMValueRef interp_ij)
3671 {
3672 struct si_shader_context *ctx = si_shader_context(bld_base);
3673 struct gallivm_state *gallivm = &ctx->gallivm;
3674 LLVMValueRef result[4], a;
3675 unsigned i;
3676
3677 for (i = 0; i < 2; i++) {
3678 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
3679 LLVMConstInt(ctx->i32, i, 0), "");
3680 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3681 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3682 }
3683
3684 return lp_build_gather_values(gallivm, result, 4);
3685 }
3686
3687 static void interp_fetch_args(
3688 struct lp_build_tgsi_context *bld_base,
3689 struct lp_build_emit_data *emit_data)
3690 {
3691 struct si_shader_context *ctx = si_shader_context(bld_base);
3692 struct gallivm_state *gallivm = &ctx->gallivm;
3693 const struct tgsi_full_instruction *inst = emit_data->inst;
3694
3695 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3696 /* offset is in second src, first two channels */
3697 emit_data->args[0] = lp_build_emit_fetch(bld_base,
3698 emit_data->inst, 1,
3699 TGSI_CHAN_X);
3700 emit_data->args[1] = lp_build_emit_fetch(bld_base,
3701 emit_data->inst, 1,
3702 TGSI_CHAN_Y);
3703 emit_data->arg_count = 2;
3704 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3705 LLVMValueRef sample_position;
3706 LLVMValueRef sample_id;
3707 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3708
3709 /* fetch sample ID, then fetch its sample position,
3710 * and place into first two channels.
3711 */
3712 sample_id = lp_build_emit_fetch(bld_base,
3713 emit_data->inst, 1, TGSI_CHAN_X);
3714 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
3715 ctx->i32, "");
3716 sample_position = load_sample_position(ctx, sample_id);
3717
3718 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
3719 sample_position,
3720 ctx->i32_0, "");
3721
3722 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
3723 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
3724 sample_position,
3725 ctx->i32_1, "");
3726 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
3727 emit_data->arg_count = 2;
3728 }
3729 }
3730
3731 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3732 struct lp_build_tgsi_context *bld_base,
3733 struct lp_build_emit_data *emit_data)
3734 {
3735 struct si_shader_context *ctx = si_shader_context(bld_base);
3736 struct si_shader *shader = ctx->shader;
3737 struct gallivm_state *gallivm = &ctx->gallivm;
3738 const struct tgsi_shader_info *info = &shader->selector->info;
3739 LLVMValueRef interp_param;
3740 const struct tgsi_full_instruction *inst = emit_data->inst;
3741 const struct tgsi_full_src_register *input = &inst->Src[0];
3742 int input_base, input_array_size;
3743 int chan;
3744 int i;
3745 LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
3746 LLVMValueRef array_idx;
3747 int interp_param_idx;
3748 unsigned interp;
3749 unsigned location;
3750
3751 assert(input->Register.File == TGSI_FILE_INPUT);
3752
3753 if (input->Register.Indirect) {
3754 unsigned array_id = input->Indirect.ArrayID;
3755
3756 if (array_id) {
3757 input_base = info->input_array_first[array_id];
3758 input_array_size = info->input_array_last[array_id] - input_base + 1;
3759 } else {
3760 input_base = inst->Src[0].Register.Index;
3761 input_array_size = info->num_inputs - input_base;
3762 }
3763
3764 array_idx = si_get_indirect_index(ctx, &input->Indirect,
3765 input->Register.Index - input_base);
3766 } else {
3767 input_base = inst->Src[0].Register.Index;
3768 input_array_size = 1;
3769 array_idx = ctx->i32_0;
3770 }
3771
3772 interp = shader->selector->info.input_interpolate[input_base];
3773
3774 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3775 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
3776 location = TGSI_INTERPOLATE_LOC_CENTER;
3777 else
3778 location = TGSI_INTERPOLATE_LOC_CENTROID;
3779
3780 interp_param_idx = lookup_interp_param_index(interp, location);
3781 if (interp_param_idx == -1)
3782 return;
3783 else if (interp_param_idx)
3784 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
3785 else
3786 interp_param = NULL;
3787
3788 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
3789 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3790 LLVMValueRef ij_out[2];
3791 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
3792
3793 /*
3794 * take the I then J parameters, and the DDX/Y for it, and
3795 * calculate the IJ inputs for the interpolator.
3796 * temp1 = ddx * offset/sample.x + I;
3797 * interp_param.I = ddy * offset/sample.y + temp1;
3798 * temp1 = ddx * offset/sample.x + J;
3799 * interp_param.J = ddy * offset/sample.y + temp1;
3800 */
3801 for (i = 0; i < 2; i++) {
3802 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
3803 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
3804 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
3805 ddxy_out, ix_ll, "");
3806 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
3807 ddxy_out, iy_ll, "");
3808 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
3809 interp_param, ix_ll, "");
3810 LLVMValueRef temp1, temp2;
3811
3812 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
3813 ctx->f32, "");
3814
3815 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
3816
3817 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
3818
3819 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
3820
3821 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
3822 }
3823 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
3824 }
3825
3826 if (interp_param) {
3827 interp_param = LLVMBuildBitCast(gallivm->builder,
3828 interp_param, LLVMVectorType(ctx->f32, 2), "");
3829 }
3830
3831 for (chan = 0; chan < 4; chan++) {
3832 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
3833 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
3834
3835 for (unsigned idx = 0; idx < input_array_size; ++idx) {
3836 LLVMValueRef v, i = NULL, j = NULL;
3837
3838 if (interp_param) {
3839 interp_param = LLVMBuildBitCast(gallivm->builder,
3840 interp_param, LLVMVectorType(ctx->f32, 2), "");
3841 i = LLVMBuildExtractElement(
3842 gallivm->builder, interp_param, ctx->i32_0, "");
3843 j = LLVMBuildExtractElement(
3844 gallivm->builder, interp_param, ctx->i32_1, "");
3845 }
3846 v = si_build_fs_interp(ctx, input_base + idx, schan,
3847 prim_mask, i, j);
3848
3849 gather = LLVMBuildInsertElement(gallivm->builder,
3850 gather, v, LLVMConstInt(ctx->i32, idx, false), "");
3851 }
3852
3853 emit_data->output[chan] = LLVMBuildExtractElement(
3854 gallivm->builder, gather, array_idx, "");
3855 }
3856 }
3857
3858 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
3859 LLVMValueRef value)
3860 {
3861 struct gallivm_state *gallivm = &ctx->gallivm;
3862 LLVMValueRef args[3] = {
3863 value,
3864 ctx->i32_0,
3865 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
3866 };
3867
3868 /* We currently have no other way to prevent LLVM from lifting the icmp
3869 * calls to a dominating basic block.
3870 */
3871 emit_optimization_barrier(ctx, &args[0]);
3872
3873 if (LLVMTypeOf(args[0]) != ctx->i32)
3874 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
3875
3876 return lp_build_intrinsic(gallivm->builder,
3877 "llvm.amdgcn.icmp.i32",
3878 ctx->i64, args, 3,
3879 LP_FUNC_ATTR_NOUNWIND |
3880 LP_FUNC_ATTR_READNONE |
3881 LP_FUNC_ATTR_CONVERGENT);
3882 }
3883
3884 static void vote_all_emit(
3885 const struct lp_build_tgsi_action *action,
3886 struct lp_build_tgsi_context *bld_base,
3887 struct lp_build_emit_data *emit_data)
3888 {
3889 struct si_shader_context *ctx = si_shader_context(bld_base);
3890 struct gallivm_state *gallivm = &ctx->gallivm;
3891 LLVMValueRef active_set, vote_set;
3892 LLVMValueRef tmp;
3893
3894 active_set = si_emit_ballot(ctx, ctx->i32_1);
3895 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3896
3897 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3898 emit_data->output[emit_data->chan] =
3899 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3900 }
3901
3902 static void vote_any_emit(
3903 const struct lp_build_tgsi_action *action,
3904 struct lp_build_tgsi_context *bld_base,
3905 struct lp_build_emit_data *emit_data)
3906 {
3907 struct si_shader_context *ctx = si_shader_context(bld_base);
3908 struct gallivm_state *gallivm = &ctx->gallivm;
3909 LLVMValueRef vote_set;
3910 LLVMValueRef tmp;
3911
3912 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3913
3914 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
3915 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3916 emit_data->output[emit_data->chan] =
3917 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3918 }
3919
3920 static void vote_eq_emit(
3921 const struct lp_build_tgsi_action *action,
3922 struct lp_build_tgsi_context *bld_base,
3923 struct lp_build_emit_data *emit_data)
3924 {
3925 struct si_shader_context *ctx = si_shader_context(bld_base);
3926 struct gallivm_state *gallivm = &ctx->gallivm;
3927 LLVMValueRef active_set, vote_set;
3928 LLVMValueRef all, none, tmp;
3929
3930 active_set = si_emit_ballot(ctx, ctx->i32_1);
3931 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
3932
3933 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
3934 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
3935 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
3936 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
3937 emit_data->output[emit_data->chan] =
3938 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
3939 }
3940
3941 static void ballot_emit(
3942 const struct lp_build_tgsi_action *action,
3943 struct lp_build_tgsi_context *bld_base,
3944 struct lp_build_emit_data *emit_data)
3945 {
3946 struct si_shader_context *ctx = si_shader_context(bld_base);
3947 LLVMBuilderRef builder = ctx->gallivm.builder;
3948 LLVMValueRef tmp;
3949
3950 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3951 tmp = si_emit_ballot(ctx, tmp);
3952 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
3953
3954 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
3955 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
3956 }
3957
3958 static void read_invoc_fetch_args(
3959 struct lp_build_tgsi_context *bld_base,
3960 struct lp_build_emit_data *emit_data)
3961 {
3962 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
3963 0, emit_data->src_chan);
3964
3965 /* Always read the source invocation (= lane) from the X channel. */
3966 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
3967 1, TGSI_CHAN_X);
3968 emit_data->arg_count = 2;
3969 }
3970
3971 static void read_lane_emit(
3972 const struct lp_build_tgsi_action *action,
3973 struct lp_build_tgsi_context *bld_base,
3974 struct lp_build_emit_data *emit_data)
3975 {
3976 struct si_shader_context *ctx = si_shader_context(bld_base);
3977 LLVMBuilderRef builder = ctx->gallivm.builder;
3978
3979 /* We currently have no other way to prevent LLVM from lifting the icmp
3980 * calls to a dominating basic block.
3981 */
3982 emit_optimization_barrier(ctx, &emit_data->args[0]);
3983
3984 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
3985 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
3986 ctx->i32, "");
3987 }
3988
3989 emit_data->output[emit_data->chan] =
3990 ac_build_intrinsic(&ctx->ac, action->intr_name,
3991 ctx->i32, emit_data->args, emit_data->arg_count,
3992 AC_FUNC_ATTR_READNONE |
3993 AC_FUNC_ATTR_CONVERGENT);
3994 }
3995
3996 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
3997 struct lp_build_emit_data *emit_data)
3998 {
3999 struct si_shader_context *ctx = si_shader_context(bld_base);
4000 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4001 LLVMValueRef imm;
4002 unsigned stream;
4003
4004 assert(src0.File == TGSI_FILE_IMMEDIATE);
4005
4006 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4007 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4008 return stream;
4009 }
4010
4011 /* Emit one vertex from the geometry shader */
4012 static void si_llvm_emit_vertex(
4013 const struct lp_build_tgsi_action *action,
4014 struct lp_build_tgsi_context *bld_base,
4015 struct lp_build_emit_data *emit_data)
4016 {
4017 struct si_shader_context *ctx = si_shader_context(bld_base);
4018 struct lp_build_context *uint = &bld_base->uint_bld;
4019 struct si_shader *shader = ctx->shader;
4020 struct tgsi_shader_info *info = &shader->selector->info;
4021 struct gallivm_state *gallivm = &ctx->gallivm;
4022 struct lp_build_if_state if_state;
4023 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
4024 ctx->param_gs2vs_offset);
4025 LLVMValueRef gs_next_vertex;
4026 LLVMValueRef can_emit, kill;
4027 unsigned chan, offset;
4028 int i;
4029 unsigned stream;
4030
4031 stream = si_llvm_get_stream(bld_base, emit_data);
4032
4033 /* Write vertex attribute values to GSVS ring */
4034 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
4035 ctx->gs_next_vertex[stream],
4036 "");
4037
4038 /* If this thread has already emitted the declared maximum number of
4039 * vertices, skip the write: excessive vertex emissions are not
4040 * supposed to have any effect.
4041 *
4042 * If the shader has no writes to memory, kill it instead. This skips
4043 * further memory loads and may allow LLVM to skip to the end
4044 * altogether.
4045 */
4046 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
4047 LLVMConstInt(ctx->i32,
4048 shader->selector->gs_max_out_vertices, 0), "");
4049
4050 bool use_kill = !info->writes_memory;
4051 if (use_kill) {
4052 kill = lp_build_select(&bld_base->base, can_emit,
4053 LLVMConstReal(ctx->f32, 1.0f),
4054 LLVMConstReal(ctx->f32, -1.0f));
4055
4056 ac_build_kill(&ctx->ac, kill);
4057 } else {
4058 lp_build_if(&if_state, gallivm, can_emit);
4059 }
4060
4061 offset = 0;
4062 for (i = 0; i < info->num_outputs; i++) {
4063 LLVMValueRef *out_ptr = ctx->outputs[i];
4064
4065 for (chan = 0; chan < 4; chan++) {
4066 if (!(info->output_usagemask[i] & (1 << chan)) ||
4067 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
4068 continue;
4069
4070 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
4071 LLVMValueRef voffset =
4072 LLVMConstInt(ctx->i32, offset *
4073 shader->selector->gs_max_out_vertices, 0);
4074 offset++;
4075
4076 voffset = lp_build_add(uint, voffset, gs_next_vertex);
4077 voffset = lp_build_mul_imm(uint, voffset, 4);
4078
4079 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
4080
4081 ac_build_buffer_store_dword(&ctx->ac,
4082 ctx->gsvs_ring[stream],
4083 out_val, 1,
4084 voffset, soffset, 0,
4085 1, 1, true, true);
4086 }
4087 }
4088
4089 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
4090 ctx->i32_1);
4091
4092 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4093
4094 /* Signal vertex emission */
4095 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
4096 si_get_gs_wave_id(ctx));
4097 if (!use_kill)
4098 lp_build_endif(&if_state);
4099 }
4100
4101 /* Cut one primitive from the geometry shader */
4102 static void si_llvm_emit_primitive(
4103 const struct lp_build_tgsi_action *action,
4104 struct lp_build_tgsi_context *bld_base,
4105 struct lp_build_emit_data *emit_data)
4106 {
4107 struct si_shader_context *ctx = si_shader_context(bld_base);
4108 unsigned stream;
4109
4110 /* Signal primitive cut */
4111 stream = si_llvm_get_stream(bld_base, emit_data);
4112 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4113 si_get_gs_wave_id(ctx));
4114 }
4115
4116 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4117 struct lp_build_tgsi_context *bld_base,
4118 struct lp_build_emit_data *emit_data)
4119 {
4120 struct si_shader_context *ctx = si_shader_context(bld_base);
4121 struct gallivm_state *gallivm = &ctx->gallivm;
4122
4123 /* SI only (thanks to a hw bug workaround):
4124 * The real barrier instruction isn’t needed, because an entire patch
4125 * always fits into a single wave.
4126 */
4127 if (ctx->screen->b.chip_class == SI &&
4128 ctx->type == PIPE_SHADER_TESS_CTRL) {
4129 si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
4130 return;
4131 }
4132
4133 lp_build_intrinsic(gallivm->builder,
4134 "llvm.amdgcn.s.barrier",
4135 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
4136 }
4137
4138 static const struct lp_build_tgsi_action interp_action = {
4139 .fetch_args = interp_fetch_args,
4140 .emit = build_interp_intrinsic,
4141 };
4142
4143 static void si_create_function(struct si_shader_context *ctx,
4144 const char *name,
4145 LLVMTypeRef *returns, unsigned num_returns,
4146 struct si_function_info *fninfo,
4147 unsigned max_workgroup_size)
4148 {
4149 int i;
4150
4151 si_llvm_create_func(ctx, name, returns, num_returns,
4152 fninfo->types, fninfo->num_params);
4153 ctx->return_value = LLVMGetUndef(ctx->return_type);
4154
4155 for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4156 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4157
4158 /* The combination of:
4159 * - ByVal
4160 * - dereferenceable
4161 * - invariant.load
4162 * allows the optimization passes to move loads and reduces
4163 * SGPR spilling significantly.
4164 */
4165 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4166 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
4167 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
4168 ac_add_attr_dereferenceable(P, UINT64_MAX);
4169 } else
4170 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
4171 }
4172
4173 for (i = 0; i < fninfo->num_params; ++i) {
4174 if (fninfo->assign[i])
4175 *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
4176 }
4177
4178 if (max_workgroup_size) {
4179 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
4180 max_workgroup_size);
4181 }
4182 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4183 "no-signed-zeros-fp-math",
4184 "true");
4185
4186 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
4187 /* These were copied from some LLVM test. */
4188 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4189 "less-precise-fpmad",
4190 "true");
4191 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4192 "no-infs-fp-math",
4193 "true");
4194 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4195 "no-nans-fp-math",
4196 "true");
4197 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4198 "unsafe-fp-math",
4199 "true");
4200 }
4201 }
4202
4203 static void declare_streamout_params(struct si_shader_context *ctx,
4204 struct pipe_stream_output_info *so,
4205 struct si_function_info *fninfo)
4206 {
4207 int i;
4208
4209 /* Streamout SGPRs. */
4210 if (so->num_outputs) {
4211 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4212 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4213 else
4214 ctx->param_streamout_config = fninfo->num_params - 1;
4215
4216 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4217 }
4218 /* A streamout buffer offset is loaded if the stride is non-zero. */
4219 for (i = 0; i < 4; i++) {
4220 if (!so->stride[i])
4221 continue;
4222
4223 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4224 }
4225 }
4226
4227 static void declare_lds_as_pointer(struct si_shader_context *ctx)
4228 {
4229 struct gallivm_state *gallivm = &ctx->gallivm;
4230
4231 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
4232 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
4233 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
4234 "lds");
4235 }
4236
4237 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4238 {
4239 switch (shader->selector->type) {
4240 case PIPE_SHADER_TESS_CTRL:
4241 /* Return this so that LLVM doesn't remove s_barrier
4242 * instructions on chips where we use s_barrier. */
4243 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
4244
4245 case PIPE_SHADER_GEOMETRY:
4246 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
4247
4248 case PIPE_SHADER_COMPUTE:
4249 break; /* see below */
4250
4251 default:
4252 return 0;
4253 }
4254
4255 const unsigned *properties = shader->selector->info.properties;
4256 unsigned max_work_group_size =
4257 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4258 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4259 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4260
4261 if (!max_work_group_size) {
4262 /* This is a variable group size compute shader,
4263 * compile it for the maximum possible group size.
4264 */
4265 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4266 }
4267 return max_work_group_size;
4268 }
4269
4270 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4271 struct si_function_info *fninfo,
4272 bool assign_params)
4273 {
4274 unsigned const_and_shader_buffers =
4275 add_arg(fninfo, ARG_SGPR,
4276 si_const_array(ctx->v4i32,
4277 SI_NUM_SHADER_BUFFERS + SI_NUM_CONST_BUFFERS));
4278 unsigned samplers_and_images =
4279 add_arg(fninfo, ARG_SGPR,
4280 si_const_array(ctx->v8i32,
4281 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4282
4283 if (assign_params) {
4284 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4285 ctx->param_samplers_and_images = samplers_and_images;
4286 }
4287 }
4288
4289 static void declare_default_desc_pointers(struct si_shader_context *ctx,
4290 struct si_function_info *fninfo)
4291 {
4292 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4293 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4294 ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4295 si_const_array(ctx->v8i32, 0));
4296 declare_per_stage_desc_pointers(ctx, fninfo, true);
4297 }
4298
4299 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4300 struct si_function_info *fninfo)
4301 {
4302 ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
4303 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
4304 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4305 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4306 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4307 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4308 }
4309
4310 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4311 struct si_function_info *fninfo,
4312 unsigned *num_prolog_vgprs)
4313 {
4314 struct si_shader *shader = ctx->shader;
4315
4316 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4317 if (shader->key.as_ls) {
4318 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4319 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4320 } else {
4321 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4322 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4323 }
4324 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4325
4326 if (!shader->is_gs_copy_shader) {
4327 /* Vertex load indices. */
4328 ctx->param_vertex_index0 = fninfo->num_params;
4329 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4330 add_arg(fninfo, ARG_VGPR, ctx->i32);
4331 *num_prolog_vgprs += shader->selector->info.num_inputs;
4332 }
4333 }
4334
4335 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4336 struct si_function_info *fninfo)
4337 {
4338 ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4339 ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4340 ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4341 ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4342 }
4343
4344 enum {
4345 /* Convenient merged shader definitions. */
4346 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4347 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4348 };
4349
4350 static void create_function(struct si_shader_context *ctx)
4351 {
4352 struct si_shader *shader = ctx->shader;
4353 struct si_function_info fninfo;
4354 LLVMTypeRef returns[16+32*4];
4355 unsigned i, num_return_sgprs;
4356 unsigned num_returns = 0;
4357 unsigned num_prolog_vgprs = 0;
4358 unsigned type = ctx->type;
4359
4360 si_init_function_info(&fninfo);
4361
4362 /* Set MERGED shaders. */
4363 if (ctx->screen->b.chip_class >= GFX9) {
4364 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4365 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4366 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4367 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4368 }
4369
4370 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4371
4372 switch (type) {
4373 case PIPE_SHADER_VERTEX:
4374 declare_default_desc_pointers(ctx, &fninfo);
4375 declare_vs_specific_input_sgprs(ctx, &fninfo);
4376
4377 if (shader->key.as_es) {
4378 assert(!shader->selector->nir);
4379 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4380 } else if (shader->key.as_ls) {
4381 assert(!shader->selector->nir);
4382 /* no extra parameters */
4383 } else {
4384 if (shader->is_gs_copy_shader) {
4385 fninfo.num_params = ctx->param_rw_buffers + 1;
4386 fninfo.num_sgpr_params = fninfo.num_params;
4387 }
4388
4389 /* The locations of the other parameters are assigned dynamically. */
4390 declare_streamout_params(ctx, &shader->selector->so,
4391 &fninfo);
4392 }
4393
4394 /* VGPRs */
4395 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4396 break;
4397
4398 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4399 declare_default_desc_pointers(ctx, &fninfo);
4400 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4401 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4402 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4403 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4404 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4405 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4406 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4407 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4408
4409 /* VGPRs */
4410 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4411 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4412
4413 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4414 * placed after the user SGPRs.
4415 */
4416 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4417 returns[num_returns++] = ctx->i32; /* SGPRs */
4418 for (i = 0; i < 5; i++)
4419 returns[num_returns++] = ctx->f32; /* VGPRs */
4420 break;
4421
4422 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4423 /* Merged stages have 8 system SGPRs at the beginning. */
4424 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4425 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4426 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4427 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4428 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4429 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4430 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4431 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4432
4433 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4434 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4435
4436 ctx->param_bindless_samplers_and_images =
4437 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
4438
4439 declare_per_stage_desc_pointers(ctx, &fninfo,
4440 ctx->type == PIPE_SHADER_VERTEX);
4441 declare_vs_specific_input_sgprs(ctx, &fninfo);
4442
4443 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4444 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4445 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4446 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4447 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4448 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4449
4450 declare_per_stage_desc_pointers(ctx, &fninfo,
4451 ctx->type == PIPE_SHADER_TESS_CTRL);
4452
4453 /* VGPRs (first TCS, then VS) */
4454 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4455 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4456
4457 if (ctx->type == PIPE_SHADER_VERTEX) {
4458 declare_vs_input_vgprs(ctx, &fninfo,
4459 &num_prolog_vgprs);
4460
4461 /* LS return values are inputs to the TCS main shader part. */
4462 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4463 returns[num_returns++] = ctx->i32; /* SGPRs */
4464 for (i = 0; i < 2; i++)
4465 returns[num_returns++] = ctx->f32; /* VGPRs */
4466 } else {
4467 /* TCS return values are inputs to the TCS epilog.
4468 *
4469 * param_tcs_offchip_offset, param_tcs_factor_offset,
4470 * param_tcs_offchip_layout, and param_rw_buffers
4471 * should be passed to the epilog.
4472 */
4473 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4474 returns[num_returns++] = ctx->i32; /* SGPRs */
4475 for (i = 0; i < 5; i++)
4476 returns[num_returns++] = ctx->f32; /* VGPRs */
4477 }
4478 break;
4479
4480 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4481 /* Merged stages have 8 system SGPRs at the beginning. */
4482 ctx->param_rw_buffers = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
4483 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4484 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4485 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4486 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4487 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4488 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4489 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4490
4491 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4492 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4493
4494 ctx->param_bindless_samplers_and_images =
4495 add_arg(&fninfo, ARG_SGPR, si_const_array(ctx->v8i32, 0));
4496
4497 declare_per_stage_desc_pointers(ctx, &fninfo,
4498 (ctx->type == PIPE_SHADER_VERTEX ||
4499 ctx->type == PIPE_SHADER_TESS_EVAL));
4500 if (ctx->type == PIPE_SHADER_VERTEX) {
4501 declare_vs_specific_input_sgprs(ctx, &fninfo);
4502 } else {
4503 /* TESS_EVAL (and also GEOMETRY):
4504 * Declare as many input SGPRs as the VS has. */
4505 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4506 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4507 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4508 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4509 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4510 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4511 }
4512
4513 declare_per_stage_desc_pointers(ctx, &fninfo,
4514 ctx->type == PIPE_SHADER_GEOMETRY);
4515
4516 /* VGPRs (first GS, then VS/TES) */
4517 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4518 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4519 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4520 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4521 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4522
4523 if (ctx->type == PIPE_SHADER_VERTEX) {
4524 declare_vs_input_vgprs(ctx, &fninfo,
4525 &num_prolog_vgprs);
4526 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4527 declare_tes_input_vgprs(ctx, &fninfo);
4528 }
4529
4530 if (ctx->type == PIPE_SHADER_VERTEX ||
4531 ctx->type == PIPE_SHADER_TESS_EVAL) {
4532 /* ES return values are inputs to GS. */
4533 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4534 returns[num_returns++] = ctx->i32; /* SGPRs */
4535 for (i = 0; i < 5; i++)
4536 returns[num_returns++] = ctx->f32; /* VGPRs */
4537 }
4538 break;
4539
4540 case PIPE_SHADER_TESS_EVAL:
4541 declare_default_desc_pointers(ctx, &fninfo);
4542 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4543 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4544
4545 if (shader->key.as_es) {
4546 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4547 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4548 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4549 } else {
4550 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4551 declare_streamout_params(ctx, &shader->selector->so,
4552 &fninfo);
4553 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4554 }
4555
4556 /* VGPRs */
4557 declare_tes_input_vgprs(ctx, &fninfo);
4558 break;
4559
4560 case PIPE_SHADER_GEOMETRY:
4561 declare_default_desc_pointers(ctx, &fninfo);
4562 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4563 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4564
4565 /* VGPRs */
4566 ctx->param_gs_vtx0_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4567 ctx->param_gs_vtx1_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4568 ctx->param_gs_prim_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4569 ctx->param_gs_vtx2_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4570 ctx->param_gs_vtx3_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4571 ctx->param_gs_vtx4_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4572 ctx->param_gs_vtx5_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4573 ctx->param_gs_instance_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4574 break;
4575
4576 case PIPE_SHADER_FRAGMENT:
4577 declare_default_desc_pointers(ctx, &fninfo);
4578 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4579 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4580
4581 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4582 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4583 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4584 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4585 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4586 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4587 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4588 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4589 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4590 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4591 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4592 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4593 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4594 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4595 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4596 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4597 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4598 &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4599 shader->info.face_vgpr_index = 20;
4600 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4601 &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4602 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4603 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4604 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4605
4606 /* Color inputs from the prolog. */
4607 if (shader->selector->info.colors_read) {
4608 unsigned num_color_elements =
4609 util_bitcount(shader->selector->info.colors_read);
4610
4611 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4612 for (i = 0; i < num_color_elements; i++)
4613 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4614
4615 num_prolog_vgprs += num_color_elements;
4616 }
4617
4618 /* Outputs for the epilog. */
4619 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4620 num_returns =
4621 num_return_sgprs +
4622 util_bitcount(shader->selector->info.colors_written) * 4 +
4623 shader->selector->info.writes_z +
4624 shader->selector->info.writes_stencil +
4625 shader->selector->info.writes_samplemask +
4626 1 /* SampleMaskIn */;
4627
4628 num_returns = MAX2(num_returns,
4629 num_return_sgprs +
4630 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4631
4632 for (i = 0; i < num_return_sgprs; i++)
4633 returns[i] = ctx->i32;
4634 for (; i < num_returns; i++)
4635 returns[i] = ctx->f32;
4636 break;
4637
4638 case PIPE_SHADER_COMPUTE:
4639 declare_default_desc_pointers(ctx, &fninfo);
4640 if (shader->selector->info.uses_grid_size)
4641 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4642 if (shader->selector->info.uses_block_size)
4643 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4644
4645 for (i = 0; i < 3; i++) {
4646 ctx->param_block_id[i] = -1;
4647 if (shader->selector->info.uses_block_id[i])
4648 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4649 }
4650
4651 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4652 break;
4653 default:
4654 assert(0 && "unimplemented shader");
4655 return;
4656 }
4657
4658 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4659 si_get_max_workgroup_size(shader));
4660
4661 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4662 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4663 ctx->separate_prolog) {
4664 si_llvm_add_attribute(ctx->main_fn,
4665 "InitialPSInputAddr",
4666 S_0286D0_PERSP_SAMPLE_ENA(1) |
4667 S_0286D0_PERSP_CENTER_ENA(1) |
4668 S_0286D0_PERSP_CENTROID_ENA(1) |
4669 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4670 S_0286D0_LINEAR_CENTER_ENA(1) |
4671 S_0286D0_LINEAR_CENTROID_ENA(1) |
4672 S_0286D0_FRONT_FACE_ENA(1) |
4673 S_0286D0_POS_FIXED_PT_ENA(1));
4674 }
4675
4676 shader->info.num_input_sgprs = 0;
4677 shader->info.num_input_vgprs = 0;
4678
4679 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4680 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4;
4681
4682 for (; i < fninfo.num_params; ++i)
4683 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4;
4684
4685 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4686 shader->info.num_input_vgprs -= num_prolog_vgprs;
4687
4688 if (shader->key.as_ls ||
4689 ctx->type == PIPE_SHADER_TESS_CTRL ||
4690 /* GFX9 has the ESGS ring buffer in LDS. */
4691 (ctx->screen->b.chip_class >= GFX9 &&
4692 (shader->key.as_es ||
4693 ctx->type == PIPE_SHADER_GEOMETRY)))
4694 declare_lds_as_pointer(ctx);
4695 }
4696
4697 /**
4698 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4699 * for later use.
4700 */
4701 static void preload_ring_buffers(struct si_shader_context *ctx)
4702 {
4703 struct gallivm_state *gallivm = &ctx->gallivm;
4704 LLVMBuilderRef builder = gallivm->builder;
4705
4706 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4707 ctx->param_rw_buffers);
4708
4709 if (ctx->screen->b.chip_class <= VI &&
4710 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4711 unsigned ring =
4712 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4713 : SI_ES_RING_ESGS;
4714 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4715
4716 ctx->esgs_ring =
4717 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4718 }
4719
4720 if (ctx->shader->is_gs_copy_shader) {
4721 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4722
4723 ctx->gsvs_ring[0] =
4724 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4725 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4726 const struct si_shader_selector *sel = ctx->shader->selector;
4727 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4728 LLVMValueRef base_ring;
4729
4730 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
4731
4732 /* The conceptual layout of the GSVS ring is
4733 * v0c0 .. vLv0 v0c1 .. vLc1 ..
4734 * but the real memory layout is swizzled across
4735 * threads:
4736 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
4737 * t16v0c0 ..
4738 * Override the buffer descriptor accordingly.
4739 */
4740 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
4741 uint64_t stream_offset = 0;
4742
4743 for (unsigned stream = 0; stream < 4; ++stream) {
4744 unsigned num_components;
4745 unsigned stride;
4746 unsigned num_records;
4747 LLVMValueRef ring, tmp;
4748
4749 num_components = sel->info.num_stream_output_components[stream];
4750 if (!num_components)
4751 continue;
4752
4753 stride = 4 * num_components * sel->gs_max_out_vertices;
4754
4755 /* Limit on the stride field for <= CIK. */
4756 assert(stride < (1 << 14));
4757
4758 num_records = 64;
4759
4760 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
4761 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
4762 tmp = LLVMBuildAdd(builder, tmp,
4763 LLVMConstInt(ctx->i64,
4764 stream_offset, 0), "");
4765 stream_offset += stride * 64;
4766
4767 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
4768 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
4769 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
4770 tmp = LLVMBuildOr(builder, tmp,
4771 LLVMConstInt(ctx->i32,
4772 S_008F04_STRIDE(stride) |
4773 S_008F04_SWIZZLE_ENABLE(1), 0), "");
4774 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
4775 ring = LLVMBuildInsertElement(builder, ring,
4776 LLVMConstInt(ctx->i32, num_records, 0),
4777 LLVMConstInt(ctx->i32, 2, 0), "");
4778 ring = LLVMBuildInsertElement(builder, ring,
4779 LLVMConstInt(ctx->i32,
4780 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4781 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4782 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4783 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
4784 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4785 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
4786 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
4787 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
4788 S_008F0C_ADD_TID_ENABLE(1),
4789 0),
4790 LLVMConstInt(ctx->i32, 3, 0), "");
4791
4792 ctx->gsvs_ring[stream] = ring;
4793 }
4794 }
4795 }
4796
4797 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4798 LLVMValueRef param_rw_buffers,
4799 unsigned param_pos_fixed_pt)
4800 {
4801 struct gallivm_state *gallivm = &ctx->gallivm;
4802 LLVMBuilderRef builder = gallivm->builder;
4803 LLVMValueRef slot, desc, offset, row, bit, address[2];
4804
4805 /* Use the fixed-point gl_FragCoord input.
4806 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4807 * per coordinate to get the repeating effect.
4808 */
4809 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4810 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4811
4812 /* Load the buffer descriptor. */
4813 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4814 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
4815
4816 /* The stipple pattern is 32x32, each row has 32 bits. */
4817 offset = LLVMBuildMul(builder, address[1],
4818 LLVMConstInt(ctx->i32, 4, 0), "");
4819 row = buffer_load_const(ctx, desc, offset);
4820 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
4821 bit = LLVMBuildLShr(builder, row, address[0], "");
4822 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4823
4824 /* The intrinsic kills the thread if arg < 0. */
4825 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
4826 LLVMConstReal(ctx->f32, -1), "");
4827 ac_build_kill(&ctx->ac, bit);
4828 }
4829
4830 void si_shader_binary_read_config(struct ac_shader_binary *binary,
4831 struct si_shader_config *conf,
4832 unsigned symbol_offset)
4833 {
4834 unsigned i;
4835 const unsigned char *config =
4836 ac_shader_binary_config_start(binary, symbol_offset);
4837 bool really_needs_scratch = false;
4838
4839 /* LLVM adds SGPR spills to the scratch size.
4840 * Find out if we really need the scratch buffer.
4841 */
4842 for (i = 0; i < binary->reloc_count; i++) {
4843 const struct ac_shader_reloc *reloc = &binary->relocs[i];
4844
4845 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
4846 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4847 really_needs_scratch = true;
4848 break;
4849 }
4850 }
4851
4852 /* XXX: We may be able to emit some of these values directly rather than
4853 * extracting fields to be emitted later.
4854 */
4855
4856 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
4857 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
4858 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
4859 switch (reg) {
4860 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
4861 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
4862 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
4863 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
4864 case R_00B848_COMPUTE_PGM_RSRC1:
4865 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
4866 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
4867 conf->float_mode = G_00B028_FLOAT_MODE(value);
4868 conf->rsrc1 = value;
4869 break;
4870 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
4871 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
4872 break;
4873 case R_00B84C_COMPUTE_PGM_RSRC2:
4874 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
4875 conf->rsrc2 = value;
4876 break;
4877 case R_0286CC_SPI_PS_INPUT_ENA:
4878 conf->spi_ps_input_ena = value;
4879 break;
4880 case R_0286D0_SPI_PS_INPUT_ADDR:
4881 conf->spi_ps_input_addr = value;
4882 break;
4883 case R_0286E8_SPI_TMPRING_SIZE:
4884 case R_00B860_COMPUTE_TMPRING_SIZE:
4885 /* WAVESIZE is in units of 256 dwords. */
4886 if (really_needs_scratch)
4887 conf->scratch_bytes_per_wave =
4888 G_00B860_WAVESIZE(value) * 256 * 4;
4889 break;
4890 case 0x4: /* SPILLED_SGPRS */
4891 conf->spilled_sgprs = value;
4892 break;
4893 case 0x8: /* SPILLED_VGPRS */
4894 conf->spilled_vgprs = value;
4895 break;
4896 default:
4897 {
4898 static bool printed;
4899
4900 if (!printed) {
4901 fprintf(stderr, "Warning: LLVM emitted unknown "
4902 "config register: 0x%x\n", reg);
4903 printed = true;
4904 }
4905 }
4906 break;
4907 }
4908 }
4909
4910 if (!conf->spi_ps_input_addr)
4911 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
4912 }
4913
4914 void si_shader_apply_scratch_relocs(struct si_shader *shader,
4915 uint64_t scratch_va)
4916 {
4917 unsigned i;
4918 uint32_t scratch_rsrc_dword0 = scratch_va;
4919 uint32_t scratch_rsrc_dword1 =
4920 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
4921
4922 /* Enable scratch coalescing. */
4923 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
4924
4925 for (i = 0 ; i < shader->binary.reloc_count; i++) {
4926 const struct ac_shader_reloc *reloc =
4927 &shader->binary.relocs[i];
4928 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
4929 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4930 &scratch_rsrc_dword0, 4);
4931 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
4932 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
4933 &scratch_rsrc_dword1, 4);
4934 }
4935 }
4936 }
4937
4938 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
4939 {
4940 unsigned size = shader->binary.code_size;
4941
4942 if (shader->prolog)
4943 size += shader->prolog->binary.code_size;
4944 if (shader->previous_stage)
4945 size += shader->previous_stage->binary.code_size;
4946 if (shader->prolog2)
4947 size += shader->prolog2->binary.code_size;
4948 if (shader->epilog)
4949 size += shader->epilog->binary.code_size;
4950 return size;
4951 }
4952
4953 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
4954 {
4955 const struct ac_shader_binary *prolog =
4956 shader->prolog ? &shader->prolog->binary : NULL;
4957 const struct ac_shader_binary *previous_stage =
4958 shader->previous_stage ? &shader->previous_stage->binary : NULL;
4959 const struct ac_shader_binary *prolog2 =
4960 shader->prolog2 ? &shader->prolog2->binary : NULL;
4961 const struct ac_shader_binary *epilog =
4962 shader->epilog ? &shader->epilog->binary : NULL;
4963 const struct ac_shader_binary *mainb = &shader->binary;
4964 unsigned bo_size = si_get_shader_binary_size(shader) +
4965 (!epilog ? mainb->rodata_size : 0);
4966 unsigned char *ptr;
4967
4968 assert(!prolog || !prolog->rodata_size);
4969 assert(!previous_stage || !previous_stage->rodata_size);
4970 assert(!prolog2 || !prolog2->rodata_size);
4971 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
4972 !mainb->rodata_size);
4973 assert(!epilog || !epilog->rodata_size);
4974
4975 r600_resource_reference(&shader->bo, NULL);
4976 shader->bo = (struct r600_resource*)
4977 pipe_buffer_create(&sscreen->b.b, 0,
4978 PIPE_USAGE_IMMUTABLE,
4979 align(bo_size, SI_CPDMA_ALIGNMENT));
4980 if (!shader->bo)
4981 return -ENOMEM;
4982
4983 /* Upload. */
4984 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
4985 PIPE_TRANSFER_READ_WRITE |
4986 PIPE_TRANSFER_UNSYNCHRONIZED);
4987
4988 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
4989 * endian-independent. */
4990 if (prolog) {
4991 memcpy(ptr, prolog->code, prolog->code_size);
4992 ptr += prolog->code_size;
4993 }
4994 if (previous_stage) {
4995 memcpy(ptr, previous_stage->code, previous_stage->code_size);
4996 ptr += previous_stage->code_size;
4997 }
4998 if (prolog2) {
4999 memcpy(ptr, prolog2->code, prolog2->code_size);
5000 ptr += prolog2->code_size;
5001 }
5002
5003 memcpy(ptr, mainb->code, mainb->code_size);
5004 ptr += mainb->code_size;
5005
5006 if (epilog)
5007 memcpy(ptr, epilog->code, epilog->code_size);
5008 else if (mainb->rodata_size > 0)
5009 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5010
5011 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5012 return 0;
5013 }
5014
5015 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5016 struct pipe_debug_callback *debug,
5017 const char *name, FILE *file)
5018 {
5019 char *line, *p;
5020 unsigned i, count;
5021
5022 if (binary->disasm_string) {
5023 fprintf(file, "Shader %s disassembly:\n", name);
5024 fprintf(file, "%s", binary->disasm_string);
5025
5026 if (debug && debug->debug_message) {
5027 /* Very long debug messages are cut off, so send the
5028 * disassembly one line at a time. This causes more
5029 * overhead, but on the plus side it simplifies
5030 * parsing of resulting logs.
5031 */
5032 pipe_debug_message(debug, SHADER_INFO,
5033 "Shader Disassembly Begin");
5034
5035 line = binary->disasm_string;
5036 while (*line) {
5037 p = util_strchrnul(line, '\n');
5038 count = p - line;
5039
5040 if (count) {
5041 pipe_debug_message(debug, SHADER_INFO,
5042 "%.*s", count, line);
5043 }
5044
5045 if (!*p)
5046 break;
5047 line = p + 1;
5048 }
5049
5050 pipe_debug_message(debug, SHADER_INFO,
5051 "Shader Disassembly End");
5052 }
5053 } else {
5054 fprintf(file, "Shader %s binary:\n", name);
5055 for (i = 0; i < binary->code_size; i += 4) {
5056 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5057 binary->code[i + 3], binary->code[i + 2],
5058 binary->code[i + 1], binary->code[i]);
5059 }
5060 }
5061 }
5062
5063 static void si_shader_dump_stats(struct si_screen *sscreen,
5064 const struct si_shader *shader,
5065 struct pipe_debug_callback *debug,
5066 unsigned processor,
5067 FILE *file,
5068 bool check_debug_option)
5069 {
5070 const struct si_shader_config *conf = &shader->config;
5071 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
5072 unsigned code_size = si_get_shader_binary_size(shader);
5073 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
5074 unsigned lds_per_wave = 0;
5075 unsigned max_simd_waves;
5076
5077 switch (sscreen->b.family) {
5078 /* These always have 8 waves: */
5079 case CHIP_POLARIS10:
5080 case CHIP_POLARIS11:
5081 case CHIP_POLARIS12:
5082 max_simd_waves = 8;
5083 break;
5084 default:
5085 max_simd_waves = 10;
5086 }
5087
5088 /* Compute LDS usage for PS. */
5089 switch (processor) {
5090 case PIPE_SHADER_FRAGMENT:
5091 /* The minimum usage per wave is (num_inputs * 48). The maximum
5092 * usage is (num_inputs * 48 * 16).
5093 * We can get anything in between and it varies between waves.
5094 *
5095 * The 48 bytes per input for a single primitive is equal to
5096 * 4 bytes/component * 4 components/input * 3 points.
5097 *
5098 * Other stages don't know the size at compile time or don't
5099 * allocate LDS per wave, but instead they do it per thread group.
5100 */
5101 lds_per_wave = conf->lds_size * lds_increment +
5102 align(num_inputs * 48, lds_increment);
5103 break;
5104 case PIPE_SHADER_COMPUTE:
5105 if (shader->selector) {
5106 unsigned max_workgroup_size =
5107 si_get_max_workgroup_size(shader);
5108 lds_per_wave = (conf->lds_size * lds_increment) /
5109 DIV_ROUND_UP(max_workgroup_size, 64);
5110 }
5111 break;
5112 }
5113
5114 /* Compute the per-SIMD wave counts. */
5115 if (conf->num_sgprs) {
5116 if (sscreen->b.chip_class >= VI)
5117 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
5118 else
5119 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
5120 }
5121
5122 if (conf->num_vgprs)
5123 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5124
5125 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
5126 * 16KB makes some SIMDs unoccupied). */
5127 if (lds_per_wave)
5128 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5129
5130 if (!check_debug_option ||
5131 r600_can_dump_shader(&sscreen->b, processor)) {
5132 if (processor == PIPE_SHADER_FRAGMENT) {
5133 fprintf(file, "*** SHADER CONFIG ***\n"
5134 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5135 "SPI_PS_INPUT_ENA = 0x%04x\n",
5136 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5137 }
5138
5139 fprintf(file, "*** SHADER STATS ***\n"
5140 "SGPRS: %d\n"
5141 "VGPRS: %d\n"
5142 "Spilled SGPRs: %d\n"
5143 "Spilled VGPRs: %d\n"
5144 "Private memory VGPRs: %d\n"
5145 "Code Size: %d bytes\n"
5146 "LDS: %d blocks\n"
5147 "Scratch: %d bytes per wave\n"
5148 "Max Waves: %d\n"
5149 "********************\n\n\n",
5150 conf->num_sgprs, conf->num_vgprs,
5151 conf->spilled_sgprs, conf->spilled_vgprs,
5152 conf->private_mem_vgprs, code_size,
5153 conf->lds_size, conf->scratch_bytes_per_wave,
5154 max_simd_waves);
5155 }
5156
5157 pipe_debug_message(debug, SHADER_INFO,
5158 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5159 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5160 "Spilled VGPRs: %d PrivMem VGPRs: %d",
5161 conf->num_sgprs, conf->num_vgprs, code_size,
5162 conf->lds_size, conf->scratch_bytes_per_wave,
5163 max_simd_waves, conf->spilled_sgprs,
5164 conf->spilled_vgprs, conf->private_mem_vgprs);
5165 }
5166
5167 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5168 {
5169 switch (processor) {
5170 case PIPE_SHADER_VERTEX:
5171 if (shader->key.as_es)
5172 return "Vertex Shader as ES";
5173 else if (shader->key.as_ls)
5174 return "Vertex Shader as LS";
5175 else
5176 return "Vertex Shader as VS";
5177 case PIPE_SHADER_TESS_CTRL:
5178 return "Tessellation Control Shader";
5179 case PIPE_SHADER_TESS_EVAL:
5180 if (shader->key.as_es)
5181 return "Tessellation Evaluation Shader as ES";
5182 else
5183 return "Tessellation Evaluation Shader as VS";
5184 case PIPE_SHADER_GEOMETRY:
5185 if (shader->is_gs_copy_shader)
5186 return "GS Copy Shader as VS";
5187 else
5188 return "Geometry Shader";
5189 case PIPE_SHADER_FRAGMENT:
5190 return "Pixel Shader";
5191 case PIPE_SHADER_COMPUTE:
5192 return "Compute Shader";
5193 default:
5194 return "Unknown Shader";
5195 }
5196 }
5197
5198 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5199 struct pipe_debug_callback *debug, unsigned processor,
5200 FILE *file, bool check_debug_option)
5201 {
5202 if (!check_debug_option ||
5203 r600_can_dump_shader(&sscreen->b, processor))
5204 si_dump_shader_key(processor, shader, file);
5205
5206 if (!check_debug_option && shader->binary.llvm_ir_string) {
5207 if (shader->previous_stage &&
5208 shader->previous_stage->binary.llvm_ir_string) {
5209 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5210 si_get_shader_name(shader, processor));
5211 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5212 }
5213
5214 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5215 si_get_shader_name(shader, processor));
5216 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5217 }
5218
5219 if (!check_debug_option ||
5220 (r600_can_dump_shader(&sscreen->b, processor) &&
5221 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5222 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5223
5224 if (shader->prolog)
5225 si_shader_dump_disassembly(&shader->prolog->binary,
5226 debug, "prolog", file);
5227 if (shader->previous_stage)
5228 si_shader_dump_disassembly(&shader->previous_stage->binary,
5229 debug, "previous stage", file);
5230 if (shader->prolog2)
5231 si_shader_dump_disassembly(&shader->prolog2->binary,
5232 debug, "prolog2", file);
5233
5234 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5235
5236 if (shader->epilog)
5237 si_shader_dump_disassembly(&shader->epilog->binary,
5238 debug, "epilog", file);
5239 fprintf(file, "\n");
5240 }
5241
5242 si_shader_dump_stats(sscreen, shader, debug, processor, file,
5243 check_debug_option);
5244 }
5245
5246 static int si_compile_llvm(struct si_screen *sscreen,
5247 struct ac_shader_binary *binary,
5248 struct si_shader_config *conf,
5249 LLVMTargetMachineRef tm,
5250 LLVMModuleRef mod,
5251 struct pipe_debug_callback *debug,
5252 unsigned processor,
5253 const char *name)
5254 {
5255 int r = 0;
5256 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5257
5258 if (r600_can_dump_shader(&sscreen->b, processor)) {
5259 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5260
5261 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5262 fprintf(stderr, "%s LLVM IR:\n\n", name);
5263 ac_dump_module(mod);
5264 fprintf(stderr, "\n");
5265 }
5266 }
5267
5268 if (sscreen->record_llvm_ir) {
5269 char *ir = LLVMPrintModuleToString(mod);
5270 binary->llvm_ir_string = strdup(ir);
5271 LLVMDisposeMessage(ir);
5272 }
5273
5274 if (!si_replace_shader(count, binary)) {
5275 r = si_llvm_compile(mod, binary, tm, debug);
5276 if (r)
5277 return r;
5278 }
5279
5280 si_shader_binary_read_config(binary, conf, 0);
5281
5282 /* Enable 64-bit and 16-bit denormals, because there is no performance
5283 * cost.
5284 *
5285 * If denormals are enabled, all floating-point output modifiers are
5286 * ignored.
5287 *
5288 * Don't enable denormals for 32-bit floats, because:
5289 * - Floating-point output modifiers would be ignored by the hw.
5290 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5291 * have to stop using those.
5292 * - SI & CI would be very slow.
5293 */
5294 conf->float_mode |= V_00B028_FP_64_DENORMS;
5295
5296 FREE(binary->config);
5297 FREE(binary->global_symbol_offsets);
5298 binary->config = NULL;
5299 binary->global_symbol_offsets = NULL;
5300
5301 /* Some shaders can't have rodata because their binaries can be
5302 * concatenated.
5303 */
5304 if (binary->rodata_size &&
5305 (processor == PIPE_SHADER_VERTEX ||
5306 processor == PIPE_SHADER_TESS_CTRL ||
5307 processor == PIPE_SHADER_TESS_EVAL ||
5308 processor == PIPE_SHADER_FRAGMENT)) {
5309 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5310 return -EINVAL;
5311 }
5312
5313 return r;
5314 }
5315
5316 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5317 {
5318 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5319 LLVMBuildRetVoid(ctx->gallivm.builder);
5320 else
5321 LLVMBuildRet(ctx->gallivm.builder, ret);
5322 }
5323
5324 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5325 struct si_shader *
5326 si_generate_gs_copy_shader(struct si_screen *sscreen,
5327 LLVMTargetMachineRef tm,
5328 struct si_shader_selector *gs_selector,
5329 struct pipe_debug_callback *debug)
5330 {
5331 struct si_shader_context ctx;
5332 struct si_shader *shader;
5333 struct gallivm_state *gallivm = &ctx.gallivm;
5334 LLVMBuilderRef builder;
5335 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
5336 struct lp_build_context *uint = &bld_base->uint_bld;
5337 struct si_shader_output_values *outputs;
5338 struct tgsi_shader_info *gsinfo = &gs_selector->info;
5339 int i, r;
5340
5341 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5342
5343 if (!outputs)
5344 return NULL;
5345
5346 shader = CALLOC_STRUCT(si_shader);
5347 if (!shader) {
5348 FREE(outputs);
5349 return NULL;
5350 }
5351
5352
5353 shader->selector = gs_selector;
5354 shader->is_gs_copy_shader = true;
5355
5356 si_init_shader_ctx(&ctx, sscreen, tm);
5357 ctx.shader = shader;
5358 ctx.type = PIPE_SHADER_VERTEX;
5359
5360 builder = gallivm->builder;
5361
5362 create_function(&ctx);
5363 preload_ring_buffers(&ctx);
5364
5365 LLVMValueRef voffset =
5366 lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);
5367
5368 /* Fetch the vertex stream ID.*/
5369 LLVMValueRef stream_id;
5370
5371 if (gs_selector->so.num_outputs)
5372 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
5373 else
5374 stream_id = ctx.i32_0;
5375
5376 /* Fill in output information. */
5377 for (i = 0; i < gsinfo->num_outputs; ++i) {
5378 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
5379 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
5380
5381 for (int chan = 0; chan < 4; chan++) {
5382 outputs[i].vertex_stream[chan] =
5383 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
5384 }
5385 }
5386
5387 LLVMBasicBlockRef end_bb;
5388 LLVMValueRef switch_inst;
5389
5390 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
5391 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
5392
5393 for (int stream = 0; stream < 4; stream++) {
5394 LLVMBasicBlockRef bb;
5395 unsigned offset;
5396
5397 if (!gsinfo->num_stream_output_components[stream])
5398 continue;
5399
5400 if (stream > 0 && !gs_selector->so.num_outputs)
5401 continue;
5402
5403 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
5404 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
5405 LLVMPositionBuilderAtEnd(builder, bb);
5406
5407 /* Fetch vertex data from GSVS ring */
5408 offset = 0;
5409 for (i = 0; i < gsinfo->num_outputs; ++i) {
5410 for (unsigned chan = 0; chan < 4; chan++) {
5411 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
5412 outputs[i].vertex_stream[chan] != stream) {
5413 outputs[i].values[chan] = ctx.bld_base.base.undef;
5414 continue;
5415 }
5416
5417 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
5418 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
5419 offset++;
5420
5421 outputs[i].values[chan] =
5422 ac_build_buffer_load(&ctx.ac,
5423 ctx.gsvs_ring[0], 1,
5424 ctx.i32_0, voffset,
5425 soffset, 0, 1, 1,
5426 true, false);
5427 }
5428 }
5429
5430 /* Streamout and exports. */
5431 if (gs_selector->so.num_outputs) {
5432 si_llvm_emit_streamout(&ctx, outputs,
5433 gsinfo->num_outputs,
5434 stream);
5435 }
5436
5437 if (stream == 0)
5438 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
5439
5440 LLVMBuildBr(builder, end_bb);
5441 }
5442
5443 LLVMPositionBuilderAtEnd(builder, end_bb);
5444
5445 LLVMBuildRetVoid(gallivm->builder);
5446
5447 ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
5448 si_llvm_optimize_module(&ctx);
5449
5450 r = si_compile_llvm(sscreen, &ctx.shader->binary,
5451 &ctx.shader->config, ctx.tm,
5452 ctx.gallivm.module,
5453 debug, PIPE_SHADER_GEOMETRY,
5454 "GS Copy Shader");
5455 if (!r) {
5456 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
5457 fprintf(stderr, "GS Copy Shader:\n");
5458 si_shader_dump(sscreen, ctx.shader, debug,
5459 PIPE_SHADER_GEOMETRY, stderr, true);
5460 r = si_shader_binary_upload(sscreen, ctx.shader);
5461 }
5462
5463 si_llvm_dispose(&ctx);
5464
5465 FREE(outputs);
5466
5467 if (r != 0) {
5468 FREE(shader);
5469 shader = NULL;
5470 }
5471 return shader;
5472 }
5473
5474 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5475 const struct si_vs_prolog_bits *prolog,
5476 const char *prefix, FILE *f)
5477 {
5478 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5479 prefix, prolog->instance_divisor_is_one);
5480 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5481 prefix, prolog->instance_divisor_is_fetched);
5482 fprintf(f, " %s.ls_vgpr_fix = %u\n",
5483 prefix, prolog->ls_vgpr_fix);
5484
5485 fprintf(f, " mono.vs.fix_fetch = {");
5486 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5487 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5488 fprintf(f, "}\n");
5489 }
5490
5491 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5492 FILE *f)
5493 {
5494 const struct si_shader_key *key = &shader->key;
5495
5496 fprintf(f, "SHADER KEY\n");
5497
5498 switch (processor) {
5499 case PIPE_SHADER_VERTEX:
5500 si_dump_shader_key_vs(key, &key->part.vs.prolog,
5501 "part.vs.prolog", f);
5502 fprintf(f, " as_es = %u\n", key->as_es);
5503 fprintf(f, " as_ls = %u\n", key->as_ls);
5504 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
5505 key->mono.u.vs_export_prim_id);
5506 break;
5507
5508 case PIPE_SHADER_TESS_CTRL:
5509 if (shader->selector->screen->b.chip_class >= GFX9) {
5510 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5511 "part.tcs.ls_prolog", f);
5512 }
5513 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5514 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5515 break;
5516
5517 case PIPE_SHADER_TESS_EVAL:
5518 fprintf(f, " as_es = %u\n", key->as_es);
5519 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
5520 key->mono.u.vs_export_prim_id);
5521 break;
5522
5523 case PIPE_SHADER_GEOMETRY:
5524 if (shader->is_gs_copy_shader)
5525 break;
5526
5527 if (shader->selector->screen->b.chip_class >= GFX9 &&
5528 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5529 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5530 "part.gs.vs_prolog", f);
5531 }
5532 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5533 break;
5534
5535 case PIPE_SHADER_COMPUTE:
5536 break;
5537
5538 case PIPE_SHADER_FRAGMENT:
5539 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5540 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5541 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5542 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5543 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5544 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5545 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5546 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5547 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5548 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5549 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5550 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5551 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5552 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5553 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5554 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5555 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5556 break;
5557
5558 default:
5559 assert(0);
5560 }
5561
5562 if ((processor == PIPE_SHADER_GEOMETRY ||
5563 processor == PIPE_SHADER_TESS_EVAL ||
5564 processor == PIPE_SHADER_VERTEX) &&
5565 !key->as_es && !key->as_ls) {
5566 fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5567 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
5568 }
5569 }
5570
5571 static void si_init_shader_ctx(struct si_shader_context *ctx,
5572 struct si_screen *sscreen,
5573 LLVMTargetMachineRef tm)
5574 {
5575 struct lp_build_tgsi_context *bld_base;
5576
5577 ctx->abi.chip_class = sscreen->b.chip_class;
5578
5579 si_llvm_context_init(ctx, sscreen, tm);
5580
5581 bld_base = &ctx->bld_base;
5582 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5583
5584 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5585 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5586 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5587
5588 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5589
5590 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5591
5592 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5593 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5594 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5595 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5596
5597 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5598 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5599 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5600 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5601 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5602 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5603 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5604 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
5605 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5606
5607 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
5608 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5609 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5610 }
5611
5612 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5613 {
5614 struct si_shader *shader = ctx->shader;
5615 struct tgsi_shader_info *info = &shader->selector->info;
5616
5617 if ((ctx->type != PIPE_SHADER_VERTEX &&
5618 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5619 shader->key.as_ls ||
5620 shader->key.as_es)
5621 return;
5622
5623 ac_optimize_vs_outputs(&ctx->ac,
5624 ctx->main_fn,
5625 shader->info.vs_output_param_offset,
5626 info->num_outputs,
5627 &shader->info.nr_param_exports);
5628 }
5629
5630 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5631 {
5632 ctx->shader->config.private_mem_vgprs = 0;
5633
5634 /* Process all LLVM instructions. */
5635 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5636 while (bb) {
5637 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5638
5639 while (next) {
5640 LLVMValueRef inst = next;
5641 next = LLVMGetNextInstruction(next);
5642
5643 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5644 continue;
5645
5646 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5647 /* No idea why LLVM aligns allocas to 4 elements. */
5648 unsigned alignment = LLVMGetAlignment(inst);
5649 unsigned dw_size = align(ac_get_type_size(type) / 4, alignment);
5650 ctx->shader->config.private_mem_vgprs += dw_size;
5651 }
5652 bb = LLVMGetNextBasicBlock(bb);
5653 }
5654 }
5655
5656 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5657 {
5658 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5659 lp_build_intrinsic(ctx->gallivm.builder,
5660 "llvm.amdgcn.init.exec", ctx->voidt,
5661 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5662 }
5663
5664 static void si_init_exec_from_input(struct si_shader_context *ctx,
5665 unsigned param, unsigned bitoffset)
5666 {
5667 LLVMValueRef args[] = {
5668 LLVMGetParam(ctx->main_fn, param),
5669 LLVMConstInt(ctx->i32, bitoffset, 0),
5670 };
5671 lp_build_intrinsic(ctx->gallivm.builder,
5672 "llvm.amdgcn.init.exec.from.input",
5673 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5674 }
5675
5676 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5677 const struct si_vs_prolog_bits *key)
5678 {
5679 /* VGPR initialization fixup for Vega10 and Raven is always done in the
5680 * VS prolog. */
5681 return sel->vs_needs_prolog || key->ls_vgpr_fix;
5682 }
5683
5684 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
5685 bool is_monolithic)
5686 {
5687 struct si_shader *shader = ctx->shader;
5688 struct si_shader_selector *sel = shader->selector;
5689 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5690
5691 // TODO clean all this up!
5692 switch (ctx->type) {
5693 case PIPE_SHADER_VERTEX:
5694 ctx->load_input = declare_input_vs;
5695 if (shader->key.as_ls)
5696 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
5697 else if (shader->key.as_es)
5698 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5699 else {
5700 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5701 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5702 }
5703 break;
5704 case PIPE_SHADER_TESS_CTRL:
5705 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5706 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5707 bld_base->emit_store = store_output_tcs;
5708 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
5709 break;
5710 case PIPE_SHADER_TESS_EVAL:
5711 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5712 if (shader->key.as_es)
5713 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5714 else {
5715 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5716 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5717 }
5718 break;
5719 case PIPE_SHADER_GEOMETRY:
5720 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
5721 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
5722 break;
5723 case PIPE_SHADER_FRAGMENT:
5724 ctx->load_input = declare_input_fs;
5725 ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
5726 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5727 break;
5728 case PIPE_SHADER_COMPUTE:
5729 break;
5730 default:
5731 assert(!"Unsupported shader type");
5732 return false;
5733 }
5734
5735 ctx->abi.load_ubo = load_ubo;
5736 ctx->abi.load_ssbo = load_ssbo;
5737
5738 create_function(ctx);
5739 preload_ring_buffers(ctx);
5740
5741 /* For GFX9 merged shaders:
5742 * - Set EXEC for the first shader. If the prolog is present, set
5743 * EXEC there instead.
5744 * - Add a barrier before the second shader.
5745 * - In the second shader, reset EXEC to ~0 and wrap the main part in
5746 * an if-statement. This is required for correctness in geometry
5747 * shaders, to ensure that empty GS waves do not send GS_EMIT and
5748 * GS_CUT messages.
5749 *
5750 * For monolithic merged shaders, the first shader is wrapped in an
5751 * if-block together with its prolog in si_build_wrapper_function.
5752 */
5753 if (ctx->screen->b.chip_class >= GFX9) {
5754 if (!is_monolithic &&
5755 sel->info.num_instructions > 1 && /* not empty shader */
5756 (shader->key.as_es || shader->key.as_ls) &&
5757 (ctx->type == PIPE_SHADER_TESS_EVAL ||
5758 (ctx->type == PIPE_SHADER_VERTEX &&
5759 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
5760 si_init_exec_from_input(ctx,
5761 ctx->param_merged_wave_info, 0);
5762 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
5763 ctx->type == PIPE_SHADER_GEOMETRY) {
5764 if (!is_monolithic)
5765 si_init_exec_full_mask(ctx);
5766
5767 /* The barrier must execute for all shaders in a
5768 * threadgroup.
5769 */
5770 si_llvm_emit_barrier(NULL, bld_base, NULL);
5771
5772 LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
5773 LLVMValueRef ena =
5774 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
5775 ac_get_thread_id(&ctx->ac), num_threads, "");
5776 lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
5777 }
5778 }
5779
5780 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5781 int i;
5782 for (i = 0; i < 4; i++) {
5783 ctx->gs_next_vertex[i] =
5784 lp_build_alloca(&ctx->gallivm,
5785 ctx->i32, "");
5786 }
5787 }
5788
5789 if (ctx->type == PIPE_SHADER_FRAGMENT && sel->info.uses_kill &&
5790 ctx->screen->b.debug_flags & DBG_FS_CORRECT_DERIVS_AFTER_KILL) {
5791 /* This is initialized to 0.0 = not kill. */
5792 ctx->postponed_kill = lp_build_alloca(&ctx->gallivm, ctx->f32, "");
5793 }
5794
5795 if (sel->tokens) {
5796 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
5797 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
5798 return false;
5799 }
5800 } else {
5801 if (!si_nir_build_llvm(ctx, sel->nir)) {
5802 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
5803 return false;
5804 }
5805 }
5806
5807 si_llvm_build_ret(ctx, ctx->return_value);
5808 return true;
5809 }
5810
5811 /**
5812 * Compute the VS prolog key, which contains all the information needed to
5813 * build the VS prolog function, and set shader->info bits where needed.
5814 *
5815 * \param info Shader info of the vertex shader.
5816 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5817 * \param prolog_key Key of the VS prolog
5818 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5819 * \param key Output shader part key.
5820 */
5821 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5822 unsigned num_input_sgprs,
5823 const struct si_vs_prolog_bits *prolog_key,
5824 struct si_shader *shader_out,
5825 union si_shader_part_key *key)
5826 {
5827 memset(key, 0, sizeof(*key));
5828 key->vs_prolog.states = *prolog_key;
5829 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5830 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5831 key->vs_prolog.as_ls = shader_out->key.as_ls;
5832
5833 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5834 key->vs_prolog.as_ls = 1;
5835 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5836 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5837 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5838 }
5839
5840 /* Enable loading the InstanceID VGPR. */
5841 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5842
5843 if ((key->vs_prolog.states.instance_divisor_is_one |
5844 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5845 shader_out->info.uses_instanceid = true;
5846 }
5847
5848 /**
5849 * Compute the PS prolog key, which contains all the information needed to
5850 * build the PS prolog function, and set related bits in shader->config.
5851 */
5852 static void si_get_ps_prolog_key(struct si_shader *shader,
5853 union si_shader_part_key *key,
5854 bool separate_prolog)
5855 {
5856 struct tgsi_shader_info *info = &shader->selector->info;
5857
5858 memset(key, 0, sizeof(*key));
5859 key->ps_prolog.states = shader->key.part.ps.prolog;
5860 key->ps_prolog.colors_read = info->colors_read;
5861 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
5862 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
5863 key->ps_prolog.wqm = info->uses_derivatives &&
5864 (key->ps_prolog.colors_read ||
5865 key->ps_prolog.states.force_persp_sample_interp ||
5866 key->ps_prolog.states.force_linear_sample_interp ||
5867 key->ps_prolog.states.force_persp_center_interp ||
5868 key->ps_prolog.states.force_linear_center_interp ||
5869 key->ps_prolog.states.bc_optimize_for_persp ||
5870 key->ps_prolog.states.bc_optimize_for_linear);
5871
5872 if (info->colors_read) {
5873 unsigned *color = shader->selector->color_attr_index;
5874
5875 if (shader->key.part.ps.prolog.color_two_side) {
5876 /* BCOLORs are stored after the last input. */
5877 key->ps_prolog.num_interp_inputs = info->num_inputs;
5878 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
5879 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
5880 }
5881
5882 for (unsigned i = 0; i < 2; i++) {
5883 unsigned interp = info->input_interpolate[color[i]];
5884 unsigned location = info->input_interpolate_loc[color[i]];
5885
5886 if (!(info->colors_read & (0xf << i*4)))
5887 continue;
5888
5889 key->ps_prolog.color_attr_index[i] = color[i];
5890
5891 if (shader->key.part.ps.prolog.flatshade_colors &&
5892 interp == TGSI_INTERPOLATE_COLOR)
5893 interp = TGSI_INTERPOLATE_CONSTANT;
5894
5895 switch (interp) {
5896 case TGSI_INTERPOLATE_CONSTANT:
5897 key->ps_prolog.color_interp_vgpr_index[i] = -1;
5898 break;
5899 case TGSI_INTERPOLATE_PERSPECTIVE:
5900 case TGSI_INTERPOLATE_COLOR:
5901 /* Force the interpolation location for colors here. */
5902 if (shader->key.part.ps.prolog.force_persp_sample_interp)
5903 location = TGSI_INTERPOLATE_LOC_SAMPLE;
5904 if (shader->key.part.ps.prolog.force_persp_center_interp)
5905 location = TGSI_INTERPOLATE_LOC_CENTER;
5906
5907 switch (location) {
5908 case TGSI_INTERPOLATE_LOC_SAMPLE:
5909 key->ps_prolog.color_interp_vgpr_index[i] = 0;
5910 shader->config.spi_ps_input_ena |=
5911 S_0286CC_PERSP_SAMPLE_ENA(1);
5912 break;
5913 case TGSI_INTERPOLATE_LOC_CENTER:
5914 key->ps_prolog.color_interp_vgpr_index[i] = 2;
5915 shader->config.spi_ps_input_ena |=
5916 S_0286CC_PERSP_CENTER_ENA(1);
5917 break;
5918 case TGSI_INTERPOLATE_LOC_CENTROID:
5919 key->ps_prolog.color_interp_vgpr_index[i] = 4;
5920 shader->config.spi_ps_input_ena |=
5921 S_0286CC_PERSP_CENTROID_ENA(1);
5922 break;
5923 default:
5924 assert(0);
5925 }
5926 break;
5927 case TGSI_INTERPOLATE_LINEAR:
5928 /* Force the interpolation location for colors here. */
5929 if (shader->key.part.ps.prolog.force_linear_sample_interp)
5930 location = TGSI_INTERPOLATE_LOC_SAMPLE;
5931 if (shader->key.part.ps.prolog.force_linear_center_interp)
5932 location = TGSI_INTERPOLATE_LOC_CENTER;
5933
5934 /* The VGPR assignment for non-monolithic shaders
5935 * works because InitialPSInputAddr is set on the
5936 * main shader and PERSP_PULL_MODEL is never used.
5937 */
5938 switch (location) {
5939 case TGSI_INTERPOLATE_LOC_SAMPLE:
5940 key->ps_prolog.color_interp_vgpr_index[i] =
5941 separate_prolog ? 6 : 9;
5942 shader->config.spi_ps_input_ena |=
5943 S_0286CC_LINEAR_SAMPLE_ENA(1);
5944 break;
5945 case TGSI_INTERPOLATE_LOC_CENTER:
5946 key->ps_prolog.color_interp_vgpr_index[i] =
5947 separate_prolog ? 8 : 11;
5948 shader->config.spi_ps_input_ena |=
5949 S_0286CC_LINEAR_CENTER_ENA(1);
5950 break;
5951 case TGSI_INTERPOLATE_LOC_CENTROID:
5952 key->ps_prolog.color_interp_vgpr_index[i] =
5953 separate_prolog ? 10 : 13;
5954 shader->config.spi_ps_input_ena |=
5955 S_0286CC_LINEAR_CENTROID_ENA(1);
5956 break;
5957 default:
5958 assert(0);
5959 }
5960 break;
5961 default:
5962 assert(0);
5963 }
5964 }
5965 }
5966 }
5967
5968 /**
5969 * Check whether a PS prolog is required based on the key.
5970 */
5971 static bool si_need_ps_prolog(const union si_shader_part_key *key)
5972 {
5973 return key->ps_prolog.colors_read ||
5974 key->ps_prolog.states.force_persp_sample_interp ||
5975 key->ps_prolog.states.force_linear_sample_interp ||
5976 key->ps_prolog.states.force_persp_center_interp ||
5977 key->ps_prolog.states.force_linear_center_interp ||
5978 key->ps_prolog.states.bc_optimize_for_persp ||
5979 key->ps_prolog.states.bc_optimize_for_linear ||
5980 key->ps_prolog.states.poly_stipple;
5981 }
5982
5983 /**
5984 * Compute the PS epilog key, which contains all the information needed to
5985 * build the PS epilog function.
5986 */
5987 static void si_get_ps_epilog_key(struct si_shader *shader,
5988 union si_shader_part_key *key)
5989 {
5990 struct tgsi_shader_info *info = &shader->selector->info;
5991 memset(key, 0, sizeof(*key));
5992 key->ps_epilog.colors_written = info->colors_written;
5993 key->ps_epilog.writes_z = info->writes_z;
5994 key->ps_epilog.writes_stencil = info->writes_stencil;
5995 key->ps_epilog.writes_samplemask = info->writes_samplemask;
5996 key->ps_epilog.states = shader->key.part.ps.epilog;
5997 }
5998
5999 /**
6000 * Build the GS prolog function. Rotate the input vertices for triangle strips
6001 * with adjacency.
6002 */
6003 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
6004 union si_shader_part_key *key)
6005 {
6006 unsigned num_sgprs, num_vgprs;
6007 struct gallivm_state *gallivm = &ctx->gallivm;
6008 struct si_function_info fninfo;
6009 LLVMBuilderRef builder = gallivm->builder;
6010 LLVMTypeRef returns[48];
6011 LLVMValueRef func, ret;
6012
6013 si_init_function_info(&fninfo);
6014
6015 if (ctx->screen->b.chip_class >= GFX9) {
6016 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
6017 num_vgprs = 5; /* ES inputs are not needed by GS */
6018 } else {
6019 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
6020 num_vgprs = 8;
6021 }
6022
6023 for (unsigned i = 0; i < num_sgprs; ++i) {
6024 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6025 returns[i] = ctx->i32;
6026 }
6027
6028 for (unsigned i = 0; i < num_vgprs; ++i) {
6029 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6030 returns[num_sgprs + i] = ctx->f32;
6031 }
6032
6033 /* Create the function. */
6034 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
6035 &fninfo, 0);
6036 func = ctx->main_fn;
6037
6038 /* Set the full EXEC mask for the prolog, because we are only fiddling
6039 * with registers here. The main shader part will set the correct EXEC
6040 * mask.
6041 */
6042 if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
6043 si_init_exec_full_mask(ctx);
6044
6045 /* Copy inputs to outputs. This should be no-op, as the registers match,
6046 * but it will prevent the compiler from overwriting them unintentionally.
6047 */
6048 ret = ctx->return_value;
6049 for (unsigned i = 0; i < num_sgprs; i++) {
6050 LLVMValueRef p = LLVMGetParam(func, i);
6051 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
6052 }
6053 for (unsigned i = 0; i < num_vgprs; i++) {
6054 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
6055 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
6056 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
6057 }
6058
6059 if (key->gs_prolog.states.tri_strip_adj_fix) {
6060 /* Remap the input vertices for every other primitive. */
6061 const unsigned gfx6_vtx_params[6] = {
6062 num_sgprs,
6063 num_sgprs + 1,
6064 num_sgprs + 3,
6065 num_sgprs + 4,
6066 num_sgprs + 5,
6067 num_sgprs + 6
6068 };
6069 const unsigned gfx9_vtx_params[3] = {
6070 num_sgprs,
6071 num_sgprs + 1,
6072 num_sgprs + 4,
6073 };
6074 LLVMValueRef vtx_in[6], vtx_out[6];
6075 LLVMValueRef prim_id, rotate;
6076
6077 if (ctx->screen->b.chip_class >= GFX9) {
6078 for (unsigned i = 0; i < 3; i++) {
6079 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
6080 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
6081 }
6082 } else {
6083 for (unsigned i = 0; i < 6; i++)
6084 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
6085 }
6086
6087 prim_id = LLVMGetParam(func, num_sgprs + 2);
6088 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
6089
6090 for (unsigned i = 0; i < 6; ++i) {
6091 LLVMValueRef base, rotated;
6092 base = vtx_in[i];
6093 rotated = vtx_in[(i + 4) % 6];
6094 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
6095 }
6096
6097 if (ctx->screen->b.chip_class >= GFX9) {
6098 for (unsigned i = 0; i < 3; i++) {
6099 LLVMValueRef hi, out;
6100
6101 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
6102 LLVMConstInt(ctx->i32, 16, 0), "");
6103 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
6104 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
6105 ret = LLVMBuildInsertValue(builder, ret, out,
6106 gfx9_vtx_params[i], "");
6107 }
6108 } else {
6109 for (unsigned i = 0; i < 6; i++) {
6110 LLVMValueRef out;
6111
6112 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
6113 ret = LLVMBuildInsertValue(builder, ret, out,
6114 gfx6_vtx_params[i], "");
6115 }
6116 }
6117 }
6118
6119 LLVMBuildRet(builder, ret);
6120 }
6121
6122 /**
6123 * Given a list of shader part functions, build a wrapper function that
6124 * runs them in sequence to form a monolithic shader.
6125 */
6126 static void si_build_wrapper_function(struct si_shader_context *ctx,
6127 LLVMValueRef *parts,
6128 unsigned num_parts,
6129 unsigned main_part,
6130 unsigned next_shader_first_part)
6131 {
6132 struct gallivm_state *gallivm = &ctx->gallivm;
6133 LLVMBuilderRef builder = ctx->gallivm.builder;
6134 /* PS epilog has one arg per color component; gfx9 merged shader
6135 * prologs need to forward 32 user SGPRs.
6136 */
6137 struct si_function_info fninfo;
6138 LLVMValueRef initial[64], out[64];
6139 LLVMTypeRef function_type;
6140 unsigned num_first_params;
6141 unsigned num_out, initial_num_out;
6142 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
6143 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
6144 unsigned num_sgprs, num_vgprs;
6145 unsigned gprs;
6146 struct lp_build_if_state if_state;
6147
6148 si_init_function_info(&fninfo);
6149
6150 for (unsigned i = 0; i < num_parts; ++i) {
6151 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
6152 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6153 }
6154
6155 /* The parameters of the wrapper function correspond to those of the
6156 * first part in terms of SGPRs and VGPRs, but we use the types of the
6157 * main part to get the right types. This is relevant for the
6158 * dereferenceable attribute on descriptor table pointers.
6159 */
6160 num_sgprs = 0;
6161 num_vgprs = 0;
6162
6163 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6164 num_first_params = LLVMCountParamTypes(function_type);
6165
6166 for (unsigned i = 0; i < num_first_params; ++i) {
6167 LLVMValueRef param = LLVMGetParam(parts[0], i);
6168
6169 if (ac_is_sgpr_param(param)) {
6170 assert(num_vgprs == 0);
6171 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6172 } else {
6173 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6174 }
6175 }
6176
6177 gprs = 0;
6178 while (gprs < num_sgprs + num_vgprs) {
6179 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6180 LLVMTypeRef type = LLVMTypeOf(param);
6181 unsigned size = ac_get_type_size(type) / 4;
6182
6183 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6184
6185 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6186 assert(gprs + size <= num_sgprs + num_vgprs &&
6187 (gprs >= num_sgprs || gprs + size <= num_sgprs));
6188
6189 gprs += size;
6190 }
6191
6192 si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6193 si_get_max_workgroup_size(ctx->shader));
6194
6195 if (is_merged_shader(ctx->shader))
6196 si_init_exec_full_mask(ctx);
6197
6198 /* Record the arguments of the function as if they were an output of
6199 * a previous part.
6200 */
6201 num_out = 0;
6202 num_out_sgpr = 0;
6203
6204 for (unsigned i = 0; i < fninfo.num_params; ++i) {
6205 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6206 LLVMTypeRef param_type = LLVMTypeOf(param);
6207 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6208 unsigned size = ac_get_type_size(param_type) / 4;
6209
6210 if (size == 1) {
6211 if (param_type != out_type)
6212 param = LLVMBuildBitCast(builder, param, out_type, "");
6213 out[num_out++] = param;
6214 } else {
6215 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6216
6217 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6218 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6219 param_type = ctx->i64;
6220 }
6221
6222 if (param_type != vector_type)
6223 param = LLVMBuildBitCast(builder, param, vector_type, "");
6224
6225 for (unsigned j = 0; j < size; ++j)
6226 out[num_out++] = LLVMBuildExtractElement(
6227 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6228 }
6229
6230 if (i < fninfo.num_sgpr_params)
6231 num_out_sgpr = num_out;
6232 }
6233
6234 memcpy(initial, out, sizeof(out));
6235 initial_num_out = num_out;
6236 initial_num_out_sgpr = num_out_sgpr;
6237
6238 /* Now chain the parts. */
6239 for (unsigned part = 0; part < num_parts; ++part) {
6240 LLVMValueRef in[48];
6241 LLVMValueRef ret;
6242 LLVMTypeRef ret_type;
6243 unsigned out_idx = 0;
6244 unsigned num_params = LLVMCountParams(parts[part]);
6245
6246 /* Merged shaders are executed conditionally depending
6247 * on the number of enabled threads passed in the input SGPRs. */
6248 if (is_merged_shader(ctx->shader) && part == 0) {
6249 LLVMValueRef ena, count = initial[3];
6250
6251 count = LLVMBuildAnd(builder, count,
6252 LLVMConstInt(ctx->i32, 0x7f, 0), "");
6253 ena = LLVMBuildICmp(builder, LLVMIntULT,
6254 ac_get_thread_id(&ctx->ac), count, "");
6255 lp_build_if(&if_state, &ctx->gallivm, ena);
6256 }
6257
6258 /* Derive arguments for the next part from outputs of the
6259 * previous one.
6260 */
6261 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6262 LLVMValueRef param;
6263 LLVMTypeRef param_type;
6264 bool is_sgpr;
6265 unsigned param_size;
6266 LLVMValueRef arg = NULL;
6267
6268 param = LLVMGetParam(parts[part], param_idx);
6269 param_type = LLVMTypeOf(param);
6270 param_size = ac_get_type_size(param_type) / 4;
6271 is_sgpr = ac_is_sgpr_param(param);
6272
6273 if (is_sgpr) {
6274 #if HAVE_LLVM < 0x0400
6275 LLVMRemoveAttribute(param, LLVMByValAttribute);
6276 #else
6277 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
6278 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
6279 #endif
6280 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
6281 }
6282
6283 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6284 assert(is_sgpr || out_idx >= num_out_sgpr);
6285
6286 if (param_size == 1)
6287 arg = out[out_idx];
6288 else
6289 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
6290
6291 if (LLVMTypeOf(arg) != param_type) {
6292 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6293 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6294 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6295 } else {
6296 arg = LLVMBuildBitCast(builder, arg, param_type, "");
6297 }
6298 }
6299
6300 in[param_idx] = arg;
6301 out_idx += param_size;
6302 }
6303
6304 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6305
6306 if (is_merged_shader(ctx->shader) &&
6307 part + 1 == next_shader_first_part) {
6308 lp_build_endif(&if_state);
6309
6310 /* The second half of the merged shader should use
6311 * the inputs from the toplevel (wrapper) function,
6312 * not the return value from the last call.
6313 *
6314 * That's because the last call was executed condi-
6315 * tionally, so we can't consume it in the main
6316 * block.
6317 */
6318 memcpy(out, initial, sizeof(initial));
6319 num_out = initial_num_out;
6320 num_out_sgpr = initial_num_out_sgpr;
6321 continue;
6322 }
6323
6324 /* Extract the returned GPRs. */
6325 ret_type = LLVMTypeOf(ret);
6326 num_out = 0;
6327 num_out_sgpr = 0;
6328
6329 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6330 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6331
6332 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6333
6334 for (unsigned i = 0; i < ret_size; ++i) {
6335 LLVMValueRef val =
6336 LLVMBuildExtractValue(builder, ret, i, "");
6337
6338 assert(num_out < ARRAY_SIZE(out));
6339 out[num_out++] = val;
6340
6341 if (LLVMTypeOf(val) == ctx->i32) {
6342 assert(num_out_sgpr + 1 == num_out);
6343 num_out_sgpr = num_out;
6344 }
6345 }
6346 }
6347 }
6348
6349 LLVMBuildRetVoid(builder);
6350 }
6351
6352 int si_compile_tgsi_shader(struct si_screen *sscreen,
6353 LLVMTargetMachineRef tm,
6354 struct si_shader *shader,
6355 bool is_monolithic,
6356 struct pipe_debug_callback *debug)
6357 {
6358 struct si_shader_selector *sel = shader->selector;
6359 struct si_shader_context ctx;
6360 int r = -1;
6361
6362 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6363 * conversion fails. */
6364 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6365 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6366 if (sel->tokens)
6367 tgsi_dump(sel->tokens, 0);
6368 else
6369 nir_print_shader(sel->nir, stderr);
6370 si_dump_streamout(&sel->so);
6371 }
6372
6373 si_init_shader_ctx(&ctx, sscreen, tm);
6374 si_llvm_context_set_tgsi(&ctx, shader);
6375 ctx.separate_prolog = !is_monolithic;
6376
6377 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6378 sizeof(shader->info.vs_output_param_offset));
6379
6380 shader->info.uses_instanceid = sel->info.uses_instanceid;
6381
6382 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
6383 si_llvm_dispose(&ctx);
6384 return -1;
6385 }
6386
6387 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6388 LLVMValueRef parts[2];
6389 bool need_prolog = sel->vs_needs_prolog;
6390
6391 parts[1] = ctx.main_fn;
6392
6393 if (need_prolog) {
6394 union si_shader_part_key prolog_key;
6395 si_get_vs_prolog_key(&sel->info,
6396 shader->info.num_input_sgprs,
6397 &shader->key.part.vs.prolog,
6398 shader, &prolog_key);
6399 si_build_vs_prolog_function(&ctx, &prolog_key);
6400 parts[0] = ctx.main_fn;
6401 }
6402
6403 si_build_wrapper_function(&ctx, parts + !need_prolog,
6404 1 + need_prolog, need_prolog, 0);
6405 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6406 if (sscreen->b.chip_class >= GFX9) {
6407 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6408 LLVMValueRef parts[4];
6409 bool vs_needs_prolog =
6410 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6411
6412 /* TCS main part */
6413 parts[2] = ctx.main_fn;
6414
6415 /* TCS epilog */
6416 union si_shader_part_key tcs_epilog_key;
6417 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6418 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6419 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6420 parts[3] = ctx.main_fn;
6421
6422 /* VS prolog */
6423 if (vs_needs_prolog) {
6424 union si_shader_part_key vs_prolog_key;
6425 si_get_vs_prolog_key(&ls->info,
6426 shader->info.num_input_sgprs,
6427 &shader->key.part.tcs.ls_prolog,
6428 shader, &vs_prolog_key);
6429 vs_prolog_key.vs_prolog.is_monolithic = true;
6430 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6431 parts[0] = ctx.main_fn;
6432 }
6433
6434 /* VS as LS main part */
6435 struct si_shader shader_ls = {};
6436 shader_ls.selector = ls;
6437 shader_ls.key.as_ls = 1;
6438 shader_ls.key.mono = shader->key.mono;
6439 shader_ls.key.opt = shader->key.opt;
6440 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6441
6442 if (!si_compile_tgsi_main(&ctx, true)) {
6443 si_llvm_dispose(&ctx);
6444 return -1;
6445 }
6446 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6447 parts[1] = ctx.main_fn;
6448
6449 /* Reset the shader context. */
6450 ctx.shader = shader;
6451 ctx.type = PIPE_SHADER_TESS_CTRL;
6452
6453 si_build_wrapper_function(&ctx,
6454 parts + !vs_needs_prolog,
6455 4 - !vs_needs_prolog, 0,
6456 vs_needs_prolog ? 2 : 1);
6457 } else {
6458 LLVMValueRef parts[2];
6459 union si_shader_part_key epilog_key;
6460
6461 parts[0] = ctx.main_fn;
6462
6463 memset(&epilog_key, 0, sizeof(epilog_key));
6464 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6465 si_build_tcs_epilog_function(&ctx, &epilog_key);
6466 parts[1] = ctx.main_fn;
6467
6468 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6469 }
6470 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6471 if (ctx.screen->b.chip_class >= GFX9) {
6472 struct si_shader_selector *es = shader->key.part.gs.es;
6473 LLVMValueRef es_prolog = NULL;
6474 LLVMValueRef es_main = NULL;
6475 LLVMValueRef gs_prolog = NULL;
6476 LLVMValueRef gs_main = ctx.main_fn;
6477
6478 /* GS prolog */
6479 union si_shader_part_key gs_prolog_key;
6480 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6481 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6482 gs_prolog_key.gs_prolog.is_monolithic = true;
6483 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6484 gs_prolog = ctx.main_fn;
6485
6486 /* ES prolog */
6487 if (es->vs_needs_prolog) {
6488 union si_shader_part_key vs_prolog_key;
6489 si_get_vs_prolog_key(&es->info,
6490 shader->info.num_input_sgprs,
6491 &shader->key.part.tcs.ls_prolog,
6492 shader, &vs_prolog_key);
6493 vs_prolog_key.vs_prolog.is_monolithic = true;
6494 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6495 es_prolog = ctx.main_fn;
6496 }
6497
6498 /* ES main part */
6499 struct si_shader shader_es = {};
6500 shader_es.selector = es;
6501 shader_es.key.as_es = 1;
6502 shader_es.key.mono = shader->key.mono;
6503 shader_es.key.opt = shader->key.opt;
6504 si_llvm_context_set_tgsi(&ctx, &shader_es);
6505
6506 if (!si_compile_tgsi_main(&ctx, true)) {
6507 si_llvm_dispose(&ctx);
6508 return -1;
6509 }
6510 shader->info.uses_instanceid |= es->info.uses_instanceid;
6511 es_main = ctx.main_fn;
6512
6513 /* Reset the shader context. */
6514 ctx.shader = shader;
6515 ctx.type = PIPE_SHADER_GEOMETRY;
6516
6517 /* Prepare the array of shader parts. */
6518 LLVMValueRef parts[4];
6519 unsigned num_parts = 0, main_part, next_first_part;
6520
6521 if (es_prolog)
6522 parts[num_parts++] = es_prolog;
6523
6524 parts[main_part = num_parts++] = es_main;
6525 parts[next_first_part = num_parts++] = gs_prolog;
6526 parts[num_parts++] = gs_main;
6527
6528 si_build_wrapper_function(&ctx, parts, num_parts,
6529 main_part, next_first_part);
6530 } else {
6531 LLVMValueRef parts[2];
6532 union si_shader_part_key prolog_key;
6533
6534 parts[1] = ctx.main_fn;
6535
6536 memset(&prolog_key, 0, sizeof(prolog_key));
6537 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6538 si_build_gs_prolog_function(&ctx, &prolog_key);
6539 parts[0] = ctx.main_fn;
6540
6541 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6542 }
6543 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6544 LLVMValueRef parts[3];
6545 union si_shader_part_key prolog_key;
6546 union si_shader_part_key epilog_key;
6547 bool need_prolog;
6548
6549 si_get_ps_prolog_key(shader, &prolog_key, false);
6550 need_prolog = si_need_ps_prolog(&prolog_key);
6551
6552 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6553
6554 if (need_prolog) {
6555 si_build_ps_prolog_function(&ctx, &prolog_key);
6556 parts[0] = ctx.main_fn;
6557 }
6558
6559 si_get_ps_epilog_key(shader, &epilog_key);
6560 si_build_ps_epilog_function(&ctx, &epilog_key);
6561 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6562
6563 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6564 need_prolog ? 1 : 0, 0);
6565 }
6566
6567 si_llvm_optimize_module(&ctx);
6568
6569 /* Post-optimization transformations and analysis. */
6570 si_optimize_vs_outputs(&ctx);
6571
6572 if ((debug && debug->debug_message) ||
6573 r600_can_dump_shader(&sscreen->b, ctx.type))
6574 si_count_scratch_private_memory(&ctx);
6575
6576 /* Compile to bytecode. */
6577 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6578 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
6579 si_llvm_dispose(&ctx);
6580 if (r) {
6581 fprintf(stderr, "LLVM failed to compile shader\n");
6582 return r;
6583 }
6584
6585 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6586 * LLVM 3.9svn has this bug.
6587 */
6588 if (sel->type == PIPE_SHADER_COMPUTE) {
6589 unsigned wave_size = 64;
6590 unsigned max_vgprs = 256;
6591 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6592 unsigned max_sgprs_per_wave = 128;
6593 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6594 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6595 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6596
6597 max_vgprs = max_vgprs / min_waves_per_simd;
6598 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6599
6600 if (shader->config.num_sgprs > max_sgprs ||
6601 shader->config.num_vgprs > max_vgprs) {
6602 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6603 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6604 shader->config.num_sgprs, shader->config.num_vgprs,
6605 max_sgprs, max_vgprs);
6606
6607 /* Just terminate the process, because dependent
6608 * shaders can hang due to bad input data, but use
6609 * the env var to allow shader-db to work.
6610 */
6611 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6612 abort();
6613 }
6614 }
6615
6616 /* Add the scratch offset to input SGPRs. */
6617 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
6618 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6619
6620 /* Calculate the number of fragment input VGPRs. */
6621 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6622 shader->info.num_input_vgprs = 0;
6623 shader->info.face_vgpr_index = -1;
6624
6625 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6626 shader->info.num_input_vgprs += 2;
6627 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6628 shader->info.num_input_vgprs += 2;
6629 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6630 shader->info.num_input_vgprs += 2;
6631 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6632 shader->info.num_input_vgprs += 3;
6633 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6634 shader->info.num_input_vgprs += 2;
6635 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6636 shader->info.num_input_vgprs += 2;
6637 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6638 shader->info.num_input_vgprs += 2;
6639 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6640 shader->info.num_input_vgprs += 1;
6641 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6642 shader->info.num_input_vgprs += 1;
6643 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6644 shader->info.num_input_vgprs += 1;
6645 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6646 shader->info.num_input_vgprs += 1;
6647 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6648 shader->info.num_input_vgprs += 1;
6649 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6650 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6651 shader->info.num_input_vgprs += 1;
6652 }
6653 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6654 shader->info.num_input_vgprs += 1;
6655 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6656 shader->info.num_input_vgprs += 1;
6657 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6658 shader->info.num_input_vgprs += 1;
6659 }
6660
6661 return 0;
6662 }
6663
6664 /**
6665 * Create, compile and return a shader part (prolog or epilog).
6666 *
6667 * \param sscreen screen
6668 * \param list list of shader parts of the same category
6669 * \param type shader type
6670 * \param key shader part key
6671 * \param prolog whether the part being requested is a prolog
6672 * \param tm LLVM target machine
6673 * \param debug debug callback
6674 * \param build the callback responsible for building the main function
6675 * \return non-NULL on success
6676 */
6677 static struct si_shader_part *
6678 si_get_shader_part(struct si_screen *sscreen,
6679 struct si_shader_part **list,
6680 enum pipe_shader_type type,
6681 bool prolog,
6682 union si_shader_part_key *key,
6683 LLVMTargetMachineRef tm,
6684 struct pipe_debug_callback *debug,
6685 void (*build)(struct si_shader_context *,
6686 union si_shader_part_key *),
6687 const char *name)
6688 {
6689 struct si_shader_part *result;
6690
6691 mtx_lock(&sscreen->shader_parts_mutex);
6692
6693 /* Find existing. */
6694 for (result = *list; result; result = result->next) {
6695 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6696 mtx_unlock(&sscreen->shader_parts_mutex);
6697 return result;
6698 }
6699 }
6700
6701 /* Compile a new one. */
6702 result = CALLOC_STRUCT(si_shader_part);
6703 result->key = *key;
6704
6705 struct si_shader shader = {};
6706 struct si_shader_context ctx;
6707 struct gallivm_state *gallivm = &ctx.gallivm;
6708
6709 si_init_shader_ctx(&ctx, sscreen, tm);
6710 ctx.shader = &shader;
6711 ctx.type = type;
6712
6713 switch (type) {
6714 case PIPE_SHADER_VERTEX:
6715 break;
6716 case PIPE_SHADER_TESS_CTRL:
6717 assert(!prolog);
6718 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6719 break;
6720 case PIPE_SHADER_GEOMETRY:
6721 assert(prolog);
6722 break;
6723 case PIPE_SHADER_FRAGMENT:
6724 if (prolog)
6725 shader.key.part.ps.prolog = key->ps_prolog.states;
6726 else
6727 shader.key.part.ps.epilog = key->ps_epilog.states;
6728 break;
6729 default:
6730 unreachable("bad shader part");
6731 }
6732
6733 build(&ctx, key);
6734
6735 /* Compile. */
6736 si_llvm_optimize_module(&ctx);
6737
6738 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6739 gallivm->module, debug, ctx.type, name)) {
6740 FREE(result);
6741 result = NULL;
6742 goto out;
6743 }
6744
6745 result->next = *list;
6746 *list = result;
6747
6748 out:
6749 si_llvm_dispose(&ctx);
6750 mtx_unlock(&sscreen->shader_parts_mutex);
6751 return result;
6752 }
6753
6754 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6755 {
6756 struct gallivm_state *gallivm = &ctx->gallivm;
6757 LLVMValueRef ptr[2], list;
6758
6759 /* Get the pointer to rw buffers. */
6760 ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
6761 ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
6762 list = lp_build_gather_values(gallivm, ptr, 2);
6763 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
6764 list = LLVMBuildIntToPtr(gallivm->builder, list,
6765 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6766 return list;
6767 }
6768
6769 /**
6770 * Build the vertex shader prolog function.
6771 *
6772 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6773 * All inputs are returned unmodified. The vertex load indices are
6774 * stored after them, which will be used by the API VS for fetching inputs.
6775 *
6776 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6777 * input_v0,
6778 * input_v1,
6779 * input_v2,
6780 * input_v3,
6781 * (VertexID + BaseVertex),
6782 * (InstanceID + StartInstance),
6783 * (InstanceID / 2 + StartInstance)
6784 */
6785 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
6786 union si_shader_part_key *key)
6787 {
6788 struct gallivm_state *gallivm = &ctx->gallivm;
6789 struct si_function_info fninfo;
6790 LLVMTypeRef *returns;
6791 LLVMValueRef ret, func;
6792 int num_returns, i;
6793 unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
6794 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
6795 LLVMValueRef input_vgprs[9];
6796 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
6797 num_input_vgprs;
6798 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
6799
6800 si_init_function_info(&fninfo);
6801
6802 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6803 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
6804 sizeof(LLVMTypeRef));
6805 num_returns = 0;
6806
6807 /* Declare input and output SGPRs. */
6808 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6809 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6810 returns[num_returns++] = ctx->i32;
6811 }
6812
6813 /* Preloaded VGPRs (outputs must be floats) */
6814 for (i = 0; i < num_input_vgprs; i++) {
6815 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
6816 returns[num_returns++] = ctx->f32;
6817 }
6818
6819 /* Vertex load indices. */
6820 for (i = 0; i <= key->vs_prolog.last_input; i++)
6821 returns[num_returns++] = ctx->f32;
6822
6823 /* Create the function. */
6824 si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
6825 func = ctx->main_fn;
6826
6827 if (key->vs_prolog.num_merged_next_stage_vgprs) {
6828 if (!key->vs_prolog.is_monolithic)
6829 si_init_exec_from_input(ctx, 3, 0);
6830
6831 if (key->vs_prolog.as_ls &&
6832 (ctx->screen->b.family == CHIP_VEGA10 ||
6833 ctx->screen->b.family == CHIP_RAVEN)) {
6834 /* If there are no HS threads, SPI loads the LS VGPRs
6835 * starting at VGPR 0. Shift them back to where they
6836 * belong.
6837 */
6838 LLVMValueRef has_hs_threads =
6839 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
6840 unpack_param(ctx, 3, 8, 8),
6841 ctx->i32_0, "");
6842
6843 for (i = 4; i > 0; --i) {
6844 input_vgprs[i + 1] =
6845 LLVMBuildSelect(gallivm->builder, has_hs_threads,
6846 input_vgprs[i + 1],
6847 input_vgprs[i - 1], "");
6848 }
6849 }
6850 }
6851
6852 ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
6853 ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
6854
6855 /* Copy inputs to outputs. This should be no-op, as the registers match,
6856 * but it will prevent the compiler from overwriting them unintentionally.
6857 */
6858 ret = ctx->return_value;
6859 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6860 LLVMValueRef p = LLVMGetParam(func, i);
6861 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6862 }
6863 for (i = 0; i < num_input_vgprs; i++) {
6864 LLVMValueRef p = input_vgprs[i];
6865 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
6866 ret = LLVMBuildInsertValue(gallivm->builder, ret, p,
6867 key->vs_prolog.num_input_sgprs + i, "");
6868 }
6869
6870 /* Compute vertex load indices from instance divisors. */
6871 LLVMValueRef instance_divisor_constbuf = NULL;
6872
6873 if (key->vs_prolog.states.instance_divisor_is_fetched) {
6874 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
6875 LLVMValueRef buf_index =
6876 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
6877 instance_divisor_constbuf =
6878 ac_build_indexed_load_const(&ctx->ac, list, buf_index);
6879 }
6880
6881 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6882 bool divisor_is_one =
6883 key->vs_prolog.states.instance_divisor_is_one & (1u << i);
6884 bool divisor_is_fetched =
6885 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
6886 LLVMValueRef index;
6887
6888 if (divisor_is_one || divisor_is_fetched) {
6889 LLVMValueRef divisor = ctx->i32_1;
6890
6891 if (divisor_is_fetched) {
6892 divisor = buffer_load_const(ctx, instance_divisor_constbuf,
6893 LLVMConstInt(ctx->i32, i * 4, 0));
6894 divisor = LLVMBuildBitCast(gallivm->builder, divisor,
6895 ctx->i32, "");
6896 }
6897
6898 /* InstanceID / Divisor + StartInstance */
6899 index = get_instance_index_for_fetch(ctx,
6900 user_sgpr_base +
6901 SI_SGPR_START_INSTANCE,
6902 divisor);
6903 } else {
6904 /* VertexID + BaseVertex */
6905 index = LLVMBuildAdd(gallivm->builder,
6906 ctx->abi.vertex_id,
6907 LLVMGetParam(func, user_sgpr_base +
6908 SI_SGPR_BASE_VERTEX), "");
6909 }
6910
6911 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
6912 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6913 fninfo.num_params + i, "");
6914 }
6915
6916 si_llvm_build_ret(ctx, ret);
6917 }
6918
6919 static bool si_get_vs_prolog(struct si_screen *sscreen,
6920 LLVMTargetMachineRef tm,
6921 struct si_shader *shader,
6922 struct pipe_debug_callback *debug,
6923 struct si_shader *main_part,
6924 const struct si_vs_prolog_bits *key)
6925 {
6926 struct si_shader_selector *vs = main_part->selector;
6927
6928 if (!si_vs_needs_prolog(vs, key))
6929 return true;
6930
6931 /* Get the prolog. */
6932 union si_shader_part_key prolog_key;
6933 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
6934 key, shader, &prolog_key);
6935
6936 shader->prolog =
6937 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6938 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
6939 debug, si_build_vs_prolog_function,
6940 "Vertex Shader Prolog");
6941 return shader->prolog != NULL;
6942 }
6943
6944 /**
6945 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6946 */
6947 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6948 LLVMTargetMachineRef tm,
6949 struct si_shader *shader,
6950 struct pipe_debug_callback *debug)
6951 {
6952 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
6953 &shader->key.part.vs.prolog);
6954 }
6955
6956 /**
6957 * Compile the TCS epilog function. This writes tesselation factors to memory
6958 * based on the output primitive type of the tesselator (determined by TES).
6959 */
6960 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
6961 union si_shader_part_key *key)
6962 {
6963 struct gallivm_state *gallivm = &ctx->gallivm;
6964 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
6965 struct si_function_info fninfo;
6966 LLVMValueRef func;
6967
6968 si_init_function_info(&fninfo);
6969
6970 if (ctx->screen->b.chip_class >= GFX9) {
6971 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6972 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6973 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
6974 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6975 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6976 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6977 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6978 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6979 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6980 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6981 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6982 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6983 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6984 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6985 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6986 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6987 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6988 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6989 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6990 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6991 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6992 } else {
6993 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6994 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6995 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6996 add_arg(&fninfo, ARG_SGPR, ctx->i64);
6997 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
6998 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6999 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7000 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7001 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7002 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7003 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7004 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7005 }
7006
7007 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7008 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7009 unsigned tess_factors_idx =
7010 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
7011 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
7012 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
7013
7014 /* Create the function. */
7015 si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
7016 ctx->screen->b.chip_class >= CIK ? 128 : 64);
7017 declare_lds_as_pointer(ctx);
7018 func = ctx->main_fn;
7019
7020 si_write_tess_factors(bld_base,
7021 LLVMGetParam(func, tess_factors_idx),
7022 LLVMGetParam(func, tess_factors_idx + 1),
7023 LLVMGetParam(func, tess_factors_idx + 2));
7024
7025 LLVMBuildRetVoid(gallivm->builder);
7026 }
7027
7028 /**
7029 * Select and compile (or reuse) TCS parts (epilog).
7030 */
7031 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7032 LLVMTargetMachineRef tm,
7033 struct si_shader *shader,
7034 struct pipe_debug_callback *debug)
7035 {
7036 if (sscreen->b.chip_class >= GFX9) {
7037 struct si_shader *ls_main_part =
7038 shader->key.part.tcs.ls->main_shader_part_ls;
7039
7040 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
7041 &shader->key.part.tcs.ls_prolog))
7042 return false;
7043
7044 shader->previous_stage = ls_main_part;
7045 }
7046
7047 /* Get the epilog. */
7048 union si_shader_part_key epilog_key;
7049 memset(&epilog_key, 0, sizeof(epilog_key));
7050 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7051
7052 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7053 PIPE_SHADER_TESS_CTRL, false,
7054 &epilog_key, tm, debug,
7055 si_build_tcs_epilog_function,
7056 "Tessellation Control Shader Epilog");
7057 return shader->epilog != NULL;
7058 }
7059
7060 /**
7061 * Select and compile (or reuse) GS parts (prolog).
7062 */
7063 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7064 LLVMTargetMachineRef tm,
7065 struct si_shader *shader,
7066 struct pipe_debug_callback *debug)
7067 {
7068 if (sscreen->b.chip_class >= GFX9) {
7069 struct si_shader *es_main_part =
7070 shader->key.part.gs.es->main_shader_part_es;
7071
7072 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7073 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
7074 &shader->key.part.gs.vs_prolog))
7075 return false;
7076
7077 shader->previous_stage = es_main_part;
7078 }
7079
7080 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7081 return true;
7082
7083 union si_shader_part_key prolog_key;
7084 memset(&prolog_key, 0, sizeof(prolog_key));
7085 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7086
7087 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7088 PIPE_SHADER_GEOMETRY, true,
7089 &prolog_key, tm, debug,
7090 si_build_gs_prolog_function,
7091 "Geometry Shader Prolog");
7092 return shader->prolog2 != NULL;
7093 }
7094
7095 /**
7096 * Build the pixel shader prolog function. This handles:
7097 * - two-side color selection and interpolation
7098 * - overriding interpolation parameters for the API PS
7099 * - polygon stippling
7100 *
7101 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7102 * overriden by other states. (e.g. per-sample interpolation)
7103 * Interpolated colors are stored after the preloaded VGPRs.
7104 */
7105 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7106 union si_shader_part_key *key)
7107 {
7108 struct gallivm_state *gallivm = &ctx->gallivm;
7109 struct si_function_info fninfo;
7110 LLVMValueRef ret, func;
7111 int num_returns, i, num_color_channels;
7112
7113 assert(si_need_ps_prolog(key));
7114
7115 si_init_function_info(&fninfo);
7116
7117 /* Declare inputs. */
7118 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7119 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7120
7121 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7122 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7123
7124 /* Declare outputs (same as inputs + add colors if needed) */
7125 num_returns = fninfo.num_params;
7126 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7127 for (i = 0; i < num_color_channels; i++)
7128 fninfo.types[num_returns++] = ctx->f32;
7129
7130 /* Create the function. */
7131 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7132 &fninfo, 0);
7133 func = ctx->main_fn;
7134
7135 /* Copy inputs to outputs. This should be no-op, as the registers match,
7136 * but it will prevent the compiler from overwriting them unintentionally.
7137 */
7138 ret = ctx->return_value;
7139 for (i = 0; i < fninfo.num_params; i++) {
7140 LLVMValueRef p = LLVMGetParam(func, i);
7141 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7142 }
7143
7144 /* Polygon stippling. */
7145 if (key->ps_prolog.states.poly_stipple) {
7146 /* POS_FIXED_PT is always last. */
7147 unsigned pos = key->ps_prolog.num_input_sgprs +
7148 key->ps_prolog.num_input_vgprs - 1;
7149 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7150
7151 si_llvm_emit_polygon_stipple(ctx, list, pos);
7152 }
7153
7154 if (key->ps_prolog.states.bc_optimize_for_persp ||
7155 key->ps_prolog.states.bc_optimize_for_linear) {
7156 unsigned i, base = key->ps_prolog.num_input_sgprs;
7157 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7158
7159 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7160 * The hw doesn't compute CENTROID if the whole wave only
7161 * contains fully-covered quads.
7162 *
7163 * PRIM_MASK is after user SGPRs.
7164 */
7165 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7166 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7167 LLVMConstInt(ctx->i32, 31, 0), "");
7168 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7169 ctx->i1, "");
7170
7171 if (key->ps_prolog.states.bc_optimize_for_persp) {
7172 /* Read PERSP_CENTER. */
7173 for (i = 0; i < 2; i++)
7174 center[i] = LLVMGetParam(func, base + 2 + i);
7175 /* Read PERSP_CENTROID. */
7176 for (i = 0; i < 2; i++)
7177 centroid[i] = LLVMGetParam(func, base + 4 + i);
7178 /* Select PERSP_CENTROID. */
7179 for (i = 0; i < 2; i++) {
7180 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7181 center[i], centroid[i], "");
7182 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7183 tmp, base + 4 + i, "");
7184 }
7185 }
7186 if (key->ps_prolog.states.bc_optimize_for_linear) {
7187 /* Read LINEAR_CENTER. */
7188 for (i = 0; i < 2; i++)
7189 center[i] = LLVMGetParam(func, base + 8 + i);
7190 /* Read LINEAR_CENTROID. */
7191 for (i = 0; i < 2; i++)
7192 centroid[i] = LLVMGetParam(func, base + 10 + i);
7193 /* Select LINEAR_CENTROID. */
7194 for (i = 0; i < 2; i++) {
7195 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7196 center[i], centroid[i], "");
7197 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7198 tmp, base + 10 + i, "");
7199 }
7200 }
7201 }
7202
7203 /* Force per-sample interpolation. */
7204 if (key->ps_prolog.states.force_persp_sample_interp) {
7205 unsigned i, base = key->ps_prolog.num_input_sgprs;
7206 LLVMValueRef persp_sample[2];
7207
7208 /* Read PERSP_SAMPLE. */
7209 for (i = 0; i < 2; i++)
7210 persp_sample[i] = LLVMGetParam(func, base + i);
7211 /* Overwrite PERSP_CENTER. */
7212 for (i = 0; i < 2; i++)
7213 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7214 persp_sample[i], base + 2 + i, "");
7215 /* Overwrite PERSP_CENTROID. */
7216 for (i = 0; i < 2; i++)
7217 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7218 persp_sample[i], base + 4 + i, "");
7219 }
7220 if (key->ps_prolog.states.force_linear_sample_interp) {
7221 unsigned i, base = key->ps_prolog.num_input_sgprs;
7222 LLVMValueRef linear_sample[2];
7223
7224 /* Read LINEAR_SAMPLE. */
7225 for (i = 0; i < 2; i++)
7226 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7227 /* Overwrite LINEAR_CENTER. */
7228 for (i = 0; i < 2; i++)
7229 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7230 linear_sample[i], base + 8 + i, "");
7231 /* Overwrite LINEAR_CENTROID. */
7232 for (i = 0; i < 2; i++)
7233 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7234 linear_sample[i], base + 10 + i, "");
7235 }
7236
7237 /* Force center interpolation. */
7238 if (key->ps_prolog.states.force_persp_center_interp) {
7239 unsigned i, base = key->ps_prolog.num_input_sgprs;
7240 LLVMValueRef persp_center[2];
7241
7242 /* Read PERSP_CENTER. */
7243 for (i = 0; i < 2; i++)
7244 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7245 /* Overwrite PERSP_SAMPLE. */
7246 for (i = 0; i < 2; i++)
7247 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7248 persp_center[i], base + i, "");
7249 /* Overwrite PERSP_CENTROID. */
7250 for (i = 0; i < 2; i++)
7251 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7252 persp_center[i], base + 4 + i, "");
7253 }
7254 if (key->ps_prolog.states.force_linear_center_interp) {
7255 unsigned i, base = key->ps_prolog.num_input_sgprs;
7256 LLVMValueRef linear_center[2];
7257
7258 /* Read LINEAR_CENTER. */
7259 for (i = 0; i < 2; i++)
7260 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7261 /* Overwrite LINEAR_SAMPLE. */
7262 for (i = 0; i < 2; i++)
7263 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7264 linear_center[i], base + 6 + i, "");
7265 /* Overwrite LINEAR_CENTROID. */
7266 for (i = 0; i < 2; i++)
7267 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7268 linear_center[i], base + 10 + i, "");
7269 }
7270
7271 /* Interpolate colors. */
7272 unsigned color_out_idx = 0;
7273 for (i = 0; i < 2; i++) {
7274 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7275 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7276 key->ps_prolog.face_vgpr_index;
7277 LLVMValueRef interp[2], color[4];
7278 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7279
7280 if (!writemask)
7281 continue;
7282
7283 /* If the interpolation qualifier is not CONSTANT (-1). */
7284 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7285 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7286 key->ps_prolog.color_interp_vgpr_index[i];
7287
7288 /* Get the (i,j) updated by bc_optimize handling. */
7289 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7290 interp_vgpr, "");
7291 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7292 interp_vgpr + 1, "");
7293 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7294 }
7295
7296 /* Use the absolute location of the input. */
7297 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7298
7299 if (key->ps_prolog.states.color_two_side) {
7300 face = LLVMGetParam(func, face_vgpr);
7301 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
7302 }
7303
7304 interp_fs_input(ctx,
7305 key->ps_prolog.color_attr_index[i],
7306 TGSI_SEMANTIC_COLOR, i,
7307 key->ps_prolog.num_interp_inputs,
7308 key->ps_prolog.colors_read, interp_ij,
7309 prim_mask, face, color);
7310
7311 while (writemask) {
7312 unsigned chan = u_bit_scan(&writemask);
7313 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7314 fninfo.num_params + color_out_idx++, "");
7315 }
7316 }
7317
7318 /* Tell LLVM to insert WQM instruction sequence when needed. */
7319 if (key->ps_prolog.wqm) {
7320 LLVMAddTargetDependentFunctionAttr(func,
7321 "amdgpu-ps-wqm-outputs", "");
7322 }
7323
7324 si_llvm_build_ret(ctx, ret);
7325 }
7326
7327 /**
7328 * Build the pixel shader epilog function. This handles everything that must be
7329 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7330 */
7331 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7332 union si_shader_part_key *key)
7333 {
7334 struct gallivm_state *gallivm = &ctx->gallivm;
7335 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7336 struct si_function_info fninfo;
7337 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7338 int i;
7339 struct si_ps_exports exp = {};
7340
7341 si_init_function_info(&fninfo);
7342
7343 /* Declare input SGPRs. */
7344 ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7345 ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7346 ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7347 ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
7348 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7349
7350 /* Declare input VGPRs. */
7351 unsigned required_num_params =
7352 fninfo.num_sgpr_params +
7353 util_bitcount(key->ps_epilog.colors_written) * 4 +
7354 key->ps_epilog.writes_z +
7355 key->ps_epilog.writes_stencil +
7356 key->ps_epilog.writes_samplemask;
7357
7358 required_num_params = MAX2(required_num_params,
7359 fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7360
7361 while (fninfo.num_params < required_num_params)
7362 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7363
7364 /* Create the function. */
7365 si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7366 /* Disable elimination of unused inputs. */
7367 si_llvm_add_attribute(ctx->main_fn,
7368 "InitialPSInputAddr", 0xffffff);
7369
7370 /* Process colors. */
7371 unsigned vgpr = fninfo.num_sgpr_params;
7372 unsigned colors_written = key->ps_epilog.colors_written;
7373 int last_color_export = -1;
7374
7375 /* Find the last color export. */
7376 if (!key->ps_epilog.writes_z &&
7377 !key->ps_epilog.writes_stencil &&
7378 !key->ps_epilog.writes_samplemask) {
7379 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7380
7381 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7382 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7383 /* Just set this if any of the colorbuffers are enabled. */
7384 if (spi_format &
7385 ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7386 last_color_export = 0;
7387 } else {
7388 for (i = 0; i < 8; i++)
7389 if (colors_written & (1 << i) &&
7390 (spi_format >> (i * 4)) & 0xf)
7391 last_color_export = i;
7392 }
7393 }
7394
7395 while (colors_written) {
7396 LLVMValueRef color[4];
7397 int mrt = u_bit_scan(&colors_written);
7398
7399 for (i = 0; i < 4; i++)
7400 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7401
7402 si_export_mrt_color(bld_base, color, mrt,
7403 fninfo.num_params - 1,
7404 mrt == last_color_export, &exp);
7405 }
7406
7407 /* Process depth, stencil, samplemask. */
7408 if (key->ps_epilog.writes_z)
7409 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7410 if (key->ps_epilog.writes_stencil)
7411 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7412 if (key->ps_epilog.writes_samplemask)
7413 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7414
7415 if (depth || stencil || samplemask)
7416 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7417 else if (last_color_export == -1)
7418 si_export_null(bld_base);
7419
7420 if (exp.num)
7421 si_emit_ps_exports(ctx, &exp);
7422
7423 /* Compile. */
7424 LLVMBuildRetVoid(gallivm->builder);
7425 }
7426
7427 /**
7428 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7429 */
7430 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7431 LLVMTargetMachineRef tm,
7432 struct si_shader *shader,
7433 struct pipe_debug_callback *debug)
7434 {
7435 union si_shader_part_key prolog_key;
7436 union si_shader_part_key epilog_key;
7437
7438 /* Get the prolog. */
7439 si_get_ps_prolog_key(shader, &prolog_key, true);
7440
7441 /* The prolog is a no-op if these aren't set. */
7442 if (si_need_ps_prolog(&prolog_key)) {
7443 shader->prolog =
7444 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7445 PIPE_SHADER_FRAGMENT, true,
7446 &prolog_key, tm, debug,
7447 si_build_ps_prolog_function,
7448 "Fragment Shader Prolog");
7449 if (!shader->prolog)
7450 return false;
7451 }
7452
7453 /* Get the epilog. */
7454 si_get_ps_epilog_key(shader, &epilog_key);
7455
7456 shader->epilog =
7457 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7458 PIPE_SHADER_FRAGMENT, false,
7459 &epilog_key, tm, debug,
7460 si_build_ps_epilog_function,
7461 "Fragment Shader Epilog");
7462 if (!shader->epilog)
7463 return false;
7464
7465 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7466 if (shader->key.part.ps.prolog.poly_stipple) {
7467 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7468 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7469 }
7470
7471 /* Set up the enable bits for per-sample shading if needed. */
7472 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7473 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7474 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7475 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7476 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7477 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7478 }
7479 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7480 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7481 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7482 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7483 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7484 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7485 }
7486 if (shader->key.part.ps.prolog.force_persp_center_interp &&
7487 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7488 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7489 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7490 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7491 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7492 }
7493 if (shader->key.part.ps.prolog.force_linear_center_interp &&
7494 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7495 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7496 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7497 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7498 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7499 }
7500
7501 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7502 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7503 !(shader->config.spi_ps_input_ena & 0xf)) {
7504 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7505 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7506 }
7507
7508 /* At least one pair of interpolation weights must be enabled. */
7509 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7510 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7511 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7512 }
7513
7514 /* The sample mask input is always enabled, because the API shader always
7515 * passes it through to the epilog. Disable it here if it's unused.
7516 */
7517 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7518 !shader->selector->info.reads_samplemask)
7519 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7520
7521 return true;
7522 }
7523
7524 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7525 unsigned *lds_size)
7526 {
7527 /* SPI barrier management bug:
7528 * Make sure we have at least 4k of LDS in use to avoid the bug.
7529 * It applies to workgroup sizes of more than one wavefront.
7530 */
7531 if (sscreen->b.family == CHIP_BONAIRE ||
7532 sscreen->b.family == CHIP_KABINI ||
7533 sscreen->b.family == CHIP_MULLINS)
7534 *lds_size = MAX2(*lds_size, 8);
7535 }
7536
7537 static void si_fix_resource_usage(struct si_screen *sscreen,
7538 struct si_shader *shader)
7539 {
7540 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7541
7542 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7543
7544 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7545 si_get_max_workgroup_size(shader) > 64) {
7546 si_multiwave_lds_size_workaround(sscreen,
7547 &shader->config.lds_size);
7548 }
7549 }
7550
7551 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7552 struct si_shader *shader,
7553 struct pipe_debug_callback *debug)
7554 {
7555 struct si_shader_selector *sel = shader->selector;
7556 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7557 int r;
7558
7559 /* LS, ES, VS are compiled on demand if the main part hasn't been
7560 * compiled for that stage.
7561 *
7562 * Vertex shaders are compiled on demand when a vertex fetch
7563 * workaround must be applied.
7564 */
7565 if (shader->is_monolithic) {
7566 /* Monolithic shader (compiled as a whole, has many variants,
7567 * may take a long time to compile).
7568 */
7569 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7570 if (r)
7571 return r;
7572 } else {
7573 /* The shader consists of several parts:
7574 *
7575 * - the middle part is the user shader, it has 1 variant only
7576 * and it was compiled during the creation of the shader
7577 * selector
7578 * - the prolog part is inserted at the beginning
7579 * - the epilog part is inserted at the end
7580 *
7581 * The prolog and epilog have many (but simple) variants.
7582 *
7583 * Starting with gfx9, geometry and tessellation control
7584 * shaders also contain the prolog and user shader parts of
7585 * the previous shader stage.
7586 */
7587
7588 if (!mainp)
7589 return -1;
7590
7591 /* Copy the compiled TGSI shader data over. */
7592 shader->is_binary_shared = true;
7593 shader->binary = mainp->binary;
7594 shader->config = mainp->config;
7595 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7596 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7597 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7598 memcpy(shader->info.vs_output_param_offset,
7599 mainp->info.vs_output_param_offset,
7600 sizeof(mainp->info.vs_output_param_offset));
7601 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7602 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7603 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7604
7605 /* Select prologs and/or epilogs. */
7606 switch (sel->type) {
7607 case PIPE_SHADER_VERTEX:
7608 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7609 return -1;
7610 break;
7611 case PIPE_SHADER_TESS_CTRL:
7612 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7613 return -1;
7614 break;
7615 case PIPE_SHADER_TESS_EVAL:
7616 break;
7617 case PIPE_SHADER_GEOMETRY:
7618 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
7619 return -1;
7620 break;
7621 case PIPE_SHADER_FRAGMENT:
7622 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7623 return -1;
7624
7625 /* Make sure we have at least as many VGPRs as there
7626 * are allocated inputs.
7627 */
7628 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7629 shader->info.num_input_vgprs);
7630 break;
7631 }
7632
7633 /* Update SGPR and VGPR counts. */
7634 if (shader->prolog) {
7635 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7636 shader->prolog->config.num_sgprs);
7637 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7638 shader->prolog->config.num_vgprs);
7639 }
7640 if (shader->previous_stage) {
7641 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7642 shader->previous_stage->config.num_sgprs);
7643 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7644 shader->previous_stage->config.num_vgprs);
7645 shader->config.spilled_sgprs =
7646 MAX2(shader->config.spilled_sgprs,
7647 shader->previous_stage->config.spilled_sgprs);
7648 shader->config.spilled_vgprs =
7649 MAX2(shader->config.spilled_vgprs,
7650 shader->previous_stage->config.spilled_vgprs);
7651 shader->config.private_mem_vgprs =
7652 MAX2(shader->config.private_mem_vgprs,
7653 shader->previous_stage->config.private_mem_vgprs);
7654 shader->config.scratch_bytes_per_wave =
7655 MAX2(shader->config.scratch_bytes_per_wave,
7656 shader->previous_stage->config.scratch_bytes_per_wave);
7657 shader->info.uses_instanceid |=
7658 shader->previous_stage->info.uses_instanceid;
7659 }
7660 if (shader->prolog2) {
7661 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7662 shader->prolog2->config.num_sgprs);
7663 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7664 shader->prolog2->config.num_vgprs);
7665 }
7666 if (shader->epilog) {
7667 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7668 shader->epilog->config.num_sgprs);
7669 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7670 shader->epilog->config.num_vgprs);
7671 }
7672 }
7673
7674 si_fix_resource_usage(sscreen, shader);
7675 si_shader_dump(sscreen, shader, debug, sel->info.processor,
7676 stderr, true);
7677
7678 /* Upload. */
7679 r = si_shader_binary_upload(sscreen, shader);
7680 if (r) {
7681 fprintf(stderr, "LLVM failed to upload shader\n");
7682 return r;
7683 }
7684
7685 return 0;
7686 }
7687
7688 void si_shader_destroy(struct si_shader *shader)
7689 {
7690 if (shader->scratch_bo)
7691 r600_resource_reference(&shader->scratch_bo, NULL);
7692
7693 r600_resource_reference(&shader->bo, NULL);
7694
7695 if (!shader->is_binary_shared)
7696 radeon_shader_binary_clean(&shader->binary);
7697
7698 free(shader->shader_log);
7699 }