radeonsi: skip generic out/in indices without a shader IO index

src/gallium/drivers/radeonsi/si_shader.c
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *	Tom Stellard <thomas.stellard@amd.com>
 *	Michel Dänzer <michel.daenzer@amd.com>
 *	Christian König <christian.koenig@amd.com>
 */

#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_misc.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"


static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};

static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};

static bool is_merged_shader(struct si_shader *shader)
{
	if (shader->selector->screen->b.chip_class <= VI)
		return false;

	return shader->key.as_ls ||
	       shader->key.as_es ||
	       shader->selector->type == PIPE_SHADER_TESS_CTRL ||
	       shader->selector->type == PIPE_SHADER_GEOMETRY;
}

/**
 * Returns a unique index for a per-patch semantic name and index. The index
 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
 * can be calculated.
 */
unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		assert(index < 30);
		return 2 + index;

	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index < SI_MAX_IO_GENERIC)
			return 4 + index;

		assert(!"invalid generic index");
		return 0;

	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

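/* Illustrative sketch (not part of this file): how callers typically turn
 * the unique index above into the 64-bit usage bitmask the comment mentions.
 * "example_outputs_written_mask" and its parameters are assumptions standing
 * in for the real tgsi_shader_info fields.
 */
#if 0
static uint64_t example_outputs_written_mask(const ubyte *semantic_name,
					     const ubyte *semantic_index,
					     unsigned num_outputs)
{
	uint64_t outputs_written = 0;

	for (unsigned i = 0; i < num_outputs; i++) {
		unsigned slot = si_shader_io_get_unique_index(semantic_name[i],
							      semantic_index[i]);

		/* Each slot is < 64, so one bit per output fits in a uint64_t. */
		outputs_written |= 1ull << slot;
	}
	return outputs_written;
}
#endif
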
unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
{
	switch (name) {
	case TGSI_SEMANTIC_FOG:
		return 0;
	case TGSI_SEMANTIC_LAYER:
		return 1;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return 2;
	case TGSI_SEMANTIC_PRIMID:
		return 3;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		return 4 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 6 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = LLVMGetParam(ctx->main_fn,
					  param);

	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
		value = bitcast(&ctx->bld_base,
				TGSI_TYPE_UNSIGNED, value);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      LLVMConstInt(ctx->i32, rshift, 0), "");

	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     LLVMConstInt(ctx->i32, mask, 0), "");
	}

	return value;
}

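/* Illustrative sketch (not part of this file): unpack_param emits the IR
 * equivalent of this plain C shift-and-mask, assuming the field was packed
 * into the SGPR as "sgpr |= field << rshift". */
#if 0
static unsigned example_unpack_bits(unsigned value, unsigned rshift,
				    unsigned bitwidth)
{
	value >>= rshift;
	if (rshift + bitwidth < 32)
		value &= (1u << bitwidth) - 1;
	return value;
}
#endif
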
static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
{
	switch (ctx->type) {
	case PIPE_SHADER_TESS_CTRL:
		return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);

	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_rel_patch_id);

	default:
		assert(0);
		return NULL;
	}
}

/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */

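/* Illustrative sketch (not part of this file): the dword offsets implied by
 * the LDS layout above, in plain C. The stride/count parameter names are
 * assumptions; the real values come from unpack_param() on packed SGPRs
 * (see the helpers below). */
#if 0
static unsigned example_tcs_out_patch_offset(unsigned num_patches,
					     unsigned in_patch_stride,
					     unsigned out_patch_stride,
					     unsigned rel_patch_id)
{
	/* All TCS inputs come first, so outputs for patch 0 start after them. */
	unsigned out_patch0_offset = num_patches * in_patch_stride;

	/* The per-patch stride then selects the current patch's outputs. */
	return out_patch0_offset + rel_patch_id * out_patch_stride;
}
#endif
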
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}

static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}

static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}

static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}

static LLVMValueRef
get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
}

static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef get_instance_index_for_fetch(
	struct si_shader_context *ctx,
	unsigned param_start_instance, unsigned divisor)
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMValueRef result = LLVMGetParam(ctx->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				       LLVMConstInt(ctx->i32, divisor, 0), "");

	return LLVMBuildAdd(gallivm->builder, result,
			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
}

/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 * to float. */
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
					    LLVMValueRef vec4,
					    unsigned double_index)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
					      LLVMVectorType(f64, 2), "");
	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
	return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
}

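/* Illustrative sketch (not part of this file): the same bitcast-and-extract
 * that extract_double_to_float emits as IR, done on the CPU with memcpy,
 * which is the portable way to reinterpret bits in C (assumes <string.h>
 * is available via the util headers). */
#if 0
static float example_extract_double_to_float(const float vec4[4],
					     unsigned double_index)
{
	double d;

	/* Two consecutive floats hold one double's bit pattern. */
	memcpy(&d, &vec4[double_index * 2], sizeof(d));
	return (float)d;
}
#endif
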
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}

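/* Illustrative sketch (not part of this file): the shift trick used above
 * for the SI_FIX_FETCH_A2_* cases, in plain C. A 2-bit field sitting in the
 * low bits is sign-extended by shifting it to the top of the word and
 * arithmetic-shifting it back down. */
#if 0
static int example_sign_extend_2bit(unsigned hw_value)
{
	/* Place the 2-bit alpha in bits 31:30, then shift back with sign. */
	return ((int)(hw_value << 30)) >> 30;
}
#endif
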
static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
				     unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	if (swizzle > 0)
		return ctx->i32_0;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tcs_patch_id);
	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_patch_id);
	case PIPE_SHADER_GEOMETRY:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_gs_prim_id);
	default:
		assert(0);
		return ctx->i32_0;
	}
}

/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 */
static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
				       const struct tgsi_ind_register *ind,
				       int rel_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef result;

	result = ctx->addrs[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	result = LLVMBuildAdd(gallivm->builder, result,
			      LLVMConstInt(ctx->i32, rel_index, 0), "");
	return result;
}

/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);

	return si_llvm_bound_index(ctx, result, num);
}


/**
 * Calculate a dword address given an input or output register and a stride.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[first], index[first]) :
			si_shader_io_get_unique_index_patch(name[first], index[first]);
	} else {
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]) :
			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
							    index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}

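/* Illustrative sketch (not part of this file): the address get_dw_address
 * computes, flattened into plain C. "vertex_index" and "rel_index" are
 * assumptions standing in for the direct or indirect register indices
 * resolved above. */
#if 0
static unsigned example_dw_address(unsigned base_addr,
				   unsigned vertex_index,
				   unsigned vertex_dw_stride,
				   unsigned rel_index,
				   unsigned param)
{
	return base_addr +
	       vertex_index * vertex_dw_stride +	/* 2nd dimension (vertex) */
	       rel_index * 4 +				/* indirect element */
	       param * 4;				/* 4 dwords per attribute */
}
#endif
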
/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}

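/* Illustrative sketch (not part of this file): the byte offsets implied by
 * the per-vertex half of the offchip layout above, in plain C. Per-vertex
 * attributes are strided by the total vertex count and every attribute is
 * 16 bytes (4 dwords). */
#if 0
static unsigned example_tcs_tes_buffer_address(unsigned rel_patch_id,
					       unsigned vertex_index,
					       unsigned param_index,
					       unsigned vertices_per_patch,
					       unsigned num_patches)
{
	unsigned total_vertices = vertices_per_patch * num_patches;
	unsigned addr = rel_patch_id * vertices_per_patch + vertex_index;

	return (addr + param_index * total_vertices) * 16;
}
#endif
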
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
					struct si_shader_context *ctx,
					const struct tgsi_full_dst_register *dst,
					const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	param_index_base = reg.Register.Dimension ?
		si_shader_io_get_unique_index(name[param_base], index[param_base]) :
		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}

static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool readonly_memory)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, readonly_memory);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, readonly_memory);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}

/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}

/**
 * Store to LDS.
 *
 * \param dw_offset_imm	offset in dwords (typically 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}

static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
					   unsigned param)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;

	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
	addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
	addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");

	uint64_t desc2 = 0xffffffff;
	uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
	LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);

	LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
	desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
	desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
	return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
}

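/* Illustrative sketch (not part of this file): the four descriptor dwords
 * that desc_from_addr_base64k assembles, written out in plain C. Only the
 * base address (given in 64K units) varies at runtime. */
#if 0
static void example_build_buffer_desc(uint32_t desc[4], uint32_t addr_base64k)
{
	uint64_t va = (uint64_t)addr_base64k << 16;

	desc[0] = va;			/* base address, low 32 bits */
	desc[1] = va >> 32;		/* base address high bits; stride = 0 */
	desc[2] = 0xffffffff;		/* num_records: effectively unbounded */
	desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
		  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
		  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
		  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
		  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
		  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
#endif
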
static LLVMValueRef fetch_input_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	dw_addr = get_tcs_in_current_patch_offset(ctx);
	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_output_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef buffer, base, addr;

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}

static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);

	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}

static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}

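/* Illustrative sketch (not part of this file): the ESGS ring offset used by
 * the GFX6 path above, in plain C. Each attribute channel occupies a
 * 256-byte slot (presumably 64 lanes x 4 bytes in the swizzled ring layout,
 * an assumption for this sketch); vtx_offset then selects the lane's dword
 * within the slot. */
#if 0
static unsigned example_esgs_soffset(unsigned param, unsigned swizzle)
{
	return (param * 4 + swizzle) * 256;
}
#endif
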
static int lookup_interp_param_index(unsigned interpolate, unsigned location)
{
	switch (interpolate) {
	case TGSI_INTERPOLATE_CONSTANT:
		return 0;

	case TGSI_INTERPOLATE_LINEAR:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_LINEAR_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_LINEAR_CENTROID;
		else
			return SI_PARAM_LINEAR_CENTER;
		break;
	case TGSI_INTERPOLATE_COLOR:
	case TGSI_INTERPOLATE_PERSPECTIVE:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_PERSP_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_PERSP_CENTROID;
		else
			return SI_PARAM_PERSP_CENTER;
		break;
	default:
		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
		return -1;
	}
}

/**
 * Interpolate a fragment shader input.
 *
 * @param ctx			context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j)
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef attr_number;
	LLVMValueRef i, j;

	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false, we will use fs.constant or, for newer LLVM,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	attr_number = LLVMConstInt(ctx->i32, input_index, 0);

	if (interp) {
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
			LLVMValueRef front, back;

			if (interp) {
				front = ac_build_fs_interp(&ctx->ac, llvm_chan,
							   attr_number, prim_mask,
							   i, j);
				back = ac_build_fs_interp(&ctx->ac, llvm_chan,
							  back_attr_number, prim_mask,
							  i, j);
			} else {
				front = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
				back = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, back_attr_number, prim_mask);
			}

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		if (interp) {
			result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
						       attr_number, prim_mask, i, j);
		} else {
			result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
				LLVMConstInt(ctx->i32, 2, 0), /* P0 */
				attr_number, prim_mask);
		}
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);

			if (interp) {
				result[chan] = ac_build_fs_interp(&ctx->ac,
					llvm_chan, attr_number, prim_mask, i, j);
			} else {
				result[chan] = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
			}
		}
	}
}

static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.part.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}

static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}

/**
 * Load a dword from a constant buffer.
 */
static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
				      LLVMValueRef resource,
				      LLVMValueRef offset)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef args[2] = {resource, offset};

	return lp_build_intrinsic(builder, "llvm.SI.load.const.v4i32", ctx->f32, args, 2,
				  LP_FUNC_ATTR_READNONE |
				  LP_FUNC_ATTR_LEGACY);
}

static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
{
	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(ctx, resource, offset0),
		buffer_load_const(ctx, resource, offset1),
		LLVMConstReal(ctx->f32, 0),
		LLVMConstReal(ctx->f32, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}

static void declare_system_value(struct si_shader_context *ctx,
				 unsigned index,
				 const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(ctx->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_base_vertex), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
					ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&ctx->bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		LLVMValueRef values[3];

		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}

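/* Illustrative sketch (not part of this file): the subgroup masks computed
 * above for a 64-lane wave, in plain C (lane_id < 64). */
#if 0
static uint64_t example_subgroup_mask(unsigned semantic, unsigned lane_id)
{
	switch (semantic) {
	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
		return 1ull << lane_id;
	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
		return ~0ull << lane_id;	/* this lane and above */
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
		return ~1ull << lane_id;	/* strictly above this lane */
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
		return ~(~0ull << lane_id);	/* strictly below this lane */
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
		return ~(~1ull << lane_id);	/* this lane and below */
	default:
		return 0;
	}
}
#endif
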
static void declare_compute_memory(struct si_shader_context *ctx,
				   const struct tgsi_full_declaration *decl)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
	LLVMValueRef var;

	assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
	assert(decl->Range.First == decl->Range.Last);
	assert(!ctx->shared_memory);

	var = LLVMAddGlobalInAddressSpace(gallivm->module,
					  LLVMArrayType(ctx->i8, sel->local_size),
					  "compute_lds",
					  LOCAL_ADDR_SPACE);
	LLVMSetAlignment(var, 4);

	ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
}

static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
{
	LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
					     ctx->param_const_buffers);

	return ac_build_indexed_load_const(&ctx->ac, list_ptr,
					   LLVMConstInt(ctx->i32, i, 0));
}

static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	if (reg->Register.Dimension && reg->Dimension.Indirect) {
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
		LLVMValueRef index;
		index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
						   reg->Dimension.Index,
						   SI_NUM_CONST_BUFFERS);
		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		addr = ctx->addrs[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
				    LLVMConstInt(ctx->i32, idx * 4, 0));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = si_llvm_emit_fetch_64bit(bld_base, type,
						  result, result2);
	}
	return result;
}

1804 /* Upper 16 bits must be zero. */
1805 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1806 LLVMValueRef val[2])
1807 {
1808 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1809 LLVMBuildShl(ctx->gallivm.builder, val[1],
1810 LLVMConstInt(ctx->i32, 16, 0),
1811 ""), "");
1812 }
1813
1814 /* Upper 16 bits are ignored and will be dropped. */
1815 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1816 LLVMValueRef val[2])
1817 {
1818 LLVMValueRef v[2] = {
1819 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1820 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1821 val[1],
1822 };
1823 return si_llvm_pack_two_int16(ctx, v);
1824 }
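
/* Worked example for both packing helpers (illustrative values): with
 * val[0] = 0x00001234 and val[1] = 0x0000abcd, the result is
 * 0xabcd1234 = val[0] | (val[1] << 16); the int32 variant first masks
 * val[0] with 0xffff, so it tolerates garbage in bits 31:16 of either
 * input (val[1]'s high bits are discarded by the shift anyway).
 */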
1825
1826 /* Initialize arguments for the shader export intrinsic */
1827 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1828 LLVMValueRef *values,
1829 unsigned target,
1830 struct ac_export_args *args)
1831 {
1832 struct si_shader_context *ctx = si_shader_context(bld_base);
1833 struct lp_build_context *base = &bld_base->base;
1834 LLVMBuilderRef builder = ctx->gallivm.builder;
1835 LLVMValueRef val[4];
1836 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1837 unsigned chan;
1838 bool is_int8, is_int10;
1839
1840 /* Default is 0xf. Adjusted below depending on the format. */
1841 args->enabled_channels = 0xf; /* writemask */
1842
1843 /* Specify whether the EXEC mask represents the valid mask */
1844 args->valid_mask = 0;
1845
1846 /* Specify whether this is the last export */
1847 args->done = 0;
1848
1849 /* Specify the target we are exporting */
1850 args->target = target;
1851
1852 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1853 const struct si_shader_key *key = &ctx->shader->key;
1854 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1855 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1856
1857 assert(cbuf >= 0 && cbuf < 8);
1858 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1859 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1860 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1861 }
1862
1863 args->compr = false;
1864 args->out[0] = base->undef;
1865 args->out[1] = base->undef;
1866 args->out[2] = base->undef;
1867 args->out[3] = base->undef;
1868
1869 switch (spi_shader_col_format) {
1870 case V_028714_SPI_SHADER_ZERO:
1871 args->enabled_channels = 0; /* writemask */
1872 args->target = V_008DFC_SQ_EXP_NULL;
1873 break;
1874
1875 case V_028714_SPI_SHADER_32_R:
1876 args->enabled_channels = 1; /* writemask */
1877 args->out[0] = values[0];
1878 break;
1879
1880 case V_028714_SPI_SHADER_32_GR:
1881 args->enabled_channels = 0x3; /* writemask */
1882 args->out[0] = values[0];
1883 args->out[1] = values[1];
1884 break;
1885
1886 case V_028714_SPI_SHADER_32_AR:
1887 args->enabled_channels = 0x9; /* writemask */
1888 args->out[0] = values[0];
1889 args->out[3] = values[3];
1890 break;
1891
1892 case V_028714_SPI_SHADER_FP16_ABGR:
1893 args->compr = 1; /* COMPR flag */
1894
1895 for (chan = 0; chan < 2; chan++) {
1896 LLVMValueRef pack_args[2] = {
1897 values[2 * chan],
1898 values[2 * chan + 1]
1899 };
1900 LLVMValueRef packed;
1901
1902 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1903 args->out[chan] =
1904 LLVMBuildBitCast(ctx->gallivm.builder,
1905 packed, ctx->f32, "");
1906 }
1907 break;
1908
1909 case V_028714_SPI_SHADER_UNORM16_ABGR:
1910 for (chan = 0; chan < 4; chan++) {
1911 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1912 val[chan] = LLVMBuildFMul(builder, val[chan],
1913 LLVMConstReal(ctx->f32, 65535), "");
1914 val[chan] = LLVMBuildFAdd(builder, val[chan],
1915 LLVMConstReal(ctx->f32, 0.5), "");
1916 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1917 ctx->i32, "");
1918 }
1919
1920 args->compr = 1; /* COMPR flag */
1921 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1922 si_llvm_pack_two_int16(ctx, val));
1923 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1924 si_llvm_pack_two_int16(ctx, val+2));
1925 break;
1926
1927 case V_028714_SPI_SHADER_SNORM16_ABGR:
1928 for (chan = 0; chan < 4; chan++) {
1929 /* Clamp between [-1, 1]. */
1930 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1931 values[chan],
1932 LLVMConstReal(ctx->f32, 1));
1933 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1934 val[chan],
1935 LLVMConstReal(ctx->f32, -1));
1936 /* Convert to a signed integer in [-32767, 32767]. */
1937 val[chan] = LLVMBuildFMul(builder, val[chan],
1938 LLVMConstReal(ctx->f32, 32767), "");
1939 /* If positive, add 0.5, else add -0.5. */
1940 val[chan] = LLVMBuildFAdd(builder, val[chan],
1941 LLVMBuildSelect(builder,
1942 LLVMBuildFCmp(builder, LLVMRealOGE,
1943 val[chan], base->zero, ""),
1944 LLVMConstReal(ctx->f32, 0.5),
1945 LLVMConstReal(ctx->f32, -0.5), ""), "");
1946 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1947 }
1948
1949 args->compr = 1; /* COMPR flag */
1950 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1951 si_llvm_pack_two_int32_as_int16(ctx, val));
1952 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1953 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1954 break;
1955
1956 case V_028714_SPI_SHADER_UINT16_ABGR: {
1957 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1958 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1959 LLVMValueRef max_alpha =
1960 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1961
1962 /* Clamp. */
1963 for (chan = 0; chan < 4; chan++) {
1964 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1965 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1966 val[chan],
1967 chan == 3 ? max_alpha : max_rgb);
1968 }
1969
1970 args->compr = 1; /* COMPR flag */
1971 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1972 si_llvm_pack_two_int16(ctx, val));
1973 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1974 si_llvm_pack_two_int16(ctx, val+2));
1975 break;
1976 }
1977
1978 case V_028714_SPI_SHADER_SINT16_ABGR: {
1979 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1980 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1981 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1982 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1983 LLVMValueRef max_alpha =
1984 !is_int10 ? max_rgb : ctx->i32_1;
1985 LLVMValueRef min_alpha =
1986 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1987
1988 /* Clamp. */
1989 for (chan = 0; chan < 4; chan++) {
1990 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1991 val[chan] = lp_build_emit_llvm_binary(bld_base,
1992 TGSI_OPCODE_IMIN,
1993 val[chan], chan == 3 ? max_alpha : max_rgb);
1994 val[chan] = lp_build_emit_llvm_binary(bld_base,
1995 TGSI_OPCODE_IMAX,
1996 val[chan], chan == 3 ? min_alpha : min_rgb);
1997 }
1998
1999 args->compr = 1; /* COMPR flag */
2000 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2001 si_llvm_pack_two_int32_as_int16(ctx, val));
2002 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2003 si_llvm_pack_two_int32_as_int16(ctx, val+2));
2004 break;
2005 }
2006
2007 case V_028714_SPI_SHADER_32_ABGR:
2008 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2009 break;
2010 }
2011 }
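
/* Illustrative example (hypothetical key, usual sid.h encoding assumed):
 * spi_shader_col_format packs one 4-bit format per color buffer, so with
 * col_formats = 0x41, MRT0 gets (0x41 >> 0) & 0xf = 0x1 (32_R) and MRT1
 * gets (0x41 >> 4) & 0xf = 0x4 (FP16_ABGR); the 16-bit formats above all
 * set the COMPR flag to pack two channels per 32-bit export lane.
 */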
2012
2013 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2014 LLVMValueRef alpha)
2015 {
2016 struct si_shader_context *ctx = si_shader_context(bld_base);
2017
2018 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2019 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2020 SI_PARAM_ALPHA_REF);
2021
2022 LLVMValueRef alpha_pass =
2023 lp_build_cmp(&bld_base->base,
2024 ctx->shader->key.part.ps.epilog.alpha_func,
2025 alpha, alpha_ref);
2026 LLVMValueRef arg =
2027 lp_build_select(&bld_base->base,
2028 alpha_pass,
2029 LLVMConstReal(ctx->f32, 1.0f),
2030 LLVMConstReal(ctx->f32, -1.0f));
2031
2032 ac_build_kill(&ctx->ac, arg);
2033 } else {
2034 ac_build_kill(&ctx->ac, NULL);
2035 }
2036 }
2037
2038 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2039 LLVMValueRef alpha,
2040 unsigned samplemask_param)
2041 {
2042 struct si_shader_context *ctx = si_shader_context(bld_base);
2043 struct gallivm_state *gallivm = &ctx->gallivm;
2044 LLVMValueRef coverage;
2045
2046 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2047 coverage = LLVMGetParam(ctx->main_fn,
2048 samplemask_param);
2049 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2050
2051 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2052 ctx->i32,
2053 &coverage, 1, LP_FUNC_ATTR_READNONE);
2054
2055 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2056 ctx->f32, "");
2057
2058 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2059 LLVMConstReal(ctx->f32,
2060 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2061
2062 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2063 }
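
/* Worked example, assuming SI_NUM_SMOOTH_AA_SAMPLES is 8: a coverage
 * mask of 0x0f has popcount 4, so alpha is multiplied by 4 * (1.0 / 8)
 * = 0.5 and smoothed line/polygon edges fade in proportion to how many
 * AA samples are covered.
 */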
2064
2065 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2066 struct ac_export_args *pos, LLVMValueRef *out_elts)
2067 {
2068 struct si_shader_context *ctx = si_shader_context(bld_base);
2069 struct lp_build_context *base = &bld_base->base;
2070 unsigned reg_index;
2071 unsigned chan;
2072 unsigned const_chan;
2073 LLVMValueRef base_elt;
2074 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2075 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2076 SI_VS_CONST_CLIP_PLANES, 0);
2077 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2078
2079 for (reg_index = 0; reg_index < 2; reg_index ++) {
2080 struct ac_export_args *args = &pos[2 + reg_index];
2081
2082 args->out[0] =
2083 args->out[1] =
2084 args->out[2] =
2085 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2086
2087 /* Compute dot products of position and user clip plane vectors */
2088 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2089 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2090 LLVMValueRef addr =
2091 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2092 const_chan) * 4, 0);
2093 base_elt = buffer_load_const(ctx, const_resource,
2094 addr);
2095 args->out[chan] =
2096 lp_build_add(base, args->out[chan],
2097 lp_build_mul(base, base_elt,
2098 out_elts[const_chan]));
2099 }
2100 }
2101
2102 args->enabled_channels = 0xf;
2103 args->valid_mask = 0;
2104 args->done = 0;
2105 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2106 args->compr = 0;
2107 }
2108 }
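
/* Worked example of the constant-buffer addressing above (illustrative):
 * for reg_index = 1, chan = 2 (z of the second clip-distance vector) and
 * const_chan = 3, the dword index is (1*4 + 2)*4 + 3 = 27, i.e. byte
 * offset 27*4 = 108: the clip planes are laid out as consecutive vec4s
 * and each output channel accumulates a 4-component dot product.
 */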
2109
2110 static void si_dump_streamout(struct pipe_stream_output_info *so)
2111 {
2112 unsigned i;
2113
2114 if (so->num_outputs)
2115 fprintf(stderr, "STREAMOUT\n");
2116
2117 for (i = 0; i < so->num_outputs; i++) {
2118 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2119 so->output[i].start_component;
2120 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2121 i, so->output[i].output_buffer,
2122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2123 so->output[i].register_index,
2124 mask & 1 ? "x" : "",
2125 mask & 2 ? "y" : "",
2126 mask & 4 ? "z" : "",
2127 mask & 8 ? "w" : "");
2128 }
2129 }
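
/* Example of the dump format with hypothetical values: entry i = 1
 * writing the first two components of OUT[3] to dwords 4..5 of buffer 1
 * prints:
 *   1: BUF1[4..5] <- OUT[3].xy
 */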
2130
2131 static void emit_streamout_output(struct si_shader_context *ctx,
2132 LLVMValueRef const *so_buffers,
2133 LLVMValueRef const *so_write_offsets,
2134 struct pipe_stream_output *stream_out,
2135 struct si_shader_output_values *shader_out)
2136 {
2137 struct gallivm_state *gallivm = &ctx->gallivm;
2138 LLVMBuilderRef builder = gallivm->builder;
2139 unsigned buf_idx = stream_out->output_buffer;
2140 unsigned start = stream_out->start_component;
2141 unsigned num_comps = stream_out->num_components;
2142 LLVMValueRef out[4];
2143
2144 assert(num_comps && num_comps <= 4);
2145 if (!num_comps || num_comps > 4)
2146 return;
2147
2148 /* Load the output as int. */
2149 for (int j = 0; j < num_comps; j++) {
2150 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2151
2152 out[j] = LLVMBuildBitCast(builder,
2153 shader_out->values[start + j],
2154 ctx->i32, "");
2155 }
2156
2157 /* Pack the output. */
2158 LLVMValueRef vdata = NULL;
2159
2160 switch (num_comps) {
2161 case 1: /* as i32 */
2162 vdata = out[0];
2163 break;
2164 case 2: /* as v2i32 */
2165 case 3: /* as v4i32 (aligned to 4) */
2166 case 4: /* as v4i32 */
2167 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2168 for (int j = 0; j < num_comps; j++) {
2169 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2170 LLVMConstInt(ctx->i32, j, 0), "");
2171 }
2172 break;
2173 }
2174
2175 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2176 vdata, num_comps,
2177 so_write_offsets[buf_idx],
2178 ctx->i32_0,
2179 stream_out->dst_offset * 4, 1, 1, true, false);
2180 }
2181
2182 /**
2183 * Write streamout data to buffers for vertex stream @p stream (different
2184 * vertex streams can occur for GS copy shaders).
2185 */
2186 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2187 struct si_shader_output_values *outputs,
2188 unsigned noutput, unsigned stream)
2189 {
2190 struct si_shader_selector *sel = ctx->shader->selector;
2191 struct pipe_stream_output_info *so = &sel->so;
2192 struct gallivm_state *gallivm = &ctx->gallivm;
2193 LLVMBuilderRef builder = gallivm->builder;
2194 int i;
2195 struct lp_build_if_state if_ctx;
2196
2197 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2198 LLVMValueRef so_vtx_count =
2199 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2200
2201 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2202
2203 /* can_emit = tid < so_vtx_count; */
2204 LLVMValueRef can_emit =
2205 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2206
2207 /* Emit the streamout code conditionally. This avoids out-of-bounds
2208 * buffer accesses. The hw tells us via the SGPR (so_vtx_count)
2209 * which threads are allowed to emit streamout data. */
2210 lp_build_if(&if_ctx, gallivm, can_emit);
2211 {
2212 /* The buffer offset is computed as follows:
2213 * ByteOffset = streamout_offset[buffer_id]*4 +
2214 * (streamout_write_index + thread_id)*stride[buffer_id] +
2215 * attrib_offset
2216 */
2217
2218 LLVMValueRef so_write_index =
2219 LLVMGetParam(ctx->main_fn,
2220 ctx->param_streamout_write_index);
2221
2222 /* Compute (streamout_write_index + thread_id). */
2223 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2224
2225 /* Load the descriptor and compute the write offset for each
2226 * enabled buffer. */
2227 LLVMValueRef so_write_offset[4] = {};
2228 LLVMValueRef so_buffers[4];
2229 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2230 ctx->param_rw_buffers);
2231
2232 for (i = 0; i < 4; i++) {
2233 if (!so->stride[i])
2234 continue;
2235
2236 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2237 SI_VS_STREAMOUT_BUF0 + i, 0);
2238
2239 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2240
2241 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2242 ctx->param_streamout_offset[i]);
2243 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2244
2245 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2246 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2247 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2248 }
2249
2250 /* Write streamout data. */
2251 for (i = 0; i < so->num_outputs; i++) {
2252 unsigned reg = so->output[i].register_index;
2253
2254 if (reg >= noutput)
2255 continue;
2256
2257 if (stream != so->output[i].stream)
2258 continue;
2259
2260 emit_streamout_output(ctx, so_buffers, so_write_offset,
2261 &so->output[i], &outputs[reg]);
2262 }
2263 }
2264 lp_build_endif(&if_ctx);
2265 }
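
/* Worked example of the ByteOffset formula above with hypothetical
 * numbers: streamout_offset[0] = 16, stride[0] = 4 dwords, thread_id = 2
 * and streamout_write_index = 10 give
 *   16*4 + (10 + 2) * (4*4) + attrib_offset = 256 + attrib_offset bytes.
 */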
2266
2267
2268 /* Generate export instructions for hardware VS shader stage */
2269 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2270 struct si_shader_output_values *outputs,
2271 unsigned noutput)
2272 {
2273 struct si_shader_context *ctx = si_shader_context(bld_base);
2274 struct si_shader *shader = ctx->shader;
2275 struct lp_build_context *base = &bld_base->base;
2276 struct ac_export_args args, pos_args[4] = {};
2277 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2278 unsigned semantic_name, semantic_index;
2279 unsigned target;
2280 unsigned param_count = 0;
2281 unsigned pos_idx;
2282 int i;
2283
2284 for (i = 0; i < noutput; i++) {
2285 semantic_name = outputs[i].semantic_name;
2286 semantic_index = outputs[i].semantic_index;
2287 bool export_param = true;
2288
2289 switch (semantic_name) {
2290 case TGSI_SEMANTIC_POSITION: /* ignore these */
2291 case TGSI_SEMANTIC_PSIZE:
2292 case TGSI_SEMANTIC_CLIPVERTEX:
2293 case TGSI_SEMANTIC_EDGEFLAG:
2294 break;
2295 case TGSI_SEMANTIC_GENERIC:
2296 /* skip generic indices that si_shader_io_get_unique_index can't handle */
2297 if (semantic_index >= SI_MAX_IO_GENERIC)
2298 break;
2299 /* fall through */
2300 case TGSI_SEMANTIC_CLIPDIST:
2301 if (shader->key.opt.hw_vs.kill_outputs &
2302 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2303 export_param = false;
2304 break;
2305 default:
2306 if (shader->key.opt.hw_vs.kill_outputs2 &
2307 (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2308 export_param = false;
2309 break;
2310 }
2311
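/* Outputs whose components all belong to non-zero vertex streams exist
 * only for streamout; stream 0 is the only rasterized stream, so they
 * never need a param export.
 */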
2312 if (outputs[i].vertex_stream[0] != 0 &&
2313 outputs[i].vertex_stream[1] != 0 &&
2314 outputs[i].vertex_stream[2] != 0 &&
2315 outputs[i].vertex_stream[3] != 0)
2316 export_param = false;
2317
2318 handle_semantic:
2319 /* Select the correct target */
2320 switch(semantic_name) {
2321 case TGSI_SEMANTIC_PSIZE:
2322 psize_value = outputs[i].values[0];
2323 continue;
2324 case TGSI_SEMANTIC_EDGEFLAG:
2325 edgeflag_value = outputs[i].values[0];
2326 continue;
2327 case TGSI_SEMANTIC_LAYER:
2328 layer_value = outputs[i].values[0];
2329 semantic_name = TGSI_SEMANTIC_GENERIC;
2330 goto handle_semantic;
2331 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2332 viewport_index_value = outputs[i].values[0];
2333 semantic_name = TGSI_SEMANTIC_GENERIC;
2334 goto handle_semantic;
2335 case TGSI_SEMANTIC_POSITION:
2336 target = V_008DFC_SQ_EXP_POS;
2337 break;
2338 case TGSI_SEMANTIC_CLIPDIST:
2339 if (shader->key.opt.hw_vs.clip_disable) {
2340 semantic_name = TGSI_SEMANTIC_GENERIC;
2341 goto handle_semantic;
2342 }
2343 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2344 break;
2345 case TGSI_SEMANTIC_CLIPVERTEX:
2346 if (shader->key.opt.hw_vs.clip_disable)
2347 continue;
2348 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2349 continue;
2350 case TGSI_SEMANTIC_COLOR:
2351 case TGSI_SEMANTIC_BCOLOR:
2352 case TGSI_SEMANTIC_PRIMID:
2353 case TGSI_SEMANTIC_FOG:
2354 case TGSI_SEMANTIC_TEXCOORD:
2355 case TGSI_SEMANTIC_GENERIC:
2356 if (!export_param)
2357 continue;
2358 target = V_008DFC_SQ_EXP_PARAM + param_count;
2359 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2360 shader->info.vs_output_param_offset[i] = param_count;
2361 param_count++;
2362 break;
2363 default:
2364 target = 0;
2365 fprintf(stderr,
2366 "Warning: SI unhandled vs output type:%d\n",
2367 semantic_name);
2368 }
2369
2370 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2371
2372 if (target >= V_008DFC_SQ_EXP_POS &&
2373 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2374 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2375 &args, sizeof(args));
2376 } else {
2377 ac_build_export(&ctx->ac, &args);
2378 }
2379
2380 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2381 semantic_name = TGSI_SEMANTIC_GENERIC;
2382 goto handle_semantic;
2383 }
2384 }
2385
2386 shader->info.nr_param_exports = param_count;
2387
2388 /* We need to add the position output manually if it's missing. */
2389 if (!pos_args[0].out[0]) {
2390 pos_args[0].enabled_channels = 0xf; /* writemask */
2391 pos_args[0].valid_mask = 0; /* EXEC mask */
2392 pos_args[0].done = 0; /* last export? */
2393 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2394 pos_args[0].compr = 0; /* COMPR flag */
2395 pos_args[0].out[0] = base->zero; /* X */
2396 pos_args[0].out[1] = base->zero; /* Y */
2397 pos_args[0].out[2] = base->zero; /* Z */
2398 pos_args[0].out[3] = base->one; /* W */
2399 }
2400
2401 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2402 if (shader->selector->info.writes_psize ||
2403 shader->selector->info.writes_edgeflag ||
2404 shader->selector->info.writes_viewport_index ||
2405 shader->selector->info.writes_layer) {
2406 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2407 (shader->selector->info.writes_edgeflag << 1) |
2408 (shader->selector->info.writes_layer << 2);
2409
2410 pos_args[1].valid_mask = 0; /* EXEC mask */
2411 pos_args[1].done = 0; /* last export? */
2412 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2413 pos_args[1].compr = 0; /* COMPR flag */
2414 pos_args[1].out[0] = base->zero; /* X */
2415 pos_args[1].out[1] = base->zero; /* Y */
2416 pos_args[1].out[2] = base->zero; /* Z */
2417 pos_args[1].out[3] = base->zero; /* W */
2418
2419 if (shader->selector->info.writes_psize)
2420 pos_args[1].out[0] = psize_value;
2421
2422 if (shader->selector->info.writes_edgeflag) {
2423 /* The output is a float, but the hw expects an integer
2424 * with the edge flag in the lowest bit. */
2425 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2426 edgeflag_value,
2427 ctx->i32, "");
2428 edgeflag_value = lp_build_min(&bld_base->int_bld,
2429 edgeflag_value,
2430 ctx->i32_1);
2431
2432 /* The LLVM intrinsic expects a float. */
2433 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2434 edgeflag_value,
2435 ctx->f32, "");
2436 }
2437
2438 if (ctx->screen->b.chip_class >= GFX9) {
2439 /* GFX9 has the layer in out.z[10:0] and the viewport
2440 * index in out.z[19:16].
2441 */
2442 if (shader->selector->info.writes_layer)
2443 pos_args[1].out[2] = layer_value;
2444
2445 if (shader->selector->info.writes_viewport_index) {
2446 LLVMValueRef v = viewport_index_value;
2447
2448 v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
2449 v = LLVMBuildShl(ctx->gallivm.builder, v,
2450 LLVMConstInt(ctx->i32, 16, 0), "");
2451 v = LLVMBuildOr(ctx->gallivm.builder, v,
2452 bitcast(bld_base, TGSI_TYPE_UNSIGNED,
2453 pos_args[1].out[2]), "");
2454 pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
2455 pos_args[1].enabled_channels |= 1 << 2;
2456 }
2457 } else {
2458 if (shader->selector->info.writes_layer)
2459 pos_args[1].out[2] = layer_value;
2460
2461 if (shader->selector->info.writes_viewport_index) {
2462 pos_args[1].out[3] = viewport_index_value;
2463 pos_args[1].enabled_channels |= 1 << 3;
2464 }
2465 }
2466 }
2467
2468 for (i = 0; i < 4; i++)
2469 if (pos_args[i].out[0])
2470 shader->info.nr_pos_exports++;
2471
2472 pos_idx = 0;
2473 for (i = 0; i < 4; i++) {
2474 if (!pos_args[i].out[0])
2475 continue;
2476
2477 /* Specify the target we are exporting */
2478 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2479
2480 if (pos_idx == shader->info.nr_pos_exports)
2481 /* Specify that this is the last export */
2482 pos_args[i].done = 1;
2483
2484 ac_build_export(&ctx->ac, &pos_args[i]);
2485 }
2486 }
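
/* Illustrative example (not from the source): a VS writing only PSIZE
 * and LAYER ends up with pos_args[1].enabled_channels = 0b101 (X = point
 * size, Z = layer), and the final loop renumbers the surviving exports
 * densely, so the position itself goes to POS0 and the misc vector to
 * POS1.
 */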
2487
2488 /**
2489 * Forward all outputs from the vertex shader to the TES. This is only used
2490 * for the fixed function TCS.
2491 */
2492 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2493 {
2494 struct si_shader_context *ctx = si_shader_context(bld_base);
2495 struct gallivm_state *gallivm = &ctx->gallivm;
2496 LLVMValueRef invocation_id, buffer, buffer_offset;
2497 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2498 uint64_t inputs;
2499
2500 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2501 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2502 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2503
2504 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2505 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2506 lds_vertex_stride, "");
2507 lds_base = get_tcs_in_current_patch_offset(ctx);
2508 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2509
2510 inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2511 while (inputs) {
2512 unsigned i = u_bit_scan64(&inputs);
2513
2514 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2515 LLVMConstInt(ctx->i32, 4 * i, 0),
2516 "");
2517
2518 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2519 get_rel_patch_id(ctx),
2520 invocation_id,
2521 LLVMConstInt(ctx->i32, i, 0));
2522
2523 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2524 lds_ptr);
2525
2526 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2527 buffer_offset, 0, 1, 0, true, false);
2528 }
2529 }
2530
2531 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2532 LLVMValueRef rel_patch_id,
2533 LLVMValueRef invocation_id,
2534 LLVMValueRef tcs_out_current_patch_data_offset)
2535 {
2536 struct si_shader_context *ctx = si_shader_context(bld_base);
2537 struct gallivm_state *gallivm = &ctx->gallivm;
2538 struct si_shader *shader = ctx->shader;
2539 unsigned tess_inner_index, tess_outer_index;
2540 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2541 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2542 unsigned stride, outer_comps, inner_comps, i, offset;
2543 struct lp_build_if_state if_ctx, inner_if_ctx;
2544
2545 si_llvm_emit_barrier(NULL, bld_base, NULL);
2546
2547 /* Do this only for invocation 0, because the tess levels are per-patch,
2548 * not per-vertex.
2549 *
2550 * This branch can't be a real jump, because invocation 0 executes it;
2551 * it should at least mask out the loads and stores for other invocations.
2552 */
2553 lp_build_if(&if_ctx, gallivm,
2554 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2555 invocation_id, ctx->i32_0, ""));
2556
2557 /* Determine the layout of one tess factor element in the buffer. */
2558 switch (shader->key.part.tcs.epilog.prim_mode) {
2559 case PIPE_PRIM_LINES:
2560 stride = 2; /* 2 dwords, 1 vec2 store */
2561 outer_comps = 2;
2562 inner_comps = 0;
2563 break;
2564 case PIPE_PRIM_TRIANGLES:
2565 stride = 4; /* 4 dwords, 1 vec4 store */
2566 outer_comps = 3;
2567 inner_comps = 1;
2568 break;
2569 case PIPE_PRIM_QUADS:
2570 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2571 outer_comps = 4;
2572 inner_comps = 2;
2573 break;
2574 default:
2575 assert(0);
2576 return;
2577 }
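
/* For illustration: with PIPE_PRIM_QUADS each patch writes 6 dwords of
 * tess factors, outer[0..3] followed by inner[0..1], stored below as one
 * vec4 plus one vec2; on SI-VI the buffer is additionally headed by a
 * single dynamic HS control word (0x80000000).
 */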
2578
2579 /* Load tess_inner and tess_outer from LDS.
2580 * Any invocation can write them, so we can't get them from a temporary.
2581 */
2582 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
2583 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
2584
2585 lds_base = tcs_out_current_patch_data_offset;
2586 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2587 LLVMConstInt(ctx->i32,
2588 tess_inner_index * 4, 0), "");
2589 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2590 LLVMConstInt(ctx->i32,
2591 tess_outer_index * 4, 0), "");
2592
2593 for (i = 0; i < 4; i++) {
2594 inner[i] = LLVMGetUndef(ctx->i32);
2595 outer[i] = LLVMGetUndef(ctx->i32);
2596 }
2597
2598 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2599 /* For isolines, the hardware expects tess factors in the
2600 * reverse order from what GLSL / TGSI specify.
2601 */
2602 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2603 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2604 } else {
2605 for (i = 0; i < outer_comps; i++) {
2606 outer[i] = out[i] =
2607 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2608 }
2609 for (i = 0; i < inner_comps; i++) {
2610 inner[i] = out[outer_comps+i] =
2611 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2612 }
2613 }
2614
2615 /* Convert the outputs to vectors for stores. */
2616 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2617 vec1 = NULL;
2618
2619 if (stride > 4)
2620 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2621
2622 /* Get the buffer. */
2623 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2624
2625 /* Get the offset. */
2626 tf_base = LLVMGetParam(ctx->main_fn,
2627 ctx->param_tcs_factor_offset);
2628 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2629 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2630
2631 lp_build_if(&inner_if_ctx, gallivm,
2632 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2633 rel_patch_id, ctx->i32_0, ""));
2634
2635 /* Store the dynamic HS control word. */
2636 offset = 0;
2637 if (ctx->screen->b.chip_class <= VI) {
2638 ac_build_buffer_store_dword(&ctx->ac, buffer,
2639 LLVMConstInt(ctx->i32, 0x80000000, 0),
2640 1, ctx->i32_0, tf_base,
2641 offset, 1, 0, true, false);
2642 offset += 4;
2643 }
2644
2645 lp_build_endif(&inner_if_ctx);
2646
2647 /* Store the tessellation factors. */
2648 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2649 MIN2(stride, 4), byteoffset, tf_base,
2650 offset, 1, 0, true, false);
2651 offset += 16;
2652 if (vec1)
2653 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2654 stride - 4, byteoffset, tf_base,
2655 offset, 1, 0, true, false);
2656
2657 /* Store the tess factors into the offchip buffer if TES reads them. */
2658 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2659 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2660 LLVMValueRef tf_inner_offset;
2661 unsigned param_outer, param_inner;
2662
2663 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2664 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2665
2666 param_outer = si_shader_io_get_unique_index_patch(
2667 TGSI_SEMANTIC_TESSOUTER, 0);
2668 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2669 LLVMConstInt(ctx->i32, param_outer, 0));
2670
2671 outer_vec = lp_build_gather_values(gallivm, outer,
2672 util_next_power_of_two(outer_comps));
2673
2674 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2675 outer_comps, tf_outer_offset,
2676 base, 0, 1, 0, true, false);
2677 if (inner_comps) {
2678 param_inner = si_shader_io_get_unique_index_patch(
2679 TGSI_SEMANTIC_TESSINNER, 0);
2680 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2681 LLVMConstInt(ctx->i32, param_inner, 0));
2682
2683 inner_vec = inner_comps == 1 ? inner[0] :
2684 lp_build_gather_values(gallivm, inner, inner_comps);
2685 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2686 inner_comps, tf_inner_offset,
2687 base, 0, 1, 0, true, false);
2688 }
2689 }
2690
2691 lp_build_endif(&if_ctx);
2692 }
2693
2694 static LLVMValueRef
2695 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2696 unsigned param, unsigned return_index)
2697 {
2698 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2699 LLVMGetParam(ctx->main_fn, param),
2700 return_index, "");
2701 }
2702
2703 static LLVMValueRef
2704 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2705 unsigned param, unsigned return_index)
2706 {
2707 LLVMBuilderRef builder = ctx->gallivm.builder;
2708 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2709
2710 return LLVMBuildInsertValue(builder, ret,
2711 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2712 return_index, "");
2713 }
2714
2715 static LLVMValueRef
2716 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2717 unsigned param, unsigned return_index)
2718 {
2719 LLVMBuilderRef builder = ctx->gallivm.builder;
2720 LLVMValueRef ptr, lo, hi;
2721
2722 ptr = LLVMGetParam(ctx->main_fn, param);
2723 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2724 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2725 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2726 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2727 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2728 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2729 }
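
/* Minimal sketch of the helper above with an assumed pointer value:
 * 0x123456789abc viewed as v2i32 on little-endian hardware yields
 * lo = 0x56789abc placed at return_index and hi = 0x00001234 at
 * return_index + 1, which is how a 64-bit descriptor pointer travels
 * through two consecutive SGPR return slots.
 */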
2730
2731 /* This only writes the tessellation factor levels. */
2732 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2733 {
2734 struct si_shader_context *ctx = si_shader_context(bld_base);
2735 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2736
2737 si_copy_tcs_inputs(bld_base);
2738
2739 rel_patch_id = get_rel_patch_id(ctx);
2740 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2741 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2742
2743 /* Return epilog parameters from this function. */
2744 LLVMBuilderRef builder = ctx->gallivm.builder;
2745 LLVMValueRef ret = ctx->return_value;
2746 unsigned vgpr;
2747
2748 if (ctx->screen->b.chip_class >= GFX9) {
2749 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2750 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2751 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2752 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2753 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2754 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2755 /* Tess offchip and tess factor offsets are at the beginning. */
2756 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2757 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2758 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
2759 } else {
2760 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2761 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2762 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2763 GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2764 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2765 GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
2766 /* Tess offchip and tess factor offsets are after user SGPRs. */
2767 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
2768 GFX6_TCS_NUM_USER_SGPR);
2769 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
2770 GFX6_TCS_NUM_USER_SGPR + 1);
2771 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2772 }
2773
2774 /* VGPRs */
2775 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2776 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2777 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2778
2779 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2780 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2781 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2782 ctx->return_value = ret;
2783 }
2784
2785 /* Pass TCS inputs from LS to TCS on GFX9. */
2786 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2787 {
2788 LLVMValueRef ret = ctx->return_value;
2789
2790 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2791 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2792 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2793 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2794 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2795
2796 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2797 8 + SI_SGPR_VS_STATE_BITS);
2798 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2799 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2800 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2801 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2802 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2803 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2804 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2805 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2806 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2807 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2808
2809 unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
2810 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2811 8 + GFX9_SGPR_TCS_CONST_BUFFERS);
2812 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2813 8 + GFX9_SGPR_TCS_SAMPLERS);
2814 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2815 8 + GFX9_SGPR_TCS_IMAGES);
2816 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2817 8 + GFX9_SGPR_TCS_SHADER_BUFFERS);
2818
2819 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2820 ret = si_insert_input_ret_float(ctx, ret,
2821 ctx->param_tcs_patch_id, vgpr++);
2822 ret = si_insert_input_ret_float(ctx, ret,
2823 ctx->param_tcs_rel_ids, vgpr++);
2824 ctx->return_value = ret;
2825 }
2826
2827 /* Pass GS inputs from ES to GS on GFX9. */
2828 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2829 {
2830 LLVMValueRef ret = ctx->return_value;
2831
2832 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2833 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2834 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2835
2836 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2837
2838 unsigned desc_param = ctx->param_vs_state_bits + 1;
2839 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2840 8 + GFX9_SGPR_GS_CONST_BUFFERS);
2841 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2842 8 + GFX9_SGPR_GS_SAMPLERS);
2843 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2844 8 + GFX9_SGPR_GS_IMAGES);
2845 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2846 8 + GFX9_SGPR_GS_SHADER_BUFFERS);
2847
2848 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2849 for (unsigned i = 0; i < 5; i++) {
2850 unsigned param = ctx->param_gs_vtx01_offset + i;
2851 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2852 }
2853 ctx->return_value = ret;
2854 }
2855
2856 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2857 {
2858 struct si_shader_context *ctx = si_shader_context(bld_base);
2859 struct si_shader *shader = ctx->shader;
2860 struct tgsi_shader_info *info = &shader->selector->info;
2861 struct gallivm_state *gallivm = &ctx->gallivm;
2862 unsigned i, chan;
2863 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2864 ctx->param_rel_auto_id);
2865 LLVMValueRef vertex_dw_stride =
2866 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2867 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2868 vertex_dw_stride, "");
2869
2870 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2871 * its inputs from it. */
2872 for (i = 0; i < info->num_outputs; i++) {
2873 LLVMValueRef *out_ptr = ctx->outputs[i];
2874 unsigned name = info->output_semantic_name[i];
2875 unsigned index = info->output_semantic_index[i];
2876
2877 /* The ARB_shader_viewport_layer_array spec contains the
2878 * following issue:
2879 *
2880 * 2) What happens if gl_ViewportIndex or gl_Layer is
2881 * written in the vertex shader and a geometry shader is
2882 * present?
2883 *
2884 * RESOLVED: The value written by the last vertex processing
2885 * stage is used. If the last vertex processing stage
2886 * (vertex, tessellation evaluation or geometry) does not
2887 * statically assign to gl_ViewportIndex or gl_Layer, index
2888 * or layer zero is assumed.
2889 *
2890 * So writes to those outputs in VS-as-LS are simply ignored.
2891 */
2892 if (name == TGSI_SEMANTIC_LAYER ||
2893 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2894 continue;
2895
2896 int param = si_shader_io_get_unique_index(name, index);
2897 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2898 LLVMConstInt(ctx->i32, param * 4, 0), "");
2899
2900 for (chan = 0; chan < 4; chan++) {
2901 lds_store(bld_base, chan, dw_addr,
2902 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2903 }
2904 }
2905
2906 if (ctx->screen->b.chip_class >= GFX9)
2907 si_set_ls_return_value_for_tcs(ctx);
2908 }
2909
2910 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2911 {
2912 struct si_shader_context *ctx = si_shader_context(bld_base);
2913 struct gallivm_state *gallivm = &ctx->gallivm;
2914 struct si_shader *es = ctx->shader;
2915 struct tgsi_shader_info *info = &es->selector->info;
2916 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2917 ctx->param_es2gs_offset);
2918 LLVMValueRef lds_base = NULL;
2919 unsigned chan;
2920 int i;
2921
2922 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2923 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2924 lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
2925 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2926 }
2927
2928 for (i = 0; i < info->num_outputs; i++) {
2929 LLVMValueRef *out_ptr = ctx->outputs[i];
2930 int param;
2931
2932 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2933 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2934 continue;
2935
2936 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2937 info->output_semantic_index[i]);
2938
2939 for (chan = 0; chan < 4; chan++) {
2940 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2941 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2942
2943 /* GFX9 has the ESGS ring in LDS. */
2944 if (ctx->screen->b.chip_class >= GFX9) {
2945 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
2946 continue;
2947 }
2948
2949 ac_build_buffer_store_dword(&ctx->ac,
2950 ctx->esgs_ring,
2951 out_val, 1, NULL, soffset,
2952 (4 * param + chan) * 4,
2953 1, 1, true, true);
2954 }
2955 }
2956
2957 if (ctx->screen->b.chip_class >= GFX9)
2958 si_set_es_return_value_for_gs(ctx);
2959 }
2960
2961 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2962 {
2963 if (ctx->screen->b.chip_class >= GFX9)
2964 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2965 else
2966 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2967 }
2968
2969 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2970 {
2971 struct si_shader_context *ctx = si_shader_context(bld_base);
2972
2973 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2974 si_get_gs_wave_id(ctx));
2975 }
2976
2977 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2978 {
2979 struct si_shader_context *ctx = si_shader_context(bld_base);
2980 struct gallivm_state *gallivm = &ctx->gallivm;
2981 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2982 struct si_shader_output_values *outputs = NULL;
2983 int i,j;
2984
2985 assert(!ctx->shader->is_gs_copy_shader);
2986
2987 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2988
2989 /* Vertex color clamping.
2990 *
2991 * The clamp state is a constant loaded from a user data SGPR, and
2992 * an IF statement clamps all color outputs when the constant is
2993 * true.
2994 */
2995 if (ctx->type == PIPE_SHADER_VERTEX) {
2996 struct lp_build_if_state if_ctx;
2997 LLVMValueRef cond = NULL;
2998 LLVMValueRef addr, val;
2999
3000 for (i = 0; i < info->num_outputs; i++) {
3001 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3002 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3003 continue;
3004
3005 /* We've found a color. */
3006 if (!cond) {
3007 /* The state is in the first bit of the user SGPR. */
3008 cond = LLVMGetParam(ctx->main_fn,
3009 ctx->param_vs_state_bits);
3010 cond = LLVMBuildTrunc(gallivm->builder, cond,
3011 ctx->i1, "");
3012 lp_build_if(&if_ctx, gallivm, cond);
3013 }
3014
3015 for (j = 0; j < 4; j++) {
3016 addr = ctx->outputs[i][j];
3017 val = LLVMBuildLoad(gallivm->builder, addr, "");
3018 val = ac_build_clamp(&ctx->ac, val);
3019 LLVMBuildStore(gallivm->builder, val, addr);
3020 }
3021 }
3022
3023 if (cond)
3024 lp_build_endif(&if_ctx);
3025 }
3026
3027 for (i = 0; i < info->num_outputs; i++) {
3028 outputs[i].semantic_name = info->output_semantic_name[i];
3029 outputs[i].semantic_index = info->output_semantic_index[i];
3030
3031 for (j = 0; j < 4; j++) {
3032 outputs[i].values[j] =
3033 LLVMBuildLoad(gallivm->builder,
3034 ctx->outputs[i][j],
3035 "");
3036 outputs[i].vertex_stream[j] =
3037 (info->output_streams[i] >> (2 * j)) & 3;
3038 }
3039 }
3040
3041 if (ctx->shader->selector->so.num_outputs)
3042 si_llvm_emit_streamout(ctx, outputs, i, 0);
3043
3044 /* Export PrimitiveID. */
3045 if (ctx->shader->key.mono.vs_export_prim_id) {
3046 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3047 outputs[i].semantic_index = 0;
3048 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
3049 get_primitive_id(bld_base, 0));
3050 for (j = 1; j < 4; j++)
3051 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3052
3053 memset(outputs[i].vertex_stream, 0,
3054 sizeof(outputs[i].vertex_stream));
3055 i++;
3056 }
3057
3058 si_llvm_export_vs(bld_base, outputs, i);
3059 FREE(outputs);
3060 }
3061
3062 struct si_ps_exports {
3063 unsigned num;
3064 struct ac_export_args args[10];
3065 };
3066
3067 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3068 bool writes_samplemask)
3069 {
3070 if (writes_z) {
3071 /* Z needs 32 bits. */
3072 if (writes_samplemask)
3073 return V_028710_SPI_SHADER_32_ABGR;
3074 else if (writes_stencil)
3075 return V_028710_SPI_SHADER_32_GR;
3076 else
3077 return V_028710_SPI_SHADER_32_R;
3078 } else if (writes_stencil || writes_samplemask) {
3079 /* Both stencil and sample mask need only 16 bits. */
3080 return V_028710_SPI_SHADER_UINT16_ABGR;
3081 } else {
3082 return V_028710_SPI_SHADER_ZERO;
3083 }
3084 }
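
/* Usage example (hypothetical inputs): writes_z = false with
 * writes_stencil = true selects V_028710_SPI_SHADER_UINT16_ABGR, while
 * writes_z = true alone selects V_028710_SPI_SHADER_32_R; the cheapest
 * format that still holds every written value wins.
 */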
3085
3086 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3087 LLVMValueRef depth, LLVMValueRef stencil,
3088 LLVMValueRef samplemask, struct si_ps_exports *exp)
3089 {
3090 struct si_shader_context *ctx = si_shader_context(bld_base);
3091 struct lp_build_context *base = &bld_base->base;
3092 struct ac_export_args args;
3093 unsigned mask = 0;
3094 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3095 stencil != NULL,
3096 samplemask != NULL);
3097
3098 assert(depth || stencil || samplemask);
3099
3100 args.valid_mask = 1; /* whether the EXEC mask is valid */
3101 args.done = 1; /* DONE bit */
3102
3103 /* Specify the target we are exporting */
3104 args.target = V_008DFC_SQ_EXP_MRTZ;
3105
3106 args.compr = 0; /* COMPR flag */
3107 args.out[0] = base->undef; /* R, depth */
3108 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3109 args.out[2] = base->undef; /* B, sample mask */
3110 args.out[3] = base->undef; /* A, alpha to mask */
3111
3112 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3113 assert(!depth);
3114 args.compr = 1; /* COMPR flag */
3115
3116 if (stencil) {
3117 /* Stencil should be in X[23:16]. */
3118 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3119 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3120 LLVMConstInt(ctx->i32, 16, 0), "");
3121 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3122 mask |= 0x3;
3123 }
3124 if (samplemask) {
3125 /* SampleMask should be in Y[15:0]. */
3126 args.out[1] = samplemask;
3127 mask |= 0xc;
3128 }
3129 } else {
3130 if (depth) {
3131 args.out[0] = depth;
3132 mask |= 0x1;
3133 }
3134 if (stencil) {
3135 args.out[1] = stencil;
3136 mask |= 0x2;
3137 }
3138 if (samplemask) {
3139 args.out[2] = samplemask;
3140 mask |= 0x4;
3141 }
3142 }
3143
3144 /* SI (except OLAND and HAINAN) has a bug where it only looks
3145 * at the X writemask component. */
3146 if (ctx->screen->b.chip_class == SI &&
3147 ctx->screen->b.family != CHIP_OLAND &&
3148 ctx->screen->b.family != CHIP_HAINAN)
3149 mask |= 0x1;
3150
3151 /* Specify which components to enable */
3152 args.enabled_channels = mask;
3153
3154 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3155 }
3156
3157 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3158 LLVMValueRef *color, unsigned index,
3159 unsigned samplemask_param,
3160 bool is_last, struct si_ps_exports *exp)
3161 {
3162 struct si_shader_context *ctx = si_shader_context(bld_base);
3163 struct lp_build_context *base = &bld_base->base;
3164 int i;
3165
3166 /* Clamp color */
3167 if (ctx->shader->key.part.ps.epilog.clamp_color)
3168 for (i = 0; i < 4; i++)
3169 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3170
3171 /* Alpha to one */
3172 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3173 color[3] = base->one;
3174
3175 /* Alpha test */
3176 if (index == 0 &&
3177 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3178 si_alpha_test(bld_base, color[3]);
3179
3180 /* Line & polygon smoothing */
3181 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3182 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3183 samplemask_param);
3184
3185 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3186 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3187 struct ac_export_args args[8];
3188 int c, last = -1;
3189
3190 /* Get the export arguments, also find out what the last one is. */
3191 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3192 si_llvm_init_export_args(bld_base, color,
3193 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3194 if (args[c].enabled_channels)
3195 last = c;
3196 }
3197
3198 /* Emit all exports. */
3199 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3200 if (is_last && last == c) {
3201 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3202 args[c].done = 1; /* DONE bit */
3203 } else if (!args[c].enabled_channels)
3204 continue; /* unnecessary NULL export */
3205
3206 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3207 }
3208 } else {
3209 struct ac_export_args args;
3210
3211 /* Export */
3212 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3213 &args);
3214 if (is_last) {
3215 args.valid_mask = 1; /* whether the EXEC mask is valid */
3216 args.done = 1; /* DONE bit */
3217 } else if (!args.enabled_channels)
3218 return; /* unnecessary NULL export */
3219
3220 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3221 }
3222 }
3223
3224 static void si_emit_ps_exports(struct si_shader_context *ctx,
3225 struct si_ps_exports *exp)
3226 {
3227 for (unsigned i = 0; i < exp->num; i++)
3228 ac_build_export(&ctx->ac, &exp->args[i]);
3229 }
3230
3231 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3232 {
3233 struct si_shader_context *ctx = si_shader_context(bld_base);
3234 struct lp_build_context *base = &bld_base->base;
3235 struct ac_export_args args;
3236
3237 args.enabled_channels = 0x0; /* enabled channels */
3238 args.valid_mask = 1; /* whether the EXEC mask is valid */
3239 args.done = 1; /* DONE bit */
3240 args.target = V_008DFC_SQ_EXP_NULL;
3241 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3242 args.out[0] = base->undef; /* R */
3243 args.out[1] = base->undef; /* G */
3244 args.out[2] = base->undef; /* B */
3245 args.out[3] = base->undef; /* A */
3246
3247 ac_build_export(&ctx->ac, &args);
3248 }
3249
3250 /**
3251 * Return PS outputs in this order:
3252 *
3253 * v[0:3] = color0.xyzw
3254 * v[4:7] = color1.xyzw
3255 * ...
3256 * vN+0 = Depth
3257 * vN+1 = Stencil
3258 * vN+2 = SampleMask
3259 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3260 *
3261 * The alpha-ref SGPR is returned via its original location.
3262 */
3263 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3264 {
3265 struct si_shader_context *ctx = si_shader_context(bld_base);
3266 struct si_shader *shader = ctx->shader;
3267 struct tgsi_shader_info *info = &shader->selector->info;
3268 LLVMBuilderRef builder = ctx->gallivm.builder;
3269 unsigned i, j, first_vgpr, vgpr;
3270
3271 LLVMValueRef color[8][4] = {};
3272 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3273 LLVMValueRef ret;
3274
3275 /* Read the output values. */
3276 for (i = 0; i < info->num_outputs; i++) {
3277 unsigned semantic_name = info->output_semantic_name[i];
3278 unsigned semantic_index = info->output_semantic_index[i];
3279
3280 switch (semantic_name) {
3281 case TGSI_SEMANTIC_COLOR:
3282 assert(semantic_index < 8);
3283 for (j = 0; j < 4; j++) {
3284 LLVMValueRef ptr = ctx->outputs[i][j];
3285 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3286 color[semantic_index][j] = result;
3287 }
3288 break;
3289 case TGSI_SEMANTIC_POSITION:
3290 depth = LLVMBuildLoad(builder,
3291 ctx->outputs[i][2], "");
3292 break;
3293 case TGSI_SEMANTIC_STENCIL:
3294 stencil = LLVMBuildLoad(builder,
3295 ctx->outputs[i][1], "");
3296 break;
3297 case TGSI_SEMANTIC_SAMPLEMASK:
3298 samplemask = LLVMBuildLoad(builder,
3299 ctx->outputs[i][0], "");
3300 break;
3301 default:
3302 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3303 semantic_name);
3304 }
3305 }
3306
3307 /* Fill the return structure. */
3308 ret = ctx->return_value;
3309
3310 /* Set SGPRs. */
3311 ret = LLVMBuildInsertValue(builder, ret,
3312 bitcast(bld_base, TGSI_TYPE_SIGNED,
3313 LLVMGetParam(ctx->main_fn,
3314 SI_PARAM_ALPHA_REF)),
3315 SI_SGPR_ALPHA_REF, "");
3316
3317 /* Set VGPRs */
3318 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3319 for (i = 0; i < ARRAY_SIZE(color); i++) {
3320 if (!color[i][0])
3321 continue;
3322
3323 for (j = 0; j < 4; j++)
3324 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3325 }
3326 if (depth)
3327 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3328 if (stencil)
3329 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3330 if (samplemask)
3331 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3332
3333 /* Add the input sample mask for smoothing at the end. */
3334 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3335 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3336 ret = LLVMBuildInsertValue(builder, ret,
3337 LLVMGetParam(ctx->main_fn,
3338 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3339
3340 ctx->return_value = ret;
3341 }
3342
3343 /**
3344 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3345 * buffer in number of elements and return it as an i32.
3346 */
3347 static LLVMValueRef get_buffer_size(
3348 struct lp_build_tgsi_context *bld_base,
3349 LLVMValueRef descriptor)
3350 {
3351 struct si_shader_context *ctx = si_shader_context(bld_base);
3352 struct gallivm_state *gallivm = &ctx->gallivm;
3353 LLVMBuilderRef builder = gallivm->builder;
3354 LLVMValueRef size =
3355 LLVMBuildExtractElement(builder, descriptor,
3356 LLVMConstInt(ctx->i32, 2, 0), "");
3357
3358 if (ctx->screen->b.chip_class == VI) {
3359 /* On VI, the descriptor contains the size in bytes,
3360 * but TXQ must return the size in elements.
3361 * The stride is always non-zero for resources using TXQ.
3362 */
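/* In the descriptor, num_records is dword 2 (extracted above) and the
 * stride sits in bits [29:16] of dword 1, hence the shift/mask below:
 * elements = size_in_bytes / ((dword1 >> 16) & 0x3fff).
 */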
3363 LLVMValueRef stride =
3364 LLVMBuildExtractElement(builder, descriptor,
3365 ctx->i32_1, "");
3366 stride = LLVMBuildLShr(builder, stride,
3367 LLVMConstInt(ctx->i32, 16, 0), "");
3368 stride = LLVMBuildAnd(builder, stride,
3369 LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3370
3371 size = LLVMBuildUDiv(builder, size, stride, "");
3372 }
3373
3374 return size;
3375 }
3376
3377 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3378 struct lp_build_tgsi_context *bld_base,
3379 struct lp_build_emit_data *emit_data);
3380
3381 /* Prevent optimizations (at least of memory accesses) across the current
3382 * point in the program by emitting empty inline assembly that is marked as
3383 * having side effects.
3384 *
3385 * Optionally, a value can be passed through the inline assembly to prevent
3386 * LLVM from hoisting calls to ReadNone functions.
3387 */
3388 static void emit_optimization_barrier(struct si_shader_context *ctx,
3389 LLVMValueRef *pvgpr)
3390 {
3391 static int counter = 0;
3392
3393 LLVMBuilderRef builder = ctx->gallivm.builder;
3394 char code[16];
3395
3396 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3397
3398 if (!pvgpr) {
3399 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3400 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3401 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3402 } else {
3403 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3404 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3405 LLVMValueRef vgpr = *pvgpr;
3406 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3407 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3408 LLVMValueRef vgpr0;
3409
3410 assert(vgpr_size % 4 == 0);
3411
3412 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3413 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3414 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3415 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3416 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3417
3418 *pvgpr = vgpr;
3419 }
3420 }
3421
3422 /* Combine these with & instead of |. */
3423 #define NOOP_WAITCNT 0xf7f
3424 #define LGKM_CNT 0x07f
3425 #define VM_CNT 0xf70
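/* A worked example of the &-combining, assuming the GCN s_waitcnt simm16
 * encoding (vmcnt in [3:0], expcnt in [6:4], lgkmcnt in [11:8]; a field
 * at its maximum means "don't wait on that counter"):
 *
 *   NOOP_WAITCNT & VM_CNT & LGKM_CNT = 0x070
 *
 * i.e. vmcnt = 0 and lgkmcnt = 0 (wait for both to drain) while expcnt
 * keeps its no-wait value of 7.
 */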
3426
3427 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3428 {
3429 struct gallivm_state *gallivm = &ctx->gallivm;
3430 LLVMBuilderRef builder = gallivm->builder;
3431 LLVMValueRef args[1] = {
3432 LLVMConstInt(ctx->i32, simm16, 0)
3433 };
3434 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3435 ctx->voidt, args, 1, 0);
3436 }
3437
3438 static void membar_emit(
3439 const struct lp_build_tgsi_action *action,
3440 struct lp_build_tgsi_context *bld_base,
3441 struct lp_build_emit_data *emit_data)
3442 {
3443 struct si_shader_context *ctx = si_shader_context(bld_base);
3444 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3445 unsigned flags = LLVMConstIntGetZExtValue(src0);
3446 unsigned waitcnt = NOOP_WAITCNT;
3447
3448 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3449 waitcnt &= VM_CNT & LGKM_CNT;
3450
3451 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3452 TGSI_MEMBAR_SHADER_BUFFER |
3453 TGSI_MEMBAR_SHADER_IMAGE))
3454 waitcnt &= VM_CNT;
3455
3456 if (flags & TGSI_MEMBAR_SHARED)
3457 waitcnt &= LGKM_CNT;
3458
3459 if (waitcnt != NOOP_WAITCNT)
3460 emit_waitcnt(ctx, waitcnt);
3461 }
3462
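/* Emit TGSI_OPCODE_CLOCK: read the 64-bit cycle counter via
 * llvm.readcyclecounter and return it as two i32 halves (low in .x,
 * high in .y).
 */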
3463 static void clock_emit(
3464 const struct lp_build_tgsi_action *action,
3465 struct lp_build_tgsi_context *bld_base,
3466 struct lp_build_emit_data *emit_data)
3467 {
3468 struct si_shader_context *ctx = si_shader_context(bld_base);
3469 struct gallivm_state *gallivm = &ctx->gallivm;
3470 LLVMValueRef tmp;
3471
3472 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3473 ctx->i64, NULL, 0, 0);
3474 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3475
3476 emit_data->output[0] =
3477 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3478 emit_data->output[1] =
3479 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3480 }
3481
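/* Fetch the v4i32 resource descriptor for the shader buffer (SSBO)
 * addressed by \p reg, bounding indirect indices to
 * SI_NUM_SHADER_BUFFERS.
 */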
3482 static LLVMValueRef
3483 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3484 const struct tgsi_full_src_register *reg)
3485 {
3486 LLVMValueRef index;
3487 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3488 ctx->param_shader_buffers);
3489
3490 if (!reg->Register.Indirect)
3491 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3492 else
3493 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3494 reg->Register.Index,
3495 SI_NUM_SHADER_BUFFERS);
3496
3497 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3498 }
3499
3500 static bool tgsi_is_array_sampler(unsigned target)
3501 {
3502 return target == TGSI_TEXTURE_1D_ARRAY ||
3503 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3504 target == TGSI_TEXTURE_2D_ARRAY ||
3505 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3506 target == TGSI_TEXTURE_CUBE_ARRAY ||
3507 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3508 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3509 }
3510
3511 static bool tgsi_is_array_image(unsigned target)
3512 {
3513 return target == TGSI_TEXTURE_3D ||
3514 target == TGSI_TEXTURE_CUBE ||
3515 target == TGSI_TEXTURE_1D_ARRAY ||
3516 target == TGSI_TEXTURE_2D_ARRAY ||
3517 target == TGSI_TEXTURE_CUBE_ARRAY ||
3518 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3519 }
3520
3521 /**
3522 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3523 *
3524 * At least on Tonga, executing image stores on images with DCC enabled and
3525 * in a non-trivial state can eventually lead to lockups. This can occur when an
3526 * application binds an image as read-only but then uses a shader that writes
3527 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3528 * program termination) in this case, but it doesn't cost much to be a bit
3529 * nicer: disabling DCC in the shader still leads to undefined results but
3530 * avoids the lockup.
3531 */
3532 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3533 LLVMValueRef rsrc)
3534 {
3535 if (ctx->screen->b.chip_class <= CIK) {
3536 return rsrc;
3537 } else {
3538 LLVMBuilderRef builder = ctx->gallivm.builder;
3539 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3540 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3541 LLVMValueRef tmp;
3542
3543 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3544 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3545 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3546 }
3547 }
3548
3549 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3550 {
3551 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3552 CONST_ADDR_SPACE);
3553 }
3554
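/* Load an image descriptor from the shader's image list. Image
 * descriptors are 8 dwords, but a texel buffer only needs the v4i32 in
 * the second half, hence the index * 2 + 1 addressing with the list
 * reinterpreted as v4i32 elements.
 */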
3555 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3556 LLVMValueRef list, LLVMValueRef index,
3557 unsigned target)
3558 {
3559 LLVMBuilderRef builder = ctx->gallivm.builder;
3560
3561 if (target == TGSI_TEXTURE_BUFFER) {
3562 index = LLVMBuildMul(builder, index,
3563 LLVMConstInt(ctx->i32, 2, 0), "");
3564 index = LLVMBuildAdd(builder, index,
3565 ctx->i32_1, "");
3566 list = LLVMBuildPointerCast(builder, list,
3567 const_array(ctx->v4i32, 0), "");
3568 }
3569
3570 return ac_build_indexed_load_const(&ctx->ac, list, index);
3571 }
3572
3573 /**
3574 * Load the resource descriptor for \p image.
3575 */
3576 static void
3577 image_fetch_rsrc(
3578 struct lp_build_tgsi_context *bld_base,
3579 const struct tgsi_full_src_register *image,
3580 bool is_store, unsigned target,
3581 LLVMValueRef *rsrc)
3582 {
3583 struct si_shader_context *ctx = si_shader_context(bld_base);
3584 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3585 ctx->param_images);
3586 LLVMValueRef index;
3587 bool dcc_off = is_store;
3588
3589 assert(image->Register.File == TGSI_FILE_IMAGE);
3590
3591 if (!image->Register.Indirect) {
3592 const struct tgsi_shader_info *info = bld_base->info;
3593 unsigned images_writemask = info->images_store |
3594 info->images_atomic;
3595
3596 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3597
3598 if (images_writemask & (1 << image->Register.Index))
3599 dcc_off = true;
3600 } else {
3601 /* From the GL_ARB_shader_image_load_store extension spec:
3602 *
3603 * If a shader performs an image load, store, or atomic
3604 * operation using an image variable declared as an array,
3605 * and if the index used to select an individual element is
3606 * negative or greater than or equal to the size of the
3607 * array, the results of the operation are undefined but may
3608 * not lead to termination.
3609 */
3610 index = get_bounded_indirect_index(ctx, &image->Indirect,
3611 image->Register.Index,
3612 SI_NUM_IMAGES);
3613 }
3614
3615 *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3616 if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3617 *rsrc = force_dcc_off(ctx, *rsrc);
3618 }
3619
3620 static LLVMValueRef image_fetch_coords(
3621 struct lp_build_tgsi_context *bld_base,
3622 const struct tgsi_full_instruction *inst,
3623 unsigned src, LLVMValueRef desc)
3624 {
3625 struct si_shader_context *ctx = si_shader_context(bld_base);
3626 struct gallivm_state *gallivm = &ctx->gallivm;
3627 LLVMBuilderRef builder = gallivm->builder;
3628 unsigned target = inst->Memory.Texture;
3629 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3630 LLVMValueRef coords[4];
3631 LLVMValueRef tmp;
3632 int chan;
3633
3634 for (chan = 0; chan < num_coords; ++chan) {
3635 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3636 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3637 coords[chan] = tmp;
3638 }
3639
3640 if (ctx->screen->b.chip_class >= GFX9) {
3641 /* 1D textures are allocated and used as 2D on GFX9. */
3642 if (target == TGSI_TEXTURE_1D) {
3643 coords[1] = ctx->i32_0;
3644 num_coords++;
3645 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3646 coords[2] = coords[1];
3647 coords[1] = ctx->i32_0;
3648 num_coords++;
3649 } else if (target == TGSI_TEXTURE_2D) {
3650 /* The hw can't bind a slice of a 3D image as a 2D
3651 * image, because it ignores BASE_ARRAY if the target
3652 * is 3D. The workaround is to read BASE_ARRAY and set
3653 * it as the 3rd address operand for all 2D images.
3654 */
3655 LLVMValueRef first_layer, const5, mask;
3656
3657 const5 = LLVMConstInt(ctx->i32, 5, 0);
3658 mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3659 first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3660 first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3661
3662 coords[2] = first_layer;
3663 num_coords++;
3664 }
3665 }
3666
3667 if (num_coords == 1)
3668 return coords[0];
3669
3670 if (num_coords == 3) {
3671 /* LLVM has difficulties lowering 3-element vectors. */
3672 coords[3] = bld_base->uint_bld.undef;
3673 num_coords = 4;
3674 }
3675
3676 return lp_build_gather_values(gallivm, coords, num_coords);
3677 }
3678
3679 /**
3680 * Append the extra mode bits that are used by image load and store.
3681 */
3682 static void image_append_args(
3683 struct si_shader_context *ctx,
3684 struct lp_build_emit_data * emit_data,
3685 unsigned target,
3686 bool atomic,
3687 bool force_glc)
3688 {
3689 const struct tgsi_full_instruction *inst = emit_data->inst;
3690 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3691 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3692 LLVMValueRef r128 = i1false;
3693 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3694 LLVMValueRef glc =
3695 force_glc ||
3696 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3697 i1true : i1false;
3698 LLVMValueRef slc = i1false;
3699 LLVMValueRef lwe = i1false;
3700
3701 if (atomic || (HAVE_LLVM <= 0x0309)) {
3702 emit_data->args[emit_data->arg_count++] = r128;
3703 emit_data->args[emit_data->arg_count++] = da;
3704 if (!atomic) {
3705 emit_data->args[emit_data->arg_count++] = glc;
3706 }
3707 emit_data->args[emit_data->arg_count++] = slc;
3708 return;
3709 }
3710
3711 /* HAVE_LLVM >= 0x0400 */
3712 emit_data->args[emit_data->arg_count++] = glc;
3713 emit_data->args[emit_data->arg_count++] = slc;
3714 emit_data->args[emit_data->arg_count++] = lwe;
3715 emit_data->args[emit_data->arg_count++] = da;
3716 }
3717
3718 /**
3719 * Append the resource and indexing arguments for buffer intrinsics.
3720 *
3721 * \param rsrc the v4i32 buffer resource
3722 * \param index index into the buffer (stride-based)
3723 * \param offset byte offset into the buffer
3724 */
3725 static void buffer_append_args(
3726 struct si_shader_context *ctx,
3727 struct lp_build_emit_data *emit_data,
3728 LLVMValueRef rsrc,
3729 LLVMValueRef index,
3730 LLVMValueRef offset,
3731 bool atomic,
3732 bool force_glc)
3733 {
3734 const struct tgsi_full_instruction *inst = emit_data->inst;
3735 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3736 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3737
3738 emit_data->args[emit_data->arg_count++] = rsrc;
3739 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3740 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3741 if (!atomic) {
3742 emit_data->args[emit_data->arg_count++] =
3743 force_glc ||
3744 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3745 i1true : i1false; /* glc */
3746 }
3747 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3748 }
3749
3750 static void load_fetch_args(
3751 struct lp_build_tgsi_context * bld_base,
3752 struct lp_build_emit_data * emit_data)
3753 {
3754 struct si_shader_context *ctx = si_shader_context(bld_base);
3755 struct gallivm_state *gallivm = &ctx->gallivm;
3756 const struct tgsi_full_instruction * inst = emit_data->inst;
3757 unsigned target = inst->Memory.Texture;
3758 LLVMValueRef rsrc;
3759
3760 emit_data->dst_type = ctx->v4f32;
3761
3762 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3763 LLVMBuilderRef builder = gallivm->builder;
3764 LLVMValueRef offset;
3765 LLVMValueRef tmp;
3766
3767 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3768
3769 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3770 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3771
3772 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3773 offset, false, false);
3774 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3775 LLVMValueRef coords;
3776
3777 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3778 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3779
3780 if (target == TGSI_TEXTURE_BUFFER) {
3781 buffer_append_args(ctx, emit_data, rsrc, coords,
3782 ctx->i32_0, false, false);
3783 } else {
3784 emit_data->args[0] = coords;
3785 emit_data->args[1] = rsrc;
3786 emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3787 emit_data->arg_count = 3;
3788
3789 image_append_args(ctx, emit_data, target, false, false);
3790 }
3791 }
3792 }
3793
3794 static unsigned get_load_intr_attribs(bool readonly_memory)
3795 {
3796	/* READNONE means writes can't affect the result (so the load may be
3797	 * reordered across stores), while READONLY means writes can affect it. */
3798 return readonly_memory && HAVE_LLVM >= 0x0400 ?
3799 LP_FUNC_ATTR_READNONE :
3800 LP_FUNC_ATTR_READONLY;
3801 }
3802
3803 static unsigned get_store_intr_attribs(bool writeonly_memory)
3804 {
3805 return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3806 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3807 LP_FUNC_ATTR_WRITEONLY;
3808 }
3809
3810 static void load_emit_buffer(struct si_shader_context *ctx,
3811 struct lp_build_emit_data *emit_data,
3812 bool readonly_memory)
3813 {
3814 const struct tgsi_full_instruction *inst = emit_data->inst;
3815 struct gallivm_state *gallivm = &ctx->gallivm;
3816 LLVMBuilderRef builder = gallivm->builder;
3817 uint writemask = inst->Dst[0].Register.WriteMask;
3818 uint count = util_last_bit(writemask);
3819 const char *intrinsic_name;
3820 LLVMTypeRef dst_type;
3821
3822 switch (count) {
3823 case 1:
3824 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3825 dst_type = ctx->f32;
3826 break;
3827 case 2:
3828 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3829 dst_type = LLVMVectorType(ctx->f32, 2);
3830 break;
3831	default: /* 3 & 4 */
3832 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3833 dst_type = ctx->v4f32;
3834 count = 4;
3835 }
3836
3837 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3838 builder, intrinsic_name, dst_type,
3839 emit_data->args, emit_data->arg_count,
3840 get_load_intr_attribs(readonly_memory));
3841 }
3842
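/* Return a typed pointer into LDS for the TGSI MEMORY file: fetch the
 * address from source operand \p arg, GEP into ctx->shared_memory and
 * cast the result to a pointer to \p type in the LDS address space.
 */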
3843 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3844 const struct tgsi_full_instruction *inst,
3845 LLVMTypeRef type, int arg)
3846 {
3847 struct gallivm_state *gallivm = &ctx->gallivm;
3848 LLVMBuilderRef builder = gallivm->builder;
3849 LLVMValueRef offset, ptr;
3850 int addr_space;
3851
3852 offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3853 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3854
3855 ptr = ctx->shared_memory;
3856 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3857 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3858 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3859
3860 return ptr;
3861 }
3862
3863 static void load_emit_memory(
3864 struct si_shader_context *ctx,
3865 struct lp_build_emit_data *emit_data)
3866 {
3867 const struct tgsi_full_instruction *inst = emit_data->inst;
3868 struct gallivm_state *gallivm = &ctx->gallivm;
3869 LLVMBuilderRef builder = gallivm->builder;
3870 unsigned writemask = inst->Dst[0].Register.WriteMask;
3871 LLVMValueRef channels[4], ptr, derived_ptr, index;
3872 int chan;
3873
3874 ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3875
3876 for (chan = 0; chan < 4; ++chan) {
3877 if (!(writemask & (1 << chan))) {
3878 channels[chan] = LLVMGetUndef(ctx->f32);
3879 continue;
3880 }
3881
3882 index = LLVMConstInt(ctx->i32, chan, 0);
3883 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3884 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3885 }
3886 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3887 }
3888
3889 /**
3890 * Return true if the memory accessed by a LOAD or STORE instruction is
3891 * read-only or write-only, respectively.
3892 *
3893 * \param shader_buffers_reverse_access_mask
3894 * For LOAD, set this to (store | atomic) slot usage in the shader.
3895 * For STORE, set this to (load | atomic) slot usage in the shader.
3896 * \param images_reverse_access_mask Same as above, but for images.
3897 */
3898 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3899 const struct tgsi_shader_info *info,
3900 unsigned shader_buffers_reverse_access_mask,
3901 unsigned images_reverse_access_mask)
3902 {
3903 /* RESTRICT means NOALIAS.
3904 * If there are no writes, we can assume the accessed memory is read-only.
3905 * If there are no reads, we can assume the accessed memory is write-only.
3906 */
3907 if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3908 unsigned reverse_access_mask;
3909
3910 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3911 reverse_access_mask = shader_buffers_reverse_access_mask;
3912 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3913 reverse_access_mask = info->images_buffers &
3914 images_reverse_access_mask;
3915 } else {
3916 reverse_access_mask = ~info->images_buffers &
3917 images_reverse_access_mask;
3918 }
3919
3920 if (inst->Src[0].Register.Indirect) {
3921 if (!reverse_access_mask)
3922 return true;
3923 } else {
3924 if (!(reverse_access_mask &
3925 (1u << inst->Src[0].Register.Index)))
3926 return true;
3927 }
3928 }
3929
3930 /* If there are no buffer writes (for both shader buffers & image
3931 * buffers), it implies that buffer memory is read-only.
3932 * If there are no buffer reads (for both shader buffers & image
3933 * buffers), it implies that buffer memory is write-only.
3934 *
3935 * Same for the case when there are no writes/reads for non-buffer
3936 * images.
3937 */
3938 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3939 (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3940 inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3941 if (!shader_buffers_reverse_access_mask &&
3942 !(info->images_buffers & images_reverse_access_mask))
3943 return true;
3944 } else {
3945 if (!(~info->images_buffers & images_reverse_access_mask))
3946 return true;
3947 }
3948 return false;
3949 }
3950
3951 static void load_emit(
3952 const struct lp_build_tgsi_action *action,
3953 struct lp_build_tgsi_context *bld_base,
3954 struct lp_build_emit_data *emit_data)
3955 {
3956 struct si_shader_context *ctx = si_shader_context(bld_base);
3957 struct gallivm_state *gallivm = &ctx->gallivm;
3958 LLVMBuilderRef builder = gallivm->builder;
3959 const struct tgsi_full_instruction * inst = emit_data->inst;
3960 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3961 char intrinsic_name[64];
3962 bool readonly_memory = false;
3963
3964 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3965 load_emit_memory(ctx, emit_data);
3966 return;
3967 }
3968
3969 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3970 emit_waitcnt(ctx, VM_CNT);
3971
3972 readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3973 is_oneway_access_only(inst, info,
3974 info->shader_buffers_store |
3975 info->shader_buffers_atomic,
3976 info->images_store |
3977 info->images_atomic);
3978
3979 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3980 load_emit_buffer(ctx, emit_data, readonly_memory);
3981 return;
3982 }
3983
3984 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3985 emit_data->output[emit_data->chan] =
3986 lp_build_intrinsic(
3987 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3988 emit_data->args, emit_data->arg_count,
3989 get_load_intr_attribs(readonly_memory));
3990 } else {
3991 ac_get_image_intr_name("llvm.amdgcn.image.load",
3992 emit_data->dst_type, /* vdata */
3993 LLVMTypeOf(emit_data->args[0]), /* coords */
3994 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3995 intrinsic_name, sizeof(intrinsic_name));
3996
3997 emit_data->output[emit_data->chan] =
3998 lp_build_intrinsic(
3999 builder, intrinsic_name, emit_data->dst_type,
4000 emit_data->args, emit_data->arg_count,
4001 get_load_intr_attribs(readonly_memory));
4002 }
4003 }
4004
4005 static void store_fetch_args(
4006 struct lp_build_tgsi_context * bld_base,
4007 struct lp_build_emit_data * emit_data)
4008 {
4009 struct si_shader_context *ctx = si_shader_context(bld_base);
4010 struct gallivm_state *gallivm = &ctx->gallivm;
4011 LLVMBuilderRef builder = gallivm->builder;
4012 const struct tgsi_full_instruction * inst = emit_data->inst;
4013 struct tgsi_full_src_register memory;
4014 LLVMValueRef chans[4];
4015 LLVMValueRef data;
4016 LLVMValueRef rsrc;
4017 unsigned chan;
4018
4019 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
4020
4021 for (chan = 0; chan < 4; ++chan) {
4022 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
4023 }
4024 data = lp_build_gather_values(gallivm, chans, 4);
4025
4026 emit_data->args[emit_data->arg_count++] = data;
4027
4028 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
4029
4030 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4031 LLVMValueRef offset;
4032 LLVMValueRef tmp;
4033
4034 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
4035
4036 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
4037 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4038
4039 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4040 offset, false, false);
4041 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
4042 unsigned target = inst->Memory.Texture;
4043 LLVMValueRef coords;
4044
4045 /* 8bit/16bit TC L1 write corruption bug on SI.
4046 * All store opcodes not aligned to a dword are affected.
4047 *
4048 * The only way to get unaligned stores in radeonsi is through
4049 * shader images.
4050 */
4051 bool force_glc = ctx->screen->b.chip_class == SI;
4052
4053 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
4054 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
4055
4056 if (target == TGSI_TEXTURE_BUFFER) {
4057 buffer_append_args(ctx, emit_data, rsrc, coords,
4058 ctx->i32_0, false, force_glc);
4059 } else {
4060 emit_data->args[1] = coords;
4061 emit_data->args[2] = rsrc;
4062 emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
4063 emit_data->arg_count = 4;
4064
4065 image_append_args(ctx, emit_data, target, false, force_glc);
4066 }
4067 }
4068 }
4069
4070 static void store_emit_buffer(
4071 struct si_shader_context *ctx,
4072 struct lp_build_emit_data *emit_data,
4073 bool writeonly_memory)
4074 {
4075 const struct tgsi_full_instruction *inst = emit_data->inst;
4076 struct gallivm_state *gallivm = &ctx->gallivm;
4077 LLVMBuilderRef builder = gallivm->builder;
4078 LLVMValueRef base_data = emit_data->args[0];
4079 LLVMValueRef base_offset = emit_data->args[3];
4080 unsigned writemask = inst->Dst[0].Register.WriteMask;
4081
4082 while (writemask) {
4083 int start, count;
4084 const char *intrinsic_name;
4085 LLVMValueRef data;
4086 LLVMValueRef offset;
4087 LLVMValueRef tmp;
4088
4089 u_bit_scan_consecutive_range(&writemask, &start, &count);
4090
4091 /* Due to an LLVM limitation, split 3-element writes
4092 * into a 2-element and a 1-element write. */
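/* E.g. writemask 0x7 (xyz): emit a v2f32 store of .xy now and requeue
 * bit 2 so that .z goes out as a separate f32 store on the next
 * iteration.
 */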
4093 if (count == 3) {
4094 writemask |= 1 << (start + 2);
4095 count = 2;
4096 }
4097
4098 if (count == 4) {
4099 data = base_data;
4100 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
4101 } else if (count == 2) {
4102 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
4103
4104 tmp = LLVMBuildExtractElement(
4105 builder, base_data,
4106 LLVMConstInt(ctx->i32, start, 0), "");
4107 data = LLVMBuildInsertElement(
4108 builder, LLVMGetUndef(v2f32), tmp,
4109 ctx->i32_0, "");
4110
4111 tmp = LLVMBuildExtractElement(
4112 builder, base_data,
4113 LLVMConstInt(ctx->i32, start + 1, 0), "");
4114 data = LLVMBuildInsertElement(
4115 builder, data, tmp, ctx->i32_1, "");
4116
4117 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
4118 } else {
4119 assert(count == 1);
4120 data = LLVMBuildExtractElement(
4121 builder, base_data,
4122 LLVMConstInt(ctx->i32, start, 0), "");
4123 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
4124 }
4125
4126 offset = base_offset;
4127 if (start != 0) {
4128 offset = LLVMBuildAdd(
4129 builder, offset,
4130 LLVMConstInt(ctx->i32, start * 4, 0), "");
4131 }
4132
4133 emit_data->args[0] = data;
4134 emit_data->args[3] = offset;
4135
4136 lp_build_intrinsic(
4137 builder, intrinsic_name, emit_data->dst_type,
4138 emit_data->args, emit_data->arg_count,
4139 get_store_intr_attribs(writeonly_memory));
4140 }
4141 }
4142
4143 static void store_emit_memory(
4144 struct si_shader_context *ctx,
4145 struct lp_build_emit_data *emit_data)
4146 {
4147 const struct tgsi_full_instruction *inst = emit_data->inst;
4148 struct gallivm_state *gallivm = &ctx->gallivm;
4149 LLVMBuilderRef builder = gallivm->builder;
4150 unsigned writemask = inst->Dst[0].Register.WriteMask;
4151 LLVMValueRef ptr, derived_ptr, data, index;
4152 int chan;
4153
4154 ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
4155
4156 for (chan = 0; chan < 4; ++chan) {
4157 if (!(writemask & (1 << chan))) {
4158 continue;
4159 }
4160 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
4161 index = LLVMConstInt(ctx->i32, chan, 0);
4162 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
4163 LLVMBuildStore(builder, data, derived_ptr);
4164 }
4165 }
4166
4167 static void store_emit(
4168 const struct lp_build_tgsi_action *action,
4169 struct lp_build_tgsi_context *bld_base,
4170 struct lp_build_emit_data *emit_data)
4171 {
4172 struct si_shader_context *ctx = si_shader_context(bld_base);
4173 struct gallivm_state *gallivm = &ctx->gallivm;
4174 LLVMBuilderRef builder = gallivm->builder;
4175 const struct tgsi_full_instruction * inst = emit_data->inst;
4176 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
4177 unsigned target = inst->Memory.Texture;
4178 char intrinsic_name[64];
4179 bool writeonly_memory = false;
4180
4181 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
4182 store_emit_memory(ctx, emit_data);
4183 return;
4184 }
4185
4186 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
4187 emit_waitcnt(ctx, VM_CNT);
4188
4189 writeonly_memory = is_oneway_access_only(inst, info,
4190 info->shader_buffers_load |
4191 info->shader_buffers_atomic,
4192 info->images_load |
4193 info->images_atomic);
4194
4195 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4196 store_emit_buffer(ctx, emit_data, writeonly_memory);
4197 return;
4198 }
4199
4200 if (target == TGSI_TEXTURE_BUFFER) {
4201 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4202 builder, "llvm.amdgcn.buffer.store.format.v4f32",
4203 emit_data->dst_type, emit_data->args,
4204 emit_data->arg_count,
4205 get_store_intr_attribs(writeonly_memory));
4206 } else {
4207 ac_get_image_intr_name("llvm.amdgcn.image.store",
4208 LLVMTypeOf(emit_data->args[0]), /* vdata */
4209 LLVMTypeOf(emit_data->args[1]), /* coords */
4210 LLVMTypeOf(emit_data->args[2]), /* rsrc */
4211 intrinsic_name, sizeof(intrinsic_name));
4212
4213 emit_data->output[emit_data->chan] =
4214 lp_build_intrinsic(
4215 builder, intrinsic_name, emit_data->dst_type,
4216 emit_data->args, emit_data->arg_count,
4217 get_store_intr_attribs(writeonly_memory));
4218 }
4219 }
4220
4221 static void atomic_fetch_args(
4222 struct lp_build_tgsi_context * bld_base,
4223 struct lp_build_emit_data * emit_data)
4224 {
4225 struct si_shader_context *ctx = si_shader_context(bld_base);
4226 struct gallivm_state *gallivm = &ctx->gallivm;
4227 LLVMBuilderRef builder = gallivm->builder;
4228 const struct tgsi_full_instruction * inst = emit_data->inst;
4229 LLVMValueRef data1, data2;
4230 LLVMValueRef rsrc;
4231 LLVMValueRef tmp;
4232
4233 emit_data->dst_type = ctx->f32;
4234
4235 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4236 data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4237
4238 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4239 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4240 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4241 }
4242
4243 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4244	 * of arguments, which is reversed relative to TGSI (and GLSL).
4245 */
4246 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4247 emit_data->args[emit_data->arg_count++] = data2;
4248 emit_data->args[emit_data->arg_count++] = data1;
4249
4250 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4251 LLVMValueRef offset;
4252
4253 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4254
4255 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4256 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4257
4258 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4259 offset, true, false);
4260 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4261 unsigned target = inst->Memory.Texture;
4262 LLVMValueRef coords;
4263
4264 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4265 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4266
4267 if (target == TGSI_TEXTURE_BUFFER) {
4268 buffer_append_args(ctx, emit_data, rsrc, coords,
4269 ctx->i32_0, true, false);
4270 } else {
4271 emit_data->args[emit_data->arg_count++] = coords;
4272 emit_data->args[emit_data->arg_count++] = rsrc;
4273
4274 image_append_args(ctx, emit_data, target, true, false);
4275 }
4276 }
4277 }
4278
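/* Emit an atomic operation on the TGSI MEMORY file (LDS) by mapping the
 * TGSI opcode onto LLVM's native cmpxchg/atomicrmw instructions.
 */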
4279 static void atomic_emit_memory(struct si_shader_context *ctx,
4280 struct lp_build_emit_data *emit_data) {
4281 struct gallivm_state *gallivm = &ctx->gallivm;
4282 LLVMBuilderRef builder = gallivm->builder;
4283 const struct tgsi_full_instruction * inst = emit_data->inst;
4284 LLVMValueRef ptr, result, arg;
4285
4286 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4287
4288 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4289 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4290
4291 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4292 LLVMValueRef new_data;
4293 new_data = lp_build_emit_fetch(&ctx->bld_base,
4294 inst, 3, 0);
4295
4296 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4297
4298 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4299 LLVMAtomicOrderingSequentiallyConsistent,
4300 LLVMAtomicOrderingSequentiallyConsistent,
4301 false);
4302
4303 result = LLVMBuildExtractValue(builder, result, 0, "");
4304 } else {
4305 LLVMAtomicRMWBinOp op;
4306
4307	switch (inst->Instruction.Opcode) {
4308 case TGSI_OPCODE_ATOMUADD:
4309 op = LLVMAtomicRMWBinOpAdd;
4310 break;
4311 case TGSI_OPCODE_ATOMXCHG:
4312 op = LLVMAtomicRMWBinOpXchg;
4313 break;
4314 case TGSI_OPCODE_ATOMAND:
4315 op = LLVMAtomicRMWBinOpAnd;
4316 break;
4317 case TGSI_OPCODE_ATOMOR:
4318 op = LLVMAtomicRMWBinOpOr;
4319 break;
4320 case TGSI_OPCODE_ATOMXOR:
4321 op = LLVMAtomicRMWBinOpXor;
4322 break;
4323 case TGSI_OPCODE_ATOMUMIN:
4324 op = LLVMAtomicRMWBinOpUMin;
4325 break;
4326 case TGSI_OPCODE_ATOMUMAX:
4327 op = LLVMAtomicRMWBinOpUMax;
4328 break;
4329 case TGSI_OPCODE_ATOMIMIN:
4330 op = LLVMAtomicRMWBinOpMin;
4331 break;
4332 case TGSI_OPCODE_ATOMIMAX:
4333 op = LLVMAtomicRMWBinOpMax;
4334 break;
4335 default:
4336 unreachable("unknown atomic opcode");
4337 }
4338
4339 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4340 LLVMAtomicOrderingSequentiallyConsistent,
4341 false);
4342 }
4343 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4344 }
4345
4346 static void atomic_emit(
4347 const struct lp_build_tgsi_action *action,
4348 struct lp_build_tgsi_context *bld_base,
4349 struct lp_build_emit_data *emit_data)
4350 {
4351 struct si_shader_context *ctx = si_shader_context(bld_base);
4352 struct gallivm_state *gallivm = &ctx->gallivm;
4353 LLVMBuilderRef builder = gallivm->builder;
4354 const struct tgsi_full_instruction * inst = emit_data->inst;
4355 char intrinsic_name[40];
4356 LLVMValueRef tmp;
4357
4358 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4359 atomic_emit_memory(ctx, emit_data);
4360 return;
4361 }
4362
4363 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4364 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4365 snprintf(intrinsic_name, sizeof(intrinsic_name),
4366 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4367 } else {
4368 LLVMValueRef coords;
4369 char coords_type[8];
4370
4371 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4372 coords = emit_data->args[2];
4373 else
4374 coords = emit_data->args[1];
4375
4376 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4377 snprintf(intrinsic_name, sizeof(intrinsic_name),
4378 "llvm.amdgcn.image.atomic.%s.%s",
4379 action->intr_name, coords_type);
4380 }
4381
4382 tmp = lp_build_intrinsic(
4383 builder, intrinsic_name, ctx->i32,
4384 emit_data->args, emit_data->arg_count, 0);
4385 emit_data->output[emit_data->chan] =
4386 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4387 }
4388
4389 static void set_tex_fetch_args(struct si_shader_context *ctx,
4390 struct lp_build_emit_data *emit_data,
4391 unsigned target,
4392 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4393 LLVMValueRef *param, unsigned count,
4394 unsigned dmask)
4395 {
4396 struct gallivm_state *gallivm = &ctx->gallivm;
4397 struct ac_image_args args = {};
4398
4399 /* Pad to power of two vector */
4400 while (count < util_next_power_of_two(count))
4401 param[count++] = LLVMGetUndef(ctx->i32);
4402
4403 if (count > 1)
4404 args.addr = lp_build_gather_values(gallivm, param, count);
4405 else
4406 args.addr = param[0];
4407
4408 args.resource = res_ptr;
4409 args.sampler = samp_ptr;
4410 args.dmask = dmask;
4411 args.unorm = target == TGSI_TEXTURE_RECT ||
4412 target == TGSI_TEXTURE_SHADOWRECT;
4413 args.da = tgsi_is_array_sampler(target);
4414
4415 /* Ugly, but we seem to have no other choice right now. */
4416 STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4417 memcpy(emit_data->args, &args, sizeof(args));
4418 }
4419
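/* Fix up a raw resinfo result: move the layer count back into .y for
 * GFX9 1D arrays (which are allocated as 2D), and divide the layer
 * count of cube arrays by 6 to get the number of cubes.
 */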
4420 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4421 unsigned target, LLVMValueRef out)
4422 {
4423 LLVMBuilderRef builder = ctx->gallivm.builder;
4424
4425 /* 1D textures are allocated and used as 2D on GFX9. */
4426 if (ctx->screen->b.chip_class >= GFX9 &&
4427 (target == TGSI_TEXTURE_1D_ARRAY ||
4428 target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4429 LLVMValueRef layers =
4430 LLVMBuildExtractElement(builder, out,
4431 LLVMConstInt(ctx->i32, 2, 0), "");
4432 out = LLVMBuildInsertElement(builder, out, layers,
4433 ctx->i32_1, "");
4434 }
4435
4436 /* Divide the number of layers by 6 to get the number of cubes. */
4437 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4438 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4439 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4440
4441 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4442 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4443
4444 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4445 }
4446 return out;
4447 }
4448
4449 static void resq_fetch_args(
4450 struct lp_build_tgsi_context * bld_base,
4451 struct lp_build_emit_data * emit_data)
4452 {
4453 struct si_shader_context *ctx = si_shader_context(bld_base);
4454 const struct tgsi_full_instruction *inst = emit_data->inst;
4455 const struct tgsi_full_src_register *reg = &inst->Src[0];
4456
4457 emit_data->dst_type = ctx->v4i32;
4458
4459 if (reg->Register.File == TGSI_FILE_BUFFER) {
4460 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4461 emit_data->arg_count = 1;
4462 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4463 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4464 &emit_data->args[0]);
4465 emit_data->arg_count = 1;
4466 } else {
4467 LLVMValueRef res_ptr;
4468 unsigned image_target;
4469
4470 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4471 image_target = TGSI_TEXTURE_2D_ARRAY;
4472 else
4473 image_target = inst->Memory.Texture;
4474
4475 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4476 &res_ptr);
4477 set_tex_fetch_args(ctx, emit_data, image_target,
4478 res_ptr, NULL, &ctx->i32_0, 1,
4479 0xf);
4480 }
4481 }
4482
4483 static void resq_emit(
4484 const struct lp_build_tgsi_action *action,
4485 struct lp_build_tgsi_context *bld_base,
4486 struct lp_build_emit_data *emit_data)
4487 {
4488 struct si_shader_context *ctx = si_shader_context(bld_base);
4489 struct gallivm_state *gallivm = &ctx->gallivm;
4490 LLVMBuilderRef builder = gallivm->builder;
4491 const struct tgsi_full_instruction *inst = emit_data->inst;
4492 LLVMValueRef out;
4493
4494 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4495 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4496 LLVMConstInt(ctx->i32, 2, 0), "");
4497 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4498 out = get_buffer_size(bld_base, emit_data->args[0]);
4499 } else {
4500 struct ac_image_args args;
4501
4502 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4503 args.opcode = ac_image_get_resinfo;
4504 out = ac_build_image_opcode(&ctx->ac, &args);
4505
4506 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4507 }
4508
4509 emit_data->output[emit_data->chan] = out;
4510 }
4511
4512 static const struct lp_build_tgsi_action tex_action;
4513
4514 enum desc_type {
4515 DESC_IMAGE,
4516 DESC_BUFFER,
4517 DESC_FMASK,
4518 DESC_SAMPLER,
4519 };
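/* Each combined image/sampler slot in the descriptor list is 16 dwords:
 * the image view in dwords [0:7] (a buffer view uses only [4:7]), the
 * FMASK view in [8:15] and the sampler state in [12:15]. The index
 * arithmetic below addresses the list as v8i32 or v4i32 elements
 * accordingly.
 */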
4520
4521 /**
4522 * Load an image view, fmask view, or sampler state descriptor.
4523 */
4524 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4525 LLVMValueRef list, LLVMValueRef index,
4526 enum desc_type type)
4527 {
4528 struct gallivm_state *gallivm = &ctx->gallivm;
4529 LLVMBuilderRef builder = gallivm->builder;
4530
4531 switch (type) {
4532 case DESC_IMAGE:
4533 /* The image is at [0:7]. */
4534 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4535 break;
4536 case DESC_BUFFER:
4537 /* The buffer is in [4:7]. */
4538 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4539 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4540 list = LLVMBuildPointerCast(builder, list,
4541 const_array(ctx->v4i32, 0), "");
4542 break;
4543 case DESC_FMASK:
4544 /* The FMASK is at [8:15]. */
4545 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4546 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4547 break;
4548 case DESC_SAMPLER:
4549 /* The sampler state is at [12:15]. */
4550 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4551 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4552 list = LLVMBuildPointerCast(builder, list,
4553 const_array(ctx->v4i32, 0), "");
4554 break;
4555 }
4556
4557 return ac_build_indexed_load_const(&ctx->ac, list, index);
4558 }
4559
4560 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4561 *
4562 * SI-CI:
4563 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4564 * filtering manually. The driver sets img7 to a mask clearing
4565 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4566 * s_and_b32 samp0, samp0, img7
4567 *
4568 * VI:
4569 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4570 */
4571 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4572 LLVMValueRef res, LLVMValueRef samp)
4573 {
4574 LLVMBuilderRef builder = ctx->gallivm.builder;
4575 LLVMValueRef img7, samp0;
4576
4577 if (ctx->screen->b.chip_class >= VI)
4578 return samp;
4579
4580 img7 = LLVMBuildExtractElement(builder, res,
4581 LLVMConstInt(ctx->i32, 7, 0), "");
4582 samp0 = LLVMBuildExtractElement(builder, samp,
4583 ctx->i32_0, "");
4584 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4585 return LLVMBuildInsertElement(builder, samp, samp0,
4586 ctx->i32_0, "");
4587 }
4588
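/* Load the resource, sampler state and (for MSAA targets) FMASK
 * descriptors for the instruction's sampler source register, applying
 * the SI-CI anisotropic filtering fixup to the sampler state.
 */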
4589 static void tex_fetch_ptrs(
4590 struct lp_build_tgsi_context *bld_base,
4591 struct lp_build_emit_data *emit_data,
4592 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4593 {
4594 struct si_shader_context *ctx = si_shader_context(bld_base);
4595 LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4596 const struct tgsi_full_instruction *inst = emit_data->inst;
4597 const struct tgsi_full_src_register *reg;
4598 unsigned target = inst->Texture.Texture;
4599 unsigned sampler_src;
4600 LLVMValueRef index;
4601
4602 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4603 reg = &emit_data->inst->Src[sampler_src];
4604
4605 if (reg->Register.Indirect) {
4606 index = get_bounded_indirect_index(ctx,
4607 &reg->Indirect,
4608 reg->Register.Index,
4609 SI_NUM_SAMPLERS);
4610 } else {
4611 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4612 }
4613
4614 if (target == TGSI_TEXTURE_BUFFER)
4615 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4616 else
4617 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4618
4619 if (samp_ptr)
4620 *samp_ptr = NULL;
4621 if (fmask_ptr)
4622 *fmask_ptr = NULL;
4623
4624 if (target == TGSI_TEXTURE_2D_MSAA ||
4625 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4626 if (fmask_ptr)
4627 *fmask_ptr = load_sampler_desc(ctx, list, index,
4628 DESC_FMASK);
4629 } else if (target != TGSI_TEXTURE_BUFFER) {
4630 if (samp_ptr) {
4631 *samp_ptr = load_sampler_desc(ctx, list, index,
4632 DESC_SAMPLER);
4633 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4634 }
4635 }
4636 }
4637
4638 static void txq_fetch_args(
4639 struct lp_build_tgsi_context *bld_base,
4640 struct lp_build_emit_data *emit_data)
4641 {
4642 struct si_shader_context *ctx = si_shader_context(bld_base);
4643 const struct tgsi_full_instruction *inst = emit_data->inst;
4644 unsigned target = inst->Texture.Texture;
4645 LLVMValueRef res_ptr;
4646 LLVMValueRef address;
4647
4648 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4649
4650 if (target == TGSI_TEXTURE_BUFFER) {
4651 /* Read the size from the buffer descriptor directly. */
4652 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4653 return;
4654 }
4655
4656 /* Textures - set the mip level. */
4657 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4658
4659 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4660 NULL, &address, 1, 0xf);
4661 }
4662
4663 static void txq_emit(const struct lp_build_tgsi_action *action,
4664 struct lp_build_tgsi_context *bld_base,
4665 struct lp_build_emit_data *emit_data)
4666 {
4667 struct si_shader_context *ctx = si_shader_context(bld_base);
4668 struct ac_image_args args;
4669 unsigned target = emit_data->inst->Texture.Texture;
4670
4671 if (target == TGSI_TEXTURE_BUFFER) {
4672 /* Just return the buffer size. */
4673 emit_data->output[emit_data->chan] = emit_data->args[0];
4674 return;
4675 }
4676
4677 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4678
4679 args.opcode = ac_image_get_resinfo;
4680 LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4681
4682 emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4683 }
4684
4685 static void tex_fetch_args(
4686 struct lp_build_tgsi_context *bld_base,
4687 struct lp_build_emit_data *emit_data)
4688 {
4689 struct si_shader_context *ctx = si_shader_context(bld_base);
4690 struct gallivm_state *gallivm = &ctx->gallivm;
4691 const struct tgsi_full_instruction *inst = emit_data->inst;
4692 unsigned opcode = inst->Instruction.Opcode;
4693 unsigned target = inst->Texture.Texture;
4694 LLVMValueRef coords[5], derivs[6];
4695 LLVMValueRef address[16];
4696 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4697 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4698 unsigned count = 0;
4699 unsigned chan;
4700 unsigned num_deriv_channels = 0;
4701 bool has_offset = inst->Texture.NumOffsets > 0;
4702 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4703 unsigned dmask = 0xf;
4704
4705 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4706
4707 if (target == TGSI_TEXTURE_BUFFER) {
4708 emit_data->dst_type = ctx->v4f32;
4709 emit_data->args[0] = res_ptr;
4710 emit_data->args[1] = ctx->i32_0;
4711 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4712 emit_data->arg_count = 3;
4713 return;
4714 }
4715
4716 /* Fetch and project texture coordinates */
4717 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4718 for (chan = 0; chan < 3; chan++ ) {
4719 coords[chan] = lp_build_emit_fetch(bld_base,
4720 emit_data->inst, 0,
4721 chan);
4722 if (opcode == TGSI_OPCODE_TXP)
4723 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4724 TGSI_OPCODE_DIV,
4725 coords[chan],
4726 coords[3]);
4727 }
4728
4729 if (opcode == TGSI_OPCODE_TXP)
4730 coords[3] = bld_base->base.one;
4731
4732 /* Pack offsets. */
4733 if (has_offset &&
4734 opcode != TGSI_OPCODE_TXF &&
4735 opcode != TGSI_OPCODE_TXF_LZ) {
4736 /* The offsets are six-bit signed integers packed like this:
4737 * X=[5:0], Y=[13:8], and Z=[21:16].
4738 */
4739 LLVMValueRef offset[3], pack;
4740
4741 assert(inst->Texture.NumOffsets == 1);
4742
4743 for (chan = 0; chan < 3; chan++) {
4744 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4745 emit_data->inst, 0, chan);
4746 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4747 LLVMConstInt(ctx->i32, 0x3f, 0), "");
4748 if (chan)
4749 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4750 LLVMConstInt(ctx->i32, chan*8, 0), "");
4751 }
4752
4753 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4754 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4755 address[count++] = pack;
4756 }
4757
4758 /* Pack LOD bias value */
4759 if (opcode == TGSI_OPCODE_TXB)
4760 address[count++] = coords[3];
4761 if (opcode == TGSI_OPCODE_TXB2)
4762 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4763
4764 /* Pack depth comparison value */
4765 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4766 LLVMValueRef z;
4767
4768 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4769 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4770 } else {
4771 assert(ref_pos >= 0);
4772 z = coords[ref_pos];
4773 }
4774
4775 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4776 * so the depth comparison value isn't clamped for Z16 and
4777 * Z24 anymore. Do it manually here.
4778 *
4779 * It's unnecessary if the original texture format was
4780 * Z32_FLOAT, but we don't know that here.
4781 */
4782 if (ctx->screen->b.chip_class == VI)
4783 z = ac_build_clamp(&ctx->ac, z);
4784
4785 address[count++] = z;
4786 }
4787
4788 /* Pack user derivatives */
4789 if (opcode == TGSI_OPCODE_TXD) {
4790 int param, num_src_deriv_channels, num_dst_deriv_channels;
4791
4792 switch (target) {
4793 case TGSI_TEXTURE_3D:
4794 num_src_deriv_channels = 3;
4795 num_dst_deriv_channels = 3;
4796 num_deriv_channels = 3;
4797 break;
4798 case TGSI_TEXTURE_2D:
4799 case TGSI_TEXTURE_SHADOW2D:
4800 case TGSI_TEXTURE_RECT:
4801 case TGSI_TEXTURE_SHADOWRECT:
4802 case TGSI_TEXTURE_2D_ARRAY:
4803 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4804 num_src_deriv_channels = 2;
4805 num_dst_deriv_channels = 2;
4806 num_deriv_channels = 2;
4807 break;
4808 case TGSI_TEXTURE_CUBE:
4809 case TGSI_TEXTURE_SHADOWCUBE:
4810 case TGSI_TEXTURE_CUBE_ARRAY:
4811 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4812 /* Cube derivatives will be converted to 2D. */
4813 num_src_deriv_channels = 3;
4814 num_dst_deriv_channels = 3;
4815 num_deriv_channels = 2;
4816 break;
4817 case TGSI_TEXTURE_1D:
4818 case TGSI_TEXTURE_SHADOW1D:
4819 case TGSI_TEXTURE_1D_ARRAY:
4820 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4821 num_src_deriv_channels = 1;
4822
4823 /* 1D textures are allocated and used as 2D on GFX9. */
4824 if (ctx->screen->b.chip_class >= GFX9) {
4825 num_dst_deriv_channels = 2;
4826 num_deriv_channels = 2;
4827 } else {
4828 num_dst_deriv_channels = 1;
4829 num_deriv_channels = 1;
4830 }
4831 break;
4832 default:
4833 unreachable("invalid target");
4834 }
4835
4836 for (param = 0; param < 2; param++) {
4837 for (chan = 0; chan < num_src_deriv_channels; chan++)
4838 derivs[param * num_dst_deriv_channels + chan] =
4839 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4840
4841 /* Fill in the rest with zeros. */
4842 for (chan = num_src_deriv_channels;
4843 chan < num_dst_deriv_channels; chan++)
4844 derivs[param * num_dst_deriv_channels + chan] =
4845 bld_base->base.zero;
4846 }
4847 }
4848
4849 if (target == TGSI_TEXTURE_CUBE ||
4850 target == TGSI_TEXTURE_CUBE_ARRAY ||
4851 target == TGSI_TEXTURE_SHADOWCUBE ||
4852 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4853 ac_prepare_cube_coords(&ctx->ac,
4854 opcode == TGSI_OPCODE_TXD,
4855 target == TGSI_TEXTURE_CUBE_ARRAY ||
4856 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4857 coords, derivs);
4858
4859 if (opcode == TGSI_OPCODE_TXD)
4860 for (int i = 0; i < num_deriv_channels * 2; i++)
4861 address[count++] = derivs[i];
4862
4863 /* Pack texture coordinates */
4864 address[count++] = coords[0];
4865 if (num_coords > 1)
4866 address[count++] = coords[1];
4867 if (num_coords > 2)
4868 address[count++] = coords[2];
4869
4870 /* 1D textures are allocated and used as 2D on GFX9. */
4871 if (ctx->screen->b.chip_class >= GFX9) {
4872 LLVMValueRef filler;
4873
4874 /* Use 0.5, so that we don't sample the border color. */
4875 if (opcode == TGSI_OPCODE_TXF)
4876 filler = ctx->i32_0;
4877 else
4878 filler = LLVMConstReal(ctx->f32, 0.5);
4879
4880 if (target == TGSI_TEXTURE_1D ||
4881 target == TGSI_TEXTURE_SHADOW1D) {
4882 address[count++] = filler;
4883 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4884 target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4885 address[count] = address[count - 1];
4886 address[count - 1] = filler;
4887 count++;
4888 }
4889 }
4890
4891 /* Pack LOD or sample index */
4892 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4893 address[count++] = coords[3];
4894 else if (opcode == TGSI_OPCODE_TXL2)
4895 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4896
4897 if (count > 16) {
4898 assert(!"Cannot handle more than 16 texture address parameters");
4899 count = 16;
4900 }
4901
4902 for (chan = 0; chan < count; chan++ ) {
4903 address[chan] = LLVMBuildBitCast(gallivm->builder,
4904 address[chan], ctx->i32, "");
4905 }
4906
4907 /* Adjust the sample index according to FMASK.
4908 *
4909 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4910 * which is the identity mapping. Each nibble says which physical sample
4911 * should be fetched to get that sample.
4912 *
4913 * For example, 0x11111100 means there are only 2 samples stored and
4914 * the second sample covers 3/4 of the pixel. When reading samples 0
4915 * and 1, return physical sample 0 (determined by the first two 0s
4916 * in FMASK), otherwise return physical sample 1.
4917 *
4918 * The sample index should be adjusted as follows:
4919 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4920 */
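/* With the 0x11111100 example above, reading sample 1 yields
 * (0x11111100 >> 4) & 0xF = 0, i.e. physical sample 0.
 */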
4921 if (target == TGSI_TEXTURE_2D_MSAA ||
4922 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4923 struct lp_build_emit_data txf_emit_data = *emit_data;
4924 LLVMValueRef txf_address[4];
4925 /* We only need .xy for non-arrays, and .xyz for arrays. */
4926 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4927 struct tgsi_full_instruction inst = {};
4928
4929 memcpy(txf_address, address, sizeof(txf_address));
4930
4931 /* Read FMASK using TXF_LZ. */
4932 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4933 inst.Texture.Texture = target;
4934 txf_emit_data.inst = &inst;
4935 txf_emit_data.chan = 0;
4936 set_tex_fetch_args(ctx, &txf_emit_data,
4937 target, fmask_ptr, NULL,
4938 txf_address, txf_count, 0xf);
4939 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4940
4941 /* Initialize some constants. */
4942 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4943 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4944
4945 /* Apply the formula. */
4946 LLVMValueRef fmask =
4947 LLVMBuildExtractElement(gallivm->builder,
4948 txf_emit_data.output[0],
4949 ctx->i32_0, "");
4950
4951 unsigned sample_chan = txf_count; /* the sample index is last */
4952
4953 LLVMValueRef sample_index4 =
4954 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4955
4956 LLVMValueRef shifted_fmask =
4957 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4958
4959 LLVMValueRef final_sample =
4960 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4961
4962 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4963		 * resource descriptor is 0 (invalid).
4964 */
4965 LLVMValueRef fmask_desc =
4966 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4967 ctx->v8i32, "");
4968
4969 LLVMValueRef fmask_word1 =
4970 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4971 ctx->i32_1, "");
4972
4973 LLVMValueRef word1_is_nonzero =
4974 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4975 fmask_word1, ctx->i32_0, "");
4976
4977 /* Replace the MSAA sample index. */
4978 address[sample_chan] =
4979 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4980 final_sample, address[sample_chan], "");
4981 }
4982
4983 if (opcode == TGSI_OPCODE_TXF ||
4984 opcode == TGSI_OPCODE_TXF_LZ) {
4985 /* add tex offsets */
4986 if (inst->Texture.NumOffsets) {
4987 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4988 const struct tgsi_texture_offset *off = inst->TexOffsets;
4989
4990 assert(inst->Texture.NumOffsets == 1);
4991
4992 switch (target) {
4993 case TGSI_TEXTURE_3D:
4994 address[2] = lp_build_add(uint_bld, address[2],
4995 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4996 /* fall through */
4997 case TGSI_TEXTURE_2D:
4998 case TGSI_TEXTURE_SHADOW2D:
4999 case TGSI_TEXTURE_RECT:
5000 case TGSI_TEXTURE_SHADOWRECT:
5001 case TGSI_TEXTURE_2D_ARRAY:
5002 case TGSI_TEXTURE_SHADOW2D_ARRAY:
5003 address[1] =
5004 lp_build_add(uint_bld, address[1],
5005 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
5006 /* fall through */
5007 case TGSI_TEXTURE_1D:
5008 case TGSI_TEXTURE_SHADOW1D:
5009 case TGSI_TEXTURE_1D_ARRAY:
5010 case TGSI_TEXTURE_SHADOW1D_ARRAY:
5011 address[0] =
5012 lp_build_add(uint_bld, address[0],
5013 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
5014 break;
5015 /* texture offsets do not apply to other texture targets */
5016 }
5017 }
5018 }
5019
5020 if (opcode == TGSI_OPCODE_TG4) {
5021 unsigned gather_comp = 0;
5022
5023 /* DMASK was repurposed for GATHER4. 4 components are always
5024 * returned and DMASK works like a swizzle - it selects
5025 * the component to fetch. The only valid DMASK values are
5026 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
5027 * (red,red,red,red) etc.) The ISA document doesn't mention
5028 * this.
5029 */
5030
5031 /* Get the component index from src1.x for Gather4. */
5032 if (!tgsi_is_shadow_target(target)) {
5033 LLVMValueRef comp_imm;
5034 struct tgsi_src_register src1 = inst->Src[1].Register;
5035
5036 assert(src1.File == TGSI_FILE_IMMEDIATE);
5037
5038 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
5039 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
5040 gather_comp = CLAMP(gather_comp, 0, 3);
5041 }
5042
5043 dmask = 1 << gather_comp;
5044 }
5045
5046 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
5047 samp_ptr, address, count, dmask);
5048 }
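
/* Editor's aside: a scalar sketch of the FMASK unswizzle and the TG4
 * DMASK selection implemented in LLVM IR above. Illustrative only, not
 * driver code; the helper names are invented for this sketch.
 */
static inline unsigned fmask_resolve_sample(uint32_t fmask_word,
                                            unsigned sample_index,
                                            uint32_t fmask_desc_word1)
{
        /* Each sample owns a 4-bit slot in the FMASK value. */
        unsigned final_sample = (fmask_word >> (sample_index * 4)) & 0xF;

        /* Keep the original index if the FMASK descriptor is invalid
         * (WORD1 == 0), matching the select emitted above. */
        return fmask_desc_word1 ? final_sample : sample_index;
}

static inline unsigned gather4_dmask(unsigned component)
{
        /* Exactly one bit set: 1=red, 2=green, 4=blue, 8=alpha. */
        return 1u << (component & 3);
}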
5049
5050 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
5051 * incorrectly forces nearest filtering if the texture format is integer.
5052 * The only effect it has on Gather4, which always returns 4 texels for
5053 * bilinear filtering, is that the final coordinates are off by 0.5 of
5054 * the texel size.
5055 *
5056 * The workaround is to subtract 0.5 from the unnormalized coordinates,
5057 * or (0.5 / size) from the normalized coordinates.
5058 */
5059 static void si_lower_gather4_integer(struct si_shader_context *ctx,
5060 struct ac_image_args *args,
5061 unsigned target)
5062 {
5063 LLVMBuilderRef builder = ctx->gallivm.builder;
5064 LLVMValueRef coord = args->addr;
5065 LLVMValueRef half_texel[2];
5066 /* Texture coordinates start after:
5067 * {offset, bias, z-compare, derivatives}
5068 * Only the offset and z-compare can occur here.
5069 */
5070 unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
5071 int c;
5072
5073 if (target == TGSI_TEXTURE_RECT ||
5074 target == TGSI_TEXTURE_SHADOWRECT) {
5075 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
5076 } else {
5077 struct tgsi_full_instruction txq_inst = {};
5078 struct lp_build_emit_data txq_emit_data = {};
5079
5080 /* Query the texture size. */
5081 txq_inst.Texture.Texture = target;
5082 txq_emit_data.inst = &txq_inst;
5083 txq_emit_data.dst_type = ctx->v4i32;
5084 set_tex_fetch_args(ctx, &txq_emit_data, target,
5085 args->resource, NULL, &ctx->i32_0,
5086 1, 0xf);
5087 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
5088
5089 /* Compute -0.5 / size. */
5090 for (c = 0; c < 2; c++) {
5091 half_texel[c] =
5092 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
5093 LLVMConstInt(ctx->i32, c, 0), "");
5094 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
5095 half_texel[c] =
5096 lp_build_emit_llvm_unary(&ctx->bld_base,
5097 TGSI_OPCODE_RCP, half_texel[c]);
5098 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
5099 LLVMConstReal(ctx->f32, -0.5), "");
5100 }
5101 }
5102
5103 for (c = 0; c < 2; c++) {
5104 LLVMValueRef tmp;
5105 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
5106
5107 tmp = LLVMBuildExtractElement(builder, coord, index, "");
5108 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
5109 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
5110 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
5111 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
5112 }
5113
5114 args->addr = coord;
5115 }
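
/* Editor's aside: the scalar math behind the workaround above,
 * illustrative only. RECT targets use unnormalized coordinates, so the
 * bias is a constant -0.5 texel; otherwise it is -0.5 / size per axis.
 */
static inline float gather4_int_fixup_coord(float coord, unsigned size,
                                            bool unnormalized)
{
        return coord + (unnormalized ? -0.5f : -0.5f / (float)size);
}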
5116
5117 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
5118 struct lp_build_tgsi_context *bld_base,
5119 struct lp_build_emit_data *emit_data)
5120 {
5121 struct si_shader_context *ctx = si_shader_context(bld_base);
5122 const struct tgsi_full_instruction *inst = emit_data->inst;
5123 struct ac_image_args args;
5124 unsigned opcode = inst->Instruction.Opcode;
5125 unsigned target = inst->Texture.Texture;
5126
5127 if (target == TGSI_TEXTURE_BUFFER) {
5128 emit_data->output[emit_data->chan] =
5129 ac_build_buffer_load_format(&ctx->ac,
5130 emit_data->args[0],
5131 emit_data->args[2],
5132 emit_data->args[1],
5133 true);
5134 return;
5135 }
5136
5137 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
5138
5139 args.opcode = ac_image_sample;
5140 args.compare = tgsi_is_shadow_target(target);
5141 args.offset = inst->Texture.NumOffsets > 0;
5142
5143 switch (opcode) {
5144 case TGSI_OPCODE_TXF:
5145 case TGSI_OPCODE_TXF_LZ:
5146 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
5147 target == TGSI_TEXTURE_2D_MSAA ||
5148 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
5149 ac_image_load : ac_image_load_mip;
5150 args.compare = false;
5151 args.offset = false;
5152 break;
5153 case TGSI_OPCODE_LODQ:
5154 args.opcode = ac_image_get_lod;
5155 args.compare = false;
5156 args.offset = false;
5157 break;
5158 case TGSI_OPCODE_TEX:
5159 case TGSI_OPCODE_TEX2:
5160 case TGSI_OPCODE_TXP:
5161 if (ctx->type != PIPE_SHADER_FRAGMENT)
5162 args.level_zero = true;
5163 break;
5164 case TGSI_OPCODE_TEX_LZ:
5165 args.level_zero = true;
5166 break;
5167 case TGSI_OPCODE_TXB:
5168 case TGSI_OPCODE_TXB2:
5169 assert(ctx->type == PIPE_SHADER_FRAGMENT);
5170 args.bias = true;
5171 break;
5172 case TGSI_OPCODE_TXL:
5173 case TGSI_OPCODE_TXL2:
5174 args.lod = true;
5175 break;
5176 case TGSI_OPCODE_TXD:
5177 args.deriv = true;
5178 break;
5179 case TGSI_OPCODE_TG4:
5180 args.opcode = ac_image_gather4;
5181 args.level_zero = true;
5182 break;
5183 default:
5184 assert(0);
5185 return;
5186 }
5187
5188 /* The hardware needs special lowering for Gather4 with integer formats. */
5189 if (ctx->screen->b.chip_class <= VI &&
5190 opcode == TGSI_OPCODE_TG4) {
5191 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5192 /* This will also work with non-constant indexing because of how
5193 * glsl_to_tgsi works, and we intend to preserve that behavior.
5194 */
5195 const unsigned src_idx = 2;
5196 unsigned sampler = inst->Src[src_idx].Register.Index;
5197
5198 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
5199
5200 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
5201 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
5202 si_lower_gather4_integer(ctx, &args, target);
5203 }
5204
5205 emit_data->output[emit_data->chan] =
5206 ac_build_image_opcode(&ctx->ac, &args);
5207 }
5208
5209 static void si_llvm_emit_txqs(
5210 const struct lp_build_tgsi_action *action,
5211 struct lp_build_tgsi_context *bld_base,
5212 struct lp_build_emit_data *emit_data)
5213 {
5214 struct si_shader_context *ctx = si_shader_context(bld_base);
5215 struct gallivm_state *gallivm = &ctx->gallivm;
5216 LLVMBuilderRef builder = gallivm->builder;
5217 LLVMValueRef res, samples;
5218 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5219
5220 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5221
5223 /* Read the samples from the descriptor directly. */
5224 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5225 samples = LLVMBuildExtractElement(
5226 builder, res,
5227 LLVMConstInt(ctx->i32, 3, 0), "");
5228 samples = LLVMBuildLShr(builder, samples,
5229 LLVMConstInt(ctx->i32, 16, 0), "");
5230 samples = LLVMBuildAnd(builder, samples,
5231 LLVMConstInt(ctx->i32, 0xf, 0), "");
5232 samples = LLVMBuildShl(builder, ctx->i32_1,
5233 samples, "");
5234
5235 emit_data->output[emit_data->chan] = samples;
5236 }
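
/* Editor's aside: scalar form of the TXQS lowering above (illustrative
 * only). Dword 3 of an MSAA image descriptor stores log2(samples) in
 * bits 19:16, so the sample count falls out of a shift.
 */
static inline unsigned txqs_sample_count(const uint32_t desc[8])
{
        unsigned log2_samples = (desc[3] >> 16) & 0xf;
        return 1u << log2_samples;
}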
5237
5238 static void si_llvm_emit_ddxy(
5239 const struct lp_build_tgsi_action *action,
5240 struct lp_build_tgsi_context *bld_base,
5241 struct lp_build_emit_data *emit_data)
5242 {
5243 struct si_shader_context *ctx = si_shader_context(bld_base);
5244 struct gallivm_state *gallivm = &ctx->gallivm;
5245 unsigned opcode = emit_data->info->opcode;
5246 LLVMValueRef val;
5247 int idx;
5248 unsigned mask;
5249
5250 if (opcode == TGSI_OPCODE_DDX_FINE)
5251 mask = AC_TID_MASK_LEFT;
5252 else if (opcode == TGSI_OPCODE_DDY_FINE)
5253 mask = AC_TID_MASK_TOP;
5254 else
5255 mask = AC_TID_MASK_TOP_LEFT;
5256
5257 /* For DDX we want the next X pixel, for DDY the next Y pixel. */
5258 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5259
5260 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5261 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5262 mask, idx, ctx->lds, val);
5263 emit_data->output[emit_data->chan] = val;
5264 }
5265
5266 /*
5267 * This takes an (I, J) coordinate pair
5268 * and works out the X and Y derivatives.
5269 * It returns DDX(I), DDX(J), DDY(I), DDY(J).
5270 */
5271 static LLVMValueRef si_llvm_emit_ddxy_interp(
5272 struct lp_build_tgsi_context *bld_base,
5273 LLVMValueRef interp_ij)
5274 {
5275 struct si_shader_context *ctx = si_shader_context(bld_base);
5276 struct gallivm_state *gallivm = &ctx->gallivm;
5277 LLVMValueRef result[4], a;
5278 unsigned i;
5279
5280 for (i = 0; i < 2; i++) {
5281 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5282 LLVMConstInt(ctx->i32, i, 0), "");
5283 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5284 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5285 }
5286
5287 return lp_build_gather_values(gallivm, result, 4);
5288 }
5289
5290 static void interp_fetch_args(
5291 struct lp_build_tgsi_context *bld_base,
5292 struct lp_build_emit_data *emit_data)
5293 {
5294 struct si_shader_context *ctx = si_shader_context(bld_base);
5295 struct gallivm_state *gallivm = &ctx->gallivm;
5296 const struct tgsi_full_instruction *inst = emit_data->inst;
5297
5298 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5299 /* offset is in second src, first two channels */
5300 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5301 emit_data->inst, 1,
5302 TGSI_CHAN_X);
5303 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5304 emit_data->inst, 1,
5305 TGSI_CHAN_Y);
5306 emit_data->arg_count = 2;
5307 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5308 LLVMValueRef sample_position;
5309 LLVMValueRef sample_id;
5310 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5311
5312 /* Fetch the sample ID, then fetch its sample position,
5313 * and place it into the first two channels.
5314 */
5315 sample_id = lp_build_emit_fetch(bld_base,
5316 emit_data->inst, 1, TGSI_CHAN_X);
5317 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5318 ctx->i32, "");
5319 sample_position = load_sample_position(ctx, sample_id);
5320
5321 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5322 sample_position,
5323 ctx->i32_0, "");
5324
5325 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5326 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5327 sample_position,
5328 ctx->i32_1, "");
5329 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5330 emit_data->arg_count = 2;
5331 }
5332 }
5333
5334 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5335 struct lp_build_tgsi_context *bld_base,
5336 struct lp_build_emit_data *emit_data)
5337 {
5338 struct si_shader_context *ctx = si_shader_context(bld_base);
5339 struct si_shader *shader = ctx->shader;
5340 struct gallivm_state *gallivm = &ctx->gallivm;
5341 LLVMValueRef interp_param;
5342 const struct tgsi_full_instruction *inst = emit_data->inst;
5343 int input_index = inst->Src[0].Register.Index;
5344 int chan;
5345 int i;
5346 LLVMValueRef attr_number;
5347 LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5348 int interp_param_idx;
5349 unsigned interp = shader->selector->info.input_interpolate[input_index];
5350 unsigned location;
5351
5352 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5353
5354 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5355 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5356 location = TGSI_INTERPOLATE_LOC_CENTER;
5357 else
5358 location = TGSI_INTERPOLATE_LOC_CENTROID;
5359
5360 interp_param_idx = lookup_interp_param_index(interp, location);
5361 if (interp_param_idx == -1)
5362 return;
5363 else if (interp_param_idx)
5364 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5365 else
5366 interp_param = NULL;
5367
5368 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5369
5370 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5371 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5372 LLVMValueRef ij_out[2];
5373 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5374
5375 /*
5376 * Take the I then J parameters, and the DDX/DDY values for them,
5377 * and calculate the IJ inputs for the interpolator:
5378 * temp1 = ddx * offset/sample.x + I;
5379 * interp_param.I = ddy * offset/sample.y + temp1;
5380 * temp1 = ddx * offset/sample.x + J;
5381 * interp_param.J = ddy * offset/sample.y + temp1;
5382 */
5383 for (i = 0; i < 2; i++) {
5384 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5385 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5386 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5387 ddxy_out, ix_ll, "");
5388 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5389 ddxy_out, iy_ll, "");
5390 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5391 interp_param, ix_ll, "");
5392 LLVMValueRef temp1, temp2;
5393
5394 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5395 ctx->f32, "");
5396
5397 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5398
5399 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5400
5401 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5402
5403 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5404 }
5405 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5406 }
5407
5408 for (chan = 0; chan < 4; chan++) {
5409 LLVMValueRef llvm_chan;
5410 unsigned schan;
5411
5412 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5413 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5414
5415 if (interp_param) {
5416 interp_param = LLVMBuildBitCast(gallivm->builder,
5417 interp_param, LLVMVectorType(ctx->f32, 2), "");
5418 LLVMValueRef i = LLVMBuildExtractElement(
5419 gallivm->builder, interp_param, ctx->i32_0, "");
5420 LLVMValueRef j = LLVMBuildExtractElement(
5421 gallivm->builder, interp_param, ctx->i32_1, "");
5422 emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5423 llvm_chan, attr_number, params,
5424 i, j);
5425 } else {
5426 emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5427 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5428 llvm_chan, attr_number, params);
5429 }
5430 }
5431 }
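
/* Editor's aside: scalar form of the barycentric adjustment computed in
 * the loop above (illustrative only). The new I or J value for an
 * offset/sample position is a first-order step from the center value
 * using the screen-space derivatives:
 *   adjusted = center + ddx * offset.x + ddy * offset.y
 */
static inline float interp_adjust(float center, float ddx, float ddy,
                                  float offset_x, float offset_y)
{
        return center + ddx * offset_x + ddy * offset_y;
}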
5432
5433 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5434 LLVMValueRef value)
5435 {
5436 struct gallivm_state *gallivm = &ctx->gallivm;
5437 LLVMValueRef args[3] = {
5438 value,
5439 ctx->i32_0,
5440 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5441 };
5442
5443 /* We currently have no other way to prevent LLVM from lifting the icmp
5444 * calls to a dominating basic block.
5445 */
5446 emit_optimization_barrier(ctx, &args[0]);
5447
5448 if (LLVMTypeOf(args[0]) != ctx->i32)
5449 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5450
5451 return lp_build_intrinsic(gallivm->builder,
5452 "llvm.amdgcn.icmp.i32",
5453 ctx->i64, args, 3,
5454 LP_FUNC_ATTR_NOUNWIND |
5455 LP_FUNC_ATTR_READNONE |
5456 LP_FUNC_ATTR_CONVERGENT);
5457 }
5458
5459 static void vote_all_emit(
5460 const struct lp_build_tgsi_action *action,
5461 struct lp_build_tgsi_context *bld_base,
5462 struct lp_build_emit_data *emit_data)
5463 {
5464 struct si_shader_context *ctx = si_shader_context(bld_base);
5465 struct gallivm_state *gallivm = &ctx->gallivm;
5466 LLVMValueRef active_set, vote_set;
5467 LLVMValueRef tmp;
5468
5469 active_set = si_emit_ballot(ctx, ctx->i32_1);
5470 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5471
5472 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5473 emit_data->output[emit_data->chan] =
5474 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5475 }
5476
5477 static void vote_any_emit(
5478 const struct lp_build_tgsi_action *action,
5479 struct lp_build_tgsi_context *bld_base,
5480 struct lp_build_emit_data *emit_data)
5481 {
5482 struct si_shader_context *ctx = si_shader_context(bld_base);
5483 struct gallivm_state *gallivm = &ctx->gallivm;
5484 LLVMValueRef vote_set;
5485 LLVMValueRef tmp;
5486
5487 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5488
5489 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5490 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5491 emit_data->output[emit_data->chan] =
5492 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5493 }
5494
5495 static void vote_eq_emit(
5496 const struct lp_build_tgsi_action *action,
5497 struct lp_build_tgsi_context *bld_base,
5498 struct lp_build_emit_data *emit_data)
5499 {
5500 struct si_shader_context *ctx = si_shader_context(bld_base);
5501 struct gallivm_state *gallivm = &ctx->gallivm;
5502 LLVMValueRef active_set, vote_set;
5503 LLVMValueRef all, none, tmp;
5504
5505 active_set = si_emit_ballot(ctx, ctx->i32_1);
5506 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5507
5508 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5509 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5510 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5511 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5512 emit_data->output[emit_data->chan] =
5513 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5514 }
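
/* Editor's aside: once the ballots are taken, the three vote opcodes
 * above reduce to simple 64-bit mask tests (illustrative only):
 */
static inline bool vote_all_scalar(uint64_t active_set, uint64_t vote_set)
{
        return vote_set == active_set;  /* every active lane voted true */
}

static inline bool vote_any_scalar(uint64_t vote_set)
{
        return vote_set != 0;           /* at least one lane voted true */
}

static inline bool vote_eq_scalar(uint64_t active_set, uint64_t vote_set)
{
        /* Unanimous either way: all true or all false. */
        return vote_set == active_set || vote_set == 0;
}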
5515
5516 static void ballot_emit(
5517 const struct lp_build_tgsi_action *action,
5518 struct lp_build_tgsi_context *bld_base,
5519 struct lp_build_emit_data *emit_data)
5520 {
5521 struct si_shader_context *ctx = si_shader_context(bld_base);
5522 LLVMBuilderRef builder = ctx->gallivm.builder;
5523 LLVMValueRef tmp;
5524
5525 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5526 tmp = si_emit_ballot(ctx, tmp);
5527 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5528
5529 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5530 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5531 }
5532
5533 static void read_invoc_fetch_args(
5534 struct lp_build_tgsi_context *bld_base,
5535 struct lp_build_emit_data *emit_data)
5536 {
5537 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5538 0, emit_data->src_chan);
5539
5540 /* Always read the source invocation (= lane) from the X channel. */
5541 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5542 1, TGSI_CHAN_X);
5543 emit_data->arg_count = 2;
5544 }
5545
5546 static void read_lane_emit(
5547 const struct lp_build_tgsi_action *action,
5548 struct lp_build_tgsi_context *bld_base,
5549 struct lp_build_emit_data *emit_data)
5550 {
5551 struct si_shader_context *ctx = si_shader_context(bld_base);
5552 LLVMBuilderRef builder = ctx->gallivm.builder;
5553
5554 /* We currently have no other way to prevent LLVM from lifting the icmp
5555 * calls to a dominating basic block.
5556 */
5557 emit_optimization_barrier(ctx, &emit_data->args[0]);
5558
5559 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5560 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5561 ctx->i32, "");
5562 }
5563
5564 emit_data->output[emit_data->chan] =
5565 ac_build_intrinsic(&ctx->ac, action->intr_name,
5566 ctx->i32, emit_data->args, emit_data->arg_count,
5567 AC_FUNC_ATTR_READNONE |
5568 AC_FUNC_ATTR_CONVERGENT);
5569 }
5570
5571 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5572 struct lp_build_emit_data *emit_data)
5573 {
5574 struct si_shader_context *ctx = si_shader_context(bld_base);
5575 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5576 LLVMValueRef imm;
5577 unsigned stream;
5578
5579 assert(src0.File == TGSI_FILE_IMMEDIATE);
5580
5581 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5582 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5583 return stream;
5584 }
5585
5586 /* Emit one vertex from the geometry shader */
5587 static void si_llvm_emit_vertex(
5588 const struct lp_build_tgsi_action *action,
5589 struct lp_build_tgsi_context *bld_base,
5590 struct lp_build_emit_data *emit_data)
5591 {
5592 struct si_shader_context *ctx = si_shader_context(bld_base);
5593 struct lp_build_context *uint = &bld_base->uint_bld;
5594 struct si_shader *shader = ctx->shader;
5595 struct tgsi_shader_info *info = &shader->selector->info;
5596 struct gallivm_state *gallivm = &ctx->gallivm;
5597 struct lp_build_if_state if_state;
5598 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5599 ctx->param_gs2vs_offset);
5600 LLVMValueRef gs_next_vertex;
5601 LLVMValueRef can_emit, kill;
5602 unsigned chan, offset;
5603 int i;
5604 unsigned stream;
5605
5606 stream = si_llvm_get_stream(bld_base, emit_data);
5607
5608 /* Write vertex attribute values to GSVS ring */
5609 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5610 ctx->gs_next_vertex[stream],
5611 "");
5612
5613 /* If this thread has already emitted the declared maximum number of
5614 * vertices, skip the write: excessive vertex emissions are not
5615 * supposed to have any effect.
5616 *
5617 * If the shader has no writes to memory, kill it instead. This skips
5618 * further memory loads and may allow LLVM to skip to the end
5619 * altogether.
5620 */
5621 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5622 LLVMConstInt(ctx->i32,
5623 shader->selector->gs_max_out_vertices, 0), "");
5624
5625 bool use_kill = !info->writes_memory;
5626 if (use_kill) {
5627 kill = lp_build_select(&bld_base->base, can_emit,
5628 LLVMConstReal(ctx->f32, 1.0f),
5629 LLVMConstReal(ctx->f32, -1.0f));
5630
5631 ac_build_kill(&ctx->ac, kill);
5632 } else {
5633 lp_build_if(&if_state, gallivm, can_emit);
5634 }
5635
5636 offset = 0;
5637 for (i = 0; i < info->num_outputs; i++) {
5638 LLVMValueRef *out_ptr = ctx->outputs[i];
5639
5640 for (chan = 0; chan < 4; chan++) {
5641 if (!(info->output_usagemask[i] & (1 << chan)) ||
5642 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5643 continue;
5644
5645 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5646 LLVMValueRef voffset =
5647 LLVMConstInt(ctx->i32, offset *
5648 shader->selector->gs_max_out_vertices, 0);
5649 offset++;
5650
5651 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5652 voffset = lp_build_mul_imm(uint, voffset, 4);
5653
5654 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5655
5656 ac_build_buffer_store_dword(&ctx->ac,
5657 ctx->gsvs_ring[stream],
5658 out_val, 1,
5659 voffset, soffset, 0,
5660 1, 1, true, true);
5661 }
5662 }
5663
5664 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5665 ctx->i32_1);
5666
5667 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5668
5669 /* Signal vertex emission */
5670 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5671 si_get_gs_wave_id(ctx));
5672 if (!use_kill)
5673 lp_build_endif(&if_state);
5674 }
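
/* Editor's aside: byte-offset math used by the GSVS ring store above
 * (illustrative only). Each enabled output component occupies a run of
 * gs_max_out_vertices dwords, indexed by the vertex being emitted.
 */
static inline unsigned gsvs_store_byte_offset(unsigned component_slot,
                                              unsigned vertex_index,
                                              unsigned gs_max_out_vertices)
{
        return (component_slot * gs_max_out_vertices + vertex_index) * 4;
}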
5675
5676 /* Cut one primitive from the geometry shader */
5677 static void si_llvm_emit_primitive(
5678 const struct lp_build_tgsi_action *action,
5679 struct lp_build_tgsi_context *bld_base,
5680 struct lp_build_emit_data *emit_data)
5681 {
5682 struct si_shader_context *ctx = si_shader_context(bld_base);
5683 unsigned stream;
5684
5685 /* Signal primitive cut */
5686 stream = si_llvm_get_stream(bld_base, emit_data);
5687 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5688 si_get_gs_wave_id(ctx));
5689 }
5690
5691 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5692 struct lp_build_tgsi_context *bld_base,
5693 struct lp_build_emit_data *emit_data)
5694 {
5695 struct si_shader_context *ctx = si_shader_context(bld_base);
5696 struct gallivm_state *gallivm = &ctx->gallivm;
5697
5698 /* SI only (thanks to a hw bug workaround):
5699 * The real barrier instruction isn't needed, because an entire patch
5700 * always fits into a single wave.
5701 */
5702 if (ctx->screen->b.chip_class == SI &&
5703 ctx->type == PIPE_SHADER_TESS_CTRL) {
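/* Editor's note (hedged, based on the s_waitcnt encoding): each
 * *_CNT macro is assumed to be an all-ones immediate with only its
 * own counter field cleared, so ANDing them requests a wait until
 * both the LGKM and VM counters reach zero. */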
5704 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5705 return;
5706 }
5707
5708 lp_build_intrinsic(gallivm->builder,
5709 "llvm.amdgcn.s.barrier",
5710 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5711 }
5712
5713 static const struct lp_build_tgsi_action tex_action = {
5714 .fetch_args = tex_fetch_args,
5715 .emit = build_tex_intrinsic,
5716 };
5717
5718 static const struct lp_build_tgsi_action interp_action = {
5719 .fetch_args = interp_fetch_args,
5720 .emit = build_interp_intrinsic,
5721 };
5722
5723 static void si_create_function(struct si_shader_context *ctx,
5724 const char *name,
5725 LLVMTypeRef *returns, unsigned num_returns,
5726 LLVMTypeRef *params, unsigned num_params,
5727 int last_sgpr, unsigned max_workgroup_size)
5728 {
5729 int i;
5730
5731 si_llvm_create_func(ctx, name, returns, num_returns,
5732 params, num_params);
5733 ctx->return_value = LLVMGetUndef(ctx->return_type);
5734
5735 for (i = 0; i <= last_sgpr; ++i) {
5736 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5737
5738 /* The combination of:
5739 * - ByVal
5740 * - dereferenceable
5741 * - invariant.load
5742 * allows the optimization passes to move loads and reduces
5743 * SGPR spilling significantly.
5744 */
5745 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5746 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5747 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5748 ac_add_attr_dereferenceable(P, UINT64_MAX);
5749 } else
5750 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5751 }
5752
5753 if (max_workgroup_size) {
5754 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
5755 max_workgroup_size);
5756 }
5757 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5758 "no-signed-zeros-fp-math",
5759 "true");
5760
5761 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5762 /* These were copied from some LLVM test. */
5763 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5764 "less-precise-fpmad",
5765 "true");
5766 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5767 "no-infs-fp-math",
5768 "true");
5769 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5770 "no-nans-fp-math",
5771 "true");
5772 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5773 "unsafe-fp-math",
5774 "true");
5775 }
5776 }
5777
5778 static void declare_streamout_params(struct si_shader_context *ctx,
5779 struct pipe_stream_output_info *so,
5780 LLVMTypeRef *params, LLVMTypeRef i32,
5781 unsigned *num_params)
5782 {
5783 int i;
5784
5785 /* Streamout SGPRs. */
5786 if (so->num_outputs) {
5787 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5788 params[ctx->param_streamout_config = (*num_params)++] = i32;
5789 else
5790 ctx->param_streamout_config = *num_params - 1;
5791
5792 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5793 }
5794 /* A streamout buffer offset is loaded if the stride is non-zero. */
5795 for (i = 0; i < 4; i++) {
5796 if (!so->stride[i])
5797 continue;
5798
5799 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5800 }
5801 }
5802
5803 static unsigned llvm_get_type_size(LLVMTypeRef type)
5804 {
5805 LLVMTypeKind kind = LLVMGetTypeKind(type);
5806
5807 switch (kind) {
5808 case LLVMIntegerTypeKind:
5809 return LLVMGetIntTypeWidth(type) / 8;
5810 case LLVMFloatTypeKind:
5811 return 4;
5812 case LLVMPointerTypeKind:
5813 return 8;
5814 case LLVMVectorTypeKind:
5815 return LLVMGetVectorSize(type) *
5816 llvm_get_type_size(LLVMGetElementType(type));
5817 case LLVMArrayTypeKind:
5818 return LLVMGetArrayLength(type) *
5819 llvm_get_type_size(LLVMGetElementType(type));
5820 default:
5821 assert(0);
5822 return 0;
5823 }
5824 }
5825
5826 static void declare_lds_as_pointer(struct si_shader_context *ctx)
5827 {
5828 struct gallivm_state *gallivm = &ctx->gallivm;
5829
5830 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5831 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5832 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5833 "lds");
5834 }
5835
5836 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
5837 {
5838 switch (shader->selector->type) {
5839 case PIPE_SHADER_TESS_CTRL:
5840 /* Return this so that LLVM doesn't remove s_barrier
5841 * instructions on chips where we use s_barrier. */
5842 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
5843
5844 case PIPE_SHADER_GEOMETRY:
5845 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
5846
5847 case PIPE_SHADER_COMPUTE:
5848 break; /* see below */
5849
5850 default:
5851 return 0;
5852 }
5853
5854 const unsigned *properties = shader->selector->info.properties;
5855 unsigned max_work_group_size =
5856 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5857 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5858 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5859
5860 if (!max_work_group_size) {
5861 /* This is a variable group size compute shader;
5862 * compile it for the maximum possible group size.
5863 */
5864 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5865 }
5866 return max_work_group_size;
5867 }
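
/* Editor's aside, a worked example: a compute shader declared with a
 * fixed local size of 8x8x4 yields 8 * 8 * 4 = 256 threads per block,
 * so 256 is returned here instead of the
 * SI_MAX_VARIABLE_THREADS_PER_BLOCK fallback. */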
5868
5869 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5870 LLVMTypeRef *params,
5871 unsigned *num_params,
5872 bool assign_params)
5873 {
5874 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_CONST_BUFFERS);
5875 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5876 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5877 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5878
5879 if (assign_params) {
5880 ctx->param_const_buffers = *num_params - 4;
5881 ctx->param_samplers = *num_params - 3;
5882 ctx->param_images = *num_params - 2;
5883 ctx->param_shader_buffers = *num_params - 1;
5884 }
5885 }
5886
5887 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5888 LLVMTypeRef *params,
5889 unsigned *num_params)
5890 {
5891 params[ctx->param_rw_buffers = (*num_params)++] =
5892 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
5893 declare_per_stage_desc_pointers(ctx, params, num_params, true);
5894 }
5895
5896 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
5897 LLVMTypeRef *params,
5898 unsigned *num_params)
5899 {
5900 params[ctx->param_vertex_buffers = (*num_params)++] =
5901 const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS);
5902 params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
5903 params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
5904 params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
5905 params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
5906 }
5907
5908 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
5909 LLVMTypeRef *params, unsigned *num_params,
5910 unsigned *num_prolog_vgprs)
5911 {
5912 struct si_shader *shader = ctx->shader;
5913
5914 params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
5915 if (shader->key.as_ls) {
5916 params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
5917 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5918 } else {
5919 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5920 params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
5921 }
5922 params[(*num_params)++] = ctx->i32; /* unused */
5923
5924 if (!shader->is_gs_copy_shader) {
5925 /* Vertex load indices. */
5926 ctx->param_vertex_index0 = (*num_params);
5927 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
5928 params[(*num_params)++] = ctx->i32;
5929 *num_prolog_vgprs += shader->selector->info.num_inputs;
5930 }
5931 }
5932
5933 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
5934 LLVMTypeRef *params, unsigned *num_params)
5935 {
5936 params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
5937 params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
5938 params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
5939 params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
5940 }
5941
5942 enum {
5943 /* Convenient merged shader definitions. */
5944 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
5945 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
5946 };
5947
5948 static void create_function(struct si_shader_context *ctx)
5949 {
5950 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5951 struct gallivm_state *gallivm = &ctx->gallivm;
5952 struct si_shader *shader = ctx->shader;
5953 LLVMTypeRef params[100]; /* just make it large enough */
5954 LLVMTypeRef returns[16+32*4];
5955 unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5956 unsigned num_returns = 0;
5957 unsigned num_prolog_vgprs = 0;
5958 unsigned type = ctx->type;
5959
5960 /* Set MERGED shaders. */
5961 if (ctx->screen->b.chip_class >= GFX9) {
5962 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
5963 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
5964 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
5965 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
5966 }
5967
5968 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
5969
5970 switch (type) {
5971 case PIPE_SHADER_VERTEX:
5972 declare_default_desc_pointers(ctx, params, &num_params);
5973 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5974
5975 if (shader->key.as_es) {
5976 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5977 } else if (shader->key.as_ls) {
5978 /* no extra parameters */
5979 } else {
5980 if (shader->is_gs_copy_shader)
5981 num_params = ctx->param_rw_buffers + 1;
5982
5983 /* The locations of the other parameters are assigned dynamically. */
5984 declare_streamout_params(ctx, &shader->selector->so,
5985 params, ctx->i32, &num_params);
5986 }
5987
5988 last_sgpr = num_params-1;
5989
5990 /* VGPRs */
5991 declare_vs_input_vgprs(ctx, params, &num_params,
5992 &num_prolog_vgprs);
5993 break;
5994
5995 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
5996 declare_default_desc_pointers(ctx, params, &num_params);
5997 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5998 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5999 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
6000 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
6001 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6002 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
6003 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6004 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
6005 last_sgpr = num_params - 1;
6006
6007 /* VGPRs */
6008 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
6009 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
6010
6011 /* param_tcs_offchip_offset and param_tcs_factor_offset are
6012 * placed after the user SGPRs.
6013 */
6014 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
6015 returns[num_returns++] = ctx->i32; /* SGPRs */
6016 for (i = 0; i < 3; i++)
6017 returns[num_returns++] = ctx->f32; /* VGPRs */
6018 break;
6019
6020 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
6021 /* Merged stages have 8 system SGPRs at the beginning. */
6022 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
6023 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
6024 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6025 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
6026 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
6027 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
6028 params[num_params++] = ctx->i32; /* unused */
6029 params[num_params++] = ctx->i32; /* unused */
6030
6031 params[num_params++] = ctx->i32; /* unused */
6032 params[num_params++] = ctx->i32; /* unused */
6033 declare_per_stage_desc_pointers(ctx, params, &num_params,
6034 ctx->type == PIPE_SHADER_VERTEX);
6035 declare_vs_specific_input_sgprs(ctx, params, &num_params);
6036
6037 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6038 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
6039 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
6040 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6041 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
6042 params[num_params++] = ctx->i32; /* unused */
6043
6044 declare_per_stage_desc_pointers(ctx, params, &num_params,
6045 ctx->type == PIPE_SHADER_TESS_CTRL);
6046 last_sgpr = num_params - 1;
6047
6048 /* VGPRs (first TCS, then VS) */
6049 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
6050 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
6051
6052 if (ctx->type == PIPE_SHADER_VERTEX) {
6053 declare_vs_input_vgprs(ctx, params, &num_params,
6054 &num_prolog_vgprs);
6055
6056 /* LS return values are inputs to the TCS main shader part. */
6057 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
6058 returns[num_returns++] = ctx->i32; /* SGPRs */
6059 for (i = 0; i < 2; i++)
6060 returns[num_returns++] = ctx->f32; /* VGPRs */
6061 } else {
6062 /* TCS return values are inputs to the TCS epilog.
6063 *
6064 * param_tcs_offchip_offset, param_tcs_factor_offset,
6065 * param_tcs_offchip_layout, and param_rw_buffers
6066 * should be passed to the epilog.
6067 */
6068 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
6069 returns[num_returns++] = ctx->i32; /* SGPRs */
6070 for (i = 0; i < 3; i++)
6071 returns[num_returns++] = ctx->f32; /* VGPRs */
6072 }
6073 break;
6074
6075 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
6076 /* Merged stages have 8 system SGPRs at the beginning. */
6077 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
6078 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
6079 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6080 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
6081 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6082 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
6083 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
6084 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
6085
6086 params[num_params++] = ctx->i32; /* unused */
6087 params[num_params++] = ctx->i32; /* unused */
6088 declare_per_stage_desc_pointers(ctx, params, &num_params,
6089 (ctx->type == PIPE_SHADER_VERTEX ||
6090 ctx->type == PIPE_SHADER_TESS_EVAL));
6091 if (ctx->type == PIPE_SHADER_VERTEX) {
6092 declare_vs_specific_input_sgprs(ctx, params, &num_params);
6093 } else {
6094 /* TESS_EVAL (and also GEOMETRY):
6095 * Declare as many input SGPRs as the VS has. */
6096 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6097 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6098 params[num_params++] = ctx->i32; /* unused */
6099 params[num_params++] = ctx->i32; /* unused */
6100 params[num_params++] = ctx->i32; /* unused */
6101 params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
6102 }
6103
6104 declare_per_stage_desc_pointers(ctx, params, &num_params,
6105 ctx->type == PIPE_SHADER_GEOMETRY);
6106 last_sgpr = num_params - 1;
6107
6108 /* VGPRs (first GS, then VS/TES) */
6109 params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
6110 params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
6111 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6112 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6113 params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
6114
6115 if (ctx->type == PIPE_SHADER_VERTEX) {
6116 declare_vs_input_vgprs(ctx, params, &num_params,
6117 &num_prolog_vgprs);
6118 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
6119 declare_tes_input_vgprs(ctx, params, &num_params);
6120 }
6121
6122 if (ctx->type == PIPE_SHADER_VERTEX ||
6123 ctx->type == PIPE_SHADER_TESS_EVAL) {
6124 /* ES return values are inputs to GS. */
6125 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
6126 returns[num_returns++] = ctx->i32; /* SGPRs */
6127 for (i = 0; i < 5; i++)
6128 returns[num_returns++] = ctx->f32; /* VGPRs */
6129 }
6130 break;
6131
6132 case PIPE_SHADER_TESS_EVAL:
6133 declare_default_desc_pointers(ctx, params, &num_params);
6134 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6135 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6136
6137 if (shader->key.as_es) {
6138 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6139 params[num_params++] = ctx->i32;
6140 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
6141 } else {
6142 params[num_params++] = ctx->i32;
6143 declare_streamout_params(ctx, &shader->selector->so,
6144 params, ctx->i32, &num_params);
6145 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6146 }
6147 last_sgpr = num_params - 1;
6148
6149 /* VGPRs */
6150 declare_tes_input_vgprs(ctx, params, &num_params);
6151 break;
6152
6153 case PIPE_SHADER_GEOMETRY:
6154 declare_default_desc_pointers(ctx, params, &num_params);
6155 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6156 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
6157 last_sgpr = num_params - 1;
6158
6159 /* VGPRs */
6160 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
6161 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
6162 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6163 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
6164 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
6165 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
6166 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
6167 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6168 break;
6169
6170 case PIPE_SHADER_FRAGMENT:
6171 declare_default_desc_pointers(ctx, params, &num_params);
6172 params[SI_PARAM_ALPHA_REF] = ctx->f32;
6173 params[SI_PARAM_PRIM_MASK] = ctx->i32;
6174 last_sgpr = SI_PARAM_PRIM_MASK;
6175 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
6176 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
6177 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
6178 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
6179 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
6180 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
6181 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
6182 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
6183 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
6184 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
6185 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
6186 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
6187 params[SI_PARAM_FRONT_FACE] = ctx->i32;
6188 shader->info.face_vgpr_index = 20;
6189 params[SI_PARAM_ANCILLARY] = ctx->i32;
6190 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
6191 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
6192 num_params = SI_PARAM_POS_FIXED_PT+1;
6193
6194 /* Color inputs from the prolog. */
6195 if (shader->selector->info.colors_read) {
6196 unsigned num_color_elements =
6197 util_bitcount(shader->selector->info.colors_read);
6198
6199 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
6200 for (i = 0; i < num_color_elements; i++)
6201 params[num_params++] = ctx->f32;
6202
6203 num_prolog_vgprs += num_color_elements;
6204 }
6205
6206 /* Outputs for the epilog. */
6207 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
6208 num_returns =
6209 num_return_sgprs +
6210 util_bitcount(shader->selector->info.colors_written) * 4 +
6211 shader->selector->info.writes_z +
6212 shader->selector->info.writes_stencil +
6213 shader->selector->info.writes_samplemask +
6214 1 /* SampleMaskIn */;
6215
6216 num_returns = MAX2(num_returns,
6217 num_return_sgprs +
6218 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6219
6220 for (i = 0; i < num_return_sgprs; i++)
6221 returns[i] = ctx->i32;
6222 for (; i < num_returns; i++)
6223 returns[i] = ctx->f32;
6224 break;
6225
6226 case PIPE_SHADER_COMPUTE:
6227 declare_default_desc_pointers(ctx, params, &num_params);
6228 if (shader->selector->info.uses_grid_size)
6229 params[ctx->param_grid_size = num_params++] = v3i32;
6230 if (shader->selector->info.uses_block_size)
6231 params[ctx->param_block_size = num_params++] = v3i32;
6232
6233 for (i = 0; i < 3; i++) {
6234 ctx->param_block_id[i] = -1;
6235 if (shader->selector->info.uses_block_id[i])
6236 params[ctx->param_block_id[i] = num_params++] = ctx->i32;
6237 }
6238 last_sgpr = num_params - 1;
6239
6240 params[ctx->param_thread_id = num_params++] = v3i32;
6241 break;
6242 default:
6243 assert(0 && "unimplemented shader");
6244 return;
6245 }
6246
6247 assert(num_params <= ARRAY_SIZE(params));
6248
6249 si_create_function(ctx, "main", returns, num_returns, params,
6250 num_params, last_sgpr,
6251 si_get_max_workgroup_size(shader));
6252
6253 /* Reserve register locations for VGPR inputs the PS prolog may need. */
6254 if (ctx->type == PIPE_SHADER_FRAGMENT &&
6255 ctx->separate_prolog) {
6256 si_llvm_add_attribute(ctx->main_fn,
6257 "InitialPSInputAddr",
6258 S_0286D0_PERSP_SAMPLE_ENA(1) |
6259 S_0286D0_PERSP_CENTER_ENA(1) |
6260 S_0286D0_PERSP_CENTROID_ENA(1) |
6261 S_0286D0_LINEAR_SAMPLE_ENA(1) |
6262 S_0286D0_LINEAR_CENTER_ENA(1) |
6263 S_0286D0_LINEAR_CENTROID_ENA(1) |
6264 S_0286D0_FRONT_FACE_ENA(1) |
6265 S_0286D0_POS_FIXED_PT_ENA(1));
6266 }
6267
6268 shader->info.num_input_sgprs = 0;
6269 shader->info.num_input_vgprs = 0;
6270
6271 for (i = 0; i <= last_sgpr; ++i)
6272 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
6273
6274 for (; i < num_params; ++i)
6275 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
6276
6277 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
6278 shader->info.num_input_vgprs -= num_prolog_vgprs;
6279
6280 if (!ctx->screen->has_ds_bpermute &&
6281 bld_base->info &&
6282 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
6283 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
6284 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
6285 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
6286 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
6287 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
6288 ctx->lds =
6289 LLVMAddGlobalInAddressSpace(gallivm->module,
6290 LLVMArrayType(ctx->i32, 64),
6291 "ddxy_lds",
6292 LOCAL_ADDR_SPACE);
6293
6294 if (shader->key.as_ls ||
6295 ctx->type == PIPE_SHADER_TESS_CTRL ||
6296 /* GFX9 has the ESGS ring buffer in LDS. */
6297 (ctx->screen->b.chip_class >= GFX9 &&
6298 (shader->key.as_es ||
6299 ctx->type == PIPE_SHADER_GEOMETRY)))
6300 declare_lds_as_pointer(ctx);
6301 }
6302
6303 /**
6304 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
6305 * for later use.
6306 */
6307 static void preload_ring_buffers(struct si_shader_context *ctx)
6308 {
6309 struct gallivm_state *gallivm = &ctx->gallivm;
6310 LLVMBuilderRef builder = gallivm->builder;
6311
6312 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
6313 ctx->param_rw_buffers);
6314
6315 if (ctx->screen->b.chip_class <= VI &&
6316 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
6317 unsigned ring =
6318 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
6319 : SI_ES_RING_ESGS;
6320 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
6321
6322 ctx->esgs_ring =
6323 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6324 }
6325
6326 if (ctx->shader->is_gs_copy_shader) {
6327 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6328
6329 ctx->gsvs_ring[0] =
6330 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6331 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
6332 const struct si_shader_selector *sel = ctx->shader->selector;
6333 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6334 LLVMValueRef base_ring;
6335
6336 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6337
6338 /* The conceptual layout of the GSVS ring is
6339 * v0c0 .. vLc0 v0c1 .. vLc1 ..
6340 * but the real memory layout is swizzled across
6341 * threads:
6342 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
6343 * t16v0c0 ..
6344 * Override the buffer descriptor accordingly.
6345 */
6346 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
6347 uint64_t stream_offset = 0;
6348
6349 for (unsigned stream = 0; stream < 4; ++stream) {
6350 unsigned num_components;
6351 unsigned stride;
6352 unsigned num_records;
6353 LLVMValueRef ring, tmp;
6354
6355 num_components = sel->info.num_stream_output_components[stream];
6356 if (!num_components)
6357 continue;
6358
6359 stride = 4 * num_components * sel->gs_max_out_vertices;
6360
6361 /* Limit on the stride field for <= CIK. */
6362 assert(stride < (1 << 14));
6363
6364 num_records = 64;
6365
6366 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
6367 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
6368 tmp = LLVMBuildAdd(builder, tmp,
6369 LLVMConstInt(ctx->i64,
6370 stream_offset, 0), "");
6371 stream_offset += stride * 64;
6372
6373 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
6374 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
6375 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
6376 tmp = LLVMBuildOr(builder, tmp,
6377 LLVMConstInt(ctx->i32,
6378 S_008F04_STRIDE(stride) |
6379 S_008F04_SWIZZLE_ENABLE(1), 0), "");
6380 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
6381 ring = LLVMBuildInsertElement(builder, ring,
6382 LLVMConstInt(ctx->i32, num_records, 0),
6383 LLVMConstInt(ctx->i32, 2, 0), "");
6384 ring = LLVMBuildInsertElement(builder, ring,
6385 LLVMConstInt(ctx->i32,
6386 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
6387 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6388 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
6389 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
6390 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6391 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
6392 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
6393 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
6394 S_008F0C_ADD_TID_ENABLE(1),
6395 0),
6396 LLVMConstInt(ctx->i32, 3, 0), "");
6397
6398 ctx->gsvs_ring[stream] = ring;
6399 }
6400 }
6401 }
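
/* Editor's aside: how the per-stream base addresses above walk through
 * the GSVS ring (illustrative only; assumes stride[] holds 0 for
 * streams with no output components, matching the loop above, and 64
 * is the wave-swizzle factor also used for num_records).
 */
static inline uint64_t gsvs_stream_base_offset(const unsigned stride[4],
                                               unsigned stream)
{
        uint64_t offset = 0;
        for (unsigned i = 0; i < stream; i++)
                offset += (uint64_t)stride[i] * 64;
        return offset;
}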
6402
6403 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6404 LLVMValueRef param_rw_buffers,
6405 unsigned param_pos_fixed_pt)
6406 {
6407 struct gallivm_state *gallivm = &ctx->gallivm;
6408 LLVMBuilderRef builder = gallivm->builder;
6409 LLVMValueRef slot, desc, offset, row, bit, address[2];
6410
6411 /* Use the fixed-point gl_FragCoord input.
6412 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6413 * per coordinate to get the repeating effect.
6414 */
6415 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6416 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6417
6418 /* Load the buffer descriptor. */
6419 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6420 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6421
6422 /* The stipple pattern is 32x32, each row has 32 bits. */
6423 offset = LLVMBuildMul(builder, address[1],
6424 LLVMConstInt(ctx->i32, 4, 0), "");
6425 row = buffer_load_const(ctx, desc, offset);
6426 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6427 bit = LLVMBuildLShr(builder, row, address[0], "");
6428 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6429
6430 /* The intrinsic kills the thread if arg < 0. */
6431 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6432 LLVMConstReal(ctx->f32, -1), "");
6433 ac_build_kill(&ctx->ac, bit);
6434 }
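
/* Editor's aside: scalar equivalent of the stipple test above
 * (illustrative only). The pattern is 32 dwords of 32 bits; the low
 * 5 bits of each fixed-point coordinate select the bit, and a zero
 * bit kills the fragment.
 */
static inline bool stipple_fragment_passes(const uint32_t pattern[32],
                                           unsigned frag_x, unsigned frag_y)
{
        uint32_t row = pattern[frag_y & 31];
        return (row >> (frag_x & 31)) & 1;
}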
6435
6436 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6437 struct si_shader_config *conf,
6438 unsigned symbol_offset)
6439 {
6440 unsigned i;
6441 const unsigned char *config =
6442 ac_shader_binary_config_start(binary, symbol_offset);
6443 bool really_needs_scratch = false;
6444
6445 /* LLVM adds SGPR spills to the scratch size.
6446 * Find out if we really need the scratch buffer.
6447 */
6448 for (i = 0; i < binary->reloc_count; i++) {
6449 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6450
6451 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6452 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6453 really_needs_scratch = true;
6454 break;
6455 }
6456 }
6457
6458 /* XXX: We may be able to emit some of these values directly rather than
6459 * extracting fields to be emitted later.
6460 */
6461
6462 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
6463 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6464 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6465 switch (reg) {
6466 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6467 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6468 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6469 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
6470 case R_00B848_COMPUTE_PGM_RSRC1:
6471 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6472 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6473 conf->float_mode = G_00B028_FLOAT_MODE(value);
6474 conf->rsrc1 = value;
6475 break;
6476 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6477 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6478 break;
6479 case R_00B84C_COMPUTE_PGM_RSRC2:
6480 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6481 conf->rsrc2 = value;
6482 break;
6483 case R_0286CC_SPI_PS_INPUT_ENA:
6484 conf->spi_ps_input_ena = value;
6485 break;
6486 case R_0286D0_SPI_PS_INPUT_ADDR:
6487 conf->spi_ps_input_addr = value;
6488 break;
6489 case R_0286E8_SPI_TMPRING_SIZE:
6490 case R_00B860_COMPUTE_TMPRING_SIZE:
6491 /* WAVESIZE is in units of 256 dwords. */
6492 if (really_needs_scratch)
6493 conf->scratch_bytes_per_wave =
6494 G_00B860_WAVESIZE(value) * 256 * 4;
6495 break;
6496 case 0x4: /* SPILLED_SGPRS */
6497 conf->spilled_sgprs = value;
6498 break;
6499 case 0x8: /* SPILLED_VGPRS */
6500 conf->spilled_vgprs = value;
6501 break;
6502 default:
6503 {
6504 static bool printed;
6505
6506 if (!printed) {
6507 fprintf(stderr, "Warning: LLVM emitted unknown "
6508 "config register: 0x%x\n", reg);
6509 printed = true;
6510 }
6511 }
6512 break;
6513 }
6514 }
6515
6516 if (!conf->spi_ps_input_addr)
6517 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6518 }
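
/* Editor's aside: the config blob parsed above is a flat array of
 * (register, value) pairs, 8 bytes per entry. A minimal walk
 * (illustrative only; assumes a little-endian blob, which is what
 * util_le32_to_cpu handles in the real code):
 */
static inline void walk_config_pairs(const unsigned char *config,
                                     unsigned size_per_symbol)
{
        for (unsigned i = 0; i < size_per_symbol; i += 8) {
                uint32_t reg   = *(const uint32_t *)(config + i);
                uint32_t value = *(const uint32_t *)(config + i + 4);
                (void)reg;
                (void)value; /* dispatch on reg as in the switch above */
        }
}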
6519
6520 void si_shader_apply_scratch_relocs(struct si_shader *shader,
6521 uint64_t scratch_va)
6522 {
6523 unsigned i;
6524 uint32_t scratch_rsrc_dword0 = scratch_va;
6525 uint32_t scratch_rsrc_dword1 =
6526 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6527
6528 /* Enable scratch coalescing. */
6529 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6530
6531 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6532 const struct ac_shader_reloc *reloc =
6533 &shader->binary.relocs[i];
6534 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6535 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6536 &scratch_rsrc_dword0, 4);
6537 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6538 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6539 &scratch_rsrc_dword1, 4);
6540 }
6541 }
6542 }
6543
6544 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
6545 {
6546 unsigned size = shader->binary.code_size;
6547
6548 if (shader->prolog)
6549 size += shader->prolog->binary.code_size;
6550 if (shader->previous_stage)
6551 size += shader->previous_stage->binary.code_size;
6552 if (shader->prolog2)
6553 size += shader->prolog2->binary.code_size;
6554 if (shader->epilog)
6555 size += shader->epilog->binary.code_size;
6556 return size;
6557 }
6558
6559 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6560 {
6561 const struct ac_shader_binary *prolog =
6562 shader->prolog ? &shader->prolog->binary : NULL;
6563 const struct ac_shader_binary *previous_stage =
6564 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6565 const struct ac_shader_binary *prolog2 =
6566 shader->prolog2 ? &shader->prolog2->binary : NULL;
6567 const struct ac_shader_binary *epilog =
6568 shader->epilog ? &shader->epilog->binary : NULL;
6569 const struct ac_shader_binary *mainb = &shader->binary;
6570 unsigned bo_size = si_get_shader_binary_size(shader) +
6571 (!epilog ? mainb->rodata_size : 0);
6572 unsigned char *ptr;
6573
6574 assert(!prolog || !prolog->rodata_size);
6575 assert(!previous_stage || !previous_stage->rodata_size);
6576 assert(!prolog2 || !prolog2->rodata_size);
6577 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
6578 !mainb->rodata_size);
6579 assert(!epilog || !epilog->rodata_size);
6580
6581 /* GFX9 can fetch at most 128 bytes past the end of the shader.
6582 * Prevent VM faults.
6583 */
6584 if (sscreen->b.chip_class >= GFX9)
6585 bo_size += 128;
6586
6587 r600_resource_reference(&shader->bo, NULL);
6588 shader->bo = (struct r600_resource*)
6589 pipe_buffer_create(&sscreen->b.b, 0,
6590 PIPE_USAGE_IMMUTABLE,
6591 align(bo_size, SI_CPDMA_ALIGNMENT));
6592 if (!shader->bo)
6593 return -ENOMEM;
6594
6595 /* Upload. */
6596 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6597 PIPE_TRANSFER_READ_WRITE |
6598 PIPE_TRANSFER_UNSYNCHRONIZED);
6599
6600 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
6601 * endian-independent. */
6602 if (prolog) {
6603 memcpy(ptr, prolog->code, prolog->code_size);
6604 ptr += prolog->code_size;
6605 }
6606 if (previous_stage) {
6607 memcpy(ptr, previous_stage->code, previous_stage->code_size);
6608 ptr += previous_stage->code_size;
6609 }
6610 if (prolog2) {
6611 memcpy(ptr, prolog2->code, prolog2->code_size);
6612 ptr += prolog2->code_size;
6613 }
6614
6615 memcpy(ptr, mainb->code, mainb->code_size);
6616 ptr += mainb->code_size;
6617
6618 if (epilog)
6619 memcpy(ptr, epilog->code, epilog->code_size);
6620 else if (mainb->rodata_size > 0)
6621 memcpy(ptr, mainb->rodata, mainb->rodata_size);
6622
6623 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6624 return 0;
6625 }
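
/* The resulting buffer layout, with optional parts present only when
 * non-NULL (main rodata is appended only when there is no epilog):
 *
 *   [prolog][previous_stage][prolog2][main code][epilog | main rodata]
 *
 * plus up to 128 bytes of padding on GFX9.
 */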
6626
6627 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6628 struct pipe_debug_callback *debug,
6629 const char *name, FILE *file)
6630 {
6631 char *line, *p;
6632 unsigned i, count;
6633
6634 if (binary->disasm_string) {
6635 fprintf(file, "Shader %s disassembly:\n", name);
6636 fprintf(file, "%s", binary->disasm_string);
6637
6638 if (debug && debug->debug_message) {
6639 /* Very long debug messages are cut off, so send the
6640 * disassembly one line at a time. This causes more
6641 * overhead, but on the plus side it simplifies
6642 * parsing of resulting logs.
6643 */
6644 pipe_debug_message(debug, SHADER_INFO,
6645 "Shader Disassembly Begin");
6646
6647 line = binary->disasm_string;
6648 while (*line) {
6649 p = util_strchrnul(line, '\n');
6650 count = p - line;
6651
6652 if (count) {
6653 pipe_debug_message(debug, SHADER_INFO,
6654 "%.*s", count, line);
6655 }
6656
6657 if (!*p)
6658 break;
6659 line = p + 1;
6660 }
6661
6662 pipe_debug_message(debug, SHADER_INFO,
6663 "Shader Disassembly End");
6664 }
6665 } else {
6666 fprintf(file, "Shader %s binary:\n", name);
6667 for (i = 0; i < binary->code_size; i += 4) {
6668 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6669 binary->code[i + 3], binary->code[i + 2],
6670 binary->code[i + 1], binary->code[i]);
6671 }
6672 }
6673 }
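
/* Without a disassembly string, the raw dwords are printed with the
 * most significant byte first; e.g. s_endpgm (0xBF810000) would
 * appear as:
 *
 *   @0x0: bf810000
 */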
6674
6675 static void si_shader_dump_stats(struct si_screen *sscreen,
6676 const struct si_shader *shader,
6677 struct pipe_debug_callback *debug,
6678 unsigned processor,
6679 FILE *file,
6680 bool check_debug_option)
6681 {
6682 const struct si_shader_config *conf = &shader->config;
6683 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6684 unsigned code_size = si_get_shader_binary_size(shader);
6685 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6686 unsigned lds_per_wave = 0;
6687 unsigned max_simd_waves = 10;
6688
6689 /* Compute LDS usage for PS. */
6690 switch (processor) {
6691 case PIPE_SHADER_FRAGMENT:
6692 /* The minimum usage per wave is (num_inputs * 48). The maximum
6693 * usage is (num_inputs * 48 * 16).
6694 * We can get anything in between and it varies between waves.
6695 *
6696 * The 48 bytes per input for a single primitive is equal to
6697 * 4 bytes/component * 4 components/input * 3 points.
6698 *
6699 * Other stages don't know the size at compile time or don't
6700 * allocate LDS per wave, but instead they do it per thread group.
6701 */
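/* Worked example with assumed values: 8 PS inputs and lds_size = 2
 * on CIK give 2 * 512 + align(8 * 48, 512) = 1024 + 512 = 1536
 * bytes per wave as the minimum. */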
6702 lds_per_wave = conf->lds_size * lds_increment +
6703 align(num_inputs * 48, lds_increment);
6704 break;
6705 case PIPE_SHADER_COMPUTE:
6706 if (shader->selector) {
6707 unsigned max_workgroup_size =
6708 si_get_max_workgroup_size(shader);
6709 lds_per_wave = (conf->lds_size * lds_increment) /
6710 DIV_ROUND_UP(max_workgroup_size, 64);
6711 }
6712 break;
6713 }
6714
6715 /* Compute the per-SIMD wave counts. */
6716 if (conf->num_sgprs) {
6717 if (sscreen->b.chip_class >= VI)
6718 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6719 else
6720 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6721 }
6722
6723 if (conf->num_vgprs)
6724 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6725
6726 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6727 * 16KB makes some SIMDs unoccupied). */
6728 if (lds_per_wave)
6729 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
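/* Worked example with assumed values on VI: 40 SGPRs allow
 * 800 / 40 = 20 waves, but 96 VGPRs allow only 256 / 96 = 2, so
 * with no LDS the shader runs at 2 waves per SIMD. */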
6730
6731 if (!check_debug_option ||
6732 r600_can_dump_shader(&sscreen->b, processor)) {
6733 if (processor == PIPE_SHADER_FRAGMENT) {
6734 fprintf(file, "*** SHADER CONFIG ***\n"
6735 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6736 "SPI_PS_INPUT_ENA = 0x%04x\n",
6737 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6738 }
6739
6740 fprintf(file, "*** SHADER STATS ***\n"
6741 "SGPRS: %d\n"
6742 "VGPRS: %d\n"
6743 "Spilled SGPRs: %d\n"
6744 "Spilled VGPRs: %d\n"
6745 "Private memory VGPRs: %d\n"
6746 "Code Size: %d bytes\n"
6747 "LDS: %d blocks\n"
6748 "Scratch: %d bytes per wave\n"
6749 "Max Waves: %d\n"
6750 "********************\n\n\n",
6751 conf->num_sgprs, conf->num_vgprs,
6752 conf->spilled_sgprs, conf->spilled_vgprs,
6753 conf->private_mem_vgprs, code_size,
6754 conf->lds_size, conf->scratch_bytes_per_wave,
6755 max_simd_waves);
6756 }
6757
6758 pipe_debug_message(debug, SHADER_INFO,
6759 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6760 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6761 "Spilled VGPRs: %d PrivMem VGPRs: %d",
6762 conf->num_sgprs, conf->num_vgprs, code_size,
6763 conf->lds_size, conf->scratch_bytes_per_wave,
6764 max_simd_waves, conf->spilled_sgprs,
6765 conf->spilled_vgprs, conf->private_mem_vgprs);
6766 }
6767
6768 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
6769 {
6770 switch (processor) {
6771 case PIPE_SHADER_VERTEX:
6772 if (shader->key.as_es)
6773 return "Vertex Shader as ES";
6774 else if (shader->key.as_ls)
6775 return "Vertex Shader as LS";
6776 else
6777 return "Vertex Shader as VS";
6778 case PIPE_SHADER_TESS_CTRL:
6779 return "Tessellation Control Shader";
6780 case PIPE_SHADER_TESS_EVAL:
6781 if (shader->key.as_es)
6782 return "Tessellation Evaluation Shader as ES";
6783 else
6784 return "Tessellation Evaluation Shader as VS";
6785 case PIPE_SHADER_GEOMETRY:
6786 if (shader->is_gs_copy_shader)
6787 return "GS Copy Shader as VS";
6788 else
6789 return "Geometry Shader";
6790 case PIPE_SHADER_FRAGMENT:
6791 return "Pixel Shader";
6792 case PIPE_SHADER_COMPUTE:
6793 return "Compute Shader";
6794 default:
6795 return "Unknown Shader";
6796 }
6797 }
6798
6799 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
6800 struct pipe_debug_callback *debug, unsigned processor,
6801 FILE *file, bool check_debug_option)
6802 {
6803 if (!check_debug_option ||
6804 r600_can_dump_shader(&sscreen->b, processor))
6805 si_dump_shader_key(processor, shader, file);
6806
6807 if (!check_debug_option && shader->binary.llvm_ir_string) {
6808 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6809 si_get_shader_name(shader, processor));
6810 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6811 }
6812
6813 if (!check_debug_option ||
6814 (r600_can_dump_shader(&sscreen->b, processor) &&
6815 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6816 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6817
6818 if (shader->prolog)
6819 si_shader_dump_disassembly(&shader->prolog->binary,
6820 debug, "prolog", file);
6821 if (shader->previous_stage)
6822 si_shader_dump_disassembly(&shader->previous_stage->binary,
6823 debug, "previous stage", file);
6824 if (shader->prolog2)
6825 si_shader_dump_disassembly(&shader->prolog2->binary,
6826 debug, "prolog2", file);
6827
6828 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6829
6830 if (shader->epilog)
6831 si_shader_dump_disassembly(&shader->epilog->binary,
6832 debug, "epilog", file);
6833 fprintf(file, "\n");
6834 }
6835
6836 si_shader_dump_stats(sscreen, shader, debug, processor, file,
6837 check_debug_option);
6838 }
6839
6840 static int si_compile_llvm(struct si_screen *sscreen,
6841 struct ac_shader_binary *binary,
6842 struct si_shader_config *conf,
6843 LLVMTargetMachineRef tm,
6844 LLVMModuleRef mod,
6845 struct pipe_debug_callback *debug,
6846 unsigned processor,
6847 const char *name)
6848 {
6849 int r = 0;
6850 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6851
6852 if (r600_can_dump_shader(&sscreen->b, processor)) {
6853 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6854
6855 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6856 fprintf(stderr, "%s LLVM IR:\n\n", name);
6857 ac_dump_module(mod);
6858 fprintf(stderr, "\n");
6859 }
6860 }
6861
6862 if (sscreen->record_llvm_ir) {
6863 char *ir = LLVMPrintModuleToString(mod);
6864 binary->llvm_ir_string = strdup(ir);
6865 LLVMDisposeMessage(ir);
6866 }
6867
6868 if (!si_replace_shader(count, binary)) {
6869 r = si_llvm_compile(mod, binary, tm, debug);
6870 if (r)
6871 return r;
6872 }
6873
6874 si_shader_binary_read_config(binary, conf, 0);
6875
6876 /* Enable 64-bit and 16-bit denormals, because there is no performance
6877 * cost.
6878 *
6879 * If denormals are enabled, all floating-point output modifiers are
6880 * ignored.
6881 *
6882 * Don't enable denormals for 32-bit floats, because:
6883 * - Floating-point output modifiers would be ignored by the hw.
6884 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6885 * have to stop using those.
6886 * - SI & CI would be very slow.
6887 */
6888 conf->float_mode |= V_00B028_FP_64_DENORMS;
6889
6890 FREE(binary->config);
6891 FREE(binary->global_symbol_offsets);
6892 binary->config = NULL;
6893 binary->global_symbol_offsets = NULL;
6894
6895 /* Some shaders can't have rodata because their binaries can be
6896 * concatenated.
6897 */
6898 if (binary->rodata_size &&
6899 (processor == PIPE_SHADER_VERTEX ||
6900 processor == PIPE_SHADER_TESS_CTRL ||
6901 processor == PIPE_SHADER_TESS_EVAL ||
6902 processor == PIPE_SHADER_FRAGMENT)) {
6903 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6904 return -EINVAL;
6905 }
6906
6907 return r;
6908 }
6909
6910 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6911 {
6912 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6913 LLVMBuildRetVoid(ctx->gallivm.builder);
6914 else
6915 LLVMBuildRet(ctx->gallivm.builder, ret);
6916 }
6917
6918 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6919 struct si_shader *
6920 si_generate_gs_copy_shader(struct si_screen *sscreen,
6921 LLVMTargetMachineRef tm,
6922 struct si_shader_selector *gs_selector,
6923 struct pipe_debug_callback *debug)
6924 {
6925 struct si_shader_context ctx;
6926 struct si_shader *shader;
6927 struct gallivm_state *gallivm = &ctx.gallivm;
6928 LLVMBuilderRef builder;
6929 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6930 struct lp_build_context *uint = &bld_base->uint_bld;
6931 struct si_shader_output_values *outputs;
6932 struct tgsi_shader_info *gsinfo = &gs_selector->info;
6933 int i, r;
6934
6935 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6936
6937 if (!outputs)
6938 return NULL;
6939
6940 shader = CALLOC_STRUCT(si_shader);
6941 if (!shader) {
6942 FREE(outputs);
6943 return NULL;
6944 }
6945
6947 shader->selector = gs_selector;
6948 shader->is_gs_copy_shader = true;
6949
6950 si_init_shader_ctx(&ctx, sscreen, tm);
6951 ctx.shader = shader;
6952 ctx.type = PIPE_SHADER_VERTEX;
6953
6954 builder = gallivm->builder;
6955
6956 create_function(&ctx);
6957 preload_ring_buffers(&ctx);
6958
6959 LLVMValueRef voffset =
6960 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6961 ctx.param_vertex_id), 4);
6962
6963 /* Fetch the vertex stream ID. */
6964 LLVMValueRef stream_id;
6965
6966 if (gs_selector->so.num_outputs)
6967 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6968 else
6969 stream_id = ctx.i32_0;
6970
6971 /* Fill in output information. */
6972 for (i = 0; i < gsinfo->num_outputs; ++i) {
6973 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6974 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6975
6976 for (int chan = 0; chan < 4; chan++) {
6977 outputs[i].vertex_stream[chan] =
6978 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6979 }
6980 }
6981
6982 LLVMBasicBlockRef end_bb;
6983 LLVMValueRef switch_inst;
6984
6985 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6986 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6987
6988 for (int stream = 0; stream < 4; stream++) {
6989 LLVMBasicBlockRef bb;
6990 unsigned offset;
6991
6992 if (!gsinfo->num_stream_output_components[stream])
6993 continue;
6994
6995 if (stream > 0 && !gs_selector->so.num_outputs)
6996 continue;
6997
6998 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6999 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
7000 LLVMPositionBuilderAtEnd(builder, bb);
7001
7002 /* Fetch vertex data from GSVS ring */
7003 offset = 0;
7004 for (i = 0; i < gsinfo->num_outputs; ++i) {
7005 for (unsigned chan = 0; chan < 4; chan++) {
7006 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
7007 outputs[i].vertex_stream[chan] != stream) {
7008 outputs[i].values[chan] = ctx.bld_base.base.undef;
7009 continue;
7010 }
7011
7012 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
7013 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
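/* Restating the soffset formula with assumed numbers: with
 * gs_max_out_vertices = 4, slot 1 begins at
 * 1 * 4 * 16 * 4 = 256 bytes into the stream's ring data. */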
7014 offset++;
7015
7016 outputs[i].values[chan] =
7017 ac_build_buffer_load(&ctx.ac,
7018 ctx.gsvs_ring[0], 1,
7019 ctx.i32_0, voffset,
7020 soffset, 0, 1, 1, true);
7021 }
7022 }
7023
7024 /* Streamout and exports. */
7025 if (gs_selector->so.num_outputs) {
7026 si_llvm_emit_streamout(&ctx, outputs,
7027 gsinfo->num_outputs,
7028 stream);
7029 }
7030
7031 if (stream == 0)
7032 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
7033
7034 LLVMBuildBr(builder, end_bb);
7035 }
7036
7037 LLVMPositionBuilderAtEnd(builder, end_bb);
7038
7039 LLVMBuildRetVoid(gallivm->builder);
7040
7041 ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
7042 si_llvm_optimize_module(&ctx);
7043
7044 r = si_compile_llvm(sscreen, &ctx.shader->binary,
7045 &ctx.shader->config, ctx.tm,
7046 ctx.gallivm.module,
7047 debug, PIPE_SHADER_GEOMETRY,
7048 "GS Copy Shader");
7049 if (!r) {
7050 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
7051 fprintf(stderr, "GS Copy Shader:\n");
7052 si_shader_dump(sscreen, ctx.shader, debug,
7053 PIPE_SHADER_GEOMETRY, stderr, true);
7054 r = si_shader_binary_upload(sscreen, ctx.shader);
7055 }
7056
7057 si_llvm_dispose(&ctx);
7058
7059 FREE(outputs);
7060
7061 if (r != 0) {
7062 FREE(shader);
7063 shader = NULL;
7064 }
7065 return shader;
7066 }
7067
7068 static void si_dump_shader_key_vs(const struct si_shader_key *key,
7069 const struct si_vs_prolog_bits *prolog,
7070 const char *prefix, FILE *f)
7071 {
7072 fprintf(f, " %s.instance_divisors = {", prefix);
7073 for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
7074 fprintf(f, !i ? "%u" : ", %u",
7075 prolog->instance_divisors[i]);
7076 }
7077 fprintf(f, "}\n");
7078
7079 fprintf(f, " mono.vs.fix_fetch = {");
7080 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
7081 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
7082 fprintf(f, "}\n");
7083 }
7084
7085 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
7086 FILE *f)
7087 {
7088 const struct si_shader_key *key = &shader->key;
7089
7090 fprintf(f, "SHADER KEY\n");
7091
7092 switch (processor) {
7093 case PIPE_SHADER_VERTEX:
7094 si_dump_shader_key_vs(key, &key->part.vs.prolog,
7095 "part.vs.prolog", f);
7096 fprintf(f, " as_es = %u\n", key->as_es);
7097 fprintf(f, " as_ls = %u\n", key->as_ls);
7098 fprintf(f, " mono.vs_export_prim_id = %u\n",
7099 key->mono.vs_export_prim_id);
7100 break;
7101
7102 case PIPE_SHADER_TESS_CTRL:
7103 if (shader->selector->screen->b.chip_class >= GFX9) {
7104 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
7105 "part.tcs.ls_prolog", f);
7106 }
7107 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
7108 fprintf(f, " mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
7109 break;
7110
7111 case PIPE_SHADER_TESS_EVAL:
7112 fprintf(f, " as_es = %u\n", key->as_es);
7113 fprintf(f, " mono.vs_export_prim_id = %u\n",
7114 key->mono.vs_export_prim_id);
7115 break;
7116
7117 case PIPE_SHADER_GEOMETRY:
7118 if (shader->is_gs_copy_shader)
7119 break;
7120
7121 if (shader->selector->screen->b.chip_class >= GFX9 &&
7122 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
7123 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
7124 "part.gs.vs_prolog", f);
7125 }
7126 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
7127 break;
7128
7129 case PIPE_SHADER_COMPUTE:
7130 break;
7131
7132 case PIPE_SHADER_FRAGMENT:
7133 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
7134 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
7135 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
7136 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
7137 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
7138 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
7139 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
7140 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
7141 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
7142 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
7143 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
7144 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
7145 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
7146 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
7147 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
7148 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
7149 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
7150 break;
7151
7152 default:
7153 assert(0);
7154 }
7155
7156 if ((processor == PIPE_SHADER_GEOMETRY ||
7157 processor == PIPE_SHADER_TESS_EVAL ||
7158 processor == PIPE_SHADER_VERTEX) &&
7159 !key->as_es && !key->as_ls) {
7160 fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
7161 fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
7162 fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
7163 }
7164 }
7165
7166 static void si_init_shader_ctx(struct si_shader_context *ctx,
7167 struct si_screen *sscreen,
7168 LLVMTargetMachineRef tm)
7169 {
7170 struct lp_build_tgsi_context *bld_base;
7171 struct lp_build_tgsi_action tmpl = {};
7172
7173 si_llvm_context_init(ctx, sscreen, tm);
7174
7175 bld_base = &ctx->bld_base;
7176 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
7177
7178 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
7179 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
7180 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
7181
7182 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
7183 bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
7184 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
7185 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
7186 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
7187 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
7188 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
7189 bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
7190 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
7191 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
7192 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
7193 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
7194 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
7195 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
7196 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
7197 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
7198
7199 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
7200 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
7201 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
7202 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
7203 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
7204 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
7205
7206 tmpl.fetch_args = atomic_fetch_args;
7207 tmpl.emit = atomic_emit;
7208 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
7209 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
7210 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
7211 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
7212 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
7213 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
7214 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
7215 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
7216 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
7217 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
7218 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
7219 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
7220 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
7221 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
7222 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
7223 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
7224 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
7225 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
7226 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
7227 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
7228
7229 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
7230
7231 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
7232
7233 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
7234 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
7235 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
7236 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
7237
7238 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
7239 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
7240 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
7241 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
7242 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
7243 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
7244 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
7245 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
7246 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
7247
7248 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
7249 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
7250 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
7251 }
7252
7253 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
7254 {
7255 struct si_shader *shader = ctx->shader;
7256 struct tgsi_shader_info *info = &shader->selector->info;
7257
7258 if ((ctx->type != PIPE_SHADER_VERTEX &&
7259 ctx->type != PIPE_SHADER_TESS_EVAL) ||
7260 shader->key.as_ls ||
7261 shader->key.as_es)
7262 return;
7263
7264 ac_optimize_vs_outputs(&ctx->ac,
7265 ctx->main_fn,
7266 shader->info.vs_output_param_offset,
7267 info->num_outputs,
7268 &shader->info.nr_param_exports);
7269 }
7270
7271 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
7272 {
7273 ctx->shader->config.private_mem_vgprs = 0;
7274
7275 /* Process all LLVM instructions. */
7276 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
7277 while (bb) {
7278 LLVMValueRef next = LLVMGetFirstInstruction(bb);
7279
7280 while (next) {
7281 LLVMValueRef inst = next;
7282 next = LLVMGetNextInstruction(next);
7283
7284 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
7285 continue;
7286
7287 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
7288 /* No idea why LLVM aligns allocas to 4 elements. */
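/* e.g. a 12-byte alloca (3 dwords) with alignment 4 is counted
 * as align(3, 4) = 4 private-memory VGPRs. */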
7289 unsigned alignment = LLVMGetAlignment(inst);
7290 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
7291 ctx->shader->config.private_mem_vgprs += dw_size;
7292 }
7293 bb = LLVMGetNextBasicBlock(bb);
7294 }
7295 }
7296
7297 static void si_init_exec_full_mask(struct si_shader_context *ctx)
7298 {
7299 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
7300 lp_build_intrinsic(ctx->gallivm.builder,
7301 "llvm.amdgcn.init.exec", ctx->voidt,
7302 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
7303 }
7304
7305 static void si_init_exec_from_input(struct si_shader_context *ctx,
7306 unsigned param, unsigned bitoffset)
7307 {
7308 LLVMValueRef args[] = {
7309 LLVMGetParam(ctx->main_fn, param),
7310 LLVMConstInt(ctx->i32, bitoffset, 0),
7311 };
7312 lp_build_intrinsic(ctx->gallivm.builder,
7313 "llvm.amdgcn.init.exec.from.input",
7314 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
7315 }
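
/* The merged wave info SGPR is assumed to pack the thread counts of
 * both shader halves (see the 0x7f masking in
 * si_build_wrapper_function):
 *
 *   bits [6:0]  - thread count of the first half (LS or ES)
 *   bits [14:8] - thread count of the second half (HS or GS)
 *
 * so a bitoffset of 0 enables the first half's threads and a
 * bitoffset of 8 enables the second half's.
 */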
7316
7317 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
7318 bool is_monolithic)
7319 {
7320 struct si_shader *shader = ctx->shader;
7321 struct si_shader_selector *sel = shader->selector;
7322 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7323
7324 switch (ctx->type) {
7325 case PIPE_SHADER_VERTEX:
7326 ctx->load_input = declare_input_vs;
7327 if (shader->key.as_ls)
7328 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
7329 else if (shader->key.as_es)
7330 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7331 else
7332 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7333 break;
7334 case PIPE_SHADER_TESS_CTRL:
7335 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
7336 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
7337 bld_base->emit_store = store_output_tcs;
7338 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
7339 break;
7340 case PIPE_SHADER_TESS_EVAL:
7341 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
7342 if (shader->key.as_es)
7343 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7344 else
7345 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7346 break;
7347 case PIPE_SHADER_GEOMETRY:
7348 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
7349 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
7350 break;
7351 case PIPE_SHADER_FRAGMENT:
7352 ctx->load_input = declare_input_fs;
7353 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
7354 break;
7355 case PIPE_SHADER_COMPUTE:
7356 ctx->declare_memory_region = declare_compute_memory;
7357 break;
7358 default:
7359 assert(!"Unsupported shader type");
7360 return false;
7361 }
7362
7363 create_function(ctx);
7364 preload_ring_buffers(ctx);
7365
7366 /* For GFX9 merged shaders:
7367 * - Set EXEC. If the prolog is present, set EXEC there instead.
7368 * - Add a barrier before the second shader.
7369 *
7370 * The same thing for monolithic shaders is done in
7371 * si_build_wrapper_function.
7372 */
7373 if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
7374 if (sel->info.num_instructions > 1 && /* not empty shader */
7375 (shader->key.as_es || shader->key.as_ls) &&
7376 (ctx->type == PIPE_SHADER_TESS_EVAL ||
7377 (ctx->type == PIPE_SHADER_VERTEX &&
7378 !sel->vs_needs_prolog))) {
7379 si_init_exec_from_input(ctx,
7380 ctx->param_merged_wave_info, 0);
7381 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
7382 ctx->type == PIPE_SHADER_GEOMETRY) {
7383 si_init_exec_from_input(ctx,
7384 ctx->param_merged_wave_info, 8);
7385 si_llvm_emit_barrier(NULL, bld_base, NULL);
7386 }
7387 }
7388
7389 if (ctx->type == PIPE_SHADER_GEOMETRY) {
7390 int i;
7391 for (i = 0; i < 4; i++) {
7392 ctx->gs_next_vertex[i] =
7393 lp_build_alloca(&ctx->gallivm,
7394 ctx->i32, "");
7395 }
7396 }
7397
7398 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
7399 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
7400 return false;
7401 }
7402
7403 si_llvm_build_ret(ctx, ctx->return_value);
7404 return true;
7405 }
7406
7407 /**
7408 * Compute the VS prolog key, which contains all the information needed to
7409 * build the VS prolog function, and set shader->info bits where needed.
7410 *
7411 * \param info Shader info of the vertex shader.
7412 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
7413 * \param prolog_key Key of the VS prolog
7414 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
7415 * \param key Output shader part key.
7416 */
7417 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7418 unsigned num_input_sgprs,
7419 const struct si_vs_prolog_bits *prolog_key,
7420 struct si_shader *shader_out,
7421 union si_shader_part_key *key)
7422 {
7423 memset(key, 0, sizeof(*key));
7424 key->vs_prolog.states = *prolog_key;
7425 key->vs_prolog.num_input_sgprs = num_input_sgprs;
7426 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7427 key->vs_prolog.as_ls = shader_out->key.as_ls;
7428
7429 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
7430 key->vs_prolog.as_ls = 1;
7431 key->vs_prolog.num_merged_next_stage_vgprs = 2;
7432 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
7433 key->vs_prolog.num_merged_next_stage_vgprs = 5;
7434 }
7435
7436 /* Set the instanceID flag. */
7437 for (unsigned i = 0; i < info->num_inputs; i++)
7438 if (key->vs_prolog.states.instance_divisors[i])
7439 shader_out->info.uses_instanceid = true;
7440 }
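
/* A sketch of what the prolog built from this key computes for each
 * vertex input; the names are illustrative only and the real IR is
 * emitted in si_build_vs_prolog_function:
 *
 *   index = divisor ? start_instance + instance_id / divisor
 *                   : vertex_id + base_vertex;
 */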
7441
7442 /**
7443 * Compute the PS prolog key, which contains all the information needed to
7444 * build the PS prolog function, and set related bits in shader->config.
7445 */
7446 static void si_get_ps_prolog_key(struct si_shader *shader,
7447 union si_shader_part_key *key,
7448 bool separate_prolog)
7449 {
7450 struct tgsi_shader_info *info = &shader->selector->info;
7451
7452 memset(key, 0, sizeof(*key));
7453 key->ps_prolog.states = shader->key.part.ps.prolog;
7454 key->ps_prolog.colors_read = info->colors_read;
7455 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7456 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7457 key->ps_prolog.wqm = info->uses_derivatives &&
7458 (key->ps_prolog.colors_read ||
7459 key->ps_prolog.states.force_persp_sample_interp ||
7460 key->ps_prolog.states.force_linear_sample_interp ||
7461 key->ps_prolog.states.force_persp_center_interp ||
7462 key->ps_prolog.states.force_linear_center_interp ||
7463 key->ps_prolog.states.bc_optimize_for_persp ||
7464 key->ps_prolog.states.bc_optimize_for_linear);
7465
7466 if (info->colors_read) {
7467 unsigned *color = shader->selector->color_attr_index;
7468
7469 if (shader->key.part.ps.prolog.color_two_side) {
7470 /* BCOLORs are stored after the last input. */
7471 key->ps_prolog.num_interp_inputs = info->num_inputs;
7472 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7473 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7474 }
7475
7476 for (unsigned i = 0; i < 2; i++) {
7477 unsigned interp = info->input_interpolate[color[i]];
7478 unsigned location = info->input_interpolate_loc[color[i]];
7479
7480 if (!(info->colors_read & (0xf << i*4)))
7481 continue;
7482
7483 key->ps_prolog.color_attr_index[i] = color[i];
7484
7485 if (shader->key.part.ps.prolog.flatshade_colors &&
7486 interp == TGSI_INTERPOLATE_COLOR)
7487 interp = TGSI_INTERPOLATE_CONSTANT;
7488
7489 switch (interp) {
7490 case TGSI_INTERPOLATE_CONSTANT:
7491 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7492 break;
7493 case TGSI_INTERPOLATE_PERSPECTIVE:
7494 case TGSI_INTERPOLATE_COLOR:
7495 /* Force the interpolation location for colors here. */
7496 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7497 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7498 if (shader->key.part.ps.prolog.force_persp_center_interp)
7499 location = TGSI_INTERPOLATE_LOC_CENTER;
7500
7501 switch (location) {
7502 case TGSI_INTERPOLATE_LOC_SAMPLE:
7503 key->ps_prolog.color_interp_vgpr_index[i] = 0;
7504 shader->config.spi_ps_input_ena |=
7505 S_0286CC_PERSP_SAMPLE_ENA(1);
7506 break;
7507 case TGSI_INTERPOLATE_LOC_CENTER:
7508 key->ps_prolog.color_interp_vgpr_index[i] = 2;
7509 shader->config.spi_ps_input_ena |=
7510 S_0286CC_PERSP_CENTER_ENA(1);
7511 break;
7512 case TGSI_INTERPOLATE_LOC_CENTROID:
7513 key->ps_prolog.color_interp_vgpr_index[i] = 4;
7514 shader->config.spi_ps_input_ena |=
7515 S_0286CC_PERSP_CENTROID_ENA(1);
7516 break;
7517 default:
7518 assert(0);
7519 }
7520 break;
7521 case TGSI_INTERPOLATE_LINEAR:
7522 /* Force the interpolation location for colors here. */
7523 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7524 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7525 if (shader->key.part.ps.prolog.force_linear_center_interp)
7526 location = TGSI_INTERPOLATE_LOC_CENTER;
7527
7528 /* The VGPR assignment for non-monolithic shaders
7529 * works because InitialPSInputAddr is set on the
7530 * main shader and PERSP_PULL_MODEL is never used.
7531 */
7532 switch (location) {
7533 case TGSI_INTERPOLATE_LOC_SAMPLE:
7534 key->ps_prolog.color_interp_vgpr_index[i] =
7535 separate_prolog ? 6 : 9;
7536 shader->config.spi_ps_input_ena |=
7537 S_0286CC_LINEAR_SAMPLE_ENA(1);
7538 break;
7539 case TGSI_INTERPOLATE_LOC_CENTER:
7540 key->ps_prolog.color_interp_vgpr_index[i] =
7541 separate_prolog ? 8 : 11;
7542 shader->config.spi_ps_input_ena |=
7543 S_0286CC_LINEAR_CENTER_ENA(1);
7544 break;
7545 case TGSI_INTERPOLATE_LOC_CENTROID:
7546 key->ps_prolog.color_interp_vgpr_index[i] =
7547 separate_prolog ? 10 : 13;
7548 shader->config.spi_ps_input_ena |=
7549 S_0286CC_LINEAR_CENTROID_ENA(1);
7550 break;
7551 default:
7552 assert(0);
7553 }
7554 break;
7555 default:
7556 assert(0);
7557 }
7558 }
7559 }
7560 }
7561
7562 /**
7563 * Check whether a PS prolog is required based on the key.
7564 */
7565 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7566 {
7567 return key->ps_prolog.colors_read ||
7568 key->ps_prolog.states.force_persp_sample_interp ||
7569 key->ps_prolog.states.force_linear_sample_interp ||
7570 key->ps_prolog.states.force_persp_center_interp ||
7571 key->ps_prolog.states.force_linear_center_interp ||
7572 key->ps_prolog.states.bc_optimize_for_persp ||
7573 key->ps_prolog.states.bc_optimize_for_linear ||
7574 key->ps_prolog.states.poly_stipple;
7575 }
7576
7577 /**
7578 * Compute the PS epilog key, which contains all the information needed to
7579 * build the PS epilog function.
7580 */
7581 static void si_get_ps_epilog_key(struct si_shader *shader,
7582 union si_shader_part_key *key)
7583 {
7584 struct tgsi_shader_info *info = &shader->selector->info;
7585 memset(key, 0, sizeof(*key));
7586 key->ps_epilog.colors_written = info->colors_written;
7587 key->ps_epilog.writes_z = info->writes_z;
7588 key->ps_epilog.writes_stencil = info->writes_stencil;
7589 key->ps_epilog.writes_samplemask = info->writes_samplemask;
7590 key->ps_epilog.states = shader->key.part.ps.epilog;
7591 }
7592
7593 /**
7594 * Build the GS prolog function. Rotate the input vertices for triangle strips
7595 * with adjacency.
7596 */
7597 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7598 union si_shader_part_key *key)
7599 {
7600 unsigned num_sgprs, num_vgprs;
7601 struct gallivm_state *gallivm = &ctx->gallivm;
7602 LLVMBuilderRef builder = gallivm->builder;
7603 LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
7604 LLVMTypeRef returns[48];
7605 LLVMValueRef func, ret;
7606
7607 if (ctx->screen->b.chip_class >= GFX9) {
7608 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
7609 num_vgprs = 5; /* ES inputs are not needed by GS */
7610 } else {
7611 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
7612 num_vgprs = 8;
7613 }
7614
7615 for (unsigned i = 0; i < num_sgprs; ++i) {
7616 params[i] = ctx->i32;
7617 returns[i] = ctx->i32;
7618 }
7619
7620 for (unsigned i = 0; i < num_vgprs; ++i) {
7621 params[num_sgprs + i] = ctx->i32;
7622 returns[num_sgprs + i] = ctx->f32;
7623 }
7624
7625 /* Create the function. */
7626 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7627 params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
7628 func = ctx->main_fn;
7629
7630 /* Set the full EXEC mask for the prolog, because we are only fiddling
7631 * with registers here. The main shader part will set the correct EXEC
7632 * mask.
7633 */
7634 if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
7635 si_init_exec_full_mask(ctx);
7636
7637 /* Copy inputs to outputs. This should be a no-op, as the registers match,
7638 * but it will prevent the compiler from overwriting them unintentionally.
7639 */
7640 ret = ctx->return_value;
7641 for (unsigned i = 0; i < num_sgprs; i++) {
7642 LLVMValueRef p = LLVMGetParam(func, i);
7643 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7644 }
7645 for (unsigned i = 0; i < num_vgprs; i++) {
7646 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7647 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7648 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7649 }
7650
7651 if (key->gs_prolog.states.tri_strip_adj_fix) {
7652 /* Remap the input vertices for every other primitive. */
7653 const unsigned gfx6_vtx_params[6] = {
7654 num_sgprs,
7655 num_sgprs + 1,
7656 num_sgprs + 3,
7657 num_sgprs + 4,
7658 num_sgprs + 5,
7659 num_sgprs + 6
7660 };
7661 const unsigned gfx9_vtx_params[3] = {
7662 num_sgprs,
7663 num_sgprs + 1,
7664 num_sgprs + 4,
7665 };
7666 LLVMValueRef vtx_in[6], vtx_out[6];
7667 LLVMValueRef prim_id, rotate;
7668
7669 if (ctx->screen->b.chip_class >= GFX9) {
7670 for (unsigned i = 0; i < 3; i++) {
7671 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
7672 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
7673 }
7674 } else {
7675 for (unsigned i = 0; i < 6; i++)
7676 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
7677 }
7678
7679 prim_id = LLVMGetParam(func, num_sgprs + 2);
7680 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7681
7682 for (unsigned i = 0; i < 6; ++i) {
7683 LLVMValueRef base, rotated;
7684 base = vtx_in[i];
7685 rotated = vtx_in[(i + 4) % 6];
7686 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
7687 }
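/* For odd primitives this rotates (v0,v1,v2,v3,v4,v5) to
 * (v4,v5,v0,v1,v2,v3); even primitives keep the original
 * vertex order. */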
7688
7689 if (ctx->screen->b.chip_class >= GFX9) {
7690 for (unsigned i = 0; i < 3; i++) {
7691 LLVMValueRef hi, out;
7692
7693 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
7694 LLVMConstInt(ctx->i32, 16, 0), "");
7695 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
7696 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
7697 ret = LLVMBuildInsertValue(builder, ret, out,
7698 gfx9_vtx_params[i], "");
7699 }
7700 } else {
7701 for (unsigned i = 0; i < 6; i++) {
7702 LLVMValueRef out;
7703
7704 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
7705 ret = LLVMBuildInsertValue(builder, ret, out,
7706 gfx6_vtx_params[i], "");
7707 }
7708 }
7709 }
7710
7711 LLVMBuildRet(builder, ret);
7712 }
7713
7714 /**
7715 * Given a list of shader part functions, build a wrapper function that
7716 * runs them in sequence to form a monolithic shader.
7717 */
7718 static void si_build_wrapper_function(struct si_shader_context *ctx,
7719 LLVMValueRef *parts,
7720 unsigned num_parts,
7721 unsigned main_part,
7722 unsigned next_shader_first_part)
7723 {
7724 struct gallivm_state *gallivm = &ctx->gallivm;
7725 LLVMBuilderRef builder = ctx->gallivm.builder;
7726 /* PS epilog has one arg per color component */
7727 LLVMTypeRef param_types[48];
7728 LLVMValueRef initial[48], out[48];
7729 LLVMTypeRef function_type;
7730 unsigned num_params;
7731 unsigned num_out, initial_num_out;
7732 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7733 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
7734 unsigned num_sgprs, num_vgprs;
7735 unsigned last_sgpr_param;
7736 unsigned gprs;
7737 struct lp_build_if_state if_state;
7738
7739 for (unsigned i = 0; i < num_parts; ++i) {
7740 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7741 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7742 }
7743
7744 /* The parameters of the wrapper function correspond to those of the
7745 * first part in terms of SGPRs and VGPRs, but we use the types of the
7746 * main part to get the right types. This is relevant for the
7747 * dereferenceable attribute on descriptor table pointers.
7748 */
7749 num_sgprs = 0;
7750 num_vgprs = 0;
7751
7752 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7753 num_params = LLVMCountParamTypes(function_type);
7754
7755 for (unsigned i = 0; i < num_params; ++i) {
7756 LLVMValueRef param = LLVMGetParam(parts[0], i);
7757
7758 if (ac_is_sgpr_param(param)) {
7759 assert(num_vgprs == 0);
7760 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7761 } else {
7762 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7763 }
7764 }
7765 assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7766
7767 num_params = 0;
7768 last_sgpr_param = 0;
7769 gprs = 0;
7770 while (gprs < num_sgprs + num_vgprs) {
7771 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7772 unsigned size;
7773
7774 param_types[num_params] = LLVMTypeOf(param);
7775 if (gprs < num_sgprs)
7776 last_sgpr_param = num_params;
7777 size = llvm_get_type_size(param_types[num_params]) / 4;
7778 num_params++;
7779
7780 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7781 assert(gprs + size <= num_sgprs + num_vgprs &&
7782 (gprs >= num_sgprs || gprs + size <= num_sgprs));
7783
7784 gprs += size;
7785 }
7786
7787 si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
7788 last_sgpr_param,
7789 si_get_max_workgroup_size(ctx->shader));
7790
7791 if (is_merged_shader(ctx->shader))
7792 si_init_exec_full_mask(ctx);
7793
7794 /* Record the arguments of the function as if they were an output of
7795 * a previous part.
7796 */
7797 num_out = 0;
7798 num_out_sgpr = 0;
7799
7800 for (unsigned i = 0; i < num_params; ++i) {
7801 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7802 LLVMTypeRef param_type = LLVMTypeOf(param);
7803 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7804 unsigned size = llvm_get_type_size(param_type) / 4;
7805
7806 if (size == 1) {
7807 if (param_type != out_type)
7808 param = LLVMBuildBitCast(builder, param, out_type, "");
7809 out[num_out++] = param;
7810 } else {
7811 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7812
7813 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7814 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7815 param_type = ctx->i64;
7816 }
7817
7818 if (param_type != vector_type)
7819 param = LLVMBuildBitCast(builder, param, vector_type, "");
7820
7821 for (unsigned j = 0; j < size; ++j)
7822 out[num_out++] = LLVMBuildExtractElement(
7823 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7824 }
7825
7826 if (i <= last_sgpr_param)
7827 num_out_sgpr = num_out;
7828 }
7829
7830 memcpy(initial, out, sizeof(out));
7831 initial_num_out = num_out;
7832 initial_num_out_sgpr = num_out_sgpr;
7833
7834 /* Now chain the parts. */
7835 for (unsigned part = 0; part < num_parts; ++part) {
7836 LLVMValueRef in[48];
7837 LLVMValueRef ret;
7838 LLVMTypeRef ret_type;
7839 unsigned out_idx = 0;
7840
7841 num_params = LLVMCountParams(parts[part]);
7842 assert(num_params <= ARRAY_SIZE(param_types));
7843
7844 /* Merged shaders are executed conditionally depending
7845 * on the number of enabled threads passed in the input SGPRs. */
7846 if (is_merged_shader(ctx->shader) &&
7847 (part == 0 || part == next_shader_first_part)) {
7848 LLVMValueRef ena, count = initial[3];
7849
7850 /* The thread count for the 2nd shader is at bit-offset 8. */
7851 if (part == next_shader_first_part) {
7852 count = LLVMBuildLShr(builder, count,
7853 LLVMConstInt(ctx->i32, 8, 0), "");
7854 }
7855 count = LLVMBuildAnd(builder, count,
7856 LLVMConstInt(ctx->i32, 0x7f, 0), "");
7857 ena = LLVMBuildICmp(builder, LLVMIntULT,
7858 ac_get_thread_id(&ctx->ac), count, "");
7859 lp_build_if(&if_state, &ctx->gallivm, ena);
7860 }
7861
7862 /* Derive arguments for the next part from outputs of the
7863 * previous one.
7864 */
7865 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7866 LLVMValueRef param;
7867 LLVMTypeRef param_type;
7868 bool is_sgpr;
7869 unsigned param_size;
7870 LLVMValueRef arg = NULL;
7871
7872 param = LLVMGetParam(parts[part], param_idx);
7873 param_type = LLVMTypeOf(param);
7874 param_size = llvm_get_type_size(param_type) / 4;
7875 is_sgpr = ac_is_sgpr_param(param);
7876
7877 if (is_sgpr) {
7878 #if HAVE_LLVM < 0x0400
7879 LLVMRemoveAttribute(param, LLVMByValAttribute);
7880 #else
7881 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7882 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7883 #endif
7884 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7885 }
7886
7887 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7888 assert(is_sgpr || out_idx >= num_out_sgpr);
7889
7890 if (param_size == 1)
7891 arg = out[out_idx];
7892 else
7893 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7894
7895 if (LLVMTypeOf(arg) != param_type) {
7896 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7897 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7898 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7899 } else {
7900 arg = LLVMBuildBitCast(builder, arg, param_type, "");
7901 }
7902 }
7903
7904 in[param_idx] = arg;
7905 out_idx += param_size;
7906 }
7907
7908 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7909
7910 if (is_merged_shader(ctx->shader) &&
7911 (part + 1 == next_shader_first_part ||
7912 part + 1 == num_parts)) {
7913 lp_build_endif(&if_state);
7914
7915 if (part + 1 == next_shader_first_part) {
7916 /* A barrier is required between 2 merged shaders. */
7917 si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
7918
7919 /* The second half of the merged shader should use
7920 * the inputs from the toplevel (wrapper) function,
7921 * not the return value from the last call.
7922 *
7923 * That's because the last call was executed
7924 * conditionally, so we can't consume it in the main
7925 * block.
7926 */
7927 memcpy(out, initial, sizeof(initial));
7928 num_out = initial_num_out;
7929 num_out_sgpr = initial_num_out_sgpr;
7930 }
7931 continue;
7932 }
7933
7934 /* Extract the returned GPRs. */
7935 ret_type = LLVMTypeOf(ret);
7936 num_out = 0;
7937 num_out_sgpr = 0;
7938
7939 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7940 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7941
7942 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7943
7944 for (unsigned i = 0; i < ret_size; ++i) {
7945 LLVMValueRef val =
7946 LLVMBuildExtractValue(builder, ret, i, "");
7947
7948 out[num_out++] = val;
7949
7950 if (LLVMTypeOf(val) == ctx->i32) {
7951 assert(num_out_sgpr + 1 == num_out);
7952 num_out_sgpr = num_out;
7953 }
7954 }
7955 }
7956 }
7957
7958 LLVMBuildRetVoid(builder);
7959 }
7960
7961 int si_compile_tgsi_shader(struct si_screen *sscreen,
7962 LLVMTargetMachineRef tm,
7963 struct si_shader *shader,
7964 bool is_monolithic,
7965 struct pipe_debug_callback *debug)
7966 {
7967 struct si_shader_selector *sel = shader->selector;
7968 struct si_shader_context ctx;
7969 int r = -1;
7970
7971 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7972 * conversion fails. */
7973 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7974 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7975 tgsi_dump(sel->tokens, 0);
7976 si_dump_streamout(&sel->so);
7977 }
7978
7979 si_init_shader_ctx(&ctx, sscreen, tm);
7980 si_llvm_context_set_tgsi(&ctx, shader);
7981 ctx.separate_prolog = !is_monolithic;
7982
7983 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7984 sizeof(shader->info.vs_output_param_offset));
7985
7986 shader->info.uses_instanceid = sel->info.uses_instanceid;
7987
7988 ctx.load_system_value = declare_system_value;
7989
7990 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
7991 si_llvm_dispose(&ctx);
7992 return -1;
7993 }
7994
7995 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7996 LLVMValueRef parts[2];
7997 bool need_prolog = sel->vs_needs_prolog;
7998
7999 parts[1] = ctx.main_fn;
8000
8001 if (need_prolog) {
8002 union si_shader_part_key prolog_key;
8003 si_get_vs_prolog_key(&sel->info,
8004 shader->info.num_input_sgprs,
8005 &shader->key.part.vs.prolog,
8006 shader, &prolog_key);
8007 si_build_vs_prolog_function(&ctx, &prolog_key);
8008 parts[0] = ctx.main_fn;
8009 }
8010
8011 si_build_wrapper_function(&ctx, parts + !need_prolog,
8012 1 + need_prolog, need_prolog, 0);
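/* With a prolog the parts array is [prolog, main] and the main
 * part index is 1; without one it is just [main] at index 0. */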
8013 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
8014 if (sscreen->b.chip_class >= GFX9) {
8015 struct si_shader_selector *ls = shader->key.part.tcs.ls;
8016 LLVMValueRef parts[4];
8017
8018 /* TCS main part */
8019 parts[2] = ctx.main_fn;
8020
8021 /* TCS epilog */
8022 union si_shader_part_key tcs_epilog_key;
8023 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
8024 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8025 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
8026 parts[3] = ctx.main_fn;
8027
8028 /* VS prolog */
8029 if (ls->vs_needs_prolog) {
8030 union si_shader_part_key vs_prolog_key;
8031 si_get_vs_prolog_key(&ls->info,
8032 shader->info.num_input_sgprs,
8033 &shader->key.part.tcs.ls_prolog,
8034 shader, &vs_prolog_key);
8035 vs_prolog_key.vs_prolog.is_monolithic = true;
8036 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8037 parts[0] = ctx.main_fn;
8038 }
8039
8040 /* VS as LS main part */
8041 struct si_shader shader_ls = {};
8042 shader_ls.selector = ls;
8043 shader_ls.key.as_ls = 1;
8044 shader_ls.key.mono = shader->key.mono;
8045 shader_ls.key.opt = shader->key.opt;
8046 si_llvm_context_set_tgsi(&ctx, &shader_ls);
8047
8048 if (!si_compile_tgsi_main(&ctx, true)) {
8049 si_llvm_dispose(&ctx);
8050 return -1;
8051 }
8052 shader->info.uses_instanceid |= ls->info.uses_instanceid;
8053 parts[1] = ctx.main_fn;
8054
8055 /* Reset the shader context. */
8056 ctx.shader = shader;
8057 ctx.type = PIPE_SHADER_TESS_CTRL;
8058
8059 si_build_wrapper_function(&ctx,
8060 parts + !ls->vs_needs_prolog,
8061 4 - !ls->vs_needs_prolog, 0,
8062 ls->vs_needs_prolog ? 2 : 1);
8063 } else {
8064 LLVMValueRef parts[2];
8065 union si_shader_part_key epilog_key;
8066
8067 parts[0] = ctx.main_fn;
8068
8069 memset(&epilog_key, 0, sizeof(epilog_key));
8070 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8071 si_build_tcs_epilog_function(&ctx, &epilog_key);
8072 parts[1] = ctx.main_fn;
8073
8074 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
8075 }
8076 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
8077 if (ctx.screen->b.chip_class >= GFX9) {
8078 struct si_shader_selector *es = shader->key.part.gs.es;
8079 LLVMValueRef es_prolog = NULL;
8080 LLVMValueRef es_main = NULL;
8081 LLVMValueRef gs_prolog = NULL;
8082 LLVMValueRef gs_main = ctx.main_fn;
8083
8084 /* GS prolog */
8085 union si_shader_part_key gs_prolog_key;
8086 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
8087 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8088 gs_prolog_key.gs_prolog.is_monolithic = true;
8089 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
8090 gs_prolog = ctx.main_fn;
8091
8092 /* ES prolog */
8093 if (es->vs_needs_prolog) {
8094 union si_shader_part_key vs_prolog_key;
8095 si_get_vs_prolog_key(&es->info,
8096 shader->info.num_input_sgprs,
8097 &shader->key.part.tcs.ls_prolog,
8098 shader, &vs_prolog_key);
8099 vs_prolog_key.vs_prolog.is_monolithic = true;
8100 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8101 es_prolog = ctx.main_fn;
8102 }
8103
8104 /* ES main part */
8105 struct si_shader shader_es = {};
8106 shader_es.selector = es;
8107 shader_es.key.as_es = 1;
8108 shader_es.key.mono = shader->key.mono;
8109 shader_es.key.opt = shader->key.opt;
8110 si_llvm_context_set_tgsi(&ctx, &shader_es);
8111
8112 if (!si_compile_tgsi_main(&ctx, true)) {
8113 si_llvm_dispose(&ctx);
8114 return -1;
8115 }
8116 shader->info.uses_instanceid |= es->info.uses_instanceid;
8117 es_main = ctx.main_fn;
8118
8119 /* Reset the shader context. */
8120 ctx.shader = shader;
8121 ctx.type = PIPE_SHADER_GEOMETRY;
8122
8123 /* Prepare the array of shader parts. */
8124 LLVMValueRef parts[4];
8125 unsigned num_parts = 0, main_part, next_first_part;
8126
8127 if (es_prolog)
8128 parts[num_parts++] = es_prolog;
8129
8130 parts[main_part = num_parts++] = es_main;
8131 parts[next_first_part = num_parts++] = gs_prolog;
8132 parts[num_parts++] = gs_main;
8133
8134 si_build_wrapper_function(&ctx, parts, num_parts,
8135 main_part, next_first_part);
8136 } else {
8137 LLVMValueRef parts[2];
8138 union si_shader_part_key prolog_key;
8139
8140 parts[1] = ctx.main_fn;
8141
8142 memset(&prolog_key, 0, sizeof(prolog_key));
8143 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8144 si_build_gs_prolog_function(&ctx, &prolog_key);
8145 parts[0] = ctx.main_fn;
8146
8147 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
8148 }
8149 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
8150 LLVMValueRef parts[3];
8151 union si_shader_part_key prolog_key;
8152 union si_shader_part_key epilog_key;
8153 bool need_prolog;
8154
8155 si_get_ps_prolog_key(shader, &prolog_key, false);
8156 need_prolog = si_need_ps_prolog(&prolog_key);
8157
8158 parts[need_prolog ? 1 : 0] = ctx.main_fn;
8159
8160 if (need_prolog) {
8161 si_build_ps_prolog_function(&ctx, &prolog_key);
8162 parts[0] = ctx.main_fn;
8163 }
8164
8165 si_get_ps_epilog_key(shader, &epilog_key);
8166 si_build_ps_epilog_function(&ctx, &epilog_key);
8167 parts[need_prolog ? 2 : 1] = ctx.main_fn;
8168
8169 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
8170 need_prolog ? 1 : 0, 0);
8171 }
8172
8173 si_llvm_optimize_module(&ctx);
8174
8175 /* Post-optimization transformations and analysis. */
8176 si_optimize_vs_outputs(&ctx);
8177
8178 if ((debug && debug->debug_message) ||
8179 r600_can_dump_shader(&sscreen->b, ctx.type))
8180 si_count_scratch_private_memory(&ctx);
8181
8182 /* Compile to bytecode. */
8183 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
8184 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
8185 si_llvm_dispose(&ctx);
8186 if (r) {
8187 fprintf(stderr, "LLVM failed to compile shader\n");
8188 return r;
8189 }
8190
8191 /* Validate SGPR and VGPR usage for compute to detect compiler bugs
8192 * that over-allocate registers; LLVM 3.9svn is known to do this.
8193 */
8194 if (sel->type == PIPE_SHADER_COMPUTE) {
8195 unsigned wave_size = 64;
8196 unsigned max_vgprs = 256;
8197 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
8198 unsigned max_sgprs_per_wave = 128;
8199 unsigned max_block_threads = si_get_max_workgroup_size(shader);
8200 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
8201 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
8202
8203 max_vgprs = max_vgprs / min_waves_per_simd;
8204 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
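/* Worked example with assumed values: a 1024-thread block needs
 * 1024 / 64 = 16 waves, i.e. 4 per SIMD, so at most 256 / 4 = 64
 * VGPRs and MIN2(800 / 4, 128) = 128 SGPRs may be used on VI. */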
8205
8206 if (shader->config.num_sgprs > max_sgprs ||
8207 shader->config.num_vgprs > max_vgprs) {
8208 fprintf(stderr, "LLVM failed to compile a shader correctly: "
8209 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
8210 shader->config.num_sgprs, shader->config.num_vgprs,
8211 max_sgprs, max_vgprs);
8212
8213 /* Just terminate the process, because dependent
8214 * shaders can hang due to bad input data, but use
8215 * the env var to allow shader-db to work.
8216 */
8217 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
8218 abort();
8219 }
8220 }
8221
8222 /* Add the scratch offset to input SGPRs. */
8223 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
8224 shader->info.num_input_sgprs += 1; /* scratch byte offset */
8225
8226 /* Calculate the number of fragment input VGPRs. */
8227 if (ctx.type == PIPE_SHADER_FRAGMENT) {
8228 shader->info.num_input_vgprs = 0;
8229 shader->info.face_vgpr_index = -1;
8230
8231 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8232 shader->info.num_input_vgprs += 2;
8233 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
8234 shader->info.num_input_vgprs += 2;
8235 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
8236 shader->info.num_input_vgprs += 2;
8237 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
8238 shader->info.num_input_vgprs += 3;
8239 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8240 shader->info.num_input_vgprs += 2;
8241 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
8242 shader->info.num_input_vgprs += 2;
8243 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
8244 shader->info.num_input_vgprs += 2;
8245 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
8246 shader->info.num_input_vgprs += 1;
8247 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
8248 shader->info.num_input_vgprs += 1;
8249 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
8250 shader->info.num_input_vgprs += 1;
8251 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
8252 shader->info.num_input_vgprs += 1;
8253 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
8254 shader->info.num_input_vgprs += 1;
8255 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
8256 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
8257 shader->info.num_input_vgprs += 1;
8258 }
8259 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
8260 shader->info.num_input_vgprs += 1;
8261 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
8262 shader->info.num_input_vgprs += 1;
8263 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
8264 shader->info.num_input_vgprs += 1;
8265 }
8266
8267 return 0;
8268 }
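
/* A compact, table-driven restatement of the fragment-input VGPR counting
 * above; a sketch only, assuming the usual SI bit order of the
 * SPI_PS_INPUT_ADDR fields that the G_0286CC_* accessors expose.
 */
static unsigned example_count_ps_input_vgprs(unsigned input_addr)
{
	static const unsigned char vgprs_per_bit[16] = {
		2, 2, 2, 3,	/* PERSP_SAMPLE, PERSP_CENTER, PERSP_CENTROID, PERSP_PULL_MODEL */
		2, 2, 2, 1,	/* LINEAR_SAMPLE, LINEAR_CENTER, LINEAR_CENTROID, LINE_STIPPLE_TEX */
		1, 1, 1, 1,	/* POS_X/Y/Z/W_FLOAT */
		1, 1, 1, 1,	/* FRONT_FACE, ANCILLARY, SAMPLE_COVERAGE, POS_FIXED_PT */
	};
	unsigned i, num = 0;

	for (i = 0; i < 16; i++) {
		if (input_addr & (1u << i))
			num += vgprs_per_bit[i];
	}
	return num;
}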
8269
8270 /**
8271 * Create, compile and return a shader part (prolog or epilog).
8272 *
8273 * \param sscreen screen
8274 * \param list list of shader parts of the same category
8275 * \param type shader type
8276 * \param prolog whether the part being requested is a prolog
8277 * \param key shader part key
8278 * \param tm LLVM target machine
8279 * \param debug debug callback
8280 * \param build the callback responsible for building the main function
8281 * \return non-NULL on success
8282 */
8283 static struct si_shader_part *
8284 si_get_shader_part(struct si_screen *sscreen,
8285 struct si_shader_part **list,
8286 enum pipe_shader_type type,
8287 bool prolog,
8288 union si_shader_part_key *key,
8289 LLVMTargetMachineRef tm,
8290 struct pipe_debug_callback *debug,
8291 void (*build)(struct si_shader_context *,
8292 union si_shader_part_key *),
8293 const char *name)
8294 {
8295 struct si_shader_part *result;
8296
8297 mtx_lock(&sscreen->shader_parts_mutex);
8298
8299 /* Find existing. */
8300 for (result = *list; result; result = result->next) {
8301 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
8302 mtx_unlock(&sscreen->shader_parts_mutex);
8303 return result;
8304 }
8305 }
8306
8307 /* Compile a new one. */
8308 result = CALLOC_STRUCT(si_shader_part);
	if (!result) {
		mtx_unlock(&sscreen->shader_parts_mutex);
		return NULL;
	}
8309 result->key = *key;
8310
8311 struct si_shader shader = {};
8312 struct si_shader_context ctx;
8313 struct gallivm_state *gallivm = &ctx.gallivm;
8314
8315 si_init_shader_ctx(&ctx, sscreen, tm);
8316 ctx.shader = &shader;
8317 ctx.type = type;
8318
8319 switch (type) {
8320 case PIPE_SHADER_VERTEX:
8321 break;
8322 case PIPE_SHADER_TESS_CTRL:
8323 assert(!prolog);
8324 shader.key.part.tcs.epilog = key->tcs_epilog.states;
8325 break;
8326 case PIPE_SHADER_GEOMETRY:
8327 assert(prolog);
8328 break;
8329 case PIPE_SHADER_FRAGMENT:
8330 if (prolog)
8331 shader.key.part.ps.prolog = key->ps_prolog.states;
8332 else
8333 shader.key.part.ps.epilog = key->ps_epilog.states;
8334 break;
8335 default:
8336 unreachable("bad shader part");
8337 }
8338
8339 build(&ctx, key);
8340
8341 /* Compile. */
8342 si_llvm_optimize_module(&ctx);
8343
8344 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
8345 gallivm->module, debug, ctx.type, name)) {
8346 FREE(result);
8347 result = NULL;
8348 goto out;
8349 }
8350
8351 result->next = *list;
8352 *list = result;
8353
8354 out:
8355 si_llvm_dispose(&ctx);
8356 mtx_unlock(&sscreen->shader_parts_mutex);
8357 return result;
8358 }
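
/* The locking and memoization above, reduced to a minimal sketch with
 * hypothetical types (the real code keys on the full union
 * si_shader_part_key and serializes on sscreen->shader_parts_mutex).
 * Requires <stdlib.h> for calloc().
 */
struct example_part {
	struct example_part *next;
	int key;
};

static struct example_part *example_find_or_create(struct example_part **list,
						   int key)
{
	struct example_part *p;

	/* Find existing. */
	for (p = *list; p; p = p->next) {
		if (p->key == key)
			return p;
	}

	/* Create and prepend, so later lookups hit it first. */
	p = calloc(1, sizeof(*p));
	if (!p)
		return NULL;
	p->key = key;
	p->next = *list;
	*list = p;
	return p;
}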
8359
8360 /**
8361 * Build the vertex shader prolog function.
8362 *
8363 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
8364 * All inputs are returned unmodified. The vertex load indices are
8365 * stored after them; the API VS uses them to fetch its inputs.
8366 *
8367 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
8368 * input_v0,
8369 * input_v1,
8370 * input_v2,
8371 * input_v3,
8372 * (VertexID + BaseVertex),
8373 * (InstanceID + StartInstance),
8374 * (InstanceID / 2 + StartInstance)
8375 */
8376 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
8377 union si_shader_part_key *key)
8378 {
8379 struct gallivm_state *gallivm = &ctx->gallivm;
8380 LLVMTypeRef *params, *returns;
8381 LLVMValueRef ret, func;
8382 int last_sgpr, num_params, num_returns, i;
8383 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
8384 key->vs_prolog.num_merged_next_stage_vgprs;
8385 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
8386 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
8387 num_input_vgprs;
8388 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
8389
8390 ctx->param_vertex_id = first_vs_vgpr;
8391 ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
8392
8393 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
8394 params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
8395 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
8396 sizeof(LLVMTypeRef));
8397 num_params = 0;
8398 num_returns = 0;
8399 
8400 /* Declare input and output SGPRs. */
8402 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8403 params[num_params++] = ctx->i32;
8404 returns[num_returns++] = ctx->i32;
8405 }
8406 last_sgpr = num_params - 1;
8407
8408 /* Preloaded VGPRs (outputs must be floats) */
8409 for (i = 0; i < num_input_vgprs; i++) {
8410 params[num_params++] = ctx->i32;
8411 returns[num_returns++] = ctx->f32;
8412 }
8413
8414 /* Vertex load indices. */
8415 for (i = 0; i <= key->vs_prolog.last_input; i++)
8416 returns[num_returns++] = ctx->f32;
8417
8418 /* Create the function. */
8419 si_create_function(ctx, "vs_prolog", returns, num_returns, params,
8420 num_params, last_sgpr, 0);
8421 func = ctx->main_fn;
8422
8423 if (key->vs_prolog.num_merged_next_stage_vgprs &&
8424 !key->vs_prolog.is_monolithic)
8425 si_init_exec_from_input(ctx, 3, 0);
8426
8427 /* Copy inputs to outputs. This should be a no-op, as the registers match,
8428 * but it will prevent the compiler from overwriting them unintentionally.
8429 */
8430 ret = ctx->return_value;
8431 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8432 LLVMValueRef p = LLVMGetParam(func, i);
8433 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8434 }
8435 for (; i < num_params; i++) {
8436 LLVMValueRef p = LLVMGetParam(func, i);
8437 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
8438 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8439 }
8440
8441 /* Compute vertex load indices from instance divisors. */
8442 for (i = 0; i <= key->vs_prolog.last_input; i++) {
8443 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
8444 LLVMValueRef index;
8445
8446 if (divisor) {
8447 /* InstanceID / Divisor + StartInstance */
8448 index = get_instance_index_for_fetch(ctx,
8449 user_sgpr_base +
8450 SI_SGPR_START_INSTANCE,
8451 divisor);
8452 } else {
8453 /* VertexID + BaseVertex */
8454 index = LLVMBuildAdd(gallivm->builder,
8455 LLVMGetParam(func, ctx->param_vertex_id),
8456 LLVMGetParam(func, user_sgpr_base +
8457 SI_SGPR_BASE_VERTEX), "");
8458 }
8459
8460 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
8461 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
8462 num_params++, "");
8463 }
8464
8465 si_llvm_build_ret(ctx, ret);
8466 }
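
/* What each vertex lane computes above, as plain scalar arithmetic; a
 * minimal sketch with a hypothetical helper name.
 */
static unsigned example_vertex_load_index(unsigned vertex_id,
					  unsigned base_vertex,
					  unsigned instance_id,
					  unsigned start_instance,
					  unsigned divisor)
{
	/* divisor == 0 selects per-vertex addressing. */
	if (divisor)
		return instance_id / divisor + start_instance;
	return vertex_id + base_vertex;
}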
8467
8468 static bool si_get_vs_prolog(struct si_screen *sscreen,
8469 LLVMTargetMachineRef tm,
8470 struct si_shader *shader,
8471 struct pipe_debug_callback *debug,
8472 struct si_shader *main_part,
8473 const struct si_vs_prolog_bits *key)
8474 {
8475 struct si_shader_selector *vs = main_part->selector;
8476
8477 /* The prolog is a no-op if there are no inputs. */
8478 if (!vs->vs_needs_prolog)
8479 return true;
8480
8481 /* Get the prolog. */
8482 union si_shader_part_key prolog_key;
8483 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
8484 key, shader, &prolog_key);
8485
8486 shader->prolog =
8487 si_get_shader_part(sscreen, &sscreen->vs_prologs,
8488 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
8489 debug, si_build_vs_prolog_function,
8490 "Vertex Shader Prolog");
8491 return shader->prolog != NULL;
8492 }
8493
8494 /**
8495 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
8496 */
8497 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
8498 LLVMTargetMachineRef tm,
8499 struct si_shader *shader,
8500 struct pipe_debug_callback *debug)
8501 {
8502 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
8503 &shader->key.part.vs.prolog);
8504 }
8505
8506 /**
8507 * Build the TCS epilog function. This writes tessellation factors to memory
8508 * based on the output primitive type of the tessellator (determined by TES).
8509 */
8510 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
8511 union si_shader_part_key *key)
8512 {
8513 struct gallivm_state *gallivm = &ctx->gallivm;
8514 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8515 LLVMTypeRef params[32];
8516 LLVMValueRef func;
8517 int last_sgpr, num_params = 0;
8518
8519 if (ctx->screen->b.chip_class >= GFX9) {
8520 params[num_params++] = ctx->i64;
8521 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8522 params[num_params++] = ctx->i32; /* wave info */
8523 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8524 params[num_params++] = ctx->i32;
8525 params[num_params++] = ctx->i32;
8526 params[num_params++] = ctx->i32;
8527 params[num_params++] = ctx->i64;
8528 params[num_params++] = ctx->i64;
8529 params[num_params++] = ctx->i64;
8530 params[num_params++] = ctx->i64;
8531 params[num_params++] = ctx->i64;
8532 params[num_params++] = ctx->i64;
8533 params[num_params++] = ctx->i32;
8534 params[num_params++] = ctx->i32;
8535 params[num_params++] = ctx->i32;
8536 params[num_params++] = ctx->i32;
8537 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8538 params[num_params++] = ctx->i32;
8539 params[num_params++] = ctx->i32;
8540 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
8541 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
8542 } else {
8543 params[num_params++] = ctx->i64;
8544 params[num_params++] = ctx->i64;
8545 params[num_params++] = ctx->i64;
8546 params[num_params++] = ctx->i64;
8547 params[num_params++] = ctx->i64;
8548 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8549 params[num_params++] = ctx->i32;
8550 params[num_params++] = ctx->i32;
8551 params[num_params++] = ctx->i32;
8552 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
8553 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
8554 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8555 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8556 }
8557 last_sgpr = num_params - 1;
8558
8559 params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
8560 params[num_params++] = ctx->i32; /* invocation ID within the patch */
8561 params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
8562
8563 /* Create the function. */
8564 si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
8565 ctx->screen->b.chip_class >= CIK ? 128 : 64);
8566 declare_lds_as_pointer(ctx);
8567 func = ctx->main_fn;
8568
8569 si_write_tess_factors(bld_base,
8570 LLVMGetParam(func, last_sgpr + 1),
8571 LLVMGetParam(func, last_sgpr + 2),
8572 LLVMGetParam(func, last_sgpr + 3));
8573
8574 LLVMBuildRetVoid(gallivm->builder);
8575 }
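
/* The parameter declarations above use a C assignment-expression to record
 * a parameter's index at the moment it is declared; a minimal sketch of the
 * idiom with hypothetical names.
 */
struct example_param_indices {
	int param_offchip_offset;
	int param_factor_offset;
};

static int example_declare_params(struct example_param_indices *ix,
				  int types[8], int i32_type, int i64_type)
{
	int num = 0;

	types[num++] = i64_type;	/* anonymous parameter */
	/* Remember the index of each named parameter as it is declared. */
	types[ix->param_offchip_offset = num++] = i32_type;
	types[ix->param_factor_offset = num++] = i32_type;
	return num;
}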
8576
8577 /**
8578 * Select and compile (or reuse) TCS parts (epilog).
8579 */
8580 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8581 LLVMTargetMachineRef tm,
8582 struct si_shader *shader,
8583 struct pipe_debug_callback *debug)
8584 {
8585 if (sscreen->b.chip_class >= GFX9) {
8586 struct si_shader *ls_main_part =
8587 shader->key.part.tcs.ls->main_shader_part_ls;
8588
8589 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8590 &shader->key.part.tcs.ls_prolog))
8591 return false;
8592
8593 shader->previous_stage = ls_main_part;
8594 }
8595
8596 /* Get the epilog. */
8597 union si_shader_part_key epilog_key;
8598 memset(&epilog_key, 0, sizeof(epilog_key));
8599 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8600
8601 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8602 PIPE_SHADER_TESS_CTRL, false,
8603 &epilog_key, tm, debug,
8604 si_build_tcs_epilog_function,
8605 "Tessellation Control Shader Epilog");
8606 return shader->epilog != NULL;
8607 }
8608
8609 /**
8610 * Select and compile (or reuse) GS parts (prolog).
8611 */
8612 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8613 LLVMTargetMachineRef tm,
8614 struct si_shader *shader,
8615 struct pipe_debug_callback *debug)
8616 {
8617 if (sscreen->b.chip_class >= GFX9) {
8618 struct si_shader *es_main_part =
8619 shader->key.part.gs.es->main_shader_part_es;
8620
8621 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
8622 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
8623 &shader->key.part.gs.vs_prolog))
8624 return false;
8625
8626 shader->previous_stage = es_main_part;
8627 }
8628
8629 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8630 return true;
8631
8632 union si_shader_part_key prolog_key;
8633 memset(&prolog_key, 0, sizeof(prolog_key));
8634 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8635
8636 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8637 PIPE_SHADER_GEOMETRY, true,
8638 &prolog_key, tm, debug,
8639 si_build_gs_prolog_function,
8640 "Geometry Shader Prolog");
8641 return shader->prolog2 != NULL;
8642 }
8643
8644 /**
8645 * Build the pixel shader prolog function. This handles:
8646 * - two-side color selection and interpolation
8647 * - overriding interpolation parameters for the API PS
8648 * - polygon stippling
8649 *
8650 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
8651 * overridden by other states (e.g. per-sample interpolation).
8652 * Interpolated colors are stored after the preloaded VGPRs.
8653 */
8654 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
8655 union si_shader_part_key *key)
8656 {
8657 struct gallivm_state *gallivm = &ctx->gallivm;
8658 LLVMTypeRef *params;
8659 LLVMValueRef ret, func;
8660 int last_sgpr, num_params, num_returns, i, num_color_channels;
8661
8662 assert(si_need_ps_prolog(key));
8663
8664 /* Number of inputs + 8 color elements. */
8665 params = alloca((key->ps_prolog.num_input_sgprs +
8666 key->ps_prolog.num_input_vgprs + 8) *
8667 sizeof(LLVMTypeRef));
8668
8669 /* Declare inputs. */
8670 num_params = 0;
8671 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8672 params[num_params++] = ctx->i32;
8673 last_sgpr = num_params - 1;
8674
8675 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8676 params[num_params++] = ctx->f32;
8677
8678 /* Declare outputs (same as inputs + add colors if needed) */
8679 num_returns = num_params;
8680 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8681 for (i = 0; i < num_color_channels; i++)
8682 params[num_returns++] = ctx->f32;
8683
8684 /* Create the function. */
8685 si_create_function(ctx, "ps_prolog", params, num_returns, params,
8686 num_params, last_sgpr, 0);
8687 func = ctx->main_fn;
8688
8689 /* Copy inputs to outputs. This should be a no-op, as the registers match,
8690 * but it will prevent the compiler from overwriting them unintentionally.
8691 */
8692 ret = ctx->return_value;
8693 for (i = 0; i < num_params; i++) {
8694 LLVMValueRef p = LLVMGetParam(func, i);
8695 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8696 }
8697
8698 /* Polygon stippling. */
8699 if (key->ps_prolog.states.poly_stipple) {
8700 /* POS_FIXED_PT is always last. */
8701 unsigned pos = key->ps_prolog.num_input_sgprs +
8702 key->ps_prolog.num_input_vgprs - 1;
8703 LLVMValueRef ptr[2], list;
8704
8705 /* Get the pointer to rw buffers. */
8706 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8707 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8708 list = lp_build_gather_values(gallivm, ptr, 2);
8709 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8710 list = LLVMBuildIntToPtr(gallivm->builder, list,
8711 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
8712
8713 si_llvm_emit_polygon_stipple(ctx, list, pos);
8714 }
8715
8716 if (key->ps_prolog.states.bc_optimize_for_persp ||
8717 key->ps_prolog.states.bc_optimize_for_linear) {
8718 unsigned i, base = key->ps_prolog.num_input_sgprs;
8719 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8720
8721 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8722 * The hw doesn't compute CENTROID if the whole wave only
8723 * contains fully-covered quads.
8724 *
8725 * PRIM_MASK is after user SGPRs.
8726 */
8727 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8728 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8729 LLVMConstInt(ctx->i32, 31, 0), "");
8730 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8731 ctx->i1, "");
8732
8733 if (key->ps_prolog.states.bc_optimize_for_persp) {
8734 /* Read PERSP_CENTER. */
8735 for (i = 0; i < 2; i++)
8736 center[i] = LLVMGetParam(func, base + 2 + i);
8737 /* Read PERSP_CENTROID. */
8738 for (i = 0; i < 2; i++)
8739 centroid[i] = LLVMGetParam(func, base + 4 + i);
8740 /* Select PERSP_CENTROID. */
8741 for (i = 0; i < 2; i++) {
8742 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8743 center[i], centroid[i], "");
8744 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8745 tmp, base + 4 + i, "");
8746 }
8747 }
8748 if (key->ps_prolog.states.bc_optimize_for_linear) {
8749 /* Read LINEAR_CENTER. */
8750 for (i = 0; i < 2; i++)
8751 center[i] = LLVMGetParam(func, base + 8 + i);
8752 /* Read LINEAR_CENTROID. */
8753 for (i = 0; i < 2; i++)
8754 centroid[i] = LLVMGetParam(func, base + 10 + i);
8755 /* Select LINEAR_CENTROID. */
8756 for (i = 0; i < 2; i++) {
8757 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8758 center[i], centroid[i], "");
8759 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8760 tmp, base + 10 + i, "");
8761 }
8762 }
8763 }
8764
8765 /* Force per-sample interpolation. */
8766 if (key->ps_prolog.states.force_persp_sample_interp) {
8767 unsigned i, base = key->ps_prolog.num_input_sgprs;
8768 LLVMValueRef persp_sample[2];
8769
8770 /* Read PERSP_SAMPLE. */
8771 for (i = 0; i < 2; i++)
8772 persp_sample[i] = LLVMGetParam(func, base + i);
8773 /* Overwrite PERSP_CENTER. */
8774 for (i = 0; i < 2; i++)
8775 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8776 persp_sample[i], base + 2 + i, "");
8777 /* Overwrite PERSP_CENTROID. */
8778 for (i = 0; i < 2; i++)
8779 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8780 persp_sample[i], base + 4 + i, "");
8781 }
8782 if (key->ps_prolog.states.force_linear_sample_interp) {
8783 unsigned i, base = key->ps_prolog.num_input_sgprs;
8784 LLVMValueRef linear_sample[2];
8785
8786 /* Read LINEAR_SAMPLE. */
8787 for (i = 0; i < 2; i++)
8788 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8789 /* Overwrite LINEAR_CENTER. */
8790 for (i = 0; i < 2; i++)
8791 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8792 linear_sample[i], base + 8 + i, "");
8793 /* Overwrite LINEAR_CENTROID. */
8794 for (i = 0; i < 2; i++)
8795 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8796 linear_sample[i], base + 10 + i, "");
8797 }
8798
8799 /* Force center interpolation. */
8800 if (key->ps_prolog.states.force_persp_center_interp) {
8801 unsigned i, base = key->ps_prolog.num_input_sgprs;
8802 LLVMValueRef persp_center[2];
8803
8804 /* Read PERSP_CENTER. */
8805 for (i = 0; i < 2; i++)
8806 persp_center[i] = LLVMGetParam(func, base + 2 + i);
8807 /* Overwrite PERSP_SAMPLE. */
8808 for (i = 0; i < 2; i++)
8809 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8810 persp_center[i], base + i, "");
8811 /* Overwrite PERSP_CENTROID. */
8812 for (i = 0; i < 2; i++)
8813 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8814 persp_center[i], base + 4 + i, "");
8815 }
8816 if (key->ps_prolog.states.force_linear_center_interp) {
8817 unsigned i, base = key->ps_prolog.num_input_sgprs;
8818 LLVMValueRef linear_center[2];
8819
8820 /* Read LINEAR_CENTER. */
8821 for (i = 0; i < 2; i++)
8822 linear_center[i] = LLVMGetParam(func, base + 8 + i);
8823 /* Overwrite LINEAR_SAMPLE. */
8824 for (i = 0; i < 2; i++)
8825 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8826 linear_center[i], base + 6 + i, "");
8827 /* Overwrite LINEAR_CENTROID. */
8828 for (i = 0; i < 2; i++)
8829 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8830 linear_center[i], base + 10 + i, "");
8831 }
8832
8833 /* Interpolate colors. */
8834 for (i = 0; i < 2; i++) {
8835 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8836 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8837 key->ps_prolog.face_vgpr_index;
8838 LLVMValueRef interp[2], color[4];
8839 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8840
8841 if (!writemask)
8842 continue;
8843
8844 /* Set up (i,j) unless the interpolation qualifier is CONSTANT, encoded as index -1. */
8845 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8846 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8847 key->ps_prolog.color_interp_vgpr_index[i];
8848
8849 /* Get the (i,j) updated by bc_optimize handling. */
8850 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8851 interp_vgpr, "");
8852 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8853 interp_vgpr + 1, "");
8854 interp_ij = lp_build_gather_values(gallivm, interp, 2);
8855 }
8856
8857 /* Use the absolute location of the input. */
8858 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8859
8860 if (key->ps_prolog.states.color_two_side) {
8861 face = LLVMGetParam(func, face_vgpr);
8862 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8863 }
8864
8865 interp_fs_input(ctx,
8866 key->ps_prolog.color_attr_index[i],
8867 TGSI_SEMANTIC_COLOR, i,
8868 key->ps_prolog.num_interp_inputs,
8869 key->ps_prolog.colors_read, interp_ij,
8870 prim_mask, face, color);
8871
8872 while (writemask) {
8873 unsigned chan = u_bit_scan(&writemask);
8874 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8875 num_params++, "");
8876 }
8877 }
8878
8879 /* Tell LLVM to insert the WQM instruction sequence when needed. */
8880 if (key->ps_prolog.wqm) {
8881 LLVMAddTargetDependentFunctionAttr(func,
8882 "amdgpu-ps-wqm-outputs", "");
8883 }
8884
8885 si_llvm_build_ret(ctx, ret);
8886 }
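
/* The color loop above drains the writemask with u_bit_scan(), which pops
 * the lowest set bit and returns its index. A self-contained equivalent
 * (sketch; uses the GCC/Clang __builtin_ctz, not the util/ implementation).
 * Used as: while (writemask) { chan = example_bit_scan(&writemask); ... }
 */
static int example_bit_scan(unsigned *mask)
{
	int i = __builtin_ctz(*mask);	/* caller guarantees *mask != 0 */

	*mask &= *mask - 1;		/* clear the lowest set bit */
	return i;
}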
8887
8888 /**
8889 * Build the pixel shader epilog function. This handles everything that must be
8890 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
8891 */
8892 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8893 union si_shader_part_key *key)
8894 {
8895 struct gallivm_state *gallivm = &ctx->gallivm;
8896 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8897 LLVMTypeRef params[16+8*4+3];
8898 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8899 int last_sgpr, num_params = 0, i;
8900 struct si_ps_exports exp = {};
8901
8902 /* Declare input SGPRs. */
8903 params[ctx->param_rw_buffers = num_params++] = ctx->i64;
8904 params[ctx->param_const_buffers = num_params++] = ctx->i64;
8905 params[ctx->param_samplers = num_params++] = ctx->i64;
8906 params[ctx->param_images = num_params++] = ctx->i64;
8907 params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8908 assert(num_params == SI_PARAM_ALPHA_REF);
8909 params[SI_PARAM_ALPHA_REF] = ctx->f32;
8910 last_sgpr = SI_PARAM_ALPHA_REF;
8911
8912 /* Declare input VGPRs. */
8913 num_params = (last_sgpr + 1) +
8914 util_bitcount(key->ps_epilog.colors_written) * 4 +
8915 key->ps_epilog.writes_z +
8916 key->ps_epilog.writes_stencil +
8917 key->ps_epilog.writes_samplemask;
8918
8919 num_params = MAX2(num_params,
8920 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
8921
8922 assert(num_params <= ARRAY_SIZE(params));
8923
8924 for (i = last_sgpr + 1; i < num_params; i++)
8925 params[i] = ctx->f32;
8926
8927 /* Create the function. */
8928 si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
8929 last_sgpr, 0);
8930 /* Disable elimination of unused inputs. */
8931 si_llvm_add_attribute(ctx->main_fn,
8932 "InitialPSInputAddr", 0xffffff);
8933
8934 /* Process colors. */
8935 unsigned vgpr = last_sgpr + 1;
8936 unsigned colors_written = key->ps_epilog.colors_written;
8937 int last_color_export = -1;
8938
8939 /* Find the last color export. */
8940 if (!key->ps_epilog.writes_z &&
8941 !key->ps_epilog.writes_stencil &&
8942 !key->ps_epilog.writes_samplemask) {
8943 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
8944
8945 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
8946 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
8947 /* Just set this if any of the colorbuffers are enabled. */
8948 if (spi_format &
8949 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
8950 last_color_export = 0;
8951 } else {
8952 for (i = 0; i < 8; i++)
8953 if (colors_written & (1 << i) &&
8954 (spi_format >> (i * 4)) & 0xf)
8955 last_color_export = i;
8956 }
8957 }
8958
8959 while (colors_written) {
8960 LLVMValueRef color[4];
8961 int mrt = u_bit_scan(&colors_written);
8962
8963 for (i = 0; i < 4; i++)
8964 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
8965
8966 si_export_mrt_color(bld_base, color, mrt,
8967 num_params - 1,
8968 mrt == last_color_export, &exp);
8969 }
8970
8971 /* Process depth, stencil, samplemask. */
8972 if (key->ps_epilog.writes_z)
8973 depth = LLVMGetParam(ctx->main_fn, vgpr++);
8974 if (key->ps_epilog.writes_stencil)
8975 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
8976 if (key->ps_epilog.writes_samplemask)
8977 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
8978
8979 if (depth || stencil || samplemask)
8980 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
8981 else if (last_color_export == -1)
8982 si_export_null(bld_base);
8983
8984 if (exp.num)
8985 si_emit_ps_exports(ctx, &exp);
8986
8987 /* Compile. */
8988 LLVMBuildRetVoid(gallivm->builder);
8989 }
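
/* The last-export scan above checks one 4-bit field per MRT in
 * spi_shader_col_format; the same test in isolation (sketch):
 */
static int example_find_last_color_export(unsigned colors_written,
					  unsigned spi_format)
{
	int i, last = -1;

	for (i = 0; i < 8; i++) {
		/* A zero nibble means the hw would discard the export. */
		if ((colors_written & (1u << i)) &&
		    ((spi_format >> (i * 4)) & 0xf))
			last = i;
	}
	return last;
}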
8990
8991 /**
8992 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
8993 */
8994 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
8995 LLVMTargetMachineRef tm,
8996 struct si_shader *shader,
8997 struct pipe_debug_callback *debug)
8998 {
8999 union si_shader_part_key prolog_key;
9000 union si_shader_part_key epilog_key;
9001
9002 /* Get the prolog. */
9003 si_get_ps_prolog_key(shader, &prolog_key, true);
9004
9005 /* The prolog is a no-op if these aren't set. */
9006 if (si_need_ps_prolog(&prolog_key)) {
9007 shader->prolog =
9008 si_get_shader_part(sscreen, &sscreen->ps_prologs,
9009 PIPE_SHADER_FRAGMENT, true,
9010 &prolog_key, tm, debug,
9011 si_build_ps_prolog_function,
9012 "Fragment Shader Prolog");
9013 if (!shader->prolog)
9014 return false;
9015 }
9016
9017 /* Get the epilog. */
9018 si_get_ps_epilog_key(shader, &epilog_key);
9019
9020 shader->epilog =
9021 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
9022 PIPE_SHADER_FRAGMENT, false,
9023 &epilog_key, tm, debug,
9024 si_build_ps_epilog_function,
9025 "Fragment Shader Epilog");
9026 if (!shader->epilog)
9027 return false;
9028
9029 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
9030 if (shader->key.part.ps.prolog.poly_stipple) {
9031 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
9032 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
9033 }
9034
9035 /* Set up the enable bits for per-sample shading if needed. */
9036 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
9037 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
9038 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9039 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
9040 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
9041 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
9042 }
9043 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
9044 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
9045 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9046 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
9047 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9048 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
9049 }
9050 if (shader->key.part.ps.prolog.force_persp_center_interp &&
9051 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9052 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9053 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
9054 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
9055 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9056 }
9057 if (shader->key.part.ps.prolog.force_linear_center_interp &&
9058 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9059 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9060 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
9061 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9062 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9063 }
9064
9065 /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
9066 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
9067 !(shader->config.spi_ps_input_ena & 0xf)) {
9068 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9069 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
9070 }
9071
9072 /* At least one pair of interpolation weights must be enabled. */
9073 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
9074 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9075 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
9076 }
9077
9078 /* The sample mask input is always enabled, because the API shader always
9079 * passes it through to the epilog. Disable it here if it's unused.
9080 */
9081 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
9082 !shader->selector->info.reads_samplemask)
9083 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
9084
9085 return true;
9086 }
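
/* The spi_ps_input_ena updates above follow sid.h's generated register-field
 * idiom: S_*() shifts a value into a field, G_*() extracts it, and C_* is the
 * complement mask that clears it. A sketch with hypothetical single-bit
 * macros:
 */
#define EXAMPLE_FIELD_SHIFT	2
#define S_EXAMPLE_FIELD(x)	((unsigned)(x) << EXAMPLE_FIELD_SHIFT)
#define G_EXAMPLE_FIELD(v)	(((v) >> EXAMPLE_FIELD_SHIFT) & 0x1)
#define C_EXAMPLE_FIELD		(~(1u << EXAMPLE_FIELD_SHIFT))

static unsigned example_force_field(unsigned reg)
{
	reg &= C_EXAMPLE_FIELD;		/* clear the field */
	reg |= S_EXAMPLE_FIELD(1);	/* then set it */
	return reg;
}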
9087
9088 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
9089 unsigned *lds_size)
9090 {
9091 /* SPI barrier management bug:
9092 * Make sure we have at least 4k of LDS in use to avoid the bug.
9093 * It applies to workgroup sizes of more than one wavefront.
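 * (For the GFX7 chips listed below, the LDS_SIZE encoding appears to be in
 * 512-byte units, so the clamp to 8 corresponds to the 4k minimum.)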
9094 */
9095 if (sscreen->b.family == CHIP_BONAIRE ||
9096 sscreen->b.family == CHIP_KABINI ||
9097 sscreen->b.family == CHIP_MULLINS)
9098 *lds_size = MAX2(*lds_size, 8);
9099 }
9100
9101 static void si_fix_resource_usage(struct si_screen *sscreen,
9102 struct si_shader *shader)
9103 {
9104 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
9105
9106 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
9107
9108 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
9109 si_get_max_workgroup_size(shader) > 64) {
9110 si_multiwave_lds_size_workaround(sscreen,
9111 &shader->config.lds_size);
9112 }
9113 }
9114
9115 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
9116 struct si_shader *shader,
9117 struct pipe_debug_callback *debug)
9118 {
9119 struct si_shader_selector *sel = shader->selector;
9120 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
9121 int r;
9122
9123 /* LS, ES, VS are compiled on demand if the main part hasn't been
9124 * compiled for that stage.
9125 *
9126 * Vertex shaders are compiled on demand when a vertex fetch
9127 * workaround must be applied.
9128 */
9129 if (shader->is_monolithic) {
9130 /* Monolithic shader (compiled as a whole, has many variants,
9131 * may take a long time to compile).
9132 */
9133 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
9134 if (r)
9135 return r;
9136 } else {
9137 /* The shader consists of 2-3 parts:
9138 *
9139 * - the middle part is the user shader, it has 1 variant only
9140 * and it was compiled during the creation of the shader
9141 * selector
9142 * - the prolog part is inserted at the beginning
9143 * - the epilog part is inserted at the end
9144 *
9145 * The prolog and epilog have many (but simple) variants.
9146 */
9147
9148 /* Copy the compiled TGSI shader data over. */
9149 shader->is_binary_shared = true;
9150 shader->binary = mainp->binary;
9151 shader->config = mainp->config;
9152 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
9153 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
9154 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
9155 memcpy(shader->info.vs_output_param_offset,
9156 mainp->info.vs_output_param_offset,
9157 sizeof(mainp->info.vs_output_param_offset));
9158 shader->info.uses_instanceid = mainp->info.uses_instanceid;
9159 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
9160 shader->info.nr_param_exports = mainp->info.nr_param_exports;
9161
9162 /* Select prologs and/or epilogs. */
9163 switch (sel->type) {
9164 case PIPE_SHADER_VERTEX:
9165 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
9166 return -1;
9167 break;
9168 case PIPE_SHADER_TESS_CTRL:
9169 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
9170 return -1;
9171 break;
9172 case PIPE_SHADER_TESS_EVAL:
9173 break;
9174 case PIPE_SHADER_GEOMETRY:
9175 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
9176 return -1;
9177 break;
9178 case PIPE_SHADER_FRAGMENT:
9179 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
9180 return -1;
9181
9182 /* Make sure we have at least as many VGPRs as there
9183 * are allocated inputs.
9184 */
9185 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9186 shader->info.num_input_vgprs);
9187 break;
9188 }
9189
9190 /* Update SGPR and VGPR counts. */
9191 if (shader->prolog) {
9192 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9193 shader->prolog->config.num_sgprs);
9194 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9195 shader->prolog->config.num_vgprs);
9196 }
9197 if (shader->previous_stage) {
9198 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9199 shader->previous_stage->config.num_sgprs);
9200 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9201 shader->previous_stage->config.num_vgprs);
9202 shader->config.spilled_sgprs =
9203 MAX2(shader->config.spilled_sgprs,
9204 shader->previous_stage->config.spilled_sgprs);
9205 shader->config.spilled_vgprs =
9206 MAX2(shader->config.spilled_vgprs,
9207 shader->previous_stage->config.spilled_vgprs);
9208 shader->config.private_mem_vgprs =
9209 MAX2(shader->config.private_mem_vgprs,
9210 shader->previous_stage->config.private_mem_vgprs);
9211 shader->config.scratch_bytes_per_wave =
9212 MAX2(shader->config.scratch_bytes_per_wave,
9213 shader->previous_stage->config.scratch_bytes_per_wave);
9214 shader->info.uses_instanceid |=
9215 shader->previous_stage->info.uses_instanceid;
9216 }
9217 if (shader->prolog2) {
9218 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9219 shader->prolog2->config.num_sgprs);
9220 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9221 shader->prolog2->config.num_vgprs);
9222 }
9223 if (shader->epilog) {
9224 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9225 shader->epilog->config.num_sgprs);
9226 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9227 shader->epilog->config.num_vgprs);
9228 }
9229 }
9230
9231 si_fix_resource_usage(sscreen, shader);
9232 si_shader_dump(sscreen, shader, debug, sel->info.processor,
9233 stderr, true);
9234
9235 /* Upload. */
9236 r = si_shader_binary_upload(sscreen, shader);
9237 if (r) {
9238 fprintf(stderr, "LLVM failed to upload shader\n");
9239 return r;
9240 }
9241
9242 return 0;
9243 }
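
/* The per-part register bookkeeping in si_shader_create() is a fold of MAX2
 * over the parts' configs; it could be factored into a helper along these
 * lines (sketch, hypothetical function):
 */
static void example_merge_part_limits(struct si_shader_config *dst,
				      const struct si_shader_config *part)
{
	dst->num_sgprs = MAX2(dst->num_sgprs, part->num_sgprs);
	dst->num_vgprs = MAX2(dst->num_vgprs, part->num_vgprs);
}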
9244
9245 void si_shader_destroy(struct si_shader *shader)
9246 {
9247 if (shader->scratch_bo)
9248 r600_resource_reference(&shader->scratch_bo, NULL);
9249
9250 r600_resource_reference(&shader->bo, NULL);
9251
9252 if (!shader->is_binary_shared)
9253 radeon_shader_binary_clean(&shader->binary);
9254
9255 free(shader->shader_log);
9256 }