radeonsi: get InstanceID from VGPR1 (or VGPR2 for tess) instead of VGPR3
src/gallium/drivers/radeonsi/si_shader.c
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *	Tom Stellard <thomas.stellard@amd.com>
 *	Michel Dänzer <michel.daenzer@amd.com>
 *	Christian König <christian.koenig@amd.com>
 */

#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_misc.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"


static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};

static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_vs_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};

static bool is_merged_shader(struct si_shader *shader)
{
	if (shader->selector->screen->b.chip_class <= VI)
		return false;

	return shader->key.as_ls ||
	       shader->key.as_es ||
	       shader->selector->type == PIPE_SHADER_TESS_CTRL ||
	       shader->selector->type == PIPE_SHADER_GEOMETRY;
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-4)
			return 4 + index;

		assert(!"invalid generic index");
		return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		assert(!"invalid semantic name");
		return 0;
	}
}
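
/* Example (illustrative): because the returned index is < 64, callers can
 * track used slots in a 64-bit mask, e.g.
 *   mask |= 1ull << si_shader_io_get_unique_index(name, index);
 * so a GENERIC[2] output would occupy bit 4 + 2 = 6.
 */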

unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
{
	switch (name) {
	case TGSI_SEMANTIC_FOG:
		return 0;
	case TGSI_SEMANTIC_LAYER:
		return 1;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return 2;
	case TGSI_SEMANTIC_PRIMID:
		return 3;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		return 4 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 6 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = LLVMGetParam(ctx->main_fn,
					  param);

	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
		value = bitcast(&ctx->bld_base,
				TGSI_TYPE_UNSIGNED, value);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      LLVMConstInt(ctx->i32, rshift, 0), "");

	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     LLVMConstInt(ctx->i32, mask, 0), "");
	}

	return value;
}
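
/* Example (illustrative): unpack_param(ctx, param, 8, 5) computes
 * (value >> 8) & 0x1f, i.e. it returns bits [12:8] of the parameter. */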

static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
{
	switch (ctx->type) {
	case PIPE_SHADER_TESS_CTRL:
		return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);

	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_rel_patch_id);

	default:
		assert(0);
		return NULL;
	}
}

/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0		= get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0	= get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2		= get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2	= get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */

static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}

static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}

static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}

static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}

static LLVMValueRef
get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
}

static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}
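
/* Worked example (illustrative): with RelPatchID == 2, the helpers above
 * yield 2 * in_patch_stride for the current input patch, and
 * patch0_offset + 2 * out_patch_stride for the current output patch,
 * matching the "patch 2" entries in the LDS layout comment. */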

static LLVMValueRef get_instance_index_for_fetch(
	struct si_shader_context *ctx,
	unsigned param_start_instance, unsigned divisor)
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMValueRef result = LLVMGetParam(ctx->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				       LLVMConstInt(ctx->i32, divisor, 0), "");

	return LLVMBuildAdd(gallivm->builder, result,
			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
}
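
/* Example (illustrative): with instance_id = 7, divisor = 4 and
 * start_instance = 10, the fetch index is 7 / 4 + 10 = 11; dividing
 * before the add is what makes instance divisors behave correctly. */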

/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 * to float. */
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
					    LLVMValueRef vec4,
					    unsigned double_index)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
					      LLVMVectorType(f64, 2), "");
	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
	return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
}

static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
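		/* Illustrative check: 0.666667f is 0x3F2AAAAB, whose exponent
		 * LSBs are 0b10; (x << 7) moves them to bits 31:30, and the
		 * arithmetic shift by 30 below yields -2, the correct signed
		 * 2-bit value. */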
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}

static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
				     unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	if (swizzle > 0)
		return ctx->i32_0;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tcs_patch_id);
	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_patch_id);
	case PIPE_SHADER_GEOMETRY:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_gs_prim_id);
	default:
		assert(0);
		return ctx->i32_0;
	}
}

/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 */
static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
				       const struct tgsi_ind_register *ind,
				       int rel_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef result;

	result = ctx->addrs[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	result = LLVMBuildAdd(gallivm->builder, result,
			      LLVMConstInt(ctx->i32, rel_index, 0), "");
	return result;
}

/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);

	/* LLVM 3.8: If indirect resource indexing is used:
	 * - SI & CIK hang
	 * - VI crashes
	 */
	if (HAVE_LLVM == 0x0308)
		return LLVMGetUndef(ctx->i32);

	return si_llvm_bound_index(ctx, result, num);
}


/**
 * Calculate a dword address given an input or output register and a stride.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
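
/* Worked example (illustrative): reading OUT[5] of vertex 2 with a vertex
 * stride of 16 dwords yields base_addr + 2 * 16 + param * 4, where param is
 * the unique index of OUT[5]; the channel offset (0..3) is added later by
 * lds_load/lds_store via the swizzle. */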

/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
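
/* Summary of the addressing below (illustrative): each attribute slot is a
 * 16-byte vec4, so the byte address is
 *   per-vertex: (rel_patch_id * vertices_per_patch + vertex_index
 *                + param_index * total_vertices) * 16
 *   per-patch:  (rel_patch_id + param_index * num_patches) * 16
 *               + patch_data_offset
 * with the attribute index as the outermost dimension, as laid out above. */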
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}

static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
					struct si_shader_context *ctx,
					const struct tgsi_full_dst_register *dst,
					const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}

static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool readonly_memory)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, readonly_memory);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, readonly_memory);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}

/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}

/**
 * Store to LDS.
 *
 * \param swizzle	offset (typically 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}

static LLVMValueRef fetch_input_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	dw_addr = get_tcs_in_current_patch_offset(ctx);
	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_output_tcs(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_src_register *reg,
		enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_input_tes(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_src_register *reg,
		enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rw_buffers, buffer, base, addr;

	rw_buffers = LLVMGetParam(ctx->main_fn,
				  ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}

static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	rw_buffers = LLVMGetParam(ctx->main_fn,
				  ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}

static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}
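
	/* Illustrative: the six ES vertex offsets arrive packed two per SGPR,
	 * 16 bits each, so vertex 3 is read from bits 31:16 of
	 * param_gs_vtx23_offset by the unpack_param calls above. */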

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}

static int lookup_interp_param_index(unsigned interpolate, unsigned location)
{
	switch (interpolate) {
	case TGSI_INTERPOLATE_CONSTANT:
		return 0;

	case TGSI_INTERPOLATE_LINEAR:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_LINEAR_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_LINEAR_CENTROID;
		else
			return SI_PARAM_LINEAR_CENTER;
		break;
	case TGSI_INTERPOLATE_COLOR:
	case TGSI_INTERPOLATE_PERSPECTIVE:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_PERSP_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_PERSP_CENTROID;
		else
			return SI_PARAM_PERSP_CENTER;
		break;
	default:
		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
		return -1;
	}
}

/**
 * Interpolate a fragment shader input.
 *
 * @param ctx			context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j)
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef attr_number;
	LLVMValueRef i, j;

	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or, for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	attr_number = LLVMConstInt(ctx->i32, input_index, 0);

	if (interp) {
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
			LLVMValueRef front, back;

			if (interp) {
				front = ac_build_fs_interp(&ctx->ac, llvm_chan,
							   attr_number, prim_mask,
							   i, j);
				back = ac_build_fs_interp(&ctx->ac, llvm_chan,
							  back_attr_number, prim_mask,
							  i, j);
			} else {
				front = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
				back = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, back_attr_number, prim_mask);
			}

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		if (interp) {
			result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
						       attr_number, prim_mask, i, j);
		} else {
			result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
				LLVMConstInt(ctx->i32, 2, 0), /* P0 */
				attr_number, prim_mask);
		}
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);

			if (interp) {
				result[chan] = ac_build_fs_interp(&ctx->ac,
					llvm_chan, attr_number, prim_mask, i, j);
			} else {
				result[chan] = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
			}
		}
	}
}

static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}
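
	/* Example (illustrative): if COLOR0 reads only .xy, those two
	 * components occupy the first color VGPRs, so COLOR1 (i == 1)
	 * starts at SI_PARAM_POS_FIXED_PT + 1 + 2. */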
1339
1340 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1341 decl->Interp.Location);
1342 if (interp_param_idx == -1)
1343 return;
1344 else if (interp_param_idx) {
1345 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1346 }
1347
1348 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1349 decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1350 ctx->shader->key.part.ps.prolog.flatshade_colors)
1351 interp_param = NULL; /* load the constant color */
1352
1353 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1354 decl->Semantic.Index, shader->selector->info.num_inputs,
1355 shader->selector->info.colors_read, interp_param,
1356 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1357 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1358 &out[0]);
1359 }
1360
1361 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1362 {
1363 return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1364 }
1365
1366
1367 /**
1368 * Load a dword from a constant buffer.
1369 */
1370 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1371 LLVMValueRef resource,
1372 LLVMValueRef offset)
1373 {
1374 LLVMBuilderRef builder = ctx->gallivm.builder;
1375 LLVMValueRef args[2] = {resource, offset};
1376
1377 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1378 LP_FUNC_ATTR_READNONE |
1379 LP_FUNC_ATTR_LEGACY);
1380 }
1381
1382 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1383 {
1384 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1385 struct gallivm_state *gallivm = &ctx->gallivm;
1386 LLVMBuilderRef builder = gallivm->builder;
1387 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1388 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1389 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1390
1391 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1392 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1393 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1394
1395 LLVMValueRef pos[4] = {
1396 buffer_load_const(ctx, resource, offset0),
1397 buffer_load_const(ctx, resource, offset1),
1398 LLVMConstReal(ctx->f32, 0),
1399 LLVMConstReal(ctx->f32, 0)
1400 };
1401
1402 return lp_build_gather_values(gallivm, pos, 4);
1403 }
1404
1405 static void declare_system_value(struct si_shader_context *ctx,
1406 unsigned index,
1407 const struct tgsi_full_declaration *decl)
1408 {
1409 struct lp_build_context *bld = &ctx->bld_base.base;
1410 struct gallivm_state *gallivm = &ctx->gallivm;
1411 LLVMValueRef value = 0;
1412
1413 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1414
1415 switch (decl->Semantic.Name) {
1416 case TGSI_SEMANTIC_INSTANCEID:
1417 value = LLVMGetParam(ctx->main_fn,
1418 ctx->param_instance_id);
1419 break;
1420
1421 case TGSI_SEMANTIC_VERTEXID:
1422 value = LLVMBuildAdd(gallivm->builder,
1423 LLVMGetParam(ctx->main_fn,
1424 ctx->param_vertex_id),
1425 LLVMGetParam(ctx->main_fn,
1426 ctx->param_base_vertex), "");
1427 break;
1428
1429 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1430 /* Unused. Clarify the meaning in indexed vs. non-indexed
1431 * draws if this is ever used again. */
1432 assert(false);
1433 break;
1434
1435 case TGSI_SEMANTIC_BASEVERTEX:
1436 {
1437 /* For non-indexed draws, the base vertex set by the driver
1438 * (for direct draws) or the CP (for indirect draws) is the
1439 * first vertex ID, but GLSL expects 0 to be returned.
1440 */
1441 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1442 LLVMValueRef indexed;
1443
1444 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1445 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1446
1447 value = LLVMBuildSelect(gallivm->builder, indexed,
1448 LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1449 ctx->i32_0, "");
1450 break;
1451 }
1452
1453 case TGSI_SEMANTIC_BASEINSTANCE:
1454 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1455 break;
1456
1457 case TGSI_SEMANTIC_DRAWID:
1458 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1459 break;
1460
1461 case TGSI_SEMANTIC_INVOCATIONID:
1462 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1463 value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1464 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1465 value = LLVMGetParam(ctx->main_fn,
1466 ctx->param_gs_instance_id);
1467 else
1468 assert(!"INVOCATIONID not implemented");
1469 break;
1470
1471 case TGSI_SEMANTIC_POSITION:
1472 {
1473 LLVMValueRef pos[4] = {
1474 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1475 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1476 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1477 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1478 LLVMGetParam(ctx->main_fn,
1479 SI_PARAM_POS_W_FLOAT)),
1480 };
1481 value = lp_build_gather_values(gallivm, pos, 4);
1482 break;
1483 }
1484
1485 case TGSI_SEMANTIC_FACE:
1486 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1487 break;
1488
1489 case TGSI_SEMANTIC_SAMPLEID:
1490 value = get_sample_id(ctx);
1491 break;
1492
1493 case TGSI_SEMANTIC_SAMPLEPOS: {
1494 LLVMValueRef pos[4] = {
1495 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1496 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1497 LLVMConstReal(ctx->f32, 0),
1498 LLVMConstReal(ctx->f32, 0)
1499 };
1500 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1501 TGSI_OPCODE_FRC, pos[0]);
1502 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1503 TGSI_OPCODE_FRC, pos[1]);
1504 value = lp_build_gather_values(gallivm, pos, 4);
1505 break;
1506 }
1507
1508 case TGSI_SEMANTIC_SAMPLEMASK:
1509 /* This can only occur with the OpenGL Core profile, which
1510 * doesn't support smoothing.
1511 */
1512 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1513 break;
1514
1515 case TGSI_SEMANTIC_TESSCOORD:
1516 {
1517 LLVMValueRef coord[4] = {
1518 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1519 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1520 bld->zero,
1521 bld->zero
1522 };
1523
1524 /* For triangles, the vector should be (u, v, 1-u-v). */
1525 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1526 PIPE_PRIM_TRIANGLES)
1527 coord[2] = lp_build_sub(bld, bld->one,
1528 lp_build_add(bld, coord[0], coord[1]));
1529
1530 value = lp_build_gather_values(gallivm, coord, 4);
1531 break;
1532 }
1533
1534 case TGSI_SEMANTIC_VERTICESIN:
1535 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1536 value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1537 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1538 value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 7);
1539 else
1540 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1541 break;
1542
1543 case TGSI_SEMANTIC_TESSINNER:
1544 case TGSI_SEMANTIC_TESSOUTER:
1545 {
1546 LLVMValueRef rw_buffers, buffer, base, addr;
1547 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1548
1549 rw_buffers = LLVMGetParam(ctx->main_fn,
1550 ctx->param_rw_buffers);
1551 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1552 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1553
1554 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1555 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1556 LLVMConstInt(ctx->i32, param, 0));
1557
1558 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1559 ~0, buffer, base, addr, true);
1560
1561 break;
1562 }
1563
1564 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1565 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1566 {
1567 LLVMValueRef buf, slot, val[4];
1568 int i, offset;
1569
1570 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1571 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1572 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1573 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1574
1575 for (i = 0; i < 4; i++)
1576 val[i] = buffer_load_const(ctx, buf,
1577 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1578 value = lp_build_gather_values(gallivm, val, 4);
1579 break;
1580 }
1581
1582 case TGSI_SEMANTIC_PRIMID:
1583 value = get_primitive_id(&ctx->bld_base, 0);
1584 break;
1585
1586 case TGSI_SEMANTIC_GRID_SIZE:
1587 value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
1588 break;
1589
1590 case TGSI_SEMANTIC_BLOCK_SIZE:
1591 {
1592 LLVMValueRef values[3];
1593 unsigned i;
1594 unsigned *properties = ctx->shader->selector->info.properties;
1595
1596 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1597 unsigned sizes[3] = {
1598 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1599 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1600 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1601 };
1602
1603 for (i = 0; i < 3; ++i)
1604 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1605
1606 value = lp_build_gather_values(gallivm, values, 3);
1607 } else {
1608 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
1609 }
1610 break;
1611 }
1612
1613 case TGSI_SEMANTIC_BLOCK_ID:
1614 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
1615 break;
1616
1617 case TGSI_SEMANTIC_THREAD_ID:
1618 value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
1619 break;
1620
1621 case TGSI_SEMANTIC_HELPER_INVOCATION:
1622 if (HAVE_LLVM >= 0x0309) {
1623 value = lp_build_intrinsic(gallivm->builder,
1624 "llvm.amdgcn.ps.live",
1625 ctx->i1, NULL, 0,
1626 LP_FUNC_ATTR_READNONE);
1627 value = LLVMBuildNot(gallivm->builder, value, "");
1628 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1629 } else {
1630 assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
1631 return;
1632 }
1633 break;
1634
1635 case TGSI_SEMANTIC_SUBGROUP_SIZE:
1636 value = LLVMConstInt(ctx->i32, 64, 0);
1637 break;
1638
1639 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1640 value = ac_get_thread_id(&ctx->ac);
1641 break;
1642
1643 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1644 {
1645 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1646 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1647 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1648 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1649 break;
1650 }
1651
1652 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1653 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1654 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1655 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1656 {
1657 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1658 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1659 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1660 /* All bits set except LSB */
1661 value = LLVMConstInt(ctx->i64, -2, 0);
1662 } else {
1663 /* All bits set */
1664 value = LLVMConstInt(ctx->i64, -1, 0);
1665 }
1666 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1667 value = LLVMBuildShl(gallivm->builder, value, id, "");
1668 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1669 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1670 value = LLVMBuildNot(gallivm->builder, value, "");
1671 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1672 break;
1673 }
1674
1675 default:
1676 assert(!"unknown system value");
1677 return;
1678 }
1679
1680 ctx->system_values[index] = value;
1681 }
1682
1683 static void declare_compute_memory(struct si_shader_context *ctx,
1684 const struct tgsi_full_declaration *decl)
1685 {
1686 struct si_shader_selector *sel = ctx->shader->selector;
1687 struct gallivm_state *gallivm = &ctx->gallivm;
1688
1689 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1690 LLVMValueRef var;
1691
1692 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1693 assert(decl->Range.First == decl->Range.Last);
1694 assert(!ctx->shared_memory);
1695
1696 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1697 LLVMArrayType(ctx->i8, sel->local_size),
1698 "compute_lds",
1699 LOCAL_ADDR_SPACE);
1700 LLVMSetAlignment(var, 4);
1701
1702 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1703 }
1704
1705 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1706 {
1707 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1708 ctx->param_const_buffers);
1709
1710 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1711 LLVMConstInt(ctx->i32, i, 0));
1712 }
1713
1714 static LLVMValueRef fetch_constant(
1715 struct lp_build_tgsi_context *bld_base,
1716 const struct tgsi_full_src_register *reg,
1717 enum tgsi_opcode_type type,
1718 unsigned swizzle)
1719 {
1720 struct si_shader_context *ctx = si_shader_context(bld_base);
1721 struct lp_build_context *base = &bld_base->base;
1722 const struct tgsi_ind_register *ireg = &reg->Indirect;
1723 unsigned buf, idx;
1724
1725 LLVMValueRef addr, bufp;
1726 LLVMValueRef result;
1727
1728 if (swizzle == LP_CHAN_ALL) {
1729 unsigned chan;
1730 LLVMValueRef values[4];
1731 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1732 values[chan] = fetch_constant(bld_base, reg, type, chan);
1733
1734 return lp_build_gather_values(&ctx->gallivm, values, 4);
1735 }
1736
1737 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1738 idx = reg->Register.Index * 4 + swizzle;
1739
1740 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1741 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
1742 LLVMValueRef index;
1743 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1744 reg->Dimension.Index,
1745 SI_NUM_CONST_BUFFERS);
1746 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1747 } else
1748 bufp = load_const_buffer_desc(ctx, buf);
1749
1750 if (reg->Register.Indirect) {
1751 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1752 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1753 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1754 addr = lp_build_add(&bld_base->uint_bld, addr,
1755 LLVMConstInt(ctx->i32, idx * 4, 0));
1756 } else {
1757 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1758 }
1759
1760 result = buffer_load_const(ctx, bufp, addr);
1761
1762 if (!tgsi_type_is_64bit(type))
1763 result = bitcast(bld_base, type, result);
1764 else {
1765 LLVMValueRef addr2, result2;
1766
1767 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1768 LLVMConstInt(ctx->i32, 4, 0));
1769 result2 = buffer_load_const(ctx, bufp, addr2);
1770
1771 result = si_llvm_emit_fetch_64bit(bld_base, type,
1772 result, result2);
1773 }
1774 return result;
1775 }
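
/* Illustration only: the constant-buffer byte offset computed above, for a
 * direct access to component "swizzle" of constant register "reg" (the
 * helper name is hypothetical):
 *
 *    static unsigned const_byte_offset(unsigned reg, unsigned swizzle)
 *    {
 *            return (reg * 4 + swizzle) * 4; /* 4 dwords/register, 4 bytes each */
 *    }
 *
 * With indirect addressing, the address register value (in vec4 units) is
 * scaled by 16 bytes and added on top.
 */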
1776
1777 /* The upper 16 bits of both inputs must be zero. */
1778 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1779 LLVMValueRef val[2])
1780 {
1781 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1782 LLVMBuildShl(ctx->gallivm.builder, val[1],
1783 LLVMConstInt(ctx->i32, 16, 0),
1784 ""), "");
1785 }
1786
1787 /* The upper 16 bits of the inputs are ignored and will be dropped. */
1788 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1789 LLVMValueRef val[2])
1790 {
1791 LLVMValueRef v[2] = {
1792 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1793 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1794 val[1],
1795 };
1796 return si_llvm_pack_two_int16(ctx, v);
1797 }
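
/* Illustration only (not part of the driver): a host-side sketch of the two
 * packing helpers above, assuming 32-bit unsigned arithmetic:
 *
 *    static unsigned pack_two_int16(unsigned lo, unsigned hi)
 *    {
 *            return lo | (hi << 16); /* lo must already fit in 16 bits */
 *    }
 *
 *    static unsigned pack_two_int32_as_int16(unsigned lo, unsigned hi)
 *    {
 *            /* hi's upper 16 bits are shifted out of the 32-bit result */
 *            return pack_two_int16(lo & 0xffff, hi);
 *    }
 */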
1798
1799 /* Initialize arguments for the shader export intrinsic */
1800 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1801 LLVMValueRef *values,
1802 unsigned target,
1803 struct ac_export_args *args)
1804 {
1805 struct si_shader_context *ctx = si_shader_context(bld_base);
1806 struct lp_build_context *base = &bld_base->base;
1807 LLVMBuilderRef builder = ctx->gallivm.builder;
1808 LLVMValueRef val[4];
1809 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1810 unsigned chan;
1811 bool is_int8, is_int10;
1812
1813 /* Default is 0xf. Adjusted below depending on the format. */
1814 args->enabled_channels = 0xf; /* writemask */
1815
1816 /* Specify whether the EXEC mask represents the valid mask */
1817 args->valid_mask = 0;
1818
1819 /* Specify whether this is the last export */
1820 args->done = 0;
1821
1822 /* Specify the target we are exporting */
1823 args->target = target;
1824
1825 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1826 const struct si_shader_key *key = &ctx->shader->key;
1827 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1828 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1829
1830 assert(cbuf >= 0 && cbuf < 8);
1831 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1832 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1833 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1834 }
1835
1836 args->compr = false;
1837 args->out[0] = base->undef;
1838 args->out[1] = base->undef;
1839 args->out[2] = base->undef;
1840 args->out[3] = base->undef;
1841
1842 switch (spi_shader_col_format) {
1843 case V_028714_SPI_SHADER_ZERO:
1844 args->enabled_channels = 0; /* writemask */
1845 args->target = V_008DFC_SQ_EXP_NULL;
1846 break;
1847
1848 case V_028714_SPI_SHADER_32_R:
1849 args->enabled_channels = 1; /* writemask */
1850 args->out[0] = values[0];
1851 break;
1852
1853 case V_028714_SPI_SHADER_32_GR:
1854 args->enabled_channels = 0x3; /* writemask */
1855 args->out[0] = values[0];
1856 args->out[1] = values[1];
1857 break;
1858
1859 case V_028714_SPI_SHADER_32_AR:
1860 args->enabled_channels = 0x9; /* writemask */
1861 args->out[0] = values[0];
1862 args->out[3] = values[3];
1863 break;
1864
1865 case V_028714_SPI_SHADER_FP16_ABGR:
1866 args->compr = 1; /* COMPR flag */
1867
1868 for (chan = 0; chan < 2; chan++) {
1869 LLVMValueRef pack_args[2] = {
1870 values[2 * chan],
1871 values[2 * chan + 1]
1872 };
1873 LLVMValueRef packed;
1874
1875 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1876 args->out[chan] =
1877 LLVMBuildBitCast(ctx->gallivm.builder,
1878 packed, ctx->f32, "");
1879 }
1880 break;
1881
1882 case V_028714_SPI_SHADER_UNORM16_ABGR:
1883 for (chan = 0; chan < 4; chan++) {
1884 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1885 val[chan] = LLVMBuildFMul(builder, val[chan],
1886 LLVMConstReal(ctx->f32, 65535), "");
1887 val[chan] = LLVMBuildFAdd(builder, val[chan],
1888 LLVMConstReal(ctx->f32, 0.5), "");
1889 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1890 ctx->i32, "");
1891 }
1892
1893 args->compr = 1; /* COMPR flag */
1894 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1895 si_llvm_pack_two_int16(ctx, val));
1896 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1897 si_llvm_pack_two_int16(ctx, val+2));
1898 break;
1899
1900 case V_028714_SPI_SHADER_SNORM16_ABGR:
1901 for (chan = 0; chan < 4; chan++) {
1902 /* Clamp to [-1, 1]. */
1903 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1904 values[chan],
1905 LLVMConstReal(ctx->f32, 1));
1906 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1907 val[chan],
1908 LLVMConstReal(ctx->f32, -1));
1909 /* Convert to a signed integer in [-32767, 32767]. */
1910 val[chan] = LLVMBuildFMul(builder, val[chan],
1911 LLVMConstReal(ctx->f32, 32767), "");
1912 /* If positive, add 0.5, else add -0.5. */
1913 val[chan] = LLVMBuildFAdd(builder, val[chan],
1914 LLVMBuildSelect(builder,
1915 LLVMBuildFCmp(builder, LLVMRealOGE,
1916 val[chan], base->zero, ""),
1917 LLVMConstReal(ctx->f32, 0.5),
1918 LLVMConstReal(ctx->f32, -0.5), ""), "");
1919 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1920 }
1921
1922 args->compr = 1; /* COMPR flag */
1923 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1924 si_llvm_pack_two_int32_as_int16(ctx, val));
1925 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1926 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1927 break;
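
	/* Illustration only: scalar equivalents of the UNORM16/SNORM16
	 * conversions above, assuming IEEE float inputs:
	 *
	 *    static unsigned float_to_unorm16(float x)
	 *    {
	 *            x = x < 0 ? 0 : (x > 1 ? 1 : x);   /* clamp to [0, 1] */
	 *            return (unsigned)(x * 65535.0f + 0.5f);
	 *    }
	 *
	 *    static int float_to_snorm16(float x)
	 *    {
	 *            x = x < -1 ? -1 : (x > 1 ? 1 : x); /* clamp to [-1, 1] */
	 *            x *= 32767.0f;
	 *            /* round away from zero, as the select above does */
	 *            return (int)(x + (x >= 0 ? 0.5f : -0.5f));
	 *    }
	 */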
1928
1929 case V_028714_SPI_SHADER_UINT16_ABGR: {
1930 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1931 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1932 LLVMValueRef max_alpha =
1933 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1934
1935 /* Clamp. */
1936 for (chan = 0; chan < 4; chan++) {
1937 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1938 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1939 val[chan],
1940 chan == 3 ? max_alpha : max_rgb);
1941 }
1942
1943 args->compr = 1; /* COMPR flag */
1944 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1945 si_llvm_pack_two_int16(ctx, val));
1946 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1947 si_llvm_pack_two_int16(ctx, val+2));
1948 break;
1949 }
1950
1951 case V_028714_SPI_SHADER_SINT16_ABGR: {
1952 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1953 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1954 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1955 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1956 LLVMValueRef max_alpha =
1957 !is_int10 ? max_rgb : ctx->i32_1;
1958 LLVMValueRef min_alpha =
1959 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1960
1961 /* Clamp. */
1962 for (chan = 0; chan < 4; chan++) {
1963 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1964 val[chan] = lp_build_emit_llvm_binary(bld_base,
1965 TGSI_OPCODE_IMIN,
1966 val[chan], chan == 3 ? max_alpha : max_rgb);
1967 val[chan] = lp_build_emit_llvm_binary(bld_base,
1968 TGSI_OPCODE_IMAX,
1969 val[chan], chan == 3 ? min_alpha : min_rgb);
1970 }
1971
1972 args->compr = 1; /* COMPR flag */
1973 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1974 si_llvm_pack_two_int32_as_int16(ctx, val));
1975 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1976 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1977 break;
1978 }
1979
1980 case V_028714_SPI_SHADER_32_ABGR:
1981 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1982 break;
1983 }
1984 }
1985
1986 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1987 LLVMValueRef alpha)
1988 {
1989 struct si_shader_context *ctx = si_shader_context(bld_base);
1990
1991 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1992 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
1993 SI_PARAM_ALPHA_REF);
1994
1995 LLVMValueRef alpha_pass =
1996 lp_build_cmp(&bld_base->base,
1997 ctx->shader->key.part.ps.epilog.alpha_func,
1998 alpha, alpha_ref);
1999 LLVMValueRef arg =
2000 lp_build_select(&bld_base->base,
2001 alpha_pass,
2002 LLVMConstReal(ctx->f32, 1.0f),
2003 LLVMConstReal(ctx->f32, -1.0f));
2004
2005 ac_build_kill(&ctx->ac, arg);
2006 } else {
2007 ac_build_kill(&ctx->ac, NULL);
2008 }
2009 }
2010
2011 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2012 LLVMValueRef alpha,
2013 unsigned samplemask_param)
2014 {
2015 struct si_shader_context *ctx = si_shader_context(bld_base);
2016 struct gallivm_state *gallivm = &ctx->gallivm;
2017 LLVMValueRef coverage;
2018
2019 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2020 coverage = LLVMGetParam(ctx->main_fn,
2021 samplemask_param);
2022 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2023
2024 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2025 ctx->i32,
2026 &coverage, 1, LP_FUNC_ATTR_READNONE);
2027
2028 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2029 ctx->f32, "");
2030
2031 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2032 LLVMConstReal(ctx->f32,
2033 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2034
2035 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2036 }
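
/* Illustration only: the scalar computation performed above, assuming the
 * coverage mask has one bit per sample (__builtin_popcount is a host-side
 * stand-in for llvm.ctpop.i32):
 *
 *    static float smooth_alpha(float alpha, unsigned coverage_mask)
 *    {
 *            return alpha * __builtin_popcount(coverage_mask) /
 *                   (float)SI_NUM_SMOOTH_AA_SAMPLES;
 *    }
 */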
2037
2038 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2039 struct ac_export_args *pos, LLVMValueRef *out_elts)
2040 {
2041 struct si_shader_context *ctx = si_shader_context(bld_base);
2042 struct lp_build_context *base = &bld_base->base;
2043 unsigned reg_index;
2044 unsigned chan;
2045 unsigned const_chan;
2046 LLVMValueRef base_elt;
2047 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2048 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2049 SI_VS_CONST_CLIP_PLANES, 0);
2050 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2051
2052 for (reg_index = 0; reg_index < 2; reg_index ++) {
2053 struct ac_export_args *args = &pos[2 + reg_index];
2054
2055 args->out[0] =
2056 args->out[1] =
2057 args->out[2] =
2058 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2059
2060 /* Compute dot products of position and user clip plane vectors */
2061 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2062 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2063 LLVMValueRef addr =
2064 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2065 const_chan) * 4, 0);
2066 base_elt = buffer_load_const(ctx, const_resource,
2067 addr);
2068 args->out[chan] =
2069 lp_build_add(base, args->out[chan],
2070 lp_build_mul(base, base_elt,
2071 out_elts[const_chan]));
2072 }
2073 }
2074
2075 args->enabled_channels = 0xf;
2076 args->valid_mask = 0;
2077 args->done = 0;
2078 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2079 args->compr = 0;
2080 }
2081 }
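
/* Illustration only: each clip-distance channel exported above is a plain
 * dot product of the clip vertex with one user clip plane:
 *
 *    out[chan] = plane.x * pos.x + plane.y * pos.y +
 *                plane.z * pos.z + plane.w * pos.w;
 *
 * where each plane occupies 4 consecutive dwords (16 bytes) of the
 * SI_VS_CONST_CLIP_PLANES constant buffer.
 */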
2082
2083 static void si_dump_streamout(struct pipe_stream_output_info *so)
2084 {
2085 unsigned i;
2086
2087 if (so->num_outputs)
2088 fprintf(stderr, "STREAMOUT\n");
2089
2090 for (i = 0; i < so->num_outputs; i++) {
2091 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2092 so->output[i].start_component;
2093 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2094 i, so->output[i].output_buffer,
2095 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2096 so->output[i].register_index,
2097 mask & 1 ? "x" : "",
2098 mask & 2 ? "y" : "",
2099 mask & 4 ? "z" : "",
2100 mask & 8 ? "w" : "");
2101 }
2102 }
2103
2104 static void emit_streamout_output(struct si_shader_context *ctx,
2105 LLVMValueRef const *so_buffers,
2106 LLVMValueRef const *so_write_offsets,
2107 struct pipe_stream_output *stream_out,
2108 struct si_shader_output_values *shader_out)
2109 {
2110 struct gallivm_state *gallivm = &ctx->gallivm;
2111 LLVMBuilderRef builder = gallivm->builder;
2112 unsigned buf_idx = stream_out->output_buffer;
2113 unsigned start = stream_out->start_component;
2114 unsigned num_comps = stream_out->num_components;
2115 LLVMValueRef out[4];
2116
2117 assert(num_comps && num_comps <= 4);
2118 if (!num_comps || num_comps > 4)
2119 return;
2120
2121 /* Load the output as int. */
2122 for (int j = 0; j < num_comps; j++) {
2123 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2124
2125 out[j] = LLVMBuildBitCast(builder,
2126 shader_out->values[start + j],
2127 ctx->i32, "");
2128 }
2129
2130 /* Pack the output. */
2131 LLVMValueRef vdata = NULL;
2132
2133 switch (num_comps) {
2134 case 1: /* as i32 */
2135 vdata = out[0];
2136 break;
2137 case 2: /* as v2i32 */
2138 case 3: /* as v4i32 (aligned to 4) */
2139 case 4: /* as v4i32 */
2140 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2141 for (int j = 0; j < num_comps; j++) {
2142 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2143 LLVMConstInt(ctx->i32, j, 0), "");
2144 }
2145 break;
2146 }
2147
2148 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2149 vdata, num_comps,
2150 so_write_offsets[buf_idx],
2151 ctx->i32_0,
2152 stream_out->dst_offset * 4, 1, 1, true, false);
2153 }
2154
2155 /**
2156 * Write streamout data to buffers for vertex stream @p stream (different
2157 * vertex streams can occur for GS copy shaders).
2158 */
2159 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2160 struct si_shader_output_values *outputs,
2161 unsigned noutput, unsigned stream)
2162 {
2163 struct si_shader_selector *sel = ctx->shader->selector;
2164 struct pipe_stream_output_info *so = &sel->so;
2165 struct gallivm_state *gallivm = &ctx->gallivm;
2166 LLVMBuilderRef builder = gallivm->builder;
2167 int i;
2168 struct lp_build_if_state if_ctx;
2169
2170 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2171 LLVMValueRef so_vtx_count =
2172 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2173
2174 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2175
2176 /* can_emit = tid < so_vtx_count; */
2177 LLVMValueRef can_emit =
2178 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2179
2180 /* Emit the streamout code conditionally to avoid out-of-bounds
2181 * buffer accesses. The hardware tells us via the SGPR
2182 * (so_vtx_count) which threads are allowed to emit streamout data. */
2183 lp_build_if(&if_ctx, gallivm, can_emit);
2184 {
2185 /* The buffer offset is computed as follows:
2186 * ByteOffset = streamout_offset[buffer_id]*4 +
2187 * (streamout_write_index + thread_id)*stride[buffer_id] +
2188 * attrib_offset
2189 */
2190
2191 LLVMValueRef so_write_index =
2192 LLVMGetParam(ctx->main_fn,
2193 ctx->param_streamout_write_index);
2194
2195 /* Compute (streamout_write_index + thread_id). */
2196 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2197
2198 /* Load the descriptor and compute the write offset for each
2199 * enabled buffer. */
2200 LLVMValueRef so_write_offset[4] = {};
2201 LLVMValueRef so_buffers[4];
2202 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2203 ctx->param_rw_buffers);
2204
2205 for (i = 0; i < 4; i++) {
2206 if (!so->stride[i])
2207 continue;
2208
2209 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2210 SI_VS_STREAMOUT_BUF0 + i, 0);
2211
2212 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2213
2214 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2215 ctx->param_streamout_offset[i]);
2216 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2217
2218 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2219 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2220 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2221 }
2222
2223 /* Write streamout data. */
2224 for (i = 0; i < so->num_outputs; i++) {
2225 unsigned reg = so->output[i].register_index;
2226
2227 if (reg >= noutput)
2228 continue;
2229
2230 if (stream != so->output[i].stream)
2231 continue;
2232
2233 emit_streamout_output(ctx, so_buffers, so_write_offset,
2234 &so->output[i], &outputs[reg]);
2235 }
2236 }
2237 lp_build_endif(&if_ctx);
2238 }
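
/* Illustration only: the per-thread byte offset used by the stores in
 * si_llvm_emit_streamout above, as scalar C (streamout_offset[] and
 * stride_dw[] stand in for the SGPR inputs and so->stride[]; the names are
 * illustrative):
 *
 *    static unsigned so_byte_offset(unsigned buf, unsigned write_index,
 *                                   unsigned tid, unsigned dst_offset_dw)
 *    {
 *            return streamout_offset[buf] * 4 +
 *                   (write_index + tid) * stride_dw[buf] * 4 +
 *                   dst_offset_dw * 4; /* attrib_offset */
 *    }
 */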
2239
2240
2241 /* Generate export instructions for hardware VS shader stage */
2242 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2243 struct si_shader_output_values *outputs,
2244 unsigned noutput)
2245 {
2246 struct si_shader_context *ctx = si_shader_context(bld_base);
2247 struct si_shader *shader = ctx->shader;
2248 struct lp_build_context *base = &bld_base->base;
2249 struct ac_export_args args, pos_args[4] = {};
2250 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2251 unsigned semantic_name, semantic_index;
2252 unsigned target;
2253 unsigned param_count = 0;
2254 unsigned pos_idx;
2255 int i;
2256
2257 for (i = 0; i < noutput; i++) {
2258 semantic_name = outputs[i].semantic_name;
2259 semantic_index = outputs[i].semantic_index;
2260 bool export_param = true;
2261
2262 switch (semantic_name) {
2263 case TGSI_SEMANTIC_POSITION: /* ignore these */
2264 case TGSI_SEMANTIC_PSIZE:
2265 case TGSI_SEMANTIC_CLIPVERTEX:
2266 case TGSI_SEMANTIC_EDGEFLAG:
2267 break;
2268 case TGSI_SEMANTIC_GENERIC:
2269 case TGSI_SEMANTIC_CLIPDIST:
2270 if (shader->key.opt.hw_vs.kill_outputs &
2271 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2272 export_param = false;
2273 break;
2274 default:
2275 if (shader->key.opt.hw_vs.kill_outputs2 &
2276 (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2277 export_param = false;
2278 break;
2279 }
2280
2281 if (outputs[i].vertex_stream[0] != 0 &&
2282 outputs[i].vertex_stream[1] != 0 &&
2283 outputs[i].vertex_stream[2] != 0 &&
2284 outputs[i].vertex_stream[3] != 0)
2285 export_param = false;
2286
2287 handle_semantic:
2288 /* Select the correct target */
2289 switch(semantic_name) {
2290 case TGSI_SEMANTIC_PSIZE:
2291 psize_value = outputs[i].values[0];
2292 continue;
2293 case TGSI_SEMANTIC_EDGEFLAG:
2294 edgeflag_value = outputs[i].values[0];
2295 continue;
2296 case TGSI_SEMANTIC_LAYER:
2297 layer_value = outputs[i].values[0];
2298 semantic_name = TGSI_SEMANTIC_GENERIC;
2299 goto handle_semantic;
2300 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2301 viewport_index_value = outputs[i].values[0];
2302 semantic_name = TGSI_SEMANTIC_GENERIC;
2303 goto handle_semantic;
2304 case TGSI_SEMANTIC_POSITION:
2305 target = V_008DFC_SQ_EXP_POS;
2306 break;
2307 case TGSI_SEMANTIC_CLIPDIST:
2308 if (shader->key.opt.hw_vs.clip_disable) {
2309 semantic_name = TGSI_SEMANTIC_GENERIC;
2310 goto handle_semantic;
2311 }
2312 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2313 break;
2314 case TGSI_SEMANTIC_CLIPVERTEX:
2315 if (shader->key.opt.hw_vs.clip_disable)
2316 continue;
2317 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2318 continue;
2319 case TGSI_SEMANTIC_COLOR:
2320 case TGSI_SEMANTIC_BCOLOR:
2321 case TGSI_SEMANTIC_PRIMID:
2322 case TGSI_SEMANTIC_FOG:
2323 case TGSI_SEMANTIC_TEXCOORD:
2324 case TGSI_SEMANTIC_GENERIC:
2325 if (!export_param)
2326 continue;
2327 target = V_008DFC_SQ_EXP_PARAM + param_count;
2328 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2329 shader->info.vs_output_param_offset[i] = param_count;
2330 param_count++;
2331 break;
2332 default:
2333 target = 0;
2334 fprintf(stderr,
2335 "Warning: SI unhandled vs output type:%d\n",
2336 semantic_name);
2337 }
2338
2339 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2340
2341 if (target >= V_008DFC_SQ_EXP_POS &&
2342 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2343 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2344 &args, sizeof(args));
2345 } else {
2346 ac_build_export(&ctx->ac, &args);
2347 }
2348
2349 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2350 semantic_name = TGSI_SEMANTIC_GENERIC;
2351 goto handle_semantic;
2352 }
2353 }
2354
2355 shader->info.nr_param_exports = param_count;
2356
2357 /* We need to add the position output manually if it's missing. */
2358 if (!pos_args[0].out[0]) {
2359 pos_args[0].enabled_channels = 0xf; /* writemask */
2360 pos_args[0].valid_mask = 0; /* EXEC mask */
2361 pos_args[0].done = 0; /* last export? */
2362 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2363 pos_args[0].compr = 0; /* COMPR flag */
2364 pos_args[0].out[0] = base->zero; /* X */
2365 pos_args[0].out[1] = base->zero; /* Y */
2366 pos_args[0].out[2] = base->zero; /* Z */
2367 pos_args[0].out[3] = base->one; /* W */
2368 }
2369
2370 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2371 if (shader->selector->info.writes_psize ||
2372 shader->selector->info.writes_edgeflag ||
2373 shader->selector->info.writes_viewport_index ||
2374 shader->selector->info.writes_layer) {
2375 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2376 (shader->selector->info.writes_edgeflag << 1) |
2377 (shader->selector->info.writes_layer << 2) |
2378 (shader->selector->info.writes_viewport_index << 3);
2379 pos_args[1].valid_mask = 0; /* EXEC mask */
2380 pos_args[1].done = 0; /* last export? */
2381 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2382 pos_args[1].compr = 0; /* COMPR flag */
2383 pos_args[1].out[0] = base->zero; /* X */
2384 pos_args[1].out[1] = base->zero; /* Y */
2385 pos_args[1].out[2] = base->zero; /* Z */
2386 pos_args[1].out[3] = base->zero; /* W */
2387
2388 if (shader->selector->info.writes_psize)
2389 pos_args[1].out[0] = psize_value;
2390
2391 if (shader->selector->info.writes_edgeflag) {
2392 /* The output is a float, but the hw expects an integer
2393 * with the first bit containing the edge flag. */
2394 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2395 edgeflag_value,
2396 ctx->i32, "");
2397 edgeflag_value = lp_build_min(&bld_base->int_bld,
2398 edgeflag_value,
2399 ctx->i32_1);
2400
2401 /* The LLVM intrinsic expects a float. */
2402 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2403 edgeflag_value,
2404 ctx->f32, "");
2405 }
2406
2407 if (shader->selector->info.writes_layer)
2408 pos_args[1].out[2] = layer_value;
2409
2410 if (shader->selector->info.writes_viewport_index)
2411 pos_args[1].out[3] = viewport_index_value;
2412 }
2413
2414 for (i = 0; i < 4; i++)
2415 if (pos_args[i].out[0])
2416 shader->info.nr_pos_exports++;
2417
2418 pos_idx = 0;
2419 for (i = 0; i < 4; i++) {
2420 if (!pos_args[i].out[0])
2421 continue;
2422
2423 /* Specify the target we are exporting */
2424 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2425
2426 if (pos_idx == shader->info.nr_pos_exports)
2427 /* Specify that this is the last export */
2428 pos_args[i].done = 1;
2429
2430 ac_build_export(&ctx->ac, &pos_args[i]);
2431 }
2432 }
2433
2434 /**
2435 * Forward all outputs from the vertex shader to the TES. This is only used
2436 * for the fixed function TCS.
2437 */
2438 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2439 {
2440 struct si_shader_context *ctx = si_shader_context(bld_base);
2441 struct gallivm_state *gallivm = &ctx->gallivm;
2442 LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2443 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2444 uint64_t inputs;
2445
2446 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2447
2448 rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2449 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2450 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2451
2452 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2453
2454 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2455 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2456 lds_vertex_stride, "");
2457 lds_base = get_tcs_in_current_patch_offset(ctx);
2458 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2459
2460 inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2461 while (inputs) {
2462 unsigned i = u_bit_scan64(&inputs);
2463
2464 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2465 LLVMConstInt(ctx->i32, 4 * i, 0),
2466 "");
2467
2468 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2469 get_rel_patch_id(ctx),
2470 invocation_id,
2471 LLVMConstInt(ctx->i32, i, 0));
2472
2473 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2474 lds_ptr);
2475
2476 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2477 buffer_offset, 0, 1, 0, true, false);
2478 }
2479 }
2480
2481 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2482 LLVMValueRef rel_patch_id,
2483 LLVMValueRef invocation_id,
2484 LLVMValueRef tcs_out_current_patch_data_offset)
2485 {
2486 struct si_shader_context *ctx = si_shader_context(bld_base);
2487 struct gallivm_state *gallivm = &ctx->gallivm;
2488 struct si_shader *shader = ctx->shader;
2489 unsigned tess_inner_index, tess_outer_index;
2490 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2491 LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
2492 unsigned stride, outer_comps, inner_comps, i, offset;
2493 struct lp_build_if_state if_ctx, inner_if_ctx;
2494
2495 si_llvm_emit_barrier(NULL, bld_base, NULL);
2496
2497 /* Do this only for invocation 0, because the tess levels are per-patch,
2498 * not per-vertex.
2499 *
2500 * This can't be compiled to a jump, because invocation 0 always executes
2501 * it. It should at least mask out the loads and stores for other invocations.
2502 */
2503 lp_build_if(&if_ctx, gallivm,
2504 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2505 invocation_id, ctx->i32_0, ""));
2506
2507 /* Determine the layout of one tess factor element in the buffer. */
2508 switch (shader->key.part.tcs.epilog.prim_mode) {
2509 case PIPE_PRIM_LINES:
2510 stride = 2; /* 2 dwords, 1 vec2 store */
2511 outer_comps = 2;
2512 inner_comps = 0;
2513 break;
2514 case PIPE_PRIM_TRIANGLES:
2515 stride = 4; /* 4 dwords, 1 vec4 store */
2516 outer_comps = 3;
2517 inner_comps = 1;
2518 break;
2519 case PIPE_PRIM_QUADS:
2520 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2521 outer_comps = 4;
2522 inner_comps = 2;
2523 break;
2524 default:
2525 assert(0);
2526 return;
2527 }
2528
2529 /* Load tess_inner and tess_outer from LDS.
2530 * Any invocation can write them, so we can't get them from a temporary.
2531 */
2532 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2533 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2534
2535 lds_base = tcs_out_current_patch_data_offset;
2536 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2537 LLVMConstInt(ctx->i32,
2538 tess_inner_index * 4, 0), "");
2539 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2540 LLVMConstInt(ctx->i32,
2541 tess_outer_index * 4, 0), "");
2542
2543 for (i = 0; i < 4; i++) {
2544 inner[i] = LLVMGetUndef(ctx->i32);
2545 outer[i] = LLVMGetUndef(ctx->i32);
2546 }
2547
2548 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2549 /* For isolines, the hardware expects tess factors in the
2550 * reverse order from what GLSL / TGSI specify.
2551 */
2552 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2553 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2554 } else {
2555 for (i = 0; i < outer_comps; i++) {
2556 outer[i] = out[i] =
2557 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2558 }
2559 for (i = 0; i < inner_comps; i++) {
2560 inner[i] = out[outer_comps+i] =
2561 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2562 }
2563 }
2564
2565 /* Convert the outputs to vectors for stores. */
2566 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2567 vec1 = NULL;
2568
2569 if (stride > 4)
2570 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2571
2572 /* Get the buffer. */
2573 rw_buffers = LLVMGetParam(ctx->main_fn,
2574 ctx->param_rw_buffers);
2575 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2576 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));
2577
2578 /* Get the offset. */
2579 tf_base = LLVMGetParam(ctx->main_fn,
2580 ctx->param_tcs_factor_offset);
2581 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2582 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2583
2584 lp_build_if(&inner_if_ctx, gallivm,
2585 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2586 rel_patch_id, ctx->i32_0, ""));
2587
2588 /* Store the dynamic HS control word. */
2589 offset = 0;
2590 if (ctx->screen->b.chip_class <= VI) {
2591 ac_build_buffer_store_dword(&ctx->ac, buffer,
2592 LLVMConstInt(ctx->i32, 0x80000000, 0),
2593 1, ctx->i32_0, tf_base,
2594 offset, 1, 0, true, false);
2595 offset += 4;
2596 }
2597
2598 lp_build_endif(&inner_if_ctx);
2599
2600 /* Store the tessellation factors. */
2601 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2602 MIN2(stride, 4), byteoffset, tf_base,
2603 offset, 1, 0, true, false);
2604 offset += 16;
2605 if (vec1)
2606 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2607 stride - 4, byteoffset, tf_base,
2608 offset, 1, 0, true, false);
2609
2610 /* Store the tess factors into the offchip buffer if TES reads them. */
2611 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2612 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2613 LLVMValueRef tf_inner_offset;
2614 unsigned param_outer, param_inner;
2615
2616 buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2617 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2618 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2619
2620 param_outer = si_shader_io_get_unique_index(
2621 TGSI_SEMANTIC_TESSOUTER, 0);
2622 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2623 LLVMConstInt(ctx->i32, param_outer, 0));
2624
2625 outer_vec = lp_build_gather_values(gallivm, outer,
2626 util_next_power_of_two(outer_comps));
2627
2628 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2629 outer_comps, tf_outer_offset,
2630 base, 0, 1, 0, true, false);
2631 if (inner_comps) {
2632 param_inner = si_shader_io_get_unique_index(
2633 TGSI_SEMANTIC_TESSINNER, 0);
2634 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2635 LLVMConstInt(ctx->i32, param_inner, 0));
2636
2637 inner_vec = inner_comps == 1 ? inner[0] :
2638 lp_build_gather_values(gallivm, inner, inner_comps);
2639 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2640 inner_comps, tf_inner_offset,
2641 base, 0, 1, 0, true, false);
2642 }
2643 }
2644
2645 lp_build_endif(&if_ctx);
2646 }
2647
2648 static LLVMValueRef
2649 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2650 unsigned param, unsigned return_index)
2651 {
2652 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2653 LLVMGetParam(ctx->main_fn, param),
2654 return_index, "");
2655 }
2656
2657 static LLVMValueRef
2658 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2659 unsigned param, unsigned return_index)
2660 {
2661 LLVMBuilderRef builder = ctx->gallivm.builder;
2662 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2663
2664 return LLVMBuildInsertValue(builder, ret,
2665 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2666 return_index, "");
2667 }
2668
2669 static LLVMValueRef
2670 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2671 unsigned param, unsigned return_index)
2672 {
2673 LLVMBuilderRef builder = ctx->gallivm.builder;
2674 LLVMValueRef ptr, lo, hi;
2675
2676 ptr = LLVMGetParam(ctx->main_fn, param);
2677 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2678 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2679 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2680 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2681 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2682 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2683 }
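
/* Illustration only: the pointer split above, on the host side, assuming
 * little-endian dword order as on amdgcn:
 *
 *    static void split_ptr(uint64_t addr, uint32_t *lo, uint32_t *hi)
 *    {
 *            *lo = (uint32_t)addr;         /* stored at return_index */
 *            *hi = (uint32_t)(addr >> 32); /* stored at return_index + 1 */
 *    }
 */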
2684
2685 /* This only writes the tessellation factor levels. */
2686 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2687 {
2688 struct si_shader_context *ctx = si_shader_context(bld_base);
2689 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2690 LLVMValueRef offchip_soffset, offchip_layout;
2691
2692 si_copy_tcs_inputs(bld_base);
2693
2694 rel_patch_id = get_rel_patch_id(ctx);
2695 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2696 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2697
2698 /* Return epilog parameters from this function. */
2699 LLVMBuilderRef builder = ctx->gallivm.builder;
2700 LLVMValueRef ret = ctx->return_value;
2701 LLVMValueRef tf_soffset;
2702 unsigned vgpr;
2703
2704 offchip_layout = LLVMGetParam(ctx->main_fn,
2705 ctx->param_tcs_offchip_layout);
2706 offchip_soffset = LLVMGetParam(ctx->main_fn,
2707 ctx->param_tcs_offchip_offset);
2708 tf_soffset = LLVMGetParam(ctx->main_fn,
2709 ctx->param_tcs_factor_offset);
2710
2711 ret = si_insert_input_ptr_as_2xi32(ctx, ret,
2712 ctx->param_rw_buffers, 0);
2713
2714 if (ctx->screen->b.chip_class >= GFX9) {
2715 ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2716 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT, "");
2717 /* Tess offchip and tess factor offsets are at the beginning. */
2718 ret = LLVMBuildInsertValue(builder, ret, offchip_soffset, 2, "");
2719 ret = LLVMBuildInsertValue(builder, ret, tf_soffset, 4, "");
2720 vgpr = 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT + 1;
2721 } else {
2722 ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2723 GFX6_SGPR_TCS_OFFCHIP_LAYOUT, "");
2724 /* Tess offchip and tess factor offsets are after user SGPRs. */
2725 ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
2726 GFX6_TCS_NUM_USER_SGPR, "");
2727 ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2728 GFX6_TCS_NUM_USER_SGPR + 1, "");
2729 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2730 }
2731
2732 /* VGPRs */
2733 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2734 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2735 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2736
2737 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2738 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2739 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2740 ctx->return_value = ret;
2741 }
2742
2743 /* Pass TCS inputs from LS to TCS on GFX9. */
2744 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2745 {
2746 LLVMValueRef ret = ctx->return_value;
2747
2748 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2749 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2750 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2751 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2752 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2753
2754 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2755 8 + SI_SGPR_VS_STATE_BITS);
2756 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2757 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2758 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2759 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2760 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2761 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2762
2763 unsigned desc_param = ctx->param_tcs_out_lds_layout + 2;
2764 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2765 8 + GFX9_SGPR_TCS_CONST_BUFFERS);
2766 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2767 8 + GFX9_SGPR_TCS_SAMPLERS);
2768 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2769 8 + GFX9_SGPR_TCS_IMAGES);
2770 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2771 8 + GFX9_SGPR_TCS_SHADER_BUFFERS);
2772
2773 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2774 ret = si_insert_input_ret_float(ctx, ret,
2775 ctx->param_tcs_patch_id, vgpr++);
2776 ret = si_insert_input_ret_float(ctx, ret,
2777 ctx->param_tcs_rel_ids, vgpr++);
2778 ctx->return_value = ret;
2779 }
2780
2781 /* Pass GS inputs from ES to GS on GFX9. */
2782 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2783 {
2784 LLVMValueRef ret = ctx->return_value;
2785
2786 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2787 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2788 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2789
2790 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2791
2792 unsigned desc_param = ctx->param_vs_state_bits + 1;
2793 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2794 8 + GFX9_SGPR_GS_CONST_BUFFERS);
2795 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2796 8 + GFX9_SGPR_GS_SAMPLERS);
2797 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2798 8 + GFX9_SGPR_GS_IMAGES);
2799 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2800 8 + GFX9_SGPR_GS_SHADER_BUFFERS);
2801
2802 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2803 for (unsigned i = 0; i < 5; i++) {
2804 unsigned param = ctx->param_gs_vtx01_offset + i;
2805 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2806 }
2807 ctx->return_value = ret;
2808 }
2809
2810 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2811 {
2812 struct si_shader_context *ctx = si_shader_context(bld_base);
2813 struct si_shader *shader = ctx->shader;
2814 struct tgsi_shader_info *info = &shader->selector->info;
2815 struct gallivm_state *gallivm = &ctx->gallivm;
2816 unsigned i, chan;
2817 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2818 ctx->param_rel_auto_id);
2819 LLVMValueRef vertex_dw_stride =
2820 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2821 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2822 vertex_dw_stride, "");
2823
2824 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2825 * its inputs from it. */
2826 for (i = 0; i < info->num_outputs; i++) {
2827 LLVMValueRef *out_ptr = ctx->outputs[i];
2828 unsigned name = info->output_semantic_name[i];
2829 unsigned index = info->output_semantic_index[i];
2830
2831 /* The ARB_shader_viewport_layer_array spec contains the
2832 * following issue:
2833 *
2834 * 2) What happens if gl_ViewportIndex or gl_Layer is
2835 * written in the vertex shader and a geometry shader is
2836 * present?
2837 *
2838 * RESOLVED: The value written by the last vertex processing
2839 * stage is used. If the last vertex processing stage
2840 * (vertex, tessellation evaluation or geometry) does not
2841 * statically assign to gl_ViewportIndex or gl_Layer, index
2842 * or layer zero is assumed.
2843 *
2844 * So writes to those outputs in VS-as-LS are simply ignored.
2845 */
2846 if (name == TGSI_SEMANTIC_LAYER ||
2847 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2848 continue;
2849
2850 int param = si_shader_io_get_unique_index(name, index);
2851 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2852 LLVMConstInt(ctx->i32, param * 4, 0), "");
2853
2854 for (chan = 0; chan < 4; chan++) {
2855 lds_store(bld_base, chan, dw_addr,
2856 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2857 }
2858 }
2859
2860 if (ctx->screen->b.chip_class >= GFX9)
2861 si_set_ls_return_value_for_tcs(ctx);
2862 }
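
/* Illustration only: the LDS dword address used by the stores above is
 * conceptually (stride_dw is the per-vertex stride unpacked from
 * vs_state_bits[31:24]):
 *
 *    dw_addr = vertex_id * stride_dw + param * 4 + chan;
 */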
2863
2864 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2865 {
2866 struct si_shader_context *ctx = si_shader_context(bld_base);
2867 struct gallivm_state *gallivm = &ctx->gallivm;
2868 struct si_shader *es = ctx->shader;
2869 struct tgsi_shader_info *info = &es->selector->info;
2870 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2871 ctx->param_es2gs_offset);
2872 LLVMValueRef lds_base = NULL;
2873 unsigned chan;
2874 int i;
2875
2876 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2877 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2878 lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
2879 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2880 }
2881
2882 for (i = 0; i < info->num_outputs; i++) {
2883 LLVMValueRef *out_ptr = ctx->outputs[i];
2884 int param;
2885
2886 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2887 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2888 continue;
2889
2890 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2891 info->output_semantic_index[i]);
2892
2893 for (chan = 0; chan < 4; chan++) {
2894 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2895 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2896
2897 /* GFX9 has the ESGS ring in LDS. */
2898 if (ctx->screen->b.chip_class >= GFX9) {
2899 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
2900 continue;
2901 }
2902
2903 ac_build_buffer_store_dword(&ctx->ac,
2904 ctx->esgs_ring,
2905 out_val, 1, NULL, soffset,
2906 (4 * param + chan) * 4,
2907 1, 1, true, true);
2908 }
2909 }
2910
2911 if (ctx->screen->b.chip_class >= GFX9)
2912 si_set_es_return_value_for_gs(ctx);
2913 }
2914
2915 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2916 {
2917 if (ctx->screen->b.chip_class >= GFX9)
2918 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2919 else
2920 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2921 }
2922
2923 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2924 {
2925 struct si_shader_context *ctx = si_shader_context(bld_base);
2926
2927 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2928 si_get_gs_wave_id(ctx));
2929 }
2930
2931 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2932 {
2933 struct si_shader_context *ctx = si_shader_context(bld_base);
2934 struct gallivm_state *gallivm = &ctx->gallivm;
2935 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2936 struct si_shader_output_values *outputs = NULL;
2937 int i,j;
2938
2939 assert(!ctx->shader->is_gs_copy_shader);
2940
2941 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2942
2943 /* Vertex color clamping.
2944 *
2945 * A state constant loaded from a user data SGPR controls this: an
2946 * IF statement is emitted that clamps all colors if the constant
2947 * is true.
2948 */
2949 if (ctx->type == PIPE_SHADER_VERTEX) {
2950 struct lp_build_if_state if_ctx;
2951 LLVMValueRef cond = NULL;
2952 LLVMValueRef addr, val;
2953
2954 for (i = 0; i < info->num_outputs; i++) {
2955 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2956 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2957 continue;
2958
2959 /* We've found a color. */
2960 if (!cond) {
2961 /* The state is in the first bit of the user SGPR. */
2962 cond = LLVMGetParam(ctx->main_fn,
2963 ctx->param_vs_state_bits);
2964 cond = LLVMBuildTrunc(gallivm->builder, cond,
2965 ctx->i1, "");
2966 lp_build_if(&if_ctx, gallivm, cond);
2967 }
2968
2969 for (j = 0; j < 4; j++) {
2970 addr = ctx->outputs[i][j];
2971 val = LLVMBuildLoad(gallivm->builder, addr, "");
2972 val = ac_build_clamp(&ctx->ac, val);
2973 LLVMBuildStore(gallivm->builder, val, addr);
2974 }
2975 }
2976
2977 if (cond)
2978 lp_build_endif(&if_ctx);
2979 }
2980
2981 for (i = 0; i < info->num_outputs; i++) {
2982 outputs[i].semantic_name = info->output_semantic_name[i];
2983 outputs[i].semantic_index = info->output_semantic_index[i];
2984
2985 for (j = 0; j < 4; j++) {
2986 outputs[i].values[j] =
2987 LLVMBuildLoad(gallivm->builder,
2988 ctx->outputs[i][j],
2989 "");
2990 outputs[i].vertex_stream[j] =
2991 (info->output_streams[i] >> (2 * j)) & 3;
2992 }
2993
2994 }
2995
2996 /* Return the primitive ID from the LLVM function. */
2997 ctx->return_value =
2998 LLVMBuildInsertValue(gallivm->builder,
2999 ctx->return_value,
3000 bitcast(bld_base, TGSI_TYPE_FLOAT,
3001 get_primitive_id(bld_base, 0)),
3002 VS_EPILOG_PRIMID_LOC, "");
3003
3004 if (ctx->shader->selector->so.num_outputs)
3005 si_llvm_emit_streamout(ctx, outputs, i, 0);
3006 si_llvm_export_vs(bld_base, outputs, i);
3007 FREE(outputs);
3008 }
3009
3010 struct si_ps_exports {
3011 unsigned num;
3012 struct ac_export_args args[10];
3013 };
3014
3015 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3016 bool writes_samplemask)
3017 {
3018 if (writes_z) {
3019 /* Z needs 32 bits. */
3020 if (writes_samplemask)
3021 return V_028710_SPI_SHADER_32_ABGR;
3022 else if (writes_stencil)
3023 return V_028710_SPI_SHADER_32_GR;
3024 else
3025 return V_028710_SPI_SHADER_32_R;
3026 } else if (writes_stencil || writes_samplemask) {
3027 /* Both stencil and sample mask need only 16 bits. */
3028 return V_028710_SPI_SHADER_UINT16_ABGR;
3029 } else {
3030 return V_028710_SPI_SHADER_ZERO;
3031 }
3032 }
3033
3034 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3035 LLVMValueRef depth, LLVMValueRef stencil,
3036 LLVMValueRef samplemask, struct si_ps_exports *exp)
3037 {
3038 struct si_shader_context *ctx = si_shader_context(bld_base);
3039 struct lp_build_context *base = &bld_base->base;
3040 struct ac_export_args args;
3041 unsigned mask = 0;
3042 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3043 stencil != NULL,
3044 samplemask != NULL);
3045
3046 assert(depth || stencil || samplemask);
3047
3048 args.valid_mask = 1; /* whether the EXEC mask is valid */
3049 args.done = 1; /* DONE bit */
3050
3051 /* Specify the target we are exporting */
3052 args.target = V_008DFC_SQ_EXP_MRTZ;
3053
3054 args.compr = 0; /* COMPR flag */
3055 args.out[0] = base->undef; /* R, depth */
3056 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3057 args.out[2] = base->undef; /* B, sample mask */
3058 args.out[3] = base->undef; /* A, alpha to mask */
3059
3060 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3061 assert(!depth);
3062 args.compr = 1; /* COMPR flag */
3063
3064 if (stencil) {
3065 /* Stencil should be in X[23:16]. */
3066 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3067 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3068 LLVMConstInt(ctx->i32, 16, 0), "");
3069 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3070 mask |= 0x3;
3071 }
3072 if (samplemask) {
3073 /* SampleMask should be in Y[15:0]. */
3074 args.out[1] = samplemask;
3075 mask |= 0xc;
3076 }
3077 } else {
3078 if (depth) {
3079 args.out[0] = depth;
3080 mask |= 0x1;
3081 }
3082 if (stencil) {
3083 args.out[1] = stencil;
3084 mask |= 0x2;
3085 }
3086 if (samplemask) {
3087 args.out[2] = samplemask;
3088 mask |= 0x4;
3089 }
3090 }
3091
3092 /* SI (except OLAND and HAINAN) has a hardware bug: it only looks
3093 * at the X writemask component. */
3094 if (ctx->screen->b.chip_class == SI &&
3095 ctx->screen->b.family != CHIP_OLAND &&
3096 ctx->screen->b.family != CHIP_HAINAN)
3097 mask |= 0x1;
3098
3099 /* Specify which components to enable */
3100 args.enabled_channels = mask;
3101
3102 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3103 }
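
/* Illustration only: with the compressed (COMPR) UINT16 path above, the
 * packing is
 *
 *    x = stencil << 16; /* stencil test value in X[23:16] */
 *    y = samplemask;    /* sample mask in Y[15:0] */
 *
 * and the writemask bits are set in pairs (0x3, 0xc) because each 32-bit
 * word of a compressed export carries two 16-bit channels.
 */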
3104
3105 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3106 LLVMValueRef *color, unsigned index,
3107 unsigned samplemask_param,
3108 bool is_last, struct si_ps_exports *exp)
3109 {
3110 struct si_shader_context *ctx = si_shader_context(bld_base);
3111 struct lp_build_context *base = &bld_base->base;
3112 int i;
3113
3114 /* Clamp color */
3115 if (ctx->shader->key.part.ps.epilog.clamp_color)
3116 for (i = 0; i < 4; i++)
3117 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3118
3119 /* Alpha to one */
3120 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3121 color[3] = base->one;
3122
3123 /* Alpha test */
3124 if (index == 0 &&
3125 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3126 si_alpha_test(bld_base, color[3]);
3127
3128 /* Line & polygon smoothing */
3129 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3130 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3131 samplemask_param);
3132
3133 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3134 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3135 struct ac_export_args args[8];
3136 int c, last = -1;
3137
3138 /* Get the export arguments, also find out what the last one is. */
3139 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3140 si_llvm_init_export_args(bld_base, color,
3141 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3142 if (args[c].enabled_channels)
3143 last = c;
3144 }
3145
3146 /* Emit all exports. */
3147 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3148 if (is_last && last == c) {
3149 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3150 args[c].done = 1; /* DONE bit */
3151 } else if (!args[c].enabled_channels)
3152 continue; /* unnecessary NULL export */
3153
3154 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3155 }
3156 } else {
3157 struct ac_export_args args;
3158
3159 /* Export */
3160 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3161 &args);
3162 if (is_last) {
3163 args.valid_mask = 1; /* whether the EXEC mask is valid */
3164 args.done = 1; /* DONE bit */
3165 } else if (!args.enabled_channels)
3166 return; /* unnecessary NULL export */
3167
3168 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3169 }
3170 }
3171
3172 static void si_emit_ps_exports(struct si_shader_context *ctx,
3173 struct si_ps_exports *exp)
3174 {
3175 for (unsigned i = 0; i < exp->num; i++)
3176 ac_build_export(&ctx->ac, &exp->args[i]);
3177 }
3178
3179 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3180 {
3181 struct si_shader_context *ctx = si_shader_context(bld_base);
3182 struct lp_build_context *base = &bld_base->base;
3183 struct ac_export_args args;
3184
3185 args.enabled_channels = 0x0; /* enabled channels */
3186 args.valid_mask = 1; /* whether the EXEC mask is valid */
3187 args.done = 1; /* DONE bit */
3188 args.target = V_008DFC_SQ_EXP_NULL;
3189 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3190 args.out[0] = base->undef; /* R */
3191 args.out[1] = base->undef; /* G */
3192 args.out[2] = base->undef; /* B */
3193 args.out[3] = base->undef; /* A */
3194
3195 ac_build_export(&ctx->ac, &args);
3196 }
3197
3198 /**
3199 * Return PS outputs in this order:
3200 *
3201 * v[0:3] = color0.xyzw
3202 * v[4:7] = color1.xyzw
3203 * ...
3204 * vN+0 = Depth
3205 * vN+1 = Stencil
3206 * vN+2 = SampleMask
3207 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3208 *
3209 * The alpha-ref SGPR is returned via its original location.
3210 */
3211 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3212 {
3213 struct si_shader_context *ctx = si_shader_context(bld_base);
3214 struct si_shader *shader = ctx->shader;
3215 struct tgsi_shader_info *info = &shader->selector->info;
3216 LLVMBuilderRef builder = ctx->gallivm.builder;
3217 unsigned i, j, first_vgpr, vgpr;
3218
3219 LLVMValueRef color[8][4] = {};
3220 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3221 LLVMValueRef ret;
3222
3223 /* Read the output values. */
3224 for (i = 0; i < info->num_outputs; i++) {
3225 unsigned semantic_name = info->output_semantic_name[i];
3226 unsigned semantic_index = info->output_semantic_index[i];
3227
3228 switch (semantic_name) {
3229 case TGSI_SEMANTIC_COLOR:
3230 assert(semantic_index < 8);
3231 for (j = 0; j < 4; j++) {
3232 LLVMValueRef ptr = ctx->outputs[i][j];
3233 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3234 color[semantic_index][j] = result;
3235 }
3236 break;
3237 case TGSI_SEMANTIC_POSITION:
3238 depth = LLVMBuildLoad(builder,
3239 ctx->outputs[i][2], "");
3240 break;
3241 case TGSI_SEMANTIC_STENCIL:
3242 stencil = LLVMBuildLoad(builder,
3243 ctx->outputs[i][1], "");
3244 break;
3245 case TGSI_SEMANTIC_SAMPLEMASK:
3246 samplemask = LLVMBuildLoad(builder,
3247 ctx->outputs[i][0], "");
3248 break;
3249 default:
3250 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3251 semantic_name);
3252 }
3253 }
3254
3255 /* Fill the return structure. */
3256 ret = ctx->return_value;
3257
3258 /* Set SGPRs. */
3259 ret = LLVMBuildInsertValue(builder, ret,
3260 bitcast(bld_base, TGSI_TYPE_SIGNED,
3261 LLVMGetParam(ctx->main_fn,
3262 SI_PARAM_ALPHA_REF)),
3263 SI_SGPR_ALPHA_REF, "");
3264
3265 /* Set VGPRs */
3266 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3267 for (i = 0; i < ARRAY_SIZE(color); i++) {
3268 if (!color[i][0])
3269 continue;
3270
3271 for (j = 0; j < 4; j++)
3272 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3273 }
3274 if (depth)
3275 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3276 if (stencil)
3277 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3278 if (samplemask)
3279 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3280
3281 /* Add the input sample mask for smoothing at the end. */
3282 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3283 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3284 ret = LLVMBuildInsertValue(builder, ret,
3285 LLVMGetParam(ctx->main_fn,
3286 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3287
3288 ctx->return_value = ret;
3289 }
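
/* Worked example of the return-value layout described above (illustration
 * only): a fragment shader writing color0, color1 and depth returns
 *
 *    v0..v3 = color0.xyzw
 *    v4..v7 = color1.xyzw
 *    v8     = depth
 *    v13    = SampleMaskIn (padded up to PS_EPILOG_SAMPLEMASK_MIN_LOC)
 *
 * counting VGPRs from first_vgpr = SI_SGPR_ALPHA_REF + 1.
 */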
3290
3291 /**
3292 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3293 * buffer in number of elements and return it as an i32.
3294 */
3295 static LLVMValueRef get_buffer_size(
3296 struct lp_build_tgsi_context *bld_base,
3297 LLVMValueRef descriptor)
3298 {
3299 struct si_shader_context *ctx = si_shader_context(bld_base);
3300 struct gallivm_state *gallivm = &ctx->gallivm;
3301 LLVMBuilderRef builder = gallivm->builder;
3302 LLVMValueRef size =
3303 LLVMBuildExtractElement(builder, descriptor,
3304 LLVMConstInt(ctx->i32, 2, 0), "");
3305
3306 if (ctx->screen->b.chip_class == VI) {
3307 /* On VI, the descriptor contains the size in bytes,
3308 * but TXQ must return the size in elements.
3309 * The stride is always non-zero for resources using TXQ.
3310 */
3311 LLVMValueRef stride =
3312 LLVMBuildExtractElement(builder, descriptor,
3313 ctx->i32_1, "");
3314 stride = LLVMBuildLShr(builder, stride,
3315 LLVMConstInt(ctx->i32, 16, 0), "");
3316 stride = LLVMBuildAnd(builder, stride,
3317 LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3318
3319 size = LLVMBuildUDiv(builder, size, stride, "");
3320 }
3321
3322 return size;
3323 }
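
/* Illustration only: the VI path above in scalar form:
 *
 *    static unsigned buffer_num_elements(unsigned size_bytes, unsigned word1)
 *    {
 *            unsigned stride = (word1 >> 16) & 0x3fff; /* dword1[29:16] */
 *            return size_bytes / stride; /* stride != 0 for TXQ users */
 *    }
 */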
3324
3325 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3326 struct lp_build_tgsi_context *bld_base,
3327 struct lp_build_emit_data *emit_data);
3328
3329 /* Prevent optimizations (at least of memory accesses) across the current
3330 * point in the program by emitting empty inline assembly that is marked as
3331 * having side effects.
3332 *
3333 * Optionally, a value can be passed through the inline assembly to prevent
3334 * LLVM from hoisting calls to ReadNone functions.
3335 */
3336 static void emit_optimization_barrier(struct si_shader_context *ctx,
3337 LLVMValueRef *pvgpr)
3338 {
3339 static int counter = 0;
3340
3341 LLVMBuilderRef builder = ctx->gallivm.builder;
3342 char code[16];
3343
3344 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3345
3346 if (!pvgpr) {
3347 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3348 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3349 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3350 } else {
3351 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3352 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3353 LLVMValueRef vgpr = *pvgpr;
3354 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3355 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3356 LLVMValueRef vgpr0;
3357
3358 assert(vgpr_size % 4 == 0);
3359
3360 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3361 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3362 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3363 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3364 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3365
3366 *pvgpr = vgpr;
3367 }
3368 }
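/* Sketch of the IR this emits, assuming the counter is at 42:
 *
 *   no VGPR:    call void asm sideeffect "; 42", ""()
 *   with VGPR:  %out = call i32 asm sideeffect "; 42", "=v,0"(i32 %in)
 *
 * "=v,0" ties the i32 output to input 0 in a VGPR, so a value threaded
 * through the barrier cannot be hoisted past it, and the unique comment
 * string keeps separate barriers from being merged.
 */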
3369
3370 /* Combine these with & instead of |. */
3371 #define NOOP_WAITCNT 0xf7f
3372 #define LGKM_CNT 0x07f
3373 #define VM_CNT 0xf70
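/* s_waitcnt packs its counters into simm16: vmcnt in bits [3:0],
 * expcnt in bits [6:4], lgkmcnt in bits [11:8] (pre-GFX10 encoding).
 * NOOP_WAITCNT leaves every counter at its maximum, i.e. waits for
 * nothing; ANDing zeroes the field that should be waited on:
 *
 *   NOOP_WAITCNT & VM_CNT            = 0xf70  (vmcnt = 0)
 *   NOOP_WAITCNT & VM_CNT & LGKM_CNT = 0x070  (vmcnt = lgkmcnt = 0)
 */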
3374
3375 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3376 {
3377 struct gallivm_state *gallivm = &ctx->gallivm;
3378 LLVMBuilderRef builder = gallivm->builder;
3379 LLVMValueRef args[1] = {
3380 LLVMConstInt(ctx->i32, simm16, 0)
3381 };
3382 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3383 ctx->voidt, args, 1, 0);
3384 }
3385
3386 static void membar_emit(
3387 const struct lp_build_tgsi_action *action,
3388 struct lp_build_tgsi_context *bld_base,
3389 struct lp_build_emit_data *emit_data)
3390 {
3391 struct si_shader_context *ctx = si_shader_context(bld_base);
3392 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3393 unsigned flags = LLVMConstIntGetZExtValue(src0);
3394 unsigned waitcnt = NOOP_WAITCNT;
3395
3396 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3397 waitcnt &= VM_CNT & LGKM_CNT;
3398
3399 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3400 TGSI_MEMBAR_SHADER_BUFFER |
3401 TGSI_MEMBAR_SHADER_IMAGE))
3402 waitcnt &= VM_CNT;
3403
3404 if (flags & TGSI_MEMBAR_SHARED)
3405 waitcnt &= LGKM_CNT;
3406
3407 if (waitcnt != NOOP_WAITCNT)
3408 emit_waitcnt(ctx, waitcnt);
3409 }
3410
3411 static void clock_emit(
3412 const struct lp_build_tgsi_action *action,
3413 struct lp_build_tgsi_context *bld_base,
3414 struct lp_build_emit_data *emit_data)
3415 {
3416 struct si_shader_context *ctx = si_shader_context(bld_base);
3417 struct gallivm_state *gallivm = &ctx->gallivm;
3418 LLVMValueRef tmp;
3419
3420 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3421 ctx->i64, NULL, 0, 0);
3422 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3423
3424 emit_data->output[0] =
3425 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3426 emit_data->output[1] =
3427 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3428 }
3429
3430 static LLVMValueRef
3431 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3432 const struct tgsi_full_src_register *reg)
3433 {
3434 LLVMValueRef index;
3435 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3436 ctx->param_shader_buffers);
3437
3438 if (!reg->Register.Indirect)
3439 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3440 else
3441 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3442 reg->Register.Index,
3443 SI_NUM_SHADER_BUFFERS);
3444
3445 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3446 }
3447
3448 static bool tgsi_is_array_sampler(unsigned target)
3449 {
3450 return target == TGSI_TEXTURE_1D_ARRAY ||
3451 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3452 target == TGSI_TEXTURE_2D_ARRAY ||
3453 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3454 target == TGSI_TEXTURE_CUBE_ARRAY ||
3455 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3456 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3457 }
3458
3459 static bool tgsi_is_array_image(unsigned target)
3460 {
3461 return target == TGSI_TEXTURE_3D ||
3462 target == TGSI_TEXTURE_CUBE ||
3463 target == TGSI_TEXTURE_1D_ARRAY ||
3464 target == TGSI_TEXTURE_2D_ARRAY ||
3465 target == TGSI_TEXTURE_CUBE_ARRAY ||
3466 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3467 }
3468
3469 /**
3470 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3471 *
3472 * At least on Tonga, executing image stores on images with DCC enabled and
3473 * a non-trivial DCC state can eventually lead to lockups. This can occur when an
3474 * application binds an image as read-only but then uses a shader that writes
3475 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3476 * program termination) in this case, but it doesn't cost much to be a bit
3477 * nicer: disabling DCC in the shader still leads to undefined results but
3478 * avoids the lockup.
3479 */
3480 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3481 LLVMValueRef rsrc)
3482 {
3483 if (ctx->screen->b.chip_class <= CIK) {
3484 return rsrc;
3485 } else {
3486 LLVMBuilderRef builder = ctx->gallivm.builder;
3487 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3488 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3489 LLVMValueRef tmp;
3490
3491 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3492 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3493 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3494 }
3495 }
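/* In effect, this is rsrc.dword[6] &= C_008F28_COMPRESSION_EN, i.e.
 * clearing the COMPRESSION_EN bit of the v8i32 image descriptor so
 * the hardware treats the image as uncompressed.
 */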
3496
3497 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3498 {
3499 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3500 CONST_ADDR_SPACE);
3501 }
3502
3503 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3504 LLVMValueRef list, LLVMValueRef index,
3505 unsigned target)
3506 {
3507 LLVMBuilderRef builder = ctx->gallivm.builder;
3508
3509 if (target == TGSI_TEXTURE_BUFFER) {
3510 index = LLVMBuildMul(builder, index,
3511 LLVMConstInt(ctx->i32, 2, 0), "");
3512 index = LLVMBuildAdd(builder, index,
3513 ctx->i32_1, "");
3514 list = LLVMBuildPointerCast(builder, list,
3515 const_array(ctx->v4i32, 0), "");
3516 }
3517
3518 return ac_build_indexed_load_const(&ctx->ac, list, index);
3519 }
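/* Assumed layout: each image slot in the list is one v8i32, and a
 * buffer image only occupies its second half. Re-viewing the list as
 * v4i32 puts the buffer resource of slot i at v4i32 index 2*i + 1.
 */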
3520
3521 /**
3522 * Load the resource descriptor for \p image.
3523 */
3524 static void
3525 image_fetch_rsrc(
3526 struct lp_build_tgsi_context *bld_base,
3527 const struct tgsi_full_src_register *image,
3528 bool is_store, unsigned target,
3529 LLVMValueRef *rsrc)
3530 {
3531 struct si_shader_context *ctx = si_shader_context(bld_base);
3532 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3533 ctx->param_images);
3534 LLVMValueRef index;
3535 bool dcc_off = is_store;
3536
3537 assert(image->Register.File == TGSI_FILE_IMAGE);
3538
3539 if (!image->Register.Indirect) {
3540 const struct tgsi_shader_info *info = bld_base->info;
3541 unsigned images_writemask = info->images_store |
3542 info->images_atomic;
3543
3544 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3545
3546 if (images_writemask & (1 << image->Register.Index))
3547 dcc_off = true;
3548 } else {
3549 /* From the GL_ARB_shader_image_load_store extension spec:
3550 *
3551 * If a shader performs an image load, store, or atomic
3552 * operation using an image variable declared as an array,
3553 * and if the index used to select an individual element is
3554 * negative or greater than or equal to the size of the
3555 * array, the results of the operation are undefined but may
3556 * not lead to termination.
3557 */
3558 index = get_bounded_indirect_index(ctx, &image->Indirect,
3559 image->Register.Index,
3560 SI_NUM_IMAGES);
3561 }
3562
3563 *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3564 if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3565 *rsrc = force_dcc_off(ctx, *rsrc);
3566 }
3567
3568 static LLVMValueRef image_fetch_coords(
3569 struct lp_build_tgsi_context *bld_base,
3570 const struct tgsi_full_instruction *inst,
3571 unsigned src, LLVMValueRef desc)
3572 {
3573 struct si_shader_context *ctx = si_shader_context(bld_base);
3574 struct gallivm_state *gallivm = &ctx->gallivm;
3575 LLVMBuilderRef builder = gallivm->builder;
3576 unsigned target = inst->Memory.Texture;
3577 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3578 LLVMValueRef coords[4];
3579 LLVMValueRef tmp;
3580 int chan;
3581
3582 for (chan = 0; chan < num_coords; ++chan) {
3583 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3584 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3585 coords[chan] = tmp;
3586 }
3587
3588 if (ctx->screen->b.chip_class >= GFX9) {
3589 /* 1D textures are allocated and used as 2D on GFX9. */
3590 if (target == TGSI_TEXTURE_1D) {
3591 coords[1] = ctx->i32_0;
3592 num_coords++;
3593 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3594 coords[2] = coords[1];
3595 coords[1] = ctx->i32_0;
3596 num_coords++;
3597 } else if (target == TGSI_TEXTURE_2D) {
3598 /* The hw can't bind a slice of a 3D image as a 2D
3599 * image, because it ignores BASE_ARRAY if the target
3600 * is 3D. The workaround is to read BASE_ARRAY and set
3601 * it as the 3rd address operand for all 2D images.
3602 */
3603 LLVMValueRef first_layer, const5, mask;
3604
3605 const5 = LLVMConstInt(ctx->i32, 5, 0);
3606 mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3607 first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3608 first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3609
3610 coords[2] = first_layer;
3611 num_coords++;
3612 }
3613 }
3614
3615 if (num_coords == 1)
3616 return coords[0];
3617
3618 if (num_coords == 3) {
3619 /* LLVM has difficulties lowering 3-element vectors. */
3620 coords[3] = bld_base->uint_bld.undef;
3621 num_coords = 4;
3622 }
3623
3624 return lp_build_gather_values(gallivm, coords, num_coords);
3625 }
3626
3627 /**
3628 * Append the extra mode bits that are used by image load and store.
3629 */
3630 static void image_append_args(
3631 struct si_shader_context *ctx,
3632 struct lp_build_emit_data * emit_data,
3633 unsigned target,
3634 bool atomic,
3635 bool force_glc)
3636 {
3637 const struct tgsi_full_instruction *inst = emit_data->inst;
3638 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3639 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3640 LLVMValueRef r128 = i1false;
3641 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3642 LLVMValueRef glc =
3643 force_glc ||
3644 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3645 i1true : i1false;
3646 LLVMValueRef slc = i1false;
3647 LLVMValueRef lwe = i1false;
3648
3649 if (atomic || (HAVE_LLVM <= 0x0309)) {
3650 emit_data->args[emit_data->arg_count++] = r128;
3651 emit_data->args[emit_data->arg_count++] = da;
3652 if (!atomic) {
3653 emit_data->args[emit_data->arg_count++] = glc;
3654 }
3655 emit_data->args[emit_data->arg_count++] = slc;
3656 return;
3657 }
3658
3659 /* HAVE_LLVM >= 0x0400 and not an atomic */
3660 emit_data->args[emit_data->arg_count++] = glc;
3661 emit_data->args[emit_data->arg_count++] = slc;
3662 emit_data->args[emit_data->arg_count++] = lwe;
3663 emit_data->args[emit_data->arg_count++] = da;
3664 }
3665
3666 /**
3667 * Append the resource and indexing arguments for buffer intrinsics.
3668 *
3669 * \param rsrc the v4i32 buffer resource
3670 * \param index index into the buffer (stride-based)
3671 * \param offset byte offset into the buffer
3672 */
3673 static void buffer_append_args(
3674 struct si_shader_context *ctx,
3675 struct lp_build_emit_data *emit_data,
3676 LLVMValueRef rsrc,
3677 LLVMValueRef index,
3678 LLVMValueRef offset,
3679 bool atomic,
3680 bool force_glc)
3681 {
3682 const struct tgsi_full_instruction *inst = emit_data->inst;
3683 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3684 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3685
3686 emit_data->args[emit_data->arg_count++] = rsrc;
3687 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3688 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3689 if (!atomic) {
3690 emit_data->args[emit_data->arg_count++] =
3691 force_glc ||
3692 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3693 i1true : i1false; /* glc */
3694 }
3695 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3696 }
3697
3698 static void load_fetch_args(
3699 struct lp_build_tgsi_context * bld_base,
3700 struct lp_build_emit_data * emit_data)
3701 {
3702 struct si_shader_context *ctx = si_shader_context(bld_base);
3703 struct gallivm_state *gallivm = &ctx->gallivm;
3704 const struct tgsi_full_instruction * inst = emit_data->inst;
3705 unsigned target = inst->Memory.Texture;
3706 LLVMValueRef rsrc;
3707
3708 emit_data->dst_type = ctx->v4f32;
3709
3710 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3711 LLVMBuilderRef builder = gallivm->builder;
3712 LLVMValueRef offset;
3713 LLVMValueRef tmp;
3714
3715 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3716
3717 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3718 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3719
3720 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3721 offset, false, false);
3722 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3723 LLVMValueRef coords;
3724
3725 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3726 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3727
3728 if (target == TGSI_TEXTURE_BUFFER) {
3729 buffer_append_args(ctx, emit_data, rsrc, coords,
3730 ctx->i32_0, false, false);
3731 } else {
3732 emit_data->args[0] = coords;
3733 emit_data->args[1] = rsrc;
3734 emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3735 emit_data->arg_count = 3;
3736
3737 image_append_args(ctx, emit_data, target, false, false);
3738 }
3739 }
3740 }
3741
3742 static unsigned get_load_intr_attribs(bool readonly_memory)
3743 {
3744 /* READNONE means the intrinsic doesn't touch memory at all, so stores
3745 * can't affect it; READONLY only means it doesn't write, so stores can. */
3746 return readonly_memory && HAVE_LLVM >= 0x0400 ?
3747 LP_FUNC_ATTR_READNONE :
3748 LP_FUNC_ATTR_READONLY;
3749 }
3750
3751 static unsigned get_store_intr_attribs(bool writeonly_memory)
3752 {
3753 return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3754 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3755 LP_FUNC_ATTR_WRITEONLY;
3756 }
3757
3758 static void load_emit_buffer(struct si_shader_context *ctx,
3759 struct lp_build_emit_data *emit_data,
3760 bool readonly_memory)
3761 {
3762 const struct tgsi_full_instruction *inst = emit_data->inst;
3763 struct gallivm_state *gallivm = &ctx->gallivm;
3764 LLVMBuilderRef builder = gallivm->builder;
3765 uint writemask = inst->Dst[0].Register.WriteMask;
3766 uint count = util_last_bit(writemask);
3767 const char *intrinsic_name;
3768 LLVMTypeRef dst_type;
3769
3770 switch (count) {
3771 case 1:
3772 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3773 dst_type = ctx->f32;
3774 break;
3775 case 2:
3776 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3777 dst_type = LLVMVectorType(ctx->f32, 2);
3778 break;
3779 default: /* 3 and 4 */
3780 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3781 dst_type = ctx->v4f32;
3782 count = 4;
3783 }
3784
3785 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3786 builder, intrinsic_name, dst_type,
3787 emit_data->args, emit_data->arg_count,
3788 get_load_intr_attribs(readonly_memory));
3789 }
3790
3791 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3792 const struct tgsi_full_instruction *inst,
3793 LLVMTypeRef type, int arg)
3794 {
3795 struct gallivm_state *gallivm = &ctx->gallivm;
3796 LLVMBuilderRef builder = gallivm->builder;
3797 LLVMValueRef offset, ptr;
3798 int addr_space;
3799
3800 offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3801 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3802
3803 ptr = ctx->shared_memory;
3804 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3805 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3806 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3807
3808 return ptr;
3809 }
3810
3811 static void load_emit_memory(
3812 struct si_shader_context *ctx,
3813 struct lp_build_emit_data *emit_data)
3814 {
3815 const struct tgsi_full_instruction *inst = emit_data->inst;
3816 struct gallivm_state *gallivm = &ctx->gallivm;
3817 LLVMBuilderRef builder = gallivm->builder;
3818 unsigned writemask = inst->Dst[0].Register.WriteMask;
3819 LLVMValueRef channels[4], ptr, derived_ptr, index;
3820 int chan;
3821
3822 ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3823
3824 for (chan = 0; chan < 4; ++chan) {
3825 if (!(writemask & (1 << chan))) {
3826 channels[chan] = LLVMGetUndef(ctx->f32);
3827 continue;
3828 }
3829
3830 index = LLVMConstInt(ctx->i32, chan, 0);
3831 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3832 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3833 }
3834 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3835 }
3836
3837 /**
3838 * Return true if the memory accessed by a LOAD or STORE instruction is
3839 * read-only or write-only, respectively.
3840 *
3841 * \param shader_buffers_reverse_access_mask
3842 * For LOAD, set this to (store | atomic) slot usage in the shader.
3843 * For STORE, set this to (load | atomic) slot usage in the shader.
3844 * \param images_reverse_access_mask Same as above, but for images.
3845 */
3846 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3847 const struct tgsi_shader_info *info,
3848 unsigned shader_buffers_reverse_access_mask,
3849 unsigned images_reverse_access_mask)
3850 {
3851 /* RESTRICT means NOALIAS.
3852 * If there are no writes, we can assume the accessed memory is read-only.
3853 * If there are no reads, we can assume the accessed memory is write-only.
3854 */
3855 if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3856 unsigned reverse_access_mask;
3857
3858 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3859 reverse_access_mask = shader_buffers_reverse_access_mask;
3860 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3861 reverse_access_mask = info->images_buffers &
3862 images_reverse_access_mask;
3863 } else {
3864 reverse_access_mask = ~info->images_buffers &
3865 images_reverse_access_mask;
3866 }
3867
3868 if (inst->Src[0].Register.Indirect) {
3869 if (!reverse_access_mask)
3870 return true;
3871 } else {
3872 if (!(reverse_access_mask &
3873 (1u << inst->Src[0].Register.Index)))
3874 return true;
3875 }
3876 }
3877
3878 /* If there are no buffer writes (for both shader buffers & image
3879 * buffers), it implies that buffer memory is read-only.
3880 * If there are no buffer reads (for both shader buffers & image
3881 * buffers), it implies that buffer memory is write-only.
3882 *
3883 * Same for the case when there are no writes/reads for non-buffer
3884 * images.
3885 */
3886 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3887 (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3888 inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3889 if (!shader_buffers_reverse_access_mask &&
3890 !(info->images_buffers & images_reverse_access_mask))
3891 return true;
3892 } else {
3893 if (!(~info->images_buffers & images_reverse_access_mask))
3894 return true;
3895 }
3896 return false;
3897 }
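/* Worked example: a shader that only LOADs from images and never
 * stores to or atomically updates any of them has
 * (info->images_store | info->images_atomic) == 0, so every mask
 * above is 0, the function returns true, and the loads can carry
 * the READNONE/READONLY attribute.
 */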
3898
3899 static void load_emit(
3900 const struct lp_build_tgsi_action *action,
3901 struct lp_build_tgsi_context *bld_base,
3902 struct lp_build_emit_data *emit_data)
3903 {
3904 struct si_shader_context *ctx = si_shader_context(bld_base);
3905 struct gallivm_state *gallivm = &ctx->gallivm;
3906 LLVMBuilderRef builder = gallivm->builder;
3907 const struct tgsi_full_instruction * inst = emit_data->inst;
3908 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3909 char intrinsic_name[64];
3910 bool readonly_memory = false;
3911
3912 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3913 load_emit_memory(ctx, emit_data);
3914 return;
3915 }
3916
3917 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3918 emit_waitcnt(ctx, VM_CNT);
3919
3920 readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3921 is_oneway_access_only(inst, info,
3922 info->shader_buffers_store |
3923 info->shader_buffers_atomic,
3924 info->images_store |
3925 info->images_atomic);
3926
3927 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3928 load_emit_buffer(ctx, emit_data, readonly_memory);
3929 return;
3930 }
3931
3932 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3933 emit_data->output[emit_data->chan] =
3934 lp_build_intrinsic(
3935 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3936 emit_data->args, emit_data->arg_count,
3937 get_load_intr_attribs(readonly_memory));
3938 } else {
3939 ac_get_image_intr_name("llvm.amdgcn.image.load",
3940 emit_data->dst_type, /* vdata */
3941 LLVMTypeOf(emit_data->args[0]), /* coords */
3942 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3943 intrinsic_name, sizeof(intrinsic_name));
3944
3945 emit_data->output[emit_data->chan] =
3946 lp_build_intrinsic(
3947 builder, intrinsic_name, emit_data->dst_type,
3948 emit_data->args, emit_data->arg_count,
3949 get_load_intr_attribs(readonly_memory));
3950 }
3951 }
3952
3953 static void store_fetch_args(
3954 struct lp_build_tgsi_context * bld_base,
3955 struct lp_build_emit_data * emit_data)
3956 {
3957 struct si_shader_context *ctx = si_shader_context(bld_base);
3958 struct gallivm_state *gallivm = &ctx->gallivm;
3959 LLVMBuilderRef builder = gallivm->builder;
3960 const struct tgsi_full_instruction * inst = emit_data->inst;
3961 struct tgsi_full_src_register memory;
3962 LLVMValueRef chans[4];
3963 LLVMValueRef data;
3964 LLVMValueRef rsrc;
3965 unsigned chan;
3966
3967 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3968
3969 for (chan = 0; chan < 4; ++chan) {
3970 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3971 }
3972 data = lp_build_gather_values(gallivm, chans, 4);
3973
3974 emit_data->args[emit_data->arg_count++] = data;
3975
3976 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3977
3978 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3979 LLVMValueRef offset;
3980 LLVMValueRef tmp;
3981
3982 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3983
3984 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3985 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3986
3987 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3988 offset, false, false);
3989 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3990 unsigned target = inst->Memory.Texture;
3991 LLVMValueRef coords;
3992
3993 /* 8bit/16bit TC L1 write corruption bug on SI.
3994 * All store opcodes not aligned to a dword are affected.
3995 *
3996 * The only way to get unaligned stores in radeonsi is through
3997 * shader images.
3998 */
3999 bool force_glc = ctx->screen->b.chip_class == SI;
4000
4001 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
4002 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
4003
4004 if (target == TGSI_TEXTURE_BUFFER) {
4005 buffer_append_args(ctx, emit_data, rsrc, coords,
4006 ctx->i32_0, false, force_glc);
4007 } else {
4008 emit_data->args[1] = coords;
4009 emit_data->args[2] = rsrc;
4010 emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
4011 emit_data->arg_count = 4;
4012
4013 image_append_args(ctx, emit_data, target, false, force_glc);
4014 }
4015 }
4016 }
4017
4018 static void store_emit_buffer(
4019 struct si_shader_context *ctx,
4020 struct lp_build_emit_data *emit_data,
4021 bool writeonly_memory)
4022 {
4023 const struct tgsi_full_instruction *inst = emit_data->inst;
4024 struct gallivm_state *gallivm = &ctx->gallivm;
4025 LLVMBuilderRef builder = gallivm->builder;
4026 LLVMValueRef base_data = emit_data->args[0];
4027 LLVMValueRef base_offset = emit_data->args[3];
4028 unsigned writemask = inst->Dst[0].Register.WriteMask;
4029
4030 while (writemask) {
4031 int start, count;
4032 const char *intrinsic_name;
4033 LLVMValueRef data;
4034 LLVMValueRef offset;
4035 LLVMValueRef tmp;
4036
4037 u_bit_scan_consecutive_range(&writemask, &start, &count);
4038
4039 /* Due to an LLVM limitation, split 3-element writes
4040 * into a 2-element and a 1-element write. */
4041 if (count == 3) {
4042 writemask |= 1 << (start + 2);
4043 count = 2;
4044 }
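/* E.g. writemask 0b0111 (xyz): this pass stores xy as one
 * v2f32 and re-arms bit z, so the next pass stores z as a
 * scalar f32 at offset start*4 = 8.
 */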
4045
4046 if (count == 4) {
4047 data = base_data;
4048 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
4049 } else if (count == 2) {
4050 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
4051
4052 tmp = LLVMBuildExtractElement(
4053 builder, base_data,
4054 LLVMConstInt(ctx->i32, start, 0), "");
4055 data = LLVMBuildInsertElement(
4056 builder, LLVMGetUndef(v2f32), tmp,
4057 ctx->i32_0, "");
4058
4059 tmp = LLVMBuildExtractElement(
4060 builder, base_data,
4061 LLVMConstInt(ctx->i32, start + 1, 0), "");
4062 data = LLVMBuildInsertElement(
4063 builder, data, tmp, ctx->i32_1, "");
4064
4065 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
4066 } else {
4067 assert(count == 1);
4068 data = LLVMBuildExtractElement(
4069 builder, base_data,
4070 LLVMConstInt(ctx->i32, start, 0), "");
4071 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
4072 }
4073
4074 offset = base_offset;
4075 if (start != 0) {
4076 offset = LLVMBuildAdd(
4077 builder, offset,
4078 LLVMConstInt(ctx->i32, start * 4, 0), "");
4079 }
4080
4081 emit_data->args[0] = data;
4082 emit_data->args[3] = offset;
4083
4084 lp_build_intrinsic(
4085 builder, intrinsic_name, emit_data->dst_type,
4086 emit_data->args, emit_data->arg_count,
4087 get_store_intr_attribs(writeonly_memory));
4088 }
4089 }
4090
4091 static void store_emit_memory(
4092 struct si_shader_context *ctx,
4093 struct lp_build_emit_data *emit_data)
4094 {
4095 const struct tgsi_full_instruction *inst = emit_data->inst;
4096 struct gallivm_state *gallivm = &ctx->gallivm;
4097 LLVMBuilderRef builder = gallivm->builder;
4098 unsigned writemask = inst->Dst[0].Register.WriteMask;
4099 LLVMValueRef ptr, derived_ptr, data, index;
4100 int chan;
4101
4102 ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
4103
4104 for (chan = 0; chan < 4; ++chan) {
4105 if (!(writemask & (1 << chan))) {
4106 continue;
4107 }
4108 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
4109 index = LLVMConstInt(ctx->i32, chan, 0);
4110 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
4111 LLVMBuildStore(builder, data, derived_ptr);
4112 }
4113 }
4114
4115 static void store_emit(
4116 const struct lp_build_tgsi_action *action,
4117 struct lp_build_tgsi_context *bld_base,
4118 struct lp_build_emit_data *emit_data)
4119 {
4120 struct si_shader_context *ctx = si_shader_context(bld_base);
4121 struct gallivm_state *gallivm = &ctx->gallivm;
4122 LLVMBuilderRef builder = gallivm->builder;
4123 const struct tgsi_full_instruction * inst = emit_data->inst;
4124 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
4125 unsigned target = inst->Memory.Texture;
4126 char intrinsic_name[64];
4127 bool writeonly_memory = false;
4128
4129 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
4130 store_emit_memory(ctx, emit_data);
4131 return;
4132 }
4133
4134 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
4135 emit_waitcnt(ctx, VM_CNT);
4136
4137 writeonly_memory = is_oneway_access_only(inst, info,
4138 info->shader_buffers_load |
4139 info->shader_buffers_atomic,
4140 info->images_load |
4141 info->images_atomic);
4142
4143 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4144 store_emit_buffer(ctx, emit_data, writeonly_memory);
4145 return;
4146 }
4147
4148 if (target == TGSI_TEXTURE_BUFFER) {
4149 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4150 builder, "llvm.amdgcn.buffer.store.format.v4f32",
4151 emit_data->dst_type, emit_data->args,
4152 emit_data->arg_count,
4153 get_store_intr_attribs(writeonly_memory));
4154 } else {
4155 ac_get_image_intr_name("llvm.amdgcn.image.store",
4156 LLVMTypeOf(emit_data->args[0]), /* vdata */
4157 LLVMTypeOf(emit_data->args[1]), /* coords */
4158 LLVMTypeOf(emit_data->args[2]), /* rsrc */
4159 intrinsic_name, sizeof(intrinsic_name));
4160
4161 emit_data->output[emit_data->chan] =
4162 lp_build_intrinsic(
4163 builder, intrinsic_name, emit_data->dst_type,
4164 emit_data->args, emit_data->arg_count,
4165 get_store_intr_attribs(writeonly_memory));
4166 }
4167 }
4168
4169 static void atomic_fetch_args(
4170 struct lp_build_tgsi_context * bld_base,
4171 struct lp_build_emit_data * emit_data)
4172 {
4173 struct si_shader_context *ctx = si_shader_context(bld_base);
4174 struct gallivm_state *gallivm = &ctx->gallivm;
4175 LLVMBuilderRef builder = gallivm->builder;
4176 const struct tgsi_full_instruction * inst = emit_data->inst;
4177 LLVMValueRef data1, data2;
4178 LLVMValueRef rsrc;
4179 LLVMValueRef tmp;
4180
4181 emit_data->dst_type = ctx->f32;
4182
4183 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4184 data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4185
4186 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4187 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4188 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4189 }
4190
4191 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4192 * of arguments, which is reversed relative to TGSI (and GLSL)
4193 */
4194 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4195 emit_data->args[emit_data->arg_count++] = data2;
4196 emit_data->args[emit_data->arg_count++] = data1;
4197
4198 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4199 LLVMValueRef offset;
4200
4201 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4202
4203 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4204 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4205
4206 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4207 offset, true, false);
4208 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4209 unsigned target = inst->Memory.Texture;
4210 LLVMValueRef coords;
4211
4212 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4213 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4214
4215 if (target == TGSI_TEXTURE_BUFFER) {
4216 buffer_append_args(ctx, emit_data, rsrc, coords,
4217 ctx->i32_0, true, false);
4218 } else {
4219 emit_data->args[emit_data->arg_count++] = coords;
4220 emit_data->args[emit_data->arg_count++] = rsrc;
4221
4222 image_append_args(ctx, emit_data, target, true, false);
4223 }
4224 }
4225 }
4226
4227 static void atomic_emit_memory(struct si_shader_context *ctx,
4228 struct lp_build_emit_data *emit_data) {
4229 struct gallivm_state *gallivm = &ctx->gallivm;
4230 LLVMBuilderRef builder = gallivm->builder;
4231 const struct tgsi_full_instruction * inst = emit_data->inst;
4232 LLVMValueRef ptr, result, arg;
4233
4234 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4235
4236 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4237 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4238
4239 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4240 LLVMValueRef new_data;
4241 new_data = lp_build_emit_fetch(&ctx->bld_base,
4242 inst, 3, 0);
4243
4244 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4245
4246 #if HAVE_LLVM >= 0x309
4247 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4248 LLVMAtomicOrderingSequentiallyConsistent,
4249 LLVMAtomicOrderingSequentiallyConsistent,
4250 false);
4251 #endif
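/* Note: this assumes HAVE_LLVM >= 0x309; with an older LLVM the
 * cmpxchg is compiled out and "result" would be used uninitialized. */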
4252
4253 result = LLVMBuildExtractValue(builder, result, 0, "");
4254 } else {
4255 LLVMAtomicRMWBinOp op;
4256
4257 switch(inst->Instruction.Opcode) {
4258 case TGSI_OPCODE_ATOMUADD:
4259 op = LLVMAtomicRMWBinOpAdd;
4260 break;
4261 case TGSI_OPCODE_ATOMXCHG:
4262 op = LLVMAtomicRMWBinOpXchg;
4263 break;
4264 case TGSI_OPCODE_ATOMAND:
4265 op = LLVMAtomicRMWBinOpAnd;
4266 break;
4267 case TGSI_OPCODE_ATOMOR:
4268 op = LLVMAtomicRMWBinOpOr;
4269 break;
4270 case TGSI_OPCODE_ATOMXOR:
4271 op = LLVMAtomicRMWBinOpXor;
4272 break;
4273 case TGSI_OPCODE_ATOMUMIN:
4274 op = LLVMAtomicRMWBinOpUMin;
4275 break;
4276 case TGSI_OPCODE_ATOMUMAX:
4277 op = LLVMAtomicRMWBinOpUMax;
4278 break;
4279 case TGSI_OPCODE_ATOMIMIN:
4280 op = LLVMAtomicRMWBinOpMin;
4281 break;
4282 case TGSI_OPCODE_ATOMIMAX:
4283 op = LLVMAtomicRMWBinOpMax;
4284 break;
4285 default:
4286 unreachable("unknown atomic opcode");
4287 }
4288
4289 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4290 LLVMAtomicOrderingSequentiallyConsistent,
4291 false);
4292 }
4293 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4294 }
4295
4296 static void atomic_emit(
4297 const struct lp_build_tgsi_action *action,
4298 struct lp_build_tgsi_context *bld_base,
4299 struct lp_build_emit_data *emit_data)
4300 {
4301 struct si_shader_context *ctx = si_shader_context(bld_base);
4302 struct gallivm_state *gallivm = &ctx->gallivm;
4303 LLVMBuilderRef builder = gallivm->builder;
4304 const struct tgsi_full_instruction * inst = emit_data->inst;
4305 char intrinsic_name[40];
4306 LLVMValueRef tmp;
4307
4308 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4309 atomic_emit_memory(ctx, emit_data);
4310 return;
4311 }
4312
4313 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4314 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4315 snprintf(intrinsic_name, sizeof(intrinsic_name),
4316 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4317 } else {
4318 LLVMValueRef coords;
4319 char coords_type[8];
4320
4321 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4322 coords = emit_data->args[2];
4323 else
4324 coords = emit_data->args[1];
4325
4326 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4327 snprintf(intrinsic_name, sizeof(intrinsic_name),
4328 "llvm.amdgcn.image.atomic.%s.%s",
4329 action->intr_name, coords_type);
4330 }
4331
4332 tmp = lp_build_intrinsic(
4333 builder, intrinsic_name, ctx->i32,
4334 emit_data->args, emit_data->arg_count, 0);
4335 emit_data->output[emit_data->chan] =
4336 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4337 }
4338
4339 static void set_tex_fetch_args(struct si_shader_context *ctx,
4340 struct lp_build_emit_data *emit_data,
4341 unsigned target,
4342 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4343 LLVMValueRef *param, unsigned count,
4344 unsigned dmask)
4345 {
4346 struct gallivm_state *gallivm = &ctx->gallivm;
4347 struct ac_image_args args = {};
4348
4349 /* Pad to power of two vector */
4350 while (count < util_next_power_of_two(count))
4351 param[count++] = LLVMGetUndef(ctx->i32);
4352
4353 if (count > 1)
4354 args.addr = lp_build_gather_values(gallivm, param, count);
4355 else
4356 args.addr = param[0];
4357
4358 args.resource = res_ptr;
4359 args.sampler = samp_ptr;
4360 args.dmask = dmask;
4361 args.unorm = target == TGSI_TEXTURE_RECT ||
4362 target == TGSI_TEXTURE_SHADOWRECT;
4363 args.da = tgsi_is_array_sampler(target);
4364
4365 /* Ugly, but we seem to have no other choice right now. */
4366 STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4367 memcpy(emit_data->args, &args, sizeof(args));
4368 }
4369
4370 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4371 unsigned target, LLVMValueRef out)
4372 {
4373 LLVMBuilderRef builder = ctx->gallivm.builder;
4374
4375 /* 1D textures are allocated and used as 2D on GFX9. */
4376 if (ctx->screen->b.chip_class >= GFX9 &&
4377 (target == TGSI_TEXTURE_1D_ARRAY ||
4378 target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4379 LLVMValueRef layers =
4380 LLVMBuildExtractElement(builder, out,
4381 LLVMConstInt(ctx->i32, 2, 0), "");
4382 out = LLVMBuildInsertElement(builder, out, layers,
4383 ctx->i32_1, "");
4384 }
4385
4386 /* Divide the number of layers by 6 to get the number of cubes. */
4387 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4388 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4389 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4390
4391 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4392 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4393
4394 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4395 }
4396 return out;
4397 }
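/* E.g. a cube array allocated with 12 layers reports z = 12 from
 * resinfo; dividing by 6 yields the 2 cubes that TXQ must return.
 */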
4398
4399 static void resq_fetch_args(
4400 struct lp_build_tgsi_context * bld_base,
4401 struct lp_build_emit_data * emit_data)
4402 {
4403 struct si_shader_context *ctx = si_shader_context(bld_base);
4404 const struct tgsi_full_instruction *inst = emit_data->inst;
4405 const struct tgsi_full_src_register *reg = &inst->Src[0];
4406
4407 emit_data->dst_type = ctx->v4i32;
4408
4409 if (reg->Register.File == TGSI_FILE_BUFFER) {
4410 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4411 emit_data->arg_count = 1;
4412 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4413 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4414 &emit_data->args[0]);
4415 emit_data->arg_count = 1;
4416 } else {
4417 LLVMValueRef res_ptr;
4418 unsigned image_target;
4419
4420 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4421 image_target = TGSI_TEXTURE_2D_ARRAY;
4422 else
4423 image_target = inst->Memory.Texture;
4424
4425 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4426 &res_ptr);
4427 set_tex_fetch_args(ctx, emit_data, image_target,
4428 res_ptr, NULL, &ctx->i32_0, 1,
4429 0xf);
4430 }
4431 }
4432
4433 static void resq_emit(
4434 const struct lp_build_tgsi_action *action,
4435 struct lp_build_tgsi_context *bld_base,
4436 struct lp_build_emit_data *emit_data)
4437 {
4438 struct si_shader_context *ctx = si_shader_context(bld_base);
4439 struct gallivm_state *gallivm = &ctx->gallivm;
4440 LLVMBuilderRef builder = gallivm->builder;
4441 const struct tgsi_full_instruction *inst = emit_data->inst;
4442 LLVMValueRef out;
4443
4444 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4445 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4446 LLVMConstInt(ctx->i32, 2, 0), "");
4447 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4448 out = get_buffer_size(bld_base, emit_data->args[0]);
4449 } else {
4450 struct ac_image_args args;
4451
4452 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4453 args.opcode = ac_image_get_resinfo;
4454 out = ac_build_image_opcode(&ctx->ac, &args);
4455
4456 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4457 }
4458
4459 emit_data->output[emit_data->chan] = out;
4460 }
4461
4462 static const struct lp_build_tgsi_action tex_action;
4463
4464 enum desc_type {
4465 DESC_IMAGE,
4466 DESC_BUFFER,
4467 DESC_FMASK,
4468 DESC_SAMPLER,
4469 };
4470
4471 /**
4472 * Load an image view, fmask view, or sampler state descriptor.
4473 */
4474 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4475 LLVMValueRef list, LLVMValueRef index,
4476 enum desc_type type)
4477 {
4478 struct gallivm_state *gallivm = &ctx->gallivm;
4479 LLVMBuilderRef builder = gallivm->builder;
4480
4481 switch (type) {
4482 case DESC_IMAGE:
4483 /* The image is at [0:7]. */
4484 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4485 break;
4486 case DESC_BUFFER:
4487 /* The buffer is in [4:7]. */
4488 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4489 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4490 list = LLVMBuildPointerCast(builder, list,
4491 const_array(ctx->v4i32, 0), "");
4492 break;
4493 case DESC_FMASK:
4494 /* The FMASK is at [8:15]. */
4495 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4496 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4497 break;
4498 case DESC_SAMPLER:
4499 /* The sampler state is at [12:15]. */
4500 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4501 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4502 list = LLVMBuildPointerCast(builder, list,
4503 const_array(ctx->v4i32, 0), "");
4504 break;
4505 }
4506
4507 return ac_build_indexed_load_const(&ctx->ac, list, index);
4508 }
4509
4510 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4511 *
4512 * SI-CI:
4513 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4514 * filtering manually. The driver sets img7 to a mask clearing
4515 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4516 * s_and_b32 samp0, samp0, img7
4517 *
4518 * VI:
4519 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4520 */
4521 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4522 LLVMValueRef res, LLVMValueRef samp)
4523 {
4524 LLVMBuilderRef builder = ctx->gallivm.builder;
4525 LLVMValueRef img7, samp0;
4526
4527 if (ctx->screen->b.chip_class >= VI)
4528 return samp;
4529
4530 img7 = LLVMBuildExtractElement(builder, res,
4531 LLVMConstInt(ctx->i32, 7, 0), "");
4532 samp0 = LLVMBuildExtractElement(builder, samp,
4533 ctx->i32_0, "");
4534 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4535 return LLVMBuildInsertElement(builder, samp, samp0,
4536 ctx->i32_0, "");
4537 }
4538
4539 static void tex_fetch_ptrs(
4540 struct lp_build_tgsi_context *bld_base,
4541 struct lp_build_emit_data *emit_data,
4542 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4543 {
4544 struct si_shader_context *ctx = si_shader_context(bld_base);
4545 LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4546 const struct tgsi_full_instruction *inst = emit_data->inst;
4547 const struct tgsi_full_src_register *reg;
4548 unsigned target = inst->Texture.Texture;
4549 unsigned sampler_src;
4550 LLVMValueRef index;
4551
4552 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4553 reg = &emit_data->inst->Src[sampler_src];
4554
4555 if (reg->Register.Indirect) {
4556 index = get_bounded_indirect_index(ctx,
4557 &reg->Indirect,
4558 reg->Register.Index,
4559 SI_NUM_SAMPLERS);
4560 } else {
4561 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4562 }
4563
4564 if (target == TGSI_TEXTURE_BUFFER)
4565 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4566 else
4567 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4568
4569 if (samp_ptr)
4570 *samp_ptr = NULL;
4571 if (fmask_ptr)
4572 *fmask_ptr = NULL;
4573
4574 if (target == TGSI_TEXTURE_2D_MSAA ||
4575 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4576 if (fmask_ptr)
4577 *fmask_ptr = load_sampler_desc(ctx, list, index,
4578 DESC_FMASK);
4579 } else if (target != TGSI_TEXTURE_BUFFER) {
4580 if (samp_ptr) {
4581 *samp_ptr = load_sampler_desc(ctx, list, index,
4582 DESC_SAMPLER);
4583 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4584 }
4585 }
4586 }
4587
4588 static void txq_fetch_args(
4589 struct lp_build_tgsi_context *bld_base,
4590 struct lp_build_emit_data *emit_data)
4591 {
4592 struct si_shader_context *ctx = si_shader_context(bld_base);
4593 const struct tgsi_full_instruction *inst = emit_data->inst;
4594 unsigned target = inst->Texture.Texture;
4595 LLVMValueRef res_ptr;
4596 LLVMValueRef address;
4597
4598 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4599
4600 if (target == TGSI_TEXTURE_BUFFER) {
4601 /* Read the size from the buffer descriptor directly. */
4602 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4603 return;
4604 }
4605
4606 /* Textures - set the mip level. */
4607 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4608
4609 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4610 NULL, &address, 1, 0xf);
4611 }
4612
4613 static void txq_emit(const struct lp_build_tgsi_action *action,
4614 struct lp_build_tgsi_context *bld_base,
4615 struct lp_build_emit_data *emit_data)
4616 {
4617 struct si_shader_context *ctx = si_shader_context(bld_base);
4618 struct ac_image_args args;
4619 unsigned target = emit_data->inst->Texture.Texture;
4620
4621 if (target == TGSI_TEXTURE_BUFFER) {
4622 /* Just return the buffer size. */
4623 emit_data->output[emit_data->chan] = emit_data->args[0];
4624 return;
4625 }
4626
4627 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4628
4629 args.opcode = ac_image_get_resinfo;
4630 LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4631
4632 emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4633 }
4634
4635 static void tex_fetch_args(
4636 struct lp_build_tgsi_context *bld_base,
4637 struct lp_build_emit_data *emit_data)
4638 {
4639 struct si_shader_context *ctx = si_shader_context(bld_base);
4640 struct gallivm_state *gallivm = &ctx->gallivm;
4641 const struct tgsi_full_instruction *inst = emit_data->inst;
4642 unsigned opcode = inst->Instruction.Opcode;
4643 unsigned target = inst->Texture.Texture;
4644 LLVMValueRef coords[5], derivs[6];
4645 LLVMValueRef address[16];
4646 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4647 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4648 unsigned count = 0;
4649 unsigned chan;
4650 unsigned num_deriv_channels = 0;
4651 bool has_offset = inst->Texture.NumOffsets > 0;
4652 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4653 unsigned dmask = 0xf;
4654
4655 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4656
4657 if (target == TGSI_TEXTURE_BUFFER) {
4658 emit_data->dst_type = ctx->v4f32;
4659 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
4660 ctx->v16i8, "");
4661 emit_data->args[1] = ctx->i32_0;
4662 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4663 emit_data->arg_count = 3;
4664 return;
4665 }
4666
4667 /* Fetch and project texture coordinates */
4668 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4669 for (chan = 0; chan < 3; chan++ ) {
4670 coords[chan] = lp_build_emit_fetch(bld_base,
4671 emit_data->inst, 0,
4672 chan);
4673 if (opcode == TGSI_OPCODE_TXP)
4674 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4675 TGSI_OPCODE_DIV,
4676 coords[chan],
4677 coords[3]);
4678 }
4679
4680 if (opcode == TGSI_OPCODE_TXP)
4681 coords[3] = bld_base->base.one;
4682
4683 /* Pack offsets. */
4684 if (has_offset &&
4685 opcode != TGSI_OPCODE_TXF &&
4686 opcode != TGSI_OPCODE_TXF_LZ) {
4687 /* The offsets are six-bit signed integers packed like this:
4688 * X=[5:0], Y=[13:8], and Z=[21:16].
4689 */
4690 LLVMValueRef offset[3], pack;
4691
4692 assert(inst->Texture.NumOffsets == 1);
4693
4694 for (chan = 0; chan < 3; chan++) {
4695 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4696 emit_data->inst, 0, chan);
4697 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4698 LLVMConstInt(ctx->i32, 0x3f, 0), "");
4699 if (chan)
4700 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4701 LLVMConstInt(ctx->i32, chan*8, 0), "");
4702 }
4703
4704 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4705 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4706 address[count++] = pack;
4707 }
4708
4709 /* Pack LOD bias value */
4710 if (opcode == TGSI_OPCODE_TXB)
4711 address[count++] = coords[3];
4712 if (opcode == TGSI_OPCODE_TXB2)
4713 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4714
4715 /* Pack depth comparison value */
4716 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4717 LLVMValueRef z;
4718
4719 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4720 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4721 } else {
4722 assert(ref_pos >= 0);
4723 z = coords[ref_pos];
4724 }
4725
4726 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4727 * so the depth comparison value isn't clamped for Z16 and
4728 * Z24 anymore. Do it manually here.
4729 *
4730 * It's unnecessary if the original texture format was
4731 * Z32_FLOAT, but we don't know that here.
4732 */
4733 if (ctx->screen->b.chip_class == VI)
4734 z = ac_build_clamp(&ctx->ac, z);
4735
4736 address[count++] = z;
4737 }
4738
4739 /* Pack user derivatives */
4740 if (opcode == TGSI_OPCODE_TXD) {
4741 int param, num_src_deriv_channels, num_dst_deriv_channels;
4742
4743 switch (target) {
4744 case TGSI_TEXTURE_3D:
4745 num_src_deriv_channels = 3;
4746 num_dst_deriv_channels = 3;
4747 num_deriv_channels = 3;
4748 break;
4749 case TGSI_TEXTURE_2D:
4750 case TGSI_TEXTURE_SHADOW2D:
4751 case TGSI_TEXTURE_RECT:
4752 case TGSI_TEXTURE_SHADOWRECT:
4753 case TGSI_TEXTURE_2D_ARRAY:
4754 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4755 num_src_deriv_channels = 2;
4756 num_dst_deriv_channels = 2;
4757 num_deriv_channels = 2;
4758 break;
4759 case TGSI_TEXTURE_CUBE:
4760 case TGSI_TEXTURE_SHADOWCUBE:
4761 case TGSI_TEXTURE_CUBE_ARRAY:
4762 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4763 /* Cube derivatives will be converted to 2D. */
4764 num_src_deriv_channels = 3;
4765 num_dst_deriv_channels = 3;
4766 num_deriv_channels = 2;
4767 break;
4768 case TGSI_TEXTURE_1D:
4769 case TGSI_TEXTURE_SHADOW1D:
4770 case TGSI_TEXTURE_1D_ARRAY:
4771 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4772 num_src_deriv_channels = 1;
4773
4774 /* 1D textures are allocated and used as 2D on GFX9. */
4775 if (ctx->screen->b.chip_class >= GFX9) {
4776 num_dst_deriv_channels = 2;
4777 num_deriv_channels = 2;
4778 } else {
4779 num_dst_deriv_channels = 1;
4780 num_deriv_channels = 1;
4781 }
4782 break;
4783 default:
4784 unreachable("invalid target");
4785 }
4786
4787 for (param = 0; param < 2; param++) {
4788 for (chan = 0; chan < num_src_deriv_channels; chan++)
4789 derivs[param * num_dst_deriv_channels + chan] =
4790 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4791
4792 /* Fill in the rest with zeros. */
4793 for (chan = num_src_deriv_channels;
4794 chan < num_dst_deriv_channels; chan++)
4795 derivs[param * num_dst_deriv_channels + chan] =
4796 bld_base->base.zero;
4797 }
4798 }
4799
4800 if (target == TGSI_TEXTURE_CUBE ||
4801 target == TGSI_TEXTURE_CUBE_ARRAY ||
4802 target == TGSI_TEXTURE_SHADOWCUBE ||
4803 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4804 ac_prepare_cube_coords(&ctx->ac,
4805 opcode == TGSI_OPCODE_TXD,
4806 target == TGSI_TEXTURE_CUBE_ARRAY ||
4807 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4808 coords, derivs);
4809
4810 if (opcode == TGSI_OPCODE_TXD)
4811 for (int i = 0; i < num_deriv_channels * 2; i++)
4812 address[count++] = derivs[i];
4813
4814 /* Pack texture coordinates */
4815 address[count++] = coords[0];
4816 if (num_coords > 1)
4817 address[count++] = coords[1];
4818 if (num_coords > 2)
4819 address[count++] = coords[2];
4820
4821 /* 1D textures are allocated and used as 2D on GFX9. */
4822 if (ctx->screen->b.chip_class >= GFX9) {
4823 LLVMValueRef filler;
4824
4825 /* Use 0.5, so that we don't sample the border color. */
4826 if (opcode == TGSI_OPCODE_TXF)
4827 filler = ctx->i32_0;
4828 else
4829 filler = LLVMConstReal(ctx->f32, 0.5);
4830
4831 if (target == TGSI_TEXTURE_1D ||
4832 target == TGSI_TEXTURE_SHADOW1D) {
4833 address[count++] = filler;
4834 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4835 target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4836 address[count] = address[count - 1];
4837 address[count - 1] = filler;
4838 count++;
4839 }
4840 }
4841
4842 /* Pack LOD or sample index */
4843 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4844 address[count++] = coords[3];
4845 else if (opcode == TGSI_OPCODE_TXL2)
4846 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4847
4848 if (count > 16) {
4849 assert(!"Cannot handle more than 16 texture address parameters");
4850 count = 16;
4851 }
4852
4853 for (chan = 0; chan < count; chan++ ) {
4854 address[chan] = LLVMBuildBitCast(gallivm->builder,
4855 address[chan], ctx->i32, "");
4856 }
4857
4858 /* Adjust the sample index according to FMASK.
4859 *
4860 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4861 * which is the identity mapping. Each nibble says which physical sample
4862 * should be fetched to get that sample.
4863 *
4864 * For example, 0x11111100 means there are only 2 samples stored and
4865 * the second sample covers 3/4 of the pixel. When reading samples 0
4866 * and 1, return physical sample 0 (determined by the first two 0s
4867 * in FMASK), otherwise return physical sample 1.
4868 *
4869 * The sample index should be adjusted as follows:
4870 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4871 */
4872 if (target == TGSI_TEXTURE_2D_MSAA ||
4873 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4874 struct lp_build_emit_data txf_emit_data = *emit_data;
4875 LLVMValueRef txf_address[4];
4876 /* We only need .xy for non-arrays, and .xyz for arrays. */
4877 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4878 struct tgsi_full_instruction inst = {};
4879
4880 memcpy(txf_address, address, sizeof(txf_address));
4881
4882 /* Read FMASK using TXF_LZ. */
4883 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4884 inst.Texture.Texture = target;
4885 txf_emit_data.inst = &inst;
4886 txf_emit_data.chan = 0;
4887 set_tex_fetch_args(ctx, &txf_emit_data,
4888 target, fmask_ptr, NULL,
4889 txf_address, txf_count, 0xf);
4890 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4891
4892 /* Initialize some constants. */
4893 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4894 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4895
4896 /* Apply the formula. */
4897 LLVMValueRef fmask =
4898 LLVMBuildExtractElement(gallivm->builder,
4899 txf_emit_data.output[0],
4900 ctx->i32_0, "");
4901
4902 unsigned sample_chan = txf_count; /* the sample index is last */
4903
4904 LLVMValueRef sample_index4 =
4905 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4906
4907 LLVMValueRef shifted_fmask =
4908 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4909
4910 LLVMValueRef final_sample =
4911 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4912
4913 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4914 * resource descriptor is 0 (invalid).
4915 */
4916 LLVMValueRef fmask_desc =
4917 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4918 ctx->v8i32, "");
4919
4920 LLVMValueRef fmask_word1 =
4921 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4922 ctx->i32_1, "");
4923
4924 LLVMValueRef word1_is_nonzero =
4925 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4926 fmask_word1, ctx->i32_0, "");
4927
4928 /* Replace the MSAA sample index. */
4929 address[sample_chan] =
4930 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4931 final_sample, address[sample_chan], "");
4932 }
4933
4934 if (opcode == TGSI_OPCODE_TXF ||
4935 opcode == TGSI_OPCODE_TXF_LZ) {
4936 /* add tex offsets */
4937 if (inst->Texture.NumOffsets) {
4938 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4939 const struct tgsi_texture_offset *off = inst->TexOffsets;
4940
4941 assert(inst->Texture.NumOffsets == 1);
4942
4943 switch (target) {
4944 case TGSI_TEXTURE_3D:
4945 address[2] = lp_build_add(uint_bld, address[2],
4946 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4947 /* fall through */
4948 case TGSI_TEXTURE_2D:
4949 case TGSI_TEXTURE_SHADOW2D:
4950 case TGSI_TEXTURE_RECT:
4951 case TGSI_TEXTURE_SHADOWRECT:
4952 case TGSI_TEXTURE_2D_ARRAY:
4953 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4954 address[1] =
4955 lp_build_add(uint_bld, address[1],
4956 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4957 /* fall through */
4958 case TGSI_TEXTURE_1D:
4959 case TGSI_TEXTURE_SHADOW1D:
4960 case TGSI_TEXTURE_1D_ARRAY:
4961 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4962 address[0] =
4963 lp_build_add(uint_bld, address[0],
4964 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4965 break;
4966 /* texture offsets do not apply to other texture targets */
4967 }
4968 }
4969 }
4970
4971 if (opcode == TGSI_OPCODE_TG4) {
4972 unsigned gather_comp = 0;
4973
4974 /* DMASK was repurposed for GATHER4. 4 components are always
4975 * returned and DMASK works like a swizzle - it selects
4976 * the component to fetch. The only valid DMASK values are
4977 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4978 * (red,red,red,red) etc.) The ISA document doesn't mention
4979 * this.
4980 */
4981
4982 /* Get the component index from src1.x for Gather4. */
4983 if (!tgsi_is_shadow_target(target)) {
4984 LLVMValueRef comp_imm;
4985 struct tgsi_src_register src1 = inst->Src[1].Register;
4986
4987 assert(src1.File == TGSI_FILE_IMMEDIATE);
4988
4989 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
4990 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4991 gather_comp = CLAMP(gather_comp, 0, 3);
4992 }
4993
4994 dmask = 1 << gather_comp;
4995 }
4996
4997 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4998 samp_ptr, address, count, dmask);
4999 }
5000
5001 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
5002 * incorrectly forces nearest filtering if the texture format is integer.
5003 * The only effect it has on Gather4, which always returns 4 texels for
5004 * bilinear filtering, is that the final coordinates are off by 0.5 of
5005 * the texel size.
5006 *
5007 * The workaround is to subtract 0.5 from the unnormalized coordinates,
5008 * or (0.5 / size) from the normalized coordinates.
5009 */
5010 static void si_lower_gather4_integer(struct si_shader_context *ctx,
5011 struct ac_image_args *args,
5012 unsigned target)
5013 {
5014 LLVMBuilderRef builder = ctx->gallivm.builder;
5015 LLVMValueRef coord = args->addr;
5016 LLVMValueRef half_texel[2];
5017 /* Texture coordinates start after:
5018 * {offset, bias, z-compare, derivatives}
5019 * Only the offset and z-compare can occur here.
5020 */
5021 unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
5022 int c;
5023
5024 if (target == TGSI_TEXTURE_RECT ||
5025 target == TGSI_TEXTURE_SHADOWRECT) {
5026 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
5027 } else {
5028 struct tgsi_full_instruction txq_inst = {};
5029 struct lp_build_emit_data txq_emit_data = {};
5030
5031 /* Query the texture size. */
5032 txq_inst.Texture.Texture = target;
5033 txq_emit_data.inst = &txq_inst;
5034 txq_emit_data.dst_type = ctx->v4i32;
5035 set_tex_fetch_args(ctx, &txq_emit_data, target,
5036 args->resource, NULL, &ctx->i32_0,
5037 1, 0xf);
5038 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
5039
5040 /* Compute -0.5 / size. */
5041 for (c = 0; c < 2; c++) {
5042 half_texel[c] =
5043 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
5044 LLVMConstInt(ctx->i32, c, 0), "");
5045 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
5046 half_texel[c] =
5047 lp_build_emit_llvm_unary(&ctx->bld_base,
5048 TGSI_OPCODE_RCP, half_texel[c]);
5049 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
5050 LLVMConstReal(ctx->f32, -0.5), "");
5051 }
5052 }
5053
5054 for (c = 0; c < 2; c++) {
5055 LLVMValueRef tmp;
5056 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
5057
5058 tmp = LLVMBuildExtractElement(builder, coord, index, "");
5059 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
5060 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
5061 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
5062 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
5063 }
5064
5065 args->addr = coord;
5066 }
5067
5068 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
5069 struct lp_build_tgsi_context *bld_base,
5070 struct lp_build_emit_data *emit_data)
5071 {
5072 struct si_shader_context *ctx = si_shader_context(bld_base);
5073 const struct tgsi_full_instruction *inst = emit_data->inst;
5074 struct ac_image_args args;
5075 unsigned opcode = inst->Instruction.Opcode;
5076 unsigned target = inst->Texture.Texture;
5077
5078 if (target == TGSI_TEXTURE_BUFFER) {
5079 emit_data->output[emit_data->chan] =
5080 ac_build_buffer_load_format(&ctx->ac,
5081 emit_data->args[0],
5082 emit_data->args[2],
5083 emit_data->args[1],
5084 true);
5085 return;
5086 }
5087
5088 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
5089
5090 args.opcode = ac_image_sample;
5091 args.compare = tgsi_is_shadow_target(target);
5092 args.offset = inst->Texture.NumOffsets > 0;
5093
5094 switch (opcode) {
5095 case TGSI_OPCODE_TXF:
5096 case TGSI_OPCODE_TXF_LZ:
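/* MSAA targets have no mip chain and TXF_LZ implies level 0, so a
 * plain load is used; otherwise load_mip consumes the LOD operand.
 */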
5097 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
5098 target == TGSI_TEXTURE_2D_MSAA ||
5099 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
5100 ac_image_load : ac_image_load_mip;
5101 args.compare = false;
5102 args.offset = false;
5103 break;
5104 case TGSI_OPCODE_LODQ:
5105 args.opcode = ac_image_get_lod;
5106 args.compare = false;
5107 args.offset = false;
5108 break;
5109 case TGSI_OPCODE_TEX:
5110 case TGSI_OPCODE_TEX2:
5111 case TGSI_OPCODE_TXP:
5112 if (ctx->type != PIPE_SHADER_FRAGMENT)
5113 args.level_zero = true;
5114 break;
5115 case TGSI_OPCODE_TEX_LZ:
5116 args.level_zero = true;
5117 break;
5118 case TGSI_OPCODE_TXB:
5119 case TGSI_OPCODE_TXB2:
5120 assert(ctx->type == PIPE_SHADER_FRAGMENT);
5121 args.bias = true;
5122 break;
5123 case TGSI_OPCODE_TXL:
5124 case TGSI_OPCODE_TXL2:
5125 args.lod = true;
5126 break;
5127 case TGSI_OPCODE_TXD:
5128 args.deriv = true;
5129 break;
5130 case TGSI_OPCODE_TG4:
5131 args.opcode = ac_image_gather4;
5132 args.level_zero = true;
5133 break;
5134 default:
5135 assert(0);
5136 return;
5137 }
5138
5139 /* The hardware needs special lowering for Gather4 with integer formats. */
5140 if (ctx->screen->b.chip_class <= VI &&
5141 opcode == TGSI_OPCODE_TG4) {
5142 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5143 /* This will also work with non-constant indexing because of how
5144 * glsl_to_tgsi works and we intend to preserve that behavior.
5145 */
5146 const unsigned src_idx = 2;
5147 unsigned sampler = inst->Src[src_idx].Register.Index;
5148
5149 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
5150
5151 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
5152 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
5153 si_lower_gather4_integer(ctx, &args, target);
5154 }
5155
5156 emit_data->output[emit_data->chan] =
5157 ac_build_image_opcode(&ctx->ac, &args);
5158 }
5159
5160 static void si_llvm_emit_txqs(
5161 const struct lp_build_tgsi_action *action,
5162 struct lp_build_tgsi_context *bld_base,
5163 struct lp_build_emit_data *emit_data)
5164 {
5165 struct si_shader_context *ctx = si_shader_context(bld_base);
5166 struct gallivm_state *gallivm = &ctx->gallivm;
5167 LLVMBuilderRef builder = gallivm->builder;
5168 LLVMValueRef res, samples;
5169 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5170
5171 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5172
5174 /* Read the samples from the descriptor directly. */
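/* The sample count is stored as log2(samples) in bits [19:16] of
 * dword 3, so e.g. a field value of 3 yields 1 << 3 = 8 samples.
 */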
5175 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5176 samples = LLVMBuildExtractElement(
5177 builder, res,
5178 LLVMConstInt(ctx->i32, 3, 0), "");
5179 samples = LLVMBuildLShr(builder, samples,
5180 LLVMConstInt(ctx->i32, 16, 0), "");
5181 samples = LLVMBuildAnd(builder, samples,
5182 LLVMConstInt(ctx->i32, 0xf, 0), "");
5183 samples = LLVMBuildShl(builder, ctx->i32_1,
5184 samples, "");
5185
5186 emit_data->output[emit_data->chan] = samples;
5187 }
5188
5189 static void si_llvm_emit_ddxy(
5190 const struct lp_build_tgsi_action *action,
5191 struct lp_build_tgsi_context *bld_base,
5192 struct lp_build_emit_data *emit_data)
5193 {
5194 struct si_shader_context *ctx = si_shader_context(bld_base);
5195 struct gallivm_state *gallivm = &ctx->gallivm;
5196 unsigned opcode = emit_data->info->opcode;
5197 LLVMValueRef val;
5198 int idx;
5199 unsigned mask;
5200
5201 if (opcode == TGSI_OPCODE_DDX_FINE)
5202 mask = AC_TID_MASK_LEFT;
5203 else if (opcode == TGSI_OPCODE_DDY_FINE)
5204 mask = AC_TID_MASK_TOP;
5205 else
5206 mask = AC_TID_MASK_TOP_LEFT;
5207
5208 /* For DDX we want the next X pixel, for DDY the next Y pixel. */
5209 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5210
5211 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5212 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5213 mask, idx, ctx->lds, val);
5214 emit_data->output[emit_data->chan] = val;
5215 }
5216
5217 /*
5218 * This takes an I,J coordinate pair
5219 * and works out the X and Y derivatives.
5220 * It returns DDX(I), DDX(J), DDY(I), DDY(J).
5221 */
5222 static LLVMValueRef si_llvm_emit_ddxy_interp(
5223 struct lp_build_tgsi_context *bld_base,
5224 LLVMValueRef interp_ij)
5225 {
5226 struct si_shader_context *ctx = si_shader_context(bld_base);
5227 struct gallivm_state *gallivm = &ctx->gallivm;
5228 LLVMValueRef result[4], a;
5229 unsigned i;
5230
5231 for (i = 0; i < 2; i++) {
5232 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5233 LLVMConstInt(ctx->i32, i, 0), "");
5234 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5235 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5236 }
5237
5238 return lp_build_gather_values(gallivm, result, 4);
5239 }
5240
5241 static void interp_fetch_args(
5242 struct lp_build_tgsi_context *bld_base,
5243 struct lp_build_emit_data *emit_data)
5244 {
5245 struct si_shader_context *ctx = si_shader_context(bld_base);
5246 struct gallivm_state *gallivm = &ctx->gallivm;
5247 const struct tgsi_full_instruction *inst = emit_data->inst;
5248
5249 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5250 /* offset is in second src, first two channels */
5251 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5252 emit_data->inst, 1,
5253 TGSI_CHAN_X);
5254 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5255 emit_data->inst, 1,
5256 TGSI_CHAN_Y);
5257 emit_data->arg_count = 2;
5258 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5259 LLVMValueRef sample_position;
5260 LLVMValueRef sample_id;
5261 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5262
5263 /* fetch sample ID, then fetch its sample position,
5264 * and place into first two channels.
5265 */
5266 sample_id = lp_build_emit_fetch(bld_base,
5267 emit_data->inst, 1, TGSI_CHAN_X);
5268 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5269 ctx->i32, "");
5270 sample_position = load_sample_position(ctx, sample_id);
5271
5272 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5273 sample_position,
5274 ctx->i32_0, "");
5275
5276 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5277 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5278 sample_position,
5279 ctx->i32_1, "");
5280 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5281 emit_data->arg_count = 2;
5282 }
5283 }
5284
5285 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5286 struct lp_build_tgsi_context *bld_base,
5287 struct lp_build_emit_data *emit_data)
5288 {
5289 struct si_shader_context *ctx = si_shader_context(bld_base);
5290 struct si_shader *shader = ctx->shader;
5291 struct gallivm_state *gallivm = &ctx->gallivm;
5292 LLVMValueRef interp_param;
5293 const struct tgsi_full_instruction *inst = emit_data->inst;
5294 int input_index = inst->Src[0].Register.Index;
5295 int chan;
5296 int i;
5297 LLVMValueRef attr_number;
5298 LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5299 int interp_param_idx;
5300 unsigned interp = shader->selector->info.input_interpolate[input_index];
5301 unsigned location;
5302
5303 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5304
5305 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5306 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5307 location = TGSI_INTERPOLATE_LOC_CENTER;
5308 else
5309 location = TGSI_INTERPOLATE_LOC_CENTROID;
5310
5311 interp_param_idx = lookup_interp_param_index(interp, location);
5312 if (interp_param_idx == -1)
5313 return;
5314 else if (interp_param_idx)
5315 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5316 else
5317 interp_param = NULL;
5318
5319 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5320
5321 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5322 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5323 LLVMValueRef ij_out[2];
5324 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5325
5326 /*
5327 * Take the I and J parameters and their X/Y derivatives, and
5328 * compute the I,J inputs for the interpolator:
5329 * temp1 = ddx * offset/sample.x + I;
5330 * interp_param.I = ddy * offset/sample.y + temp1;
5331 * temp1 = ddx * offset/sample.x + J;
5332 * interp_param.J = ddy * offset/sample.y + temp1;
5333 */
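/* I.e. a first-order Taylor expansion of the barycentrics:
 * IJ(x+dx, y+dy) ~= IJ(x,y) + DDX(IJ)*dx + DDY(IJ)*dy.
 */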
5334 for (i = 0; i < 2; i++) {
5335 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5336 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5337 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5338 ddxy_out, ix_ll, "");
5339 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5340 ddxy_out, iy_ll, "");
5341 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5342 interp_param, ix_ll, "");
5343 LLVMValueRef temp1, temp2;
5344
5345 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5346 ctx->f32, "");
5347
5348 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5349
5350 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5351
5352 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5353
5354 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5355 }
5356 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5357 }
5358
5359 for (chan = 0; chan < 4; chan++) {
5360 LLVMValueRef llvm_chan;
5361 unsigned schan;
5362
5363 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5364 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5365
5366 if (interp_param) {
5367 interp_param = LLVMBuildBitCast(gallivm->builder,
5368 interp_param, LLVMVectorType(ctx->f32, 2), "");
5369 LLVMValueRef i = LLVMBuildExtractElement(
5370 gallivm->builder, interp_param, ctx->i32_0, "");
5371 LLVMValueRef j = LLVMBuildExtractElement(
5372 gallivm->builder, interp_param, ctx->i32_1, "");
5373 emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5374 llvm_chan, attr_number, params,
5375 i, j);
5376 } else {
5377 emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5378 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5379 llvm_chan, attr_number, params);
5380 }
5381 }
5382 }
5383
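/* Return an i64 bitmask, one bit per lane, of the active lanes for
 * which "value" is non-zero.
 */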
5384 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5385 LLVMValueRef value)
5386 {
5387 struct gallivm_state *gallivm = &ctx->gallivm;
5388 LLVMValueRef args[3] = {
5389 value,
5390 ctx->i32_0,
5391 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5392 };
5393
5394 /* We currently have no other way to prevent LLVM from lifting the icmp
5395 * calls to a dominating basic block.
5396 */
5397 emit_optimization_barrier(ctx, &args[0]);
5398
5399 if (LLVMTypeOf(args[0]) != ctx->i32)
5400 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5401
5402 return lp_build_intrinsic(gallivm->builder,
5403 "llvm.amdgcn.icmp.i32",
5404 ctx->i64, args, 3,
5405 LP_FUNC_ATTR_NOUNWIND |
5406 LP_FUNC_ATTR_READNONE |
5407 LP_FUNC_ATTR_CONVERGENT);
5408 }
5409
5410 static void vote_all_emit(
5411 const struct lp_build_tgsi_action *action,
5412 struct lp_build_tgsi_context *bld_base,
5413 struct lp_build_emit_data *emit_data)
5414 {
5415 struct si_shader_context *ctx = si_shader_context(bld_base);
5416 struct gallivm_state *gallivm = &ctx->gallivm;
5417 LLVMValueRef active_set, vote_set;
5418 LLVMValueRef tmp;
5419
5420 active_set = si_emit_ballot(ctx, ctx->i32_1);
5421 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5422
5423 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5424 emit_data->output[emit_data->chan] =
5425 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5426 }
5427
5428 static void vote_any_emit(
5429 const struct lp_build_tgsi_action *action,
5430 struct lp_build_tgsi_context *bld_base,
5431 struct lp_build_emit_data *emit_data)
5432 {
5433 struct si_shader_context *ctx = si_shader_context(bld_base);
5434 struct gallivm_state *gallivm = &ctx->gallivm;
5435 LLVMValueRef vote_set;
5436 LLVMValueRef tmp;
5437
5438 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5439
5440 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5441 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5442 emit_data->output[emit_data->chan] =
5443 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5444 }
5445
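/* VOTE_EQ: true iff all active lanes agree, i.e. the ballot is either
 * full (equal to the active set) or empty.
 */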
5446 static void vote_eq_emit(
5447 const struct lp_build_tgsi_action *action,
5448 struct lp_build_tgsi_context *bld_base,
5449 struct lp_build_emit_data *emit_data)
5450 {
5451 struct si_shader_context *ctx = si_shader_context(bld_base);
5452 struct gallivm_state *gallivm = &ctx->gallivm;
5453 LLVMValueRef active_set, vote_set;
5454 LLVMValueRef all, none, tmp;
5455
5456 active_set = si_emit_ballot(ctx, ctx->i32_1);
5457 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5458
5459 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5460 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5461 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5462 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5463 emit_data->output[emit_data->chan] =
5464 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5465 }
5466
5467 static void ballot_emit(
5468 const struct lp_build_tgsi_action *action,
5469 struct lp_build_tgsi_context *bld_base,
5470 struct lp_build_emit_data *emit_data)
5471 {
5472 struct si_shader_context *ctx = si_shader_context(bld_base);
5473 LLVMBuilderRef builder = ctx->gallivm.builder;
5474 LLVMValueRef tmp;
5475
5476 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5477 tmp = si_emit_ballot(ctx, tmp);
5478 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5479
5480 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5481 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5482 }
5483
5484 static void read_invoc_fetch_args(
5485 struct lp_build_tgsi_context *bld_base,
5486 struct lp_build_emit_data *emit_data)
5487 {
5488 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5489 0, emit_data->src_chan);
5490
5491 /* Always read the source invocation (= lane) from the X channel. */
5492 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5493 1, TGSI_CHAN_X);
5494 emit_data->arg_count = 2;
5495 }
5496
5497 static void read_lane_emit(
5498 const struct lp_build_tgsi_action *action,
5499 struct lp_build_tgsi_context *bld_base,
5500 struct lp_build_emit_data *emit_data)
5501 {
5502 struct si_shader_context *ctx = si_shader_context(bld_base);
5503 LLVMBuilderRef builder = ctx->gallivm.builder;
5504
5505 /* We currently have no other way to prevent LLVM from lifting the icmp
5506 * calls to a dominating basic block.
5507 */
5508 emit_optimization_barrier(ctx, &emit_data->args[0]);
5509
5510 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5511 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5512 ctx->i32, "");
5513 }
5514
5515 emit_data->output[emit_data->chan] =
5516 ac_build_intrinsic(&ctx->ac, action->intr_name,
5517 ctx->i32, emit_data->args, emit_data->arg_count,
5518 AC_FUNC_ATTR_READNONE |
5519 AC_FUNC_ATTR_CONVERGENT);
5520 }
5521
5522 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5523 struct lp_build_emit_data *emit_data)
5524 {
5525 struct si_shader_context *ctx = si_shader_context(bld_base);
5526 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5527 LLVMValueRef imm;
5528 unsigned stream;
5529
5530 assert(src0.File == TGSI_FILE_IMMEDIATE);
5531
5532 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5533 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5534 return stream;
5535 }
5536
5537 /* Emit one vertex from the geometry shader */
5538 static void si_llvm_emit_vertex(
5539 const struct lp_build_tgsi_action *action,
5540 struct lp_build_tgsi_context *bld_base,
5541 struct lp_build_emit_data *emit_data)
5542 {
5543 struct si_shader_context *ctx = si_shader_context(bld_base);
5544 struct lp_build_context *uint = &bld_base->uint_bld;
5545 struct si_shader *shader = ctx->shader;
5546 struct tgsi_shader_info *info = &shader->selector->info;
5547 struct gallivm_state *gallivm = &ctx->gallivm;
5548 struct lp_build_if_state if_state;
5549 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5550 ctx->param_gs2vs_offset);
5551 LLVMValueRef gs_next_vertex;
5552 LLVMValueRef can_emit, kill;
5553 unsigned chan, offset;
5554 int i;
5555 unsigned stream;
5556
5557 stream = si_llvm_get_stream(bld_base, emit_data);
5558
5559 /* Write vertex attribute values to GSVS ring */
5560 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5561 ctx->gs_next_vertex[stream],
5562 "");
5563
5564 /* If this thread has already emitted the declared maximum number of
5565 * vertices, skip the write: excessive vertex emissions are not
5566 * supposed to have any effect.
5567 *
5568 * If the shader has no writes to memory, kill it instead. This skips
5569 * further memory loads and may allow LLVM to skip to the end
5570 * altogether.
5571 */
5572 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5573 LLVMConstInt(ctx->i32,
5574 shader->selector->gs_max_out_vertices, 0), "");
5575
5576 bool use_kill = !info->writes_memory;
5577 if (use_kill) {
5578 kill = lp_build_select(&bld_base->base, can_emit,
5579 LLVMConstReal(ctx->f32, 1.0f),
5580 LLVMConstReal(ctx->f32, -1.0f));
5581
5582 ac_build_kill(&ctx->ac, kill);
5583 } else {
5584 lp_build_if(&if_state, gallivm, can_emit);
5585 }
5586
5587 offset = 0;
5588 for (i = 0; i < info->num_outputs; i++) {
5589 LLVMValueRef *out_ptr = ctx->outputs[i];
5590
5591 for (chan = 0; chan < 4; chan++) {
5592 if (!(info->output_usagemask[i] & (1 << chan)) ||
5593 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5594 continue;
5595
5596 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
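/* The ring is laid out so that each enabled output component gets a
 * block of gs_max_out_vertices dwords; the byte offset is
 * (slot * gs_max_out_vertices + vertex_index) * 4.
 */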
5597 LLVMValueRef voffset =
5598 LLVMConstInt(ctx->i32, offset *
5599 shader->selector->gs_max_out_vertices, 0);
5600 offset++;
5601
5602 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5603 voffset = lp_build_mul_imm(uint, voffset, 4);
5604
5605 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5606
5607 ac_build_buffer_store_dword(&ctx->ac,
5608 ctx->gsvs_ring[stream],
5609 out_val, 1,
5610 voffset, soffset, 0,
5611 1, 1, true, true);
5612 }
5613 }
5614
5615 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5616 ctx->i32_1);
5617
5618 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5619
5620 /* Signal vertex emission */
5621 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5622 si_get_gs_wave_id(ctx));
5623 if (!use_kill)
5624 lp_build_endif(&if_state);
5625 }
5626
5627 /* Cut one primitive from the geometry shader */
5628 static void si_llvm_emit_primitive(
5629 const struct lp_build_tgsi_action *action,
5630 struct lp_build_tgsi_context *bld_base,
5631 struct lp_build_emit_data *emit_data)
5632 {
5633 struct si_shader_context *ctx = si_shader_context(bld_base);
5634 unsigned stream;
5635
5636 /* Signal primitive cut */
5637 stream = si_llvm_get_stream(bld_base, emit_data);
5638 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5639 si_get_gs_wave_id(ctx));
5640 }
5641
5642 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5643 struct lp_build_tgsi_context *bld_base,
5644 struct lp_build_emit_data *emit_data)
5645 {
5646 struct si_shader_context *ctx = si_shader_context(bld_base);
5647 struct gallivm_state *gallivm = &ctx->gallivm;
5648
5649 /* SI only (thanks to a hw bug workaround):
5650 * The real barrier instruction isn't needed, because an entire patch
5651 * always fits into a single wave.
5652 */
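/* A wave executes in lockstep, so no cross-thread synchronization is
 * needed; draining the memory counters is sufficient for ordering.
 */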
5653 if (HAVE_LLVM >= 0x0309 &&
5654 ctx->screen->b.chip_class == SI &&
5655 ctx->type == PIPE_SHADER_TESS_CTRL) {
5656 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5657 return;
5658 }
5659
5660 lp_build_intrinsic(gallivm->builder,
5661 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5662 : "llvm.AMDGPU.barrier.local",
5663 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5664 }
5665
5666 static const struct lp_build_tgsi_action tex_action = {
5667 .fetch_args = tex_fetch_args,
5668 .emit = build_tex_intrinsic,
5669 };
5670
5671 static const struct lp_build_tgsi_action interp_action = {
5672 .fetch_args = interp_fetch_args,
5673 .emit = build_interp_intrinsic,
5674 };
5675
5676 static void si_create_function(struct si_shader_context *ctx,
5677 const char *name,
5678 LLVMTypeRef *returns, unsigned num_returns,
5679 LLVMTypeRef *params, unsigned num_params,
5680 int last_sgpr)
5681 {
5682 int i;
5683
5684 si_llvm_create_func(ctx, name, returns, num_returns,
5685 params, num_params);
5686 si_llvm_shader_type(ctx->main_fn, ctx->type);
5687 ctx->return_value = LLVMGetUndef(ctx->return_type);
5688
5689 for (i = 0; i <= last_sgpr; ++i) {
5690 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5691
5692 /* The combination of:
5693 * - ByVal
5694 * - dereferenceable
5695 * - invariant.load
5696 * allows the optimization passes to move loads and reduces
5697 * SGPR spilling significantly.
5698 */
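/* Non-pointer arguments are marked INREG, i.e. passed in SGPRs. */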
5699 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5700 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5701 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5702 ac_add_attr_dereferenceable(P, UINT64_MAX);
5703 } else
5704 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5705 }
5706
5707 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5708 "no-signed-zeros-fp-math",
5709 "true");
5710
5711 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5712 /* These were copied from some LLVM test. */
5713 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5714 "less-precise-fpmad",
5715 "true");
5716 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5717 "no-infs-fp-math",
5718 "true");
5719 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5720 "no-nans-fp-math",
5721 "true");
5722 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5723 "unsafe-fp-math",
5724 "true");
5725 }
5726 }
5727
5728 static void declare_streamout_params(struct si_shader_context *ctx,
5729 struct pipe_stream_output_info *so,
5730 LLVMTypeRef *params, LLVMTypeRef i32,
5731 unsigned *num_params)
5732 {
5733 int i;
5734
5735 /* Streamout SGPRs. */
5736 if (so->num_outputs) {
5737 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5738 params[ctx->param_streamout_config = (*num_params)++] = i32;
5739 else
5740 ctx->param_streamout_config = *num_params - 1;
5741
5742 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5743 }
5744 /* A streamout buffer offset is loaded if the stride is non-zero. */
5745 for (i = 0; i < 4; i++) {
5746 if (!so->stride[i])
5747 continue;
5748
5749 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5750 }
5751 }
5752
5753 static unsigned llvm_get_type_size(LLVMTypeRef type)
5754 {
5755 LLVMTypeKind kind = LLVMGetTypeKind(type);
5756
5757 switch (kind) {
5758 case LLVMIntegerTypeKind:
5759 return LLVMGetIntTypeWidth(type) / 8;
5760 case LLVMFloatTypeKind:
5761 return 4;
5762 case LLVMPointerTypeKind:
5763 return 8;
5764 case LLVMVectorTypeKind:
5765 return LLVMGetVectorSize(type) *
5766 llvm_get_type_size(LLVMGetElementType(type));
5767 case LLVMArrayTypeKind:
5768 return LLVMGetArrayLength(type) *
5769 llvm_get_type_size(LLVMGetElementType(type));
5770 default:
5771 assert(0);
5772 return 0;
5773 }
5774 }
5775
5776 static void declare_lds_as_pointer(struct si_shader_context *ctx)
5777 {
5778 struct gallivm_state *gallivm = &ctx->gallivm;
5779
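/* CIK and newer expose 64KB of LDS; SI is limited to 32KB. */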
5780 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5781 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5782 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5783 "lds");
5784 }
5785
5786 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5787 {
5788 const unsigned *properties = shader->selector->info.properties;
5789 unsigned max_work_group_size =
5790 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5791 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5792 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5793
5794 if (!max_work_group_size) {
5795 /* This is a variable group size compute shader,
5796 * compile it for the maximum possible group size.
5797 */
5798 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5799 }
5800 return max_work_group_size;
5801 }
5802
5803 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5804 LLVMTypeRef *params,
5805 unsigned *num_params,
5806 bool assign_params)
5807 {
5808 params[(*num_params)++] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5809 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5810 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5811 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5812
5813 if (assign_params) {
5814 ctx->param_const_buffers = *num_params - 4;
5815 ctx->param_samplers = *num_params - 3;
5816 ctx->param_images = *num_params - 2;
5817 ctx->param_shader_buffers = *num_params - 1;
5818 }
5819 }
5820
5821 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5822 LLVMTypeRef *params,
5823 unsigned *num_params)
5824 {
5825 params[ctx->param_rw_buffers = (*num_params)++] =
5826 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5827 declare_per_stage_desc_pointers(ctx, params, num_params, true);
5828 }
5829
5830 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
5831 LLVMTypeRef *params,
5832 unsigned *num_params)
5833 {
5834 params[ctx->param_vertex_buffers = (*num_params)++] =
5835 const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5836 params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
5837 params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
5838 params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
5839 params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
5840 }
5841
5842 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
5843 LLVMTypeRef *params, unsigned *num_params,
5844 unsigned *num_prolog_vgprs)
5845 {
5846 struct si_shader *shader = ctx->shader;
5847
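/* VertexID is in VGPR0. InstanceID arrives in VGPR1, or in VGPR2 for
 * LS where RelAutoID occupies VGPR1; VGPR3 is declared but unused.
 */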
5848 params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
5849 if (shader->key.as_ls) {
5850 params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
5851 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5852 } else {
5853 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5854 params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
5855 }
5856 params[(*num_params)++] = ctx->i32; /* unused */
5857
5858 if (!shader->is_gs_copy_shader) {
5859 /* Vertex load indices. */
5860 ctx->param_vertex_index0 = (*num_params);
5861 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
5862 params[(*num_params)++] = ctx->i32;
5863 *num_prolog_vgprs += shader->selector->info.num_inputs;
5864 }
5865 }
5866
5867 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
5868 LLVMTypeRef *params, unsigned *num_params)
5869 {
5870 params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
5871 params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
5872 params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
5873 params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
5874 }
5875
5876 enum {
5877 /* Convenient merged shader definitions. */
5878 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
5879 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
5880 };
5881
5882 static void create_function(struct si_shader_context *ctx)
5883 {
5884 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5885 struct gallivm_state *gallivm = &ctx->gallivm;
5886 struct si_shader *shader = ctx->shader;
5887 LLVMTypeRef params[100]; /* just make it large enough */
5888 LLVMTypeRef returns[16+32*4];
5889 unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5890 unsigned num_returns = 0;
5891 unsigned num_prolog_vgprs = 0;
5892 unsigned type = ctx->type;
5893
5894 /* Set MERGED shaders. */
5895 if (ctx->screen->b.chip_class >= GFX9) {
5896 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
5897 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
5898 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
5899 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
5900 }
5901
5902 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
5903
5904 switch (type) {
5905 case PIPE_SHADER_VERTEX:
5906 declare_default_desc_pointers(ctx, params, &num_params);
5907 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5908
5909 if (shader->key.as_es) {
5910 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5911 } else if (shader->key.as_ls) {
5912 /* no extra parameters */
5913 } else {
5914 if (shader->is_gs_copy_shader)
5915 num_params = ctx->param_rw_buffers + 1;
5916
5917 /* The locations of the other parameters are assigned dynamically. */
5918 declare_streamout_params(ctx, &shader->selector->so,
5919 params, ctx->i32, &num_params);
5920 }
5921
5922 last_sgpr = num_params-1;
5923
5924 /* VGPRs */
5925 declare_vs_input_vgprs(ctx, params, &num_params,
5926 &num_prolog_vgprs);
5927
5928 /* PrimitiveID output. */
5929 if (!shader->is_gs_copy_shader &&
5930 !shader->key.as_es && !shader->key.as_ls) {
5931 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5932 returns[num_returns++] = ctx->f32;
5933 }
5934 break;
5935
5936 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
5937 declare_default_desc_pointers(ctx, params, &num_params);
5938 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5939 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5940 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5941 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5942 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5943 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5944 last_sgpr = num_params - 1;
5945
5946 /* VGPRs */
5947 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5948 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5949
5950 /* param_tcs_offchip_offset and param_tcs_factor_offset are
5951 * placed after the user SGPRs.
5952 */
5953 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
5954 returns[num_returns++] = ctx->i32; /* SGPRs */
5955 for (i = 0; i < 3; i++)
5956 returns[num_returns++] = ctx->f32; /* VGPRs */
5957 break;
5958
5959 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
5960 /* Merged stages have 8 system SGPRs at the beginning. */
5961 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
5962 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5963 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5964 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
5965 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5966 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
5967 params[num_params++] = ctx->i32; /* unused */
5968 params[num_params++] = ctx->i32; /* unused */
5969
5970 params[num_params++] = ctx->i32; /* unused */
5971 params[num_params++] = ctx->i32; /* unused */
5972 declare_per_stage_desc_pointers(ctx, params, &num_params,
5973 ctx->type == PIPE_SHADER_VERTEX);
5974 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5975
5976 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5977 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5978 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5979 params[num_params++] = ctx->i32; /* unused */
5980
5981 declare_per_stage_desc_pointers(ctx, params, &num_params,
5982 ctx->type == PIPE_SHADER_TESS_CTRL);
5983 last_sgpr = num_params - 1;
5984
5985 /* VGPRs (first TCS, then VS) */
5986 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5987 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5988
5989 if (ctx->type == PIPE_SHADER_VERTEX) {
5990 declare_vs_input_vgprs(ctx, params, &num_params,
5991 &num_prolog_vgprs);
5992
5993 /* LS return values are inputs to the TCS main shader part. */
5994 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
5995 returns[num_returns++] = ctx->i32; /* SGPRs */
5996 for (i = 0; i < 2; i++)
5997 returns[num_returns++] = ctx->f32; /* VGPRs */
5998 } else {
5999 /* TCS return values are inputs to the TCS epilog.
6000 *
6001 * param_tcs_offchip_offset, param_tcs_factor_offset,
6002 * param_tcs_offchip_layout, and param_rw_buffers
6003 * should be passed to the epilog.
6004 */
6005 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT; i++)
6006 returns[num_returns++] = ctx->i32; /* SGPRs */
6007 for (i = 0; i < 3; i++)
6008 returns[num_returns++] = ctx->f32; /* VGPRs */
6009 }
6010 break;
6011
6012 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
6013 /* Merged stages have 8 system SGPRs at the beginning. */
6014 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
6015 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
6016 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6017 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
6018 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6019 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
6020 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
6021 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
6022
6023 params[num_params++] = ctx->i32; /* unused */
6024 params[num_params++] = ctx->i32; /* unused */
6025 declare_per_stage_desc_pointers(ctx, params, &num_params,
6026 (ctx->type == PIPE_SHADER_VERTEX ||
6027 ctx->type == PIPE_SHADER_TESS_EVAL));
6028 if (ctx->type == PIPE_SHADER_VERTEX) {
6029 declare_vs_specific_input_sgprs(ctx, params, &num_params);
6030 } else {
6031 /* TESS_EVAL (and also GEOMETRY):
6032 * Declare as many input SGPRs as the VS has. */
6033 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6034 params[num_params++] = ctx->i32; /* unused */
6035 params[num_params++] = ctx->i32; /* unused */
6036 params[num_params++] = ctx->i32; /* unused */
6037 params[num_params++] = ctx->i32; /* unused */
6038 params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
6039 }
6040
6041 declare_per_stage_desc_pointers(ctx, params, &num_params,
6042 ctx->type == PIPE_SHADER_GEOMETRY);
6043 last_sgpr = num_params - 1;
6044
6045 /* VGPRs (first GS, then VS/TES) */
6046 params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
6047 params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
6048 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6049 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6050 params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
6051
6052 if (ctx->type == PIPE_SHADER_VERTEX) {
6053 declare_vs_input_vgprs(ctx, params, &num_params,
6054 &num_prolog_vgprs);
6055 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
6056 declare_tes_input_vgprs(ctx, params, &num_params);
6057 }
6058
6059 if (ctx->type == PIPE_SHADER_VERTEX ||
6060 ctx->type == PIPE_SHADER_TESS_EVAL) {
6061 /* ES return values are inputs to GS. */
6062 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
6063 returns[num_returns++] = ctx->i32; /* SGPRs */
6064 for (i = 0; i < 5; i++)
6065 returns[num_returns++] = ctx->f32; /* VGPRs */
6066 }
6067 break;
6068
6069 case PIPE_SHADER_TESS_EVAL:
6070 declare_default_desc_pointers(ctx, params, &num_params);
6071 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6072
6073 if (shader->key.as_es) {
6074 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6075 params[num_params++] = ctx->i32;
6076 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
6077 } else {
6078 params[num_params++] = ctx->i32;
6079 declare_streamout_params(ctx, &shader->selector->so,
6080 params, ctx->i32, &num_params);
6081 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6082 }
6083 last_sgpr = num_params - 1;
6084
6085 /* VGPRs */
6086 declare_tes_input_vgprs(ctx, params, &num_params);
6087
6088 /* PrimitiveID output. */
6089 if (!shader->key.as_es)
6090 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
6091 returns[num_returns++] = ctx->f32;
6092 break;
6093
6094 case PIPE_SHADER_GEOMETRY:
6095 declare_default_desc_pointers(ctx, params, &num_params);
6096 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6097 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
6098 last_sgpr = num_params - 1;
6099
6100 /* VGPRs */
6101 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
6102 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
6103 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6104 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
6105 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
6106 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
6107 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
6108 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6109 break;
6110
6111 case PIPE_SHADER_FRAGMENT:
6112 declare_default_desc_pointers(ctx, params, &num_params);
6113 params[SI_PARAM_ALPHA_REF] = ctx->f32;
6114 params[SI_PARAM_PRIM_MASK] = ctx->i32;
6115 last_sgpr = SI_PARAM_PRIM_MASK;
6116 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
6117 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
6118 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
6119 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
6120 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
6121 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
6122 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
6123 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
6124 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
6125 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
6126 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
6127 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
6128 params[SI_PARAM_FRONT_FACE] = ctx->i32;
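/* FRONT_FACE is the 21st input VGPR: 15 interpolation VGPRs, the line
 * stipple tex coord, and 4 position floats precede it, hence index 20.
 */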
6129 shader->info.face_vgpr_index = 20;
6130 params[SI_PARAM_ANCILLARY] = ctx->i32;
6131 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
6132 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
6133 num_params = SI_PARAM_POS_FIXED_PT+1;
6134
6135 /* Color inputs from the prolog. */
6136 if (shader->selector->info.colors_read) {
6137 unsigned num_color_elements =
6138 util_bitcount(shader->selector->info.colors_read);
6139
6140 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
6141 for (i = 0; i < num_color_elements; i++)
6142 params[num_params++] = ctx->f32;
6143
6144 num_prolog_vgprs += num_color_elements;
6145 }
6146
6147 /* Outputs for the epilog. */
6148 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
6149 num_returns =
6150 num_return_sgprs +
6151 util_bitcount(shader->selector->info.colors_written) * 4 +
6152 shader->selector->info.writes_z +
6153 shader->selector->info.writes_stencil +
6154 shader->selector->info.writes_samplemask +
6155 1 /* SampleMaskIn */;
6156
6157 num_returns = MAX2(num_returns,
6158 num_return_sgprs +
6159 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6160
6161 for (i = 0; i < num_return_sgprs; i++)
6162 returns[i] = ctx->i32;
6163 for (; i < num_returns; i++)
6164 returns[i] = ctx->f32;
6165 break;
6166
6167 case PIPE_SHADER_COMPUTE:
6168 declare_default_desc_pointers(ctx, params, &num_params);
6169 params[SI_PARAM_GRID_SIZE] = v3i32;
6170 params[SI_PARAM_BLOCK_SIZE] = v3i32;
6171 params[SI_PARAM_BLOCK_ID] = v3i32;
6172 last_sgpr = SI_PARAM_BLOCK_ID;
6173
6174 params[SI_PARAM_THREAD_ID] = v3i32;
6175 num_params = SI_PARAM_THREAD_ID + 1;
6176 break;
6177 default:
6178 assert(0 && "unimplemented shader");
6179 return;
6180 }
6181
6182 assert(num_params <= ARRAY_SIZE(params));
6183
6184 si_create_function(ctx, "main", returns, num_returns, params,
6185 num_params, last_sgpr);
6186
6187 /* Reserve register locations for VGPR inputs the PS prolog may need. */
6188 if (ctx->type == PIPE_SHADER_FRAGMENT &&
6189 ctx->separate_prolog) {
6190 si_llvm_add_attribute(ctx->main_fn,
6191 "InitialPSInputAddr",
6192 S_0286D0_PERSP_SAMPLE_ENA(1) |
6193 S_0286D0_PERSP_CENTER_ENA(1) |
6194 S_0286D0_PERSP_CENTROID_ENA(1) |
6195 S_0286D0_LINEAR_SAMPLE_ENA(1) |
6196 S_0286D0_LINEAR_CENTER_ENA(1) |
6197 S_0286D0_LINEAR_CENTROID_ENA(1) |
6198 S_0286D0_FRONT_FACE_ENA(1) |
6199 S_0286D0_POS_FIXED_PT_ENA(1));
6200 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
6201 si_llvm_add_attribute(ctx->main_fn,
6202 "amdgpu-max-work-group-size",
6203 si_get_max_workgroup_size(shader));
6204 }
6205
6206 shader->info.num_input_sgprs = 0;
6207 shader->info.num_input_vgprs = 0;
6208
6209 for (i = 0; i <= last_sgpr; ++i)
6210 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
6211
6212 for (; i < num_params; ++i)
6213 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
6214
6215 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
6216 shader->info.num_input_vgprs -= num_prolog_vgprs;
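/* VGPRs produced by the prolog (e.g. vertex load indices) are not
 * hardware inputs, so they are not counted here.
 */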
6217
6218 if (!ctx->screen->has_ds_bpermute &&
6219 bld_base->info &&
6220 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
6221 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
6222 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
6223 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
6224 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
6225 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
6226 ctx->lds =
6227 LLVMAddGlobalInAddressSpace(gallivm->module,
6228 LLVMArrayType(ctx->i32, 64),
6229 "ddxy_lds",
6230 LOCAL_ADDR_SPACE);
6231
6232 if (shader->key.as_ls ||
6233 ctx->type == PIPE_SHADER_TESS_CTRL ||
6234 /* GFX9 has the ESGS ring buffer in LDS. */
6235 (ctx->screen->b.chip_class >= GFX9 &&
6236 (shader->key.as_es ||
6237 ctx->type == PIPE_SHADER_GEOMETRY)))
6238 declare_lds_as_pointer(ctx);
6239 }
6240
6241 /**
6242 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
6243 * for later use.
6244 */
6245 static void preload_ring_buffers(struct si_shader_context *ctx)
6246 {
6247 struct gallivm_state *gallivm = &ctx->gallivm;
6248 LLVMBuilderRef builder = gallivm->builder;
6249
6250 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
6251 ctx->param_rw_buffers);
6252
6253 if (ctx->screen->b.chip_class <= VI &&
6254 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
6255 unsigned ring =
6256 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
6257 : SI_ES_RING_ESGS;
6258 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
6259
6260 ctx->esgs_ring =
6261 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6262 }
6263
6264 if (ctx->shader->is_gs_copy_shader) {
6265 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6266
6267 ctx->gsvs_ring[0] =
6268 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6269 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
6270 const struct si_shader_selector *sel = ctx->shader->selector;
6271 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6272 LLVMValueRef base_ring;
6273
6274 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6275
6276 /* The conceptual layout of the GSVS ring is
6277 * v0c0 .. vLc0 v0c1 .. vLc1 ..
6278 * but the real memory layout is swizzled across
6279 * threads:
6280 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
6281 * t16v0c0 ..
6282 * Override the buffer descriptor accordingly.
6283 */
6284 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
6285 uint64_t stream_offset = 0;
6286
6287 for (unsigned stream = 0; stream < 4; ++stream) {
6288 unsigned num_components;
6289 unsigned stride;
6290 unsigned num_records;
6291 LLVMValueRef ring, tmp;
6292
6293 num_components = sel->info.num_stream_output_components[stream];
6294 if (!num_components)
6295 continue;
6296
6297 stride = 4 * num_components * sel->gs_max_out_vertices;
6298
6299 /* Limit on the stride field for <= CIK. */
6300 assert(stride < (1 << 14));
6301
6302 num_records = 64;
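/* Each stream gets a 64-record window; stream_offset advances by
 * stride * 64 bytes below so the windows do not overlap.
 */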
6303
6304 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
6305 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
6306 tmp = LLVMBuildAdd(builder, tmp,
6307 LLVMConstInt(ctx->i64,
6308 stream_offset, 0), "");
6309 stream_offset += stride * 64;
6310
6311 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
6312 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
6313 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
6314 tmp = LLVMBuildOr(builder, tmp,
6315 LLVMConstInt(ctx->i32,
6316 S_008F04_STRIDE(stride) |
6317 S_008F04_SWIZZLE_ENABLE(1), 0), "");
6318 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
6319 ring = LLVMBuildInsertElement(builder, ring,
6320 LLVMConstInt(ctx->i32, num_records, 0),
6321 LLVMConstInt(ctx->i32, 2, 0), "");
6322 ring = LLVMBuildInsertElement(builder, ring,
6323 LLVMConstInt(ctx->i32,
6324 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
6325 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6326 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
6327 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
6328 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6329 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
6330 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
6331 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
6332 S_008F0C_ADD_TID_ENABLE(1),
6333 0),
6334 LLVMConstInt(ctx->i32, 3, 0), "");
6335 ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
6336
6337 ctx->gsvs_ring[stream] = ring;
6338 }
6339 }
6340 }
6341
6342 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6343 LLVMValueRef param_rw_buffers,
6344 unsigned param_pos_fixed_pt)
6345 {
6346 struct gallivm_state *gallivm = &ctx->gallivm;
6347 LLVMBuilderRef builder = gallivm->builder;
6348 LLVMValueRef slot, desc, offset, row, bit, address[2];
6349
6350 /* Use the fixed-point gl_FragCoord input.
6351 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6352 * per coordinate to get the repeating effect.
6353 */
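/* E.g. a fragment at x = 35 tests bit 35 & 31 = 3 of stipple row y & 31. */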
6354 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6355 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6356
6357 /* Load the buffer descriptor. */
6358 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6359 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6360
6361 /* The stipple pattern is 32x32, each row has 32 bits. */
6362 offset = LLVMBuildMul(builder, address[1],
6363 LLVMConstInt(ctx->i32, 4, 0), "");
6364 row = buffer_load_const(ctx, desc, offset);
6365 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6366 bit = LLVMBuildLShr(builder, row, address[0], "");
6367 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6368
6369 /* The intrinsic kills the thread if arg < 0. */
6370 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6371 LLVMConstReal(ctx->f32, -1), "");
6372 ac_build_kill(&ctx->ac, bit);
6373 }
6374
6375 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6376 struct si_shader_config *conf,
6377 unsigned symbol_offset)
6378 {
6379 unsigned i;
6380 const unsigned char *config =
6381 ac_shader_binary_config_start(binary, symbol_offset);
6382 bool really_needs_scratch = false;
6383
6384 /* LLVM adds SGPR spills to the scratch size.
6385 * Find out if we really need the scratch buffer.
6386 */
6387 for (i = 0; i < binary->reloc_count; i++) {
6388 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6389
6390 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6391 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6392 really_needs_scratch = true;
6393 break;
6394 }
6395 }
6396
6397 /* XXX: We may be able to emit some of these values directly rather than
6398 * extracting fields to be emitted later.
6399 */
6400
6401 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
6402 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6403 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6404 switch (reg) {
6405 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6406 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6407 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6408 case R_00B848_COMPUTE_PGM_RSRC1:
6409 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6410 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6411 conf->float_mode = G_00B028_FLOAT_MODE(value);
6412 conf->rsrc1 = value;
6413 break;
6414 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6415 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6416 break;
6417 case R_00B84C_COMPUTE_PGM_RSRC2:
6418 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6419 conf->rsrc2 = value;
6420 break;
6421 case R_0286CC_SPI_PS_INPUT_ENA:
6422 conf->spi_ps_input_ena = value;
6423 break;
6424 case R_0286D0_SPI_PS_INPUT_ADDR:
6425 conf->spi_ps_input_addr = value;
6426 break;
6427 case R_0286E8_SPI_TMPRING_SIZE:
6428 case R_00B860_COMPUTE_TMPRING_SIZE:
6429 /* WAVESIZE is in units of 256 dwords. */
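/* E.g. WAVESIZE = 2 -> 2 * 256 dwords * 4 bytes = 2048 bytes per wave. */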
6430 if (really_needs_scratch)
6431 conf->scratch_bytes_per_wave =
6432 G_00B860_WAVESIZE(value) * 256 * 4;
6433 break;
6434 case 0x4: /* SPILLED_SGPRS */
6435 conf->spilled_sgprs = value;
6436 break;
6437 case 0x8: /* SPILLED_VGPRS */
6438 conf->spilled_vgprs = value;
6439 break;
6440 default:
6441 {
6442 static bool printed;
6443
6444 if (!printed) {
6445 fprintf(stderr, "Warning: LLVM emitted unknown "
6446 "config register: 0x%x\n", reg);
6447 printed = true;
6448 }
6449 }
6450 break;
6451 }
6452 }
6453
6454 if (!conf->spi_ps_input_addr)
6455 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6456 }
6457
6458 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6459 struct si_shader *shader,
6460 struct si_shader_config *config,
6461 uint64_t scratch_va)
6462 {
6463 unsigned i;
6464 uint32_t scratch_rsrc_dword0 = scratch_va;
6465 uint32_t scratch_rsrc_dword1 =
6466 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6467
6468 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6469 * correctly.
6470 */
6471 if (HAVE_LLVM >= 0x0309)
6472 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6473 else
6474 scratch_rsrc_dword1 |=
6475 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6476
6477 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6478 const struct ac_shader_reloc *reloc =
6479 &shader->binary.relocs[i];
6480 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6481 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6482 &scratch_rsrc_dword0, 4);
6483 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6484 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6485 &scratch_rsrc_dword1, 4);
6486 }
6487 }
6488 }
6489
6490 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6491 {
6492 unsigned size = shader->binary.code_size;
6493
6494 if (shader->prolog)
6495 size += shader->prolog->binary.code_size;
6496 if (shader->previous_stage)
6497 size += shader->previous_stage->binary.code_size;
6498 if (shader->prolog2)
6499 size += shader->prolog2->binary.code_size;
6500 if (shader->epilog)
6501 size += shader->epilog->binary.code_size;
6502 return size;
6503 }
6504
6505 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6506 {
6507 const struct ac_shader_binary *prolog =
6508 shader->prolog ? &shader->prolog->binary : NULL;
6509 const struct ac_shader_binary *previous_stage =
6510 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6511 const struct ac_shader_binary *prolog2 =
6512 shader->prolog2 ? &shader->prolog2->binary : NULL;
6513 const struct ac_shader_binary *epilog =
6514 shader->epilog ? &shader->epilog->binary : NULL;
6515 const struct ac_shader_binary *mainb = &shader->binary;
6516 unsigned bo_size = si_get_shader_binary_size(shader) +
6517 (!epilog ? mainb->rodata_size : 0);
6518 unsigned char *ptr;
6519
6520 assert(!prolog || !prolog->rodata_size);
6521 assert(!previous_stage || !previous_stage->rodata_size);
6522 assert(!prolog2 || !prolog2->rodata_size);
6523 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
6524 !mainb->rodata_size);
6525 assert(!epilog || !epilog->rodata_size);
6526
6527 /* GFX9 can fetch at most 128 bytes past the end of the shader.
6528 * Prevent VM faults.
6529 */
6530 if (sscreen->b.chip_class >= GFX9)
6531 bo_size += 128;
6532
6533 r600_resource_reference(&shader->bo, NULL);
6534 shader->bo = (struct r600_resource*)
6535 pipe_buffer_create(&sscreen->b.b, 0,
6536 PIPE_USAGE_IMMUTABLE,
6537 align(bo_size, SI_CPDMA_ALIGNMENT));
6538 if (!shader->bo)
6539 return -ENOMEM;
6540
6541 /* Upload. */
6542 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6543 PIPE_TRANSFER_READ_WRITE |
6544 PIPE_TRANSFER_UNSYNCHRONIZED);
6545
6546 if (prolog) {
6547 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6548 ptr += prolog->code_size;
6549 }
6550 if (previous_stage) {
6551 util_memcpy_cpu_to_le32(ptr, previous_stage->code,
6552 previous_stage->code_size);
6553 ptr += previous_stage->code_size;
6554 }
6555 if (prolog2) {
6556 util_memcpy_cpu_to_le32(ptr, prolog2->code, prolog2->code_size);
6557 ptr += prolog2->code_size;
6558 }
6559
6560 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6561 ptr += mainb->code_size;
6562
6563 if (epilog)
6564 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6565 else if (mainb->rodata_size > 0)
6566 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6567
6568 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6569 return 0;
6570 }
6571
6572 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6573 struct pipe_debug_callback *debug,
6574 const char *name, FILE *file)
6575 {
6576 char *line, *p;
6577 unsigned i, count;
6578
6579 if (binary->disasm_string) {
6580 fprintf(file, "Shader %s disassembly:\n", name);
6581 fprintf(file, "%s", binary->disasm_string);
6582
6583 if (debug && debug->debug_message) {
6584 /* Very long debug messages are cut off, so send the
6585 * disassembly one line at a time. This causes more
6586 * overhead, but on the plus side it simplifies
6587 * parsing of resulting logs.
6588 */
6589 pipe_debug_message(debug, SHADER_INFO,
6590 "Shader Disassembly Begin");
6591
6592 line = binary->disasm_string;
6593 while (*line) {
6594 p = util_strchrnul(line, '\n');
6595 count = p - line;
6596
6597 if (count) {
6598 pipe_debug_message(debug, SHADER_INFO,
6599 "%.*s", count, line);
6600 }
6601
6602 if (!*p)
6603 break;
6604 line = p + 1;
6605 }
6606
6607 pipe_debug_message(debug, SHADER_INFO,
6608 "Shader Disassembly End");
6609 }
6610 } else {
6611 fprintf(file, "Shader %s binary:\n", name);
6612 for (i = 0; i < binary->code_size; i += 4) {
6613 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6614 binary->code[i + 3], binary->code[i + 2],
6615 binary->code[i + 1], binary->code[i]);
6616 }
6617 }
6618 }
6619
6620 static void si_shader_dump_stats(struct si_screen *sscreen,
6621 struct si_shader *shader,
6622 struct pipe_debug_callback *debug,
6623 unsigned processor,
6624 FILE *file,
6625 bool check_debug_option)
6626 {
6627 struct si_shader_config *conf = &shader->config;
6628 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6629 unsigned code_size = si_get_shader_binary_size(shader);
6630 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6631 unsigned lds_per_wave = 0;
6632 unsigned max_simd_waves = 10;
6633
6634 /* Compute LDS usage for PS and CS. */
6635 switch (processor) {
6636 case PIPE_SHADER_FRAGMENT:
6637 /* The minimum usage per wave is (num_inputs * 48). The maximum
6638 * usage is (num_inputs * 48 * 16).
6639 * We can get anything in between and it varies between waves.
6640 *
6641 * The 48 bytes per input come from 4 bytes/component *
6642 * 4 components/input * 3 points of a primitive.
6643 *
6644 * Other stages either don't know the size at compile time, or
6645 * allocate LDS per thread group rather than per wave.
6646 */
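/* E.g. with lds_size = 2 and 8 PS inputs on CIK (512-byte
 * granularity): 2*512 + align(8*48, 512) = 1536 bytes per
 * wave (illustrative numbers).
 */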
6647 lds_per_wave = conf->lds_size * lds_increment +
6648 align(num_inputs * 48, lds_increment);
6649 break;
6650 case PIPE_SHADER_COMPUTE:
6651 if (shader->selector) {
6652 unsigned max_workgroup_size =
6653 si_get_max_workgroup_size(shader);
6654 lds_per_wave = (conf->lds_size * lds_increment) /
6655 DIV_ROUND_UP(max_workgroup_size, 64);
6656 }
6657 break;
6658 }
6659
6660 /* Compute the per-SIMD wave counts. */
6661 if (conf->num_sgprs) {
6662 if (sscreen->b.chip_class >= VI)
6663 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6664 else
6665 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6666 }
6667
6668 if (conf->num_vgprs)
6669 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
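/* E.g. on VI, 50 SGPRs allow 800/50 = 16 waves (capped at 10),
 * while 48 VGPRs allow only 256/48 = 5 waves (illustrative).
 */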
6670
6671 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6672 * 16KB makes some SIMDs unoccupied). */
6673 if (lds_per_wave)
6674 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6675
6676 if (!check_debug_option ||
6677 r600_can_dump_shader(&sscreen->b, processor)) {
6678 if (processor == PIPE_SHADER_FRAGMENT) {
6679 fprintf(file, "*** SHADER CONFIG ***\n"
6680 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6681 "SPI_PS_INPUT_ENA = 0x%04x\n",
6682 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6683 }
6684
6685 fprintf(file, "*** SHADER STATS ***\n"
6686 "SGPRS: %d\n"
6687 "VGPRS: %d\n"
6688 "Spilled SGPRs: %d\n"
6689 "Spilled VGPRs: %d\n"
6690 "Private memory VGPRs: %d\n"
6691 "Code Size: %d bytes\n"
6692 "LDS: %d blocks\n"
6693 "Scratch: %d bytes per wave\n"
6694 "Max Waves: %d\n"
6695 "********************\n\n\n",
6696 conf->num_sgprs, conf->num_vgprs,
6697 conf->spilled_sgprs, conf->spilled_vgprs,
6698 conf->private_mem_vgprs, code_size,
6699 conf->lds_size, conf->scratch_bytes_per_wave,
6700 max_simd_waves);
6701 }
6702
6703 pipe_debug_message(debug, SHADER_INFO,
6704 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6705 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6706 "Spilled VGPRs: %d PrivMem VGPRs: %d",
6707 conf->num_sgprs, conf->num_vgprs, code_size,
6708 conf->lds_size, conf->scratch_bytes_per_wave,
6709 max_simd_waves, conf->spilled_sgprs,
6710 conf->spilled_vgprs, conf->private_mem_vgprs);
6711 }
6712
6713 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6714 {
6715 switch (processor) {
6716 case PIPE_SHADER_VERTEX:
6717 if (shader->key.as_es)
6718 return "Vertex Shader as ES";
6719 else if (shader->key.as_ls)
6720 return "Vertex Shader as LS";
6721 else
6722 return "Vertex Shader as VS";
6723 case PIPE_SHADER_TESS_CTRL:
6724 return "Tessellation Control Shader";
6725 case PIPE_SHADER_TESS_EVAL:
6726 if (shader->key.as_es)
6727 return "Tessellation Evaluation Shader as ES";
6728 else
6729 return "Tessellation Evaluation Shader as VS";
6730 case PIPE_SHADER_GEOMETRY:
6731 if (shader->is_gs_copy_shader)
6732 return "GS Copy Shader as VS";
6733 else
6734 return "Geometry Shader";
6735 case PIPE_SHADER_FRAGMENT:
6736 return "Pixel Shader";
6737 case PIPE_SHADER_COMPUTE:
6738 return "Compute Shader";
6739 default:
6740 return "Unknown Shader";
6741 }
6742 }
6743
6744 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6745 struct pipe_debug_callback *debug, unsigned processor,
6746 FILE *file, bool check_debug_option)
6747 {
6748 if (!check_debug_option ||
6749 r600_can_dump_shader(&sscreen->b, processor))
6750 si_dump_shader_key(processor, shader, file);
6751
6752 if (!check_debug_option && shader->binary.llvm_ir_string) {
6753 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6754 si_get_shader_name(shader, processor));
6755 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6756 }
6757
6758 if (!check_debug_option ||
6759 (r600_can_dump_shader(&sscreen->b, processor) &&
6760 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6761 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6762
6763 if (shader->prolog)
6764 si_shader_dump_disassembly(&shader->prolog->binary,
6765 debug, "prolog", file);
6766 if (shader->previous_stage)
6767 si_shader_dump_disassembly(&shader->previous_stage->binary,
6768 debug, "previous stage", file);
6769 if (shader->prolog2)
6770 si_shader_dump_disassembly(&shader->prolog2->binary,
6771 debug, "prolog2", file);
6772
6773 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6774
6775 if (shader->epilog)
6776 si_shader_dump_disassembly(&shader->epilog->binary,
6777 debug, "epilog", file);
6778 fprintf(file, "\n");
6779 }
6780
6781 si_shader_dump_stats(sscreen, shader, debug, processor, file,
6782 check_debug_option);
6783 }
6784
6785 int si_compile_llvm(struct si_screen *sscreen,
6786 struct ac_shader_binary *binary,
6787 struct si_shader_config *conf,
6788 LLVMTargetMachineRef tm,
6789 LLVMModuleRef mod,
6790 struct pipe_debug_callback *debug,
6791 unsigned processor,
6792 const char *name)
6793 {
6794 int r = 0;
6795 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6796
6797 if (r600_can_dump_shader(&sscreen->b, processor)) {
6798 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6799
6800 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6801 fprintf(stderr, "%s LLVM IR:\n\n", name);
6802 ac_dump_module(mod);
6803 fprintf(stderr, "\n");
6804 }
6805 }
6806
6807 if (sscreen->record_llvm_ir) {
6808 char *ir = LLVMPrintModuleToString(mod);
6809 binary->llvm_ir_string = strdup(ir);
6810 LLVMDisposeMessage(ir);
6811 }
6812
6813 if (!si_replace_shader(count, binary)) {
6814 r = si_llvm_compile(mod, binary, tm, debug);
6815 if (r)
6816 return r;
6817 }
6818
6819 si_shader_binary_read_config(binary, conf, 0);
6820
6821 /* Enable 64-bit and 16-bit denormals, because there is no performance
6822 * cost.
6823 *
6824 * If denormals are enabled, all floating-point output modifiers are
6825 * ignored.
6826 *
6827 * Don't enable denormals for 32-bit floats, because:
6828 * - Floating-point output modifiers would be ignored by the hw.
6829 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6830 * have to stop using those.
6831 * - SI & CI would be very slow.
6832 */
6833 conf->float_mode |= V_00B028_FP_64_DENORMS;
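/* (The 16-bit and 64-bit denormal modes share one FP_DENORM
 * control, so this flag enables both.)
 */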
6834
6835 FREE(binary->config);
6836 FREE(binary->global_symbol_offsets);
6837 binary->config = NULL;
6838 binary->global_symbol_offsets = NULL;
6839
6840 /* Some shaders can't have rodata because their binaries can be
6841 * concatenated.
6842 */
6843 if (binary->rodata_size &&
6844 (processor == PIPE_SHADER_VERTEX ||
6845 processor == PIPE_SHADER_TESS_CTRL ||
6846 processor == PIPE_SHADER_TESS_EVAL ||
6847 processor == PIPE_SHADER_FRAGMENT)) {
6848 fprintf(stderr, "radeonsi: The shader can't have rodata.\n");
6849 return -EINVAL;
6850 }
6851
6852 return r;
6853 }
6854
6855 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6856 {
6857 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6858 LLVMBuildRetVoid(ctx->gallivm.builder);
6859 else
6860 LLVMBuildRet(ctx->gallivm.builder, ret);
6861 }
6862
6863 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6864 struct si_shader *
6865 si_generate_gs_copy_shader(struct si_screen *sscreen,
6866 LLVMTargetMachineRef tm,
6867 struct si_shader_selector *gs_selector,
6868 struct pipe_debug_callback *debug)
6869 {
6870 struct si_shader_context ctx;
6871 struct si_shader *shader;
6872 struct gallivm_state *gallivm = &ctx.gallivm;
6873 LLVMBuilderRef builder;
6874 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6875 struct lp_build_context *uint = &bld_base->uint_bld;
6876 struct si_shader_output_values *outputs;
6877 struct tgsi_shader_info *gsinfo = &gs_selector->info;
6878 int i, r;
6879
6880 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6881
6882 if (!outputs)
6883 return NULL;
6884
6885 shader = CALLOC_STRUCT(si_shader);
6886 if (!shader) {
6887 FREE(outputs);
6888 return NULL;
6889 }
6890
6891
6892 shader->selector = gs_selector;
6893 shader->is_gs_copy_shader = true;
6894
6895 si_init_shader_ctx(&ctx, sscreen, tm);
6896 ctx.shader = shader;
6897 ctx.type = PIPE_SHADER_VERTEX;
6898
6899 builder = gallivm->builder;
6900
6901 create_function(&ctx);
6902 preload_ring_buffers(&ctx);
6903
6904 LLVMValueRef voffset =
6905 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6906 ctx.param_vertex_id), 4);
6907
6908 /* Fetch the vertex stream ID. */
6909 LLVMValueRef stream_id;
6910
6911 if (gs_selector->so.num_outputs)
6912 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6913 else
6914 stream_id = ctx.i32_0;
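/* (unpack_param extracts bits [25:24] of the streamout config
 * SGPR; without streamout only stream 0 is read.)
 */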
6915
6916 /* Fill in output information. */
6917 for (i = 0; i < gsinfo->num_outputs; ++i) {
6918 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6919 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6920
6921 for (int chan = 0; chan < 4; chan++) {
6922 outputs[i].vertex_stream[chan] =
6923 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6924 }
6925 }
6926
6927 LLVMBasicBlockRef end_bb;
6928 LLVMValueRef switch_inst;
6929
6930 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6931 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
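/* One switch case per possible vertex stream; only stream 0 is
 * exported to the next stage, the others feed streamout only.
 */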
6932
6933 for (int stream = 0; stream < 4; stream++) {
6934 LLVMBasicBlockRef bb;
6935 unsigned offset;
6936
6937 if (!gsinfo->num_stream_output_components[stream])
6938 continue;
6939
6940 if (stream > 0 && !gs_selector->so.num_outputs)
6941 continue;
6942
6943 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6944 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
6945 LLVMPositionBuilderAtEnd(builder, bb);
6946
6947 /* Fetch vertex data from GSVS ring */
6948 offset = 0;
6949 for (i = 0; i < gsinfo->num_outputs; ++i) {
6950 for (unsigned chan = 0; chan < 4; chan++) {
6951 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6952 outputs[i].vertex_stream[chan] != stream) {
6953 outputs[i].values[chan] = ctx.bld_base.base.undef;
6954 continue;
6955 }
6956
6957 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
6958 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
6959 offset++;
6960
6961 outputs[i].values[chan] =
6962 ac_build_buffer_load(&ctx.ac,
6963 ctx.gsvs_ring[0], 1,
6964 ctx.i32_0, voffset,
6965 soffset, 0, 1, 1, true);
6966 }
6967 }
6968
6969 /* Streamout and exports. */
6970 if (gs_selector->so.num_outputs) {
6971 si_llvm_emit_streamout(&ctx, outputs,
6972 gsinfo->num_outputs,
6973 stream);
6974 }
6975
6976 if (stream == 0)
6977 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6978
6979 LLVMBuildBr(builder, end_bb);
6980 }
6981
6982 LLVMPositionBuilderAtEnd(builder, end_bb);
6983
6984 LLVMBuildRetVoid(gallivm->builder);
6985
6986 /* Dump LLVM IR before any optimization passes */
6987 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6988 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6989 ac_dump_module(ctx.gallivm.module);
6990
6991 si_llvm_finalize_module(&ctx,
6992 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6993
6994 r = si_compile_llvm(sscreen, &ctx.shader->binary,
6995 &ctx.shader->config, ctx.tm,
6996 ctx.gallivm.module,
6997 debug, PIPE_SHADER_GEOMETRY,
6998 "GS Copy Shader");
6999 if (!r) {
7000 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
7001 fprintf(stderr, "GS Copy Shader:\n");
7002 si_shader_dump(sscreen, ctx.shader, debug,
7003 PIPE_SHADER_GEOMETRY, stderr, true);
7004 r = si_shader_binary_upload(sscreen, ctx.shader);
7005 }
7006
7007 si_llvm_dispose(&ctx);
7008
7009 FREE(outputs);
7010
7011 if (r != 0) {
7012 FREE(shader);
7013 shader = NULL;
7014 }
7015 return shader;
7016 }
7017
7018 static void si_dump_shader_key_vs(struct si_shader_key *key,
7019 struct si_vs_prolog_bits *prolog,
7020 const char *prefix, FILE *f)
7021 {
7022 fprintf(f, " %s.instance_divisors = {", prefix);
7023 for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
7024 fprintf(f, !i ? "%u" : ", %u",
7025 prolog->instance_divisors[i]);
7026 }
7027 fprintf(f, "}\n");
7028
7029 fprintf(f, " mono.vs.fix_fetch = {");
7030 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
7031 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
7032 fprintf(f, "}\n");
7033 }
7034
7035 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
7036 FILE *f)
7037 {
7038 struct si_shader_key *key = &shader->key;
7039
7040 fprintf(f, "SHADER KEY\n");
7041
7042 switch (processor) {
7043 case PIPE_SHADER_VERTEX:
7044 si_dump_shader_key_vs(key, &key->part.vs.prolog,
7045 "part.vs.prolog", f);
7046 fprintf(f, " as_es = %u\n", key->as_es);
7047 fprintf(f, " as_ls = %u\n", key->as_ls);
7048 fprintf(f, " part.vs.epilog.export_prim_id = %u\n",
7049 key->part.vs.epilog.export_prim_id);
7050 break;
7051
7052 case PIPE_SHADER_TESS_CTRL:
7053 if (shader->selector->screen->b.chip_class >= GFX9) {
7054 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
7055 "part.tcs.ls_prolog", f);
7056 }
7057 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
7058 fprintf(f, " mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
7059 break;
7060
7061 case PIPE_SHADER_TESS_EVAL:
7062 fprintf(f, " part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
7063 fprintf(f, " as_es = %u\n", key->as_es);
7064 break;
7065
7066 case PIPE_SHADER_GEOMETRY:
7067 if (shader->is_gs_copy_shader)
7068 break;
7069
7070 if (shader->selector->screen->b.chip_class >= GFX9 &&
7071 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
7072 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
7073 "part.gs.vs_prolog", f);
7074 }
7075 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
7076 break;
7077
7078 case PIPE_SHADER_COMPUTE:
7079 break;
7080
7081 case PIPE_SHADER_FRAGMENT:
7082 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
7083 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
7084 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
7085 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
7086 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
7087 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
7088 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
7089 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
7090 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
7091 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
7092 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
7093 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
7094 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
7095 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
7096 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
7097 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
7098 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
7099 break;
7100
7101 default:
7102 assert(0);
7103 }
7104
7105 if ((processor == PIPE_SHADER_GEOMETRY ||
7106 processor == PIPE_SHADER_TESS_EVAL ||
7107 processor == PIPE_SHADER_VERTEX) &&
7108 !key->as_es && !key->as_ls) {
7109 fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
7110 fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
7111 fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
7112 }
7113 }
7114
7115 static void si_init_shader_ctx(struct si_shader_context *ctx,
7116 struct si_screen *sscreen,
7117 LLVMTargetMachineRef tm)
7118 {
7119 struct lp_build_tgsi_context *bld_base;
7120 struct lp_build_tgsi_action tmpl = {};
7121
7122 si_llvm_context_init(ctx, sscreen, tm);
7123
7124 bld_base = &ctx->bld_base;
7125 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
7126
7127 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
7128 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
7129 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
7130
7131 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
7132 bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
7133 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
7134 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
7135 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
7136 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
7137 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
7138 bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
7139 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
7140 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
7141 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
7142 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
7143 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
7144 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
7145 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
7146 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
7147
7148 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
7149 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
7150 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
7151 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
7152 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
7153 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
7154
7155 tmpl.fetch_args = atomic_fetch_args;
7156 tmpl.emit = atomic_emit;
7157 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
7158 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
7159 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
7160 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
7161 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
7162 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
7163 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
7164 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
7165 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
7166 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
7167 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
7168 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
7169 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
7170 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
7171 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
7172 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
7173 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
7174 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
7175 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
7176 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
7177
7178 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
7179
7180 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
7181
7182 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
7183 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
7184 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
7185 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
7186
7187 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
7188 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
7189 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
7190 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
7191 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
7192 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
7193 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
7194 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
7195 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
7196
7197 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
7198 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
7199 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
7200 }
7201
7202 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
7203 {
7204 struct si_shader *shader = ctx->shader;
7205 struct tgsi_shader_info *info = &shader->selector->info;
7206
7207 if (ctx->type == PIPE_SHADER_FRAGMENT ||
7208 ctx->type == PIPE_SHADER_COMPUTE ||
7209 shader->key.as_es ||
7210 shader->key.as_ls)
7211 return;
7212
7213 ac_eliminate_const_vs_outputs(&ctx->ac,
7214 ctx->main_fn,
7215 shader->info.vs_output_param_offset,
7216 info->num_outputs,
7217 &shader->info.nr_param_exports);
7218 }
7219
7220 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
7221 {
7222 ctx->shader->config.private_mem_vgprs = 0;
7223
7224 /* Process all LLVM instructions. */
7225 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
7226 while (bb) {
7227 LLVMValueRef next = LLVMGetFirstInstruction(bb);
7228
7229 while (next) {
7230 LLVMValueRef inst = next;
7231 next = LLVMGetNextInstruction(next);
7232
7233 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
7234 continue;
7235
7236 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
7237 /* No idea why LLVM aligns allocas to 4 elements. */
7238 unsigned alignment = LLVMGetAlignment(inst);
7239 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
7240 ctx->shader->config.private_mem_vgprs += dw_size;
7241 }
7242 bb = LLVMGetNextBasicBlock(bb);
7243 }
7244 }
7245
7246 static void si_init_exec_full_mask(struct si_shader_context *ctx)
7247 {
7248 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
7249 lp_build_intrinsic(ctx->gallivm.builder,
7250 "llvm.amdgcn.init.exec", ctx->voidt,
7251 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
7252 }
7253
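/* Set EXEC from an input SGPR: the live-thread count is read
 * at \p bitoffset within that SGPR (used by GFX9 merged
 * shaders, where both halves share one wave).
 */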
7254 static void si_init_exec_from_input(struct si_shader_context *ctx,
7255 unsigned param, unsigned bitoffset)
7256 {
7257 LLVMValueRef args[] = {
7258 LLVMGetParam(ctx->main_fn, param),
7259 LLVMConstInt(ctx->i32, bitoffset, 0),
7260 };
7261 lp_build_intrinsic(ctx->gallivm.builder,
7262 "llvm.amdgcn.init.exec.from.input",
7263 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
7264 }
7265
7266 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
7267 bool is_monolithic)
7268 {
7269 struct si_shader *shader = ctx->shader;
7270 struct si_shader_selector *sel = shader->selector;
7271 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7272
7273 switch (ctx->type) {
7274 case PIPE_SHADER_VERTEX:
7275 ctx->load_input = declare_input_vs;
7276 if (shader->key.as_ls)
7277 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
7278 else if (shader->key.as_es)
7279 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7280 else
7281 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7282 break;
7283 case PIPE_SHADER_TESS_CTRL:
7284 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
7285 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
7286 bld_base->emit_store = store_output_tcs;
7287 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
7288 break;
7289 case PIPE_SHADER_TESS_EVAL:
7290 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
7291 if (shader->key.as_es)
7292 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7293 else
7294 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7295 break;
7296 case PIPE_SHADER_GEOMETRY:
7297 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
7298 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
7299 break;
7300 case PIPE_SHADER_FRAGMENT:
7301 ctx->load_input = declare_input_fs;
7302 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
7303 break;
7304 case PIPE_SHADER_COMPUTE:
7305 ctx->declare_memory_region = declare_compute_memory;
7306 break;
7307 default:
7308 assert(!"Unsupported shader type");
7309 return false;
7310 }
7311
7312 create_function(ctx);
7313 preload_ring_buffers(ctx);
7314
7315 /* For GFX9 merged shaders:
7316 * - Set EXEC. If the prolog is present, set EXEC there instead.
7317 * - Add a barrier before the second shader.
7318 *
7319 * The same thing for monolithic shaders is done in
7320 * si_build_wrapper_function.
7321 */
7322 if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
7323 if (sel->info.num_instructions > 1 && /* not empty shader */
7324 (shader->key.as_es || shader->key.as_ls) &&
7325 (ctx->type == PIPE_SHADER_TESS_EVAL ||
7326 (ctx->type == PIPE_SHADER_VERTEX &&
7327 !sel->vs_needs_prolog))) {
7328 si_init_exec_from_input(ctx,
7329 ctx->param_merged_wave_info, 0);
7330 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
7331 ctx->type == PIPE_SHADER_GEOMETRY) {
7332 si_init_exec_from_input(ctx,
7333 ctx->param_merged_wave_info, 8);
7334 si_llvm_emit_barrier(NULL, bld_base, NULL);
7335 }
7336 }
7337
7338 if (ctx->type == PIPE_SHADER_GEOMETRY) {
7339 int i;
7340 for (i = 0; i < 4; i++) {
7341 ctx->gs_next_vertex[i] =
7342 lp_build_alloca(&ctx->gallivm,
7343 ctx->i32, "");
7344 }
7345 }
7346
7347 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
7348 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
7349 return false;
7350 }
7351
7352 si_llvm_build_ret(ctx, ctx->return_value);
7353 return true;
7354 }
7355
7356 /**
7357 * Compute the VS prolog key, which contains all the information needed to
7358 * build the VS prolog function, and set shader->info bits where needed.
7359 *
7360 * \param info Shader info of the vertex shader.
7361 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
7362 * \param prolog_key Key of the VS prolog.
7363 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
7364 * \param key Output shader part key.
7365 */
7366 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7367 unsigned num_input_sgprs,
7368 const struct si_vs_prolog_bits *prolog_key,
7369 struct si_shader *shader_out,
7370 union si_shader_part_key *key)
7371 {
7372 memset(key, 0, sizeof(*key));
7373 key->vs_prolog.states = *prolog_key;
7374 key->vs_prolog.num_input_sgprs = num_input_sgprs;
7375 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7376 key->vs_prolog.as_ls = shader_out->key.as_ls;
7377
7378 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
7379 key->vs_prolog.as_ls = 1;
7380 key->vs_prolog.num_merged_next_stage_vgprs = 2;
7381 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
7382 key->vs_prolog.num_merged_next_stage_vgprs = 5;
7383 }
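/* (On GFX9, the merged second stage consumes the first 2 (HS)
 * or 5 (GS) VGPRs, which shifts the VS input VGPRs such as
 * VertexID and InstanceID accordingly.)
 */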
7384
7385 /* Set the instanceID flag. */
7386 for (unsigned i = 0; i < info->num_inputs; i++)
7387 if (key->vs_prolog.states.instance_divisors[i])
7388 shader_out->info.uses_instanceid = true;
7389 }
7390
7391 /**
7392 * Compute the VS epilog key, which contains all the information needed to
7393 * build the VS epilog function, and set the PrimitiveID output offset.
7394 */
7395 static void si_get_vs_epilog_key(struct si_shader *shader,
7396 struct si_vs_epilog_bits *states,
7397 union si_shader_part_key *key)
7398 {
7399 memset(key, 0, sizeof(*key));
7400 key->vs_epilog.states = *states;
7401
7402 /* Set up the PrimitiveID output. */
7403 if (shader->key.part.vs.epilog.export_prim_id) {
7404 unsigned index = shader->selector->info.num_outputs;
7405 unsigned offset = shader->info.nr_param_exports++;
7406
7407 key->vs_epilog.prim_id_param_offset = offset;
7408 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7409 shader->info.vs_output_param_offset[index] = offset;
7410 }
7411 }
7412
7413 /**
7414 * Compute the PS prolog key, which contains all the information needed to
7415 * build the PS prolog function, and set related bits in shader->config.
7416 */
7417 static void si_get_ps_prolog_key(struct si_shader *shader,
7418 union si_shader_part_key *key,
7419 bool separate_prolog)
7420 {
7421 struct tgsi_shader_info *info = &shader->selector->info;
7422
7423 memset(key, 0, sizeof(*key));
7424 key->ps_prolog.states = shader->key.part.ps.prolog;
7425 key->ps_prolog.colors_read = info->colors_read;
7426 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7427 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7428 key->ps_prolog.wqm = info->uses_derivatives &&
7429 (key->ps_prolog.colors_read ||
7430 key->ps_prolog.states.force_persp_sample_interp ||
7431 key->ps_prolog.states.force_linear_sample_interp ||
7432 key->ps_prolog.states.force_persp_center_interp ||
7433 key->ps_prolog.states.force_linear_center_interp ||
7434 key->ps_prolog.states.bc_optimize_for_persp ||
7435 key->ps_prolog.states.bc_optimize_for_linear);
7436
7437 if (info->colors_read) {
7438 unsigned *color = shader->selector->color_attr_index;
7439
7440 if (shader->key.part.ps.prolog.color_two_side) {
7441 /* BCOLORs are stored after the last input. */
7442 key->ps_prolog.num_interp_inputs = info->num_inputs;
7443 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7444 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7445 }
7446
7447 for (unsigned i = 0; i < 2; i++) {
7448 unsigned interp = info->input_interpolate[color[i]];
7449 unsigned location = info->input_interpolate_loc[color[i]];
7450
7451 if (!(info->colors_read & (0xf << i*4)))
7452 continue;
7453
7454 key->ps_prolog.color_attr_index[i] = color[i];
7455
7456 if (shader->key.part.ps.prolog.flatshade_colors &&
7457 interp == TGSI_INTERPOLATE_COLOR)
7458 interp = TGSI_INTERPOLATE_CONSTANT;
7459
7460 switch (interp) {
7461 case TGSI_INTERPOLATE_CONSTANT:
7462 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7463 break;
7464 case TGSI_INTERPOLATE_PERSPECTIVE:
7465 case TGSI_INTERPOLATE_COLOR:
7466 /* Force the interpolation location for colors here. */
7467 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7468 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7469 if (shader->key.part.ps.prolog.force_persp_center_interp)
7470 location = TGSI_INTERPOLATE_LOC_CENTER;
7471
7472 switch (location) {
7473 case TGSI_INTERPOLATE_LOC_SAMPLE:
7474 key->ps_prolog.color_interp_vgpr_index[i] = 0;
7475 shader->config.spi_ps_input_ena |=
7476 S_0286CC_PERSP_SAMPLE_ENA(1);
7477 break;
7478 case TGSI_INTERPOLATE_LOC_CENTER:
7479 key->ps_prolog.color_interp_vgpr_index[i] = 2;
7480 shader->config.spi_ps_input_ena |=
7481 S_0286CC_PERSP_CENTER_ENA(1);
7482 break;
7483 case TGSI_INTERPOLATE_LOC_CENTROID:
7484 key->ps_prolog.color_interp_vgpr_index[i] = 4;
7485 shader->config.spi_ps_input_ena |=
7486 S_0286CC_PERSP_CENTROID_ENA(1);
7487 break;
7488 default:
7489 assert(0);
7490 }
7491 break;
7492 case TGSI_INTERPOLATE_LINEAR:
7493 /* Force the interpolation location for colors here. */
7494 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7495 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7496 if (shader->key.part.ps.prolog.force_linear_center_interp)
7497 location = TGSI_INTERPOLATE_LOC_CENTER;
7498
7499 /* The VGPR assignment for non-monolithic shaders
7500 * works because InitialPSInputAddr is set on the
7501 * main shader and PERSP_PULL_MODEL is never used.
7502 */
7503 switch (location) {
7504 case TGSI_INTERPOLATE_LOC_SAMPLE:
7505 key->ps_prolog.color_interp_vgpr_index[i] =
7506 separate_prolog ? 6 : 9;
7507 shader->config.spi_ps_input_ena |=
7508 S_0286CC_LINEAR_SAMPLE_ENA(1);
7509 break;
7510 case TGSI_INTERPOLATE_LOC_CENTER:
7511 key->ps_prolog.color_interp_vgpr_index[i] =
7512 separate_prolog ? 8 : 11;
7513 shader->config.spi_ps_input_ena |=
7514 S_0286CC_LINEAR_CENTER_ENA(1);
7515 break;
7516 case TGSI_INTERPOLATE_LOC_CENTROID:
7517 key->ps_prolog.color_interp_vgpr_index[i] =
7518 separate_prolog ? 10 : 13;
7519 shader->config.spi_ps_input_ena |=
7520 S_0286CC_LINEAR_CENTROID_ENA(1);
7521 break;
7522 default:
7523 assert(0);
7524 }
7525 break;
7526 default:
7527 assert(0);
7528 }
7529 }
7530 }
7531 }
7532
7533 /**
7534 * Check whether a PS prolog is required based on the key.
7535 */
7536 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7537 {
7538 return key->ps_prolog.colors_read ||
7539 key->ps_prolog.states.force_persp_sample_interp ||
7540 key->ps_prolog.states.force_linear_sample_interp ||
7541 key->ps_prolog.states.force_persp_center_interp ||
7542 key->ps_prolog.states.force_linear_center_interp ||
7543 key->ps_prolog.states.bc_optimize_for_persp ||
7544 key->ps_prolog.states.bc_optimize_for_linear ||
7545 key->ps_prolog.states.poly_stipple;
7546 }
7547
7548 /**
7549 * Compute the PS epilog key, which contains all the information needed to
7550 * build the PS epilog function.
7551 */
7552 static void si_get_ps_epilog_key(struct si_shader *shader,
7553 union si_shader_part_key *key)
7554 {
7555 struct tgsi_shader_info *info = &shader->selector->info;
7556 memset(key, 0, sizeof(*key));
7557 key->ps_epilog.colors_written = info->colors_written;
7558 key->ps_epilog.writes_z = info->writes_z;
7559 key->ps_epilog.writes_stencil = info->writes_stencil;
7560 key->ps_epilog.writes_samplemask = info->writes_samplemask;
7561 key->ps_epilog.states = shader->key.part.ps.epilog;
7562 }
7563
7564 /**
7565 * Build the GS prolog function. Rotate the input vertices for triangle strips
7566 * with adjacency.
7567 */
7568 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7569 union si_shader_part_key *key)
7570 {
7571 unsigned num_sgprs, num_vgprs;
7572 struct gallivm_state *gallivm = &ctx->gallivm;
7573 LLVMBuilderRef builder = gallivm->builder;
7574 LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
7575 LLVMTypeRef returns[48];
7576 LLVMValueRef func, ret;
7577
7578 if (ctx->screen->b.chip_class >= GFX9) {
7579 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
7580 num_vgprs = 5; /* ES inputs are not needed by GS */
7581 } else {
7582 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
7583 num_vgprs = 8;
7584 }
7585
7586 for (unsigned i = 0; i < num_sgprs; ++i) {
7587 params[i] = ctx->i32;
7588 returns[i] = ctx->i32;
7589 }
7590
7591 for (unsigned i = 0; i < num_vgprs; ++i) {
7592 params[num_sgprs + i] = ctx->i32;
7593 returns[num_sgprs + i] = ctx->f32;
7594 }
7595
7596 /* Create the function. */
7597 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7598 params, num_sgprs + num_vgprs, num_sgprs - 1);
7599 func = ctx->main_fn;
7600
7601 /* Set the full EXEC mask for the prolog, because we are only fiddling
7602 * with registers here. The main shader part will set the correct EXEC
7603 * mask.
7604 */
7605 if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
7606 si_init_exec_full_mask(ctx);
7607
7608 /* Copy inputs to outputs. This should be a no-op, as the registers match,
7609 * but it will prevent the compiler from overwriting them unintentionally.
7610 */
7611 ret = ctx->return_value;
7612 for (unsigned i = 0; i < num_sgprs; i++) {
7613 LLVMValueRef p = LLVMGetParam(func, i);
7614 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7615 }
7616 for (unsigned i = 0; i < num_vgprs; i++) {
7617 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7618 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7619 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7620 }
7621
7622 if (key->gs_prolog.states.tri_strip_adj_fix) {
7623 /* Remap the input vertices for every other primitive. */
7624 const unsigned gfx6_vtx_params[6] = {
7625 num_sgprs,
7626 num_sgprs + 1,
7627 num_sgprs + 3,
7628 num_sgprs + 4,
7629 num_sgprs + 5,
7630 num_sgprs + 6
7631 };
7632 const unsigned gfx9_vtx_params[3] = {
7633 num_sgprs,
7634 num_sgprs + 1,
7635 num_sgprs + 4,
7636 };
7637 LLVMValueRef vtx_in[6], vtx_out[6];
7638 LLVMValueRef prim_id, rotate;
7639
7640 if (ctx->screen->b.chip_class >= GFX9) {
7641 for (unsigned i = 0; i < 3; i++) {
7642 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
7643 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
7644 }
7645 } else {
7646 for (unsigned i = 0; i < 6; i++)
7647 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
7648 }
7649
7650 prim_id = LLVMGetParam(func, num_sgprs + 2);
7651 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7652
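/* Rotate by 4: for odd primitives (prim_id bit 0 set), the six
 * vertex slots take inputs (4,5,0,1,2,3) instead of
 * (0,1,2,3,4,5).
 */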
7653 for (unsigned i = 0; i < 6; ++i) {
7654 LLVMValueRef base, rotated;
7655 base = vtx_in[i];
7656 rotated = vtx_in[(i + 4) % 6];
7657 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
7658 }
7659
7660 if (ctx->screen->b.chip_class >= GFX9) {
7661 for (unsigned i = 0; i < 3; i++) {
7662 LLVMValueRef hi, out;
7663
7664 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
7665 LLVMConstInt(ctx->i32, 16, 0), "");
7666 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
7667 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
7668 ret = LLVMBuildInsertValue(builder, ret, out,
7669 gfx9_vtx_params[i], "");
7670 }
7671 } else {
7672 for (unsigned i = 0; i < 6; i++) {
7673 LLVMValueRef out;
7674
7675 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
7676 ret = LLVMBuildInsertValue(builder, ret, out,
7677 gfx6_vtx_params[i], "");
7678 }
7679 }
7680 }
7681
7682 LLVMBuildRet(builder, ret);
7683 }
7684
7685 /**
7686 * Given a list of shader part functions, build a wrapper function that
7687 * runs them in sequence to form a monolithic shader.
7688 */
7689 static void si_build_wrapper_function(struct si_shader_context *ctx,
7690 LLVMValueRef *parts,
7691 unsigned num_parts,
7692 unsigned main_part,
7693 unsigned next_shader_first_part)
7694 {
7695 struct gallivm_state *gallivm = &ctx->gallivm;
7696 LLVMBuilderRef builder = ctx->gallivm.builder;
7697 /* PS epilog has one arg per color component */
7698 LLVMTypeRef param_types[48];
7699 LLVMValueRef initial[48], out[48];
7700 LLVMTypeRef function_type;
7701 unsigned num_params;
7702 unsigned num_out, initial_num_out;
7703 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7704 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
7705 unsigned num_sgprs, num_vgprs;
7706 unsigned last_sgpr_param;
7707 unsigned gprs;
7708 struct lp_build_if_state if_state;
7709
7710 for (unsigned i = 0; i < num_parts; ++i) {
7711 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7712 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7713 }
7714
7715 /* The parameters of the wrapper function correspond to those of the
7716 * first part in terms of SGPRs and VGPRs, but we use the types of the
7717 * main part to get the right types. This is relevant for the
7718 * dereferenceable attribute on descriptor table pointers.
7719 */
7720 num_sgprs = 0;
7721 num_vgprs = 0;
7722
7723 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7724 num_params = LLVMCountParamTypes(function_type);
7725
7726 for (unsigned i = 0; i < num_params; ++i) {
7727 LLVMValueRef param = LLVMGetParam(parts[0], i);
7728
7729 if (ac_is_sgpr_param(param)) {
7730 assert(num_vgprs == 0);
7731 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7732 } else {
7733 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7734 }
7735 }
7736 assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7737
7738 num_params = 0;
7739 last_sgpr_param = 0;
7740 gprs = 0;
7741 while (gprs < num_sgprs + num_vgprs) {
7742 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7743 unsigned size;
7744
7745 param_types[num_params] = LLVMTypeOf(param);
7746 if (gprs < num_sgprs)
7747 last_sgpr_param = num_params;
7748 size = llvm_get_type_size(param_types[num_params]) / 4;
7749 num_params++;
7750
7751 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7752 assert(gprs + size <= num_sgprs + num_vgprs &&
7753 (gprs >= num_sgprs || gprs + size <= num_sgprs));
7754
7755 gprs += size;
7756 }
7757
7758 si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
7759
7760 if (is_merged_shader(ctx->shader))
7761 si_init_exec_full_mask(ctx);
7762
7763 /* Record the arguments of the function as if they were an output of
7764 * a previous part.
7765 */
7766 num_out = 0;
7767 num_out_sgpr = 0;
7768
7769 for (unsigned i = 0; i < num_params; ++i) {
7770 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7771 LLVMTypeRef param_type = LLVMTypeOf(param);
7772 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7773 unsigned size = llvm_get_type_size(param_type) / 4;
7774
7775 if (size == 1) {
7776 if (param_type != out_type)
7777 param = LLVMBuildBitCast(builder, param, out_type, "");
7778 out[num_out++] = param;
7779 } else {
7780 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7781
7782 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7783 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7784 param_type = ctx->i64;
7785 }
7786
7787 if (param_type != vector_type)
7788 param = LLVMBuildBitCast(builder, param, vector_type, "");
7789
7790 for (unsigned j = 0; j < size; ++j)
7791 out[num_out++] = LLVMBuildExtractElement(
7792 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7793 }
7794
7795 if (i <= last_sgpr_param)
7796 num_out_sgpr = num_out;
7797 }
7798
7799 memcpy(initial, out, sizeof(out));
7800 initial_num_out = num_out;
7801 initial_num_out_sgpr = num_out_sgpr;
7802
7803 /* Now chain the parts. */
7804 for (unsigned part = 0; part < num_parts; ++part) {
7805 LLVMValueRef in[48];
7806 LLVMValueRef ret;
7807 LLVMTypeRef ret_type;
7808 unsigned out_idx = 0;
7809
7810 num_params = LLVMCountParams(parts[part]);
7811 assert(num_params <= ARRAY_SIZE(param_types));
7812
7813 /* Merged shaders are executed conditionally depending
7814 * on the number of enabled threads passed in the input SGPRs. */
7815 if (is_merged_shader(ctx->shader) &&
7816 (part == 0 || part == next_shader_first_part)) {
7817 LLVMValueRef ena, count = initial[3];
7818
7819 /* The thread count for the 2nd shader is at bit-offset 8. */
7820 if (part == next_shader_first_part) {
7821 count = LLVMBuildLShr(builder, count,
7822 LLVMConstInt(ctx->i32, 8, 0), "");
7823 }
7824 count = LLVMBuildAnd(builder, count,
7825 LLVMConstInt(ctx->i32, 0x7f, 0), "");
7826 ena = LLVMBuildICmp(builder, LLVMIntULT,
7827 ac_get_thread_id(&ctx->ac), count, "");
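/* Only lanes with tid < count execute this part; initial[3] is
 * the merged wave info SGPR with two 7-bit count fields.
 */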
7828 lp_build_if(&if_state, &ctx->gallivm, ena);
7829 }
7830
7831 /* Derive arguments for the next part from outputs of the
7832 * previous one.
7833 */
7834 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7835 LLVMValueRef param;
7836 LLVMTypeRef param_type;
7837 bool is_sgpr;
7838 unsigned param_size;
7839 LLVMValueRef arg = NULL;
7840
7841 param = LLVMGetParam(parts[part], param_idx);
7842 param_type = LLVMTypeOf(param);
7843 param_size = llvm_get_type_size(param_type) / 4;
7844 is_sgpr = ac_is_sgpr_param(param);
7845
7846 if (is_sgpr) {
7847 #if HAVE_LLVM < 0x0400
7848 LLVMRemoveAttribute(param, LLVMByValAttribute);
7849 #else
7850 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7851 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7852 #endif
7853 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7854 }
7855
7856 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7857 assert(is_sgpr || out_idx >= num_out_sgpr);
7858
7859 if (param_size == 1)
7860 arg = out[out_idx];
7861 else
7862 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7863
7864 if (LLVMTypeOf(arg) != param_type) {
7865 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7866 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7867 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7868 } else {
7869 arg = LLVMBuildBitCast(builder, arg, param_type, "");
7870 }
7871 }
7872
7873 in[param_idx] = arg;
7874 out_idx += param_size;
7875 }
7876
7877 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7878
7879 if (is_merged_shader(ctx->shader) &&
7880 (part + 1 == next_shader_first_part ||
7881 part + 1 == num_parts)) {
7882 lp_build_endif(&if_state);
7883
7884 if (part + 1 == next_shader_first_part) {
7885 /* A barrier is required between 2 merged shaders. */
7886 si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
7887
7888 /* The second half of the merged shader should use
7889 * the inputs from the top-level (wrapper) function,
7890 * not the return value from the last call.
7891 *
7892 * That's because the last call was executed
7893 * conditionally, so we can't consume it in the
7894 * main block.
7895 */
7896 memcpy(out, initial, sizeof(initial));
7897 num_out = initial_num_out;
7898 num_out_sgpr = initial_num_out_sgpr;
7899 }
7900 continue;
7901 }
7902
7903 /* Extract the returned GPRs. */
7904 ret_type = LLVMTypeOf(ret);
7905 num_out = 0;
7906 num_out_sgpr = 0;
7907
7908 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7909 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7910
7911 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7912
7913 for (unsigned i = 0; i < ret_size; ++i) {
7914 LLVMValueRef val =
7915 LLVMBuildExtractValue(builder, ret, i, "");
7916
7917 out[num_out++] = val;
7918
7919 if (LLVMTypeOf(val) == ctx->i32) {
7920 assert(num_out_sgpr + 1 == num_out);
7921 num_out_sgpr = num_out;
7922 }
7923 }
7924 }
7925 }
7926
7927 LLVMBuildRetVoid(builder);
7928 }
7929
7930 int si_compile_tgsi_shader(struct si_screen *sscreen,
7931 LLVMTargetMachineRef tm,
7932 struct si_shader *shader,
7933 bool is_monolithic,
7934 struct pipe_debug_callback *debug)
7935 {
7936 struct si_shader_selector *sel = shader->selector;
7937 struct si_shader_context ctx;
7938 int r = -1;
7939
7940 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7941 * conversion fails. */
7942 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7943 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7944 tgsi_dump(sel->tokens, 0);
7945 si_dump_streamout(&sel->so);
7946 }
7947
7948 si_init_shader_ctx(&ctx, sscreen, tm);
7949 si_llvm_context_set_tgsi(&ctx, shader);
7950 ctx.separate_prolog = !is_monolithic;
7951
7952 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7953 sizeof(shader->info.vs_output_param_offset));
7954
7955 shader->info.uses_instanceid = sel->info.uses_instanceid;
7956
7957 ctx.load_system_value = declare_system_value;
7958
7959 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
7960 si_llvm_dispose(&ctx);
7961 return -1;
7962 }
7963
7964 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7965 LLVMValueRef parts[3];
7966 bool need_prolog;
7967 bool need_epilog;
7968
7969 need_prolog = sel->vs_needs_prolog;
7970 need_epilog = !shader->key.as_es && !shader->key.as_ls;
7971
7972 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7973
7974 if (need_prolog) {
7975 union si_shader_part_key prolog_key;
7976 si_get_vs_prolog_key(&sel->info,
7977 shader->info.num_input_sgprs,
7978 &shader->key.part.vs.prolog,
7979 shader, &prolog_key);
7980 si_build_vs_prolog_function(&ctx, &prolog_key);
7981 parts[0] = ctx.main_fn;
7982 }
7983
7984 if (need_epilog) {
7985 union si_shader_part_key epilog_key;
7986 si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
7987 si_build_vs_epilog_function(&ctx, &epilog_key);
7988 parts[need_prolog ? 2 : 1] = ctx.main_fn;
7989 }
7990
7991 si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
7992 need_prolog ? 1 : 0, 0);
7993 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7994 if (sscreen->b.chip_class >= GFX9) {
7995 struct si_shader_selector *ls = shader->key.part.tcs.ls;
7996 LLVMValueRef parts[4];
7997
7998 /* TCS main part */
7999 parts[2] = ctx.main_fn;
8000
8001 /* TCS epilog */
8002 union si_shader_part_key tcs_epilog_key;
8003 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
8004 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8005 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
8006 parts[3] = ctx.main_fn;
8007
8008 /* VS prolog */
8009 if (ls->vs_needs_prolog) {
8010 union si_shader_part_key vs_prolog_key;
8011 si_get_vs_prolog_key(&ls->info,
8012 shader->info.num_input_sgprs,
8013 &shader->key.part.tcs.ls_prolog,
8014 shader, &vs_prolog_key);
8015 vs_prolog_key.vs_prolog.is_monolithic = true;
8016 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8017 parts[0] = ctx.main_fn;
8018 }
8019
8020 /* VS as LS main part */
8021 struct si_shader shader_ls = {};
8022 shader_ls.selector = ls;
8023 shader_ls.key.as_ls = 1;
8024 shader_ls.key.mono = shader->key.mono;
8025 shader_ls.key.opt = shader->key.opt;
8026 si_llvm_context_set_tgsi(&ctx, &shader_ls);
8027
8028 if (!si_compile_tgsi_main(&ctx, true)) {
8029 si_llvm_dispose(&ctx);
8030 return -1;
8031 }
8032 shader->info.uses_instanceid |= ls->info.uses_instanceid;
8033 parts[1] = ctx.main_fn;
8034
8035 /* Reset the shader context. */
8036 ctx.shader = shader;
8037 ctx.type = PIPE_SHADER_TESS_CTRL;
8038
8039 si_build_wrapper_function(&ctx,
8040 parts + !ls->vs_needs_prolog,
8041 4 - !ls->vs_needs_prolog, 0,
8042 ls->vs_needs_prolog ? 2 : 1);
8043 } else {
8044 LLVMValueRef parts[2];
8045 union si_shader_part_key epilog_key;
8046
8047 parts[0] = ctx.main_fn;
8048
8049 memset(&epilog_key, 0, sizeof(epilog_key));
8050 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8051 si_build_tcs_epilog_function(&ctx, &epilog_key);
8052 parts[1] = ctx.main_fn;
8053
8054 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
8055 }
8056 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
8057 !shader->key.as_es) {
8058 LLVMValueRef parts[2];
8059 union si_shader_part_key epilog_key;
8060
8061 parts[0] = ctx.main_fn;
8062
8063 si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
8064 si_build_vs_epilog_function(&ctx, &epilog_key);
8065 parts[1] = ctx.main_fn;
8066
8067 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
8068 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
8069 if (ctx.screen->b.chip_class >= GFX9) {
8070 struct si_shader_selector *es = shader->key.part.gs.es;
8071 LLVMValueRef es_prolog = NULL;
8072 LLVMValueRef es_main = NULL;
8073 LLVMValueRef gs_prolog = NULL;
8074 LLVMValueRef gs_main = ctx.main_fn;
8075
8076 /* GS prolog */
8077 union si_shader_part_key gs_prolog_key;
8078 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
8079 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8080 gs_prolog_key.gs_prolog.is_monolithic = true;
8081 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
8082 gs_prolog = ctx.main_fn;
8083
8084 /* ES prolog */
8085 if (es->vs_needs_prolog) {
8086 union si_shader_part_key vs_prolog_key;
8087 si_get_vs_prolog_key(&es->info,
8088 shader->info.num_input_sgprs,
8089 &shader->key.part.gs.vs_prolog,
8090 shader, &vs_prolog_key);
8091 vs_prolog_key.vs_prolog.is_monolithic = true;
8092 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8093 es_prolog = ctx.main_fn;
8094 }
8095
8096 /* ES main part */
8097 struct si_shader shader_es = {};
8098 shader_es.selector = es;
8099 shader_es.key.as_es = 1;
8100 shader_es.key.mono = shader->key.mono;
8101 shader_es.key.opt = shader->key.opt;
8102 si_llvm_context_set_tgsi(&ctx, &shader_es);
8103
8104 if (!si_compile_tgsi_main(&ctx, true)) {
8105 si_llvm_dispose(&ctx);
8106 return -1;
8107 }
8108 shader->info.uses_instanceid |= es->info.uses_instanceid;
8109 es_main = ctx.main_fn;
8110
8111 /* Reset the shader context. */
8112 ctx.shader = shader;
8113 ctx.type = PIPE_SHADER_GEOMETRY;
8114
8115 /* Prepare the array of shader parts. */
8116 LLVMValueRef parts[4];
8117 unsigned num_parts = 0, main_part, next_first_part;
8118
8119 if (es_prolog)
8120 parts[num_parts++] = es_prolog;
8121
8122 parts[main_part = num_parts++] = es_main;
8123 parts[next_first_part = num_parts++] = gs_prolog;
8124 parts[num_parts++] = gs_main;
8125
8126 si_build_wrapper_function(&ctx, parts, num_parts,
8127 main_part, next_first_part);
8128 } else {
8129 LLVMValueRef parts[2];
8130 union si_shader_part_key prolog_key;
8131
8132 parts[1] = ctx.main_fn;
8133
8134 memset(&prolog_key, 0, sizeof(prolog_key));
8135 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8136 si_build_gs_prolog_function(&ctx, &prolog_key);
8137 parts[0] = ctx.main_fn;
8138
8139 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
8140 }
8141 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
8142 LLVMValueRef parts[3];
8143 union si_shader_part_key prolog_key;
8144 union si_shader_part_key epilog_key;
8145 bool need_prolog;
8146
8147 si_get_ps_prolog_key(shader, &prolog_key, false);
8148 need_prolog = si_need_ps_prolog(&prolog_key);
8149
8150 parts[need_prolog ? 1 : 0] = ctx.main_fn;
8151
8152 if (need_prolog) {
8153 si_build_ps_prolog_function(&ctx, &prolog_key);
8154 parts[0] = ctx.main_fn;
8155 }
8156
8157 si_get_ps_epilog_key(shader, &epilog_key);
8158 si_build_ps_epilog_function(&ctx, &epilog_key);
8159 parts[need_prolog ? 2 : 1] = ctx.main_fn;
8160
8161 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
8162 need_prolog ? 1 : 0, 0);
8163 }
8164
8165 /* Dump LLVM IR before any optimization passes */
8166 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
8167 r600_can_dump_shader(&sscreen->b, ctx.type))
8168 LLVMDumpModule(ctx.gallivm.module);
8169
8170 si_llvm_finalize_module(&ctx,
8171 r600_extra_shader_checks(&sscreen->b, ctx.type));
8172
8173 /* Post-optimization transformations and analysis. */
8174 si_eliminate_const_vs_outputs(&ctx);
8175
8176 if ((debug && debug->debug_message) ||
8177 r600_can_dump_shader(&sscreen->b, ctx.type))
8178 si_count_scratch_private_memory(&ctx);
8179
8180 /* Compile to bytecode. */
8181 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
8182 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
8183 si_llvm_dispose(&ctx);
8184 if (r) {
8185 fprintf(stderr, "LLVM failed to compile shader\n");
8186 return r;
8187 }
8188
8189 /* Validate SGPR and VGPR usage for compute shaders to detect compiler
8190 * bugs. LLVM 3.9svn is known to exceed these limits.
8191 */
8192 if (sel->type == PIPE_SHADER_COMPUTE) {
8193 unsigned wave_size = 64;
8194 unsigned max_vgprs = 256;
8195 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
8196 unsigned max_sgprs_per_wave = 128;
8197 unsigned max_block_threads = si_get_max_workgroup_size(shader);
8198 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
8199 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
8200
8201 max_vgprs = max_vgprs / min_waves_per_simd;
8202 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
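/* E.g. a 1024-thread block needs 16 waves, i.e. 4 per SIMD,
 * leaving 256/4 = 64 VGPRs and MIN2(800/4, 128) = 128 SGPRs
 * per wave on VI (illustrative numbers).
 */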
8203
8204 if (shader->config.num_sgprs > max_sgprs ||
8205 shader->config.num_vgprs > max_vgprs) {
8206 fprintf(stderr, "LLVM failed to compile a shader correctly: "
8207 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
8208 shader->config.num_sgprs, shader->config.num_vgprs,
8209 max_sgprs, max_vgprs);
8210
8211 /* Just terminate the process, because dependent
8212 * shaders can hang due to bad input data, but use
8213 * the env var to allow shader-db to work.
8214 */
8215 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
8216 abort();
8217 }
8218 }
8219
8220 /* Add the scratch offset to input SGPRs. */
8221 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
8222 shader->info.num_input_sgprs += 1; /* scratch byte offset */
8223
8224 /* Calculate the number of fragment input VGPRs. */
8225 if (ctx.type == PIPE_SHADER_FRAGMENT) {
8226 shader->info.num_input_vgprs = 0;
8227 shader->info.face_vgpr_index = -1;
8228
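/* VGPRs are counted in the fixed hardware order of the
 * SPI_PS_INPUT_ADDR bits: each barycentric pair takes 2,
 * PULL_MODEL takes 3, and the scalar inputs take 1 each.
 */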
8229 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8230 shader->info.num_input_vgprs += 2;
8231 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
8232 shader->info.num_input_vgprs += 2;
8233 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
8234 shader->info.num_input_vgprs += 2;
8235 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
8236 shader->info.num_input_vgprs += 3;
8237 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8238 shader->info.num_input_vgprs += 2;
8239 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
8240 shader->info.num_input_vgprs += 2;
8241 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
8242 shader->info.num_input_vgprs += 2;
8243 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
8244 shader->info.num_input_vgprs += 1;
8245 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
8246 shader->info.num_input_vgprs += 1;
8247 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
8248 shader->info.num_input_vgprs += 1;
8249 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
8250 shader->info.num_input_vgprs += 1;
8251 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
8252 shader->info.num_input_vgprs += 1;
8253 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
8254 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
8255 shader->info.num_input_vgprs += 1;
8256 }
8257 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
8258 shader->info.num_input_vgprs += 1;
8259 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
8260 shader->info.num_input_vgprs += 1;
8261 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
8262 shader->info.num_input_vgprs += 1;
8263 }
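/* Worked example (illustrative): if spi_ps_input_addr enables only
 * PERSP_CENTER, LINEAR_CENTER and FRONT_FACE, the counting above yields
 * num_input_vgprs = 2 + 2 + 1 = 5 and face_vgpr_index = 4, i.e. the face
 * VGPR follows the two interpolation (i,j) pairs.
 */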
8264
8265 return 0;
8266 }
8267
8268 /**
8269 * Create, compile and return a shader part (prolog or epilog).
8270 *
8271 * \param sscreen screen
8272 * \param list list of shader parts of the same category
8273 * \param type shader type
8274 * \param prolog whether the part being requested is a prolog
8275 * \param key shader part key
8276 * \param tm LLVM target machine
8277 * \param debug debug callback
8278 * \param build the callback responsible for building the main function
8279 * \return non-NULL on success
8280 */
8281 static struct si_shader_part *
8282 si_get_shader_part(struct si_screen *sscreen,
8283 struct si_shader_part **list,
8284 enum pipe_shader_type type,
8285 bool prolog,
8286 union si_shader_part_key *key,
8287 LLVMTargetMachineRef tm,
8288 struct pipe_debug_callback *debug,
8289 void (*build)(struct si_shader_context *,
8290 union si_shader_part_key *),
8291 const char *name)
8292 {
8293 struct si_shader_part *result;
8294
8295 mtx_lock(&sscreen->shader_parts_mutex);
8296
8297 /* Find existing. */
8298 for (result = *list; result; result = result->next) {
8299 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
8300 mtx_unlock(&sscreen->shader_parts_mutex);
8301 return result;
8302 }
8303 }
8304
8305 /* Compile a new one. */
8306 result = CALLOC_STRUCT(si_shader_part);
8307 result->key = *key;
8308
8309 struct si_shader shader = {};
8310 struct si_shader_context ctx;
8311 struct gallivm_state *gallivm = &ctx.gallivm;
8312
8313 si_init_shader_ctx(&ctx, sscreen, tm);
8314 ctx.shader = &shader;
8315 ctx.type = type;
8316
8317 switch (type) {
8318 case PIPE_SHADER_VERTEX:
8319 break;
8320 case PIPE_SHADER_TESS_CTRL:
8321 assert(!prolog);
8322 shader.key.part.tcs.epilog = key->tcs_epilog.states;
8323 break;
8324 case PIPE_SHADER_GEOMETRY:
8325 assert(prolog);
8326 break;
8327 case PIPE_SHADER_FRAGMENT:
8328 if (prolog)
8329 shader.key.part.ps.prolog = key->ps_prolog.states;
8330 else
8331 shader.key.part.ps.epilog = key->ps_epilog.states;
8332 break;
8333 default:
8334 unreachable("bad shader part");
8335 }
8336
8337 build(&ctx, key);
8338
8339 /* Compile. */
8340 si_llvm_finalize_module(&ctx,
8341 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
8342
8343 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
8344 gallivm->module, debug, ctx.type, name)) {
8345 FREE(result);
8346 result = NULL;
8347 goto out;
8348 }
8349
8350 result->next = *list;
8351 *list = result;
8352
8353 out:
8354 si_llvm_dispose(&ctx);
8355 mtx_unlock(&sscreen->shader_parts_mutex);
8356 return result;
8357 }
8358
8359 /**
8360 * Build the vertex shader prolog function.
8361 *
8362 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
8363 * All inputs are returned unmodified. The vertex load indices, which the
8364 * API VS uses for fetching inputs, are stored after them.
8365 *
8366 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
8367 * input_v0,
8368 * input_v1,
8369 * input_v2,
8370 * input_v3,
8371 * (VertexID + BaseVertex),
8372 * (InstanceID + StartInstance),
8373 * (InstanceID / 2 + StartInstance)
8374 */
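/* A concrete instance of the example above (values are hypothetical):
 * with instance_divisors[] = {0, 1, 2}, VertexID = 7, BaseVertex = 100,
 * InstanceID = 5 and StartInstance = 10, the appended load indices are
 * 107 (7 + 100), 15 (5 + 10) and 12 (5 / 2 + 10).
 */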
8375 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
8376 union si_shader_part_key *key)
8377 {
8378 struct gallivm_state *gallivm = &ctx->gallivm;
8379 LLVMTypeRef *params, *returns;
8380 LLVMValueRef ret, func;
8381 int last_sgpr, num_params, num_returns, i;
8382 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
8383 key->vs_prolog.num_merged_next_stage_vgprs;
8384 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
8385 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
8386 num_input_vgprs;
8387 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
8388
8389 ctx->param_vertex_id = first_vs_vgpr;
8390 ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
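/* Illustrative layout: for a plain VS (no merged next-stage VGPRs,
 * as_ls = 0) this puts VertexID in VGPR0 and InstanceID in VGPR1;
 * for an LS (as_ls = 1), InstanceID comes from VGPR2 instead.
 */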
8391
8392 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
8393 params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
8394 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
8395 sizeof(LLVMTypeRef));
8396 num_params = 0;
8397 num_returns = 0;
8398
8399 /* Declare input and output SGPRs. */
8401 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8402 params[num_params++] = ctx->i32;
8403 returns[num_returns++] = ctx->i32;
8404 }
8405 last_sgpr = num_params - 1;
8406
8407 /* Preloaded VGPRs (outputs must be floats) */
8408 for (i = 0; i < num_input_vgprs; i++) {
8409 params[num_params++] = ctx->i32;
8410 returns[num_returns++] = ctx->f32;
8411 }
8412
8413 /* Vertex load indices. */
8414 for (i = 0; i <= key->vs_prolog.last_input; i++)
8415 returns[num_returns++] = ctx->f32;
8416
8417 /* Create the function. */
8418 si_create_function(ctx, "vs_prolog", returns, num_returns, params,
8419 num_params, last_sgpr);
8420 func = ctx->main_fn;
8421
8422 if (key->vs_prolog.num_merged_next_stage_vgprs &&
8423 !key->vs_prolog.is_monolithic)
8424 si_init_exec_from_input(ctx, 3, 0);
8425
8426 /* Copy inputs to outputs. This should be a no-op, as the registers match,
8427 * but it prevents the compiler from overwriting them unintentionally.
8428 */
8429 ret = ctx->return_value;
8430 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8431 LLVMValueRef p = LLVMGetParam(func, i);
8432 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8433 }
8434 for (; i < num_params; i++) {
8435 LLVMValueRef p = LLVMGetParam(func, i);
8436 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
8437 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8438 }
8439
8440 /* Compute vertex load indices from instance divisors. */
8441 for (i = 0; i <= key->vs_prolog.last_input; i++) {
8442 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
8443 LLVMValueRef index;
8444
8445 if (divisor) {
8446 /* InstanceID / Divisor + StartInstance */
8447 index = get_instance_index_for_fetch(ctx,
8448 user_sgpr_base +
8449 SI_SGPR_START_INSTANCE,
8450 divisor);
8451 } else {
8452 /* VertexID + BaseVertex */
8453 index = LLVMBuildAdd(gallivm->builder,
8454 LLVMGetParam(func, ctx->param_vertex_id),
8455 LLVMGetParam(func, user_sgpr_base +
8456 SI_SGPR_BASE_VERTEX), "");
8457 }
8458
8459 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
8460 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
8461 num_params++, "");
8462 }
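/* Note: num_params is reused here as the return-value index, so each
 * computed load index is appended immediately after the pass-through
 * inputs, matching the return types declared above.
 */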
8463
8464 si_llvm_build_ret(ctx, ret);
8465 }
8466
8467 /**
8468 * Build the vertex shader epilog function. This is also used by the tessellation
8469 * evaluation shader compiled as VS.
8470 *
8471 * The input is PrimitiveID.
8472 *
8473 * If PrimitiveID is required by the pixel shader, export it.
8474 * Otherwise, do nothing.
8475 */
8476 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
8477 union si_shader_part_key *key)
8478 {
8479 struct gallivm_state *gallivm = &ctx->gallivm;
8480 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8481 LLVMTypeRef params[5];
8482 int num_params, i;
8483
8484 /* Declare input VGPRs. */
8485 num_params = key->vs_epilog.states.export_prim_id ?
8486 (VS_EPILOG_PRIMID_LOC + 1) : 0;
8487 assert(num_params <= ARRAY_SIZE(params));
8488
8489 for (i = 0; i < num_params; i++)
8490 params[i] = ctx->f32;
8491
8492 /* Create the function. */
8493 si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
8494
8495 /* Emit exports. */
8496 if (key->vs_epilog.states.export_prim_id) {
8497 struct lp_build_context *base = &bld_base->base;
8498 struct ac_export_args args;
8499
8500 args.enabled_channels = 0x1; /* enabled channels */
8501 args.valid_mask = 0; /* whether the EXEC mask is valid */
8502 args.done = 0; /* DONE bit */
8503 args.target = V_008DFC_SQ_EXP_PARAM +
8504 key->vs_epilog.prim_id_param_offset;
8505 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
8506 args.out[0] = LLVMGetParam(ctx->main_fn,
8507 VS_EPILOG_PRIMID_LOC); /* X */
8508 args.out[1] = base->undef; /* Y */
8509 args.out[2] = base->undef; /* Z */
8510 args.out[3] = base->undef; /* W */
8511
8512 ac_build_export(&ctx->ac, &args);
8513 }
8514
8515 LLVMBuildRetVoid(gallivm->builder);
8516 }
8517
8518 static bool si_get_vs_prolog(struct si_screen *sscreen,
8519 LLVMTargetMachineRef tm,
8520 struct si_shader *shader,
8521 struct pipe_debug_callback *debug,
8522 struct si_shader *main_part,
8523 const struct si_vs_prolog_bits *key)
8524 {
8525 struct si_shader_selector *vs = main_part->selector;
8526
8527 /* The prolog is a no-op if there are no inputs. */
8528 if (!vs->vs_needs_prolog)
8529 return true;
8530
8531 /* Get the prolog. */
8532 union si_shader_part_key prolog_key;
8533 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
8534 key, shader, &prolog_key);
8535
8536 shader->prolog =
8537 si_get_shader_part(sscreen, &sscreen->vs_prologs,
8538 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
8539 debug, si_build_vs_prolog_function,
8540 "Vertex Shader Prolog");
8541 return shader->prolog != NULL;
8542 }
8543
8544 /**
8545 * Create & compile a vertex shader epilog. This is a helper used by VS and TES.
8546 */
8547 static bool si_get_vs_epilog(struct si_screen *sscreen,
8548 LLVMTargetMachineRef tm,
8549 struct si_shader *shader,
8550 struct pipe_debug_callback *debug,
8551 struct si_vs_epilog_bits *states)
8552 {
8553 union si_shader_part_key epilog_key;
8554
8555 si_get_vs_epilog_key(shader, states, &epilog_key);
8556
8557 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
8558 PIPE_SHADER_VERTEX, true,
8559 &epilog_key, tm, debug,
8560 si_build_vs_epilog_function,
8561 "Vertex Shader Epilog");
8562 return shader->epilog != NULL;
8563 }
8564
8565 /**
8566 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
8567 */
8568 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
8569 LLVMTargetMachineRef tm,
8570 struct si_shader *shader,
8571 struct pipe_debug_callback *debug)
8572 {
8573 if (!si_get_vs_prolog(sscreen, tm, shader, debug, shader,
8574 &shader->key.part.vs.prolog))
8575 return false;
8576
8577 /* Get the epilog. */
8578 if (!shader->key.as_es && !shader->key.as_ls &&
8579 !si_get_vs_epilog(sscreen, tm, shader, debug,
8580 &shader->key.part.vs.epilog))
8581 return false;
8582
8583 return true;
8584 }
8585
8586 /**
8587 * Select and compile (or reuse) TES parts (epilog).
8588 */
8589 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
8590 LLVMTargetMachineRef tm,
8591 struct si_shader *shader,
8592 struct pipe_debug_callback *debug)
8593 {
8594 if (shader->key.as_es)
8595 return true;
8596
8597 /* TES compiled as VS. */
8598 return si_get_vs_epilog(sscreen, tm, shader, debug,
8599 &shader->key.part.tes.epilog);
8600 }
8601
8602 /**
8603 * Compile the TCS epilog function. This writes tessellation factors to memory
8604 * based on the output primitive type of the tessellator (determined by TES).
8605 */
8606 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
8607 union si_shader_part_key *key)
8608 {
8609 struct gallivm_state *gallivm = &ctx->gallivm;
8610 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8611 LLVMTypeRef params[32];
8612 LLVMValueRef func;
8613 int last_sgpr, num_params = 0;
8614
8615 /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
8616 params[ctx->param_rw_buffers = num_params++] =
8617 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
8618
8619 if (ctx->screen->b.chip_class >= GFX9) {
8620 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8621 params[num_params++] = ctx->i32; /* wave info */
8622 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8623 params[num_params++] = ctx->i32;
8624 params[num_params++] = ctx->i32;
8625 params[num_params++] = ctx->i32;
8626 params[num_params++] = ctx->i64;
8627 params[num_params++] = ctx->i64;
8628 params[num_params++] = ctx->i64;
8629 params[num_params++] = ctx->i64;
8630 params[num_params++] = ctx->i64;
8631 params[num_params++] = ctx->i64;
8632 params[num_params++] = ctx->i32;
8633 params[num_params++] = ctx->i32;
8634 params[num_params++] = ctx->i32;
8635 params[num_params++] = ctx->i32;
8636 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8637 } else {
8638 params[num_params++] = ctx->i64;
8639 params[num_params++] = ctx->i64;
8640 params[num_params++] = ctx->i64;
8641 params[num_params++] = ctx->i64;
8642 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8643 params[num_params++] = ctx->i32;
8644 params[num_params++] = ctx->i32;
8645 params[num_params++] = ctx->i32;
8646 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8647 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8648 }
8649 last_sgpr = num_params - 1;
8650
8651 params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
8652 params[num_params++] = ctx->i32; /* invocation ID within the patch */
8653 params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
8654
8655 /* Create the function. */
8656 si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
8657 declare_lds_as_pointer(ctx);
8658 func = ctx->main_fn;
8659
8660 si_write_tess_factors(bld_base,
8661 LLVMGetParam(func, last_sgpr + 1),
8662 LLVMGetParam(func, last_sgpr + 2),
8663 LLVMGetParam(func, last_sgpr + 3));
8664
8665 LLVMBuildRetVoid(gallivm->builder);
8666 }
8667
8668 /**
8669 * Select and compile (or reuse) TCS parts (epilog).
8670 */
8671 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8672 LLVMTargetMachineRef tm,
8673 struct si_shader *shader,
8674 struct pipe_debug_callback *debug)
8675 {
8676 if (sscreen->b.chip_class >= GFX9) {
8677 struct si_shader *ls_main_part =
8678 shader->key.part.tcs.ls->main_shader_part_ls;
8679
8680 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8681 &shader->key.part.tcs.ls_prolog))
8682 return false;
8683
8684 shader->previous_stage = ls_main_part;
8685 }
8686
8687 /* Get the epilog. */
8688 union si_shader_part_key epilog_key;
8689 memset(&epilog_key, 0, sizeof(epilog_key));
8690 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8691
8692 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8693 PIPE_SHADER_TESS_CTRL, false,
8694 &epilog_key, tm, debug,
8695 si_build_tcs_epilog_function,
8696 "Tessellation Control Shader Epilog");
8697 return shader->epilog != NULL;
8698 }
8699
8700 /**
8701 * Select and compile (or reuse) GS parts (prolog).
8702 */
8703 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8704 LLVMTargetMachineRef tm,
8705 struct si_shader *shader,
8706 struct pipe_debug_callback *debug)
8707 {
8708 if (sscreen->b.chip_class >= GFX9) {
8709 struct si_shader *es_main_part =
8710 shader->key.part.gs.es->main_shader_part_es;
8711
8712 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
8713 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
8714 &shader->key.part.gs.vs_prolog))
8715 return false;
8716
8717 shader->previous_stage = es_main_part;
8718 }
8719
8720 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8721 return true;
8722
8723 union si_shader_part_key prolog_key;
8724 memset(&prolog_key, 0, sizeof(prolog_key));
8725 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8726
8727 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8728 PIPE_SHADER_GEOMETRY, true,
8729 &prolog_key, tm, debug,
8730 si_build_gs_prolog_function,
8731 "Geometry Shader Prolog");
8732 return shader->prolog2 != NULL;
8733 }
8734
8735 /**
8736 * Build the pixel shader prolog function. This handles:
8737 * - two-side color selection and interpolation
8738 * - overriding interpolation parameters for the API PS
8739 * - polygon stippling
8740 *
8741 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
8742 * overridden by other states (e.g. per-sample interpolation).
8743 * Interpolated colors are stored after the preloaded VGPRs.
8744 */
8745 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
8746 union si_shader_part_key *key)
8747 {
8748 struct gallivm_state *gallivm = &ctx->gallivm;
8749 LLVMTypeRef *params;
8750 LLVMValueRef ret, func;
8751 int last_sgpr, num_params, num_returns, i, num_color_channels;
8752
8753 assert(si_need_ps_prolog(key));
8754
8755 /* Number of inputs + 8 color elements. */
8756 params = alloca((key->ps_prolog.num_input_sgprs +
8757 key->ps_prolog.num_input_vgprs + 8) *
8758 sizeof(LLVMTypeRef));
8759
8760 /* Declare inputs. */
8761 num_params = 0;
8762 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8763 params[num_params++] = ctx->i32;
8764 last_sgpr = num_params - 1;
8765
8766 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8767 params[num_params++] = ctx->f32;
8768
8769 /* Declare outputs (same as inputs + add colors if needed) */
8770 num_returns = num_params;
8771 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8772 for (i = 0; i < num_color_channels; i++)
8773 params[num_returns++] = ctx->f32;
8774
8775 /* Create the function. */
8776 si_create_function(ctx, "ps_prolog", params, num_returns, params,
8777 num_params, last_sgpr);
8778 func = ctx->main_fn;
8779
8780 /* Copy inputs to outputs. This should be a no-op, as the registers match,
8781 * but it prevents the compiler from overwriting them unintentionally.
8782 */
8783 ret = ctx->return_value;
8784 for (i = 0; i < num_params; i++) {
8785 LLVMValueRef p = LLVMGetParam(func, i);
8786 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8787 }
8788
8789 /* Polygon stippling. */
8790 if (key->ps_prolog.states.poly_stipple) {
8791 /* POS_FIXED_PT is always last. */
8792 unsigned pos = key->ps_prolog.num_input_sgprs +
8793 key->ps_prolog.num_input_vgprs - 1;
8794 LLVMValueRef ptr[2], list;
8795
8796 /* Get the pointer to rw buffers. */
8797 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8798 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8799 list = lp_build_gather_values(gallivm, ptr, 2);
8800 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8801 list = LLVMBuildIntToPtr(gallivm->builder, list,
8802 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
8803
8804 si_llvm_emit_polygon_stipple(ctx, list, pos);
8805 }
8806
8807 if (key->ps_prolog.states.bc_optimize_for_persp ||
8808 key->ps_prolog.states.bc_optimize_for_linear) {
8809 unsigned i, base = key->ps_prolog.num_input_sgprs;
8810 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8811
8812 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8813 * The hw doesn't compute CENTROID if the whole wave only
8814 * contains fully-covered quads.
8815 *
8816 * PRIM_MASK is after user SGPRs.
8817 */
8818 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8819 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8820 LLVMConstInt(ctx->i32, 31, 0), "");
8821 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8822 ctx->i1, "");
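/* In plain C terms (sketch, not original source) this is just:
 *   bc_optimize = (prim_mask >> 31) & 1;
 * i.e. test bit 31 of the PRIM_MASK SGPR.
 */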
8823
8824 if (key->ps_prolog.states.bc_optimize_for_persp) {
8825 /* Read PERSP_CENTER. */
8826 for (i = 0; i < 2; i++)
8827 center[i] = LLVMGetParam(func, base + 2 + i);
8828 /* Read PERSP_CENTROID. */
8829 for (i = 0; i < 2; i++)
8830 centroid[i] = LLVMGetParam(func, base + 4 + i);
8831 /* Select PERSP_CENTROID. */
8832 for (i = 0; i < 2; i++) {
8833 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8834 center[i], centroid[i], "");
8835 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8836 tmp, base + 4 + i, "");
8837 }
8838 }
8839 if (key->ps_prolog.states.bc_optimize_for_linear) {
8840 /* Read LINEAR_CENTER. */
8841 for (i = 0; i < 2; i++)
8842 center[i] = LLVMGetParam(func, base + 8 + i);
8843 /* Read LINEAR_CENTROID. */
8844 for (i = 0; i < 2; i++)
8845 centroid[i] = LLVMGetParam(func, base + 10 + i);
8846 /* Select LINEAR_CENTROID. */
8847 for (i = 0; i < 2; i++) {
8848 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8849 center[i], centroid[i], "");
8850 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8851 tmp, base + 10 + i, "");
8852 }
8853 }
8854 }
8855
8856 /* Force per-sample interpolation. */
8857 if (key->ps_prolog.states.force_persp_sample_interp) {
8858 unsigned i, base = key->ps_prolog.num_input_sgprs;
8859 LLVMValueRef persp_sample[2];
8860
8861 /* Read PERSP_SAMPLE. */
8862 for (i = 0; i < 2; i++)
8863 persp_sample[i] = LLVMGetParam(func, base + i);
8864 /* Overwrite PERSP_CENTER. */
8865 for (i = 0; i < 2; i++)
8866 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8867 persp_sample[i], base + 2 + i, "");
8868 /* Overwrite PERSP_CENTROID. */
8869 for (i = 0; i < 2; i++)
8870 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8871 persp_sample[i], base + 4 + i, "");
8872 }
8873 if (key->ps_prolog.states.force_linear_sample_interp) {
8874 unsigned i, base = key->ps_prolog.num_input_sgprs;
8875 LLVMValueRef linear_sample[2];
8876
8877 /* Read LINEAR_SAMPLE. */
8878 for (i = 0; i < 2; i++)
8879 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8880 /* Overwrite LINEAR_CENTER. */
8881 for (i = 0; i < 2; i++)
8882 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8883 linear_sample[i], base + 8 + i, "");
8884 /* Overwrite LINEAR_CENTROID. */
8885 for (i = 0; i < 2; i++)
8886 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8887 linear_sample[i], base + 10 + i, "");
8888 }
8889
8890 /* Force center interpolation. */
8891 if (key->ps_prolog.states.force_persp_center_interp) {
8892 unsigned i, base = key->ps_prolog.num_input_sgprs;
8893 LLVMValueRef persp_center[2];
8894
8895 /* Read PERSP_CENTER. */
8896 for (i = 0; i < 2; i++)
8897 persp_center[i] = LLVMGetParam(func, base + 2 + i);
8898 /* Overwrite PERSP_SAMPLE. */
8899 for (i = 0; i < 2; i++)
8900 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8901 persp_center[i], base + i, "");
8902 /* Overwrite PERSP_CENTROID. */
8903 for (i = 0; i < 2; i++)
8904 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8905 persp_center[i], base + 4 + i, "");
8906 }
8907 if (key->ps_prolog.states.force_linear_center_interp) {
8908 unsigned i, base = key->ps_prolog.num_input_sgprs;
8909 LLVMValueRef linear_center[2];
8910
8911 /* Read LINEAR_CENTER. */
8912 for (i = 0; i < 2; i++)
8913 linear_center[i] = LLVMGetParam(func, base + 8 + i);
8914 /* Overwrite LINEAR_SAMPLE. */
8915 for (i = 0; i < 2; i++)
8916 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8917 linear_center[i], base + 6 + i, "");
8918 /* Overwrite LINEAR_CENTROID. */
8919 for (i = 0; i < 2; i++)
8920 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8921 linear_center[i], base + 10 + i, "");
8922 }
8923
8924 /* Interpolate colors. */
8925 for (i = 0; i < 2; i++) {
8926 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8927 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8928 key->ps_prolog.face_vgpr_index;
8929 LLVMValueRef interp[2], color[4];
8930 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8931
8932 if (!writemask)
8933 continue;
8934
8935 /* If the interpolation qualifier is not CONSTANT (-1). */
8936 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8937 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8938 key->ps_prolog.color_interp_vgpr_index[i];
8939
8940 /* Get the (i,j) updated by bc_optimize handling. */
8941 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8942 interp_vgpr, "");
8943 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8944 interp_vgpr + 1, "");
8945 interp_ij = lp_build_gather_values(gallivm, interp, 2);
8946 }
8947
8948 /* Use the absolute location of the input. */
8949 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8950
8951 if (key->ps_prolog.states.color_two_side) {
8952 face = LLVMGetParam(func, face_vgpr);
8953 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8954 }
8955
8956 interp_fs_input(ctx,
8957 key->ps_prolog.color_attr_index[i],
8958 TGSI_SEMANTIC_COLOR, i,
8959 key->ps_prolog.num_interp_inputs,
8960 key->ps_prolog.colors_read, interp_ij,
8961 prim_mask, face, color);
8962
8963 while (writemask) {
8964 unsigned chan = u_bit_scan(&writemask);
8965 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8966 num_params++, "");
8967 }
8968 }
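/* Worked example (illustrative): colors_read = 0x0f means COLOR0.xyzw
 * is read and COLOR1 is not, so the loop above appends exactly four
 * interpolated values, one per set writemask bit, to the return value.
 */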
8969
8970 /* Tell LLVM to insert the WQM instruction sequence when needed. */
8971 if (key->ps_prolog.wqm) {
8972 LLVMAddTargetDependentFunctionAttr(func,
8973 "amdgpu-ps-wqm-outputs", "");
8974 }
8975
8976 si_llvm_build_ret(ctx, ret);
8977 }
8978
8979 /**
8980 * Build the pixel shader epilog function. This handles everything that must be
8981 * emulated for pixel shader exports (alpha test, format conversions, etc.).
8982 */
8983 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8984 union si_shader_part_key *key)
8985 {
8986 struct gallivm_state *gallivm = &ctx->gallivm;
8987 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8988 LLVMTypeRef params[16+8*4+3];
8989 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8990 int last_sgpr, num_params = 0, i;
8991 struct si_ps_exports exp = {};
8992
8993 /* Declare input SGPRs. */
8994 params[ctx->param_rw_buffers = num_params++] = ctx->i64;
8995 params[ctx->param_const_buffers = num_params++] = ctx->i64;
8996 params[ctx->param_samplers = num_params++] = ctx->i64;
8997 params[ctx->param_images = num_params++] = ctx->i64;
8998 params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8999 assert(num_params == SI_PARAM_ALPHA_REF);
9000 params[SI_PARAM_ALPHA_REF] = ctx->f32;
9001 last_sgpr = SI_PARAM_ALPHA_REF;
9002
9003 /* Declare input VGPRs. */
9004 num_params = (last_sgpr + 1) +
9005 util_bitcount(key->ps_epilog.colors_written) * 4 +
9006 key->ps_epilog.writes_z +
9007 key->ps_epilog.writes_stencil +
9008 key->ps_epilog.writes_samplemask;
9009
9010 num_params = MAX2(num_params,
9011 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
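/* Worked example (illustrative): with colors_written = 0x3 and
 * writes_z = 1, the formula above gives 6 + 2*4 + 1 = 15 params, but
 * the MAX2 with the fixed sample-mask slot raises it to 6 + 13 + 1 = 20,
 * so the sample mask can stay in its usual VGPR (see
 * PS_EPILOG_SAMPLEMASK_MIN_LOC).
 */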
9012
9013 assert(num_params <= ARRAY_SIZE(params));
9014
9015 for (i = last_sgpr + 1; i < num_params; i++)
9016 params[i] = ctx->f32;
9017
9018 /* Create the function. */
9019 si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
9020 /* Disable elimination of unused inputs. */
9021 si_llvm_add_attribute(ctx->main_fn,
9022 "InitialPSInputAddr", 0xffffff);
9023
9024 /* Process colors. */
9025 unsigned vgpr = last_sgpr + 1;
9026 unsigned colors_written = key->ps_epilog.colors_written;
9027 int last_color_export = -1;
9028
9029 /* Find the last color export. */
9030 if (!key->ps_epilog.writes_z &&
9031 !key->ps_epilog.writes_stencil &&
9032 !key->ps_epilog.writes_samplemask) {
9033 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
9034
9035 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
9036 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
9037 /* Just set this if any of the colorbuffers are enabled. */
9038 if (spi_format &
9039 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
9040 last_color_export = 0;
9041 } else {
9042 for (i = 0; i < 8; i++)
9043 if (colors_written & (1 << i) &&
9044 (spi_format >> (i * 4)) & 0xf)
9045 last_color_export = i;
9046 }
9047 }
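/* Worked example (illustrative): colors_written = 0x5 (MRT0 and MRT2)
 * with a spi_shader_col_format that enables both MRTs yields
 * last_color_export = 2, so the MRT2 export is marked as the last
 * color export.
 */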
9048
9049 while (colors_written) {
9050 LLVMValueRef color[4];
9051 int mrt = u_bit_scan(&colors_written);
9052
9053 for (i = 0; i < 4; i++)
9054 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
9055
9056 si_export_mrt_color(bld_base, color, mrt,
9057 num_params - 1,
9058 mrt == last_color_export, &exp);
9059 }
9060
9061 /* Process depth, stencil, samplemask. */
9062 if (key->ps_epilog.writes_z)
9063 depth = LLVMGetParam(ctx->main_fn, vgpr++);
9064 if (key->ps_epilog.writes_stencil)
9065 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
9066 if (key->ps_epilog.writes_samplemask)
9067 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
9068
9069 if (depth || stencil || samplemask)
9070 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
9071 else if (last_color_export == -1)
9072 si_export_null(bld_base);
9073
9074 if (exp.num)
9075 si_emit_ps_exports(ctx, &exp);
9076
9077 /* Compile. */
9078 LLVMBuildRetVoid(gallivm->builder);
9079 }
9080
9081 /**
9082 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
9083 */
9084 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
9085 LLVMTargetMachineRef tm,
9086 struct si_shader *shader,
9087 struct pipe_debug_callback *debug)
9088 {
9089 union si_shader_part_key prolog_key;
9090 union si_shader_part_key epilog_key;
9091
9092 /* Get the prolog. */
9093 si_get_ps_prolog_key(shader, &prolog_key, true);
9094
9095 /* The prolog is a no-op if these aren't set. */
9096 if (si_need_ps_prolog(&prolog_key)) {
9097 shader->prolog =
9098 si_get_shader_part(sscreen, &sscreen->ps_prologs,
9099 PIPE_SHADER_FRAGMENT, true,
9100 &prolog_key, tm, debug,
9101 si_build_ps_prolog_function,
9102 "Fragment Shader Prolog");
9103 if (!shader->prolog)
9104 return false;
9105 }
9106
9107 /* Get the epilog. */
9108 si_get_ps_epilog_key(shader, &epilog_key);
9109
9110 shader->epilog =
9111 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
9112 PIPE_SHADER_FRAGMENT, false,
9113 &epilog_key, tm, debug,
9114 si_build_ps_epilog_function,
9115 "Fragment Shader Epilog");
9116 if (!shader->epilog)
9117 return false;
9118
9119 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
9120 if (shader->key.part.ps.prolog.poly_stipple) {
9121 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
9122 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
9123 }
9124
9125 /* Set up the enable bits for per-sample shading if needed. */
9126 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
9127 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
9128 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9129 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
9130 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
9131 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
9132 }
9133 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
9134 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
9135 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9136 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
9137 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9138 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
9139 }
9140 if (shader->key.part.ps.prolog.force_persp_center_interp &&
9141 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9142 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9143 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
9144 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
9145 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9146 }
9147 if (shader->key.part.ps.prolog.force_linear_center_interp &&
9148 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9149 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9150 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
9151 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9152 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9153 }
9154
9155 /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
9156 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
9157 !(shader->config.spi_ps_input_ena & 0xf)) {
9158 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9159 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
9160 }
9161
9162 /* At least one pair of interpolation weights must be enabled. */
9163 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
9164 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9165 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
9166 }
9167
9168 /* The sample mask input is always enabled, because the API shader always
9169 * passes it through to the epilog. Disable it here if it's unused.
9170 */
9171 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
9172 !shader->selector->info.reads_samplemask)
9173 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
9174
9175 return true;
9176 }
9177
9178 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
9179 unsigned *lds_size)
9180 {
9181 /* SPI barrier management bug:
9182 * Make sure we have at least 4k of LDS in use to avoid the bug.
9183 * It applies to workgroup sizes of more than one wavefront.
9184 */
9185 if (sscreen->b.family == CHIP_BONAIRE ||
9186 sscreen->b.family == CHIP_KABINI ||
9187 sscreen->b.family == CHIP_MULLINS)
9188 *lds_size = MAX2(*lds_size, 8);
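/* The value 8 is in LDS-size units; on these GFX7 parts one unit is
 * 128 dwords (512 bytes) by our reading of the PGM_RSRC2.LDS_SIZE
 * encoding, so 8 units = 4 KiB, the "4k" mentioned above.
 */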
9189 }
9190
9191 static void si_fix_resource_usage(struct si_screen *sscreen,
9192 struct si_shader *shader)
9193 {
9194 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
9195
9196 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
9197
9198 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
9199 si_get_max_workgroup_size(shader) > 64) {
9200 si_multiwave_lds_size_workaround(sscreen,
9201 &shader->config.lds_size);
9202 }
9203 }
9204
9205 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
9206 struct si_shader *shader,
9207 struct pipe_debug_callback *debug)
9208 {
9209 struct si_shader_selector *sel = shader->selector;
9210 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
9211 int r;
9212
9213 /* LS, ES, VS are compiled on demand if the main part hasn't been
9214 * compiled for that stage.
9215 *
9216 * Vertex shaders are compiled on demand when a vertex fetch
9217 * workaround must be applied.
9218 */
9219 if (shader->is_monolithic) {
9220 /* Monolithic shader (compiled as a whole, has many variants,
9221 * may take a long time to compile).
9222 */
9223 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
9224 if (r)
9225 return r;
9226 } else {
9227 /* The shader consists of 2-3 parts:
9228 *
9229 * - the middle part is the user shader; it has only one variant
9230 * and was compiled during the creation of the shader
9231 * selector
9232 * - the prolog part is inserted at the beginning
9233 * - the epilog part is inserted at the end
9234 *
9235 * The prolog and epilog have many (but simple) variants.
9236 */
9237
9238 /* Copy the compiled TGSI shader data over. */
9239 shader->is_binary_shared = true;
9240 shader->binary = mainp->binary;
9241 shader->config = mainp->config;
9242 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
9243 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
9244 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
9245 memcpy(shader->info.vs_output_param_offset,
9246 mainp->info.vs_output_param_offset,
9247 sizeof(mainp->info.vs_output_param_offset));
9248 shader->info.uses_instanceid = mainp->info.uses_instanceid;
9249 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
9250 shader->info.nr_param_exports = mainp->info.nr_param_exports;
9251
9252 /* Select prologs and/or epilogs. */
9253 switch (sel->type) {
9254 case PIPE_SHADER_VERTEX:
9255 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
9256 return -1;
9257 break;
9258 case PIPE_SHADER_TESS_CTRL:
9259 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
9260 return -1;
9261 break;
9262 case PIPE_SHADER_TESS_EVAL:
9263 if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
9264 return -1;
9265 break;
9266 case PIPE_SHADER_GEOMETRY:
9267 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
9268 return -1;
9269 break;
9270 case PIPE_SHADER_FRAGMENT:
9271 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
9272 return -1;
9273
9274 /* Make sure we have at least as many VGPRs as there
9275 * are allocated inputs.
9276 */
9277 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9278 shader->info.num_input_vgprs);
9279 break;
9280 }
9281
9282 /* Update SGPR and VGPR counts. */
9283 if (shader->prolog) {
9284 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9285 shader->prolog->config.num_sgprs);
9286 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9287 shader->prolog->config.num_vgprs);
9288 }
9289 if (shader->previous_stage) {
9290 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9291 shader->previous_stage->config.num_sgprs);
9292 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9293 shader->previous_stage->config.num_vgprs);
9294 shader->config.spilled_sgprs =
9295 MAX2(shader->config.spilled_sgprs,
9296 shader->previous_stage->config.spilled_sgprs);
9297 shader->config.spilled_vgprs =
9298 MAX2(shader->config.spilled_vgprs,
9299 shader->previous_stage->config.spilled_vgprs);
9300 shader->config.private_mem_vgprs =
9301 MAX2(shader->config.private_mem_vgprs,
9302 shader->previous_stage->config.private_mem_vgprs);
9303 shader->config.scratch_bytes_per_wave =
9304 MAX2(shader->config.scratch_bytes_per_wave,
9305 shader->previous_stage->config.scratch_bytes_per_wave);
9306 shader->info.uses_instanceid |=
9307 shader->previous_stage->info.uses_instanceid;
9308 }
9309 if (shader->prolog2) {
9310 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9311 shader->prolog2->config.num_sgprs);
9312 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9313 shader->prolog2->config.num_vgprs);
9314 }
9315 if (shader->epilog) {
9316 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9317 shader->epilog->config.num_sgprs);
9318 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9319 shader->epilog->config.num_vgprs);
9320 }
9321 }
9322
9323 si_fix_resource_usage(sscreen, shader);
9324 si_shader_dump(sscreen, shader, debug, sel->info.processor,
9325 stderr, true);
9326
9327 /* Upload. */
9328 r = si_shader_binary_upload(sscreen, shader);
9329 if (r) {
9330 fprintf(stderr, "LLVM failed to upload shader\n");
9331 return r;
9332 }
9333
9334 return 0;
9335 }
9336
9337 void si_shader_destroy(struct si_shader *shader)
9338 {
9339 if (shader->scratch_bo)
9340 r600_resource_reference(&shader->scratch_bo, NULL);
9341
9342 r600_resource_reference(&shader->bo, NULL);
9343
9344 if (!shader->is_binary_shared)
9345 radeon_shader_binary_clean(&shader->binary);
9346
9347 free(shader->shader_log);
9348 }