radeonsi/gfx9: add GS prolog support for merged ES-GS
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49
50 static const char *scratch_rsrc_dword0_symbol =
51 "SCRATCH_RSRC_DWORD0";
52
53 static const char *scratch_rsrc_dword1_symbol =
54 "SCRATCH_RSRC_DWORD1";
55
56 struct si_shader_output_values
57 {
58 LLVMValueRef values[4];
59 unsigned semantic_name;
60 unsigned semantic_index;
61 ubyte vertex_stream[4];
62 };
63
64 static void si_init_shader_ctx(struct si_shader_context *ctx,
65 struct si_screen *sscreen,
66 LLVMTargetMachineRef tm);
67
68 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
69 struct lp_build_tgsi_context *bld_base,
70 struct lp_build_emit_data *emit_data);
71
72 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
73 FILE *f);
74
75 static unsigned llvm_get_type_size(LLVMTypeRef type);
76
77 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
78 union si_shader_part_key *key);
79 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
80 union si_shader_part_key *key);
81 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
82 union si_shader_part_key *key);
83 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
84 union si_shader_part_key *key);
85 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
86 union si_shader_part_key *key);
87
88 /* Ideally pass the sample mask input to the PS epilog as v13, which
89 * is its usual location, so that the shader doesn't have to add v_mov.
90 */
91 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
92
93 /* The VS location of the PrimitiveID input is the same in the epilog,
94 * so that the main shader part doesn't have to move it.
95 */
96 #define VS_EPILOG_PRIMID_LOC 2
97
98 enum {
99 CONST_ADDR_SPACE = 2,
100 LOCAL_ADDR_SPACE = 3,
101 };
102
103 static bool is_merged_shader(struct si_shader *shader)
104 {
105 if (shader->selector->screen->b.chip_class <= VI)
106 return false;
107
108 return shader->key.as_ls ||
109 shader->key.as_es ||
110 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
111 shader->selector->type == PIPE_SHADER_GEOMETRY;
112 }
113
114 /**
115 * Returns a unique index for a semantic name and index. The index must be
116 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
117 * calculated.
118 */
119 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
120 {
121 switch (semantic_name) {
122 case TGSI_SEMANTIC_POSITION:
123 return 0;
124 case TGSI_SEMANTIC_PSIZE:
125 return 1;
126 case TGSI_SEMANTIC_CLIPDIST:
127 assert(index <= 1);
128 return 2 + index;
129 case TGSI_SEMANTIC_GENERIC:
130 if (index <= 63-4)
131 return 4 + index;
132
133 assert(!"invalid generic index");
134 return 0;
135
136 /* patch indices are completely separate and thus start from 0 */
137 case TGSI_SEMANTIC_TESSOUTER:
138 return 0;
139 case TGSI_SEMANTIC_TESSINNER:
140 return 1;
141 case TGSI_SEMANTIC_PATCH:
142 return 2 + index;
143
144 default:
145 assert(!"invalid semantic name");
146 return 0;
147 }
148 }
149
150 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
151 {
152 switch (name) {
153 case TGSI_SEMANTIC_FOG:
154 return 0;
155 case TGSI_SEMANTIC_LAYER:
156 return 1;
157 case TGSI_SEMANTIC_VIEWPORT_INDEX:
158 return 2;
159 case TGSI_SEMANTIC_PRIMID:
160 return 3;
161 case TGSI_SEMANTIC_COLOR: /* these alias */
162 case TGSI_SEMANTIC_BCOLOR:
163 return 4 + index;
164 case TGSI_SEMANTIC_TEXCOORD:
165 return 6 + index;
166 default:
167 assert(!"invalid semantic name");
168 return 0;
169 }
170 }
171
172 /**
173 * Get the value of a shader input parameter and extract a bitfield.
174 */
175 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
176 unsigned param, unsigned rshift,
177 unsigned bitwidth)
178 {
179 struct gallivm_state *gallivm = &ctx->gallivm;
180 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
181 param);
182
183 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
184 value = bitcast(&ctx->bld_base,
185 TGSI_TYPE_UNSIGNED, value);
186
187 if (rshift)
188 value = LLVMBuildLShr(gallivm->builder, value,
189 LLVMConstInt(ctx->i32, rshift, 0), "");
190
191 if (rshift + bitwidth < 32) {
192 unsigned mask = (1 << bitwidth) - 1;
193 value = LLVMBuildAnd(gallivm->builder, value,
194 LLVMConstInt(ctx->i32, mask, 0), "");
195 }
196
197 return value;
198 }
199
200 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
201 {
202 switch (ctx->type) {
203 case PIPE_SHADER_TESS_CTRL:
204 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
205
206 case PIPE_SHADER_TESS_EVAL:
207 return LLVMGetParam(ctx->main_fn,
208 ctx->param_tes_rel_patch_id);
209
210 default:
211 assert(0);
212 return NULL;
213 }
214 }
215
216 /* Tessellation shaders pass outputs to the next shader using LDS.
217 *
218 * LS outputs = TCS inputs
219 * TCS outputs = TES inputs
220 *
221 * The LDS layout is:
222 * - TCS inputs for patch 0
223 * - TCS inputs for patch 1
224 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
225 * - ...
226 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
227 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
228 * - TCS outputs for patch 1
229 * - Per-patch TCS outputs for patch 1
230 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
231 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
232 * - ...
233 *
234 * All three shaders VS(LS), TCS, TES share the same LDS space.
235 */
236
237 static LLVMValueRef
238 get_tcs_in_patch_stride(struct si_shader_context *ctx)
239 {
240 return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
241 }
242
243 static LLVMValueRef
244 get_tcs_out_patch_stride(struct si_shader_context *ctx)
245 {
246 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
247 }
248
249 static LLVMValueRef
250 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
251 {
252 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
253 unpack_param(ctx,
254 ctx->param_tcs_out_lds_offsets,
255 0, 16),
256 4);
257 }
258
259 static LLVMValueRef
260 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
261 {
262 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
263 unpack_param(ctx,
264 ctx->param_tcs_out_lds_offsets,
265 16, 16),
266 4);
267 }
268
269 static LLVMValueRef
270 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
271 {
272 struct gallivm_state *gallivm = &ctx->gallivm;
273 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
274 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
275
276 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
277 }
278
279 static LLVMValueRef
280 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
281 {
282 struct gallivm_state *gallivm = &ctx->gallivm;
283 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
284 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
285 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
286
287 return LLVMBuildAdd(gallivm->builder, patch0_offset,
288 LLVMBuildMul(gallivm->builder, patch_stride,
289 rel_patch_id, ""),
290 "");
291 }
292
293 static LLVMValueRef
294 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
295 {
296 struct gallivm_state *gallivm = &ctx->gallivm;
297 LLVMValueRef patch0_patch_data_offset =
298 get_tcs_out_patch0_patch_data_offset(ctx);
299 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
300 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
301
302 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
303 LLVMBuildMul(gallivm->builder, patch_stride,
304 rel_patch_id, ""),
305 "");
306 }
307
308 static LLVMValueRef get_instance_index_for_fetch(
309 struct si_shader_context *ctx,
310 unsigned param_start_instance, unsigned divisor)
311 {
312 struct gallivm_state *gallivm = &ctx->gallivm;
313
314 LLVMValueRef result = LLVMGetParam(ctx->main_fn,
315 ctx->param_instance_id);
316
317 /* The division must be done before START_INSTANCE is added. */
318 if (divisor > 1)
319 result = LLVMBuildUDiv(gallivm->builder, result,
320 LLVMConstInt(ctx->i32, divisor, 0), "");
321
322 return LLVMBuildAdd(gallivm->builder, result,
323 LLVMGetParam(ctx->main_fn, param_start_instance), "");
324 }
325
326 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
327 * to float. */
328 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
329 LLVMValueRef vec4,
330 unsigned double_index)
331 {
332 LLVMBuilderRef builder = ctx->gallivm.builder;
333 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
334 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
335 LLVMVectorType(f64, 2), "");
336 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
337 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
338 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
339 }
340
341 static void declare_input_vs(
342 struct si_shader_context *ctx,
343 unsigned input_index,
344 const struct tgsi_full_declaration *decl,
345 LLVMValueRef out[4])
346 {
347 struct gallivm_state *gallivm = &ctx->gallivm;
348
349 unsigned chan;
350 unsigned fix_fetch;
351 unsigned num_fetches;
352 unsigned fetch_stride;
353
354 LLVMValueRef t_list_ptr;
355 LLVMValueRef t_offset;
356 LLVMValueRef t_list;
357 LLVMValueRef vertex_index;
358 LLVMValueRef input[3];
359
360 /* Load the T list */
361 t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
362
363 t_offset = LLVMConstInt(ctx->i32, input_index, 0);
364
365 t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
366
367 vertex_index = LLVMGetParam(ctx->main_fn,
368 ctx->param_vertex_index0 +
369 input_index);
370
371 fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
372
373 /* Do multiple loads for special formats. */
374 switch (fix_fetch) {
375 case SI_FIX_FETCH_RGB_64_FLOAT:
376 num_fetches = 3; /* 3 2-dword loads */
377 fetch_stride = 8;
378 break;
379 case SI_FIX_FETCH_RGBA_64_FLOAT:
380 num_fetches = 2; /* 2 4-dword loads */
381 fetch_stride = 16;
382 break;
383 case SI_FIX_FETCH_RGB_8:
384 case SI_FIX_FETCH_RGB_8_INT:
385 num_fetches = 3;
386 fetch_stride = 1;
387 break;
388 case SI_FIX_FETCH_RGB_16:
389 case SI_FIX_FETCH_RGB_16_INT:
390 num_fetches = 3;
391 fetch_stride = 2;
392 break;
393 default:
394 num_fetches = 1;
395 fetch_stride = 0;
396 }
397
398 for (unsigned i = 0; i < num_fetches; i++) {
399 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
400
401 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
402 vertex_index, voffset,
403 true);
404 }
405
406 /* Break up the vec4 into individual components */
407 for (chan = 0; chan < 4; chan++) {
408 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
409 out[chan] = LLVMBuildExtractElement(gallivm->builder,
410 input[0], llvm_chan, "");
411 }
412
413 switch (fix_fetch) {
414 case SI_FIX_FETCH_A2_SNORM:
415 case SI_FIX_FETCH_A2_SSCALED:
416 case SI_FIX_FETCH_A2_SINT: {
417 /* The hardware returns an unsigned value; convert it to a
418 * signed one.
419 */
420 LLVMValueRef tmp = out[3];
421 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
422
423 /* First, recover the sign-extended signed integer value. */
424 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
425 tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
426 else
427 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");
428
429 /* For the integer-like cases, do a natural sign extension.
430 *
431 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
432 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
433 * exponent.
434 */
435 tmp = LLVMBuildShl(gallivm->builder, tmp,
436 fix_fetch == SI_FIX_FETCH_A2_SNORM ?
437 LLVMConstInt(ctx->i32, 7, 0) : c30, "");
438 tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");
439
440 /* Convert back to the right type. */
441 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
442 LLVMValueRef clamp;
443 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
444 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
445 clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
446 tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
447 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
448 tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
449 }
450
451 out[3] = tmp;
452 break;
453 }
454 case SI_FIX_FETCH_RGBA_32_UNORM:
455 case SI_FIX_FETCH_RGBX_32_UNORM:
456 for (chan = 0; chan < 4; chan++) {
457 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
458 ctx->i32, "");
459 out[chan] = LLVMBuildUIToFP(gallivm->builder,
460 out[chan], ctx->f32, "");
461 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
462 LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
463 }
464 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
465 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
466 out[3] = LLVMConstReal(ctx->f32, 1);
467 break;
468 case SI_FIX_FETCH_RGBA_32_SNORM:
469 case SI_FIX_FETCH_RGBX_32_SNORM:
470 case SI_FIX_FETCH_RGBA_32_FIXED:
471 case SI_FIX_FETCH_RGBX_32_FIXED: {
472 double scale;
473 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
474 scale = 1.0 / 0x10000;
475 else
476 scale = 1.0 / INT_MAX;
477
478 for (chan = 0; chan < 4; chan++) {
479 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
480 ctx->i32, "");
481 out[chan] = LLVMBuildSIToFP(gallivm->builder,
482 out[chan], ctx->f32, "");
483 out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
484 LLVMConstReal(ctx->f32, scale), "");
485 }
486 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
487 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
488 fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
489 out[3] = LLVMConstReal(ctx->f32, 1);
490 break;
491 }
492 case SI_FIX_FETCH_RGBA_32_USCALED:
493 for (chan = 0; chan < 4; chan++) {
494 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
495 ctx->i32, "");
496 out[chan] = LLVMBuildUIToFP(gallivm->builder,
497 out[chan], ctx->f32, "");
498 }
499 break;
500 case SI_FIX_FETCH_RGBA_32_SSCALED:
501 for (chan = 0; chan < 4; chan++) {
502 out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
503 ctx->i32, "");
504 out[chan] = LLVMBuildSIToFP(gallivm->builder,
505 out[chan], ctx->f32, "");
506 }
507 break;
508 case SI_FIX_FETCH_RG_64_FLOAT:
509 for (chan = 0; chan < 2; chan++)
510 out[chan] = extract_double_to_float(ctx, input[0], chan);
511
512 out[2] = LLVMConstReal(ctx->f32, 0);
513 out[3] = LLVMConstReal(ctx->f32, 1);
514 break;
515 case SI_FIX_FETCH_RGB_64_FLOAT:
516 for (chan = 0; chan < 3; chan++)
517 out[chan] = extract_double_to_float(ctx, input[chan], 0);
518
519 out[3] = LLVMConstReal(ctx->f32, 1);
520 break;
521 case SI_FIX_FETCH_RGBA_64_FLOAT:
522 for (chan = 0; chan < 4; chan++) {
523 out[chan] = extract_double_to_float(ctx, input[chan / 2],
524 chan % 2);
525 }
526 break;
527 case SI_FIX_FETCH_RGB_8:
528 case SI_FIX_FETCH_RGB_8_INT:
529 case SI_FIX_FETCH_RGB_16:
530 case SI_FIX_FETCH_RGB_16_INT:
531 for (chan = 0; chan < 3; chan++) {
532 out[chan] = LLVMBuildExtractElement(gallivm->builder,
533 input[chan],
534 ctx->i32_0, "");
535 }
536 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
537 fix_fetch == SI_FIX_FETCH_RGB_16) {
538 out[3] = LLVMConstReal(ctx->f32, 1);
539 } else {
540 out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
541 ctx->f32, "");
542 }
543 break;
544 }
545 }
546
547 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
548 unsigned swizzle)
549 {
550 struct si_shader_context *ctx = si_shader_context(bld_base);
551
552 if (swizzle > 0)
553 return ctx->i32_0;
554
555 switch (ctx->type) {
556 case PIPE_SHADER_VERTEX:
557 return LLVMGetParam(ctx->main_fn,
558 ctx->param_vs_prim_id);
559 case PIPE_SHADER_TESS_CTRL:
560 return LLVMGetParam(ctx->main_fn,
561 ctx->param_tcs_patch_id);
562 case PIPE_SHADER_TESS_EVAL:
563 return LLVMGetParam(ctx->main_fn,
564 ctx->param_tes_patch_id);
565 case PIPE_SHADER_GEOMETRY:
566 return LLVMGetParam(ctx->main_fn,
567 ctx->param_gs_prim_id);
568 default:
569 assert(0);
570 return ctx->i32_0;
571 }
572 }
573
574 /**
575 * Return the value of tgsi_ind_register for indexing.
576 * This is the indirect index with the constant offset added to it.
577 */
578 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
579 const struct tgsi_ind_register *ind,
580 int rel_index)
581 {
582 struct gallivm_state *gallivm = &ctx->gallivm;
583 LLVMValueRef result;
584
585 result = ctx->addrs[ind->Index][ind->Swizzle];
586 result = LLVMBuildLoad(gallivm->builder, result, "");
587 result = LLVMBuildAdd(gallivm->builder, result,
588 LLVMConstInt(ctx->i32, rel_index, 0), "");
589 return result;
590 }
591
592 /**
593 * Like get_indirect_index, but restricts the return value to a (possibly
594 * undefined) value inside [0..num).
595 */
596 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
597 const struct tgsi_ind_register *ind,
598 int rel_index, unsigned num)
599 {
600 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
601
602 /* LLVM 3.8: If indirect resource indexing is used:
603 * - SI & CIK hang
604 * - VI crashes
605 */
606 if (HAVE_LLVM == 0x0308)
607 return LLVMGetUndef(ctx->i32);
608
609 return si_llvm_bound_index(ctx, result, num);
610 }
611
612
613 /**
614 * Calculate a dword address given an input or output register and a stride.
615 */
616 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
617 const struct tgsi_full_dst_register *dst,
618 const struct tgsi_full_src_register *src,
619 LLVMValueRef vertex_dw_stride,
620 LLVMValueRef base_addr)
621 {
622 struct gallivm_state *gallivm = &ctx->gallivm;
623 struct tgsi_shader_info *info = &ctx->shader->selector->info;
624 ubyte *name, *index, *array_first;
625 int first, param;
626 struct tgsi_full_dst_register reg;
627
628 /* Set the register description. The address computation is the same
629 * for sources and destinations. */
630 if (src) {
631 reg.Register.File = src->Register.File;
632 reg.Register.Index = src->Register.Index;
633 reg.Register.Indirect = src->Register.Indirect;
634 reg.Register.Dimension = src->Register.Dimension;
635 reg.Indirect = src->Indirect;
636 reg.Dimension = src->Dimension;
637 reg.DimIndirect = src->DimIndirect;
638 } else
639 reg = *dst;
640
641 /* If the register is 2-dimensional (e.g. an array of vertices
642 * in a primitive), calculate the base address of the vertex. */
643 if (reg.Register.Dimension) {
644 LLVMValueRef index;
645
646 if (reg.Dimension.Indirect)
647 index = get_indirect_index(ctx, &reg.DimIndirect,
648 reg.Dimension.Index);
649 else
650 index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
651
652 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
653 LLVMBuildMul(gallivm->builder, index,
654 vertex_dw_stride, ""), "");
655 }
656
657 /* Get information about the register. */
658 if (reg.Register.File == TGSI_FILE_INPUT) {
659 name = info->input_semantic_name;
660 index = info->input_semantic_index;
661 array_first = info->input_array_first;
662 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
663 name = info->output_semantic_name;
664 index = info->output_semantic_index;
665 array_first = info->output_array_first;
666 } else {
667 assert(0);
668 return NULL;
669 }
670
671 if (reg.Register.Indirect) {
672 /* Add the relative address of the element. */
673 LLVMValueRef ind_index;
674
675 if (reg.Indirect.ArrayID)
676 first = array_first[reg.Indirect.ArrayID];
677 else
678 first = reg.Register.Index;
679
680 ind_index = get_indirect_index(ctx, &reg.Indirect,
681 reg.Register.Index - first);
682
683 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
684 LLVMBuildMul(gallivm->builder, ind_index,
685 LLVMConstInt(ctx->i32, 4, 0), ""), "");
686
687 param = si_shader_io_get_unique_index(name[first], index[first]);
688 } else {
689 param = si_shader_io_get_unique_index(name[reg.Register.Index],
690 index[reg.Register.Index]);
691 }
692
693 /* Add the base address of the element. */
694 return LLVMBuildAdd(gallivm->builder, base_addr,
695 LLVMConstInt(ctx->i32, param * 4, 0), "");
696 }
697
698 /* The offchip buffer layout for TCS->TES is
699 *
700 * - attribute 0 of patch 0 vertex 0
701 * - attribute 0 of patch 0 vertex 1
702 * - attribute 0 of patch 0 vertex 2
703 * ...
704 * - attribute 0 of patch 1 vertex 0
705 * - attribute 0 of patch 1 vertex 1
706 * ...
707 * - attribute 1 of patch 0 vertex 0
708 * - attribute 1 of patch 0 vertex 1
709 * ...
710 * - per patch attribute 0 of patch 0
711 * - per patch attribute 0 of patch 1
712 * ...
713 *
714 * Note that every attribute has 4 components.
715 */
716 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
717 LLVMValueRef rel_patch_id,
718 LLVMValueRef vertex_index,
719 LLVMValueRef param_index)
720 {
721 struct gallivm_state *gallivm = &ctx->gallivm;
722 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
723 LLVMValueRef param_stride, constant16;
724
725 vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 6);
726 num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 9);
727 total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
728 num_patches, "");
729
730 constant16 = LLVMConstInt(ctx->i32, 16, 0);
731 if (vertex_index) {
732 base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
733 vertices_per_patch, "");
734
735 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
736 vertex_index, "");
737
738 param_stride = total_vertices;
739 } else {
740 base_addr = rel_patch_id;
741 param_stride = num_patches;
742 }
743
744 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
745 LLVMBuildMul(gallivm->builder, param_index,
746 param_stride, ""), "");
747
748 base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
749
750 if (!vertex_index) {
751 LLVMValueRef patch_data_offset =
752 unpack_param(ctx, ctx->param_tcs_offchip_layout, 16, 16);
753
754 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
755 patch_data_offset, "");
756 }
757 return base_addr;
758 }
759
760 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
761 struct si_shader_context *ctx,
762 const struct tgsi_full_dst_register *dst,
763 const struct tgsi_full_src_register *src)
764 {
765 struct gallivm_state *gallivm = &ctx->gallivm;
766 struct tgsi_shader_info *info = &ctx->shader->selector->info;
767 ubyte *name, *index, *array_first;
768 struct tgsi_full_src_register reg;
769 LLVMValueRef vertex_index = NULL;
770 LLVMValueRef param_index = NULL;
771 unsigned param_index_base, param_base;
772
773 reg = src ? *src : tgsi_full_src_register_from_dst(dst);
774
775 if (reg.Register.Dimension) {
776
777 if (reg.Dimension.Indirect)
778 vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
779 reg.Dimension.Index);
780 else
781 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
782 }
783
784 /* Get information about the register. */
785 if (reg.Register.File == TGSI_FILE_INPUT) {
786 name = info->input_semantic_name;
787 index = info->input_semantic_index;
788 array_first = info->input_array_first;
789 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
790 name = info->output_semantic_name;
791 index = info->output_semantic_index;
792 array_first = info->output_array_first;
793 } else {
794 assert(0);
795 return NULL;
796 }
797
798 if (reg.Register.Indirect) {
799 if (reg.Indirect.ArrayID)
800 param_base = array_first[reg.Indirect.ArrayID];
801 else
802 param_base = reg.Register.Index;
803
804 param_index = get_indirect_index(ctx, &reg.Indirect,
805 reg.Register.Index - param_base);
806
807 } else {
808 param_base = reg.Register.Index;
809 param_index = ctx->i32_0;
810 }
811
812 param_index_base = si_shader_io_get_unique_index(name[param_base],
813 index[param_base]);
814
815 param_index = LLVMBuildAdd(gallivm->builder, param_index,
816 LLVMConstInt(ctx->i32, param_index_base, 0),
817 "");
818
819 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
820 vertex_index, param_index);
821 }
822
823 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
824 enum tgsi_opcode_type type, unsigned swizzle,
825 LLVMValueRef buffer, LLVMValueRef offset,
826 LLVMValueRef base, bool readonly_memory)
827 {
828 struct si_shader_context *ctx = si_shader_context(bld_base);
829 struct gallivm_state *gallivm = &ctx->gallivm;
830 LLVMValueRef value, value2;
831 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
832 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
833
834 if (swizzle == ~0) {
835 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
836 0, 1, 0, readonly_memory);
837
838 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
839 }
840
841 if (!tgsi_type_is_64bit(type)) {
842 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
843 0, 1, 0, readonly_memory);
844
845 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
846 return LLVMBuildExtractElement(gallivm->builder, value,
847 LLVMConstInt(ctx->i32, swizzle, 0), "");
848 }
849
850 value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
851 swizzle * 4, 1, 0, readonly_memory);
852
853 value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
854 swizzle * 4 + 4, 1, 0, readonly_memory);
855
856 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
857 }
858
859 /**
860 * Load from LDS.
861 *
862 * \param type output value type
863 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
864 * \param dw_addr address in dwords
865 */
866 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
867 enum tgsi_opcode_type type, unsigned swizzle,
868 LLVMValueRef dw_addr)
869 {
870 struct si_shader_context *ctx = si_shader_context(bld_base);
871 struct gallivm_state *gallivm = &ctx->gallivm;
872 LLVMValueRef value;
873
874 if (swizzle == ~0) {
875 LLVMValueRef values[TGSI_NUM_CHANNELS];
876
877 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
878 values[chan] = lds_load(bld_base, type, chan, dw_addr);
879
880 return lp_build_gather_values(gallivm, values,
881 TGSI_NUM_CHANNELS);
882 }
883
884 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
885 LLVMConstInt(ctx->i32, swizzle, 0));
886
887 value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
888 if (tgsi_type_is_64bit(type)) {
889 LLVMValueRef value2;
890 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
891 ctx->i32_1);
892 value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
893 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
894 }
895
896 return LLVMBuildBitCast(gallivm->builder, value,
897 tgsi2llvmtype(bld_base, type), "");
898 }
899
900 /**
901 * Store to LDS.
902 *
903 * \param swizzle offset (typically 0..3)
904 * \param dw_addr address in dwords
905 * \param value value to store
906 */
907 static void lds_store(struct lp_build_tgsi_context *bld_base,
908 unsigned dw_offset_imm, LLVMValueRef dw_addr,
909 LLVMValueRef value)
910 {
911 struct si_shader_context *ctx = si_shader_context(bld_base);
912 struct gallivm_state *gallivm = &ctx->gallivm;
913
914 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
915 LLVMConstInt(ctx->i32, dw_offset_imm, 0));
916
917 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
918 ac_build_indexed_store(&ctx->ac, ctx->lds,
919 dw_addr, value);
920 }
921
922 static LLVMValueRef fetch_input_tcs(
923 struct lp_build_tgsi_context *bld_base,
924 const struct tgsi_full_src_register *reg,
925 enum tgsi_opcode_type type, unsigned swizzle)
926 {
927 struct si_shader_context *ctx = si_shader_context(bld_base);
928 LLVMValueRef dw_addr, stride;
929
930 stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
931 dw_addr = get_tcs_in_current_patch_offset(ctx);
932 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
933
934 return lds_load(bld_base, type, swizzle, dw_addr);
935 }
936
937 static LLVMValueRef fetch_output_tcs(
938 struct lp_build_tgsi_context *bld_base,
939 const struct tgsi_full_src_register *reg,
940 enum tgsi_opcode_type type, unsigned swizzle)
941 {
942 struct si_shader_context *ctx = si_shader_context(bld_base);
943 LLVMValueRef dw_addr, stride;
944
945 if (reg->Register.Dimension) {
946 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
947 dw_addr = get_tcs_out_current_patch_offset(ctx);
948 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
949 } else {
950 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
951 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
952 }
953
954 return lds_load(bld_base, type, swizzle, dw_addr);
955 }
956
957 static LLVMValueRef fetch_input_tes(
958 struct lp_build_tgsi_context *bld_base,
959 const struct tgsi_full_src_register *reg,
960 enum tgsi_opcode_type type, unsigned swizzle)
961 {
962 struct si_shader_context *ctx = si_shader_context(bld_base);
963 LLVMValueRef rw_buffers, buffer, base, addr;
964
965 rw_buffers = LLVMGetParam(ctx->main_fn,
966 ctx->param_rw_buffers);
967 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
968 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
969
970 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
971 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
972
973 return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
974 }
975
976 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
977 const struct tgsi_full_instruction *inst,
978 const struct tgsi_opcode_info *info,
979 LLVMValueRef dst[4])
980 {
981 struct si_shader_context *ctx = si_shader_context(bld_base);
982 struct gallivm_state *gallivm = &ctx->gallivm;
983 const struct tgsi_full_dst_register *reg = &inst->Dst[0];
984 const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
985 unsigned chan_index;
986 LLVMValueRef dw_addr, stride;
987 LLVMValueRef rw_buffers, buffer, base, buf_addr;
988 LLVMValueRef values[4];
989 bool skip_lds_store;
990 bool is_tess_factor = false;
991
992 /* Only handle per-patch and per-vertex outputs here.
993 * Vectors will be lowered to scalars and this function will be called again.
994 */
995 if (reg->Register.File != TGSI_FILE_OUTPUT ||
996 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
997 si_llvm_emit_store(bld_base, inst, info, dst);
998 return;
999 }
1000
1001 if (reg->Register.Dimension) {
1002 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
1003 dw_addr = get_tcs_out_current_patch_offset(ctx);
1004 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1005 skip_lds_store = !sh_info->reads_pervertex_outputs;
1006 } else {
1007 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1008 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1009 skip_lds_store = !sh_info->reads_perpatch_outputs;
1010
1011 if (!reg->Register.Indirect) {
1012 int name = sh_info->output_semantic_name[reg->Register.Index];
1013
1014 /* Always write tess factors into LDS for the TCS epilog. */
1015 if (name == TGSI_SEMANTIC_TESSINNER ||
1016 name == TGSI_SEMANTIC_TESSOUTER) {
1017 skip_lds_store = false;
1018 is_tess_factor = true;
1019 }
1020 }
1021 }
1022
1023 rw_buffers = LLVMGetParam(ctx->main_fn,
1024 ctx->param_rw_buffers);
1025 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1026 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1027
1028 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1029 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1030
1031
1032 TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1033 LLVMValueRef value = dst[chan_index];
1034
1035 if (inst->Instruction.Saturate)
1036 value = ac_build_clamp(&ctx->ac, value);
1037
1038 /* Skip LDS stores if there is no LDS read of this output. */
1039 if (!skip_lds_store)
1040 lds_store(bld_base, chan_index, dw_addr, value);
1041
1042 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1043 values[chan_index] = value;
1044
1045 if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
1046 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1047 buf_addr, base,
1048 4 * chan_index, 1, 0, true, false);
1049 }
1050 }
1051
1052 if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
1053 LLVMValueRef value = lp_build_gather_values(gallivm,
1054 values, 4);
1055 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1056 base, 0, 1, 0, true, false);
1057 }
1058 }
1059
1060 static LLVMValueRef fetch_input_gs(
1061 struct lp_build_tgsi_context *bld_base,
1062 const struct tgsi_full_src_register *reg,
1063 enum tgsi_opcode_type type,
1064 unsigned swizzle)
1065 {
1066 struct si_shader_context *ctx = si_shader_context(bld_base);
1067 struct si_shader *shader = ctx->shader;
1068 struct lp_build_context *uint = &ctx->bld_base.uint_bld;
1069 struct gallivm_state *gallivm = &ctx->gallivm;
1070 LLVMValueRef vtx_offset, soffset;
1071 struct tgsi_shader_info *info = &shader->selector->info;
1072 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1073 unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1074 unsigned param;
1075 LLVMValueRef value;
1076
1077 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1078 return get_primitive_id(bld_base, swizzle);
1079
1080 if (!reg->Register.Dimension)
1081 return NULL;
1082
1083 param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1084
1085 /* GFX9 has the ESGS ring in LDS. */
1086 if (ctx->screen->b.chip_class >= GFX9) {
1087 unsigned index = reg->Dimension.Index;
1088
1089 switch (index / 2) {
1090 case 0:
1091 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
1092 index % 2 ? 16 : 0, 16);
1093 break;
1094 case 1:
1095 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
1096 index % 2 ? 16 : 0, 16);
1097 break;
1098 case 2:
1099 vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
1100 index % 2 ? 16 : 0, 16);
1101 break;
1102 default:
1103 assert(0);
1104 return NULL;
1105 }
1106
1107 vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
1108 LLVMConstInt(ctx->i32, param * 4, 0), "");
1109 return lds_load(bld_base, type, swizzle, vtx_offset);
1110 }
1111
1112 /* GFX6: input load from the ESGS ring in memory. */
1113 if (swizzle == ~0) {
1114 LLVMValueRef values[TGSI_NUM_CHANNELS];
1115 unsigned chan;
1116 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1117 values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1118 }
1119 return lp_build_gather_values(gallivm, values,
1120 TGSI_NUM_CHANNELS);
1121 }
1122
1123 /* Get the vertex offset parameter on GFX6. */
1124 unsigned vtx_offset_param = reg->Dimension.Index;
1125 if (vtx_offset_param < 2) {
1126 vtx_offset_param += ctx->param_gs_vtx0_offset;
1127 } else {
1128 assert(vtx_offset_param < 6);
1129 vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
1130 }
1131 vtx_offset = lp_build_mul_imm(uint,
1132 LLVMGetParam(ctx->main_fn,
1133 vtx_offset_param),
1134 4);
1135
1136 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1137
1138 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1139 vtx_offset, soffset, 0, 1, 0, true);
1140 if (tgsi_type_is_64bit(type)) {
1141 LLVMValueRef value2;
1142 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1143
1144 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1145 ctx->i32_0, vtx_offset, soffset,
1146 0, 1, 0, true);
1147 return si_llvm_emit_fetch_64bit(bld_base, type,
1148 value, value2);
1149 }
1150 return LLVMBuildBitCast(gallivm->builder,
1151 value,
1152 tgsi2llvmtype(bld_base, type), "");
1153 }
1154
1155 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1156 {
1157 switch (interpolate) {
1158 case TGSI_INTERPOLATE_CONSTANT:
1159 return 0;
1160
1161 case TGSI_INTERPOLATE_LINEAR:
1162 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1163 return SI_PARAM_LINEAR_SAMPLE;
1164 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1165 return SI_PARAM_LINEAR_CENTROID;
1166 else
1167 return SI_PARAM_LINEAR_CENTER;
1168 break;
1169 case TGSI_INTERPOLATE_COLOR:
1170 case TGSI_INTERPOLATE_PERSPECTIVE:
1171 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1172 return SI_PARAM_PERSP_SAMPLE;
1173 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1174 return SI_PARAM_PERSP_CENTROID;
1175 else
1176 return SI_PARAM_PERSP_CENTER;
1177 break;
1178 default:
1179 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1180 return -1;
1181 }
1182 }
1183
1184 /**
1185 * Interpolate a fragment shader input.
1186 *
1187 * @param ctx context
1188 * @param input_index index of the input in hardware
1189 * @param semantic_name TGSI_SEMANTIC_*
1190 * @param semantic_index semantic index
1191 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1192 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1193 * @param interp_param interpolation weights (i,j)
1194 * @param prim_mask SI_PARAM_PRIM_MASK
1195 * @param face SI_PARAM_FRONT_FACE
1196 * @param result the return value (4 components)
1197 */
1198 static void interp_fs_input(struct si_shader_context *ctx,
1199 unsigned input_index,
1200 unsigned semantic_name,
1201 unsigned semantic_index,
1202 unsigned num_interp_inputs,
1203 unsigned colors_read_mask,
1204 LLVMValueRef interp_param,
1205 LLVMValueRef prim_mask,
1206 LLVMValueRef face,
1207 LLVMValueRef result[4])
1208 {
1209 struct gallivm_state *gallivm = &ctx->gallivm;
1210 LLVMValueRef attr_number;
1211 LLVMValueRef i, j;
1212
1213 unsigned chan;
1214
1215 /* fs.constant returns the param from the middle vertex, so it's not
1216 * really useful for flat shading. It's meant to be used for custom
1217 * interpolation (but the intrinsic can't fetch from the other two
1218 * vertices).
1219 *
1220 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1221 * to do the right thing. The only reason we use fs.constant is that
1222 * fs.interp cannot be used on integers, because they can be equal
1223 * to NaN.
1224 *
1225 * When interp is false we will use fs.constant or for newer llvm,
1226 * amdgcn.interp.mov.
1227 */
1228 bool interp = interp_param != NULL;
1229
1230 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
1231
1232 if (interp) {
1233 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1234 LLVMVectorType(ctx->f32, 2), "");
1235
1236 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1237 ctx->i32_0, "");
1238 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1239 ctx->i32_1, "");
1240 }
1241
1242 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1243 ctx->shader->key.part.ps.prolog.color_two_side) {
1244 LLVMValueRef is_face_positive;
1245 LLVMValueRef back_attr_number;
1246
1247 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1248 * otherwise it's at offset "num_inputs".
1249 */
1250 unsigned back_attr_offset = num_interp_inputs;
1251 if (semantic_index == 1 && colors_read_mask & 0xf)
1252 back_attr_offset += 1;
1253
1254 back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
1255
1256 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1257 face, ctx->i32_0, "");
1258
1259 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1260 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1261 LLVMValueRef front, back;
1262
1263 if (interp) {
1264 front = ac_build_fs_interp(&ctx->ac, llvm_chan,
1265 attr_number, prim_mask,
1266 i, j);
1267 back = ac_build_fs_interp(&ctx->ac, llvm_chan,
1268 back_attr_number, prim_mask,
1269 i, j);
1270 } else {
1271 front = ac_build_fs_interp_mov(&ctx->ac,
1272 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1273 llvm_chan, attr_number, prim_mask);
1274 back = ac_build_fs_interp_mov(&ctx->ac,
1275 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1276 llvm_chan, back_attr_number, prim_mask);
1277 }
1278
1279 result[chan] = LLVMBuildSelect(gallivm->builder,
1280 is_face_positive,
1281 front,
1282 back,
1283 "");
1284 }
1285 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1286 if (interp) {
1287 result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
1288 attr_number, prim_mask, i, j);
1289 } else {
1290 result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
1291 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1292 attr_number, prim_mask);
1293 }
1294 result[1] =
1295 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1296 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1297 } else {
1298 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1299 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1300
1301 if (interp) {
1302 result[chan] = ac_build_fs_interp(&ctx->ac,
1303 llvm_chan, attr_number, prim_mask, i, j);
1304 } else {
1305 result[chan] = ac_build_fs_interp_mov(&ctx->ac,
1306 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1307 llvm_chan, attr_number, prim_mask);
1308 }
1309 }
1310 }
1311 }
1312
1313 static void declare_input_fs(
1314 struct si_shader_context *ctx,
1315 unsigned input_index,
1316 const struct tgsi_full_declaration *decl,
1317 LLVMValueRef out[4])
1318 {
1319 struct lp_build_context *base = &ctx->bld_base.base;
1320 struct si_shader *shader = ctx->shader;
1321 LLVMValueRef main_fn = ctx->main_fn;
1322 LLVMValueRef interp_param = NULL;
1323 int interp_param_idx;
1324
1325 /* Get colors from input VGPRs (set by the prolog). */
1326 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1327 unsigned i = decl->Semantic.Index;
1328 unsigned colors_read = shader->selector->info.colors_read;
1329 unsigned mask = colors_read >> (i * 4);
1330 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1331 (i ? util_bitcount(colors_read & 0xf) : 0);
1332
1333 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1334 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1335 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1336 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1337 return;
1338 }
1339
1340 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1341 decl->Interp.Location);
1342 if (interp_param_idx == -1)
1343 return;
1344 else if (interp_param_idx) {
1345 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1346 }
1347
1348 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1349 decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1350 ctx->shader->key.part.ps.prolog.flatshade_colors)
1351 interp_param = NULL; /* load the constant color */
1352
1353 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1354 decl->Semantic.Index, shader->selector->info.num_inputs,
1355 shader->selector->info.colors_read, interp_param,
1356 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1357 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1358 &out[0]);
1359 }
1360
1361 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1362 {
1363 return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1364 }
1365
1366
1367 /**
1368 * Load a dword from a constant buffer.
1369 */
1370 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1371 LLVMValueRef resource,
1372 LLVMValueRef offset)
1373 {
1374 LLVMBuilderRef builder = ctx->gallivm.builder;
1375 LLVMValueRef args[2] = {resource, offset};
1376
1377 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1378 LP_FUNC_ATTR_READNONE |
1379 LP_FUNC_ATTR_LEGACY);
1380 }
1381
1382 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1383 {
1384 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1385 struct gallivm_state *gallivm = &ctx->gallivm;
1386 LLVMBuilderRef builder = gallivm->builder;
1387 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1388 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1389 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1390
1391 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1392 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1393 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1394
1395 LLVMValueRef pos[4] = {
1396 buffer_load_const(ctx, resource, offset0),
1397 buffer_load_const(ctx, resource, offset1),
1398 LLVMConstReal(ctx->f32, 0),
1399 LLVMConstReal(ctx->f32, 0)
1400 };
1401
1402 return lp_build_gather_values(gallivm, pos, 4);
1403 }
1404
1405 static void declare_system_value(struct si_shader_context *ctx,
1406 unsigned index,
1407 const struct tgsi_full_declaration *decl)
1408 {
1409 struct lp_build_context *bld = &ctx->bld_base.base;
1410 struct gallivm_state *gallivm = &ctx->gallivm;
1411 LLVMValueRef value = 0;
1412
1413 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1414
1415 switch (decl->Semantic.Name) {
1416 case TGSI_SEMANTIC_INSTANCEID:
1417 value = LLVMGetParam(ctx->main_fn,
1418 ctx->param_instance_id);
1419 break;
1420
1421 case TGSI_SEMANTIC_VERTEXID:
1422 value = LLVMBuildAdd(gallivm->builder,
1423 LLVMGetParam(ctx->main_fn,
1424 ctx->param_vertex_id),
1425 LLVMGetParam(ctx->main_fn,
1426 ctx->param_base_vertex), "");
1427 break;
1428
1429 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1430 /* Unused. Clarify the meaning in indexed vs. non-indexed
1431 * draws if this is ever used again. */
1432 assert(false);
1433 break;
1434
1435 case TGSI_SEMANTIC_BASEVERTEX:
1436 {
1437 /* For non-indexed draws, the base vertex set by the driver
1438 * (for direct draws) or the CP (for indirect draws) is the
1439 * first vertex ID, but GLSL expects 0 to be returned.
1440 */
1441 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1442 LLVMValueRef indexed;
1443
1444 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1445 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1446
1447 value = LLVMBuildSelect(gallivm->builder, indexed,
1448 LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1449 ctx->i32_0, "");
1450 break;
1451 }
1452
1453 case TGSI_SEMANTIC_BASEINSTANCE:
1454 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1455 break;
1456
1457 case TGSI_SEMANTIC_DRAWID:
1458 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1459 break;
1460
1461 case TGSI_SEMANTIC_INVOCATIONID:
1462 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1463 value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1464 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1465 value = LLVMGetParam(ctx->main_fn,
1466 ctx->param_gs_instance_id);
1467 else
1468 assert(!"INVOCATIONID not implemented");
1469 break;
1470
1471 case TGSI_SEMANTIC_POSITION:
1472 {
1473 LLVMValueRef pos[4] = {
1474 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1475 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1476 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1477 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1478 LLVMGetParam(ctx->main_fn,
1479 SI_PARAM_POS_W_FLOAT)),
1480 };
1481 value = lp_build_gather_values(gallivm, pos, 4);
1482 break;
1483 }
1484
1485 case TGSI_SEMANTIC_FACE:
1486 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1487 break;
1488
1489 case TGSI_SEMANTIC_SAMPLEID:
1490 value = get_sample_id(ctx);
1491 break;
1492
1493 case TGSI_SEMANTIC_SAMPLEPOS: {
1494 LLVMValueRef pos[4] = {
1495 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1496 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1497 LLVMConstReal(ctx->f32, 0),
1498 LLVMConstReal(ctx->f32, 0)
1499 };
1500 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1501 TGSI_OPCODE_FRC, pos[0]);
1502 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1503 TGSI_OPCODE_FRC, pos[1]);
1504 value = lp_build_gather_values(gallivm, pos, 4);
1505 break;
1506 }
1507
1508 case TGSI_SEMANTIC_SAMPLEMASK:
1509 /* This can only occur with the OpenGL Core profile, which
1510 * doesn't support smoothing.
1511 */
1512 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1513 break;
1514
1515 case TGSI_SEMANTIC_TESSCOORD:
1516 {
1517 LLVMValueRef coord[4] = {
1518 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1519 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1520 bld->zero,
1521 bld->zero
1522 };
1523
1524 /* For triangles, the vector should be (u, v, 1-u-v). */
1525 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1526 PIPE_PRIM_TRIANGLES)
1527 coord[2] = lp_build_sub(bld, bld->one,
1528 lp_build_add(bld, coord[0], coord[1]));
1529
1530 value = lp_build_gather_values(gallivm, coord, 4);
1531 break;
1532 }
1533
1534 case TGSI_SEMANTIC_VERTICESIN:
1535 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1536 value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1537 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1538 value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 7);
1539 else
1540 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1541 break;
1542
1543 case TGSI_SEMANTIC_TESSINNER:
1544 case TGSI_SEMANTIC_TESSOUTER:
1545 {
1546 LLVMValueRef rw_buffers, buffer, base, addr;
1547 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1548
1549 rw_buffers = LLVMGetParam(ctx->main_fn,
1550 ctx->param_rw_buffers);
1551 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1552 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1553
1554 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1555 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1556 LLVMConstInt(ctx->i32, param, 0));
1557
1558 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1559 ~0, buffer, base, addr, true);
1560
1561 break;
1562 }
1563
1564 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1565 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1566 {
1567 LLVMValueRef buf, slot, val[4];
1568 int i, offset;
1569
1570 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1571 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1572 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1573 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1574
1575 for (i = 0; i < 4; i++)
1576 val[i] = buffer_load_const(ctx, buf,
1577 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1578 value = lp_build_gather_values(gallivm, val, 4);
1579 break;
1580 }
1581
1582 case TGSI_SEMANTIC_PRIMID:
1583 value = get_primitive_id(&ctx->bld_base, 0);
1584 break;
1585
1586 case TGSI_SEMANTIC_GRID_SIZE:
1587 value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
1588 break;
1589
1590 case TGSI_SEMANTIC_BLOCK_SIZE:
1591 {
1592 LLVMValueRef values[3];
1593 unsigned i;
1594 unsigned *properties = ctx->shader->selector->info.properties;
1595
1596 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1597 unsigned sizes[3] = {
1598 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1599 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1600 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1601 };
1602
1603 for (i = 0; i < 3; ++i)
1604 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1605
1606 value = lp_build_gather_values(gallivm, values, 3);
1607 } else {
1608 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
1609 }
1610 break;
1611 }
1612
1613 case TGSI_SEMANTIC_BLOCK_ID:
1614 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
1615 break;
1616
1617 case TGSI_SEMANTIC_THREAD_ID:
1618 value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
1619 break;
1620
1621 case TGSI_SEMANTIC_HELPER_INVOCATION:
1622 if (HAVE_LLVM >= 0x0309) {
1623 value = lp_build_intrinsic(gallivm->builder,
1624 "llvm.amdgcn.ps.live",
1625 ctx->i1, NULL, 0,
1626 LP_FUNC_ATTR_READNONE);
1627 value = LLVMBuildNot(gallivm->builder, value, "");
1628 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1629 } else {
1630 assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
1631 return;
1632 }
1633 break;
1634
1635 case TGSI_SEMANTIC_SUBGROUP_SIZE:
1636 value = LLVMConstInt(ctx->i32, 64, 0);
1637 break;
1638
1639 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1640 value = ac_get_thread_id(&ctx->ac);
1641 break;
1642
1643 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1644 {
1645 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1646 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1647 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1648 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1649 break;
1650 }
1651
1652 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1653 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1654 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1655 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1656 {
1657 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1658 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1659 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1660 /* All bits set except LSB */
1661 value = LLVMConstInt(ctx->i64, -2, 0);
1662 } else {
1663 /* All bits set */
1664 value = LLVMConstInt(ctx->i64, -1, 0);
1665 }
1666 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1667 value = LLVMBuildShl(gallivm->builder, value, id, "");
1668 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1669 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1670 value = LLVMBuildNot(gallivm->builder, value, "");
1671 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1672 break;
1673 }
1674
1675 default:
1676 assert(!"unknown system value");
1677 return;
1678 }
1679
1680 ctx->system_values[index] = value;
1681 }
1682
1683 static void declare_compute_memory(struct si_shader_context *ctx,
1684 const struct tgsi_full_declaration *decl)
1685 {
1686 struct si_shader_selector *sel = ctx->shader->selector;
1687 struct gallivm_state *gallivm = &ctx->gallivm;
1688
1689 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1690 LLVMValueRef var;
1691
1692 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1693 assert(decl->Range.First == decl->Range.Last);
1694 assert(!ctx->shared_memory);
1695
1696 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1697 LLVMArrayType(ctx->i8, sel->local_size),
1698 "compute_lds",
1699 LOCAL_ADDR_SPACE);
1700 LLVMSetAlignment(var, 4);
1701
1702 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1703 }
1704
1705 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1706 {
1707 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1708 ctx->param_const_buffers);
1709
1710 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1711 LLVMConstInt(ctx->i32, i, 0));
1712 }
1713
1714 static LLVMValueRef fetch_constant(
1715 struct lp_build_tgsi_context *bld_base,
1716 const struct tgsi_full_src_register *reg,
1717 enum tgsi_opcode_type type,
1718 unsigned swizzle)
1719 {
1720 struct si_shader_context *ctx = si_shader_context(bld_base);
1721 struct lp_build_context *base = &bld_base->base;
1722 const struct tgsi_ind_register *ireg = &reg->Indirect;
1723 unsigned buf, idx;
1724
1725 LLVMValueRef addr, bufp;
1726 LLVMValueRef result;
1727
1728 if (swizzle == LP_CHAN_ALL) {
1729 unsigned chan;
1730 LLVMValueRef values[4];
1731 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1732 values[chan] = fetch_constant(bld_base, reg, type, chan);
1733
1734 return lp_build_gather_values(&ctx->gallivm, values, 4);
1735 }
1736
1737 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1738 idx = reg->Register.Index * 4 + swizzle;
1739
1740 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1741 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
1742 LLVMValueRef index;
1743 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1744 reg->Dimension.Index,
1745 SI_NUM_CONST_BUFFERS);
1746 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1747 } else
1748 bufp = load_const_buffer_desc(ctx, buf);
1749
1750 if (reg->Register.Indirect) {
1751 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1752 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1753 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1754 addr = lp_build_add(&bld_base->uint_bld, addr,
1755 LLVMConstInt(ctx->i32, idx * 4, 0));
1756 } else {
1757 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1758 }
1759
1760 result = buffer_load_const(ctx, bufp, addr);
1761
1762 if (!tgsi_type_is_64bit(type))
1763 result = bitcast(bld_base, type, result);
1764 else {
1765 LLVMValueRef addr2, result2;
1766
1767 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1768 LLVMConstInt(ctx->i32, 4, 0));
1769 result2 = buffer_load_const(ctx, bufp, addr2);
1770
1771 result = si_llvm_emit_fetch_64bit(bld_base, type,
1772 result, result2);
1773 }
1774 return result;
1775 }
1776
1777 /* Upper 16 bits must be zero. */
1778 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1779 LLVMValueRef val[2])
1780 {
1781 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1782 LLVMBuildShl(ctx->gallivm.builder, val[1],
1783 LLVMConstInt(ctx->i32, 16, 0),
1784 ""), "");
1785 }
1786
1787 /* Upper 16 bits are ignored and will be dropped. */
1788 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1789 LLVMValueRef val[2])
1790 {
1791 LLVMValueRef v[2] = {
1792 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1793 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1794 val[1],
1795 };
1796 return si_llvm_pack_two_int16(ctx, v);
1797 }
1798
1799 /* Initialize arguments for the shader export intrinsic */
1800 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1801 LLVMValueRef *values,
1802 unsigned target,
1803 struct ac_export_args *args)
1804 {
1805 struct si_shader_context *ctx = si_shader_context(bld_base);
1806 struct lp_build_context *base = &bld_base->base;
1807 LLVMBuilderRef builder = ctx->gallivm.builder;
1808 LLVMValueRef val[4];
1809 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1810 unsigned chan;
1811 bool is_int8, is_int10;
1812
1813 /* Default is 0xf. Adjusted below depending on the format. */
1814 args->enabled_channels = 0xf; /* writemask */
1815
1816 /* Specify whether the EXEC mask represents the valid mask */
1817 args->valid_mask = 0;
1818
1819 /* Specify whether this is the last export */
1820 args->done = 0;
1821
1822 /* Specify the target we are exporting */
1823 args->target = target;
1824
1825 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1826 const struct si_shader_key *key = &ctx->shader->key;
1827 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1828 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1829
1830 assert(cbuf >= 0 && cbuf < 8);
1831 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1832 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1833 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1834 }
1835
1836 args->compr = false;
1837 args->out[0] = base->undef;
1838 args->out[1] = base->undef;
1839 args->out[2] = base->undef;
1840 args->out[3] = base->undef;
1841
1842 switch (spi_shader_col_format) {
1843 case V_028714_SPI_SHADER_ZERO:
1844 args->enabled_channels = 0; /* writemask */
1845 args->target = V_008DFC_SQ_EXP_NULL;
1846 break;
1847
1848 case V_028714_SPI_SHADER_32_R:
1849 args->enabled_channels = 1; /* writemask */
1850 args->out[0] = values[0];
1851 break;
1852
1853 case V_028714_SPI_SHADER_32_GR:
1854 args->enabled_channels = 0x3; /* writemask */
1855 args->out[0] = values[0];
1856 args->out[1] = values[1];
1857 break;
1858
1859 case V_028714_SPI_SHADER_32_AR:
1860 args->enabled_channels = 0x9; /* writemask */
1861 args->out[0] = values[0];
1862 args->out[3] = values[3];
1863 break;
1864
1865 case V_028714_SPI_SHADER_FP16_ABGR:
1866 args->compr = 1; /* COMPR flag */
1867
1868 for (chan = 0; chan < 2; chan++) {
1869 LLVMValueRef pack_args[2] = {
1870 values[2 * chan],
1871 values[2 * chan + 1]
1872 };
1873 LLVMValueRef packed;
1874
1875 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1876 args->out[chan] =
1877 LLVMBuildBitCast(ctx->gallivm.builder,
1878 packed, ctx->f32, "");
1879 }
1880 break;
1881
1882 case V_028714_SPI_SHADER_UNORM16_ABGR:
1883 for (chan = 0; chan < 4; chan++) {
1884 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1885 val[chan] = LLVMBuildFMul(builder, val[chan],
1886 LLVMConstReal(ctx->f32, 65535), "");
1887 val[chan] = LLVMBuildFAdd(builder, val[chan],
1888 LLVMConstReal(ctx->f32, 0.5), "");
1889 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1890 ctx->i32, "");
1891 }
1892
1893 args->compr = 1; /* COMPR flag */
1894 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1895 si_llvm_pack_two_int16(ctx, val));
1896 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1897 si_llvm_pack_two_int16(ctx, val+2));
1898 break;
1899
1900 case V_028714_SPI_SHADER_SNORM16_ABGR:
1901 for (chan = 0; chan < 4; chan++) {
1902 /* Clamp between [-1, 1]. */
1903 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1904 values[chan],
1905 LLVMConstReal(ctx->f32, 1));
1906 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1907 val[chan],
1908 LLVMConstReal(ctx->f32, -1));
1909 /* Convert to a signed integer in [-32767, 32767]. */
1910 val[chan] = LLVMBuildFMul(builder, val[chan],
1911 LLVMConstReal(ctx->f32, 32767), "");
1912 /* If positive, add 0.5, else add -0.5. */
1913 val[chan] = LLVMBuildFAdd(builder, val[chan],
1914 LLVMBuildSelect(builder,
1915 LLVMBuildFCmp(builder, LLVMRealOGE,
1916 val[chan], base->zero, ""),
1917 LLVMConstReal(ctx->f32, 0.5),
1918 LLVMConstReal(ctx->f32, -0.5), ""), "");
1919 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1920 }
1921
1922 args->compr = 1; /* COMPR flag */
1923 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1924 si_llvm_pack_two_int32_as_int16(ctx, val));
1925 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1926 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1927 break;
1928
1929 case V_028714_SPI_SHADER_UINT16_ABGR: {
1930 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1931 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1932 LLVMValueRef max_alpha =
1933 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1934
1935 /* Clamp. */
1936 for (chan = 0; chan < 4; chan++) {
1937 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1938 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1939 val[chan],
1940 chan == 3 ? max_alpha : max_rgb);
1941 }
1942
1943 args->compr = 1; /* COMPR flag */
1944 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1945 si_llvm_pack_two_int16(ctx, val));
1946 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1947 si_llvm_pack_two_int16(ctx, val+2));
1948 break;
1949 }
1950
1951 case V_028714_SPI_SHADER_SINT16_ABGR: {
1952 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1953 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1954 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1955 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1956 LLVMValueRef max_alpha =
1957 !is_int10 ? max_rgb : ctx->i32_1;
1958 LLVMValueRef min_alpha =
1959 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1960
1961 /* Clamp. */
1962 for (chan = 0; chan < 4; chan++) {
1963 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1964 val[chan] = lp_build_emit_llvm_binary(bld_base,
1965 TGSI_OPCODE_IMIN,
1966 val[chan], chan == 3 ? max_alpha : max_rgb);
1967 val[chan] = lp_build_emit_llvm_binary(bld_base,
1968 TGSI_OPCODE_IMAX,
1969 val[chan], chan == 3 ? min_alpha : min_rgb);
1970 }
1971
1972 args->compr = 1; /* COMPR flag */
1973 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1974 si_llvm_pack_two_int32_as_int16(ctx, val));
1975 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1976 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1977 break;
1978 }
1979
1980 case V_028714_SPI_SHADER_32_ABGR:
1981 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1982 break;
1983 }
1984 }
1985
1986 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1987 LLVMValueRef alpha)
1988 {
1989 struct si_shader_context *ctx = si_shader_context(bld_base);
1990
1991 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1992 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
1993 SI_PARAM_ALPHA_REF);
1994
1995 LLVMValueRef alpha_pass =
1996 lp_build_cmp(&bld_base->base,
1997 ctx->shader->key.part.ps.epilog.alpha_func,
1998 alpha, alpha_ref);
1999 LLVMValueRef arg =
2000 lp_build_select(&bld_base->base,
2001 alpha_pass,
2002 LLVMConstReal(ctx->f32, 1.0f),
2003 LLVMConstReal(ctx->f32, -1.0f));
2004
2005 ac_build_kill(&ctx->ac, arg);
2006 } else {
2007 ac_build_kill(&ctx->ac, NULL);
2008 }
2009 }
2010
2011 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2012 LLVMValueRef alpha,
2013 unsigned samplemask_param)
2014 {
2015 struct si_shader_context *ctx = si_shader_context(bld_base);
2016 struct gallivm_state *gallivm = &ctx->gallivm;
2017 LLVMValueRef coverage;
2018
2019 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2020 coverage = LLVMGetParam(ctx->main_fn,
2021 samplemask_param);
2022 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2023
2024 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2025 ctx->i32,
2026 &coverage, 1, LP_FUNC_ATTR_READNONE);
2027
2028 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2029 ctx->f32, "");
2030
2031 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2032 LLVMConstReal(ctx->f32,
2033 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2034
2035 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2036 }
2037
2038 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2039 struct ac_export_args *pos, LLVMValueRef *out_elts)
2040 {
2041 struct si_shader_context *ctx = si_shader_context(bld_base);
2042 struct lp_build_context *base = &bld_base->base;
2043 unsigned reg_index;
2044 unsigned chan;
2045 unsigned const_chan;
2046 LLVMValueRef base_elt;
2047 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2048 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2049 SI_VS_CONST_CLIP_PLANES, 0);
2050 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2051
2052 for (reg_index = 0; reg_index < 2; reg_index ++) {
2053 struct ac_export_args *args = &pos[2 + reg_index];
2054
2055 args->out[0] =
2056 args->out[1] =
2057 args->out[2] =
2058 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2059
2060 /* Compute dot products of position and user clip plane vectors */
2061 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2062 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2063 LLVMValueRef addr =
2064 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2065 const_chan) * 4, 0);
2066 base_elt = buffer_load_const(ctx, const_resource,
2067 addr);
2068 args->out[chan] =
2069 lp_build_add(base, args->out[chan],
2070 lp_build_mul(base, base_elt,
2071 out_elts[const_chan]));
2072 }
2073 }
2074
2075 args->enabled_channels = 0xf;
2076 args->valid_mask = 0;
2077 args->done = 0;
2078 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2079 args->compr = 0;
2080 }
2081 }
2082
2083 static void si_dump_streamout(struct pipe_stream_output_info *so)
2084 {
2085 unsigned i;
2086
2087 if (so->num_outputs)
2088 fprintf(stderr, "STREAMOUT\n");
2089
2090 for (i = 0; i < so->num_outputs; i++) {
2091 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2092 so->output[i].start_component;
2093 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2094 i, so->output[i].output_buffer,
2095 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2096 so->output[i].register_index,
2097 mask & 1 ? "x" : "",
2098 mask & 2 ? "y" : "",
2099 mask & 4 ? "z" : "",
2100 mask & 8 ? "w" : "");
2101 }
2102 }
2103
2104 static void emit_streamout_output(struct si_shader_context *ctx,
2105 LLVMValueRef const *so_buffers,
2106 LLVMValueRef const *so_write_offsets,
2107 struct pipe_stream_output *stream_out,
2108 struct si_shader_output_values *shader_out)
2109 {
2110 struct gallivm_state *gallivm = &ctx->gallivm;
2111 LLVMBuilderRef builder = gallivm->builder;
2112 unsigned buf_idx = stream_out->output_buffer;
2113 unsigned start = stream_out->start_component;
2114 unsigned num_comps = stream_out->num_components;
2115 LLVMValueRef out[4];
2116
2117 assert(num_comps && num_comps <= 4);
2118 if (!num_comps || num_comps > 4)
2119 return;
2120
2121 /* Load the output as int. */
2122 for (int j = 0; j < num_comps; j++) {
2123 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2124
2125 out[j] = LLVMBuildBitCast(builder,
2126 shader_out->values[start + j],
2127 ctx->i32, "");
2128 }
2129
2130 /* Pack the output. */
2131 LLVMValueRef vdata = NULL;
2132
2133 switch (num_comps) {
2134 case 1: /* as i32 */
2135 vdata = out[0];
2136 break;
2137 case 2: /* as v2i32 */
2138 case 3: /* as v4i32 (aligned to 4) */
2139 case 4: /* as v4i32 */
2140 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2141 for (int j = 0; j < num_comps; j++) {
2142 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2143 LLVMConstInt(ctx->i32, j, 0), "");
2144 }
2145 break;
2146 }
2147
2148 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2149 vdata, num_comps,
2150 so_write_offsets[buf_idx],
2151 ctx->i32_0,
2152 stream_out->dst_offset * 4, 1, 1, true, false);
2153 }
2154
2155 /**
2156 * Write streamout data to buffers for vertex stream @p stream (different
2157 * vertex streams can occur for GS copy shaders).
2158 */
2159 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2160 struct si_shader_output_values *outputs,
2161 unsigned noutput, unsigned stream)
2162 {
2163 struct si_shader_selector *sel = ctx->shader->selector;
2164 struct pipe_stream_output_info *so = &sel->so;
2165 struct gallivm_state *gallivm = &ctx->gallivm;
2166 LLVMBuilderRef builder = gallivm->builder;
2167 int i;
2168 struct lp_build_if_state if_ctx;
2169
2170 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2171 LLVMValueRef so_vtx_count =
2172 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2173
2174 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2175
2176 /* can_emit = tid < so_vtx_count; */
2177 LLVMValueRef can_emit =
2178 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2179
2180 /* Emit the streamout code conditionally. This actually avoids
2181 * out-of-bounds buffer access. The hw tells us via the SGPR
2182 * (so_vtx_count) which threads are allowed to emit streamout data. */
2183 lp_build_if(&if_ctx, gallivm, can_emit);
2184 {
2185 /* The buffer offset is computed as follows:
2186 * ByteOffset = streamout_offset[buffer_id]*4 +
2187 * (streamout_write_index + thread_id)*stride[buffer_id] +
2188 * attrib_offset
2189 */
2190
2191 LLVMValueRef so_write_index =
2192 LLVMGetParam(ctx->main_fn,
2193 ctx->param_streamout_write_index);
2194
2195 /* Compute (streamout_write_index + thread_id). */
2196 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2197
2198 /* Load the descriptor and compute the write offset for each
2199 * enabled buffer. */
2200 LLVMValueRef so_write_offset[4] = {};
2201 LLVMValueRef so_buffers[4];
2202 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2203 ctx->param_rw_buffers);
2204
2205 for (i = 0; i < 4; i++) {
2206 if (!so->stride[i])
2207 continue;
2208
2209 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2210 SI_VS_STREAMOUT_BUF0 + i, 0);
2211
2212 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2213
2214 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2215 ctx->param_streamout_offset[i]);
2216 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2217
2218 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2219 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2220 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2221 }
2222
2223 /* Write streamout data. */
2224 for (i = 0; i < so->num_outputs; i++) {
2225 unsigned reg = so->output[i].register_index;
2226
2227 if (reg >= noutput)
2228 continue;
2229
2230 if (stream != so->output[i].stream)
2231 continue;
2232
2233 emit_streamout_output(ctx, so_buffers, so_write_offset,
2234 &so->output[i], &outputs[reg]);
2235 }
2236 }
2237 lp_build_endif(&if_ctx);
2238 }
2239
2240
2241 /* Generate export instructions for hardware VS shader stage */
2242 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2243 struct si_shader_output_values *outputs,
2244 unsigned noutput)
2245 {
2246 struct si_shader_context *ctx = si_shader_context(bld_base);
2247 struct si_shader *shader = ctx->shader;
2248 struct lp_build_context *base = &bld_base->base;
2249 struct ac_export_args args, pos_args[4] = {};
2250 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2251 unsigned semantic_name, semantic_index;
2252 unsigned target;
2253 unsigned param_count = 0;
2254 unsigned pos_idx;
2255 int i;
2256
2257 for (i = 0; i < noutput; i++) {
2258 semantic_name = outputs[i].semantic_name;
2259 semantic_index = outputs[i].semantic_index;
2260 bool export_param = true;
2261
2262 switch (semantic_name) {
2263 case TGSI_SEMANTIC_POSITION: /* ignore these */
2264 case TGSI_SEMANTIC_PSIZE:
2265 case TGSI_SEMANTIC_CLIPVERTEX:
2266 case TGSI_SEMANTIC_EDGEFLAG:
2267 break;
2268 case TGSI_SEMANTIC_GENERIC:
2269 case TGSI_SEMANTIC_CLIPDIST:
2270 if (shader->key.opt.hw_vs.kill_outputs &
2271 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2272 export_param = false;
2273 break;
2274 default:
2275 if (shader->key.opt.hw_vs.kill_outputs2 &
2276 (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2277 export_param = false;
2278 break;
2279 }
2280
2281 if (outputs[i].vertex_stream[0] != 0 &&
2282 outputs[i].vertex_stream[1] != 0 &&
2283 outputs[i].vertex_stream[2] != 0 &&
2284 outputs[i].vertex_stream[3] != 0)
2285 export_param = false;
2286
2287 handle_semantic:
2288 /* Select the correct target */
2289 switch(semantic_name) {
2290 case TGSI_SEMANTIC_PSIZE:
2291 psize_value = outputs[i].values[0];
2292 continue;
2293 case TGSI_SEMANTIC_EDGEFLAG:
2294 edgeflag_value = outputs[i].values[0];
2295 continue;
2296 case TGSI_SEMANTIC_LAYER:
2297 layer_value = outputs[i].values[0];
2298 semantic_name = TGSI_SEMANTIC_GENERIC;
2299 goto handle_semantic;
2300 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2301 viewport_index_value = outputs[i].values[0];
2302 semantic_name = TGSI_SEMANTIC_GENERIC;
2303 goto handle_semantic;
2304 case TGSI_SEMANTIC_POSITION:
2305 target = V_008DFC_SQ_EXP_POS;
2306 break;
2307 case TGSI_SEMANTIC_CLIPDIST:
2308 if (shader->key.opt.hw_vs.clip_disable) {
2309 semantic_name = TGSI_SEMANTIC_GENERIC;
2310 goto handle_semantic;
2311 }
2312 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2313 break;
2314 case TGSI_SEMANTIC_CLIPVERTEX:
2315 if (shader->key.opt.hw_vs.clip_disable)
2316 continue;
2317 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2318 continue;
2319 case TGSI_SEMANTIC_COLOR:
2320 case TGSI_SEMANTIC_BCOLOR:
2321 case TGSI_SEMANTIC_PRIMID:
2322 case TGSI_SEMANTIC_FOG:
2323 case TGSI_SEMANTIC_TEXCOORD:
2324 case TGSI_SEMANTIC_GENERIC:
2325 if (!export_param)
2326 continue;
2327 target = V_008DFC_SQ_EXP_PARAM + param_count;
2328 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2329 shader->info.vs_output_param_offset[i] = param_count;
2330 param_count++;
2331 break;
2332 default:
2333 target = 0;
2334 fprintf(stderr,
2335 "Warning: SI unhandled vs output type:%d\n",
2336 semantic_name);
2337 }
2338
2339 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2340
2341 if (target >= V_008DFC_SQ_EXP_POS &&
2342 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2343 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2344 &args, sizeof(args));
2345 } else {
2346 ac_build_export(&ctx->ac, &args);
2347 }
2348
2349 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2350 semantic_name = TGSI_SEMANTIC_GENERIC;
2351 goto handle_semantic;
2352 }
2353 }
2354
2355 shader->info.nr_param_exports = param_count;
2356
2357 /* We need to add the position output manually if it's missing. */
2358 if (!pos_args[0].out[0]) {
2359 pos_args[0].enabled_channels = 0xf; /* writemask */
2360 pos_args[0].valid_mask = 0; /* EXEC mask */
2361 pos_args[0].done = 0; /* last export? */
2362 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2363 pos_args[0].compr = 0; /* COMPR flag */
2364 pos_args[0].out[0] = base->zero; /* X */
2365 pos_args[0].out[1] = base->zero; /* Y */
2366 pos_args[0].out[2] = base->zero; /* Z */
2367 pos_args[0].out[3] = base->one; /* W */
2368 }
2369
2370 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2371 if (shader->selector->info.writes_psize ||
2372 shader->selector->info.writes_edgeflag ||
2373 shader->selector->info.writes_viewport_index ||
2374 shader->selector->info.writes_layer) {
2375 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2376 (shader->selector->info.writes_edgeflag << 1) |
2377 (shader->selector->info.writes_layer << 2) |
2378 (shader->selector->info.writes_viewport_index << 3);
2379 pos_args[1].valid_mask = 0; /* EXEC mask */
2380 pos_args[1].done = 0; /* last export? */
2381 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2382 pos_args[1].compr = 0; /* COMPR flag */
2383 pos_args[1].out[0] = base->zero; /* X */
2384 pos_args[1].out[1] = base->zero; /* Y */
2385 pos_args[1].out[2] = base->zero; /* Z */
2386 pos_args[1].out[3] = base->zero; /* W */
2387
2388 if (shader->selector->info.writes_psize)
2389 pos_args[1].out[0] = psize_value;
2390
2391 if (shader->selector->info.writes_edgeflag) {
2392 /* The output is a float, but the hw expects an integer
2393 * with the first bit containing the edge flag. */
2394 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2395 edgeflag_value,
2396 ctx->i32, "");
2397 edgeflag_value = lp_build_min(&bld_base->int_bld,
2398 edgeflag_value,
2399 ctx->i32_1);
2400
2401 /* The LLVM intrinsic expects a float. */
2402 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2403 edgeflag_value,
2404 ctx->f32, "");
2405 }
2406
2407 if (shader->selector->info.writes_layer)
2408 pos_args[1].out[2] = layer_value;
2409
2410 if (shader->selector->info.writes_viewport_index)
2411 pos_args[1].out[3] = viewport_index_value;
2412 }
2413
2414 for (i = 0; i < 4; i++)
2415 if (pos_args[i].out[0])
2416 shader->info.nr_pos_exports++;
2417
2418 pos_idx = 0;
2419 for (i = 0; i < 4; i++) {
2420 if (!pos_args[i].out[0])
2421 continue;
2422
2423 /* Specify the target we are exporting */
2424 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2425
2426 if (pos_idx == shader->info.nr_pos_exports)
2427 /* Specify that this is the last export */
2428 pos_args[i].done = 1;
2429
2430 ac_build_export(&ctx->ac, &pos_args[i]);
2431 }
2432 }
2433
2434 /**
2435 * Forward all outputs from the vertex shader to the TES. This is only used
2436 * for the fixed function TCS.
2437 */
2438 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2439 {
2440 struct si_shader_context *ctx = si_shader_context(bld_base);
2441 struct gallivm_state *gallivm = &ctx->gallivm;
2442 LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2443 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2444 uint64_t inputs;
2445
2446 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2447
2448 rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2449 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2450 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2451
2452 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2453
2454 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2455 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2456 lds_vertex_stride, "");
2457 lds_base = get_tcs_in_current_patch_offset(ctx);
2458 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2459
2460 inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2461 while (inputs) {
2462 unsigned i = u_bit_scan64(&inputs);
2463
2464 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2465 LLVMConstInt(ctx->i32, 4 * i, 0),
2466 "");
2467
2468 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2469 get_rel_patch_id(ctx),
2470 invocation_id,
2471 LLVMConstInt(ctx->i32, i, 0));
2472
2473 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2474 lds_ptr);
2475
2476 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2477 buffer_offset, 0, 1, 0, true, false);
2478 }
2479 }
2480
2481 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2482 LLVMValueRef rel_patch_id,
2483 LLVMValueRef invocation_id,
2484 LLVMValueRef tcs_out_current_patch_data_offset)
2485 {
2486 struct si_shader_context *ctx = si_shader_context(bld_base);
2487 struct gallivm_state *gallivm = &ctx->gallivm;
2488 struct si_shader *shader = ctx->shader;
2489 unsigned tess_inner_index, tess_outer_index;
2490 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2491 LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
2492 unsigned stride, outer_comps, inner_comps, i, offset;
2493 struct lp_build_if_state if_ctx, inner_if_ctx;
2494
2495 si_llvm_emit_barrier(NULL, bld_base, NULL);
2496
2497 /* Do this only for invocation 0, because the tess levels are per-patch,
2498 * not per-vertex.
2499 *
2500 * This can't jump, because invocation 0 executes this. It should
2501 * at least mask out the loads and stores for other invocations.
2502 */
2503 lp_build_if(&if_ctx, gallivm,
2504 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2505 invocation_id, ctx->i32_0, ""));
2506
2507 /* Determine the layout of one tess factor element in the buffer. */
2508 switch (shader->key.part.tcs.epilog.prim_mode) {
2509 case PIPE_PRIM_LINES:
2510 stride = 2; /* 2 dwords, 1 vec2 store */
2511 outer_comps = 2;
2512 inner_comps = 0;
2513 break;
2514 case PIPE_PRIM_TRIANGLES:
2515 stride = 4; /* 4 dwords, 1 vec4 store */
2516 outer_comps = 3;
2517 inner_comps = 1;
2518 break;
2519 case PIPE_PRIM_QUADS:
2520 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2521 outer_comps = 4;
2522 inner_comps = 2;
2523 break;
2524 default:
2525 assert(0);
2526 return;
2527 }
2528
2529 /* Load tess_inner and tess_outer from LDS.
2530 * Any invocation can write them, so we can't get them from a temporary.
2531 */
2532 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2533 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2534
2535 lds_base = tcs_out_current_patch_data_offset;
2536 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2537 LLVMConstInt(ctx->i32,
2538 tess_inner_index * 4, 0), "");
2539 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2540 LLVMConstInt(ctx->i32,
2541 tess_outer_index * 4, 0), "");
2542
2543 for (i = 0; i < 4; i++) {
2544 inner[i] = LLVMGetUndef(ctx->i32);
2545 outer[i] = LLVMGetUndef(ctx->i32);
2546 }
2547
2548 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2549 /* For isolines, the hardware expects tess factors in the
2550 * reverse order from what GLSL / TGSI specify.
2551 */
2552 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2553 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2554 } else {
2555 for (i = 0; i < outer_comps; i++) {
2556 outer[i] = out[i] =
2557 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2558 }
2559 for (i = 0; i < inner_comps; i++) {
2560 inner[i] = out[outer_comps+i] =
2561 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2562 }
2563 }
2564
2565 /* Convert the outputs to vectors for stores. */
2566 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2567 vec1 = NULL;
2568
2569 if (stride > 4)
2570 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2571
2572 /* Get the buffer. */
2573 rw_buffers = LLVMGetParam(ctx->main_fn,
2574 ctx->param_rw_buffers);
2575 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2576 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));
2577
2578 /* Get the offset. */
2579 tf_base = LLVMGetParam(ctx->main_fn,
2580 ctx->param_tcs_factor_offset);
2581 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2582 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2583
2584 lp_build_if(&inner_if_ctx, gallivm,
2585 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2586 rel_patch_id, ctx->i32_0, ""));
2587
2588 /* Store the dynamic HS control word. */
2589 offset = 0;
2590 if (ctx->screen->b.chip_class <= VI) {
2591 ac_build_buffer_store_dword(&ctx->ac, buffer,
2592 LLVMConstInt(ctx->i32, 0x80000000, 0),
2593 1, ctx->i32_0, tf_base,
2594 offset, 1, 0, true, false);
2595 offset += 4;
2596 }
2597
2598 lp_build_endif(&inner_if_ctx);
2599
2600 /* Store the tessellation factors. */
2601 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2602 MIN2(stride, 4), byteoffset, tf_base,
2603 offset, 1, 0, true, false);
2604 offset += 16;
2605 if (vec1)
2606 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2607 stride - 4, byteoffset, tf_base,
2608 offset, 1, 0, true, false);
2609
2610 /* Store the tess factors into the offchip buffer if TES reads them. */
2611 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2612 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2613 LLVMValueRef tf_inner_offset;
2614 unsigned param_outer, param_inner;
2615
2616 buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2617 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2618 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2619
2620 param_outer = si_shader_io_get_unique_index(
2621 TGSI_SEMANTIC_TESSOUTER, 0);
2622 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2623 LLVMConstInt(ctx->i32, param_outer, 0));
2624
2625 outer_vec = lp_build_gather_values(gallivm, outer,
2626 util_next_power_of_two(outer_comps));
2627
2628 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2629 outer_comps, tf_outer_offset,
2630 base, 0, 1, 0, true, false);
2631 if (inner_comps) {
2632 param_inner = si_shader_io_get_unique_index(
2633 TGSI_SEMANTIC_TESSINNER, 0);
2634 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2635 LLVMConstInt(ctx->i32, param_inner, 0));
2636
2637 inner_vec = inner_comps == 1 ? inner[0] :
2638 lp_build_gather_values(gallivm, inner, inner_comps);
2639 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2640 inner_comps, tf_inner_offset,
2641 base, 0, 1, 0, true, false);
2642 }
2643 }
2644
2645 lp_build_endif(&if_ctx);
2646 }
2647
2648 static LLVMValueRef
2649 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2650 unsigned param, unsigned return_index)
2651 {
2652 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2653 LLVMGetParam(ctx->main_fn, param),
2654 return_index, "");
2655 }
2656
2657 static LLVMValueRef
2658 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2659 unsigned param, unsigned return_index)
2660 {
2661 LLVMBuilderRef builder = ctx->gallivm.builder;
2662 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2663
2664 return LLVMBuildInsertValue(builder, ret,
2665 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2666 return_index, "");
2667 }
2668
2669 static LLVMValueRef
2670 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2671 unsigned param, unsigned return_index)
2672 {
2673 LLVMBuilderRef builder = ctx->gallivm.builder;
2674 LLVMValueRef ptr, lo, hi;
2675
2676 ptr = LLVMGetParam(ctx->main_fn, param);
2677 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2678 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2679 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2680 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2681 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2682 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2683 }
2684
2685 /* This only writes the tessellation factor levels. */
2686 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2687 {
2688 struct si_shader_context *ctx = si_shader_context(bld_base);
2689 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2690 LLVMValueRef offchip_soffset, offchip_layout;
2691
2692 si_copy_tcs_inputs(bld_base);
2693
2694 rel_patch_id = get_rel_patch_id(ctx);
2695 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2696 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2697
2698 /* Return epilog parameters from this function. */
2699 LLVMBuilderRef builder = ctx->gallivm.builder;
2700 LLVMValueRef ret = ctx->return_value;
2701 LLVMValueRef tf_soffset;
2702 unsigned vgpr;
2703
2704 offchip_layout = LLVMGetParam(ctx->main_fn,
2705 ctx->param_tcs_offchip_layout);
2706 offchip_soffset = LLVMGetParam(ctx->main_fn,
2707 ctx->param_tcs_offchip_offset);
2708 tf_soffset = LLVMGetParam(ctx->main_fn,
2709 ctx->param_tcs_factor_offset);
2710
2711 ret = si_insert_input_ptr_as_2xi32(ctx, ret,
2712 ctx->param_rw_buffers, 0);
2713
2714 if (ctx->screen->b.chip_class >= GFX9) {
2715 ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2716 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT, "");
2717 /* Tess offchip and tess factor offsets are at the beginning. */
2718 ret = LLVMBuildInsertValue(builder, ret, offchip_soffset, 2, "");
2719 ret = LLVMBuildInsertValue(builder, ret, tf_soffset, 4, "");
2720 vgpr = 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT + 1;
2721 } else {
2722 ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2723 GFX6_SGPR_TCS_OFFCHIP_LAYOUT, "");
2724 /* Tess offchip and tess factor offsets are after user SGPRs. */
2725 ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
2726 GFX6_TCS_NUM_USER_SGPR, "");
2727 ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2728 GFX6_TCS_NUM_USER_SGPR + 1, "");
2729 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2730 }
2731
2732 /* VGPRs */
2733 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2734 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2735 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2736
2737 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2738 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2739 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2740 ctx->return_value = ret;
2741 }
2742
2743 /* Pass TCS inputs from LS to TCS on GFX9. */
2744 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2745 {
2746 LLVMValueRef ret = ctx->return_value;
2747
2748 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2749 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2750 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2751 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2752 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2753
2754 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2755 8 + SI_SGPR_VS_STATE_BITS);
2756 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2757 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2758 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2759 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2760 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2761 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2762
2763 unsigned desc_param = ctx->param_tcs_out_lds_layout + 2;
2764 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2765 8 + GFX9_SGPR_TCS_CONST_BUFFERS);
2766 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2767 8 + GFX9_SGPR_TCS_SAMPLERS);
2768 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2769 8 + GFX9_SGPR_TCS_IMAGES);
2770 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2771 8 + GFX9_SGPR_TCS_SHADER_BUFFERS);
2772
2773 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2774 ret = si_insert_input_ret_float(ctx, ret,
2775 ctx->param_tcs_patch_id, vgpr++);
2776 ret = si_insert_input_ret_float(ctx, ret,
2777 ctx->param_tcs_rel_ids, vgpr++);
2778 ctx->return_value = ret;
2779 }
2780
2781 /* Pass GS inputs from ES to GS on GFX9. */
2782 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2783 {
2784 LLVMValueRef ret = ctx->return_value;
2785
2786 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2787 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2788 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2789
2790 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2791
2792 unsigned desc_param = ctx->param_vs_state_bits + 1;
2793 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2794 8 + GFX9_SGPR_GS_CONST_BUFFERS);
2795 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2796 8 + GFX9_SGPR_GS_SAMPLERS);
2797 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2798 8 + GFX9_SGPR_GS_IMAGES);
2799 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2800 8 + GFX9_SGPR_GS_SHADER_BUFFERS);
2801
2802 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2803 for (unsigned i = 0; i < 5; i++) {
2804 unsigned param = ctx->param_gs_vtx01_offset + i;
2805 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2806 }
2807 ctx->return_value = ret;
2808 }
2809
2810 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2811 {
2812 struct si_shader_context *ctx = si_shader_context(bld_base);
2813 struct si_shader *shader = ctx->shader;
2814 struct tgsi_shader_info *info = &shader->selector->info;
2815 struct gallivm_state *gallivm = &ctx->gallivm;
2816 unsigned i, chan;
2817 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2818 ctx->param_rel_auto_id);
2819 LLVMValueRef vertex_dw_stride =
2820 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2821 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2822 vertex_dw_stride, "");
2823
2824 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2825 * its inputs from it. */
2826 for (i = 0; i < info->num_outputs; i++) {
2827 LLVMValueRef *out_ptr = ctx->outputs[i];
2828 unsigned name = info->output_semantic_name[i];
2829 unsigned index = info->output_semantic_index[i];
2830
2831 /* The ARB_shader_viewport_layer_array spec contains the
2832 * following issue:
2833 *
2834 * 2) What happens if gl_ViewportIndex or gl_Layer is
2835 * written in the vertex shader and a geometry shader is
2836 * present?
2837 *
2838 * RESOLVED: The value written by the last vertex processing
2839 * stage is used. If the last vertex processing stage
2840 * (vertex, tessellation evaluation or geometry) does not
2841 * statically assign to gl_ViewportIndex or gl_Layer, index
2842 * or layer zero is assumed.
2843 *
2844 * So writes to those outputs in VS-as-LS are simply ignored.
2845 */
2846 if (name == TGSI_SEMANTIC_LAYER ||
2847 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2848 continue;
2849
2850 int param = si_shader_io_get_unique_index(name, index);
2851 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2852 LLVMConstInt(ctx->i32, param * 4, 0), "");
2853
2854 for (chan = 0; chan < 4; chan++) {
2855 lds_store(bld_base, chan, dw_addr,
2856 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2857 }
2858 }
2859
2860 if (ctx->screen->b.chip_class >= GFX9)
2861 si_set_ls_return_value_for_tcs(ctx);
2862 }
2863
2864 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2865 {
2866 struct si_shader_context *ctx = si_shader_context(bld_base);
2867 struct gallivm_state *gallivm = &ctx->gallivm;
2868 struct si_shader *es = ctx->shader;
2869 struct tgsi_shader_info *info = &es->selector->info;
2870 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2871 ctx->param_es2gs_offset);
2872 LLVMValueRef lds_base = NULL;
2873 unsigned chan;
2874 int i;
2875
2876 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2877 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2878 lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
2879 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2880 }
2881
2882 for (i = 0; i < info->num_outputs; i++) {
2883 LLVMValueRef *out_ptr = ctx->outputs[i];
2884 int param;
2885
2886 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2887 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2888 continue;
2889
2890 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2891 info->output_semantic_index[i]);
2892
2893 for (chan = 0; chan < 4; chan++) {
2894 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2895 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2896
2897 /* GFX9 has the ESGS ring in LDS. */
2898 if (ctx->screen->b.chip_class >= GFX9) {
2899 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
2900 continue;
2901 }
2902
2903 ac_build_buffer_store_dword(&ctx->ac,
2904 ctx->esgs_ring,
2905 out_val, 1, NULL, soffset,
2906 (4 * param + chan) * 4,
2907 1, 1, true, true);
2908 }
2909 }
2910
2911 if (ctx->screen->b.chip_class >= GFX9)
2912 si_set_es_return_value_for_gs(ctx);
2913 }
2914
2915 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2916 {
2917 if (ctx->screen->b.chip_class >= GFX9)
2918 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2919 else
2920 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2921 }
2922
2923 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2924 {
2925 struct si_shader_context *ctx = si_shader_context(bld_base);
2926
2927 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2928 si_get_gs_wave_id(ctx));
2929 }
2930
2931 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2932 {
2933 struct si_shader_context *ctx = si_shader_context(bld_base);
2934 struct gallivm_state *gallivm = &ctx->gallivm;
2935 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2936 struct si_shader_output_values *outputs = NULL;
2937 int i,j;
2938
2939 assert(!ctx->shader->is_gs_copy_shader);
2940
2941 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2942
2943 /* Vertex color clamping.
2944 *
2945 * This uses a state constant loaded in a user data SGPR and
2946 * an IF statement is added that clamps all colors if the constant
2947 * is true.
2948 */
2949 if (ctx->type == PIPE_SHADER_VERTEX) {
2950 struct lp_build_if_state if_ctx;
2951 LLVMValueRef cond = NULL;
2952 LLVMValueRef addr, val;
2953
2954 for (i = 0; i < info->num_outputs; i++) {
2955 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2956 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2957 continue;
2958
2959 /* We've found a color. */
2960 if (!cond) {
2961 /* The state is in the first bit of the user SGPR. */
2962 cond = LLVMGetParam(ctx->main_fn,
2963 ctx->param_vs_state_bits);
2964 cond = LLVMBuildTrunc(gallivm->builder, cond,
2965 ctx->i1, "");
2966 lp_build_if(&if_ctx, gallivm, cond);
2967 }
2968
2969 for (j = 0; j < 4; j++) {
2970 addr = ctx->outputs[i][j];
2971 val = LLVMBuildLoad(gallivm->builder, addr, "");
2972 val = ac_build_clamp(&ctx->ac, val);
2973 LLVMBuildStore(gallivm->builder, val, addr);
2974 }
2975 }
2976
2977 if (cond)
2978 lp_build_endif(&if_ctx);
2979 }
2980
2981 for (i = 0; i < info->num_outputs; i++) {
2982 outputs[i].semantic_name = info->output_semantic_name[i];
2983 outputs[i].semantic_index = info->output_semantic_index[i];
2984
2985 for (j = 0; j < 4; j++) {
2986 outputs[i].values[j] =
2987 LLVMBuildLoad(gallivm->builder,
2988 ctx->outputs[i][j],
2989 "");
2990 outputs[i].vertex_stream[j] =
2991 (info->output_streams[i] >> (2 * j)) & 3;
2992 }
2993
2994 }
2995
2996 /* Return the primitive ID from the LLVM function. */
2997 ctx->return_value =
2998 LLVMBuildInsertValue(gallivm->builder,
2999 ctx->return_value,
3000 bitcast(bld_base, TGSI_TYPE_FLOAT,
3001 get_primitive_id(bld_base, 0)),
3002 VS_EPILOG_PRIMID_LOC, "");
3003
3004 if (ctx->shader->selector->so.num_outputs)
3005 si_llvm_emit_streamout(ctx, outputs, i, 0);
3006 si_llvm_export_vs(bld_base, outputs, i);
3007 FREE(outputs);
3008 }
3009
3010 struct si_ps_exports {
3011 unsigned num;
3012 struct ac_export_args args[10];
3013 };
3014
3015 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3016 bool writes_samplemask)
3017 {
3018 if (writes_z) {
3019 /* Z needs 32 bits. */
3020 if (writes_samplemask)
3021 return V_028710_SPI_SHADER_32_ABGR;
3022 else if (writes_stencil)
3023 return V_028710_SPI_SHADER_32_GR;
3024 else
3025 return V_028710_SPI_SHADER_32_R;
3026 } else if (writes_stencil || writes_samplemask) {
3027 /* Both stencil and sample mask need only 16 bits. */
3028 return V_028710_SPI_SHADER_UINT16_ABGR;
3029 } else {
3030 return V_028710_SPI_SHADER_ZERO;
3031 }
3032 }
3033
3034 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3035 LLVMValueRef depth, LLVMValueRef stencil,
3036 LLVMValueRef samplemask, struct si_ps_exports *exp)
3037 {
3038 struct si_shader_context *ctx = si_shader_context(bld_base);
3039 struct lp_build_context *base = &bld_base->base;
3040 struct ac_export_args args;
3041 unsigned mask = 0;
3042 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3043 stencil != NULL,
3044 samplemask != NULL);
3045
3046 assert(depth || stencil || samplemask);
3047
3048 args.valid_mask = 1; /* whether the EXEC mask is valid */
3049 args.done = 1; /* DONE bit */
3050
3051 /* Specify the target we are exporting */
3052 args.target = V_008DFC_SQ_EXP_MRTZ;
3053
3054 args.compr = 0; /* COMP flag */
3055 args.out[0] = base->undef; /* R, depth */
3056 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3057 args.out[2] = base->undef; /* B, sample mask */
3058 args.out[3] = base->undef; /* A, alpha to mask */
3059
3060 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3061 assert(!depth);
3062 args.compr = 1; /* COMPR flag */
3063
3064 if (stencil) {
3065 /* Stencil should be in X[23:16]. */
3066 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3067 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3068 LLVMConstInt(ctx->i32, 16, 0), "");
3069 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3070 mask |= 0x3;
3071 }
3072 if (samplemask) {
3073 /* SampleMask should be in Y[15:0]. */
3074 args.out[1] = samplemask;
3075 mask |= 0xc;
3076 }
3077 } else {
3078 if (depth) {
3079 args.out[0] = depth;
3080 mask |= 0x1;
3081 }
3082 if (stencil) {
3083 args.out[1] = stencil;
3084 mask |= 0x2;
3085 }
3086 if (samplemask) {
3087 args.out[2] = samplemask;
3088 mask |= 0x4;
3089 }
3090 }
3091
3092 /* SI (except OLAND and HAINAN) has a bug that it only looks
3093 * at the X writemask component. */
3094 if (ctx->screen->b.chip_class == SI &&
3095 ctx->screen->b.family != CHIP_OLAND &&
3096 ctx->screen->b.family != CHIP_HAINAN)
3097 mask |= 0x1;
3098
3099 /* Specify which components to enable */
3100 args.enabled_channels = mask;
3101
3102 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3103 }
3104
3105 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3106 LLVMValueRef *color, unsigned index,
3107 unsigned samplemask_param,
3108 bool is_last, struct si_ps_exports *exp)
3109 {
3110 struct si_shader_context *ctx = si_shader_context(bld_base);
3111 struct lp_build_context *base = &bld_base->base;
3112 int i;
3113
3114 /* Clamp color */
3115 if (ctx->shader->key.part.ps.epilog.clamp_color)
3116 for (i = 0; i < 4; i++)
3117 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3118
3119 /* Alpha to one */
3120 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3121 color[3] = base->one;
3122
3123 /* Alpha test */
3124 if (index == 0 &&
3125 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3126 si_alpha_test(bld_base, color[3]);
3127
3128 /* Line & polygon smoothing */
3129 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3130 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3131 samplemask_param);
3132
3133 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3134 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3135 struct ac_export_args args[8];
3136 int c, last = -1;
3137
3138 /* Get the export arguments, also find out what the last one is. */
3139 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3140 si_llvm_init_export_args(bld_base, color,
3141 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3142 if (args[c].enabled_channels)
3143 last = c;
3144 }
3145
3146 /* Emit all exports. */
3147 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3148 if (is_last && last == c) {
3149 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3150 args[c].done = 1; /* DONE bit */
3151 } else if (!args[c].enabled_channels)
3152 continue; /* unnecessary NULL export */
3153
3154 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3155 }
3156 } else {
3157 struct ac_export_args args;
3158
3159 /* Export */
3160 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3161 &args);
3162 if (is_last) {
3163 args.valid_mask = 1; /* whether the EXEC mask is valid */
3164 args.done = 1; /* DONE bit */
3165 } else if (!args.enabled_channels)
3166 return; /* unnecessary NULL export */
3167
3168 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3169 }
3170 }
3171
3172 static void si_emit_ps_exports(struct si_shader_context *ctx,
3173 struct si_ps_exports *exp)
3174 {
3175 for (unsigned i = 0; i < exp->num; i++)
3176 ac_build_export(&ctx->ac, &exp->args[i]);
3177 }
3178
3179 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3180 {
3181 struct si_shader_context *ctx = si_shader_context(bld_base);
3182 struct lp_build_context *base = &bld_base->base;
3183 struct ac_export_args args;
3184
3185 args.enabled_channels = 0x0; /* enabled channels */
3186 args.valid_mask = 1; /* whether the EXEC mask is valid */
3187 args.done = 1; /* DONE bit */
3188 args.target = V_008DFC_SQ_EXP_NULL;
3189 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3190 args.out[0] = base->undef; /* R */
3191 args.out[1] = base->undef; /* G */
3192 args.out[2] = base->undef; /* B */
3193 args.out[3] = base->undef; /* A */
3194
3195 ac_build_export(&ctx->ac, &args);
3196 }
3197
3198 /**
3199 * Return PS outputs in this order:
3200 *
3201 * v[0:3] = color0.xyzw
3202 * v[4:7] = color1.xyzw
3203 * ...
3204 * vN+0 = Depth
3205 * vN+1 = Stencil
3206 * vN+2 = SampleMask
3207 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3208 *
3209 * The alpha-ref SGPR is returned via its original location.
3210 */
3211 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3212 {
3213 struct si_shader_context *ctx = si_shader_context(bld_base);
3214 struct si_shader *shader = ctx->shader;
3215 struct tgsi_shader_info *info = &shader->selector->info;
3216 LLVMBuilderRef builder = ctx->gallivm.builder;
3217 unsigned i, j, first_vgpr, vgpr;
3218
3219 LLVMValueRef color[8][4] = {};
3220 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3221 LLVMValueRef ret;
3222
3223 /* Read the output values. */
3224 for (i = 0; i < info->num_outputs; i++) {
3225 unsigned semantic_name = info->output_semantic_name[i];
3226 unsigned semantic_index = info->output_semantic_index[i];
3227
3228 switch (semantic_name) {
3229 case TGSI_SEMANTIC_COLOR:
3230 assert(semantic_index < 8);
3231 for (j = 0; j < 4; j++) {
3232 LLVMValueRef ptr = ctx->outputs[i][j];
3233 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3234 color[semantic_index][j] = result;
3235 }
3236 break;
3237 case TGSI_SEMANTIC_POSITION:
3238 depth = LLVMBuildLoad(builder,
3239 ctx->outputs[i][2], "");
3240 break;
3241 case TGSI_SEMANTIC_STENCIL:
3242 stencil = LLVMBuildLoad(builder,
3243 ctx->outputs[i][1], "");
3244 break;
3245 case TGSI_SEMANTIC_SAMPLEMASK:
3246 samplemask = LLVMBuildLoad(builder,
3247 ctx->outputs[i][0], "");
3248 break;
3249 default:
3250 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3251 semantic_name);
3252 }
3253 }
3254
3255 /* Fill the return structure. */
3256 ret = ctx->return_value;
3257
3258 /* Set SGPRs. */
3259 ret = LLVMBuildInsertValue(builder, ret,
3260 bitcast(bld_base, TGSI_TYPE_SIGNED,
3261 LLVMGetParam(ctx->main_fn,
3262 SI_PARAM_ALPHA_REF)),
3263 SI_SGPR_ALPHA_REF, "");
3264
3265 /* Set VGPRs */
3266 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3267 for (i = 0; i < ARRAY_SIZE(color); i++) {
3268 if (!color[i][0])
3269 continue;
3270
3271 for (j = 0; j < 4; j++)
3272 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3273 }
3274 if (depth)
3275 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3276 if (stencil)
3277 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3278 if (samplemask)
3279 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3280
3281 /* Add the input sample mask for smoothing at the end. */
3282 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3283 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3284 ret = LLVMBuildInsertValue(builder, ret,
3285 LLVMGetParam(ctx->main_fn,
3286 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3287
3288 ctx->return_value = ret;
3289 }
3290
3291 /**
3292 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3293 * buffer in number of elements and return it as an i32.
3294 */
3295 static LLVMValueRef get_buffer_size(
3296 struct lp_build_tgsi_context *bld_base,
3297 LLVMValueRef descriptor)
3298 {
3299 struct si_shader_context *ctx = si_shader_context(bld_base);
3300 struct gallivm_state *gallivm = &ctx->gallivm;
3301 LLVMBuilderRef builder = gallivm->builder;
3302 LLVMValueRef size =
3303 LLVMBuildExtractElement(builder, descriptor,
3304 LLVMConstInt(ctx->i32, 2, 0), "");
3305
3306 if (ctx->screen->b.chip_class == VI) {
3307 /* On VI, the descriptor contains the size in bytes,
3308 * but TXQ must return the size in elements.
3309 * The stride is always non-zero for resources using TXQ.
3310 */
3311 LLVMValueRef stride =
3312 LLVMBuildExtractElement(builder, descriptor,
3313 ctx->i32_1, "");
3314 stride = LLVMBuildLShr(builder, stride,
3315 LLVMConstInt(ctx->i32, 16, 0), "");
3316 stride = LLVMBuildAnd(builder, stride,
3317 LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3318
3319 size = LLVMBuildUDiv(builder, size, stride, "");
3320 }
3321
3322 return size;
3323 }
3324
3325 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3326 struct lp_build_tgsi_context *bld_base,
3327 struct lp_build_emit_data *emit_data);
3328
3329 /* Prevent optimizations (at least of memory accesses) across the current
3330 * point in the program by emitting empty inline assembly that is marked as
3331 * having side effects.
3332 *
3333 * Optionally, a value can be passed through the inline assembly to prevent
3334 * LLVM from hoisting calls to ReadNone functions.
3335 */
3336 static void emit_optimization_barrier(struct si_shader_context *ctx,
3337 LLVMValueRef *pvgpr)
3338 {
3339 static int counter = 0;
3340
3341 LLVMBuilderRef builder = ctx->gallivm.builder;
3342 char code[16];
3343
3344 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3345
3346 if (!pvgpr) {
3347 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3348 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3349 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3350 } else {
3351 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3352 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3353 LLVMValueRef vgpr = *pvgpr;
3354 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3355 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3356 LLVMValueRef vgpr0;
3357
3358 assert(vgpr_size % 4 == 0);
3359
3360 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3361 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3362 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3363 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3364 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3365
3366 *pvgpr = vgpr;
3367 }
3368 }
3369
3370 /* Combine these with & instead of |. */
3371 #define NOOP_WAITCNT 0xf7f
3372 #define LGKM_CNT 0x07f
3373 #define VM_CNT 0xf70
3374
3375 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3376 {
3377 struct gallivm_state *gallivm = &ctx->gallivm;
3378 LLVMBuilderRef builder = gallivm->builder;
3379 LLVMValueRef args[1] = {
3380 LLVMConstInt(ctx->i32, simm16, 0)
3381 };
3382 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3383 ctx->voidt, args, 1, 0);
3384 }
3385
3386 static void membar_emit(
3387 const struct lp_build_tgsi_action *action,
3388 struct lp_build_tgsi_context *bld_base,
3389 struct lp_build_emit_data *emit_data)
3390 {
3391 struct si_shader_context *ctx = si_shader_context(bld_base);
3392 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3393 unsigned flags = LLVMConstIntGetZExtValue(src0);
3394 unsigned waitcnt = NOOP_WAITCNT;
3395
3396 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3397 waitcnt &= VM_CNT & LGKM_CNT;
3398
3399 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3400 TGSI_MEMBAR_SHADER_BUFFER |
3401 TGSI_MEMBAR_SHADER_IMAGE))
3402 waitcnt &= VM_CNT;
3403
3404 if (flags & TGSI_MEMBAR_SHARED)
3405 waitcnt &= LGKM_CNT;
3406
3407 if (waitcnt != NOOP_WAITCNT)
3408 emit_waitcnt(ctx, waitcnt);
3409 }
3410
3411 static void clock_emit(
3412 const struct lp_build_tgsi_action *action,
3413 struct lp_build_tgsi_context *bld_base,
3414 struct lp_build_emit_data *emit_data)
3415 {
3416 struct si_shader_context *ctx = si_shader_context(bld_base);
3417 struct gallivm_state *gallivm = &ctx->gallivm;
3418 LLVMValueRef tmp;
3419
3420 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3421 ctx->i64, NULL, 0, 0);
3422 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3423
3424 emit_data->output[0] =
3425 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3426 emit_data->output[1] =
3427 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3428 }
3429
3430 static LLVMValueRef
3431 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3432 const struct tgsi_full_src_register *reg)
3433 {
3434 LLVMValueRef index;
3435 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3436 ctx->param_shader_buffers);
3437
3438 if (!reg->Register.Indirect)
3439 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3440 else
3441 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3442 reg->Register.Index,
3443 SI_NUM_SHADER_BUFFERS);
3444
3445 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3446 }
3447
3448 static bool tgsi_is_array_sampler(unsigned target)
3449 {
3450 return target == TGSI_TEXTURE_1D_ARRAY ||
3451 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3452 target == TGSI_TEXTURE_2D_ARRAY ||
3453 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3454 target == TGSI_TEXTURE_CUBE_ARRAY ||
3455 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3456 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3457 }
3458
3459 static bool tgsi_is_array_image(unsigned target)
3460 {
3461 return target == TGSI_TEXTURE_3D ||
3462 target == TGSI_TEXTURE_CUBE ||
3463 target == TGSI_TEXTURE_1D_ARRAY ||
3464 target == TGSI_TEXTURE_2D_ARRAY ||
3465 target == TGSI_TEXTURE_CUBE_ARRAY ||
3466 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3467 }
3468
3469 /**
3470 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3471 *
3472 * At least on Tonga, executing image stores on images with DCC enabled and
3473 * non-trivial can eventually lead to lockups. This can occur when an
3474 * application binds an image as read-only but then uses a shader that writes
3475 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3476 * program termination) in this case, but it doesn't cost much to be a bit
3477 * nicer: disabling DCC in the shader still leads to undefined results but
3478 * avoids the lockup.
3479 */
3480 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3481 LLVMValueRef rsrc)
3482 {
3483 if (ctx->screen->b.chip_class <= CIK) {
3484 return rsrc;
3485 } else {
3486 LLVMBuilderRef builder = ctx->gallivm.builder;
3487 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3488 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3489 LLVMValueRef tmp;
3490
3491 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3492 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3493 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3494 }
3495 }
3496
3497 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3498 {
3499 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3500 CONST_ADDR_SPACE);
3501 }
3502
3503 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3504 LLVMValueRef list, LLVMValueRef index,
3505 unsigned target)
3506 {
3507 LLVMBuilderRef builder = ctx->gallivm.builder;
3508
3509 if (target == TGSI_TEXTURE_BUFFER) {
3510 index = LLVMBuildMul(builder, index,
3511 LLVMConstInt(ctx->i32, 2, 0), "");
3512 index = LLVMBuildAdd(builder, index,
3513 ctx->i32_1, "");
3514 list = LLVMBuildPointerCast(builder, list,
3515 const_array(ctx->v4i32, 0), "");
3516 }
3517
3518 return ac_build_indexed_load_const(&ctx->ac, list, index);
3519 }
3520
3521 /**
3522 * Load the resource descriptor for \p image.
3523 */
3524 static void
3525 image_fetch_rsrc(
3526 struct lp_build_tgsi_context *bld_base,
3527 const struct tgsi_full_src_register *image,
3528 bool is_store, unsigned target,
3529 LLVMValueRef *rsrc)
3530 {
3531 struct si_shader_context *ctx = si_shader_context(bld_base);
3532 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3533 ctx->param_images);
3534 LLVMValueRef index;
3535 bool dcc_off = is_store;
3536
3537 assert(image->Register.File == TGSI_FILE_IMAGE);
3538
3539 if (!image->Register.Indirect) {
3540 const struct tgsi_shader_info *info = bld_base->info;
3541 unsigned images_writemask = info->images_store |
3542 info->images_atomic;
3543
3544 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3545
3546 if (images_writemask & (1 << image->Register.Index))
3547 dcc_off = true;
3548 } else {
3549 /* From the GL_ARB_shader_image_load_store extension spec:
3550 *
3551 * If a shader performs an image load, store, or atomic
3552 * operation using an image variable declared as an array,
3553 * and if the index used to select an individual element is
3554 * negative or greater than or equal to the size of the
3555 * array, the results of the operation are undefined but may
3556 * not lead to termination.
3557 */
3558 index = get_bounded_indirect_index(ctx, &image->Indirect,
3559 image->Register.Index,
3560 SI_NUM_IMAGES);
3561 }
3562
3563 *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3564 if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3565 *rsrc = force_dcc_off(ctx, *rsrc);
3566 }
3567
3568 static LLVMValueRef image_fetch_coords(
3569 struct lp_build_tgsi_context *bld_base,
3570 const struct tgsi_full_instruction *inst,
3571 unsigned src, LLVMValueRef desc)
3572 {
3573 struct si_shader_context *ctx = si_shader_context(bld_base);
3574 struct gallivm_state *gallivm = &ctx->gallivm;
3575 LLVMBuilderRef builder = gallivm->builder;
3576 unsigned target = inst->Memory.Texture;
3577 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3578 LLVMValueRef coords[4];
3579 LLVMValueRef tmp;
3580 int chan;
3581
3582 for (chan = 0; chan < num_coords; ++chan) {
3583 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3584 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3585 coords[chan] = tmp;
3586 }
3587
3588 if (ctx->screen->b.chip_class >= GFX9) {
3589 /* 1D textures are allocated and used as 2D on GFX9. */
3590 if (target == TGSI_TEXTURE_1D) {
3591 coords[1] = ctx->i32_0;
3592 num_coords++;
3593 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3594 coords[2] = coords[1];
3595 coords[1] = ctx->i32_0;
3596 num_coords++;
3597 } else if (target == TGSI_TEXTURE_2D) {
3598 /* The hw can't bind a slice of a 3D image as a 2D
3599 * image, because it ignores BASE_ARRAY if the target
3600 * is 3D. The workaround is to read BASE_ARRAY and set
3601 * it as the 3rd address operand for all 2D images.
3602 */
3603 LLVMValueRef first_layer, const5, mask;
3604
3605 const5 = LLVMConstInt(ctx->i32, 5, 0);
3606 mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3607 first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3608 first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3609
3610 coords[2] = first_layer;
3611 num_coords++;
3612 }
3613 }
3614
3615 if (num_coords == 1)
3616 return coords[0];
3617
3618 if (num_coords == 3) {
3619 /* LLVM has difficulties lowering 3-element vectors. */
3620 coords[3] = bld_base->uint_bld.undef;
3621 num_coords = 4;
3622 }
3623
3624 return lp_build_gather_values(gallivm, coords, num_coords);
3625 }
3626
3627 /**
3628 * Append the extra mode bits that are used by image load and store.
3629 */
3630 static void image_append_args(
3631 struct si_shader_context *ctx,
3632 struct lp_build_emit_data * emit_data,
3633 unsigned target,
3634 bool atomic,
3635 bool force_glc)
3636 {
3637 const struct tgsi_full_instruction *inst = emit_data->inst;
3638 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3639 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3640 LLVMValueRef r128 = i1false;
3641 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3642 LLVMValueRef glc =
3643 force_glc ||
3644 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3645 i1true : i1false;
3646 LLVMValueRef slc = i1false;
3647 LLVMValueRef lwe = i1false;
3648
3649 if (atomic || (HAVE_LLVM <= 0x0309)) {
3650 emit_data->args[emit_data->arg_count++] = r128;
3651 emit_data->args[emit_data->arg_count++] = da;
3652 if (!atomic) {
3653 emit_data->args[emit_data->arg_count++] = glc;
3654 }
3655 emit_data->args[emit_data->arg_count++] = slc;
3656 return;
3657 }
3658
3659 /* HAVE_LLVM >= 0x0400 */
3660 emit_data->args[emit_data->arg_count++] = glc;
3661 emit_data->args[emit_data->arg_count++] = slc;
3662 emit_data->args[emit_data->arg_count++] = lwe;
3663 emit_data->args[emit_data->arg_count++] = da;
3664 }
3665
3666 /**
3667 * Append the resource and indexing arguments for buffer intrinsics.
3668 *
3669 * \param rsrc the v4i32 buffer resource
3670 * \param index index into the buffer (stride-based)
3671 * \param offset byte offset into the buffer
3672 */
3673 static void buffer_append_args(
3674 struct si_shader_context *ctx,
3675 struct lp_build_emit_data *emit_data,
3676 LLVMValueRef rsrc,
3677 LLVMValueRef index,
3678 LLVMValueRef offset,
3679 bool atomic,
3680 bool force_glc)
3681 {
3682 const struct tgsi_full_instruction *inst = emit_data->inst;
3683 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3684 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3685
3686 emit_data->args[emit_data->arg_count++] = rsrc;
3687 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3688 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3689 if (!atomic) {
3690 emit_data->args[emit_data->arg_count++] =
3691 force_glc ||
3692 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3693 i1true : i1false; /* glc */
3694 }
3695 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3696 }
3697
3698 static void load_fetch_args(
3699 struct lp_build_tgsi_context * bld_base,
3700 struct lp_build_emit_data * emit_data)
3701 {
3702 struct si_shader_context *ctx = si_shader_context(bld_base);
3703 struct gallivm_state *gallivm = &ctx->gallivm;
3704 const struct tgsi_full_instruction * inst = emit_data->inst;
3705 unsigned target = inst->Memory.Texture;
3706 LLVMValueRef rsrc;
3707
3708 emit_data->dst_type = ctx->v4f32;
3709
3710 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3711 LLVMBuilderRef builder = gallivm->builder;
3712 LLVMValueRef offset;
3713 LLVMValueRef tmp;
3714
3715 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3716
3717 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3718 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3719
3720 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3721 offset, false, false);
3722 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3723 LLVMValueRef coords;
3724
3725 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3726 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3727
3728 if (target == TGSI_TEXTURE_BUFFER) {
3729 buffer_append_args(ctx, emit_data, rsrc, coords,
3730 ctx->i32_0, false, false);
3731 } else {
3732 emit_data->args[0] = coords;
3733 emit_data->args[1] = rsrc;
3734 emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3735 emit_data->arg_count = 3;
3736
3737 image_append_args(ctx, emit_data, target, false, false);
3738 }
3739 }
3740 }
3741
3742 static unsigned get_load_intr_attribs(bool readonly_memory)
3743 {
3744 /* READNONE means writes can't affect it, while READONLY means that
3745 * writes can affect it. */
3746 return readonly_memory && HAVE_LLVM >= 0x0400 ?
3747 LP_FUNC_ATTR_READNONE :
3748 LP_FUNC_ATTR_READONLY;
3749 }
3750
3751 static unsigned get_store_intr_attribs(bool writeonly_memory)
3752 {
3753 return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3754 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3755 LP_FUNC_ATTR_WRITEONLY;
3756 }
3757
3758 static void load_emit_buffer(struct si_shader_context *ctx,
3759 struct lp_build_emit_data *emit_data,
3760 bool readonly_memory)
3761 {
3762 const struct tgsi_full_instruction *inst = emit_data->inst;
3763 struct gallivm_state *gallivm = &ctx->gallivm;
3764 LLVMBuilderRef builder = gallivm->builder;
3765 uint writemask = inst->Dst[0].Register.WriteMask;
3766 uint count = util_last_bit(writemask);
3767 const char *intrinsic_name;
3768 LLVMTypeRef dst_type;
3769
3770 switch (count) {
3771 case 1:
3772 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3773 dst_type = ctx->f32;
3774 break;
3775 case 2:
3776 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3777 dst_type = LLVMVectorType(ctx->f32, 2);
3778 break;
3779 default: // 3 & 4
3780 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3781 dst_type = ctx->v4f32;
3782 count = 4;
3783 }
3784
3785 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3786 builder, intrinsic_name, dst_type,
3787 emit_data->args, emit_data->arg_count,
3788 get_load_intr_attribs(readonly_memory));
3789 }
3790
3791 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3792 const struct tgsi_full_instruction *inst,
3793 LLVMTypeRef type, int arg)
3794 {
3795 struct gallivm_state *gallivm = &ctx->gallivm;
3796 LLVMBuilderRef builder = gallivm->builder;
3797 LLVMValueRef offset, ptr;
3798 int addr_space;
3799
3800 offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3801 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3802
3803 ptr = ctx->shared_memory;
3804 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3805 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3806 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3807
3808 return ptr;
3809 }
3810
3811 static void load_emit_memory(
3812 struct si_shader_context *ctx,
3813 struct lp_build_emit_data *emit_data)
3814 {
3815 const struct tgsi_full_instruction *inst = emit_data->inst;
3816 struct gallivm_state *gallivm = &ctx->gallivm;
3817 LLVMBuilderRef builder = gallivm->builder;
3818 unsigned writemask = inst->Dst[0].Register.WriteMask;
3819 LLVMValueRef channels[4], ptr, derived_ptr, index;
3820 int chan;
3821
3822 ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3823
3824 for (chan = 0; chan < 4; ++chan) {
3825 if (!(writemask & (1 << chan))) {
3826 channels[chan] = LLVMGetUndef(ctx->f32);
3827 continue;
3828 }
3829
3830 index = LLVMConstInt(ctx->i32, chan, 0);
3831 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3832 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3833 }
3834 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3835 }
3836
3837 /**
3838 * Return true if the memory accessed by a LOAD or STORE instruction is
3839 * read-only or write-only, respectively.
3840 *
3841 * \param shader_buffers_reverse_access_mask
3842 * For LOAD, set this to (store | atomic) slot usage in the shader.
3843 * For STORE, set this to (load | atomic) slot usage in the shader.
3844 * \param images_reverse_access_mask Same as above, but for images.
3845 */
3846 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3847 const struct tgsi_shader_info *info,
3848 unsigned shader_buffers_reverse_access_mask,
3849 unsigned images_reverse_access_mask)
3850 {
3851 /* RESTRICT means NOALIAS.
3852 * If there are no writes, we can assume the accessed memory is read-only.
3853 * If there are no reads, we can assume the accessed memory is write-only.
3854 */
3855 if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3856 unsigned reverse_access_mask;
3857
3858 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3859 reverse_access_mask = shader_buffers_reverse_access_mask;
3860 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3861 reverse_access_mask = info->images_buffers &
3862 images_reverse_access_mask;
3863 } else {
3864 reverse_access_mask = ~info->images_buffers &
3865 images_reverse_access_mask;
3866 }
3867
3868 if (inst->Src[0].Register.Indirect) {
3869 if (!reverse_access_mask)
3870 return true;
3871 } else {
3872 if (!(reverse_access_mask &
3873 (1u << inst->Src[0].Register.Index)))
3874 return true;
3875 }
3876 }
3877
3878 /* If there are no buffer writes (for both shader buffers & image
3879 * buffers), it implies that buffer memory is read-only.
3880 * If there are no buffer reads (for both shader buffers & image
3881 * buffers), it implies that buffer memory is write-only.
3882 *
3883 * Same for the case when there are no writes/reads for non-buffer
3884 * images.
3885 */
3886 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3887 (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3888 inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3889 if (!shader_buffers_reverse_access_mask &&
3890 !(info->images_buffers & images_reverse_access_mask))
3891 return true;
3892 } else {
3893 if (!(~info->images_buffers & images_reverse_access_mask))
3894 return true;
3895 }
3896 return false;
3897 }
3898
3899 static void load_emit(
3900 const struct lp_build_tgsi_action *action,
3901 struct lp_build_tgsi_context *bld_base,
3902 struct lp_build_emit_data *emit_data)
3903 {
3904 struct si_shader_context *ctx = si_shader_context(bld_base);
3905 struct gallivm_state *gallivm = &ctx->gallivm;
3906 LLVMBuilderRef builder = gallivm->builder;
3907 const struct tgsi_full_instruction * inst = emit_data->inst;
3908 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3909 char intrinsic_name[64];
3910 bool readonly_memory = false;
3911
3912 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3913 load_emit_memory(ctx, emit_data);
3914 return;
3915 }
3916
3917 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3918 emit_waitcnt(ctx, VM_CNT);
3919
3920 readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3921 is_oneway_access_only(inst, info,
3922 info->shader_buffers_store |
3923 info->shader_buffers_atomic,
3924 info->images_store |
3925 info->images_atomic);
3926
3927 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3928 load_emit_buffer(ctx, emit_data, readonly_memory);
3929 return;
3930 }
3931
3932 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3933 emit_data->output[emit_data->chan] =
3934 lp_build_intrinsic(
3935 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3936 emit_data->args, emit_data->arg_count,
3937 get_load_intr_attribs(readonly_memory));
3938 } else {
3939 ac_get_image_intr_name("llvm.amdgcn.image.load",
3940 emit_data->dst_type, /* vdata */
3941 LLVMTypeOf(emit_data->args[0]), /* coords */
3942 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3943 intrinsic_name, sizeof(intrinsic_name));
3944
3945 emit_data->output[emit_data->chan] =
3946 lp_build_intrinsic(
3947 builder, intrinsic_name, emit_data->dst_type,
3948 emit_data->args, emit_data->arg_count,
3949 get_load_intr_attribs(readonly_memory));
3950 }
3951 }
3952
3953 static void store_fetch_args(
3954 struct lp_build_tgsi_context * bld_base,
3955 struct lp_build_emit_data * emit_data)
3956 {
3957 struct si_shader_context *ctx = si_shader_context(bld_base);
3958 struct gallivm_state *gallivm = &ctx->gallivm;
3959 LLVMBuilderRef builder = gallivm->builder;
3960 const struct tgsi_full_instruction * inst = emit_data->inst;
3961 struct tgsi_full_src_register memory;
3962 LLVMValueRef chans[4];
3963 LLVMValueRef data;
3964 LLVMValueRef rsrc;
3965 unsigned chan;
3966
3967 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3968
3969 for (chan = 0; chan < 4; ++chan) {
3970 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3971 }
3972 data = lp_build_gather_values(gallivm, chans, 4);
3973
3974 emit_data->args[emit_data->arg_count++] = data;
3975
3976 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3977
3978 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3979 LLVMValueRef offset;
3980 LLVMValueRef tmp;
3981
3982 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3983
3984 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3985 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3986
3987 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3988 offset, false, false);
3989 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3990 unsigned target = inst->Memory.Texture;
3991 LLVMValueRef coords;
3992
3993 /* 8bit/16bit TC L1 write corruption bug on SI.
3994 * All store opcodes not aligned to a dword are affected.
3995 *
3996 * The only way to get unaligned stores in radeonsi is through
3997 * shader images.
3998 */
3999 bool force_glc = ctx->screen->b.chip_class == SI;
4000
4001 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
4002 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
4003
4004 if (target == TGSI_TEXTURE_BUFFER) {
4005 buffer_append_args(ctx, emit_data, rsrc, coords,
4006 ctx->i32_0, false, force_glc);
4007 } else {
4008 emit_data->args[1] = coords;
4009 emit_data->args[2] = rsrc;
4010 emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
4011 emit_data->arg_count = 4;
4012
4013 image_append_args(ctx, emit_data, target, false, force_glc);
4014 }
4015 }
4016 }
4017
4018 static void store_emit_buffer(
4019 struct si_shader_context *ctx,
4020 struct lp_build_emit_data *emit_data,
4021 bool writeonly_memory)
4022 {
4023 const struct tgsi_full_instruction *inst = emit_data->inst;
4024 struct gallivm_state *gallivm = &ctx->gallivm;
4025 LLVMBuilderRef builder = gallivm->builder;
4026 LLVMValueRef base_data = emit_data->args[0];
4027 LLVMValueRef base_offset = emit_data->args[3];
4028 unsigned writemask = inst->Dst[0].Register.WriteMask;
4029
4030 while (writemask) {
4031 int start, count;
4032 const char *intrinsic_name;
4033 LLVMValueRef data;
4034 LLVMValueRef offset;
4035 LLVMValueRef tmp;
4036
4037 u_bit_scan_consecutive_range(&writemask, &start, &count);
4038
4039 /* Due to an LLVM limitation, split 3-element writes
4040 * into a 2-element and a 1-element write. */
4041 if (count == 3) {
4042 writemask |= 1 << (start + 2);
4043 count = 2;
4044 }
4045
4046 if (count == 4) {
4047 data = base_data;
4048 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
4049 } else if (count == 2) {
4050 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
4051
4052 tmp = LLVMBuildExtractElement(
4053 builder, base_data,
4054 LLVMConstInt(ctx->i32, start, 0), "");
4055 data = LLVMBuildInsertElement(
4056 builder, LLVMGetUndef(v2f32), tmp,
4057 ctx->i32_0, "");
4058
4059 tmp = LLVMBuildExtractElement(
4060 builder, base_data,
4061 LLVMConstInt(ctx->i32, start + 1, 0), "");
4062 data = LLVMBuildInsertElement(
4063 builder, data, tmp, ctx->i32_1, "");
4064
4065 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
4066 } else {
4067 assert(count == 1);
4068 data = LLVMBuildExtractElement(
4069 builder, base_data,
4070 LLVMConstInt(ctx->i32, start, 0), "");
4071 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
4072 }
4073
4074 offset = base_offset;
4075 if (start != 0) {
4076 offset = LLVMBuildAdd(
4077 builder, offset,
4078 LLVMConstInt(ctx->i32, start * 4, 0), "");
4079 }
4080
4081 emit_data->args[0] = data;
4082 emit_data->args[3] = offset;
4083
4084 lp_build_intrinsic(
4085 builder, intrinsic_name, emit_data->dst_type,
4086 emit_data->args, emit_data->arg_count,
4087 get_store_intr_attribs(writeonly_memory));
4088 }
4089 }
4090
4091 static void store_emit_memory(
4092 struct si_shader_context *ctx,
4093 struct lp_build_emit_data *emit_data)
4094 {
4095 const struct tgsi_full_instruction *inst = emit_data->inst;
4096 struct gallivm_state *gallivm = &ctx->gallivm;
4097 LLVMBuilderRef builder = gallivm->builder;
4098 unsigned writemask = inst->Dst[0].Register.WriteMask;
4099 LLVMValueRef ptr, derived_ptr, data, index;
4100 int chan;
4101
4102 ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
4103
4104 for (chan = 0; chan < 4; ++chan) {
4105 if (!(writemask & (1 << chan))) {
4106 continue;
4107 }
4108 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
4109 index = LLVMConstInt(ctx->i32, chan, 0);
4110 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
4111 LLVMBuildStore(builder, data, derived_ptr);
4112 }
4113 }
4114
4115 static void store_emit(
4116 const struct lp_build_tgsi_action *action,
4117 struct lp_build_tgsi_context *bld_base,
4118 struct lp_build_emit_data *emit_data)
4119 {
4120 struct si_shader_context *ctx = si_shader_context(bld_base);
4121 struct gallivm_state *gallivm = &ctx->gallivm;
4122 LLVMBuilderRef builder = gallivm->builder;
4123 const struct tgsi_full_instruction * inst = emit_data->inst;
4124 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
4125 unsigned target = inst->Memory.Texture;
4126 char intrinsic_name[64];
4127 bool writeonly_memory = false;
4128
4129 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
4130 store_emit_memory(ctx, emit_data);
4131 return;
4132 }
4133
4134 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
4135 emit_waitcnt(ctx, VM_CNT);
4136
4137 writeonly_memory = is_oneway_access_only(inst, info,
4138 info->shader_buffers_load |
4139 info->shader_buffers_atomic,
4140 info->images_load |
4141 info->images_atomic);
4142
4143 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4144 store_emit_buffer(ctx, emit_data, writeonly_memory);
4145 return;
4146 }
4147
4148 if (target == TGSI_TEXTURE_BUFFER) {
4149 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4150 builder, "llvm.amdgcn.buffer.store.format.v4f32",
4151 emit_data->dst_type, emit_data->args,
4152 emit_data->arg_count,
4153 get_store_intr_attribs(writeonly_memory));
4154 } else {
4155 ac_get_image_intr_name("llvm.amdgcn.image.store",
4156 LLVMTypeOf(emit_data->args[0]), /* vdata */
4157 LLVMTypeOf(emit_data->args[1]), /* coords */
4158 LLVMTypeOf(emit_data->args[2]), /* rsrc */
4159 intrinsic_name, sizeof(intrinsic_name));
4160
4161 emit_data->output[emit_data->chan] =
4162 lp_build_intrinsic(
4163 builder, intrinsic_name, emit_data->dst_type,
4164 emit_data->args, emit_data->arg_count,
4165 get_store_intr_attribs(writeonly_memory));
4166 }
4167 }
4168
4169 static void atomic_fetch_args(
4170 struct lp_build_tgsi_context * bld_base,
4171 struct lp_build_emit_data * emit_data)
4172 {
4173 struct si_shader_context *ctx = si_shader_context(bld_base);
4174 struct gallivm_state *gallivm = &ctx->gallivm;
4175 LLVMBuilderRef builder = gallivm->builder;
4176 const struct tgsi_full_instruction * inst = emit_data->inst;
4177 LLVMValueRef data1, data2;
4178 LLVMValueRef rsrc;
4179 LLVMValueRef tmp;
4180
4181 emit_data->dst_type = ctx->f32;
4182
4183 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4184 data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4185
4186 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4187 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4188 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4189 }
4190
4191 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4192 * of arguments, which is reversed relative to TGSI (and GLSL)
4193 */
4194 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4195 emit_data->args[emit_data->arg_count++] = data2;
4196 emit_data->args[emit_data->arg_count++] = data1;
4197
4198 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4199 LLVMValueRef offset;
4200
4201 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4202
4203 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4204 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4205
4206 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4207 offset, true, false);
4208 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4209 unsigned target = inst->Memory.Texture;
4210 LLVMValueRef coords;
4211
4212 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4213 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4214
4215 if (target == TGSI_TEXTURE_BUFFER) {
4216 buffer_append_args(ctx, emit_data, rsrc, coords,
4217 ctx->i32_0, true, false);
4218 } else {
4219 emit_data->args[emit_data->arg_count++] = coords;
4220 emit_data->args[emit_data->arg_count++] = rsrc;
4221
4222 image_append_args(ctx, emit_data, target, true, false);
4223 }
4224 }
4225 }
4226
4227 static void atomic_emit_memory(struct si_shader_context *ctx,
4228 struct lp_build_emit_data *emit_data) {
4229 struct gallivm_state *gallivm = &ctx->gallivm;
4230 LLVMBuilderRef builder = gallivm->builder;
4231 const struct tgsi_full_instruction * inst = emit_data->inst;
4232 LLVMValueRef ptr, result, arg;
4233
4234 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4235
4236 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4237 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4238
4239 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4240 LLVMValueRef new_data;
4241 new_data = lp_build_emit_fetch(&ctx->bld_base,
4242 inst, 3, 0);
4243
4244 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4245
4246 #if HAVE_LLVM >= 0x309
4247 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4248 LLVMAtomicOrderingSequentiallyConsistent,
4249 LLVMAtomicOrderingSequentiallyConsistent,
4250 false);
4251 #endif
4252
4253 result = LLVMBuildExtractValue(builder, result, 0, "");
4254 } else {
4255 LLVMAtomicRMWBinOp op;
4256
4257 switch(inst->Instruction.Opcode) {
4258 case TGSI_OPCODE_ATOMUADD:
4259 op = LLVMAtomicRMWBinOpAdd;
4260 break;
4261 case TGSI_OPCODE_ATOMXCHG:
4262 op = LLVMAtomicRMWBinOpXchg;
4263 break;
4264 case TGSI_OPCODE_ATOMAND:
4265 op = LLVMAtomicRMWBinOpAnd;
4266 break;
4267 case TGSI_OPCODE_ATOMOR:
4268 op = LLVMAtomicRMWBinOpOr;
4269 break;
4270 case TGSI_OPCODE_ATOMXOR:
4271 op = LLVMAtomicRMWBinOpXor;
4272 break;
4273 case TGSI_OPCODE_ATOMUMIN:
4274 op = LLVMAtomicRMWBinOpUMin;
4275 break;
4276 case TGSI_OPCODE_ATOMUMAX:
4277 op = LLVMAtomicRMWBinOpUMax;
4278 break;
4279 case TGSI_OPCODE_ATOMIMIN:
4280 op = LLVMAtomicRMWBinOpMin;
4281 break;
4282 case TGSI_OPCODE_ATOMIMAX:
4283 op = LLVMAtomicRMWBinOpMax;
4284 break;
4285 default:
4286 unreachable("unknown atomic opcode");
4287 }
4288
4289 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4290 LLVMAtomicOrderingSequentiallyConsistent,
4291 false);
4292 }
4293 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4294 }
4295
4296 static void atomic_emit(
4297 const struct lp_build_tgsi_action *action,
4298 struct lp_build_tgsi_context *bld_base,
4299 struct lp_build_emit_data *emit_data)
4300 {
4301 struct si_shader_context *ctx = si_shader_context(bld_base);
4302 struct gallivm_state *gallivm = &ctx->gallivm;
4303 LLVMBuilderRef builder = gallivm->builder;
4304 const struct tgsi_full_instruction * inst = emit_data->inst;
4305 char intrinsic_name[40];
4306 LLVMValueRef tmp;
4307
4308 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4309 atomic_emit_memory(ctx, emit_data);
4310 return;
4311 }
4312
4313 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4314 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4315 snprintf(intrinsic_name, sizeof(intrinsic_name),
4316 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4317 } else {
4318 LLVMValueRef coords;
4319 char coords_type[8];
4320
4321 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4322 coords = emit_data->args[2];
4323 else
4324 coords = emit_data->args[1];
4325
4326 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4327 snprintf(intrinsic_name, sizeof(intrinsic_name),
4328 "llvm.amdgcn.image.atomic.%s.%s",
4329 action->intr_name, coords_type);
4330 }
4331
4332 tmp = lp_build_intrinsic(
4333 builder, intrinsic_name, ctx->i32,
4334 emit_data->args, emit_data->arg_count, 0);
4335 emit_data->output[emit_data->chan] =
4336 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4337 }
4338
4339 static void set_tex_fetch_args(struct si_shader_context *ctx,
4340 struct lp_build_emit_data *emit_data,
4341 unsigned target,
4342 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4343 LLVMValueRef *param, unsigned count,
4344 unsigned dmask)
4345 {
4346 struct gallivm_state *gallivm = &ctx->gallivm;
4347 struct ac_image_args args = {};
4348
4349 /* Pad to power of two vector */
4350 while (count < util_next_power_of_two(count))
4351 param[count++] = LLVMGetUndef(ctx->i32);
4352
4353 if (count > 1)
4354 args.addr = lp_build_gather_values(gallivm, param, count);
4355 else
4356 args.addr = param[0];
4357
4358 args.resource = res_ptr;
4359 args.sampler = samp_ptr;
4360 args.dmask = dmask;
4361 args.unorm = target == TGSI_TEXTURE_RECT ||
4362 target == TGSI_TEXTURE_SHADOWRECT;
4363 args.da = tgsi_is_array_sampler(target);
4364
4365 /* Ugly, but we seem to have no other choice right now. */
4366 STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4367 memcpy(emit_data->args, &args, sizeof(args));
4368 }
4369
4370 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4371 unsigned target, LLVMValueRef out)
4372 {
4373 LLVMBuilderRef builder = ctx->gallivm.builder;
4374
4375 /* 1D textures are allocated and used as 2D on GFX9. */
4376 if (ctx->screen->b.chip_class >= GFX9 &&
4377 (target == TGSI_TEXTURE_1D_ARRAY ||
4378 target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4379 LLVMValueRef layers =
4380 LLVMBuildExtractElement(builder, out,
4381 LLVMConstInt(ctx->i32, 2, 0), "");
4382 out = LLVMBuildInsertElement(builder, out, layers,
4383 ctx->i32_1, "");
4384 }
4385
4386 /* Divide the number of layers by 6 to get the number of cubes. */
4387 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4388 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4389 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4390
4391 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4392 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4393
4394 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4395 }
4396 return out;
4397 }
4398
4399 static void resq_fetch_args(
4400 struct lp_build_tgsi_context * bld_base,
4401 struct lp_build_emit_data * emit_data)
4402 {
4403 struct si_shader_context *ctx = si_shader_context(bld_base);
4404 const struct tgsi_full_instruction *inst = emit_data->inst;
4405 const struct tgsi_full_src_register *reg = &inst->Src[0];
4406
4407 emit_data->dst_type = ctx->v4i32;
4408
4409 if (reg->Register.File == TGSI_FILE_BUFFER) {
4410 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4411 emit_data->arg_count = 1;
4412 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4413 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4414 &emit_data->args[0]);
4415 emit_data->arg_count = 1;
4416 } else {
4417 LLVMValueRef res_ptr;
4418 unsigned image_target;
4419
4420 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4421 image_target = TGSI_TEXTURE_2D_ARRAY;
4422 else
4423 image_target = inst->Memory.Texture;
4424
4425 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4426 &res_ptr);
4427 set_tex_fetch_args(ctx, emit_data, image_target,
4428 res_ptr, NULL, &ctx->i32_0, 1,
4429 0xf);
4430 }
4431 }
4432
4433 static void resq_emit(
4434 const struct lp_build_tgsi_action *action,
4435 struct lp_build_tgsi_context *bld_base,
4436 struct lp_build_emit_data *emit_data)
4437 {
4438 struct si_shader_context *ctx = si_shader_context(bld_base);
4439 struct gallivm_state *gallivm = &ctx->gallivm;
4440 LLVMBuilderRef builder = gallivm->builder;
4441 const struct tgsi_full_instruction *inst = emit_data->inst;
4442 LLVMValueRef out;
4443
4444 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4445 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4446 LLVMConstInt(ctx->i32, 2, 0), "");
4447 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4448 out = get_buffer_size(bld_base, emit_data->args[0]);
4449 } else {
4450 struct ac_image_args args;
4451
4452 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4453 args.opcode = ac_image_get_resinfo;
4454 out = ac_build_image_opcode(&ctx->ac, &args);
4455
4456 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4457 }
4458
4459 emit_data->output[emit_data->chan] = out;
4460 }
4461
4462 static const struct lp_build_tgsi_action tex_action;
4463
4464 enum desc_type {
4465 DESC_IMAGE,
4466 DESC_BUFFER,
4467 DESC_FMASK,
4468 DESC_SAMPLER,
4469 };
4470
4471 /**
4472 * Load an image view, fmask view. or sampler state descriptor.
4473 */
4474 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4475 LLVMValueRef list, LLVMValueRef index,
4476 enum desc_type type)
4477 {
4478 struct gallivm_state *gallivm = &ctx->gallivm;
4479 LLVMBuilderRef builder = gallivm->builder;
4480
4481 switch (type) {
4482 case DESC_IMAGE:
4483 /* The image is at [0:7]. */
4484 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4485 break;
4486 case DESC_BUFFER:
4487 /* The buffer is in [4:7]. */
4488 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4489 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4490 list = LLVMBuildPointerCast(builder, list,
4491 const_array(ctx->v4i32, 0), "");
4492 break;
4493 case DESC_FMASK:
4494 /* The FMASK is at [8:15]. */
4495 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4496 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4497 break;
4498 case DESC_SAMPLER:
4499 /* The sampler state is at [12:15]. */
4500 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4501 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4502 list = LLVMBuildPointerCast(builder, list,
4503 const_array(ctx->v4i32, 0), "");
4504 break;
4505 }
4506
4507 return ac_build_indexed_load_const(&ctx->ac, list, index);
4508 }
4509
4510 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4511 *
4512 * SI-CI:
4513 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4514 * filtering manually. The driver sets img7 to a mask clearing
4515 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4516 * s_and_b32 samp0, samp0, img7
4517 *
4518 * VI:
4519 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4520 */
4521 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4522 LLVMValueRef res, LLVMValueRef samp)
4523 {
4524 LLVMBuilderRef builder = ctx->gallivm.builder;
4525 LLVMValueRef img7, samp0;
4526
4527 if (ctx->screen->b.chip_class >= VI)
4528 return samp;
4529
4530 img7 = LLVMBuildExtractElement(builder, res,
4531 LLVMConstInt(ctx->i32, 7, 0), "");
4532 samp0 = LLVMBuildExtractElement(builder, samp,
4533 ctx->i32_0, "");
4534 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4535 return LLVMBuildInsertElement(builder, samp, samp0,
4536 ctx->i32_0, "");
4537 }
4538
4539 static void tex_fetch_ptrs(
4540 struct lp_build_tgsi_context *bld_base,
4541 struct lp_build_emit_data *emit_data,
4542 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4543 {
4544 struct si_shader_context *ctx = si_shader_context(bld_base);
4545 LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4546 const struct tgsi_full_instruction *inst = emit_data->inst;
4547 const struct tgsi_full_src_register *reg;
4548 unsigned target = inst->Texture.Texture;
4549 unsigned sampler_src;
4550 LLVMValueRef index;
4551
4552 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4553 reg = &emit_data->inst->Src[sampler_src];
4554
4555 if (reg->Register.Indirect) {
4556 index = get_bounded_indirect_index(ctx,
4557 &reg->Indirect,
4558 reg->Register.Index,
4559 SI_NUM_SAMPLERS);
4560 } else {
4561 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4562 }
4563
4564 if (target == TGSI_TEXTURE_BUFFER)
4565 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4566 else
4567 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4568
4569 if (samp_ptr)
4570 *samp_ptr = NULL;
4571 if (fmask_ptr)
4572 *fmask_ptr = NULL;
4573
4574 if (target == TGSI_TEXTURE_2D_MSAA ||
4575 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4576 if (fmask_ptr)
4577 *fmask_ptr = load_sampler_desc(ctx, list, index,
4578 DESC_FMASK);
4579 } else if (target != TGSI_TEXTURE_BUFFER) {
4580 if (samp_ptr) {
4581 *samp_ptr = load_sampler_desc(ctx, list, index,
4582 DESC_SAMPLER);
4583 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4584 }
4585 }
4586 }
4587
4588 static void txq_fetch_args(
4589 struct lp_build_tgsi_context *bld_base,
4590 struct lp_build_emit_data *emit_data)
4591 {
4592 struct si_shader_context *ctx = si_shader_context(bld_base);
4593 const struct tgsi_full_instruction *inst = emit_data->inst;
4594 unsigned target = inst->Texture.Texture;
4595 LLVMValueRef res_ptr;
4596 LLVMValueRef address;
4597
4598 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4599
4600 if (target == TGSI_TEXTURE_BUFFER) {
4601 /* Read the size from the buffer descriptor directly. */
4602 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4603 return;
4604 }
4605
4606 /* Textures - set the mip level. */
4607 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4608
4609 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4610 NULL, &address, 1, 0xf);
4611 }
4612
4613 static void txq_emit(const struct lp_build_tgsi_action *action,
4614 struct lp_build_tgsi_context *bld_base,
4615 struct lp_build_emit_data *emit_data)
4616 {
4617 struct si_shader_context *ctx = si_shader_context(bld_base);
4618 struct ac_image_args args;
4619 unsigned target = emit_data->inst->Texture.Texture;
4620
4621 if (target == TGSI_TEXTURE_BUFFER) {
4622 /* Just return the buffer size. */
4623 emit_data->output[emit_data->chan] = emit_data->args[0];
4624 return;
4625 }
4626
4627 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4628
4629 args.opcode = ac_image_get_resinfo;
4630 LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4631
4632 emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4633 }
4634
4635 static void tex_fetch_args(
4636 struct lp_build_tgsi_context *bld_base,
4637 struct lp_build_emit_data *emit_data)
4638 {
4639 struct si_shader_context *ctx = si_shader_context(bld_base);
4640 struct gallivm_state *gallivm = &ctx->gallivm;
4641 const struct tgsi_full_instruction *inst = emit_data->inst;
4642 unsigned opcode = inst->Instruction.Opcode;
4643 unsigned target = inst->Texture.Texture;
4644 LLVMValueRef coords[5], derivs[6];
4645 LLVMValueRef address[16];
4646 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4647 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4648 unsigned count = 0;
4649 unsigned chan;
4650 unsigned num_deriv_channels = 0;
4651 bool has_offset = inst->Texture.NumOffsets > 0;
4652 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4653 unsigned dmask = 0xf;
4654
4655 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4656
4657 if (target == TGSI_TEXTURE_BUFFER) {
4658 emit_data->dst_type = ctx->v4f32;
4659 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
4660 ctx->v16i8, "");
4661 emit_data->args[1] = ctx->i32_0;
4662 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4663 emit_data->arg_count = 3;
4664 return;
4665 }
4666
4667 /* Fetch and project texture coordinates */
4668 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4669 for (chan = 0; chan < 3; chan++ ) {
4670 coords[chan] = lp_build_emit_fetch(bld_base,
4671 emit_data->inst, 0,
4672 chan);
4673 if (opcode == TGSI_OPCODE_TXP)
4674 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4675 TGSI_OPCODE_DIV,
4676 coords[chan],
4677 coords[3]);
4678 }
4679
4680 if (opcode == TGSI_OPCODE_TXP)
4681 coords[3] = bld_base->base.one;
4682
4683 /* Pack offsets. */
4684 if (has_offset &&
4685 opcode != TGSI_OPCODE_TXF &&
4686 opcode != TGSI_OPCODE_TXF_LZ) {
4687 /* The offsets are six-bit signed integers packed like this:
4688 * X=[5:0], Y=[13:8], and Z=[21:16].
4689 */
4690 LLVMValueRef offset[3], pack;
4691
4692 assert(inst->Texture.NumOffsets == 1);
4693
4694 for (chan = 0; chan < 3; chan++) {
4695 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4696 emit_data->inst, 0, chan);
4697 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4698 LLVMConstInt(ctx->i32, 0x3f, 0), "");
4699 if (chan)
4700 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4701 LLVMConstInt(ctx->i32, chan*8, 0), "");
4702 }
4703
4704 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4705 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4706 address[count++] = pack;
4707 }
4708
4709 /* Pack LOD bias value */
4710 if (opcode == TGSI_OPCODE_TXB)
4711 address[count++] = coords[3];
4712 if (opcode == TGSI_OPCODE_TXB2)
4713 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4714
4715 /* Pack depth comparison value */
4716 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4717 LLVMValueRef z;
4718
4719 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4720 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4721 } else {
4722 assert(ref_pos >= 0);
4723 z = coords[ref_pos];
4724 }
4725
4726 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4727 * so the depth comparison value isn't clamped for Z16 and
4728 * Z24 anymore. Do it manually here.
4729 *
4730 * It's unnecessary if the original texture format was
4731 * Z32_FLOAT, but we don't know that here.
4732 */
4733 if (ctx->screen->b.chip_class == VI)
4734 z = ac_build_clamp(&ctx->ac, z);
4735
4736 address[count++] = z;
4737 }
4738
4739 /* Pack user derivatives */
4740 if (opcode == TGSI_OPCODE_TXD) {
4741 int param, num_src_deriv_channels, num_dst_deriv_channels;
4742
4743 switch (target) {
4744 case TGSI_TEXTURE_3D:
4745 num_src_deriv_channels = 3;
4746 num_dst_deriv_channels = 3;
4747 num_deriv_channels = 3;
4748 break;
4749 case TGSI_TEXTURE_2D:
4750 case TGSI_TEXTURE_SHADOW2D:
4751 case TGSI_TEXTURE_RECT:
4752 case TGSI_TEXTURE_SHADOWRECT:
4753 case TGSI_TEXTURE_2D_ARRAY:
4754 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4755 num_src_deriv_channels = 2;
4756 num_dst_deriv_channels = 2;
4757 num_deriv_channels = 2;
4758 break;
4759 case TGSI_TEXTURE_CUBE:
4760 case TGSI_TEXTURE_SHADOWCUBE:
4761 case TGSI_TEXTURE_CUBE_ARRAY:
4762 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4763 /* Cube derivatives will be converted to 2D. */
4764 num_src_deriv_channels = 3;
4765 num_dst_deriv_channels = 3;
4766 num_deriv_channels = 2;
4767 break;
4768 case TGSI_TEXTURE_1D:
4769 case TGSI_TEXTURE_SHADOW1D:
4770 case TGSI_TEXTURE_1D_ARRAY:
4771 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4772 num_src_deriv_channels = 1;
4773
4774 /* 1D textures are allocated and used as 2D on GFX9. */
4775 if (ctx->screen->b.chip_class >= GFX9) {
4776 num_dst_deriv_channels = 2;
4777 num_deriv_channels = 2;
4778 } else {
4779 num_dst_deriv_channels = 1;
4780 num_deriv_channels = 1;
4781 }
4782 break;
4783 default:
4784 unreachable("invalid target");
4785 }
4786
4787 for (param = 0; param < 2; param++) {
4788 for (chan = 0; chan < num_src_deriv_channels; chan++)
4789 derivs[param * num_dst_deriv_channels + chan] =
4790 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4791
4792 /* Fill in the rest with zeros. */
4793 for (chan = num_src_deriv_channels;
4794 chan < num_dst_deriv_channels; chan++)
4795 derivs[param * num_dst_deriv_channels + chan] =
4796 bld_base->base.zero;
4797 }
4798 }
4799
4800 if (target == TGSI_TEXTURE_CUBE ||
4801 target == TGSI_TEXTURE_CUBE_ARRAY ||
4802 target == TGSI_TEXTURE_SHADOWCUBE ||
4803 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4804 ac_prepare_cube_coords(&ctx->ac,
4805 opcode == TGSI_OPCODE_TXD,
4806 target == TGSI_TEXTURE_CUBE_ARRAY ||
4807 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4808 coords, derivs);
4809
4810 if (opcode == TGSI_OPCODE_TXD)
4811 for (int i = 0; i < num_deriv_channels * 2; i++)
4812 address[count++] = derivs[i];
4813
4814 /* Pack texture coordinates */
4815 address[count++] = coords[0];
4816 if (num_coords > 1)
4817 address[count++] = coords[1];
4818 if (num_coords > 2)
4819 address[count++] = coords[2];
4820
4821 /* 1D textures are allocated and used as 2D on GFX9. */
4822 if (ctx->screen->b.chip_class >= GFX9) {
4823 LLVMValueRef filler;
4824
4825 /* Use 0.5, so that we don't sample the border color. */
4826 if (opcode == TGSI_OPCODE_TXF)
4827 filler = ctx->i32_0;
4828 else
4829 filler = LLVMConstReal(ctx->f32, 0.5);
4830
4831 if (target == TGSI_TEXTURE_1D ||
4832 target == TGSI_TEXTURE_SHADOW1D) {
4833 address[count++] = filler;
4834 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4835 target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4836 address[count] = address[count - 1];
4837 address[count - 1] = filler;
4838 count++;
4839 }
4840 }
4841
4842 /* Pack LOD or sample index */
4843 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4844 address[count++] = coords[3];
4845 else if (opcode == TGSI_OPCODE_TXL2)
4846 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4847
4848 if (count > 16) {
4849 assert(!"Cannot handle more than 16 texture address parameters");
4850 count = 16;
4851 }
4852
4853 for (chan = 0; chan < count; chan++ ) {
4854 address[chan] = LLVMBuildBitCast(gallivm->builder,
4855 address[chan], ctx->i32, "");
4856 }
4857
4858 /* Adjust the sample index according to FMASK.
4859 *
4860 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4861 * which is the identity mapping. Each nibble says which physical sample
4862 * should be fetched to get that sample.
4863 *
4864 * For example, 0x11111100 means there are only 2 samples stored and
4865 * the second sample covers 3/4 of the pixel. When reading samples 0
4866 * and 1, return physical sample 0 (determined by the first two 0s
4867 * in FMASK), otherwise return physical sample 1.
4868 *
4869 * The sample index should be adjusted as follows:
4870 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4871 */
4872 if (target == TGSI_TEXTURE_2D_MSAA ||
4873 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4874 struct lp_build_emit_data txf_emit_data = *emit_data;
4875 LLVMValueRef txf_address[4];
4876 /* We only need .xy for non-arrays, and .xyz for arrays. */
4877 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4878 struct tgsi_full_instruction inst = {};
4879
4880 memcpy(txf_address, address, sizeof(txf_address));
4881
4882 /* Read FMASK using TXF_LZ. */
4883 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4884 inst.Texture.Texture = target;
4885 txf_emit_data.inst = &inst;
4886 txf_emit_data.chan = 0;
4887 set_tex_fetch_args(ctx, &txf_emit_data,
4888 target, fmask_ptr, NULL,
4889 txf_address, txf_count, 0xf);
4890 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4891
4892 /* Initialize some constants. */
4893 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4894 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4895
4896 /* Apply the formula. */
4897 LLVMValueRef fmask =
4898 LLVMBuildExtractElement(gallivm->builder,
4899 txf_emit_data.output[0],
4900 ctx->i32_0, "");
4901
4902 unsigned sample_chan = txf_count; /* the sample index is last */
4903
4904 LLVMValueRef sample_index4 =
4905 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4906
4907 LLVMValueRef shifted_fmask =
4908 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4909
4910 LLVMValueRef final_sample =
4911 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4912
4913 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4914 * resource descriptor is 0 (invalid),
4915 */
4916 LLVMValueRef fmask_desc =
4917 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4918 ctx->v8i32, "");
4919
4920 LLVMValueRef fmask_word1 =
4921 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4922 ctx->i32_1, "");
4923
4924 LLVMValueRef word1_is_nonzero =
4925 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4926 fmask_word1, ctx->i32_0, "");
4927
4928 /* Replace the MSAA sample index. */
4929 address[sample_chan] =
4930 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4931 final_sample, address[sample_chan], "");
4932 }
4933
4934 if (opcode == TGSI_OPCODE_TXF ||
4935 opcode == TGSI_OPCODE_TXF_LZ) {
4936 /* add tex offsets */
4937 if (inst->Texture.NumOffsets) {
4938 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4939 const struct tgsi_texture_offset *off = inst->TexOffsets;
4940
4941 assert(inst->Texture.NumOffsets == 1);
4942
4943 switch (target) {
4944 case TGSI_TEXTURE_3D:
4945 address[2] = lp_build_add(uint_bld, address[2],
4946 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4947 /* fall through */
4948 case TGSI_TEXTURE_2D:
4949 case TGSI_TEXTURE_SHADOW2D:
4950 case TGSI_TEXTURE_RECT:
4951 case TGSI_TEXTURE_SHADOWRECT:
4952 case TGSI_TEXTURE_2D_ARRAY:
4953 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4954 address[1] =
4955 lp_build_add(uint_bld, address[1],
4956 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4957 /* fall through */
4958 case TGSI_TEXTURE_1D:
4959 case TGSI_TEXTURE_SHADOW1D:
4960 case TGSI_TEXTURE_1D_ARRAY:
4961 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4962 address[0] =
4963 lp_build_add(uint_bld, address[0],
4964 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4965 break;
4966 /* texture offsets do not apply to other texture targets */
4967 }
4968 }
4969 }
4970
4971 if (opcode == TGSI_OPCODE_TG4) {
4972 unsigned gather_comp = 0;
4973
4974 /* DMASK was repurposed for GATHER4. 4 components are always
4975 * returned and DMASK works like a swizzle - it selects
4976 * the component to fetch. The only valid DMASK values are
4977 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4978 * (red,red,red,red) etc.) The ISA document doesn't mention
4979 * this.
4980 */
4981
4982 /* Get the component index from src1.x for Gather4. */
4983 if (!tgsi_is_shadow_target(target)) {
4984 LLVMValueRef comp_imm;
4985 struct tgsi_src_register src1 = inst->Src[1].Register;
4986
4987 assert(src1.File == TGSI_FILE_IMMEDIATE);
4988
4989 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
4990 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4991 gather_comp = CLAMP(gather_comp, 0, 3);
4992 }
4993
4994 dmask = 1 << gather_comp;
4995 }
4996
4997 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4998 samp_ptr, address, count, dmask);
4999 }
5000
5001 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
5002 * incorrectly forces nearest filtering if the texture format is integer.
5003 * The only effect it has on Gather4, which always returns 4 texels for
5004 * bilinear filtering, is that the final coordinates are off by 0.5 of
5005 * the texel size.
5006 *
5007 * The workaround is to subtract 0.5 from the unnormalized coordinates,
5008 * or (0.5 / size) from the normalized coordinates.
5009 */
5010 static void si_lower_gather4_integer(struct si_shader_context *ctx,
5011 struct ac_image_args *args,
5012 unsigned target)
5013 {
5014 LLVMBuilderRef builder = ctx->gallivm.builder;
5015 LLVMValueRef coord = args->addr;
5016 LLVMValueRef half_texel[2];
5017 /* Texture coordinates start after:
5018 * {offset, bias, z-compare, derivatives}
5019 * Only the offset and z-compare can occur here.
5020 */
5021 unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
5022 int c;
5023
5024 if (target == TGSI_TEXTURE_RECT ||
5025 target == TGSI_TEXTURE_SHADOWRECT) {
5026 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
5027 } else {
5028 struct tgsi_full_instruction txq_inst = {};
5029 struct lp_build_emit_data txq_emit_data = {};
5030
5031 /* Query the texture size. */
5032 txq_inst.Texture.Texture = target;
5033 txq_emit_data.inst = &txq_inst;
5034 txq_emit_data.dst_type = ctx->v4i32;
5035 set_tex_fetch_args(ctx, &txq_emit_data, target,
5036 args->resource, NULL, &ctx->i32_0,
5037 1, 0xf);
5038 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
5039
5040 /* Compute -0.5 / size. */
5041 for (c = 0; c < 2; c++) {
5042 half_texel[c] =
5043 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
5044 LLVMConstInt(ctx->i32, c, 0), "");
5045 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
5046 half_texel[c] =
5047 lp_build_emit_llvm_unary(&ctx->bld_base,
5048 TGSI_OPCODE_RCP, half_texel[c]);
5049 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
5050 LLVMConstReal(ctx->f32, -0.5), "");
5051 }
5052 }
5053
5054 for (c = 0; c < 2; c++) {
5055 LLVMValueRef tmp;
5056 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
5057
5058 tmp = LLVMBuildExtractElement(builder, coord, index, "");
5059 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
5060 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
5061 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
5062 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
5063 }
5064
5065 args->addr = coord;
5066 }
5067
5068 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
5069 struct lp_build_tgsi_context *bld_base,
5070 struct lp_build_emit_data *emit_data)
5071 {
5072 struct si_shader_context *ctx = si_shader_context(bld_base);
5073 const struct tgsi_full_instruction *inst = emit_data->inst;
5074 struct ac_image_args args;
5075 unsigned opcode = inst->Instruction.Opcode;
5076 unsigned target = inst->Texture.Texture;
5077
5078 if (target == TGSI_TEXTURE_BUFFER) {
5079 emit_data->output[emit_data->chan] =
5080 ac_build_buffer_load_format(&ctx->ac,
5081 emit_data->args[0],
5082 emit_data->args[2],
5083 emit_data->args[1],
5084 true);
5085 return;
5086 }
5087
5088 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
5089
5090 args.opcode = ac_image_sample;
5091 args.compare = tgsi_is_shadow_target(target);
5092 args.offset = inst->Texture.NumOffsets > 0;
5093
5094 switch (opcode) {
5095 case TGSI_OPCODE_TXF:
5096 case TGSI_OPCODE_TXF_LZ:
5097 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
5098 target == TGSI_TEXTURE_2D_MSAA ||
5099 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
5100 ac_image_load : ac_image_load_mip;
5101 args.compare = false;
5102 args.offset = false;
5103 break;
5104 case TGSI_OPCODE_LODQ:
5105 args.opcode = ac_image_get_lod;
5106 args.compare = false;
5107 args.offset = false;
5108 break;
5109 case TGSI_OPCODE_TEX:
5110 case TGSI_OPCODE_TEX2:
5111 case TGSI_OPCODE_TXP:
5112 if (ctx->type != PIPE_SHADER_FRAGMENT)
5113 args.level_zero = true;
5114 break;
5115 case TGSI_OPCODE_TEX_LZ:
5116 args.level_zero = true;
5117 break;
5118 case TGSI_OPCODE_TXB:
5119 case TGSI_OPCODE_TXB2:
5120 assert(ctx->type == PIPE_SHADER_FRAGMENT);
5121 args.bias = true;
5122 break;
5123 case TGSI_OPCODE_TXL:
5124 case TGSI_OPCODE_TXL2:
5125 args.lod = true;
5126 break;
5127 case TGSI_OPCODE_TXD:
5128 args.deriv = true;
5129 break;
5130 case TGSI_OPCODE_TG4:
5131 args.opcode = ac_image_gather4;
5132 args.level_zero = true;
5133 break;
5134 default:
5135 assert(0);
5136 return;
5137 }
5138
5139 /* The hardware needs special lowering for Gather4 with integer formats. */
5140 if (ctx->screen->b.chip_class <= VI &&
5141 opcode == TGSI_OPCODE_TG4) {
5142 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5143 /* This will also work with non-constant indexing because of how
5144 * glsl_to_tgsi works and we intent to preserve that behavior.
5145 */
5146 const unsigned src_idx = 2;
5147 unsigned sampler = inst->Src[src_idx].Register.Index;
5148
5149 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
5150
5151 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
5152 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
5153 si_lower_gather4_integer(ctx, &args, target);
5154 }
5155
5156 emit_data->output[emit_data->chan] =
5157 ac_build_image_opcode(&ctx->ac, &args);
5158 }
5159
5160 static void si_llvm_emit_txqs(
5161 const struct lp_build_tgsi_action *action,
5162 struct lp_build_tgsi_context *bld_base,
5163 struct lp_build_emit_data *emit_data)
5164 {
5165 struct si_shader_context *ctx = si_shader_context(bld_base);
5166 struct gallivm_state *gallivm = &ctx->gallivm;
5167 LLVMBuilderRef builder = gallivm->builder;
5168 LLVMValueRef res, samples;
5169 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5170
5171 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5172
5173
5174 /* Read the samples from the descriptor directly. */
5175 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5176 samples = LLVMBuildExtractElement(
5177 builder, res,
5178 LLVMConstInt(ctx->i32, 3, 0), "");
5179 samples = LLVMBuildLShr(builder, samples,
5180 LLVMConstInt(ctx->i32, 16, 0), "");
5181 samples = LLVMBuildAnd(builder, samples,
5182 LLVMConstInt(ctx->i32, 0xf, 0), "");
5183 samples = LLVMBuildShl(builder, ctx->i32_1,
5184 samples, "");
5185
5186 emit_data->output[emit_data->chan] = samples;
5187 }
5188
5189 static void si_llvm_emit_ddxy(
5190 const struct lp_build_tgsi_action *action,
5191 struct lp_build_tgsi_context *bld_base,
5192 struct lp_build_emit_data *emit_data)
5193 {
5194 struct si_shader_context *ctx = si_shader_context(bld_base);
5195 struct gallivm_state *gallivm = &ctx->gallivm;
5196 unsigned opcode = emit_data->info->opcode;
5197 LLVMValueRef val;
5198 int idx;
5199 unsigned mask;
5200
5201 if (opcode == TGSI_OPCODE_DDX_FINE)
5202 mask = AC_TID_MASK_LEFT;
5203 else if (opcode == TGSI_OPCODE_DDY_FINE)
5204 mask = AC_TID_MASK_TOP;
5205 else
5206 mask = AC_TID_MASK_TOP_LEFT;
5207
5208 /* for DDX we want to next X pixel, DDY next Y pixel. */
5209 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5210
5211 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5212 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5213 mask, idx, ctx->lds, val);
5214 emit_data->output[emit_data->chan] = val;
5215 }
5216
5217 /*
5218 * this takes an I,J coordinate pair,
5219 * and works out the X and Y derivatives.
5220 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5221 */
5222 static LLVMValueRef si_llvm_emit_ddxy_interp(
5223 struct lp_build_tgsi_context *bld_base,
5224 LLVMValueRef interp_ij)
5225 {
5226 struct si_shader_context *ctx = si_shader_context(bld_base);
5227 struct gallivm_state *gallivm = &ctx->gallivm;
5228 LLVMValueRef result[4], a;
5229 unsigned i;
5230
5231 for (i = 0; i < 2; i++) {
5232 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5233 LLVMConstInt(ctx->i32, i, 0), "");
5234 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5235 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5236 }
5237
5238 return lp_build_gather_values(gallivm, result, 4);
5239 }
5240
5241 static void interp_fetch_args(
5242 struct lp_build_tgsi_context *bld_base,
5243 struct lp_build_emit_data *emit_data)
5244 {
5245 struct si_shader_context *ctx = si_shader_context(bld_base);
5246 struct gallivm_state *gallivm = &ctx->gallivm;
5247 const struct tgsi_full_instruction *inst = emit_data->inst;
5248
5249 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5250 /* offset is in second src, first two channels */
5251 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5252 emit_data->inst, 1,
5253 TGSI_CHAN_X);
5254 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5255 emit_data->inst, 1,
5256 TGSI_CHAN_Y);
5257 emit_data->arg_count = 2;
5258 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5259 LLVMValueRef sample_position;
5260 LLVMValueRef sample_id;
5261 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5262
5263 /* fetch sample ID, then fetch its sample position,
5264 * and place into first two channels.
5265 */
5266 sample_id = lp_build_emit_fetch(bld_base,
5267 emit_data->inst, 1, TGSI_CHAN_X);
5268 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5269 ctx->i32, "");
5270 sample_position = load_sample_position(ctx, sample_id);
5271
5272 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5273 sample_position,
5274 ctx->i32_0, "");
5275
5276 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5277 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5278 sample_position,
5279 ctx->i32_1, "");
5280 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5281 emit_data->arg_count = 2;
5282 }
5283 }
5284
5285 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5286 struct lp_build_tgsi_context *bld_base,
5287 struct lp_build_emit_data *emit_data)
5288 {
5289 struct si_shader_context *ctx = si_shader_context(bld_base);
5290 struct si_shader *shader = ctx->shader;
5291 struct gallivm_state *gallivm = &ctx->gallivm;
5292 LLVMValueRef interp_param;
5293 const struct tgsi_full_instruction *inst = emit_data->inst;
5294 int input_index = inst->Src[0].Register.Index;
5295 int chan;
5296 int i;
5297 LLVMValueRef attr_number;
5298 LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5299 int interp_param_idx;
5300 unsigned interp = shader->selector->info.input_interpolate[input_index];
5301 unsigned location;
5302
5303 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5304
5305 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5306 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5307 location = TGSI_INTERPOLATE_LOC_CENTER;
5308 else
5309 location = TGSI_INTERPOLATE_LOC_CENTROID;
5310
5311 interp_param_idx = lookup_interp_param_index(interp, location);
5312 if (interp_param_idx == -1)
5313 return;
5314 else if (interp_param_idx)
5315 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5316 else
5317 interp_param = NULL;
5318
5319 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5320
5321 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5322 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5323 LLVMValueRef ij_out[2];
5324 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5325
5326 /*
5327 * take the I then J parameters, and the DDX/Y for it, and
5328 * calculate the IJ inputs for the interpolator.
5329 * temp1 = ddx * offset/sample.x + I;
5330 * interp_param.I = ddy * offset/sample.y + temp1;
5331 * temp1 = ddx * offset/sample.x + J;
5332 * interp_param.J = ddy * offset/sample.y + temp1;
5333 */
5334 for (i = 0; i < 2; i++) {
5335 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5336 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5337 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5338 ddxy_out, ix_ll, "");
5339 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5340 ddxy_out, iy_ll, "");
5341 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5342 interp_param, ix_ll, "");
5343 LLVMValueRef temp1, temp2;
5344
5345 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5346 ctx->f32, "");
5347
5348 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5349
5350 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5351
5352 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5353
5354 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5355 }
5356 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5357 }
5358
5359 for (chan = 0; chan < 4; chan++) {
5360 LLVMValueRef llvm_chan;
5361 unsigned schan;
5362
5363 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5364 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5365
5366 if (interp_param) {
5367 interp_param = LLVMBuildBitCast(gallivm->builder,
5368 interp_param, LLVMVectorType(ctx->f32, 2), "");
5369 LLVMValueRef i = LLVMBuildExtractElement(
5370 gallivm->builder, interp_param, ctx->i32_0, "");
5371 LLVMValueRef j = LLVMBuildExtractElement(
5372 gallivm->builder, interp_param, ctx->i32_1, "");
5373 emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5374 llvm_chan, attr_number, params,
5375 i, j);
5376 } else {
5377 emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5378 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5379 llvm_chan, attr_number, params);
5380 }
5381 }
5382 }
5383
5384 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5385 LLVMValueRef value)
5386 {
5387 struct gallivm_state *gallivm = &ctx->gallivm;
5388 LLVMValueRef args[3] = {
5389 value,
5390 ctx->i32_0,
5391 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5392 };
5393
5394 /* We currently have no other way to prevent LLVM from lifting the icmp
5395 * calls to a dominating basic block.
5396 */
5397 emit_optimization_barrier(ctx, &args[0]);
5398
5399 if (LLVMTypeOf(args[0]) != ctx->i32)
5400 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5401
5402 return lp_build_intrinsic(gallivm->builder,
5403 "llvm.amdgcn.icmp.i32",
5404 ctx->i64, args, 3,
5405 LP_FUNC_ATTR_NOUNWIND |
5406 LP_FUNC_ATTR_READNONE |
5407 LP_FUNC_ATTR_CONVERGENT);
5408 }
5409
5410 static void vote_all_emit(
5411 const struct lp_build_tgsi_action *action,
5412 struct lp_build_tgsi_context *bld_base,
5413 struct lp_build_emit_data *emit_data)
5414 {
5415 struct si_shader_context *ctx = si_shader_context(bld_base);
5416 struct gallivm_state *gallivm = &ctx->gallivm;
5417 LLVMValueRef active_set, vote_set;
5418 LLVMValueRef tmp;
5419
5420 active_set = si_emit_ballot(ctx, ctx->i32_1);
5421 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5422
5423 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5424 emit_data->output[emit_data->chan] =
5425 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5426 }
5427
5428 static void vote_any_emit(
5429 const struct lp_build_tgsi_action *action,
5430 struct lp_build_tgsi_context *bld_base,
5431 struct lp_build_emit_data *emit_data)
5432 {
5433 struct si_shader_context *ctx = si_shader_context(bld_base);
5434 struct gallivm_state *gallivm = &ctx->gallivm;
5435 LLVMValueRef vote_set;
5436 LLVMValueRef tmp;
5437
5438 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5439
5440 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5441 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5442 emit_data->output[emit_data->chan] =
5443 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5444 }
5445
5446 static void vote_eq_emit(
5447 const struct lp_build_tgsi_action *action,
5448 struct lp_build_tgsi_context *bld_base,
5449 struct lp_build_emit_data *emit_data)
5450 {
5451 struct si_shader_context *ctx = si_shader_context(bld_base);
5452 struct gallivm_state *gallivm = &ctx->gallivm;
5453 LLVMValueRef active_set, vote_set;
5454 LLVMValueRef all, none, tmp;
5455
5456 active_set = si_emit_ballot(ctx, ctx->i32_1);
5457 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5458
5459 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5460 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5461 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5462 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5463 emit_data->output[emit_data->chan] =
5464 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5465 }
5466
5467 static void ballot_emit(
5468 const struct lp_build_tgsi_action *action,
5469 struct lp_build_tgsi_context *bld_base,
5470 struct lp_build_emit_data *emit_data)
5471 {
5472 struct si_shader_context *ctx = si_shader_context(bld_base);
5473 LLVMBuilderRef builder = ctx->gallivm.builder;
5474 LLVMValueRef tmp;
5475
5476 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5477 tmp = si_emit_ballot(ctx, tmp);
5478 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5479
5480 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5481 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5482 }
5483
5484 static void read_invoc_fetch_args(
5485 struct lp_build_tgsi_context *bld_base,
5486 struct lp_build_emit_data *emit_data)
5487 {
5488 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5489 0, emit_data->src_chan);
5490
5491 /* Always read the source invocation (= lane) from the X channel. */
5492 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5493 1, TGSI_CHAN_X);
5494 emit_data->arg_count = 2;
5495 }
5496
5497 static void read_lane_emit(
5498 const struct lp_build_tgsi_action *action,
5499 struct lp_build_tgsi_context *bld_base,
5500 struct lp_build_emit_data *emit_data)
5501 {
5502 struct si_shader_context *ctx = si_shader_context(bld_base);
5503 LLVMBuilderRef builder = ctx->gallivm.builder;
5504
5505 /* We currently have no other way to prevent LLVM from lifting the icmp
5506 * calls to a dominating basic block.
5507 */
5508 emit_optimization_barrier(ctx, &emit_data->args[0]);
5509
5510 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5511 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5512 ctx->i32, "");
5513 }
5514
5515 emit_data->output[emit_data->chan] =
5516 ac_build_intrinsic(&ctx->ac, action->intr_name,
5517 ctx->i32, emit_data->args, emit_data->arg_count,
5518 AC_FUNC_ATTR_READNONE |
5519 AC_FUNC_ATTR_CONVERGENT);
5520 }
5521
5522 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5523 struct lp_build_emit_data *emit_data)
5524 {
5525 struct si_shader_context *ctx = si_shader_context(bld_base);
5526 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5527 LLVMValueRef imm;
5528 unsigned stream;
5529
5530 assert(src0.File == TGSI_FILE_IMMEDIATE);
5531
5532 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5533 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5534 return stream;
5535 }
5536
5537 /* Emit one vertex from the geometry shader */
5538 static void si_llvm_emit_vertex(
5539 const struct lp_build_tgsi_action *action,
5540 struct lp_build_tgsi_context *bld_base,
5541 struct lp_build_emit_data *emit_data)
5542 {
5543 struct si_shader_context *ctx = si_shader_context(bld_base);
5544 struct lp_build_context *uint = &bld_base->uint_bld;
5545 struct si_shader *shader = ctx->shader;
5546 struct tgsi_shader_info *info = &shader->selector->info;
5547 struct gallivm_state *gallivm = &ctx->gallivm;
5548 struct lp_build_if_state if_state;
5549 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5550 ctx->param_gs2vs_offset);
5551 LLVMValueRef gs_next_vertex;
5552 LLVMValueRef can_emit, kill;
5553 unsigned chan, offset;
5554 int i;
5555 unsigned stream;
5556
5557 stream = si_llvm_get_stream(bld_base, emit_data);
5558
5559 /* Write vertex attribute values to GSVS ring */
5560 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5561 ctx->gs_next_vertex[stream],
5562 "");
5563
5564 /* If this thread has already emitted the declared maximum number of
5565 * vertices, skip the write: excessive vertex emissions are not
5566 * supposed to have any effect.
5567 *
5568 * If the shader has no writes to memory, kill it instead. This skips
5569 * further memory loads and may allow LLVM to skip to the end
5570 * altogether.
5571 */
5572 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5573 LLVMConstInt(ctx->i32,
5574 shader->selector->gs_max_out_vertices, 0), "");
5575
5576 bool use_kill = !info->writes_memory;
5577 if (use_kill) {
5578 kill = lp_build_select(&bld_base->base, can_emit,
5579 LLVMConstReal(ctx->f32, 1.0f),
5580 LLVMConstReal(ctx->f32, -1.0f));
5581
5582 ac_build_kill(&ctx->ac, kill);
5583 } else {
5584 lp_build_if(&if_state, gallivm, can_emit);
5585 }
5586
5587 offset = 0;
5588 for (i = 0; i < info->num_outputs; i++) {
5589 LLVMValueRef *out_ptr = ctx->outputs[i];
5590
5591 for (chan = 0; chan < 4; chan++) {
5592 if (!(info->output_usagemask[i] & (1 << chan)) ||
5593 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5594 continue;
5595
5596 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5597 LLVMValueRef voffset =
5598 LLVMConstInt(ctx->i32, offset *
5599 shader->selector->gs_max_out_vertices, 0);
5600 offset++;
5601
5602 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5603 voffset = lp_build_mul_imm(uint, voffset, 4);
5604
5605 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5606
5607 ac_build_buffer_store_dword(&ctx->ac,
5608 ctx->gsvs_ring[stream],
5609 out_val, 1,
5610 voffset, soffset, 0,
5611 1, 1, true, true);
5612 }
5613 }
5614
5615 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5616 ctx->i32_1);
5617
5618 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5619
5620 /* Signal vertex emission */
5621 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5622 si_get_gs_wave_id(ctx));
5623 if (!use_kill)
5624 lp_build_endif(&if_state);
5625 }
5626
5627 /* Cut one primitive from the geometry shader */
5628 static void si_llvm_emit_primitive(
5629 const struct lp_build_tgsi_action *action,
5630 struct lp_build_tgsi_context *bld_base,
5631 struct lp_build_emit_data *emit_data)
5632 {
5633 struct si_shader_context *ctx = si_shader_context(bld_base);
5634 unsigned stream;
5635
5636 /* Signal primitive cut */
5637 stream = si_llvm_get_stream(bld_base, emit_data);
5638 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5639 si_get_gs_wave_id(ctx));
5640 }
5641
5642 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5643 struct lp_build_tgsi_context *bld_base,
5644 struct lp_build_emit_data *emit_data)
5645 {
5646 struct si_shader_context *ctx = si_shader_context(bld_base);
5647 struct gallivm_state *gallivm = &ctx->gallivm;
5648
5649 /* SI only (thanks to a hw bug workaround):
5650 * The real barrier instruction isn’t needed, because an entire patch
5651 * always fits into a single wave.
5652 */
5653 if (HAVE_LLVM >= 0x0309 &&
5654 ctx->screen->b.chip_class == SI &&
5655 ctx->type == PIPE_SHADER_TESS_CTRL) {
5656 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5657 return;
5658 }
5659
5660 lp_build_intrinsic(gallivm->builder,
5661 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5662 : "llvm.AMDGPU.barrier.local",
5663 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5664 }
5665
5666 static const struct lp_build_tgsi_action tex_action = {
5667 .fetch_args = tex_fetch_args,
5668 .emit = build_tex_intrinsic,
5669 };
5670
5671 static const struct lp_build_tgsi_action interp_action = {
5672 .fetch_args = interp_fetch_args,
5673 .emit = build_interp_intrinsic,
5674 };
5675
5676 static void si_create_function(struct si_shader_context *ctx,
5677 const char *name,
5678 LLVMTypeRef *returns, unsigned num_returns,
5679 LLVMTypeRef *params, unsigned num_params,
5680 int last_sgpr)
5681 {
5682 int i;
5683
5684 si_llvm_create_func(ctx, name, returns, num_returns,
5685 params, num_params);
5686 si_llvm_shader_type(ctx->main_fn, ctx->type);
5687 ctx->return_value = LLVMGetUndef(ctx->return_type);
5688
5689 for (i = 0; i <= last_sgpr; ++i) {
5690 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5691
5692 /* The combination of:
5693 * - ByVal
5694 * - dereferenceable
5695 * - invariant.load
5696 * allows the optimization passes to move loads and reduces
5697 * SGPR spilling significantly.
5698 */
5699 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5700 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5701 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5702 ac_add_attr_dereferenceable(P, UINT64_MAX);
5703 } else
5704 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5705 }
5706
5707 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5708 "no-signed-zeros-fp-math",
5709 "true");
5710
5711 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5712 /* These were copied from some LLVM test. */
5713 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5714 "less-precise-fpmad",
5715 "true");
5716 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5717 "no-infs-fp-math",
5718 "true");
5719 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5720 "no-nans-fp-math",
5721 "true");
5722 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5723 "unsafe-fp-math",
5724 "true");
5725 }
5726 }
5727
5728 static void declare_streamout_params(struct si_shader_context *ctx,
5729 struct pipe_stream_output_info *so,
5730 LLVMTypeRef *params, LLVMTypeRef i32,
5731 unsigned *num_params)
5732 {
5733 int i;
5734
5735 /* Streamout SGPRs. */
5736 if (so->num_outputs) {
5737 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5738 params[ctx->param_streamout_config = (*num_params)++] = i32;
5739 else
5740 ctx->param_streamout_config = *num_params - 1;
5741
5742 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5743 }
5744 /* A streamout buffer offset is loaded if the stride is non-zero. */
5745 for (i = 0; i < 4; i++) {
5746 if (!so->stride[i])
5747 continue;
5748
5749 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5750 }
5751 }
5752
5753 static unsigned llvm_get_type_size(LLVMTypeRef type)
5754 {
5755 LLVMTypeKind kind = LLVMGetTypeKind(type);
5756
5757 switch (kind) {
5758 case LLVMIntegerTypeKind:
5759 return LLVMGetIntTypeWidth(type) / 8;
5760 case LLVMFloatTypeKind:
5761 return 4;
5762 case LLVMPointerTypeKind:
5763 return 8;
5764 case LLVMVectorTypeKind:
5765 return LLVMGetVectorSize(type) *
5766 llvm_get_type_size(LLVMGetElementType(type));
5767 case LLVMArrayTypeKind:
5768 return LLVMGetArrayLength(type) *
5769 llvm_get_type_size(LLVMGetElementType(type));
5770 default:
5771 assert(0);
5772 return 0;
5773 }
5774 }
5775
5776 static void declare_lds_as_pointer(struct si_shader_context *ctx)
5777 {
5778 struct gallivm_state *gallivm = &ctx->gallivm;
5779
5780 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5781 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5782 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5783 "lds");
5784 }
5785
5786 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5787 {
5788 const unsigned *properties = shader->selector->info.properties;
5789 unsigned max_work_group_size =
5790 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5791 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5792 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5793
5794 if (!max_work_group_size) {
5795 /* This is a variable group size compute shader,
5796 * compile it for the maximum possible group size.
5797 */
5798 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5799 }
5800 return max_work_group_size;
5801 }
5802
5803 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5804 LLVMTypeRef *params,
5805 unsigned *num_params,
5806 bool assign_params)
5807 {
5808 params[(*num_params)++] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5809 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5810 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5811 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5812
5813 if (assign_params) {
5814 ctx->param_const_buffers = *num_params - 4;
5815 ctx->param_samplers = *num_params - 3;
5816 ctx->param_images = *num_params - 2;
5817 ctx->param_shader_buffers = *num_params - 1;
5818 }
5819 }
5820
5821 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5822 LLVMTypeRef *params,
5823 unsigned *num_params)
5824 {
5825 params[ctx->param_rw_buffers = (*num_params)++] =
5826 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5827 declare_per_stage_desc_pointers(ctx, params, num_params, true);
5828 }
5829
5830 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
5831 LLVMTypeRef *params,
5832 unsigned *num_params)
5833 {
5834 params[ctx->param_vertex_buffers = (*num_params)++] =
5835 const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5836 params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
5837 params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
5838 params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
5839 params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
5840 }
5841
5842 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
5843 LLVMTypeRef *params, unsigned *num_params,
5844 unsigned *num_prolog_vgprs)
5845 {
5846 struct si_shader *shader = ctx->shader;
5847
5848 params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
5849 params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
5850 params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
5851 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5852
5853 if (!shader->is_gs_copy_shader) {
5854 /* Vertex load indices. */
5855 ctx->param_vertex_index0 = (*num_params);
5856 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
5857 params[(*num_params)++] = ctx->i32;
5858 *num_prolog_vgprs += shader->selector->info.num_inputs;
5859 }
5860 }
5861
5862 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
5863 LLVMTypeRef *params, unsigned *num_params)
5864 {
5865 params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
5866 params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
5867 params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
5868 params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
5869 }
5870
5871 enum {
5872 /* Convenient merged shader definitions. */
5873 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
5874 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
5875 };
5876
5877 static void create_function(struct si_shader_context *ctx)
5878 {
5879 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5880 struct gallivm_state *gallivm = &ctx->gallivm;
5881 struct si_shader *shader = ctx->shader;
5882 LLVMTypeRef params[100]; /* just make it large enough */
5883 LLVMTypeRef returns[16+32*4];
5884 unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5885 unsigned num_returns = 0;
5886 unsigned num_prolog_vgprs = 0;
5887 unsigned type = ctx->type;
5888
5889 /* Set MERGED shaders. */
5890 if (ctx->screen->b.chip_class >= GFX9) {
5891 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
5892 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
5893 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
5894 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
5895 }
5896
5897 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
5898
5899 switch (type) {
5900 case PIPE_SHADER_VERTEX:
5901 declare_default_desc_pointers(ctx, params, &num_params);
5902 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5903
5904 if (shader->key.as_es) {
5905 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5906 } else if (shader->key.as_ls) {
5907 /* no extra parameters */
5908 } else {
5909 if (shader->is_gs_copy_shader)
5910 num_params = ctx->param_rw_buffers + 1;
5911
5912 /* The locations of the other parameters are assigned dynamically. */
5913 declare_streamout_params(ctx, &shader->selector->so,
5914 params, ctx->i32, &num_params);
5915 }
5916
5917 last_sgpr = num_params-1;
5918
5919 /* VGPRs */
5920 declare_vs_input_vgprs(ctx, params, &num_params,
5921 &num_prolog_vgprs);
5922
5923 /* PrimitiveID output. */
5924 if (!shader->is_gs_copy_shader &&
5925 !shader->key.as_es && !shader->key.as_ls) {
5926 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5927 returns[num_returns++] = ctx->f32;
5928 }
5929 break;
5930
5931 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
5932 declare_default_desc_pointers(ctx, params, &num_params);
5933 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5934 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5935 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5936 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5937 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5938 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5939 last_sgpr = num_params - 1;
5940
5941 /* VGPRs */
5942 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5943 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5944
5945 /* param_tcs_offchip_offset and param_tcs_factor_offset are
5946 * placed after the user SGPRs.
5947 */
5948 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
5949 returns[num_returns++] = ctx->i32; /* SGPRs */
5950 for (i = 0; i < 3; i++)
5951 returns[num_returns++] = ctx->f32; /* VGPRs */
5952 break;
5953
5954 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
5955 /* Merged stages have 8 system SGPRs at the beginning. */
5956 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
5957 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5958 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5959 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
5960 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5961 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
5962 params[num_params++] = ctx->i32; /* unused */
5963 params[num_params++] = ctx->i32; /* unused */
5964
5965 params[num_params++] = ctx->i32; /* unused */
5966 params[num_params++] = ctx->i32; /* unused */
5967 declare_per_stage_desc_pointers(ctx, params, &num_params,
5968 ctx->type == PIPE_SHADER_VERTEX);
5969 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5970
5971 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5972 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5973 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5974 params[num_params++] = ctx->i32; /* unused */
5975
5976 declare_per_stage_desc_pointers(ctx, params, &num_params,
5977 ctx->type == PIPE_SHADER_TESS_CTRL);
5978 last_sgpr = num_params - 1;
5979
5980 /* VGPRs (first TCS, then VS) */
5981 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5982 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5983
5984 if (ctx->type == PIPE_SHADER_VERTEX) {
5985 declare_vs_input_vgprs(ctx, params, &num_params,
5986 &num_prolog_vgprs);
5987
5988 /* LS return values are inputs to the TCS main shader part. */
5989 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
5990 returns[num_returns++] = ctx->i32; /* SGPRs */
5991 for (i = 0; i < 2; i++)
5992 returns[num_returns++] = ctx->f32; /* VGPRs */
5993 } else {
5994 /* TCS return values are inputs to the TCS epilog.
5995 *
5996 * param_tcs_offchip_offset, param_tcs_factor_offset,
5997 * param_tcs_offchip_layout, and param_rw_buffers
5998 * should be passed to the epilog.
5999 */
6000 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT; i++)
6001 returns[num_returns++] = ctx->i32; /* SGPRs */
6002 for (i = 0; i < 3; i++)
6003 returns[num_returns++] = ctx->f32; /* VGPRs */
6004 }
6005 break;
6006
6007 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
6008 /* Merged stages have 8 system SGPRs at the beginning. */
6009 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
6010 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
6011 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6012 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
6013 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6014 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
6015 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
6016 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
6017
6018 params[num_params++] = ctx->i32; /* unused */
6019 params[num_params++] = ctx->i32; /* unused */
6020 declare_per_stage_desc_pointers(ctx, params, &num_params,
6021 (ctx->type == PIPE_SHADER_VERTEX ||
6022 ctx->type == PIPE_SHADER_TESS_EVAL));
6023 if (ctx->type == PIPE_SHADER_VERTEX) {
6024 declare_vs_specific_input_sgprs(ctx, params, &num_params);
6025 } else {
6026 /* TESS_EVAL (and also GEOMETRY):
6027 * Declare as many input SGPRs as the VS has. */
6028 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6029 params[num_params++] = ctx->i32; /* unused */
6030 params[num_params++] = ctx->i32; /* unused */
6031 params[num_params++] = ctx->i32; /* unused */
6032 params[num_params++] = ctx->i32; /* unused */
6033 params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
6034 }
6035
6036 declare_per_stage_desc_pointers(ctx, params, &num_params,
6037 ctx->type == PIPE_SHADER_GEOMETRY);
6038 last_sgpr = num_params - 1;
6039
6040 /* VGPRs (first GS, then VS/TES) */
6041 params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
6042 params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
6043 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6044 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6045 params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
6046
6047 if (ctx->type == PIPE_SHADER_VERTEX) {
6048 declare_vs_input_vgprs(ctx, params, &num_params,
6049 &num_prolog_vgprs);
6050 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
6051 declare_tes_input_vgprs(ctx, params, &num_params);
6052 }
6053
6054 if (ctx->type == PIPE_SHADER_VERTEX ||
6055 ctx->type == PIPE_SHADER_TESS_EVAL) {
6056 /* ES return values are inputs to GS. */
6057 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
6058 returns[num_returns++] = ctx->i32; /* SGPRs */
6059 for (i = 0; i < 5; i++)
6060 returns[num_returns++] = ctx->f32; /* VGPRs */
6061 }
6062 break;
6063
6064 case PIPE_SHADER_TESS_EVAL:
6065 declare_default_desc_pointers(ctx, params, &num_params);
6066 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6067
6068 if (shader->key.as_es) {
6069 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6070 params[num_params++] = ctx->i32;
6071 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
6072 } else {
6073 params[num_params++] = ctx->i32;
6074 declare_streamout_params(ctx, &shader->selector->so,
6075 params, ctx->i32, &num_params);
6076 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6077 }
6078 last_sgpr = num_params - 1;
6079
6080 /* VGPRs */
6081 declare_tes_input_vgprs(ctx, params, &num_params);
6082
6083 /* PrimitiveID output. */
6084 if (!shader->key.as_es)
6085 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
6086 returns[num_returns++] = ctx->f32;
6087 break;
6088
6089 case PIPE_SHADER_GEOMETRY:
6090 declare_default_desc_pointers(ctx, params, &num_params);
6091 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6092 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
6093 last_sgpr = num_params - 1;
6094
6095 /* VGPRs */
6096 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
6097 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
6098 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6099 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
6100 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
6101 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
6102 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
6103 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6104 break;
6105
6106 case PIPE_SHADER_FRAGMENT:
6107 declare_default_desc_pointers(ctx, params, &num_params);
6108 params[SI_PARAM_ALPHA_REF] = ctx->f32;
6109 params[SI_PARAM_PRIM_MASK] = ctx->i32;
6110 last_sgpr = SI_PARAM_PRIM_MASK;
6111 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
6112 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
6113 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
6114 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
6115 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
6116 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
6117 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
6118 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
6119 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
6120 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
6121 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
6122 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
6123 params[SI_PARAM_FRONT_FACE] = ctx->i32;
6124 shader->info.face_vgpr_index = 20;
6125 params[SI_PARAM_ANCILLARY] = ctx->i32;
6126 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
6127 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
6128 num_params = SI_PARAM_POS_FIXED_PT+1;
6129
6130 /* Color inputs from the prolog. */
6131 if (shader->selector->info.colors_read) {
6132 unsigned num_color_elements =
6133 util_bitcount(shader->selector->info.colors_read);
6134
6135 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
6136 for (i = 0; i < num_color_elements; i++)
6137 params[num_params++] = ctx->f32;
6138
6139 num_prolog_vgprs += num_color_elements;
6140 }
6141
6142 /* Outputs for the epilog. */
6143 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
6144 num_returns =
6145 num_return_sgprs +
6146 util_bitcount(shader->selector->info.colors_written) * 4 +
6147 shader->selector->info.writes_z +
6148 shader->selector->info.writes_stencil +
6149 shader->selector->info.writes_samplemask +
6150 1 /* SampleMaskIn */;
6151
6152 num_returns = MAX2(num_returns,
6153 num_return_sgprs +
6154 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6155
6156 for (i = 0; i < num_return_sgprs; i++)
6157 returns[i] = ctx->i32;
6158 for (; i < num_returns; i++)
6159 returns[i] = ctx->f32;
6160 break;
6161
6162 case PIPE_SHADER_COMPUTE:
6163 declare_default_desc_pointers(ctx, params, &num_params);
6164 params[SI_PARAM_GRID_SIZE] = v3i32;
6165 params[SI_PARAM_BLOCK_SIZE] = v3i32;
6166 params[SI_PARAM_BLOCK_ID] = v3i32;
6167 last_sgpr = SI_PARAM_BLOCK_ID;
6168
6169 params[SI_PARAM_THREAD_ID] = v3i32;
6170 num_params = SI_PARAM_THREAD_ID + 1;
6171 break;
6172 default:
6173 assert(0 && "unimplemented shader");
6174 return;
6175 }
6176
6177 assert(num_params <= ARRAY_SIZE(params));
6178
6179 si_create_function(ctx, "main", returns, num_returns, params,
6180 num_params, last_sgpr);
6181
6182 /* Reserve register locations for VGPR inputs the PS prolog may need. */
6183 if (ctx->type == PIPE_SHADER_FRAGMENT &&
6184 ctx->separate_prolog) {
6185 si_llvm_add_attribute(ctx->main_fn,
6186 "InitialPSInputAddr",
6187 S_0286D0_PERSP_SAMPLE_ENA(1) |
6188 S_0286D0_PERSP_CENTER_ENA(1) |
6189 S_0286D0_PERSP_CENTROID_ENA(1) |
6190 S_0286D0_LINEAR_SAMPLE_ENA(1) |
6191 S_0286D0_LINEAR_CENTER_ENA(1) |
6192 S_0286D0_LINEAR_CENTROID_ENA(1) |
6193 S_0286D0_FRONT_FACE_ENA(1) |
6194 S_0286D0_POS_FIXED_PT_ENA(1));
6195 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
6196 si_llvm_add_attribute(ctx->main_fn,
6197 "amdgpu-max-work-group-size",
6198 si_get_max_workgroup_size(shader));
6199 }
6200
6201 shader->info.num_input_sgprs = 0;
6202 shader->info.num_input_vgprs = 0;
6203
6204 for (i = 0; i <= last_sgpr; ++i)
6205 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
6206
6207 for (; i < num_params; ++i)
6208 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
6209
6210 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
6211 shader->info.num_input_vgprs -= num_prolog_vgprs;
6212
6213 if (!ctx->screen->has_ds_bpermute &&
6214 bld_base->info &&
6215 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
6216 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
6217 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
6218 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
6219 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
6220 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
6221 ctx->lds =
6222 LLVMAddGlobalInAddressSpace(gallivm->module,
6223 LLVMArrayType(ctx->i32, 64),
6224 "ddxy_lds",
6225 LOCAL_ADDR_SPACE);
6226
6227 if (shader->key.as_ls ||
6228 ctx->type == PIPE_SHADER_TESS_CTRL ||
6229 /* GFX9 has the ESGS ring buffer in LDS. */
6230 (ctx->screen->b.chip_class >= GFX9 &&
6231 (shader->key.as_es ||
6232 ctx->type == PIPE_SHADER_GEOMETRY)))
6233 declare_lds_as_pointer(ctx);
6234 }
6235
6236 /**
6237 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
6238 * for later use.
6239 */
6240 static void preload_ring_buffers(struct si_shader_context *ctx)
6241 {
6242 struct gallivm_state *gallivm = &ctx->gallivm;
6243 LLVMBuilderRef builder = gallivm->builder;
6244
6245 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
6246 ctx->param_rw_buffers);
6247
6248 if (ctx->screen->b.chip_class <= VI &&
6249 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
6250 unsigned ring =
6251 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
6252 : SI_ES_RING_ESGS;
6253 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
6254
6255 ctx->esgs_ring =
6256 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6257 }
6258
6259 if (ctx->shader->is_gs_copy_shader) {
6260 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6261
6262 ctx->gsvs_ring[0] =
6263 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6264 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
6265 const struct si_shader_selector *sel = ctx->shader->selector;
6266 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6267 LLVMValueRef base_ring;
6268
6269 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6270
6271 /* The conceptual layout of the GSVS ring is
6272 * v0c0 .. vLv0 v0c1 .. vLc1 ..
6273 * but the real memory layout is swizzled across
6274 * threads:
6275 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
6276 * t16v0c0 ..
6277 * Override the buffer descriptor accordingly.
6278 */
6279 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
6280 uint64_t stream_offset = 0;
6281
6282 for (unsigned stream = 0; stream < 4; ++stream) {
6283 unsigned num_components;
6284 unsigned stride;
6285 unsigned num_records;
6286 LLVMValueRef ring, tmp;
6287
6288 num_components = sel->info.num_stream_output_components[stream];
6289 if (!num_components)
6290 continue;
6291
6292 stride = 4 * num_components * sel->gs_max_out_vertices;
6293
6294 /* Limit on the stride field for <= CIK. */
6295 assert(stride < (1 << 14));
6296
6297 num_records = 64;
6298
6299 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
6300 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
6301 tmp = LLVMBuildAdd(builder, tmp,
6302 LLVMConstInt(ctx->i64,
6303 stream_offset, 0), "");
6304 stream_offset += stride * 64;
6305
6306 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
6307 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
6308 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
6309 tmp = LLVMBuildOr(builder, tmp,
6310 LLVMConstInt(ctx->i32,
6311 S_008F04_STRIDE(stride) |
6312 S_008F04_SWIZZLE_ENABLE(1), 0), "");
6313 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
6314 ring = LLVMBuildInsertElement(builder, ring,
6315 LLVMConstInt(ctx->i32, num_records, 0),
6316 LLVMConstInt(ctx->i32, 2, 0), "");
6317 ring = LLVMBuildInsertElement(builder, ring,
6318 LLVMConstInt(ctx->i32,
6319 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
6320 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6321 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
6322 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
6323 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6324 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
6325 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
6326 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
6327 S_008F0C_ADD_TID_ENABLE(1),
6328 0),
6329 LLVMConstInt(ctx->i32, 3, 0), "");
6330 ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
6331
6332 ctx->gsvs_ring[stream] = ring;
6333 }
6334 }
6335 }
6336
6337 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6338 LLVMValueRef param_rw_buffers,
6339 unsigned param_pos_fixed_pt)
6340 {
6341 struct gallivm_state *gallivm = &ctx->gallivm;
6342 LLVMBuilderRef builder = gallivm->builder;
6343 LLVMValueRef slot, desc, offset, row, bit, address[2];
6344
6345 /* Use the fixed-point gl_FragCoord input.
6346 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6347 * per coordinate to get the repeating effect.
6348 */
6349 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6350 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6351
6352 /* Load the buffer descriptor. */
6353 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6354 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6355
6356 /* The stipple pattern is 32x32, each row has 32 bits. */
6357 offset = LLVMBuildMul(builder, address[1],
6358 LLVMConstInt(ctx->i32, 4, 0), "");
6359 row = buffer_load_const(ctx, desc, offset);
6360 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6361 bit = LLVMBuildLShr(builder, row, address[0], "");
6362 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6363
6364 /* The intrinsic kills the thread if arg < 0. */
6365 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6366 LLVMConstReal(ctx->f32, -1), "");
6367 ac_build_kill(&ctx->ac, bit);
6368 }
6369
6370 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6371 struct si_shader_config *conf,
6372 unsigned symbol_offset)
6373 {
6374 unsigned i;
6375 const unsigned char *config =
6376 ac_shader_binary_config_start(binary, symbol_offset);
6377 bool really_needs_scratch = false;
6378
6379 /* LLVM adds SGPR spills to the scratch size.
6380 * Find out if we really need the scratch buffer.
6381 */
6382 for (i = 0; i < binary->reloc_count; i++) {
6383 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6384
6385 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6386 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6387 really_needs_scratch = true;
6388 break;
6389 }
6390 }
6391
6392 /* XXX: We may be able to emit some of these values directly rather than
6393 * extracting fields to be emitted later.
6394 */
6395
6396 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
6397 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6398 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6399 switch (reg) {
6400 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6401 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6402 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6403 case R_00B848_COMPUTE_PGM_RSRC1:
6404 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6405 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6406 conf->float_mode = G_00B028_FLOAT_MODE(value);
6407 conf->rsrc1 = value;
6408 break;
6409 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6410 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6411 break;
6412 case R_00B84C_COMPUTE_PGM_RSRC2:
6413 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6414 conf->rsrc2 = value;
6415 break;
6416 case R_0286CC_SPI_PS_INPUT_ENA:
6417 conf->spi_ps_input_ena = value;
6418 break;
6419 case R_0286D0_SPI_PS_INPUT_ADDR:
6420 conf->spi_ps_input_addr = value;
6421 break;
6422 case R_0286E8_SPI_TMPRING_SIZE:
6423 case R_00B860_COMPUTE_TMPRING_SIZE:
6424 /* WAVESIZE is in units of 256 dwords. */
6425 if (really_needs_scratch)
6426 conf->scratch_bytes_per_wave =
6427 G_00B860_WAVESIZE(value) * 256 * 4;
6428 break;
6429 case 0x4: /* SPILLED_SGPRS */
6430 conf->spilled_sgprs = value;
6431 break;
6432 case 0x8: /* SPILLED_VGPRS */
6433 conf->spilled_vgprs = value;
6434 break;
6435 default:
6436 {
6437 static bool printed;
6438
6439 if (!printed) {
6440 fprintf(stderr, "Warning: LLVM emitted unknown "
6441 "config register: 0x%x\n", reg);
6442 printed = true;
6443 }
6444 }
6445 break;
6446 }
6447 }
6448
6449 if (!conf->spi_ps_input_addr)
6450 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6451 }
6452
6453 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6454 struct si_shader *shader,
6455 struct si_shader_config *config,
6456 uint64_t scratch_va)
6457 {
6458 unsigned i;
6459 uint32_t scratch_rsrc_dword0 = scratch_va;
6460 uint32_t scratch_rsrc_dword1 =
6461 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6462
6463 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6464 * correctly.
6465 */
6466 if (HAVE_LLVM >= 0x0309)
6467 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6468 else
6469 scratch_rsrc_dword1 |=
6470 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6471
6472 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6473 const struct ac_shader_reloc *reloc =
6474 &shader->binary.relocs[i];
6475 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6476 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6477 &scratch_rsrc_dword0, 4);
6478 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6479 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6480 &scratch_rsrc_dword1, 4);
6481 }
6482 }
6483 }
6484
6485 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6486 {
6487 unsigned size = shader->binary.code_size;
6488
6489 if (shader->prolog)
6490 size += shader->prolog->binary.code_size;
6491 if (shader->previous_stage)
6492 size += shader->previous_stage->binary.code_size;
6493 if (shader->prolog2)
6494 size += shader->prolog2->binary.code_size;
6495 if (shader->epilog)
6496 size += shader->epilog->binary.code_size;
6497 return size;
6498 }
6499
6500 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6501 {
6502 const struct ac_shader_binary *prolog =
6503 shader->prolog ? &shader->prolog->binary : NULL;
6504 const struct ac_shader_binary *previous_stage =
6505 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6506 const struct ac_shader_binary *prolog2 =
6507 shader->prolog2 ? &shader->prolog2->binary : NULL;
6508 const struct ac_shader_binary *epilog =
6509 shader->epilog ? &shader->epilog->binary : NULL;
6510 const struct ac_shader_binary *mainb = &shader->binary;
6511 unsigned bo_size = si_get_shader_binary_size(shader) +
6512 (!epilog ? mainb->rodata_size : 0);
6513 unsigned char *ptr;
6514
6515 assert(!prolog || !prolog->rodata_size);
6516 assert(!previous_stage || !previous_stage->rodata_size);
6517 assert(!prolog2 || !prolog2->rodata_size);
6518 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
6519 !mainb->rodata_size);
6520 assert(!epilog || !epilog->rodata_size);
6521
6522 /* GFX9 can fetch at most 128 bytes past the end of the shader.
6523 * Prevent VM faults.
6524 */
6525 if (sscreen->b.chip_class >= GFX9)
6526 bo_size += 128;
6527
6528 r600_resource_reference(&shader->bo, NULL);
6529 shader->bo = (struct r600_resource*)
6530 pipe_buffer_create(&sscreen->b.b, 0,
6531 PIPE_USAGE_IMMUTABLE,
6532 align(bo_size, SI_CPDMA_ALIGNMENT));
6533 if (!shader->bo)
6534 return -ENOMEM;
6535
6536 /* Upload. */
6537 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6538 PIPE_TRANSFER_READ_WRITE |
6539 PIPE_TRANSFER_UNSYNCHRONIZED);
6540
6541 if (prolog) {
6542 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6543 ptr += prolog->code_size;
6544 }
6545 if (previous_stage) {
6546 util_memcpy_cpu_to_le32(ptr, previous_stage->code,
6547 previous_stage->code_size);
6548 ptr += previous_stage->code_size;
6549 }
6550 if (prolog2) {
6551 util_memcpy_cpu_to_le32(ptr, prolog2->code, prolog2->code_size);
6552 ptr += prolog2->code_size;
6553 }
6554
6555 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6556 ptr += mainb->code_size;
6557
6558 if (epilog)
6559 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6560 else if (mainb->rodata_size > 0)
6561 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6562
6563 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6564 return 0;
6565 }
6566
6567 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6568 struct pipe_debug_callback *debug,
6569 const char *name, FILE *file)
6570 {
6571 char *line, *p;
6572 unsigned i, count;
6573
6574 if (binary->disasm_string) {
6575 fprintf(file, "Shader %s disassembly:\n", name);
6576 fprintf(file, "%s", binary->disasm_string);
6577
6578 if (debug && debug->debug_message) {
6579 /* Very long debug messages are cut off, so send the
6580 * disassembly one line at a time. This causes more
6581 * overhead, but on the plus side it simplifies
6582 * parsing of resulting logs.
6583 */
6584 pipe_debug_message(debug, SHADER_INFO,
6585 "Shader Disassembly Begin");
6586
6587 line = binary->disasm_string;
6588 while (*line) {
6589 p = util_strchrnul(line, '\n');
6590 count = p - line;
6591
6592 if (count) {
6593 pipe_debug_message(debug, SHADER_INFO,
6594 "%.*s", count, line);
6595 }
6596
6597 if (!*p)
6598 break;
6599 line = p + 1;
6600 }
6601
6602 pipe_debug_message(debug, SHADER_INFO,
6603 "Shader Disassembly End");
6604 }
6605 } else {
6606 fprintf(file, "Shader %s binary:\n", name);
6607 for (i = 0; i < binary->code_size; i += 4) {
6608 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6609 binary->code[i + 3], binary->code[i + 2],
6610 binary->code[i + 1], binary->code[i]);
6611 }
6612 }
6613 }
6614
6615 static void si_shader_dump_stats(struct si_screen *sscreen,
6616 struct si_shader *shader,
6617 struct pipe_debug_callback *debug,
6618 unsigned processor,
6619 FILE *file,
6620 bool check_debug_option)
6621 {
6622 struct si_shader_config *conf = &shader->config;
6623 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6624 unsigned code_size = si_get_shader_binary_size(shader);
6625 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6626 unsigned lds_per_wave = 0;
6627 unsigned max_simd_waves = 10;
6628
6629 /* Compute LDS usage for PS. */
6630 switch (processor) {
6631 case PIPE_SHADER_FRAGMENT:
6632 /* The minimum usage per wave is (num_inputs * 48). The maximum
6633 * usage is (num_inputs * 48 * 16).
6634 * We can get anything in between and it varies between waves.
6635 *
6636 * The 48 bytes per input for a single primitive is equal to
6637 * 4 bytes/component * 4 components/input * 3 points.
6638 *
6639 * Other stages don't know the size at compile time or don't
6640 * allocate LDS per wave, but instead they do it per thread group.
6641 */
6642 lds_per_wave = conf->lds_size * lds_increment +
6643 align(num_inputs * 48, lds_increment);
6644 break;
6645 case PIPE_SHADER_COMPUTE:
6646 if (shader->selector) {
6647 unsigned max_workgroup_size =
6648 si_get_max_workgroup_size(shader);
6649 lds_per_wave = (conf->lds_size * lds_increment) /
6650 DIV_ROUND_UP(max_workgroup_size, 64);
6651 }
6652 break;
6653 }
6654
6655 /* Compute the per-SIMD wave counts. */
6656 if (conf->num_sgprs) {
6657 if (sscreen->b.chip_class >= VI)
6658 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6659 else
6660 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6661 }
6662
6663 if (conf->num_vgprs)
6664 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6665
6666 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6667 * 16KB makes some SIMDs unoccupied). */
6668 if (lds_per_wave)
6669 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6670
6671 if (!check_debug_option ||
6672 r600_can_dump_shader(&sscreen->b, processor)) {
6673 if (processor == PIPE_SHADER_FRAGMENT) {
6674 fprintf(file, "*** SHADER CONFIG ***\n"
6675 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6676 "SPI_PS_INPUT_ENA = 0x%04x\n",
6677 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6678 }
6679
6680 fprintf(file, "*** SHADER STATS ***\n"
6681 "SGPRS: %d\n"
6682 "VGPRS: %d\n"
6683 "Spilled SGPRs: %d\n"
6684 "Spilled VGPRs: %d\n"
6685 "Private memory VGPRs: %d\n"
6686 "Code Size: %d bytes\n"
6687 "LDS: %d blocks\n"
6688 "Scratch: %d bytes per wave\n"
6689 "Max Waves: %d\n"
6690 "********************\n\n\n",
6691 conf->num_sgprs, conf->num_vgprs,
6692 conf->spilled_sgprs, conf->spilled_vgprs,
6693 conf->private_mem_vgprs, code_size,
6694 conf->lds_size, conf->scratch_bytes_per_wave,
6695 max_simd_waves);
6696 }
6697
6698 pipe_debug_message(debug, SHADER_INFO,
6699 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6700 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6701 "Spilled VGPRs: %d PrivMem VGPRs: %d",
6702 conf->num_sgprs, conf->num_vgprs, code_size,
6703 conf->lds_size, conf->scratch_bytes_per_wave,
6704 max_simd_waves, conf->spilled_sgprs,
6705 conf->spilled_vgprs, conf->private_mem_vgprs);
6706 }
6707
6708 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6709 {
6710 switch (processor) {
6711 case PIPE_SHADER_VERTEX:
6712 if (shader->key.as_es)
6713 return "Vertex Shader as ES";
6714 else if (shader->key.as_ls)
6715 return "Vertex Shader as LS";
6716 else
6717 return "Vertex Shader as VS";
6718 case PIPE_SHADER_TESS_CTRL:
6719 return "Tessellation Control Shader";
6720 case PIPE_SHADER_TESS_EVAL:
6721 if (shader->key.as_es)
6722 return "Tessellation Evaluation Shader as ES";
6723 else
6724 return "Tessellation Evaluation Shader as VS";
6725 case PIPE_SHADER_GEOMETRY:
6726 if (shader->is_gs_copy_shader)
6727 return "GS Copy Shader as VS";
6728 else
6729 return "Geometry Shader";
6730 case PIPE_SHADER_FRAGMENT:
6731 return "Pixel Shader";
6732 case PIPE_SHADER_COMPUTE:
6733 return "Compute Shader";
6734 default:
6735 return "Unknown Shader";
6736 }
6737 }
6738
6739 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6740 struct pipe_debug_callback *debug, unsigned processor,
6741 FILE *file, bool check_debug_option)
6742 {
6743 if (!check_debug_option ||
6744 r600_can_dump_shader(&sscreen->b, processor))
6745 si_dump_shader_key(processor, shader, file);
6746
6747 if (!check_debug_option && shader->binary.llvm_ir_string) {
6748 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6749 si_get_shader_name(shader, processor));
6750 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6751 }
6752
6753 if (!check_debug_option ||
6754 (r600_can_dump_shader(&sscreen->b, processor) &&
6755 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6756 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6757
6758 if (shader->prolog)
6759 si_shader_dump_disassembly(&shader->prolog->binary,
6760 debug, "prolog", file);
6761 if (shader->previous_stage)
6762 si_shader_dump_disassembly(&shader->previous_stage->binary,
6763 debug, "previous stage", file);
6764 if (shader->prolog2)
6765 si_shader_dump_disassembly(&shader->prolog2->binary,
6766 debug, "prolog2", file);
6767
6768 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6769
6770 if (shader->epilog)
6771 si_shader_dump_disassembly(&shader->epilog->binary,
6772 debug, "epilog", file);
6773 fprintf(file, "\n");
6774 }
6775
6776 si_shader_dump_stats(sscreen, shader, debug, processor, file,
6777 check_debug_option);
6778 }
6779
6780 int si_compile_llvm(struct si_screen *sscreen,
6781 struct ac_shader_binary *binary,
6782 struct si_shader_config *conf,
6783 LLVMTargetMachineRef tm,
6784 LLVMModuleRef mod,
6785 struct pipe_debug_callback *debug,
6786 unsigned processor,
6787 const char *name)
6788 {
6789 int r = 0;
6790 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6791
6792 if (r600_can_dump_shader(&sscreen->b, processor)) {
6793 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6794
6795 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6796 fprintf(stderr, "%s LLVM IR:\n\n", name);
6797 ac_dump_module(mod);
6798 fprintf(stderr, "\n");
6799 }
6800 }
6801
6802 if (sscreen->record_llvm_ir) {
6803 char *ir = LLVMPrintModuleToString(mod);
6804 binary->llvm_ir_string = strdup(ir);
6805 LLVMDisposeMessage(ir);
6806 }
6807
6808 if (!si_replace_shader(count, binary)) {
6809 r = si_llvm_compile(mod, binary, tm, debug);
6810 if (r)
6811 return r;
6812 }
6813
6814 si_shader_binary_read_config(binary, conf, 0);
6815
6816 /* Enable 64-bit and 16-bit denormals, because there is no performance
6817 * cost.
6818 *
6819 * If denormals are enabled, all floating-point output modifiers are
6820 * ignored.
6821 *
6822 * Don't enable denormals for 32-bit floats, because:
6823 * - Floating-point output modifiers would be ignored by the hw.
6824 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6825 * have to stop using those.
6826 * - SI & CI would be very slow.
6827 */
6828 conf->float_mode |= V_00B028_FP_64_DENORMS;
6829
6830 FREE(binary->config);
6831 FREE(binary->global_symbol_offsets);
6832 binary->config = NULL;
6833 binary->global_symbol_offsets = NULL;
6834
6835 /* Some shaders can't have rodata because their binaries can be
6836 * concatenated.
6837 */
6838 if (binary->rodata_size &&
6839 (processor == PIPE_SHADER_VERTEX ||
6840 processor == PIPE_SHADER_TESS_CTRL ||
6841 processor == PIPE_SHADER_TESS_EVAL ||
6842 processor == PIPE_SHADER_FRAGMENT)) {
6843 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6844 return -EINVAL;
6845 }
6846
6847 return r;
6848 }
6849
6850 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6851 {
6852 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6853 LLVMBuildRetVoid(ctx->gallivm.builder);
6854 else
6855 LLVMBuildRet(ctx->gallivm.builder, ret);
6856 }
6857
6858 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6859 struct si_shader *
6860 si_generate_gs_copy_shader(struct si_screen *sscreen,
6861 LLVMTargetMachineRef tm,
6862 struct si_shader_selector *gs_selector,
6863 struct pipe_debug_callback *debug)
6864 {
6865 struct si_shader_context ctx;
6866 struct si_shader *shader;
6867 struct gallivm_state *gallivm = &ctx.gallivm;
6868 LLVMBuilderRef builder;
6869 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6870 struct lp_build_context *uint = &bld_base->uint_bld;
6871 struct si_shader_output_values *outputs;
6872 struct tgsi_shader_info *gsinfo = &gs_selector->info;
6873 int i, r;
6874
6875 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6876
6877 if (!outputs)
6878 return NULL;
6879
6880 shader = CALLOC_STRUCT(si_shader);
6881 if (!shader) {
6882 FREE(outputs);
6883 return NULL;
6884 }
6885
6886
6887 shader->selector = gs_selector;
6888 shader->is_gs_copy_shader = true;
6889
6890 si_init_shader_ctx(&ctx, sscreen, tm);
6891 ctx.shader = shader;
6892 ctx.type = PIPE_SHADER_VERTEX;
6893
6894 builder = gallivm->builder;
6895
6896 create_function(&ctx);
6897 preload_ring_buffers(&ctx);
6898
6899 LLVMValueRef voffset =
6900 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6901 ctx.param_vertex_id), 4);
6902
6903 /* Fetch the vertex stream ID.*/
6904 LLVMValueRef stream_id;
6905
6906 if (gs_selector->so.num_outputs)
6907 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6908 else
6909 stream_id = ctx.i32_0;
6910
6911 /* Fill in output information. */
6912 for (i = 0; i < gsinfo->num_outputs; ++i) {
6913 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6914 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6915
6916 for (int chan = 0; chan < 4; chan++) {
6917 outputs[i].vertex_stream[chan] =
6918 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6919 }
6920 }
6921
6922 LLVMBasicBlockRef end_bb;
6923 LLVMValueRef switch_inst;
6924
6925 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6926 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6927
6928 for (int stream = 0; stream < 4; stream++) {
6929 LLVMBasicBlockRef bb;
6930 unsigned offset;
6931
6932 if (!gsinfo->num_stream_output_components[stream])
6933 continue;
6934
6935 if (stream > 0 && !gs_selector->so.num_outputs)
6936 continue;
6937
6938 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6939 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
6940 LLVMPositionBuilderAtEnd(builder, bb);
6941
6942 /* Fetch vertex data from GSVS ring */
6943 offset = 0;
6944 for (i = 0; i < gsinfo->num_outputs; ++i) {
6945 for (unsigned chan = 0; chan < 4; chan++) {
6946 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6947 outputs[i].vertex_stream[chan] != stream) {
6948 outputs[i].values[chan] = ctx.bld_base.base.undef;
6949 continue;
6950 }
6951
6952 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
6953 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
6954 offset++;
6955
6956 outputs[i].values[chan] =
6957 ac_build_buffer_load(&ctx.ac,
6958 ctx.gsvs_ring[0], 1,
6959 ctx.i32_0, voffset,
6960 soffset, 0, 1, 1, true);
6961 }
6962 }
6963
6964 /* Streamout and exports. */
6965 if (gs_selector->so.num_outputs) {
6966 si_llvm_emit_streamout(&ctx, outputs,
6967 gsinfo->num_outputs,
6968 stream);
6969 }
6970
6971 if (stream == 0)
6972 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6973
6974 LLVMBuildBr(builder, end_bb);
6975 }
6976
6977 LLVMPositionBuilderAtEnd(builder, end_bb);
6978
6979 LLVMBuildRetVoid(gallivm->builder);
6980
6981 /* Dump LLVM IR before any optimization passes */
6982 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6983 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6984 ac_dump_module(ctx.gallivm.module);
6985
6986 si_llvm_finalize_module(&ctx,
6987 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6988
6989 r = si_compile_llvm(sscreen, &ctx.shader->binary,
6990 &ctx.shader->config, ctx.tm,
6991 ctx.gallivm.module,
6992 debug, PIPE_SHADER_GEOMETRY,
6993 "GS Copy Shader");
6994 if (!r) {
6995 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6996 fprintf(stderr, "GS Copy Shader:\n");
6997 si_shader_dump(sscreen, ctx.shader, debug,
6998 PIPE_SHADER_GEOMETRY, stderr, true);
6999 r = si_shader_binary_upload(sscreen, ctx.shader);
7000 }
7001
7002 si_llvm_dispose(&ctx);
7003
7004 FREE(outputs);
7005
7006 if (r != 0) {
7007 FREE(shader);
7008 shader = NULL;
7009 }
7010 return shader;
7011 }
7012
7013 static void si_dump_shader_key_vs(struct si_shader_key *key,
7014 struct si_vs_prolog_bits *prolog,
7015 const char *prefix, FILE *f)
7016 {
7017 fprintf(f, " %s.instance_divisors = {", prefix);
7018 for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
7019 fprintf(f, !i ? "%u" : ", %u",
7020 prolog->instance_divisors[i]);
7021 }
7022 fprintf(f, "}\n");
7023
7024 fprintf(f, " mono.vs.fix_fetch = {");
7025 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
7026 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
7027 fprintf(f, "}\n");
7028 }
7029
7030 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
7031 FILE *f)
7032 {
7033 struct si_shader_key *key = &shader->key;
7034
7035 fprintf(f, "SHADER KEY\n");
7036
7037 switch (processor) {
7038 case PIPE_SHADER_VERTEX:
7039 si_dump_shader_key_vs(key, &key->part.vs.prolog,
7040 "part.vs.prolog", f);
7041 fprintf(f, " as_es = %u\n", key->as_es);
7042 fprintf(f, " as_ls = %u\n", key->as_ls);
7043 fprintf(f, " part.vs.epilog.export_prim_id = %u\n",
7044 key->part.vs.epilog.export_prim_id);
7045 break;
7046
7047 case PIPE_SHADER_TESS_CTRL:
7048 if (shader->selector->screen->b.chip_class >= GFX9) {
7049 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
7050 "part.tcs.ls_prolog", f);
7051 }
7052 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
7053 fprintf(f, " mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
7054 break;
7055
7056 case PIPE_SHADER_TESS_EVAL:
7057 fprintf(f, " part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
7058 fprintf(f, " as_es = %u\n", key->as_es);
7059 break;
7060
7061 case PIPE_SHADER_GEOMETRY:
7062 if (shader->is_gs_copy_shader)
7063 break;
7064
7065 if (shader->selector->screen->b.chip_class >= GFX9 &&
7066 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
7067 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
7068 "part.gs.vs_prolog", f);
7069 }
7070 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
7071 break;
7072
7073 case PIPE_SHADER_COMPUTE:
7074 break;
7075
7076 case PIPE_SHADER_FRAGMENT:
7077 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
7078 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
7079 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
7080 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
7081 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
7082 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
7083 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
7084 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
7085 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
7086 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
7087 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
7088 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
7089 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
7090 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
7091 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
7092 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
7093 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
7094 break;
7095
7096 default:
7097 assert(0);
7098 }
7099
7100 if ((processor == PIPE_SHADER_GEOMETRY ||
7101 processor == PIPE_SHADER_TESS_EVAL ||
7102 processor == PIPE_SHADER_VERTEX) &&
7103 !key->as_es && !key->as_ls) {
7104 fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
7105 fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
7106 fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
7107 }
7108 }
7109
7110 static void si_init_shader_ctx(struct si_shader_context *ctx,
7111 struct si_screen *sscreen,
7112 LLVMTargetMachineRef tm)
7113 {
7114 struct lp_build_tgsi_context *bld_base;
7115 struct lp_build_tgsi_action tmpl = {};
7116
7117 si_llvm_context_init(ctx, sscreen, tm);
7118
7119 bld_base = &ctx->bld_base;
7120 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
7121
7122 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
7123 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
7124 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
7125
7126 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
7127 bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
7128 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
7129 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
7130 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
7131 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
7132 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
7133 bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
7134 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
7135 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
7136 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
7137 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
7138 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
7139 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
7140 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
7141 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
7142
7143 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
7144 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
7145 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
7146 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
7147 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
7148 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
7149
7150 tmpl.fetch_args = atomic_fetch_args;
7151 tmpl.emit = atomic_emit;
7152 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
7153 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
7154 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
7155 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
7156 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
7157 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
7158 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
7159 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
7160 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
7161 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
7162 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
7163 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
7164 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
7165 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
7166 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
7167 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
7168 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
7169 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
7170 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
7171 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
7172
7173 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
7174
7175 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
7176
7177 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
7178 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
7179 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
7180 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
7181
7182 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
7183 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
7184 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
7185 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
7186 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
7187 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
7188 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
7189 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
7190 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
7191
7192 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
7193 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
7194 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
7195 }
7196
7197 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
7198 {
7199 struct si_shader *shader = ctx->shader;
7200 struct tgsi_shader_info *info = &shader->selector->info;
7201
7202 if (ctx->type == PIPE_SHADER_FRAGMENT ||
7203 ctx->type == PIPE_SHADER_COMPUTE ||
7204 shader->key.as_es ||
7205 shader->key.as_ls)
7206 return;
7207
7208 ac_eliminate_const_vs_outputs(&ctx->ac,
7209 ctx->main_fn,
7210 shader->info.vs_output_param_offset,
7211 info->num_outputs,
7212 &shader->info.nr_param_exports);
7213 }
7214
7215 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
7216 {
7217 ctx->shader->config.private_mem_vgprs = 0;
7218
7219 /* Process all LLVM instructions. */
7220 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
7221 while (bb) {
7222 LLVMValueRef next = LLVMGetFirstInstruction(bb);
7223
7224 while (next) {
7225 LLVMValueRef inst = next;
7226 next = LLVMGetNextInstruction(next);
7227
7228 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
7229 continue;
7230
7231 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
7232 /* No idea why LLVM aligns allocas to 4 elements. */
7233 unsigned alignment = LLVMGetAlignment(inst);
7234 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
7235 ctx->shader->config.private_mem_vgprs += dw_size;
7236 }
7237 bb = LLVMGetNextBasicBlock(bb);
7238 }
7239 }
7240
7241 static void si_init_exec_full_mask(struct si_shader_context *ctx)
7242 {
7243 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
7244 lp_build_intrinsic(ctx->gallivm.builder,
7245 "llvm.amdgcn.init.exec", ctx->voidt,
7246 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
7247 }
7248
7249 static void si_init_exec_from_input(struct si_shader_context *ctx,
7250 unsigned param, unsigned bitoffset)
7251 {
7252 LLVMValueRef args[] = {
7253 LLVMGetParam(ctx->main_fn, param),
7254 LLVMConstInt(ctx->i32, bitoffset, 0),
7255 };
7256 lp_build_intrinsic(ctx->gallivm.builder,
7257 "llvm.amdgcn.init.exec.from.input",
7258 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
7259 }
7260
7261 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
7262 bool is_monolithic)
7263 {
7264 struct si_shader *shader = ctx->shader;
7265 struct si_shader_selector *sel = shader->selector;
7266 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7267
7268 switch (ctx->type) {
7269 case PIPE_SHADER_VERTEX:
7270 ctx->load_input = declare_input_vs;
7271 if (shader->key.as_ls)
7272 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
7273 else if (shader->key.as_es)
7274 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7275 else
7276 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7277 break;
7278 case PIPE_SHADER_TESS_CTRL:
7279 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
7280 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
7281 bld_base->emit_store = store_output_tcs;
7282 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
7283 break;
7284 case PIPE_SHADER_TESS_EVAL:
7285 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
7286 if (shader->key.as_es)
7287 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7288 else
7289 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7290 break;
7291 case PIPE_SHADER_GEOMETRY:
7292 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
7293 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
7294 break;
7295 case PIPE_SHADER_FRAGMENT:
7296 ctx->load_input = declare_input_fs;
7297 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
7298 break;
7299 case PIPE_SHADER_COMPUTE:
7300 ctx->declare_memory_region = declare_compute_memory;
7301 break;
7302 default:
7303 assert(!"Unsupported shader type");
7304 return false;
7305 }
7306
7307 create_function(ctx);
7308 preload_ring_buffers(ctx);
7309
7310 /* For GFX9 merged shaders:
7311 * - Set EXEC. If the prolog is present, set EXEC there instead.
7312 * - Add a barrier before the second shader.
7313 *
7314 * The same thing for monolithic shaders is done in
7315 * si_build_wrapper_function.
7316 */
7317 if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
7318 if (sel->info.num_instructions > 1 && /* not empty shader */
7319 (shader->key.as_es || shader->key.as_ls) &&
7320 (ctx->type == PIPE_SHADER_TESS_EVAL ||
7321 (ctx->type == PIPE_SHADER_VERTEX &&
7322 !sel->vs_needs_prolog))) {
7323 si_init_exec_from_input(ctx,
7324 ctx->param_merged_wave_info, 0);
7325 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
7326 ctx->type == PIPE_SHADER_GEOMETRY) {
7327 si_init_exec_from_input(ctx,
7328 ctx->param_merged_wave_info, 8);
7329 si_llvm_emit_barrier(NULL, bld_base, NULL);
7330 }
7331 }
7332
7333 if (ctx->type == PIPE_SHADER_GEOMETRY) {
7334 int i;
7335 for (i = 0; i < 4; i++) {
7336 ctx->gs_next_vertex[i] =
7337 lp_build_alloca(&ctx->gallivm,
7338 ctx->i32, "");
7339 }
7340 }
7341
7342 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
7343 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
7344 return false;
7345 }
7346
7347 si_llvm_build_ret(ctx, ctx->return_value);
7348 return true;
7349 }
7350
7351 /**
7352 * Compute the VS prolog key, which contains all the information needed to
7353 * build the VS prolog function, and set shader->info bits where needed.
7354 *
7355 * \param info Shader info of the vertex shader.
7356 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
7357 * \param prolog_key Key of the VS prolog
7358 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
7359 * \param key Output shader part key.
7360 */
7361 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7362 unsigned num_input_sgprs,
7363 const struct si_vs_prolog_bits *prolog_key,
7364 struct si_shader *shader_out,
7365 union si_shader_part_key *key)
7366 {
7367 memset(key, 0, sizeof(*key));
7368 key->vs_prolog.states = *prolog_key;
7369 key->vs_prolog.num_input_sgprs = num_input_sgprs;
7370 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7371
7372 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL)
7373 key->vs_prolog.num_merged_next_stage_vgprs = 2;
7374 else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY)
7375 key->vs_prolog.num_merged_next_stage_vgprs = 5;
7376
7377 /* Set the instanceID flag. */
7378 for (unsigned i = 0; i < info->num_inputs; i++)
7379 if (key->vs_prolog.states.instance_divisors[i])
7380 shader_out->info.uses_instanceid = true;
7381 }
7382
7383 /**
7384 * Compute the VS epilog key, which contains all the information needed to
7385 * build the VS epilog function, and set the PrimitiveID output offset.
7386 */
7387 static void si_get_vs_epilog_key(struct si_shader *shader,
7388 struct si_vs_epilog_bits *states,
7389 union si_shader_part_key *key)
7390 {
7391 memset(key, 0, sizeof(*key));
7392 key->vs_epilog.states = *states;
7393
7394 /* Set up the PrimitiveID output. */
7395 if (shader->key.part.vs.epilog.export_prim_id) {
7396 unsigned index = shader->selector->info.num_outputs;
7397 unsigned offset = shader->info.nr_param_exports++;
7398
7399 key->vs_epilog.prim_id_param_offset = offset;
7400 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7401 shader->info.vs_output_param_offset[index] = offset;
7402 }
7403 }
7404
7405 /**
7406 * Compute the PS prolog key, which contains all the information needed to
7407 * build the PS prolog function, and set related bits in shader->config.
7408 */
7409 static void si_get_ps_prolog_key(struct si_shader *shader,
7410 union si_shader_part_key *key,
7411 bool separate_prolog)
7412 {
7413 struct tgsi_shader_info *info = &shader->selector->info;
7414
7415 memset(key, 0, sizeof(*key));
7416 key->ps_prolog.states = shader->key.part.ps.prolog;
7417 key->ps_prolog.colors_read = info->colors_read;
7418 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7419 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7420 key->ps_prolog.wqm = info->uses_derivatives &&
7421 (key->ps_prolog.colors_read ||
7422 key->ps_prolog.states.force_persp_sample_interp ||
7423 key->ps_prolog.states.force_linear_sample_interp ||
7424 key->ps_prolog.states.force_persp_center_interp ||
7425 key->ps_prolog.states.force_linear_center_interp ||
7426 key->ps_prolog.states.bc_optimize_for_persp ||
7427 key->ps_prolog.states.bc_optimize_for_linear);
7428
7429 if (info->colors_read) {
7430 unsigned *color = shader->selector->color_attr_index;
7431
7432 if (shader->key.part.ps.prolog.color_two_side) {
7433 /* BCOLORs are stored after the last input. */
7434 key->ps_prolog.num_interp_inputs = info->num_inputs;
7435 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7436 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7437 }
7438
7439 for (unsigned i = 0; i < 2; i++) {
7440 unsigned interp = info->input_interpolate[color[i]];
7441 unsigned location = info->input_interpolate_loc[color[i]];
7442
7443 if (!(info->colors_read & (0xf << i*4)))
7444 continue;
7445
7446 key->ps_prolog.color_attr_index[i] = color[i];
7447
7448 if (shader->key.part.ps.prolog.flatshade_colors &&
7449 interp == TGSI_INTERPOLATE_COLOR)
7450 interp = TGSI_INTERPOLATE_CONSTANT;
7451
7452 switch (interp) {
7453 case TGSI_INTERPOLATE_CONSTANT:
7454 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7455 break;
7456 case TGSI_INTERPOLATE_PERSPECTIVE:
7457 case TGSI_INTERPOLATE_COLOR:
7458 /* Force the interpolation location for colors here. */
7459 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7460 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7461 if (shader->key.part.ps.prolog.force_persp_center_interp)
7462 location = TGSI_INTERPOLATE_LOC_CENTER;
7463
7464 switch (location) {
7465 case TGSI_INTERPOLATE_LOC_SAMPLE:
7466 key->ps_prolog.color_interp_vgpr_index[i] = 0;
7467 shader->config.spi_ps_input_ena |=
7468 S_0286CC_PERSP_SAMPLE_ENA(1);
7469 break;
7470 case TGSI_INTERPOLATE_LOC_CENTER:
7471 key->ps_prolog.color_interp_vgpr_index[i] = 2;
7472 shader->config.spi_ps_input_ena |=
7473 S_0286CC_PERSP_CENTER_ENA(1);
7474 break;
7475 case TGSI_INTERPOLATE_LOC_CENTROID:
7476 key->ps_prolog.color_interp_vgpr_index[i] = 4;
7477 shader->config.spi_ps_input_ena |=
7478 S_0286CC_PERSP_CENTROID_ENA(1);
7479 break;
7480 default:
7481 assert(0);
7482 }
7483 break;
7484 case TGSI_INTERPOLATE_LINEAR:
7485 /* Force the interpolation location for colors here. */
7486 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7487 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7488 if (shader->key.part.ps.prolog.force_linear_center_interp)
7489 location = TGSI_INTERPOLATE_LOC_CENTER;
7490
7491 /* The VGPR assignment for non-monolithic shaders
7492 * works because InitialPSInputAddr is set on the
7493 * main shader and PERSP_PULL_MODEL is never used.
7494 */
7495 switch (location) {
7496 case TGSI_INTERPOLATE_LOC_SAMPLE:
7497 key->ps_prolog.color_interp_vgpr_index[i] =
7498 separate_prolog ? 6 : 9;
7499 shader->config.spi_ps_input_ena |=
7500 S_0286CC_LINEAR_SAMPLE_ENA(1);
7501 break;
7502 case TGSI_INTERPOLATE_LOC_CENTER:
7503 key->ps_prolog.color_interp_vgpr_index[i] =
7504 separate_prolog ? 8 : 11;
7505 shader->config.spi_ps_input_ena |=
7506 S_0286CC_LINEAR_CENTER_ENA(1);
7507 break;
7508 case TGSI_INTERPOLATE_LOC_CENTROID:
7509 key->ps_prolog.color_interp_vgpr_index[i] =
7510 separate_prolog ? 10 : 13;
7511 shader->config.spi_ps_input_ena |=
7512 S_0286CC_LINEAR_CENTROID_ENA(1);
7513 break;
7514 default:
7515 assert(0);
7516 }
7517 break;
7518 default:
7519 assert(0);
7520 }
7521 }
7522 }
7523 }
7524
7525 /**
7526 * Check whether a PS prolog is required based on the key.
7527 */
7528 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7529 {
7530 return key->ps_prolog.colors_read ||
7531 key->ps_prolog.states.force_persp_sample_interp ||
7532 key->ps_prolog.states.force_linear_sample_interp ||
7533 key->ps_prolog.states.force_persp_center_interp ||
7534 key->ps_prolog.states.force_linear_center_interp ||
7535 key->ps_prolog.states.bc_optimize_for_persp ||
7536 key->ps_prolog.states.bc_optimize_for_linear ||
7537 key->ps_prolog.states.poly_stipple;
7538 }
7539
7540 /**
7541 * Compute the PS epilog key, which contains all the information needed to
7542 * build the PS epilog function.
7543 */
7544 static void si_get_ps_epilog_key(struct si_shader *shader,
7545 union si_shader_part_key *key)
7546 {
7547 struct tgsi_shader_info *info = &shader->selector->info;
7548 memset(key, 0, sizeof(*key));
7549 key->ps_epilog.colors_written = info->colors_written;
7550 key->ps_epilog.writes_z = info->writes_z;
7551 key->ps_epilog.writes_stencil = info->writes_stencil;
7552 key->ps_epilog.writes_samplemask = info->writes_samplemask;
7553 key->ps_epilog.states = shader->key.part.ps.epilog;
7554 }
7555
7556 /**
7557 * Build the GS prolog function. Rotate the input vertices for triangle strips
7558 * with adjacency.
7559 */
7560 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7561 union si_shader_part_key *key)
7562 {
7563 unsigned num_sgprs, num_vgprs;
7564 struct gallivm_state *gallivm = &ctx->gallivm;
7565 LLVMBuilderRef builder = gallivm->builder;
7566 LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
7567 LLVMTypeRef returns[48];
7568 LLVMValueRef func, ret;
7569
7570 if (ctx->screen->b.chip_class >= GFX9) {
7571 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
7572 num_vgprs = 5; /* ES inputs are not needed by GS */
7573 } else {
7574 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
7575 num_vgprs = 8;
7576 }
7577
7578 for (unsigned i = 0; i < num_sgprs; ++i) {
7579 params[i] = ctx->i32;
7580 returns[i] = ctx->i32;
7581 }
7582
7583 for (unsigned i = 0; i < num_vgprs; ++i) {
7584 params[num_sgprs + i] = ctx->i32;
7585 returns[num_sgprs + i] = ctx->f32;
7586 }
7587
7588 /* Create the function. */
7589 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7590 params, num_sgprs + num_vgprs, num_sgprs - 1);
7591 func = ctx->main_fn;
7592
7593 /* Set the full EXEC mask for the prolog, because we are only fiddling
7594 * with registers here. The main shader part will set the correct EXEC
7595 * mask.
7596 */
7597 if (ctx->screen->b.chip_class >= GFX9)
7598 si_init_exec_full_mask(ctx);
7599
7600 /* Copy inputs to outputs. This should be no-op, as the registers match,
7601 * but it will prevent the compiler from overwriting them unintentionally.
7602 */
7603 ret = ctx->return_value;
7604 for (unsigned i = 0; i < num_sgprs; i++) {
7605 LLVMValueRef p = LLVMGetParam(func, i);
7606 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7607 }
7608 for (unsigned i = 0; i < num_vgprs; i++) {
7609 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7610 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7611 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7612 }
7613
7614 if (key->gs_prolog.states.tri_strip_adj_fix) {
7615 /* Remap the input vertices for every other primitive. */
7616 const unsigned gfx6_vtx_params[6] = {
7617 num_sgprs,
7618 num_sgprs + 1,
7619 num_sgprs + 3,
7620 num_sgprs + 4,
7621 num_sgprs + 5,
7622 num_sgprs + 6
7623 };
7624 const unsigned gfx9_vtx_params[3] = {
7625 num_sgprs,
7626 num_sgprs + 1,
7627 num_sgprs + 4,
7628 };
7629 LLVMValueRef vtx_in[6], vtx_out[6];
7630 LLVMValueRef prim_id, rotate;
7631
7632 if (ctx->screen->b.chip_class >= GFX9) {
7633 for (unsigned i = 0; i < 3; i++) {
7634 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
7635 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
7636 }
7637 } else {
7638 for (unsigned i = 0; i < 6; i++)
7639 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
7640 }
7641
7642 prim_id = LLVMGetParam(func, num_sgprs + 2);
7643 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7644
7645 for (unsigned i = 0; i < 6; ++i) {
7646 LLVMValueRef base, rotated;
7647 base = vtx_in[i];
7648 rotated = vtx_in[(i + 4) % 6];
7649 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
7650 }
7651
7652 if (ctx->screen->b.chip_class >= GFX9) {
7653 for (unsigned i = 0; i < 3; i++) {
7654 LLVMValueRef hi, out;
7655
7656 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
7657 LLVMConstInt(ctx->i32, 16, 0), "");
7658 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
7659 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
7660 ret = LLVMBuildInsertValue(builder, ret, out,
7661 gfx9_vtx_params[i], "");
7662 }
7663 } else {
7664 for (unsigned i = 0; i < 6; i++) {
7665 LLVMValueRef out;
7666
7667 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
7668 ret = LLVMBuildInsertValue(builder, ret, out,
7669 gfx6_vtx_params[i], "");
7670 }
7671 }
7672 }
7673
7674 LLVMBuildRet(builder, ret);
7675 }
7676
7677 /**
7678 * Given a list of shader part functions, build a wrapper function that
7679 * runs them in sequence to form a monolithic shader.
7680 */
7681 static void si_build_wrapper_function(struct si_shader_context *ctx,
7682 LLVMValueRef *parts,
7683 unsigned num_parts,
7684 unsigned main_part,
7685 unsigned next_shader_first_part)
7686 {
7687 struct gallivm_state *gallivm = &ctx->gallivm;
7688 LLVMBuilderRef builder = ctx->gallivm.builder;
7689 /* PS epilog has one arg per color component */
7690 LLVMTypeRef param_types[48];
7691 LLVMValueRef initial[48], out[48];
7692 LLVMTypeRef function_type;
7693 unsigned num_params;
7694 unsigned num_out, initial_num_out;
7695 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7696 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
7697 unsigned num_sgprs, num_vgprs;
7698 unsigned last_sgpr_param;
7699 unsigned gprs;
7700 struct lp_build_if_state if_state;
7701
7702 for (unsigned i = 0; i < num_parts; ++i) {
7703 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7704 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7705 }
7706
7707 /* The parameters of the wrapper function correspond to those of the
7708 * first part in terms of SGPRs and VGPRs, but we use the types of the
7709 * main part to get the right types. This is relevant for the
7710 * dereferenceable attribute on descriptor table pointers.
7711 */
7712 num_sgprs = 0;
7713 num_vgprs = 0;
7714
7715 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7716 num_params = LLVMCountParamTypes(function_type);
7717
7718 for (unsigned i = 0; i < num_params; ++i) {
7719 LLVMValueRef param = LLVMGetParam(parts[0], i);
7720
7721 if (ac_is_sgpr_param(param)) {
7722 assert(num_vgprs == 0);
7723 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7724 } else {
7725 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7726 }
7727 }
7728 assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7729
7730 num_params = 0;
7731 last_sgpr_param = 0;
7732 gprs = 0;
7733 while (gprs < num_sgprs + num_vgprs) {
7734 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7735 unsigned size;
7736
7737 param_types[num_params] = LLVMTypeOf(param);
7738 if (gprs < num_sgprs)
7739 last_sgpr_param = num_params;
7740 size = llvm_get_type_size(param_types[num_params]) / 4;
7741 num_params++;
7742
7743 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7744 assert(gprs + size <= num_sgprs + num_vgprs &&
7745 (gprs >= num_sgprs || gprs + size <= num_sgprs));
7746
7747 gprs += size;
7748 }
7749
7750 si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
7751
7752 if (is_merged_shader(ctx->shader))
7753 si_init_exec_full_mask(ctx);
7754
7755 /* Record the arguments of the function as if they were an output of
7756 * a previous part.
7757 */
7758 num_out = 0;
7759 num_out_sgpr = 0;
7760
7761 for (unsigned i = 0; i < num_params; ++i) {
7762 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7763 LLVMTypeRef param_type = LLVMTypeOf(param);
7764 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7765 unsigned size = llvm_get_type_size(param_type) / 4;
7766
7767 if (size == 1) {
7768 if (param_type != out_type)
7769 param = LLVMBuildBitCast(builder, param, out_type, "");
7770 out[num_out++] = param;
7771 } else {
7772 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7773
7774 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7775 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7776 param_type = ctx->i64;
7777 }
7778
7779 if (param_type != vector_type)
7780 param = LLVMBuildBitCast(builder, param, vector_type, "");
7781
7782 for (unsigned j = 0; j < size; ++j)
7783 out[num_out++] = LLVMBuildExtractElement(
7784 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7785 }
7786
7787 if (i <= last_sgpr_param)
7788 num_out_sgpr = num_out;
7789 }
7790
7791 memcpy(initial, out, sizeof(out));
7792 initial_num_out = num_out;
7793 initial_num_out_sgpr = num_out_sgpr;
7794
7795 /* Now chain the parts. */
7796 for (unsigned part = 0; part < num_parts; ++part) {
7797 LLVMValueRef in[48];
7798 LLVMValueRef ret;
7799 LLVMTypeRef ret_type;
7800 unsigned out_idx = 0;
7801
7802 num_params = LLVMCountParams(parts[part]);
7803 assert(num_params <= ARRAY_SIZE(param_types));
7804
7805 /* Merged shaders are executed conditionally depending
7806 * on the number of enabled threads passed in the input SGPRs. */
7807 if (is_merged_shader(ctx->shader) &&
7808 (part == 0 || part == next_shader_first_part)) {
7809 LLVMValueRef ena, count = initial[3];
7810
7811 /* The thread count for the 2nd shader is at bit-offset 8. */
7812 if (part == next_shader_first_part) {
7813 count = LLVMBuildLShr(builder, count,
7814 LLVMConstInt(ctx->i32, 8, 0), "");
7815 }
7816 count = LLVMBuildAnd(builder, count,
7817 LLVMConstInt(ctx->i32, 0x7f, 0), "");
7818 ena = LLVMBuildICmp(builder, LLVMIntULT,
7819 ac_get_thread_id(&ctx->ac), count, "");
7820 lp_build_if(&if_state, &ctx->gallivm, ena);
7821 }
7822
7823 /* Derive arguments for the next part from outputs of the
7824 * previous one.
7825 */
7826 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7827 LLVMValueRef param;
7828 LLVMTypeRef param_type;
7829 bool is_sgpr;
7830 unsigned param_size;
7831 LLVMValueRef arg = NULL;
7832
7833 param = LLVMGetParam(parts[part], param_idx);
7834 param_type = LLVMTypeOf(param);
7835 param_size = llvm_get_type_size(param_type) / 4;
7836 is_sgpr = ac_is_sgpr_param(param);
7837
7838 if (is_sgpr) {
7839 #if HAVE_LLVM < 0x0400
7840 LLVMRemoveAttribute(param, LLVMByValAttribute);
7841 #else
7842 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7843 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7844 #endif
7845 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7846 }
7847
7848 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7849 assert(is_sgpr || out_idx >= num_out_sgpr);
7850
7851 if (param_size == 1)
7852 arg = out[out_idx];
7853 else
7854 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7855
7856 if (LLVMTypeOf(arg) != param_type) {
7857 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7858 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7859 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7860 } else {
7861 arg = LLVMBuildBitCast(builder, arg, param_type, "");
7862 }
7863 }
7864
7865 in[param_idx] = arg;
7866 out_idx += param_size;
7867 }
7868
7869 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7870
7871 if (is_merged_shader(ctx->shader) &&
7872 (part + 1 == next_shader_first_part ||
7873 part + 1 == num_parts)) {
7874 lp_build_endif(&if_state);
7875
7876 if (part + 1 == next_shader_first_part) {
7877 /* A barrier is required between 2 merged shaders. */
7878 si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
7879
7880 /* The second half of the merged shader should use
7881 * the inputs from the toplevel (wrapper) function,
7882 * not the return value from the last call.
7883 *
7884 * That's because the last call was executed condi-
7885 * tionally, so we can't consume it in the main
7886 * block.
7887 */
7888 memcpy(out, initial, sizeof(initial));
7889 num_out = initial_num_out;
7890 num_out_sgpr = initial_num_out_sgpr;
7891 }
7892 continue;
7893 }
7894
7895 /* Extract the returned GPRs. */
7896 ret_type = LLVMTypeOf(ret);
7897 num_out = 0;
7898 num_out_sgpr = 0;
7899
7900 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7901 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7902
7903 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7904
7905 for (unsigned i = 0; i < ret_size; ++i) {
7906 LLVMValueRef val =
7907 LLVMBuildExtractValue(builder, ret, i, "");
7908
7909 out[num_out++] = val;
7910
7911 if (LLVMTypeOf(val) == ctx->i32) {
7912 assert(num_out_sgpr + 1 == num_out);
7913 num_out_sgpr = num_out;
7914 }
7915 }
7916 }
7917 }
7918
7919 LLVMBuildRetVoid(builder);
7920 }
7921
7922 int si_compile_tgsi_shader(struct si_screen *sscreen,
7923 LLVMTargetMachineRef tm,
7924 struct si_shader *shader,
7925 bool is_monolithic,
7926 struct pipe_debug_callback *debug)
7927 {
7928 struct si_shader_selector *sel = shader->selector;
7929 struct si_shader_context ctx;
7930 int r = -1;
7931
7932 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7933 * conversion fails. */
7934 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7935 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7936 tgsi_dump(sel->tokens, 0);
7937 si_dump_streamout(&sel->so);
7938 }
7939
7940 si_init_shader_ctx(&ctx, sscreen, tm);
7941 si_llvm_context_set_tgsi(&ctx, shader);
7942 ctx.separate_prolog = !is_monolithic;
7943
7944 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7945 sizeof(shader->info.vs_output_param_offset));
7946
7947 shader->info.uses_instanceid = sel->info.uses_instanceid;
7948
7949 ctx.load_system_value = declare_system_value;
7950
7951 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
7952 si_llvm_dispose(&ctx);
7953 return -1;
7954 }
7955
7956 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7957 LLVMValueRef parts[3];
7958 bool need_prolog;
7959 bool need_epilog;
7960
7961 need_prolog = sel->vs_needs_prolog;
7962 need_epilog = !shader->key.as_es && !shader->key.as_ls;
7963
7964 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7965
7966 if (need_prolog) {
7967 union si_shader_part_key prolog_key;
7968 si_get_vs_prolog_key(&sel->info,
7969 shader->info.num_input_sgprs,
7970 &shader->key.part.vs.prolog,
7971 shader, &prolog_key);
7972 si_build_vs_prolog_function(&ctx, &prolog_key);
7973 parts[0] = ctx.main_fn;
7974 }
7975
7976 if (need_epilog) {
7977 union si_shader_part_key epilog_key;
7978 si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
7979 si_build_vs_epilog_function(&ctx, &epilog_key);
7980 parts[need_prolog ? 2 : 1] = ctx.main_fn;
7981 }
7982
7983 si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
7984 need_prolog ? 1 : 0, 0);
7985 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7986 if (sscreen->b.chip_class >= GFX9) {
7987 struct si_shader_selector *ls = shader->key.part.tcs.ls;
7988 LLVMValueRef parts[4];
7989
7990 /* TCS main part */
7991 parts[2] = ctx.main_fn;
7992
7993 /* TCS epilog */
7994 union si_shader_part_key tcs_epilog_key;
7995 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
7996 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7997 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
7998 parts[3] = ctx.main_fn;
7999
8000 /* VS prolog */
8001 if (ls->vs_needs_prolog) {
8002 union si_shader_part_key vs_prolog_key;
8003 si_get_vs_prolog_key(&ls->info,
8004 shader->info.num_input_sgprs,
8005 &shader->key.part.tcs.ls_prolog,
8006 shader, &vs_prolog_key);
8007 vs_prolog_key.vs_prolog.is_monolithic = true;
8008 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8009 parts[0] = ctx.main_fn;
8010 }
8011
8012 /* VS as LS main part */
8013 struct si_shader shader_ls = {};
8014 shader_ls.selector = ls;
8015 shader_ls.key.as_ls = 1;
8016 shader_ls.key.mono = shader->key.mono;
8017 shader_ls.key.opt = shader->key.opt;
8018 si_llvm_context_set_tgsi(&ctx, &shader_ls);
8019
8020 if (!si_compile_tgsi_main(&ctx, true)) {
8021 si_llvm_dispose(&ctx);
8022 return -1;
8023 }
8024 shader->info.uses_instanceid |= ls->info.uses_instanceid;
8025 parts[1] = ctx.main_fn;
8026
8027 /* Reset the shader context. */
8028 ctx.shader = shader;
8029 ctx.type = PIPE_SHADER_TESS_CTRL;
8030
8031 si_build_wrapper_function(&ctx,
8032 parts + !ls->vs_needs_prolog,
8033 4 - !ls->vs_needs_prolog, 0,
8034 ls->vs_needs_prolog ? 2 : 1);
8035 } else {
8036 LLVMValueRef parts[2];
8037 union si_shader_part_key epilog_key;
8038
8039 parts[0] = ctx.main_fn;
8040
8041 memset(&epilog_key, 0, sizeof(epilog_key));
8042 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8043 si_build_tcs_epilog_function(&ctx, &epilog_key);
8044 parts[1] = ctx.main_fn;
8045
8046 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
8047 }
8048 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
8049 !shader->key.as_es) {
8050 LLVMValueRef parts[2];
8051 union si_shader_part_key epilog_key;
8052
8053 parts[0] = ctx.main_fn;
8054
8055 si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
8056 si_build_vs_epilog_function(&ctx, &epilog_key);
8057 parts[1] = ctx.main_fn;
8058
8059 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
8060 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
8061 LLVMValueRef parts[2];
8062 union si_shader_part_key prolog_key;
8063
8064 parts[1] = ctx.main_fn;
8065
8066 memset(&prolog_key, 0, sizeof(prolog_key));
8067 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8068 si_build_gs_prolog_function(&ctx, &prolog_key);
8069 parts[0] = ctx.main_fn;
8070
8071 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
8072 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
8073 LLVMValueRef parts[3];
8074 union si_shader_part_key prolog_key;
8075 union si_shader_part_key epilog_key;
8076 bool need_prolog;
8077
8078 si_get_ps_prolog_key(shader, &prolog_key, false);
8079 need_prolog = si_need_ps_prolog(&prolog_key);
8080
8081 parts[need_prolog ? 1 : 0] = ctx.main_fn;
8082
8083 if (need_prolog) {
8084 si_build_ps_prolog_function(&ctx, &prolog_key);
8085 parts[0] = ctx.main_fn;
8086 }
8087
8088 si_get_ps_epilog_key(shader, &epilog_key);
8089 si_build_ps_epilog_function(&ctx, &epilog_key);
8090 parts[need_prolog ? 2 : 1] = ctx.main_fn;
8091
8092 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
8093 need_prolog ? 1 : 0, 0);
8094 }
8095
8096 /* Dump LLVM IR before any optimization passes */
8097 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
8098 r600_can_dump_shader(&sscreen->b, ctx.type))
8099 LLVMDumpModule(ctx.gallivm.module);
8100
8101 si_llvm_finalize_module(&ctx,
8102 r600_extra_shader_checks(&sscreen->b, ctx.type));
8103
8104 /* Post-optimization transformations and analysis. */
8105 si_eliminate_const_vs_outputs(&ctx);
8106
8107 if ((debug && debug->debug_message) ||
8108 r600_can_dump_shader(&sscreen->b, ctx.type))
8109 si_count_scratch_private_memory(&ctx);
8110
8111 /* Compile to bytecode. */
8112 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
8113 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
8114 si_llvm_dispose(&ctx);
8115 if (r) {
8116 fprintf(stderr, "LLVM failed to compile shader\n");
8117 return r;
8118 }
8119
8120 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
8121 * LLVM 3.9svn has this bug.
8122 */
8123 if (sel->type == PIPE_SHADER_COMPUTE) {
8124 unsigned wave_size = 64;
8125 unsigned max_vgprs = 256;
8126 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
8127 unsigned max_sgprs_per_wave = 128;
8128 unsigned max_block_threads = si_get_max_workgroup_size(shader);
8129 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
8130 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
8131
8132 max_vgprs = max_vgprs / min_waves_per_simd;
8133 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
8134
8135 if (shader->config.num_sgprs > max_sgprs ||
8136 shader->config.num_vgprs > max_vgprs) {
8137 fprintf(stderr, "LLVM failed to compile a shader correctly: "
8138 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
8139 shader->config.num_sgprs, shader->config.num_vgprs,
8140 max_sgprs, max_vgprs);
8141
8142 /* Just terminate the process, because dependent
8143 * shaders can hang due to bad input data, but use
8144 * the env var to allow shader-db to work.
8145 */
8146 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
8147 abort();
8148 }
8149 }
8150
8151 /* Add the scratch offset to input SGPRs. */
8152 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
8153 shader->info.num_input_sgprs += 1; /* scratch byte offset */
8154
8155 /* Calculate the number of fragment input VGPRs. */
8156 if (ctx.type == PIPE_SHADER_FRAGMENT) {
8157 shader->info.num_input_vgprs = 0;
8158 shader->info.face_vgpr_index = -1;
8159
8160 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8161 shader->info.num_input_vgprs += 2;
8162 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
8163 shader->info.num_input_vgprs += 2;
8164 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
8165 shader->info.num_input_vgprs += 2;
8166 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
8167 shader->info.num_input_vgprs += 3;
8168 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8169 shader->info.num_input_vgprs += 2;
8170 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
8171 shader->info.num_input_vgprs += 2;
8172 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
8173 shader->info.num_input_vgprs += 2;
8174 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
8175 shader->info.num_input_vgprs += 1;
8176 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
8177 shader->info.num_input_vgprs += 1;
8178 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
8179 shader->info.num_input_vgprs += 1;
8180 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
8181 shader->info.num_input_vgprs += 1;
8182 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
8183 shader->info.num_input_vgprs += 1;
8184 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
8185 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
8186 shader->info.num_input_vgprs += 1;
8187 }
8188 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
8189 shader->info.num_input_vgprs += 1;
8190 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
8191 shader->info.num_input_vgprs += 1;
8192 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
8193 shader->info.num_input_vgprs += 1;
8194 }
8195
8196 return 0;
8197 }
8198
8199 /**
8200 * Create, compile and return a shader part (prolog or epilog).
8201 *
8202 * \param sscreen screen
8203 * \param list list of shader parts of the same category
8204 * \param type shader type
8205 * \param key shader part key
8206 * \param prolog whether the part being requested is a prolog
8207 * \param tm LLVM target machine
8208 * \param debug debug callback
8209 * \param build the callback responsible for building the main function
8210 * \return non-NULL on success
8211 */
8212 static struct si_shader_part *
8213 si_get_shader_part(struct si_screen *sscreen,
8214 struct si_shader_part **list,
8215 enum pipe_shader_type type,
8216 bool prolog,
8217 union si_shader_part_key *key,
8218 LLVMTargetMachineRef tm,
8219 struct pipe_debug_callback *debug,
8220 void (*build)(struct si_shader_context *,
8221 union si_shader_part_key *),
8222 const char *name)
8223 {
8224 struct si_shader_part *result;
8225
8226 mtx_lock(&sscreen->shader_parts_mutex);
8227
8228 /* Find existing. */
8229 for (result = *list; result; result = result->next) {
8230 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
8231 mtx_unlock(&sscreen->shader_parts_mutex);
8232 return result;
8233 }
8234 }
8235
8236 /* Compile a new one. */
8237 result = CALLOC_STRUCT(si_shader_part);
8238 result->key = *key;
8239
8240 struct si_shader shader = {};
8241 struct si_shader_context ctx;
8242 struct gallivm_state *gallivm = &ctx.gallivm;
8243
8244 si_init_shader_ctx(&ctx, sscreen, tm);
8245 ctx.shader = &shader;
8246 ctx.type = type;
8247
8248 switch (type) {
8249 case PIPE_SHADER_VERTEX:
8250 break;
8251 case PIPE_SHADER_TESS_CTRL:
8252 assert(!prolog);
8253 shader.key.part.tcs.epilog = key->tcs_epilog.states;
8254 break;
8255 case PIPE_SHADER_GEOMETRY:
8256 assert(prolog);
8257 break;
8258 case PIPE_SHADER_FRAGMENT:
8259 if (prolog)
8260 shader.key.part.ps.prolog = key->ps_prolog.states;
8261 else
8262 shader.key.part.ps.epilog = key->ps_epilog.states;
8263 break;
8264 default:
8265 unreachable("bad shader part");
8266 }
8267
8268 build(&ctx, key);
8269
8270 /* Compile. */
8271 si_llvm_finalize_module(&ctx,
8272 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
8273
8274 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
8275 gallivm->module, debug, ctx.type, name)) {
8276 FREE(result);
8277 result = NULL;
8278 goto out;
8279 }
8280
8281 result->next = *list;
8282 *list = result;
8283
8284 out:
8285 si_llvm_dispose(&ctx);
8286 mtx_unlock(&sscreen->shader_parts_mutex);
8287 return result;
8288 }
8289
8290 /**
8291 * Build the vertex shader prolog function.
8292 *
8293 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
8294 * All inputs are returned unmodified. The vertex load indices are
8295 * stored after them, which will be used by the API VS for fetching inputs.
8296 *
8297 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
8298 * input_v0,
8299 * input_v1,
8300 * input_v2,
8301 * input_v3,
8302 * (VertexID + BaseVertex),
8303 * (InstanceID + StartInstance),
8304 * (InstanceID / 2 + StartInstance)
8305 */
8306 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
8307 union si_shader_part_key *key)
8308 {
8309 struct gallivm_state *gallivm = &ctx->gallivm;
8310 LLVMTypeRef *params, *returns;
8311 LLVMValueRef ret, func;
8312 int last_sgpr, num_params, num_returns, i;
8313 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
8314 key->vs_prolog.num_merged_next_stage_vgprs;
8315 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
8316 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
8317 num_input_vgprs;
8318 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
8319
8320 ctx->param_vertex_id = first_vs_vgpr;
8321 ctx->param_instance_id = first_vs_vgpr + 3;
8322
8323 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
8324 params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
8325 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
8326 sizeof(LLVMTypeRef));
8327 num_params = 0;
8328 num_returns = 0;
8329
8330 /* Declare input and output SGPRs. */
8331 num_params = 0;
8332 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8333 params[num_params++] = ctx->i32;
8334 returns[num_returns++] = ctx->i32;
8335 }
8336 last_sgpr = num_params - 1;
8337
8338 /* Preloaded VGPRs (outputs must be floats) */
8339 for (i = 0; i < num_input_vgprs; i++) {
8340 params[num_params++] = ctx->i32;
8341 returns[num_returns++] = ctx->f32;
8342 }
8343
8344 /* Vertex load indices. */
8345 for (i = 0; i <= key->vs_prolog.last_input; i++)
8346 returns[num_returns++] = ctx->f32;
8347
8348 /* Create the function. */
8349 si_create_function(ctx, "vs_prolog", returns, num_returns, params,
8350 num_params, last_sgpr);
8351 func = ctx->main_fn;
8352
8353 if (key->vs_prolog.num_merged_next_stage_vgprs &&
8354 !key->vs_prolog.is_monolithic)
8355 si_init_exec_from_input(ctx, 3, 0);
8356
8357 /* Copy inputs to outputs. This should be no-op, as the registers match,
8358 * but it will prevent the compiler from overwriting them unintentionally.
8359 */
8360 ret = ctx->return_value;
8361 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8362 LLVMValueRef p = LLVMGetParam(func, i);
8363 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8364 }
8365 for (; i < num_params; i++) {
8366 LLVMValueRef p = LLVMGetParam(func, i);
8367 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
8368 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8369 }
8370
8371 /* Compute vertex load indices from instance divisors. */
8372 for (i = 0; i <= key->vs_prolog.last_input; i++) {
8373 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
8374 LLVMValueRef index;
8375
8376 if (divisor) {
8377 /* InstanceID / Divisor + StartInstance */
8378 index = get_instance_index_for_fetch(ctx,
8379 user_sgpr_base +
8380 SI_SGPR_START_INSTANCE,
8381 divisor);
8382 } else {
8383 /* VertexID + BaseVertex */
8384 index = LLVMBuildAdd(gallivm->builder,
8385 LLVMGetParam(func, ctx->param_vertex_id),
8386 LLVMGetParam(func, user_sgpr_base +
8387 SI_SGPR_BASE_VERTEX), "");
8388 }
8389
8390 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
8391 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
8392 num_params++, "");
8393 }
8394
8395 si_llvm_build_ret(ctx, ret);
8396 }
8397
8398 /**
8399 * Build the vertex shader epilog function. This is also used by the tessellation
8400 * evaluation shader compiled as VS.
8401 *
8402 * The input is PrimitiveID.
8403 *
8404 * If PrimitiveID is required by the pixel shader, export it.
8405 * Otherwise, do nothing.
8406 */
8407 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
8408 union si_shader_part_key *key)
8409 {
8410 struct gallivm_state *gallivm = &ctx->gallivm;
8411 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8412 LLVMTypeRef params[5];
8413 int num_params, i;
8414
8415 /* Declare input VGPRs. */
8416 num_params = key->vs_epilog.states.export_prim_id ?
8417 (VS_EPILOG_PRIMID_LOC + 1) : 0;
8418 assert(num_params <= ARRAY_SIZE(params));
8419
8420 for (i = 0; i < num_params; i++)
8421 params[i] = ctx->f32;
8422
8423 /* Create the function. */
8424 si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
8425
8426 /* Emit exports. */
8427 if (key->vs_epilog.states.export_prim_id) {
8428 struct lp_build_context *base = &bld_base->base;
8429 struct ac_export_args args;
8430
8431 args.enabled_channels = 0x1; /* enabled channels */
8432 args.valid_mask = 0; /* whether the EXEC mask is valid */
8433 args.done = 0; /* DONE bit */
8434 args.target = V_008DFC_SQ_EXP_PARAM +
8435 key->vs_epilog.prim_id_param_offset;
8436 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
8437 args.out[0] = LLVMGetParam(ctx->main_fn,
8438 VS_EPILOG_PRIMID_LOC); /* X */
8439 args.out[1] = base->undef; /* Y */
8440 args.out[2] = base->undef; /* Z */
8441 args.out[3] = base->undef; /* W */
8442
8443 ac_build_export(&ctx->ac, &args);
8444 }
8445
8446 LLVMBuildRetVoid(gallivm->builder);
8447 }
8448
8449 static bool si_get_vs_prolog(struct si_screen *sscreen,
8450 LLVMTargetMachineRef tm,
8451 struct si_shader *shader,
8452 struct pipe_debug_callback *debug,
8453 struct si_shader *main_part,
8454 const struct si_vs_prolog_bits *key)
8455 {
8456 struct si_shader_selector *vs = main_part->selector;
8457
8458 /* The prolog is a no-op if there are no inputs. */
8459 if (!vs->vs_needs_prolog)
8460 return true;
8461
8462 /* Get the prolog. */
8463 union si_shader_part_key prolog_key;
8464 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
8465 key, shader, &prolog_key);
8466
8467 shader->prolog =
8468 si_get_shader_part(sscreen, &sscreen->vs_prologs,
8469 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
8470 debug, si_build_vs_prolog_function,
8471 "Vertex Shader Prolog");
8472 return shader->prolog != NULL;
8473 }
8474
8475 /**
8476 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
8477 */
8478 static bool si_get_vs_epilog(struct si_screen *sscreen,
8479 LLVMTargetMachineRef tm,
8480 struct si_shader *shader,
8481 struct pipe_debug_callback *debug,
8482 struct si_vs_epilog_bits *states)
8483 {
8484 union si_shader_part_key epilog_key;
8485
8486 si_get_vs_epilog_key(shader, states, &epilog_key);
8487
8488 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
8489 PIPE_SHADER_VERTEX, true,
8490 &epilog_key, tm, debug,
8491 si_build_vs_epilog_function,
8492 "Vertex Shader Epilog");
8493 return shader->epilog != NULL;
8494 }
8495
8496 /**
8497 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
8498 */
8499 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
8500 LLVMTargetMachineRef tm,
8501 struct si_shader *shader,
8502 struct pipe_debug_callback *debug)
8503 {
8504 if (!si_get_vs_prolog(sscreen, tm, shader, debug, shader,
8505 &shader->key.part.vs.prolog))
8506 return false;
8507
8508 /* Get the epilog. */
8509 if (!shader->key.as_es && !shader->key.as_ls &&
8510 !si_get_vs_epilog(sscreen, tm, shader, debug,
8511 &shader->key.part.vs.epilog))
8512 return false;
8513
8514 return true;
8515 }
8516
8517 /**
8518 * Select and compile (or reuse) TES parts (epilog).
8519 */
8520 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
8521 LLVMTargetMachineRef tm,
8522 struct si_shader *shader,
8523 struct pipe_debug_callback *debug)
8524 {
8525 if (shader->key.as_es)
8526 return true;
8527
8528 /* TES compiled as VS. */
8529 return si_get_vs_epilog(sscreen, tm, shader, debug,
8530 &shader->key.part.tes.epilog);
8531 }
8532
8533 /**
8534 * Compile the TCS epilog function. This writes tesselation factors to memory
8535 * based on the output primitive type of the tesselator (determined by TES).
8536 */
8537 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
8538 union si_shader_part_key *key)
8539 {
8540 struct gallivm_state *gallivm = &ctx->gallivm;
8541 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8542 LLVMTypeRef params[32];
8543 LLVMValueRef func;
8544 int last_sgpr, num_params = 0;
8545
8546 /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
8547 params[ctx->param_rw_buffers = num_params++] =
8548 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
8549
8550 if (ctx->screen->b.chip_class >= GFX9) {
8551 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8552 params[num_params++] = ctx->i32; /* wave info */
8553 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8554 params[num_params++] = ctx->i32;
8555 params[num_params++] = ctx->i32;
8556 params[num_params++] = ctx->i32;
8557 params[num_params++] = ctx->i64;
8558 params[num_params++] = ctx->i64;
8559 params[num_params++] = ctx->i64;
8560 params[num_params++] = ctx->i64;
8561 params[num_params++] = ctx->i64;
8562 params[num_params++] = ctx->i64;
8563 params[num_params++] = ctx->i32;
8564 params[num_params++] = ctx->i32;
8565 params[num_params++] = ctx->i32;
8566 params[num_params++] = ctx->i32;
8567 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8568 } else {
8569 params[num_params++] = ctx->i64;
8570 params[num_params++] = ctx->i64;
8571 params[num_params++] = ctx->i64;
8572 params[num_params++] = ctx->i64;
8573 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8574 params[num_params++] = ctx->i32;
8575 params[num_params++] = ctx->i32;
8576 params[num_params++] = ctx->i32;
8577 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8578 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8579 }
8580 last_sgpr = num_params - 1;
8581
8582 params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
8583 params[num_params++] = ctx->i32; /* invocation ID within the patch */
8584 params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
8585
8586 /* Create the function. */
8587 si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
8588 declare_lds_as_pointer(ctx);
8589 func = ctx->main_fn;
8590
8591 si_write_tess_factors(bld_base,
8592 LLVMGetParam(func, last_sgpr + 1),
8593 LLVMGetParam(func, last_sgpr + 2),
8594 LLVMGetParam(func, last_sgpr + 3));
8595
8596 LLVMBuildRetVoid(gallivm->builder);
8597 }
8598
8599 /**
8600 * Select and compile (or reuse) TCS parts (epilog).
8601 */
8602 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8603 LLVMTargetMachineRef tm,
8604 struct si_shader *shader,
8605 struct pipe_debug_callback *debug)
8606 {
8607 if (sscreen->b.chip_class >= GFX9) {
8608 struct si_shader *ls_main_part =
8609 shader->key.part.tcs.ls->main_shader_part_ls;
8610
8611 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8612 &shader->key.part.tcs.ls_prolog))
8613 return false;
8614
8615 shader->previous_stage = ls_main_part;
8616 }
8617
8618 /* Get the epilog. */
8619 union si_shader_part_key epilog_key;
8620 memset(&epilog_key, 0, sizeof(epilog_key));
8621 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8622
8623 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8624 PIPE_SHADER_TESS_CTRL, false,
8625 &epilog_key, tm, debug,
8626 si_build_tcs_epilog_function,
8627 "Tessellation Control Shader Epilog");
8628 return shader->epilog != NULL;
8629 }
8630
8631 /**
8632 * Select and compile (or reuse) GS parts (prolog).
8633 */
8634 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8635 LLVMTargetMachineRef tm,
8636 struct si_shader *shader,
8637 struct pipe_debug_callback *debug)
8638 {
8639 union si_shader_part_key prolog_key;
8640
8641 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8642 return true;
8643
8644 memset(&prolog_key, 0, sizeof(prolog_key));
8645 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8646
8647 shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8648 PIPE_SHADER_GEOMETRY, true,
8649 &prolog_key, tm, debug,
8650 si_build_gs_prolog_function,
8651 "Geometry Shader Prolog");
8652 return shader->prolog != NULL;
8653 }
8654
8655 /**
8656 * Build the pixel shader prolog function. This handles:
8657 * - two-side color selection and interpolation
8658 * - overriding interpolation parameters for the API PS
8659 * - polygon stippling
8660 *
8661 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
8662 * overriden by other states. (e.g. per-sample interpolation)
8663 * Interpolated colors are stored after the preloaded VGPRs.
8664 */
8665 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
8666 union si_shader_part_key *key)
8667 {
8668 struct gallivm_state *gallivm = &ctx->gallivm;
8669 LLVMTypeRef *params;
8670 LLVMValueRef ret, func;
8671 int last_sgpr, num_params, num_returns, i, num_color_channels;
8672
8673 assert(si_need_ps_prolog(key));
8674
8675 /* Number of inputs + 8 color elements. */
8676 params = alloca((key->ps_prolog.num_input_sgprs +
8677 key->ps_prolog.num_input_vgprs + 8) *
8678 sizeof(LLVMTypeRef));
8679
8680 /* Declare inputs. */
8681 num_params = 0;
8682 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8683 params[num_params++] = ctx->i32;
8684 last_sgpr = num_params - 1;
8685
8686 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8687 params[num_params++] = ctx->f32;
8688
8689 /* Declare outputs (same as inputs + add colors if needed) */
8690 num_returns = num_params;
8691 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8692 for (i = 0; i < num_color_channels; i++)
8693 params[num_returns++] = ctx->f32;
8694
8695 /* Create the function. */
8696 si_create_function(ctx, "ps_prolog", params, num_returns, params,
8697 num_params, last_sgpr);
8698 func = ctx->main_fn;
8699
8700 /* Copy inputs to outputs. This should be no-op, as the registers match,
8701 * but it will prevent the compiler from overwriting them unintentionally.
8702 */
8703 ret = ctx->return_value;
8704 for (i = 0; i < num_params; i++) {
8705 LLVMValueRef p = LLVMGetParam(func, i);
8706 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8707 }
8708
8709 /* Polygon stippling. */
8710 if (key->ps_prolog.states.poly_stipple) {
8711 /* POS_FIXED_PT is always last. */
8712 unsigned pos = key->ps_prolog.num_input_sgprs +
8713 key->ps_prolog.num_input_vgprs - 1;
8714 LLVMValueRef ptr[2], list;
8715
8716 /* Get the pointer to rw buffers. */
8717 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8718 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8719 list = lp_build_gather_values(gallivm, ptr, 2);
8720 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8721 list = LLVMBuildIntToPtr(gallivm->builder, list,
8722 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
8723
8724 si_llvm_emit_polygon_stipple(ctx, list, pos);
8725 }
8726
8727 if (key->ps_prolog.states.bc_optimize_for_persp ||
8728 key->ps_prolog.states.bc_optimize_for_linear) {
8729 unsigned i, base = key->ps_prolog.num_input_sgprs;
8730 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8731
8732 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8733 * The hw doesn't compute CENTROID if the whole wave only
8734 * contains fully-covered quads.
8735 *
8736 * PRIM_MASK is after user SGPRs.
8737 */
8738 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8739 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8740 LLVMConstInt(ctx->i32, 31, 0), "");
8741 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8742 ctx->i1, "");
8743
8744 if (key->ps_prolog.states.bc_optimize_for_persp) {
8745 /* Read PERSP_CENTER. */
8746 for (i = 0; i < 2; i++)
8747 center[i] = LLVMGetParam(func, base + 2 + i);
8748 /* Read PERSP_CENTROID. */
8749 for (i = 0; i < 2; i++)
8750 centroid[i] = LLVMGetParam(func, base + 4 + i);
8751 /* Select PERSP_CENTROID. */
8752 for (i = 0; i < 2; i++) {
8753 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8754 center[i], centroid[i], "");
8755 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8756 tmp, base + 4 + i, "");
8757 }
8758 }
8759 if (key->ps_prolog.states.bc_optimize_for_linear) {
8760 /* Read LINEAR_CENTER. */
8761 for (i = 0; i < 2; i++)
8762 center[i] = LLVMGetParam(func, base + 8 + i);
8763 /* Read LINEAR_CENTROID. */
8764 for (i = 0; i < 2; i++)
8765 centroid[i] = LLVMGetParam(func, base + 10 + i);
8766 /* Select LINEAR_CENTROID. */
8767 for (i = 0; i < 2; i++) {
8768 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8769 center[i], centroid[i], "");
8770 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8771 tmp, base + 10 + i, "");
8772 }
8773 }
8774 }
8775
8776 /* Force per-sample interpolation. */
8777 if (key->ps_prolog.states.force_persp_sample_interp) {
8778 unsigned i, base = key->ps_prolog.num_input_sgprs;
8779 LLVMValueRef persp_sample[2];
8780
8781 /* Read PERSP_SAMPLE. */
8782 for (i = 0; i < 2; i++)
8783 persp_sample[i] = LLVMGetParam(func, base + i);
8784 /* Overwrite PERSP_CENTER. */
8785 for (i = 0; i < 2; i++)
8786 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8787 persp_sample[i], base + 2 + i, "");
8788 /* Overwrite PERSP_CENTROID. */
8789 for (i = 0; i < 2; i++)
8790 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8791 persp_sample[i], base + 4 + i, "");
8792 }
8793 if (key->ps_prolog.states.force_linear_sample_interp) {
8794 unsigned i, base = key->ps_prolog.num_input_sgprs;
8795 LLVMValueRef linear_sample[2];
8796
8797 /* Read LINEAR_SAMPLE. */
8798 for (i = 0; i < 2; i++)
8799 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8800 /* Overwrite LINEAR_CENTER. */
8801 for (i = 0; i < 2; i++)
8802 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8803 linear_sample[i], base + 8 + i, "");
8804 /* Overwrite LINEAR_CENTROID. */
8805 for (i = 0; i < 2; i++)
8806 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8807 linear_sample[i], base + 10 + i, "");
8808 }
8809
8810 /* Force center interpolation. */
8811 if (key->ps_prolog.states.force_persp_center_interp) {
8812 unsigned i, base = key->ps_prolog.num_input_sgprs;
8813 LLVMValueRef persp_center[2];
8814
8815 /* Read PERSP_CENTER. */
8816 for (i = 0; i < 2; i++)
8817 persp_center[i] = LLVMGetParam(func, base + 2 + i);
8818 /* Overwrite PERSP_SAMPLE. */
8819 for (i = 0; i < 2; i++)
8820 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8821 persp_center[i], base + i, "");
8822 /* Overwrite PERSP_CENTROID. */
8823 for (i = 0; i < 2; i++)
8824 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8825 persp_center[i], base + 4 + i, "");
8826 }
8827 if (key->ps_prolog.states.force_linear_center_interp) {
8828 unsigned i, base = key->ps_prolog.num_input_sgprs;
8829 LLVMValueRef linear_center[2];
8830
8831 /* Read LINEAR_CENTER. */
8832 for (i = 0; i < 2; i++)
8833 linear_center[i] = LLVMGetParam(func, base + 8 + i);
8834 /* Overwrite LINEAR_SAMPLE. */
8835 for (i = 0; i < 2; i++)
8836 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8837 linear_center[i], base + 6 + i, "");
8838 /* Overwrite LINEAR_CENTROID. */
8839 for (i = 0; i < 2; i++)
8840 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8841 linear_center[i], base + 10 + i, "");
8842 }
8843
8844 /* Interpolate colors. */
8845 for (i = 0; i < 2; i++) {
8846 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8847 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8848 key->ps_prolog.face_vgpr_index;
8849 LLVMValueRef interp[2], color[4];
8850 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8851
8852 if (!writemask)
8853 continue;
8854
8855 /* If the interpolation qualifier is not CONSTANT (-1). */
8856 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8857 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8858 key->ps_prolog.color_interp_vgpr_index[i];
8859
8860 /* Get the (i,j) updated by bc_optimize handling. */
8861 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8862 interp_vgpr, "");
8863 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8864 interp_vgpr + 1, "");
8865 interp_ij = lp_build_gather_values(gallivm, interp, 2);
8866 }
8867
8868 /* Use the absolute location of the input. */
8869 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8870
8871 if (key->ps_prolog.states.color_two_side) {
8872 face = LLVMGetParam(func, face_vgpr);
8873 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8874 }
8875
8876 interp_fs_input(ctx,
8877 key->ps_prolog.color_attr_index[i],
8878 TGSI_SEMANTIC_COLOR, i,
8879 key->ps_prolog.num_interp_inputs,
8880 key->ps_prolog.colors_read, interp_ij,
8881 prim_mask, face, color);
8882
8883 while (writemask) {
8884 unsigned chan = u_bit_scan(&writemask);
8885 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8886 num_params++, "");
8887 }
8888 }
8889
8890 /* Tell LLVM to insert WQM instruction sequence when needed. */
8891 if (key->ps_prolog.wqm) {
8892 LLVMAddTargetDependentFunctionAttr(func,
8893 "amdgpu-ps-wqm-outputs", "");
8894 }
8895
8896 si_llvm_build_ret(ctx, ret);
8897 }
8898
8899 /**
8900 * Build the pixel shader epilog function. This handles everything that must be
8901 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
8902 */
8903 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8904 union si_shader_part_key *key)
8905 {
8906 struct gallivm_state *gallivm = &ctx->gallivm;
8907 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8908 LLVMTypeRef params[16+8*4+3];
8909 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8910 int last_sgpr, num_params = 0, i;
8911 struct si_ps_exports exp = {};
8912
8913 /* Declare input SGPRs. */
8914 params[ctx->param_rw_buffers = num_params++] = ctx->i64;
8915 params[ctx->param_const_buffers = num_params++] = ctx->i64;
8916 params[ctx->param_samplers = num_params++] = ctx->i64;
8917 params[ctx->param_images = num_params++] = ctx->i64;
8918 params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8919 assert(num_params == SI_PARAM_ALPHA_REF);
8920 params[SI_PARAM_ALPHA_REF] = ctx->f32;
8921 last_sgpr = SI_PARAM_ALPHA_REF;
8922
8923 /* Declare input VGPRs. */
8924 num_params = (last_sgpr + 1) +
8925 util_bitcount(key->ps_epilog.colors_written) * 4 +
8926 key->ps_epilog.writes_z +
8927 key->ps_epilog.writes_stencil +
8928 key->ps_epilog.writes_samplemask;
8929
8930 num_params = MAX2(num_params,
8931 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
8932
8933 assert(num_params <= ARRAY_SIZE(params));
8934
8935 for (i = last_sgpr + 1; i < num_params; i++)
8936 params[i] = ctx->f32;
8937
8938 /* Create the function. */
8939 si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
8940 /* Disable elimination of unused inputs. */
8941 si_llvm_add_attribute(ctx->main_fn,
8942 "InitialPSInputAddr", 0xffffff);
8943
8944 /* Process colors. */
8945 unsigned vgpr = last_sgpr + 1;
8946 unsigned colors_written = key->ps_epilog.colors_written;
8947 int last_color_export = -1;
8948
8949 /* Find the last color export. */
8950 if (!key->ps_epilog.writes_z &&
8951 !key->ps_epilog.writes_stencil &&
8952 !key->ps_epilog.writes_samplemask) {
8953 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
8954
8955 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
8956 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
8957 /* Just set this if any of the colorbuffers are enabled. */
8958 if (spi_format &
8959 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
8960 last_color_export = 0;
8961 } else {
8962 for (i = 0; i < 8; i++)
8963 if (colors_written & (1 << i) &&
8964 (spi_format >> (i * 4)) & 0xf)
8965 last_color_export = i;
8966 }
8967 }
8968
8969 while (colors_written) {
8970 LLVMValueRef color[4];
8971 int mrt = u_bit_scan(&colors_written);
8972
8973 for (i = 0; i < 4; i++)
8974 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
8975
8976 si_export_mrt_color(bld_base, color, mrt,
8977 num_params - 1,
8978 mrt == last_color_export, &exp);
8979 }
8980
8981 /* Process depth, stencil, samplemask. */
8982 if (key->ps_epilog.writes_z)
8983 depth = LLVMGetParam(ctx->main_fn, vgpr++);
8984 if (key->ps_epilog.writes_stencil)
8985 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
8986 if (key->ps_epilog.writes_samplemask)
8987 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
8988
8989 if (depth || stencil || samplemask)
8990 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
8991 else if (last_color_export == -1)
8992 si_export_null(bld_base);
8993
8994 if (exp.num)
8995 si_emit_ps_exports(ctx, &exp);
8996
8997 /* Compile. */
8998 LLVMBuildRetVoid(gallivm->builder);
8999 }
9000
9001 /**
9002 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
9003 */
9004 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
9005 LLVMTargetMachineRef tm,
9006 struct si_shader *shader,
9007 struct pipe_debug_callback *debug)
9008 {
9009 union si_shader_part_key prolog_key;
9010 union si_shader_part_key epilog_key;
9011
9012 /* Get the prolog. */
9013 si_get_ps_prolog_key(shader, &prolog_key, true);
9014
9015 /* The prolog is a no-op if these aren't set. */
9016 if (si_need_ps_prolog(&prolog_key)) {
9017 shader->prolog =
9018 si_get_shader_part(sscreen, &sscreen->ps_prologs,
9019 PIPE_SHADER_FRAGMENT, true,
9020 &prolog_key, tm, debug,
9021 si_build_ps_prolog_function,
9022 "Fragment Shader Prolog");
9023 if (!shader->prolog)
9024 return false;
9025 }
9026
9027 /* Get the epilog. */
9028 si_get_ps_epilog_key(shader, &epilog_key);
9029
9030 shader->epilog =
9031 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
9032 PIPE_SHADER_FRAGMENT, false,
9033 &epilog_key, tm, debug,
9034 si_build_ps_epilog_function,
9035 "Fragment Shader Epilog");
9036 if (!shader->epilog)
9037 return false;
9038
9039 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
9040 if (shader->key.part.ps.prolog.poly_stipple) {
9041 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
9042 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
9043 }
9044
9045 /* Set up the enable bits for per-sample shading if needed. */
9046 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
9047 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
9048 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9049 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
9050 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
9051 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
9052 }
9053 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
9054 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
9055 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9056 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
9057 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9058 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
9059 }
9060 if (shader->key.part.ps.prolog.force_persp_center_interp &&
9061 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9062 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9063 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
9064 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
9065 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9066 }
9067 if (shader->key.part.ps.prolog.force_linear_center_interp &&
9068 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9069 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9070 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
9071 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9072 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9073 }
9074
9075 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
9076 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
9077 !(shader->config.spi_ps_input_ena & 0xf)) {
9078 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9079 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
9080 }
9081
9082 /* At least one pair of interpolation weights must be enabled. */
9083 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
9084 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9085 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
9086 }
9087
9088 /* The sample mask input is always enabled, because the API shader always
9089 * passes it through to the epilog. Disable it here if it's unused.
9090 */
9091 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
9092 !shader->selector->info.reads_samplemask)
9093 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
9094
9095 return true;
9096 }
9097
9098 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
9099 unsigned *lds_size)
9100 {
9101 /* SPI barrier management bug:
9102 * Make sure we have at least 4k of LDS in use to avoid the bug.
9103 * It applies to workgroup sizes of more than one wavefront.
9104 */
9105 if (sscreen->b.family == CHIP_BONAIRE ||
9106 sscreen->b.family == CHIP_KABINI ||
9107 sscreen->b.family == CHIP_MULLINS)
9108 *lds_size = MAX2(*lds_size, 8);
9109 }
9110
9111 static void si_fix_resource_usage(struct si_screen *sscreen,
9112 struct si_shader *shader)
9113 {
9114 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
9115
9116 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
9117
9118 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
9119 si_get_max_workgroup_size(shader) > 64) {
9120 si_multiwave_lds_size_workaround(sscreen,
9121 &shader->config.lds_size);
9122 }
9123 }
9124
9125 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
9126 struct si_shader *shader,
9127 struct pipe_debug_callback *debug)
9128 {
9129 struct si_shader_selector *sel = shader->selector;
9130 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
9131 int r;
9132
9133 /* LS, ES, VS are compiled on demand if the main part hasn't been
9134 * compiled for that stage.
9135 *
9136 * Vertex shaders are compiled on demand when a vertex fetch
9137 * workaround must be applied.
9138 */
9139 if (shader->is_monolithic) {
9140 /* Monolithic shader (compiled as a whole, has many variants,
9141 * may take a long time to compile).
9142 */
9143 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
9144 if (r)
9145 return r;
9146 } else {
9147 /* The shader consists of 2-3 parts:
9148 *
9149 * - the middle part is the user shader, it has 1 variant only
9150 * and it was compiled during the creation of the shader
9151 * selector
9152 * - the prolog part is inserted at the beginning
9153 * - the epilog part is inserted at the end
9154 *
9155 * The prolog and epilog have many (but simple) variants.
9156 */
9157
9158 /* Copy the compiled TGSI shader data over. */
9159 shader->is_binary_shared = true;
9160 shader->binary = mainp->binary;
9161 shader->config = mainp->config;
9162 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
9163 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
9164 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
9165 memcpy(shader->info.vs_output_param_offset,
9166 mainp->info.vs_output_param_offset,
9167 sizeof(mainp->info.vs_output_param_offset));
9168 shader->info.uses_instanceid = mainp->info.uses_instanceid;
9169 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
9170 shader->info.nr_param_exports = mainp->info.nr_param_exports;
9171
9172 /* Select prologs and/or epilogs. */
9173 switch (sel->type) {
9174 case PIPE_SHADER_VERTEX:
9175 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
9176 return -1;
9177 break;
9178 case PIPE_SHADER_TESS_CTRL:
9179 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
9180 return -1;
9181 break;
9182 case PIPE_SHADER_TESS_EVAL:
9183 if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
9184 return -1;
9185 break;
9186 case PIPE_SHADER_GEOMETRY:
9187 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
9188 return -1;
9189 break;
9190 case PIPE_SHADER_FRAGMENT:
9191 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
9192 return -1;
9193
9194 /* Make sure we have at least as many VGPRs as there
9195 * are allocated inputs.
9196 */
9197 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9198 shader->info.num_input_vgprs);
9199 break;
9200 }
9201
9202 /* Update SGPR and VGPR counts. */
9203 if (shader->prolog) {
9204 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9205 shader->prolog->config.num_sgprs);
9206 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9207 shader->prolog->config.num_vgprs);
9208 }
9209 if (shader->previous_stage) {
9210 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9211 shader->previous_stage->config.num_sgprs);
9212 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9213 shader->previous_stage->config.num_vgprs);
9214 shader->config.spilled_sgprs =
9215 MAX2(shader->config.spilled_sgprs,
9216 shader->previous_stage->config.spilled_sgprs);
9217 shader->config.spilled_vgprs =
9218 MAX2(shader->config.spilled_vgprs,
9219 shader->previous_stage->config.spilled_vgprs);
9220 shader->config.private_mem_vgprs =
9221 MAX2(shader->config.private_mem_vgprs,
9222 shader->previous_stage->config.private_mem_vgprs);
9223 shader->config.scratch_bytes_per_wave =
9224 MAX2(shader->config.scratch_bytes_per_wave,
9225 shader->previous_stage->config.scratch_bytes_per_wave);
9226 shader->info.uses_instanceid |=
9227 shader->previous_stage->info.uses_instanceid;
9228 }
9229 if (shader->prolog2) {
9230 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9231 shader->prolog2->config.num_sgprs);
9232 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9233 shader->prolog2->config.num_vgprs);
9234 }
9235 if (shader->epilog) {
9236 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9237 shader->epilog->config.num_sgprs);
9238 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9239 shader->epilog->config.num_vgprs);
9240 }
9241 }
9242
9243 si_fix_resource_usage(sscreen, shader);
9244 si_shader_dump(sscreen, shader, debug, sel->info.processor,
9245 stderr, true);
9246
9247 /* Upload. */
9248 r = si_shader_binary_upload(sscreen, shader);
9249 if (r) {
9250 fprintf(stderr, "LLVM failed to upload shader\n");
9251 return r;
9252 }
9253
9254 return 0;
9255 }
9256
9257 void si_shader_destroy(struct si_shader *shader)
9258 {
9259 if (shader->scratch_bo)
9260 r600_resource_reference(&shader->scratch_bo, NULL);
9261
9262 r600_resource_reference(&shader->bo, NULL);
9263
9264 if (!shader->is_binary_shared)
9265 radeon_shader_binary_clean(&shader->binary);
9266
9267 free(shader->shader_log);
9268 }