radeonsi: fix tess offchip offset for per-patch attributes
[mesa.git] src/gallium/drivers/radeonsi/si_shader.c
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Tom Stellard <thomas.stellard@amd.com>
 *      Michel Dänzer <michel.daenzer@amd.com>
 *      Christian König <christian.koenig@amd.com>
 */

#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_misc.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"


static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};

static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};

static bool is_merged_shader(struct si_shader *shader)
{
	if (shader->selector->screen->b.chip_class <= VI)
		return false;

	return shader->key.as_ls ||
	       shader->key.as_es ||
	       shader->selector->type == PIPE_SHADER_TESS_CTRL ||
	       shader->selector->type == PIPE_SHADER_GEOMETRY;
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-4)
			return 4 + index;

		assert(!"invalid generic index");
		return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		assert(!"invalid semantic name");
		return 0;
	}
}
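
/* Illustrative mapping implied by the switch above (not exhaustive):
 *   POSITION       -> 0
 *   PSIZE          -> 1
 *   CLIPDIST 0..1  -> 2..3
 *   GENERIC n      -> 4 + n    (n <= 59)
 * and, in the separate per-patch namespace:
 *   TESSOUTER      -> 0
 *   TESSINNER      -> 1
 *   PATCH n        -> 2 + n
 */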

unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
{
	switch (name) {
	case TGSI_SEMANTIC_FOG:
		return 0;
	case TGSI_SEMANTIC_LAYER:
		return 1;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return 2;
	case TGSI_SEMANTIC_PRIMID:
		return 3;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		return 4 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 6 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = LLVMGetParam(ctx->main_fn,
					  param);

	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
		value = bitcast(&ctx->bld_base,
				TGSI_TYPE_UNSIGNED, value);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      LLVMConstInt(ctx->i32, rshift, 0), "");

	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     LLVMConstInt(ctx->i32, mask, 0), "");
	}

	return value;
}
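
/* Example (derived from the code above): unpack_param(ctx, P, 8, 5)
 * computes (P >> 8) & 0x1f, i.e. it extracts the 5-bit field stored at
 * bit offset 8 of the 32-bit shader argument P.
 */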

static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
{
	switch (ctx->type) {
	case PIPE_SHADER_TESS_CTRL:
		return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);

	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_rel_patch_id);

	default:
		assert(0);
		return NULL;
	}
}

/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
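
/* Worked example with hypothetical sizes (for illustration only):
 * assume 3 patches in LDS, an input patch stride of 64 dwords and an
 * output patch stride of 48 dwords, so the output area starts at
 * patch0_offset = 3 * 64 = 192 dwords. Then, for RelPatchID == 2:
 *
 *   get_tcs_in_current_patch_offset  = 2 * 64       = 128 dwords
 *   get_tcs_out_current_patch_offset = 192 + 2 * 48 = 288 dwords
 *
 * The real strides and offsets are unpacked from SGPRs by the helpers
 * below; the numbers here are made up.
 */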

static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}

static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}

static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}

static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}

static LLVMValueRef
get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
}

static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef get_instance_index_for_fetch(
	struct si_shader_context *ctx,
	unsigned param_start_instance, unsigned divisor)
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMValueRef result = LLVMGetParam(ctx->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				       LLVMConstInt(ctx->i32, divisor, 0), "");

	return LLVMBuildAdd(gallivm->builder, result,
			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
}

/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 * to float. */
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
					    LLVMValueRef vec4,
					    unsigned double_index)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
					      LLVMVectorType(f64, 2), "");
	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
	return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
}

static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
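		/* Worked example of the SNORM trick (IEEE-754 encodings):
		 * 1.0 = 0x3F800000 has exponent 127 (LSBs 0b11),
		 * 0.666.. = 0x3F2AAAAB has exponent 126 (0b10),
		 * 0.333.. = 0x3EAAAAAB has exponent 125 (0b01), and
		 * 0.0 has exponent bits 0b00. The shl by 7 moves those two
		 * exponent bits into bits 31:30, and the ashr by 30 then
		 * sign-extends them to 0, 1, -2, -1, i.e. the signed 2-bit
		 * values.
		 */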
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}

static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
				     unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	if (swizzle > 0)
		return ctx->i32_0;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tcs_patch_id);
	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_patch_id);
	case PIPE_SHADER_GEOMETRY:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_gs_prim_id);
	default:
		assert(0);
		return ctx->i32_0;
	}
}

/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 */
static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
				       const struct tgsi_ind_register *ind,
				       int rel_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef result;

	result = ctx->addrs[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	result = LLVMBuildAdd(gallivm->builder, result,
			      LLVMConstInt(ctx->i32, rel_index, 0), "");
	return result;
}

/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);

	/* LLVM 3.8: If indirect resource indexing is used:
	 * - SI & CIK hang
	 * - VI crashes
	 */
	if (HAVE_LLVM == 0x0308)
		return LLVMGetUndef(ctx->i32);

	return si_llvm_bound_index(ctx, result, num);
}


/**
 * Calculate a dword address given an input or output register and a stride.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}

/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
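
/* Worked example with hypothetical sizes: with 3 vertices per patch and
 * 8 patches, total_vertices = 24, so per-vertex attribute 2 of patch 1,
 * vertex 0 starts at ((1 * 3 + 0) + 2 * 24) * 16 = 816 bytes, and
 * per-patch attribute 1 of patch 1 starts at (1 + 1 * 8) * 16 = 144
 * bytes past the per-patch data offset. This is what
 * get_tcs_tes_buffer_address below computes.
 */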
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}

static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
			struct si_shader_context *ctx,
			const struct tgsi_full_dst_register *dst,
			const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}

static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool readonly_memory)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, readonly_memory);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, readonly_memory);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}

/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}

/**
 * Store to LDS.
 *
 * \param dw_offset_imm	offset (typically 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}

static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
					   unsigned param)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;

	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
	addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
	addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");

	uint64_t desc2 = 0xffffffff;
	uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
	LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);

	LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
	desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
	desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
	return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
}
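
/* An illustrative reading of the descriptor built above, based on the
 * GCN buffer resource layout (assumption, not spelled out in this file):
 * dwords 0-1 hold the 48-bit base address (the 64K-aligned input shifted
 * left by 16), dword 2 (num_records) is 0xffffffff so range checking
 * never fails, and dword 3 selects X/Y/Z/W unswizzled with a 32-bit
 * float format.
 */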

static LLVMValueRef fetch_input_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	dw_addr = get_tcs_in_current_patch_offset(ctx);
	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_output_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef buffer, base, addr;

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}

static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);

	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}

static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);
	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}

static int lookup_interp_param_index(unsigned interpolate, unsigned location)
{
	switch (interpolate) {
	case TGSI_INTERPOLATE_CONSTANT:
		return 0;

	case TGSI_INTERPOLATE_LINEAR:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_LINEAR_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_LINEAR_CENTROID;
		else
			return SI_PARAM_LINEAR_CENTER;
		break;
	case TGSI_INTERPOLATE_COLOR:
	case TGSI_INTERPOLATE_PERSPECTIVE:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_PERSP_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_PERSP_CENTROID;
		else
			return SI_PARAM_PERSP_CENTER;
		break;
	default:
		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
		return -1;
	}
}

/**
 * Interpolate a fragment shader input.
 *
 * @param ctx			context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j)
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef attr_number;
	LLVMValueRef i, j;

	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false, we use fs.constant or, with newer LLVM,
	 * amdgcn.interp.mov.
	 */
1239 bool interp = interp_param != NULL;
1240
1241 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
1242
1243 if (interp) {
1244 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1245 LLVMVectorType(ctx->f32, 2), "");
1246
1247 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1248 ctx->i32_0, "");
1249 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1250 ctx->i32_1, "");
1251 }
1252
1253 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1254 ctx->shader->key.part.ps.prolog.color_two_side) {
1255 LLVMValueRef is_face_positive;
1256 LLVMValueRef back_attr_number;
1257
1258 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1259 * otherwise it's at offset "num_inputs".
1260 */
1261 unsigned back_attr_offset = num_interp_inputs;
1262 if (semantic_index == 1 && colors_read_mask & 0xf)
1263 back_attr_offset += 1;
1264
1265 back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
1266
1267 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1268 face, ctx->i32_0, "");
1269
1270 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1271 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1272 LLVMValueRef front, back;
1273
1274 if (interp) {
1275 front = ac_build_fs_interp(&ctx->ac, llvm_chan,
1276 attr_number, prim_mask,
1277 i, j);
1278 back = ac_build_fs_interp(&ctx->ac, llvm_chan,
1279 back_attr_number, prim_mask,
1280 i, j);
1281 } else {
1282 front = ac_build_fs_interp_mov(&ctx->ac,
1283 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1284 llvm_chan, attr_number, prim_mask);
1285 back = ac_build_fs_interp_mov(&ctx->ac,
1286 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1287 llvm_chan, back_attr_number, prim_mask);
1288 }
1289
1290 result[chan] = LLVMBuildSelect(gallivm->builder,
1291 is_face_positive,
1292 front,
1293 back,
1294 "");
1295 }
1296 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1297 if (interp) {
1298 result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
1299 attr_number, prim_mask, i, j);
1300 } else {
1301 result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
1302 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1303 attr_number, prim_mask);
1304 }
1305 result[1] =
1306 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1307 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1308 } else {
1309 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1310 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1311
1312 if (interp) {
1313 result[chan] = ac_build_fs_interp(&ctx->ac,
1314 llvm_chan, attr_number, prim_mask, i, j);
1315 } else {
1316 result[chan] = ac_build_fs_interp_mov(&ctx->ac,
1317 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1318 llvm_chan, attr_number, prim_mask);
1319 }
1320 }
1321 }
1322 }
1323
1324 static void declare_input_fs(
1325 struct si_shader_context *ctx,
1326 unsigned input_index,
1327 const struct tgsi_full_declaration *decl,
1328 LLVMValueRef out[4])
1329 {
1330 struct lp_build_context *base = &ctx->bld_base.base;
1331 struct si_shader *shader = ctx->shader;
1332 LLVMValueRef main_fn = ctx->main_fn;
1333 LLVMValueRef interp_param = NULL;
1334 int interp_param_idx;
1335
1336 /* Get colors from input VGPRs (set by the prolog). */
1337 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1338 unsigned i = decl->Semantic.Index;
1339 unsigned colors_read = shader->selector->info.colors_read;
1340 unsigned mask = colors_read >> (i * 4);
1341 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1342 (i ? util_bitcount(colors_read & 0xf) : 0);
1343
1344 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1345 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1346 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1347 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1348 return;
1349 }
1350
1351 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1352 decl->Interp.Location);
1353 if (interp_param_idx == -1)
1354 return;
1355 else if (interp_param_idx) {
1356 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1357 }
1358
1359 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1360 decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1361 ctx->shader->key.part.ps.prolog.flatshade_colors)
1362 interp_param = NULL; /* load the constant color */
1363
1364 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1365 decl->Semantic.Index, shader->selector->info.num_inputs,
1366 shader->selector->info.colors_read, interp_param,
1367 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1368 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1369 &out[0]);
1370 }
1371
1372 static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
1373 {
1374 return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1375 }
1376
1377
1378 /**
1379 * Load a dword from a constant buffer.
1380 */
1381 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1382 LLVMValueRef resource,
1383 LLVMValueRef offset)
1384 {
1385 LLVMBuilderRef builder = ctx->gallivm.builder;
1386 LLVMValueRef args[2] = {resource, offset};
1387
1388 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1389 LP_FUNC_ATTR_READNONE |
1390 LP_FUNC_ATTR_LEGACY);
1391 }
1392
1393 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1394 {
1395 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1396 struct gallivm_state *gallivm = &ctx->gallivm;
1397 LLVMBuilderRef builder = gallivm->builder;
1398 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1399 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1400 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1401
1402 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1403 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1404 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1405
1406 LLVMValueRef pos[4] = {
1407 buffer_load_const(ctx, resource, offset0),
1408 buffer_load_const(ctx, resource, offset1),
1409 LLVMConstReal(ctx->f32, 0),
1410 LLVMConstReal(ctx->f32, 0)
1411 };
1412
1413 return lp_build_gather_values(gallivm, pos, 4);
1414 }
1415
1416 static void declare_system_value(struct si_shader_context *ctx,
1417 unsigned index,
1418 const struct tgsi_full_declaration *decl)
1419 {
1420 struct lp_build_context *bld = &ctx->bld_base.base;
1421 struct gallivm_state *gallivm = &ctx->gallivm;
1422 LLVMValueRef value = 0;
1423
1424 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1425
1426 switch (decl->Semantic.Name) {
1427 case TGSI_SEMANTIC_INSTANCEID:
1428 value = LLVMGetParam(ctx->main_fn,
1429 ctx->param_instance_id);
1430 break;
1431
1432 case TGSI_SEMANTIC_VERTEXID:
1433 value = LLVMBuildAdd(gallivm->builder,
1434 LLVMGetParam(ctx->main_fn,
1435 ctx->param_vertex_id),
1436 LLVMGetParam(ctx->main_fn,
1437 ctx->param_base_vertex), "");
1438 break;
1439
1440 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1441 /* Unused. Clarify the meaning in indexed vs. non-indexed
1442 * draws if this is ever used again. */
1443 assert(false);
1444 break;
1445
1446 case TGSI_SEMANTIC_BASEVERTEX:
1447 {
1448 /* For non-indexed draws, the base vertex set by the driver
1449 * (for direct draws) or the CP (for indirect draws) is the
1450 * first vertex ID, but GLSL expects 0 to be returned.
1451 */
1452 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1453 LLVMValueRef indexed;
1454
1455 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1456 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1457
1458 value = LLVMBuildSelect(gallivm->builder, indexed,
1459 LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1460 ctx->i32_0, "");
1461 break;
1462 }
1463
1464 case TGSI_SEMANTIC_BASEINSTANCE:
1465 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1466 break;
1467
1468 case TGSI_SEMANTIC_DRAWID:
1469 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1470 break;
1471
1472 case TGSI_SEMANTIC_INVOCATIONID:
1473 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1474 value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1475 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1476 value = LLVMGetParam(ctx->main_fn,
1477 ctx->param_gs_instance_id);
1478 else
1479 assert(!"INVOCATIONID not implemented");
1480 break;
1481
1482 case TGSI_SEMANTIC_POSITION:
1483 {
1484 LLVMValueRef pos[4] = {
1485 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1486 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1487 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1488 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1489 LLVMGetParam(ctx->main_fn,
1490 SI_PARAM_POS_W_FLOAT)),
1491 };
1492 value = lp_build_gather_values(gallivm, pos, 4);
1493 break;
1494 }
1495
1496 case TGSI_SEMANTIC_FACE:
1497 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1498 break;
1499
1500 case TGSI_SEMANTIC_SAMPLEID:
1501 value = get_sample_id(ctx);
1502 break;
1503
1504 case TGSI_SEMANTIC_SAMPLEPOS: {
1505 LLVMValueRef pos[4] = {
1506 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1507 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1508 LLVMConstReal(ctx->f32, 0),
1509 LLVMConstReal(ctx->f32, 0)
1510 };
1511 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1512 TGSI_OPCODE_FRC, pos[0]);
1513 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1514 TGSI_OPCODE_FRC, pos[1]);
1515 value = lp_build_gather_values(gallivm, pos, 4);
1516 break;
1517 }
1518
1519 case TGSI_SEMANTIC_SAMPLEMASK:
1520 /* This can only occur with the OpenGL Core profile, which
1521 * doesn't support smoothing.
1522 */
1523 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1524 break;
1525
1526 case TGSI_SEMANTIC_TESSCOORD:
1527 {
1528 LLVMValueRef coord[4] = {
1529 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1530 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1531 bld->zero,
1532 bld->zero
1533 };
1534
1535 /* For triangles, the vector should be (u, v, 1-u-v). */
1536 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1537 PIPE_PRIM_TRIANGLES)
1538 coord[2] = lp_build_sub(bld, bld->one,
1539 lp_build_add(bld, coord[0], coord[1]));
1540
1541 value = lp_build_gather_values(gallivm, coord, 4);
1542 break;
1543 }
1544
1545 case TGSI_SEMANTIC_VERTICESIN:
1546 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1547 value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1548 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1549 value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
1550 else
1551 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1552 break;
1553
1554 case TGSI_SEMANTIC_TESSINNER:
1555 case TGSI_SEMANTIC_TESSOUTER:
1556 {
1557 LLVMValueRef buffer, base, addr;
1558 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1559
1560 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1561
1562 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1563 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1564 LLVMConstInt(ctx->i32, param, 0));
1565
1566 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1567 ~0, buffer, base, addr, true);
1568
1569 break;
1570 }
1571
1572 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1573 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1574 {
1575 LLVMValueRef buf, slot, val[4];
1576 int i, offset;
1577
1578 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1579 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1580 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1581 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1582
1583 for (i = 0; i < 4; i++)
1584 val[i] = buffer_load_const(ctx, buf,
1585 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1586 value = lp_build_gather_values(gallivm, val, 4);
1587 break;
1588 }
1589
1590 case TGSI_SEMANTIC_PRIMID:
1591 value = get_primitive_id(&ctx->bld_base, 0);
1592 break;
1593
1594 case TGSI_SEMANTIC_GRID_SIZE:
1595 value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
1596 break;
1597
1598 case TGSI_SEMANTIC_BLOCK_SIZE:
1599 {
1600 LLVMValueRef values[3];
1601 unsigned i;
1602 unsigned *properties = ctx->shader->selector->info.properties;
1603
1604 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1605 unsigned sizes[3] = {
1606 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1607 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1608 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1609 };
1610
1611 for (i = 0; i < 3; ++i)
1612 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1613
1614 value = lp_build_gather_values(gallivm, values, 3);
1615 } else {
1616 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
1617 }
1618 break;
1619 }
1620
1621 case TGSI_SEMANTIC_BLOCK_ID:
1622 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
1623 break;
1624
1625 case TGSI_SEMANTIC_THREAD_ID:
1626 value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
1627 break;
1628
1629 case TGSI_SEMANTIC_HELPER_INVOCATION:
1630 if (HAVE_LLVM >= 0x0309) {
1631 value = lp_build_intrinsic(gallivm->builder,
1632 "llvm.amdgcn.ps.live",
1633 ctx->i1, NULL, 0,
1634 LP_FUNC_ATTR_READNONE);
1635 value = LLVMBuildNot(gallivm->builder, value, "");
1636 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1637 } else {
1638 assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
1639 return;
1640 }
1641 break;
1642
1643 case TGSI_SEMANTIC_SUBGROUP_SIZE:
1644 value = LLVMConstInt(ctx->i32, 64, 0);
1645 break;
1646
1647 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1648 value = ac_get_thread_id(&ctx->ac);
1649 break;
1650
1651 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1652 {
1653 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1654 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1655 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1656 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1657 break;
1658 }
1659
1660 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1661 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1662 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1663 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1664 {
1665 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1666 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1667 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1668 /* All bits set except LSB */
1669 value = LLVMConstInt(ctx->i64, -2, 0);
1670 } else {
1671 /* All bits set */
1672 value = LLVMConstInt(ctx->i64, -1, 0);
1673 }
1674 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1675 value = LLVMBuildShl(gallivm->builder, value, id, "");
1676 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1677 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1678 value = LLVMBuildNot(gallivm->builder, value, "");
1679 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1680 break;
1681 }
1682
1683 default:
1684 assert(!"unknown system value");
1685 return;
1686 }
1687
1688 ctx->system_values[index] = value;
1689 }
1690
1691 static void declare_compute_memory(struct si_shader_context *ctx,
1692 const struct tgsi_full_declaration *decl)
1693 {
1694 struct si_shader_selector *sel = ctx->shader->selector;
1695 struct gallivm_state *gallivm = &ctx->gallivm;
1696
1697 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1698 LLVMValueRef var;
1699
1700 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1701 assert(decl->Range.First == decl->Range.Last);
1702 assert(!ctx->shared_memory);
1703
1704 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1705 LLVMArrayType(ctx->i8, sel->local_size),
1706 "compute_lds",
1707 LOCAL_ADDR_SPACE);
1708 LLVMSetAlignment(var, 4);
1709
1710 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1711 }
1712
1713 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1714 {
1715 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1716 ctx->param_const_buffers);
1717
1718 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1719 LLVMConstInt(ctx->i32, i, 0));
1720 }
1721
1722 static LLVMValueRef fetch_constant(
1723 struct lp_build_tgsi_context *bld_base,
1724 const struct tgsi_full_src_register *reg,
1725 enum tgsi_opcode_type type,
1726 unsigned swizzle)
1727 {
1728 struct si_shader_context *ctx = si_shader_context(bld_base);
1729 struct lp_build_context *base = &bld_base->base;
1730 const struct tgsi_ind_register *ireg = &reg->Indirect;
1731 unsigned buf, idx;
1732
1733 LLVMValueRef addr, bufp;
1734 LLVMValueRef result;
1735
1736 if (swizzle == LP_CHAN_ALL) {
1737 unsigned chan;
1738 LLVMValueRef values[4];
1739 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1740 values[chan] = fetch_constant(bld_base, reg, type, chan);
1741
1742 return lp_build_gather_values(&ctx->gallivm, values, 4);
1743 }
1744
1745 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1746 idx = reg->Register.Index * 4 + swizzle;
1747
1748 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1749 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
1750 LLVMValueRef index;
1751 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1752 reg->Dimension.Index,
1753 SI_NUM_CONST_BUFFERS);
1754 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1755 } else
1756 bufp = load_const_buffer_desc(ctx, buf);
1757
1758 if (reg->Register.Indirect) {
1759 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1760 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1761 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1762 addr = lp_build_add(&bld_base->uint_bld, addr,
1763 LLVMConstInt(ctx->i32, idx * 4, 0));
1764 } else {
1765 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1766 }
1767
1768 result = buffer_load_const(ctx, bufp, addr);
1769
1770 if (!tgsi_type_is_64bit(type))
1771 result = bitcast(bld_base, type, result);
1772 else {
1773 LLVMValueRef addr2, result2;
1774
1775 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1776 LLVMConstInt(ctx->i32, 4, 0));
1777 result2 = buffer_load_const(ctx, bufp, addr2);
1778
1779 result = si_llvm_emit_fetch_64bit(bld_base, type,
1780 result, result2);
1781 }
1782 return result;
1783 }
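/* Address-math sketch for fetch_constant (hypothetical register names):
 * a direct CONST[3].y gives idx = 3 * 4 + 1 = 13, i.e. byte offset
 * 13 * 4 = 52; an indirect CONST[ADDR[0].x + 3].y loads the address
 * register, scales it by 16 (one vec4 in bytes) and adds the same 52.
 */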
1784
1785 /* Upper 16 bits must be zero. */
1786 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1787 LLVMValueRef val[2])
1788 {
1789 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1790 LLVMBuildShl(ctx->gallivm.builder, val[1],
1791 LLVMConstInt(ctx->i32, 16, 0),
1792 ""), "");
1793 }
1794
1795 /* Upper 16 bits are ignored and will be dropped. */
1796 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1797 LLVMValueRef val[2])
1798 {
1799 LLVMValueRef v[2] = {
1800 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1801 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1802 val[1],
1803 };
1804 return si_llvm_pack_two_int16(ctx, v);
1805 }
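/* Packing sketch (illustrative values): si_llvm_pack_two_int16 with
 * val = { 0x1234, 0xABCD } yields 0xABCD1234. The int32_as_int16
 * variant masks val[0] with 0xffff and lets val[1]'s upper bits shift
 * out, so val = { 0xFFFF8001, 0x00007FFF } packs to 0x7FFF8001.
 */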
1806
1807 /* Initialize arguments for the shader export intrinsic */
1808 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1809 LLVMValueRef *values,
1810 unsigned target,
1811 struct ac_export_args *args)
1812 {
1813 struct si_shader_context *ctx = si_shader_context(bld_base);
1814 struct lp_build_context *base = &bld_base->base;
1815 LLVMBuilderRef builder = ctx->gallivm.builder;
1816 LLVMValueRef val[4];
1817 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1818 unsigned chan;
1819 bool is_int8, is_int10;
1820
1821 /* Default is 0xf. Adjusted below depending on the format. */
1822 args->enabled_channels = 0xf; /* writemask */
1823
1824 /* Specify whether the EXEC mask represents the valid mask */
1825 args->valid_mask = 0;
1826
1827 /* Specify whether this is the last export */
1828 args->done = 0;
1829
1830 /* Specify the target we are exporting */
1831 args->target = target;
1832
1833 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1834 const struct si_shader_key *key = &ctx->shader->key;
1835 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1836 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1837
1838 assert(cbuf >= 0 && cbuf < 8);
1839 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1840 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1841 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1842 }
1843
1844 args->compr = false;
1845 args->out[0] = base->undef;
1846 args->out[1] = base->undef;
1847 args->out[2] = base->undef;
1848 args->out[3] = base->undef;
1849
1850 switch (spi_shader_col_format) {
1851 case V_028714_SPI_SHADER_ZERO:
1852 args->enabled_channels = 0; /* writemask */
1853 args->target = V_008DFC_SQ_EXP_NULL;
1854 break;
1855
1856 case V_028714_SPI_SHADER_32_R:
1857 args->enabled_channels = 1; /* writemask */
1858 args->out[0] = values[0];
1859 break;
1860
1861 case V_028714_SPI_SHADER_32_GR:
1862 args->enabled_channels = 0x3; /* writemask */
1863 args->out[0] = values[0];
1864 args->out[1] = values[1];
1865 break;
1866
1867 case V_028714_SPI_SHADER_32_AR:
1868 args->enabled_channels = 0x9; /* writemask */
1869 args->out[0] = values[0];
1870 args->out[3] = values[3];
1871 break;
1872
1873 case V_028714_SPI_SHADER_FP16_ABGR:
1874 args->compr = 1; /* COMPR flag */
1875
1876 for (chan = 0; chan < 2; chan++) {
1877 LLVMValueRef pack_args[2] = {
1878 values[2 * chan],
1879 values[2 * chan + 1]
1880 };
1881 LLVMValueRef packed;
1882
1883 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1884 args->out[chan] =
1885 LLVMBuildBitCast(ctx->gallivm.builder,
1886 packed, ctx->f32, "");
1887 }
1888 break;
1889
1890 case V_028714_SPI_SHADER_UNORM16_ABGR:
1891 for (chan = 0; chan < 4; chan++) {
1892 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1893 val[chan] = LLVMBuildFMul(builder, val[chan],
1894 LLVMConstReal(ctx->f32, 65535), "");
1895 val[chan] = LLVMBuildFAdd(builder, val[chan],
1896 LLVMConstReal(ctx->f32, 0.5), "");
1897 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1898 ctx->i32, "");
1899 }
1900
1901 args->compr = 1; /* COMPR flag */
1902 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1903 si_llvm_pack_two_int16(ctx, val));
1904 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1905 si_llvm_pack_two_int16(ctx, val+2));
1906 break;
1907
1908 case V_028714_SPI_SHADER_SNORM16_ABGR:
1909 for (chan = 0; chan < 4; chan++) {
1910 /* Clamp between [-1, 1]. */
1911 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1912 values[chan],
1913 LLVMConstReal(ctx->f32, 1));
1914 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1915 val[chan],
1916 LLVMConstReal(ctx->f32, -1));
1917 /* Convert to a signed integer in [-32767, 32767]. */
1918 val[chan] = LLVMBuildFMul(builder, val[chan],
1919 LLVMConstReal(ctx->f32, 32767), "");
1920 /* If non-negative, add 0.5, else add -0.5. */
1921 val[chan] = LLVMBuildFAdd(builder, val[chan],
1922 LLVMBuildSelect(builder,
1923 LLVMBuildFCmp(builder, LLVMRealOGE,
1924 val[chan], base->zero, ""),
1925 LLVMConstReal(ctx->f32, 0.5),
1926 LLVMConstReal(ctx->f32, -0.5), ""), "");
1927 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1928 }
1929
1930 args->compr = 1; /* COMPR flag */
1931 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1932 si_llvm_pack_two_int32_as_int16(ctx, val));
1933 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1934 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1935 break;
1936
1937 case V_028714_SPI_SHADER_UINT16_ABGR: {
1938 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1939 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1940 LLVMValueRef max_alpha =
1941 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1942
1943 /* Clamp. */
1944 for (chan = 0; chan < 4; chan++) {
1945 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1946 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1947 val[chan],
1948 chan == 3 ? max_alpha : max_rgb);
1949 }
1950
1951 args->compr = 1; /* COMPR flag */
1952 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1953 si_llvm_pack_two_int16(ctx, val));
1954 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1955 si_llvm_pack_two_int16(ctx, val+2));
1956 break;
1957 }
1958
1959 case V_028714_SPI_SHADER_SINT16_ABGR: {
1960 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1961 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1962 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1963 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1964 LLVMValueRef max_alpha =
1965 !is_int10 ? max_rgb : ctx->i32_1;
1966 LLVMValueRef min_alpha =
1967 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1968
1969 /* Clamp. */
1970 for (chan = 0; chan < 4; chan++) {
1971 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1972 val[chan] = lp_build_emit_llvm_binary(bld_base,
1973 TGSI_OPCODE_IMIN,
1974 val[chan], chan == 3 ? max_alpha : max_rgb);
1975 val[chan] = lp_build_emit_llvm_binary(bld_base,
1976 TGSI_OPCODE_IMAX,
1977 val[chan], chan == 3 ? min_alpha : min_rgb);
1978 }
1979
1980 args->compr = 1; /* COMPR flag */
1981 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1982 si_llvm_pack_two_int32_as_int16(ctx, val));
1983 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1984 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1985 break;
1986 }
1987
1988 case V_028714_SPI_SHADER_32_ABGR:
1989 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1990 break;
1991 }
1992 }
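/* Conversion sketch for the UNORM16/SNORM16 cases above (illustrative):
 *   UNORM16: 0.25 -> 0.25 * 65535 + 0.5 = 16384.25 -> fptoui -> 16384
 *   SNORM16: -0.5 -> -0.5 * 32767 + (-0.5) = -16384.0 -> fptosi -> -16384
 * Adding +/-0.5 before the float-to-int truncation implements
 * round-to-nearest for both signs.
 */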
1993
1994 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1995 LLVMValueRef alpha)
1996 {
1997 struct si_shader_context *ctx = si_shader_context(bld_base);
1998
1999 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2000 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2001 SI_PARAM_ALPHA_REF);
2002
2003 LLVMValueRef alpha_pass =
2004 lp_build_cmp(&bld_base->base,
2005 ctx->shader->key.part.ps.epilog.alpha_func,
2006 alpha, alpha_ref);
2007 LLVMValueRef arg =
2008 lp_build_select(&bld_base->base,
2009 alpha_pass,
2010 LLVMConstReal(ctx->f32, 1.0f),
2011 LLVMConstReal(ctx->f32, -1.0f));
2012
2013 ac_build_kill(&ctx->ac, arg);
2014 } else {
2015 ac_build_kill(&ctx->ac, NULL);
2016 }
2017 }
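/* Kill-semantics note: the AMDGPU kill intrinsic discards the pixel when
 * its operand is negative, so the select above feeds it +1.0 on a passing
 * alpha test and -1.0 on a failing one; the NULL argument in the
 * PIPE_FUNC_NEVER path kills unconditionally.
 */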
2018
2019 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2020 LLVMValueRef alpha,
2021 unsigned samplemask_param)
2022 {
2023 struct si_shader_context *ctx = si_shader_context(bld_base);
2024 struct gallivm_state *gallivm = &ctx->gallivm;
2025 LLVMValueRef coverage;
2026
2027 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2028 coverage = LLVMGetParam(ctx->main_fn,
2029 samplemask_param);
2030 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2031
2032 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2033 ctx->i32,
2034 &coverage, 1, LP_FUNC_ATTR_READNONE);
2035
2036 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2037 ctx->f32, "");
2038
2039 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2040 LLVMConstReal(ctx->f32,
2041 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2042
2043 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2044 }
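/* Example (assuming SI_NUM_SMOOTH_AA_SAMPLES == 8): a coverage mask of
 * 0x0f has popcount 4, so alpha is scaled by 4 / 8 = 0.5, approximating
 * the partial pixel coverage of a smoothed line or polygon edge.
 */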
2045
2046 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2047 struct ac_export_args *pos, LLVMValueRef *out_elts)
2048 {
2049 struct si_shader_context *ctx = si_shader_context(bld_base);
2050 struct lp_build_context *base = &bld_base->base;
2051 unsigned reg_index;
2052 unsigned chan;
2053 unsigned const_chan;
2054 LLVMValueRef base_elt;
2055 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2056 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2057 SI_VS_CONST_CLIP_PLANES, 0);
2058 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2059
2060 for (reg_index = 0; reg_index < 2; reg_index ++) {
2061 struct ac_export_args *args = &pos[2 + reg_index];
2062
2063 args->out[0] =
2064 args->out[1] =
2065 args->out[2] =
2066 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2067
2068 /* Compute dot products of position and user clip plane vectors */
2069 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2070 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2071 LLVMValueRef addr =
2072 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2073 const_chan) * 4, 0);
2074 base_elt = buffer_load_const(ctx, const_resource,
2075 addr);
2076 args->out[chan] =
2077 lp_build_add(base, args->out[chan],
2078 lp_build_mul(base, base_elt,
2079 out_elts[const_chan]));
2080 }
2081 }
2082
2083 args->enabled_channels = 0xf;
2084 args->valid_mask = 0;
2085 args->done = 0;
2086 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2087 args->compr = 0;
2088 }
2089 }
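/* Address sketch: the clip-plane constant buffer holds one vec4 per
 * plane, so component const_chan of plane (reg_index * 4 + chan) sits at
 * byte ((reg_index * 4 + chan) * 4 + const_chan) * 4; e.g. plane 1's z
 * is at (1 * 4 + 2) * 4 = 24. Each exported channel accumulates
 * dot(clip_plane[chan], clipvertex).
 */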
2090
2091 static void si_dump_streamout(struct pipe_stream_output_info *so)
2092 {
2093 unsigned i;
2094
2095 if (so->num_outputs)
2096 fprintf(stderr, "STREAMOUT\n");
2097
2098 for (i = 0; i < so->num_outputs; i++) {
2099 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2100 so->output[i].start_component;
2101 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2102 i, so->output[i].output_buffer,
2103 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2104 so->output[i].register_index,
2105 mask & 1 ? "x" : "",
2106 mask & 2 ? "y" : "",
2107 mask & 4 ? "z" : "",
2108 mask & 8 ? "w" : "");
2109 }
2110 }
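/* Sample of the dump format (hypothetical shader): an output that feeds
 * OUT[2].xyzw into buffer 0 at dwords 0..3 prints as
 *   0: BUF0[0..3] <- OUT[2].xyzw
 */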
2111
2112 static void emit_streamout_output(struct si_shader_context *ctx,
2113 LLVMValueRef const *so_buffers,
2114 LLVMValueRef const *so_write_offsets,
2115 struct pipe_stream_output *stream_out,
2116 struct si_shader_output_values *shader_out)
2117 {
2118 struct gallivm_state *gallivm = &ctx->gallivm;
2119 LLVMBuilderRef builder = gallivm->builder;
2120 unsigned buf_idx = stream_out->output_buffer;
2121 unsigned start = stream_out->start_component;
2122 unsigned num_comps = stream_out->num_components;
2123 LLVMValueRef out[4];
2124
2125 assert(num_comps && num_comps <= 4);
2126 if (!num_comps || num_comps > 4)
2127 return;
2128
2129 /* Load the output as int. */
2130 for (int j = 0; j < num_comps; j++) {
2131 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2132
2133 out[j] = LLVMBuildBitCast(builder,
2134 shader_out->values[start + j],
2135 ctx->i32, "");
2136 }
2137
2138 /* Pack the output. */
2139 LLVMValueRef vdata = NULL;
2140
2141 switch (num_comps) {
2142 case 1: /* as i32 */
2143 vdata = out[0];
2144 break;
2145 case 2: /* as v2i32 */
2146 case 3: /* as v4i32 (aligned to 4) */
2147 case 4: /* as v4i32 */
2148 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2149 for (int j = 0; j < num_comps; j++) {
2150 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2151 LLVMConstInt(ctx->i32, j, 0), "");
2152 }
2153 break;
2154 }
2155
2156 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2157 vdata, num_comps,
2158 so_write_offsets[buf_idx],
2159 ctx->i32_0,
2160 stream_out->dst_offset * 4, 1, 1, true, false);
2161 }
2162
2163 /**
2164 * Write streamout data to buffers for vertex stream @p stream (different
2165 * vertex streams can occur for GS copy shaders).
2166 */
2167 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2168 struct si_shader_output_values *outputs,
2169 unsigned noutput, unsigned stream)
2170 {
2171 struct si_shader_selector *sel = ctx->shader->selector;
2172 struct pipe_stream_output_info *so = &sel->so;
2173 struct gallivm_state *gallivm = &ctx->gallivm;
2174 LLVMBuilderRef builder = gallivm->builder;
2175 int i;
2176 struct lp_build_if_state if_ctx;
2177
2178 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2179 LLVMValueRef so_vtx_count =
2180 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2181
2182 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2183
2184 /* can_emit = tid < so_vtx_count; */
2185 LLVMValueRef can_emit =
2186 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2187
2188 /* Emit the streamout code conditionally. This avoids out-of-bounds
2189 * buffer accesses. The hw tells us via the SGPR
2190 * (so_vtx_count) which threads are allowed to emit streamout data. */
2191 lp_build_if(&if_ctx, gallivm, can_emit);
2192 {
2193 /* The buffer offset is computed as follows:
2194 * ByteOffset = streamout_offset[buffer_id]*4 +
2195 * (streamout_write_index + thread_id)*stride[buffer_id] +
2196 * attrib_offset
2197 */
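/* Numeric sketch (made-up values): with stride[buf] = 4 dwords,
 * streamout_offset[buf] = 0, streamout_write_index = 100 and
 * thread_id = 5, this vertex starts at (100 + 5) * 16 = 1680 bytes;
 * an output with dst_offset = 2 dwords then lands at byte 1688.
 */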
2198
2199 LLVMValueRef so_write_index =
2200 LLVMGetParam(ctx->main_fn,
2201 ctx->param_streamout_write_index);
2202
2203 /* Compute (streamout_write_index + thread_id). */
2204 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2205
2206 /* Load the descriptor and compute the write offset for each
2207 * enabled buffer. */
2208 LLVMValueRef so_write_offset[4] = {};
2209 LLVMValueRef so_buffers[4];
2210 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2211 ctx->param_rw_buffers);
2212
2213 for (i = 0; i < 4; i++) {
2214 if (!so->stride[i])
2215 continue;
2216
2217 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2218 SI_VS_STREAMOUT_BUF0 + i, 0);
2219
2220 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2221
2222 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2223 ctx->param_streamout_offset[i]);
2224 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2225
2226 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2227 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2228 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2229 }
2230
2231 /* Write streamout data. */
2232 for (i = 0; i < so->num_outputs; i++) {
2233 unsigned reg = so->output[i].register_index;
2234
2235 if (reg >= noutput)
2236 continue;
2237
2238 if (stream != so->output[i].stream)
2239 continue;
2240
2241 emit_streamout_output(ctx, so_buffers, so_write_offset,
2242 &so->output[i], &outputs[reg]);
2243 }
2244 }
2245 lp_build_endif(&if_ctx);
2246 }
2247
2248
2249 /* Generate export instructions for hardware VS shader stage */
2250 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2251 struct si_shader_output_values *outputs,
2252 unsigned noutput)
2253 {
2254 struct si_shader_context *ctx = si_shader_context(bld_base);
2255 struct si_shader *shader = ctx->shader;
2256 struct lp_build_context *base = &bld_base->base;
2257 struct ac_export_args args, pos_args[4] = {};
2258 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2259 unsigned semantic_name, semantic_index;
2260 unsigned target;
2261 unsigned param_count = 0;
2262 unsigned pos_idx;
2263 int i;
2264
2265 for (i = 0; i < noutput; i++) {
2266 semantic_name = outputs[i].semantic_name;
2267 semantic_index = outputs[i].semantic_index;
2268 bool export_param = true;
2269
2270 switch (semantic_name) {
2271 case TGSI_SEMANTIC_POSITION: /* ignore these */
2272 case TGSI_SEMANTIC_PSIZE:
2273 case TGSI_SEMANTIC_CLIPVERTEX:
2274 case TGSI_SEMANTIC_EDGEFLAG:
2275 break;
2276 case TGSI_SEMANTIC_GENERIC:
2277 case TGSI_SEMANTIC_CLIPDIST:
2278 if (shader->key.opt.hw_vs.kill_outputs &
2279 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2280 export_param = false;
2281 break;
2282 default:
2283 if (shader->key.opt.hw_vs.kill_outputs2 &
2284 (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2285 export_param = false;
2286 break;
2287 }
2288
2289 if (outputs[i].vertex_stream[0] != 0 &&
2290 outputs[i].vertex_stream[1] != 0 &&
2291 outputs[i].vertex_stream[2] != 0 &&
2292 outputs[i].vertex_stream[3] != 0)
2293 export_param = false;
2294
2295 handle_semantic:
2296 /* Select the correct target */
2297 switch(semantic_name) {
2298 case TGSI_SEMANTIC_PSIZE:
2299 psize_value = outputs[i].values[0];
2300 continue;
2301 case TGSI_SEMANTIC_EDGEFLAG:
2302 edgeflag_value = outputs[i].values[0];
2303 continue;
2304 case TGSI_SEMANTIC_LAYER:
2305 layer_value = outputs[i].values[0];
2306 semantic_name = TGSI_SEMANTIC_GENERIC;
2307 goto handle_semantic;
2308 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2309 viewport_index_value = outputs[i].values[0];
2310 semantic_name = TGSI_SEMANTIC_GENERIC;
2311 goto handle_semantic;
2312 case TGSI_SEMANTIC_POSITION:
2313 target = V_008DFC_SQ_EXP_POS;
2314 break;
2315 case TGSI_SEMANTIC_CLIPDIST:
2316 if (shader->key.opt.hw_vs.clip_disable) {
2317 semantic_name = TGSI_SEMANTIC_GENERIC;
2318 goto handle_semantic;
2319 }
2320 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2321 break;
2322 case TGSI_SEMANTIC_CLIPVERTEX:
2323 if (shader->key.opt.hw_vs.clip_disable)
2324 continue;
2325 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2326 continue;
2327 case TGSI_SEMANTIC_COLOR:
2328 case TGSI_SEMANTIC_BCOLOR:
2329 case TGSI_SEMANTIC_PRIMID:
2330 case TGSI_SEMANTIC_FOG:
2331 case TGSI_SEMANTIC_TEXCOORD:
2332 case TGSI_SEMANTIC_GENERIC:
2333 if (!export_param)
2334 continue;
2335 target = V_008DFC_SQ_EXP_PARAM + param_count;
2336 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2337 shader->info.vs_output_param_offset[i] = param_count;
2338 param_count++;
2339 break;
2340 default:
2341 target = 0;
2342 fprintf(stderr,
2343 "Warning: SI unhandled vs output type:%d\n",
2344 semantic_name);
2345 }
2346
2347 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2348
2349 if (target >= V_008DFC_SQ_EXP_POS &&
2350 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2351 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2352 &args, sizeof(args));
2353 } else {
2354 ac_build_export(&ctx->ac, &args);
2355 }
2356
2357 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2358 semantic_name = TGSI_SEMANTIC_GENERIC;
2359 goto handle_semantic;
2360 }
2361 }
2362
2363 shader->info.nr_param_exports = param_count;
2364
2365 /* We need to add the position output manually if it's missing. */
2366 if (!pos_args[0].out[0]) {
2367 pos_args[0].enabled_channels = 0xf; /* writemask */
2368 pos_args[0].valid_mask = 0; /* EXEC mask */
2369 pos_args[0].done = 0; /* last export? */
2370 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2371 pos_args[0].compr = 0; /* COMPR flag */
2372 pos_args[0].out[0] = base->zero; /* X */
2373 pos_args[0].out[1] = base->zero; /* Y */
2374 pos_args[0].out[2] = base->zero; /* Z */
2375 pos_args[0].out[3] = base->one; /* W */
2376 }
2377
2378 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2379 if (shader->selector->info.writes_psize ||
2380 shader->selector->info.writes_edgeflag ||
2381 shader->selector->info.writes_viewport_index ||
2382 shader->selector->info.writes_layer) {
2383 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2384 (shader->selector->info.writes_edgeflag << 1) |
2385 (shader->selector->info.writes_layer << 2) |
2386 (shader->selector->info.writes_viewport_index << 3);
2387 pos_args[1].valid_mask = 0; /* EXEC mask */
2388 pos_args[1].done = 0; /* last export? */
2389 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2390 pos_args[1].compr = 0; /* COMPR flag */
2391 pos_args[1].out[0] = base->zero; /* X */
2392 pos_args[1].out[1] = base->zero; /* Y */
2393 pos_args[1].out[2] = base->zero; /* Z */
2394 pos_args[1].out[3] = base->zero; /* W */
2395
2396 if (shader->selector->info.writes_psize)
2397 pos_args[1].out[0] = psize_value;
2398
2399 if (shader->selector->info.writes_edgeflag) {
2400 /* The output is a float, but the hw expects an integer
2401 * with the first bit containing the edge flag. */
2402 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2403 edgeflag_value,
2404 ctx->i32, "");
2405 edgeflag_value = lp_build_min(&bld_base->int_bld,
2406 edgeflag_value,
2407 ctx->i32_1);
2408
2409 /* The LLVM intrinsic expects a float. */
2410 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2411 edgeflag_value,
2412 ctx->f32, "");
2413 }
2414
2415 if (shader->selector->info.writes_layer)
2416 pos_args[1].out[2] = layer_value;
2417
2418 if (shader->selector->info.writes_viewport_index)
2419 pos_args[1].out[3] = viewport_index_value;
2420 }
2421
2422 for (i = 0; i < 4; i++)
2423 if (pos_args[i].out[0])
2424 shader->info.nr_pos_exports++;
2425
2426 pos_idx = 0;
2427 for (i = 0; i < 4; i++) {
2428 if (!pos_args[i].out[0])
2429 continue;
2430
2431 /* Specify the target we are exporting */
2432 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2433
2434 if (pos_idx == shader->info.nr_pos_exports)
2435 /* Specify that this is the last export */
2436 pos_args[i].done = 1;
2437
2438 ac_build_export(&ctx->ac, &pos_args[i]);
2439 }
2440 }
2441
2442 /**
2443 * Forward all outputs from the vertex shader to the TES. This is only used
2444 * for the fixed function TCS.
2445 */
2446 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2447 {
2448 struct si_shader_context *ctx = si_shader_context(bld_base);
2449 struct gallivm_state *gallivm = &ctx->gallivm;
2450 LLVMValueRef invocation_id, buffer, buffer_offset;
2451 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2452 uint64_t inputs;
2453
2454 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2455 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2456 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2457
2458 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2459 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2460 lds_vertex_stride, "");
2461 lds_base = get_tcs_in_current_patch_offset(ctx);
2462 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2463
2464 inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2465 while (inputs) {
2466 unsigned i = u_bit_scan64(&inputs);
2467
2468 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2469 LLVMConstInt(ctx->i32, 4 * i, 0),
2470 "");
2471
2472 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2473 get_rel_patch_id(ctx),
2474 invocation_id,
2475 LLVMConstInt(ctx->i32, i, 0));
2476
2477 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2478 lds_ptr);
2479
2480 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2481 buffer_offset, 0, 1, 0, true, false);
2482 }
2483 }
2484
2485 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2486 LLVMValueRef rel_patch_id,
2487 LLVMValueRef invocation_id,
2488 LLVMValueRef tcs_out_current_patch_data_offset)
2489 {
2490 struct si_shader_context *ctx = si_shader_context(bld_base);
2491 struct gallivm_state *gallivm = &ctx->gallivm;
2492 struct si_shader *shader = ctx->shader;
2493 unsigned tess_inner_index, tess_outer_index;
2494 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2495 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2496 unsigned stride, outer_comps, inner_comps, i, offset;
2497 struct lp_build_if_state if_ctx, inner_if_ctx;
2498
2499 si_llvm_emit_barrier(NULL, bld_base, NULL);
2500
2501 /* Do this only for invocation 0, because the tess levels are per-patch,
2502 * not per-vertex.
2503 *
2504 * The hardware can't jump over this, because invocation 0 always executes
2505 * it; it should at least mask out the loads and stores for other invocations.
2506 */
2507 lp_build_if(&if_ctx, gallivm,
2508 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2509 invocation_id, ctx->i32_0, ""));
2510
2511 /* Determine the layout of one tess factor element in the buffer. */
2512 switch (shader->key.part.tcs.epilog.prim_mode) {
2513 case PIPE_PRIM_LINES:
2514 stride = 2; /* 2 dwords, 1 vec2 store */
2515 outer_comps = 2;
2516 inner_comps = 0;
2517 break;
2518 case PIPE_PRIM_TRIANGLES:
2519 stride = 4; /* 4 dwords, 1 vec4 store */
2520 outer_comps = 3;
2521 inner_comps = 1;
2522 break;
2523 case PIPE_PRIM_QUADS:
2524 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2525 outer_comps = 4;
2526 inner_comps = 2;
2527 break;
2528 default:
2529 assert(0);
2530 return;
2531 }
2532
2533 /* Load tess_inner and tess_outer from LDS.
2534 * Any invocation can write them, so we can't get them from a temporary.
2535 */
2536 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2537 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2538
2539 lds_base = tcs_out_current_patch_data_offset;
2540 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2541 LLVMConstInt(ctx->i32,
2542 tess_inner_index * 4, 0), "");
2543 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2544 LLVMConstInt(ctx->i32,
2545 tess_outer_index * 4, 0), "");
2546
2547 for (i = 0; i < 4; i++) {
2548 inner[i] = LLVMGetUndef(ctx->i32);
2549 outer[i] = LLVMGetUndef(ctx->i32);
2550 }
2551
2552 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2553 /* For isolines, the hardware expects tess factors in the
2554 * reverse order from what GLSL / TGSI specify.
2555 */
2556 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2557 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2558 } else {
2559 for (i = 0; i < outer_comps; i++) {
2560 outer[i] = out[i] =
2561 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2562 }
2563 for (i = 0; i < inner_comps; i++) {
2564 inner[i] = out[outer_comps+i] =
2565 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2566 }
2567 }
2568
2569 /* Convert the outputs to vectors for stores. */
2570 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2571 vec1 = NULL;
2572
2573 if (stride > 4)
2574 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2575
2576 /* Get the buffer. */
2577 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2578
2579 /* Get the offset. */
2580 tf_base = LLVMGetParam(ctx->main_fn,
2581 ctx->param_tcs_factor_offset);
2582 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2583 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2584
2585 lp_build_if(&inner_if_ctx, gallivm,
2586 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2587 rel_patch_id, ctx->i32_0, ""));
2588
2589 /* Store the dynamic HS control word. */
2590 offset = 0;
2591 if (ctx->screen->b.chip_class <= VI) {
2592 ac_build_buffer_store_dword(&ctx->ac, buffer,
2593 LLVMConstInt(ctx->i32, 0x80000000, 0),
2594 1, ctx->i32_0, tf_base,
2595 offset, 1, 0, true, false);
2596 offset += 4;
2597 }
2598
2599 lp_build_endif(&inner_if_ctx);
2600
2601 /* Store the tessellation factors. */
2602 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2603 MIN2(stride, 4), byteoffset, tf_base,
2604 offset, 1, 0, true, false);
2605 offset += 16;
2606 if (vec1)
2607 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2608 stride - 4, byteoffset, tf_base,
2609 offset, 1, 0, true, false);
2610
2611 /* Store the tess factors into the offchip buffer if TES reads them. */
2612 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2613 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2614 LLVMValueRef tf_inner_offset;
2615 unsigned param_outer, param_inner;
2616
2617 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2618 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2619
2620 param_outer = si_shader_io_get_unique_index(
2621 TGSI_SEMANTIC_TESSOUTER, 0);
2622 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2623 LLVMConstInt(ctx->i32, param_outer, 0));
2624
2625 outer_vec = lp_build_gather_values(gallivm, outer,
2626 util_next_power_of_two(outer_comps));
2627
2628 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2629 outer_comps, tf_outer_offset,
2630 base, 0, 1, 0, true, false);
2631 if (inner_comps) {
2632 param_inner = si_shader_io_get_unique_index(
2633 TGSI_SEMANTIC_TESSINNER, 0);
2634 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2635 LLVMConstInt(ctx->i32, param_inner, 0));
2636
2637 inner_vec = inner_comps == 1 ? inner[0] :
2638 lp_build_gather_values(gallivm, inner, inner_comps);
2639 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2640 inner_comps, tf_inner_offset,
2641 base, 0, 1, 0, true, false);
2642 }
2643 }
2644
2645 lp_build_endif(&if_ctx);
2646 }
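/* Layout sketch for PIPE_PRIM_QUADS on <= VI (illustrative): the dynamic
 * HS control word takes the first 4 bytes, then each patch writes its
 * 6 dwords at byte 4 + rel_patch_id * 24, a vec4 of outer factors
 * followed by a vec2 of inner factors.
 */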
2647
2648 static LLVMValueRef
2649 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2650 unsigned param, unsigned return_index)
2651 {
2652 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2653 LLVMGetParam(ctx->main_fn, param),
2654 return_index, "");
2655 }
2656
2657 static LLVMValueRef
2658 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2659 unsigned param, unsigned return_index)
2660 {
2661 LLVMBuilderRef builder = ctx->gallivm.builder;
2662 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2663
2664 return LLVMBuildInsertValue(builder, ret,
2665 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2666 return_index, "");
2667 }
2668
2669 static LLVMValueRef
2670 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2671 unsigned param, unsigned return_index)
2672 {
2673 LLVMBuilderRef builder = ctx->gallivm.builder;
2674 LLVMValueRef ptr, lo, hi;
2675
2676 ptr = LLVMGetParam(ctx->main_fn, param);
2677 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2678 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2679 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2680 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2681 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2682 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2683 }
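/* Split sketch (made-up address): a descriptor pointer 0x0000800012345678
 * returns as lo = 0x12345678 and hi = 0x00008000 in two consecutive
 * SGPR return slots.
 */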
2684
2685 /* This only writes the tessellation factor levels. */
2686 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2687 {
2688 struct si_shader_context *ctx = si_shader_context(bld_base);
2689 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2690
2691 si_copy_tcs_inputs(bld_base);
2692
2693 rel_patch_id = get_rel_patch_id(ctx);
2694 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2695 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2696
2697 /* Return epilog parameters from this function. */
2698 LLVMBuilderRef builder = ctx->gallivm.builder;
2699 LLVMValueRef ret = ctx->return_value;
2700 unsigned vgpr;
2701
2702 if (ctx->screen->b.chip_class >= GFX9) {
2703 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2704 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2705 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2706 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2707 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2708 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2709 /* Tess offchip and tess factor offsets are at the beginning. */
2710 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2711 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2712 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
2713 } else {
2714 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2715 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2716 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2717 GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2718 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2719 GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
2720 /* Tess offchip and tess factor offsets are after user SGPRs. */
2721 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
2722 GFX6_TCS_NUM_USER_SGPR);
2723 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
2724 GFX6_TCS_NUM_USER_SGPR + 1);
2725 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2726 }
2727
2728 /* VGPRs */
2729 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2730 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2731 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2732
2733 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2734 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2735 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2736 ctx->return_value = ret;
2737 }
2738
2739 /* Pass TCS inputs from LS to TCS on GFX9. */
2740 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2741 {
2742 LLVMValueRef ret = ctx->return_value;
2743
2744 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2745 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2746 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2747 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2748 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2749
2750 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2751 8 + SI_SGPR_VS_STATE_BITS);
2752 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2753 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2754 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2755 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2756 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2757 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2758 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2759 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2760 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2761 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2762
2763 unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
2764 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2765 8 + GFX9_SGPR_TCS_CONST_BUFFERS);
2766 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2767 8 + GFX9_SGPR_TCS_SAMPLERS);
2768 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2769 8 + GFX9_SGPR_TCS_IMAGES);
2770 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2771 8 + GFX9_SGPR_TCS_SHADER_BUFFERS);
2772
2773 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2774 ret = si_insert_input_ret_float(ctx, ret,
2775 ctx->param_tcs_patch_id, vgpr++);
2776 ret = si_insert_input_ret_float(ctx, ret,
2777 ctx->param_tcs_rel_ids, vgpr++);
2778 ctx->return_value = ret;
2779 }
2780
2781 /* Pass GS inputs from ES to GS on GFX9. */
2782 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2783 {
2784 LLVMValueRef ret = ctx->return_value;
2785
2786 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2787 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2788 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2789
2790 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2791
2792 unsigned desc_param = ctx->param_vs_state_bits + 1;
2793 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2794 8 + GFX9_SGPR_GS_CONST_BUFFERS);
2795 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2796 8 + GFX9_SGPR_GS_SAMPLERS);
2797 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2798 8 + GFX9_SGPR_GS_IMAGES);
2799 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2800 8 + GFX9_SGPR_GS_SHADER_BUFFERS);
2801
2802 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2803 for (unsigned i = 0; i < 5; i++) {
2804 unsigned param = ctx->param_gs_vtx01_offset + i;
2805 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2806 }
2807 ctx->return_value = ret;
2808 }
2809
2810 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2811 {
2812 struct si_shader_context *ctx = si_shader_context(bld_base);
2813 struct si_shader *shader = ctx->shader;
2814 struct tgsi_shader_info *info = &shader->selector->info;
2815 struct gallivm_state *gallivm = &ctx->gallivm;
2816 unsigned i, chan;
2817 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2818 ctx->param_rel_auto_id);
2819 LLVMValueRef vertex_dw_stride =
2820 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2821 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2822 vertex_dw_stride, "");
2823
2824 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2825 * its inputs from it. */
2826 for (i = 0; i < info->num_outputs; i++) {
2827 LLVMValueRef *out_ptr = ctx->outputs[i];
2828 unsigned name = info->output_semantic_name[i];
2829 unsigned index = info->output_semantic_index[i];
2830
2831 /* The ARB_shader_viewport_layer_array spec contains the
2832 * following issue:
2833 *
2834 * 2) What happens if gl_ViewportIndex or gl_Layer is
2835 * written in the vertex shader and a geometry shader is
2836 * present?
2837 *
2838 * RESOLVED: The value written by the last vertex processing
2839 * stage is used. If the last vertex processing stage
2840 * (vertex, tessellation evaluation or geometry) does not
2841 * statically assign to gl_ViewportIndex or gl_Layer, index
2842 * or layer zero is assumed.
2843 *
2844 * So writes to those outputs in VS-as-LS are simply ignored.
2845 */
2846 if (name == TGSI_SEMANTIC_LAYER ||
2847 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2848 continue;
2849
2850 int param = si_shader_io_get_unique_index(name, index);
2851 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2852 LLVMConstInt(ctx->i32, param * 4, 0), "");
2853
2854 for (chan = 0; chan < 4; chan++) {
2855 lds_store(bld_base, chan, dw_addr,
2856 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2857 }
2858 }
2859
2860 if (ctx->screen->b.chip_class >= GFX9)
2861 si_set_ls_return_value_for_tcs(ctx);
2862 }
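/* Addressing sketch (made-up stride): with a vertex stride of 16 dwords,
 * vertex 2 stores the .y channel of the output with unique index 3 at
 * LDS dword 2 * 16 + 3 * 4 + 1 = 45.
 */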
2863
2864 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2865 {
2866 struct si_shader_context *ctx = si_shader_context(bld_base);
2867 struct gallivm_state *gallivm = &ctx->gallivm;
2868 struct si_shader *es = ctx->shader;
2869 struct tgsi_shader_info *info = &es->selector->info;
2870 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2871 ctx->param_es2gs_offset);
2872 LLVMValueRef lds_base = NULL;
2873 unsigned chan;
2874 int i;
2875
2876 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2877 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2878 lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
2879 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2880 }
2881
2882 for (i = 0; i < info->num_outputs; i++) {
2883 LLVMValueRef *out_ptr = ctx->outputs[i];
2884 int param;
2885
2886 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2887 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2888 continue;
2889
2890 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2891 info->output_semantic_index[i]);
2892
2893 for (chan = 0; chan < 4; chan++) {
2894 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2895 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2896
2897 /* GFX9 has the ESGS ring in LDS. */
2898 if (ctx->screen->b.chip_class >= GFX9) {
2899 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
2900 continue;
2901 }
2902
2903 ac_build_buffer_store_dword(&ctx->ac,
2904 ctx->esgs_ring,
2905 out_val, 1, NULL, soffset,
2906 (4 * param + chan) * 4,
2907 1, 1, true, true);
2908 }
2909 }
2910
2911 if (ctx->screen->b.chip_class >= GFX9)
2912 si_set_es_return_value_for_gs(ctx);
2913 }
2914
2915 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2916 {
2917 if (ctx->screen->b.chip_class >= GFX9)
2918 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2919 else
2920 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2921 }
2922
2923 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2924 {
2925 struct si_shader_context *ctx = si_shader_context(bld_base);
2926
2927 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2928 si_get_gs_wave_id(ctx));
2929 }
2930
2931 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2932 {
2933 struct si_shader_context *ctx = si_shader_context(bld_base);
2934 struct gallivm_state *gallivm = &ctx->gallivm;
2935 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2936 struct si_shader_output_values *outputs = NULL;
2937 int i,j;
2938
2939 assert(!ctx->shader->is_gs_copy_shader);
2940
2941 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2942
2943 /* Vertex color clamping.
2944 *
2945 * This uses a state constant loaded from a user data SGPR; an IF
2946 * statement is emitted that clamps all colors when the constant
2947 * is true.
2948 */
2949 if (ctx->type == PIPE_SHADER_VERTEX) {
2950 struct lp_build_if_state if_ctx;
2951 LLVMValueRef cond = NULL;
2952 LLVMValueRef addr, val;
2953
2954 for (i = 0; i < info->num_outputs; i++) {
2955 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2956 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2957 continue;
2958
2959 /* We've found a color. */
2960 if (!cond) {
2961 /* The state is in the first bit of the user SGPR. */
2962 cond = LLVMGetParam(ctx->main_fn,
2963 ctx->param_vs_state_bits);
2964 cond = LLVMBuildTrunc(gallivm->builder, cond,
2965 ctx->i1, "");
2966 lp_build_if(&if_ctx, gallivm, cond);
2967 }
2968
2969 for (j = 0; j < 4; j++) {
2970 addr = ctx->outputs[i][j];
2971 val = LLVMBuildLoad(gallivm->builder, addr, "");
2972 val = ac_build_clamp(&ctx->ac, val);
2973 LLVMBuildStore(gallivm->builder, val, addr);
2974 }
2975 }
2976
2977 if (cond)
2978 lp_build_endif(&if_ctx);
2979 }
2980
2981 for (i = 0; i < info->num_outputs; i++) {
2982 outputs[i].semantic_name = info->output_semantic_name[i];
2983 outputs[i].semantic_index = info->output_semantic_index[i];
2984
2985 for (j = 0; j < 4; j++) {
2986 outputs[i].values[j] =
2987 LLVMBuildLoad(gallivm->builder,
2988 ctx->outputs[i][j],
2989 "");
2990 outputs[i].vertex_stream[j] =
2991 (info->output_streams[i] >> (2 * j)) & 3;
2992 }
2993 }
2994
2995 if (ctx->shader->selector->so.num_outputs)
2996 si_llvm_emit_streamout(ctx, outputs, i, 0);
2997
2998 /* Export PrimitiveID. */
2999 if (ctx->shader->key.mono.vs_export_prim_id) {
3000 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3001 outputs[i].semantic_index = 0;
3002 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
3003 get_primitive_id(bld_base, 0));
3004 for (j = 1; j < 4; j++)
3005 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3006
3007 memset(outputs[i].vertex_stream, 0,
3008 sizeof(outputs[i].vertex_stream));
3009 i++;
3010 }
3011
3012 si_llvm_export_vs(bld_base, outputs, i);
3013 FREE(outputs);
3014 }
3015
3016 struct si_ps_exports {
3017 unsigned num;
3018 struct ac_export_args args[10];
3019 };
3020
3021 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3022 bool writes_samplemask)
3023 {
3024 if (writes_z) {
3025 /* Z needs 32 bits. */
3026 if (writes_samplemask)
3027 return V_028710_SPI_SHADER_32_ABGR;
3028 else if (writes_stencil)
3029 return V_028710_SPI_SHADER_32_GR;
3030 else
3031 return V_028710_SPI_SHADER_32_R;
3032 } else if (writes_stencil || writes_samplemask) {
3033 /* Both stencil and sample mask need only 16 bits. */
3034 return V_028710_SPI_SHADER_UINT16_ABGR;
3035 } else {
3036 return V_028710_SPI_SHADER_ZERO;
3037 }
3038 }
3039
3040 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3041 LLVMValueRef depth, LLVMValueRef stencil,
3042 LLVMValueRef samplemask, struct si_ps_exports *exp)
3043 {
3044 struct si_shader_context *ctx = si_shader_context(bld_base);
3045 struct lp_build_context *base = &bld_base->base;
3046 struct ac_export_args args;
3047 unsigned mask = 0;
3048 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3049 stencil != NULL,
3050 samplemask != NULL);
3051
3052 assert(depth || stencil || samplemask);
3053
3054 args.valid_mask = 1; /* whether the EXEC mask is valid */
3055 args.done = 1; /* DONE bit */
3056
3057 /* Specify the target we are exporting */
3058 args.target = V_008DFC_SQ_EXP_MRTZ;
3059
3060 args.compr = 0; /* COMPR flag */
3061 args.out[0] = base->undef; /* R, depth */
3062 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3063 args.out[2] = base->undef; /* B, sample mask */
3064 args.out[3] = base->undef; /* A, alpha to mask */
3065
3066 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3067 assert(!depth);
3068 args.compr = 1; /* COMPR flag */
3069
3070 if (stencil) {
3071 /* Stencil should be in X[23:16]. */
3072 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3073 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3074 LLVMConstInt(ctx->i32, 16, 0), "");
3075 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3076 mask |= 0x3;
3077 }
3078 if (samplemask) {
3079 /* SampleMask should be in Y[15:0]. */
3080 args.out[1] = samplemask;
3081 mask |= 0xc;
3082 }
3083 } else {
3084 if (depth) {
3085 args.out[0] = depth;
3086 mask |= 0x1;
3087 }
3088 if (stencil) {
3089 args.out[1] = stencil;
3090 mask |= 0x2;
3091 }
3092 if (samplemask) {
3093 args.out[2] = samplemask;
3094 mask |= 0x4;
3095 }
3096 }
3097
3098 /* SI (except OLAND and HAINAN) has a bug where it only looks
3099 * at the X writemask component. */
3100 if (ctx->screen->b.chip_class == SI &&
3101 ctx->screen->b.family != CHIP_OLAND &&
3102 ctx->screen->b.family != CHIP_HAINAN)
3103 mask |= 0x1;
3104
3105 /* Specify which components to enable */
3106 args.enabled_channels = mask;
3107
3108 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3109 }
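/* Packing sketch for the compressed UINT16 path above: a stencil value
 * of 0xff becomes 0x00ff0000 in X[23:16] and the sample mask stays in
 * Y[15:0]; with COMPR set, the writemask enables channel pairs, hence
 * the 0x3 / 0xc bits.
 */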
3110
3111 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3112 LLVMValueRef *color, unsigned index,
3113 unsigned samplemask_param,
3114 bool is_last, struct si_ps_exports *exp)
3115 {
3116 struct si_shader_context *ctx = si_shader_context(bld_base);
3117 struct lp_build_context *base = &bld_base->base;
3118 int i;
3119
3120 /* Clamp color */
3121 if (ctx->shader->key.part.ps.epilog.clamp_color)
3122 for (i = 0; i < 4; i++)
3123 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3124
3125 /* Alpha to one */
3126 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3127 color[3] = base->one;
3128
3129 /* Alpha test */
3130 if (index == 0 &&
3131 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3132 si_alpha_test(bld_base, color[3]);
3133
3134 /* Line & polygon smoothing */
3135 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3136 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3137 samplemask_param);
3138
3139 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3140 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3141 struct ac_export_args args[8];
3142 int c, last = -1;
3143
3144 /* Get the export arguments, also find out what the last one is. */
3145 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3146 si_llvm_init_export_args(bld_base, color,
3147 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3148 if (args[c].enabled_channels)
3149 last = c;
3150 }
3151
3152 /* Emit all exports. */
3153 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3154 if (is_last && last == c) {
3155 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3156 args[c].done = 1; /* DONE bit */
3157 } else if (!args[c].enabled_channels)
3158 continue; /* unnecessary NULL export */
3159
3160 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3161 }
3162 } else {
3163 struct ac_export_args args;
3164
3165 /* Export */
3166 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3167 &args);
3168 if (is_last) {
3169 args.valid_mask = 1; /* whether the EXEC mask is valid */
3170 args.done = 1; /* DONE bit */
3171 } else if (!args.enabled_channels)
3172 return; /* unnecessary NULL export */
3173
3174 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3175 }
3176 }
3177
3178 static void si_emit_ps_exports(struct si_shader_context *ctx,
3179 struct si_ps_exports *exp)
3180 {
3181 for (unsigned i = 0; i < exp->num; i++)
3182 ac_build_export(&ctx->ac, &exp->args[i]);
3183 }
3184
3185 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3186 {
3187 struct si_shader_context *ctx = si_shader_context(bld_base);
3188 struct lp_build_context *base = &bld_base->base;
3189 struct ac_export_args args;
3190
3191 args.enabled_channels = 0x0; /* enabled channels */
3192 args.valid_mask = 1; /* whether the EXEC mask is valid */
3193 args.done = 1; /* DONE bit */
3194 args.target = V_008DFC_SQ_EXP_NULL;
3195 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3196 args.out[0] = base->undef; /* R */
3197 args.out[1] = base->undef; /* G */
3198 args.out[2] = base->undef; /* B */
3199 args.out[3] = base->undef; /* A */
3200
3201 ac_build_export(&ctx->ac, &args);
3202 }
3203
3204 /**
3205 * Return PS outputs in this order:
3206 *
3207 * v[0:3] = color0.xyzw
3208 * v[4:7] = color1.xyzw
3209 * ...
3210 * vN+0 = Depth
3211 * vN+1 = Stencil
3212 * vN+2 = SampleMask
3213 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3214 *
3215 * The alpha-ref SGPR is returned via its original location.
3216 */
3217 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3218 {
3219 struct si_shader_context *ctx = si_shader_context(bld_base);
3220 struct si_shader *shader = ctx->shader;
3221 struct tgsi_shader_info *info = &shader->selector->info;
3222 LLVMBuilderRef builder = ctx->gallivm.builder;
3223 unsigned i, j, first_vgpr, vgpr;
3224
3225 LLVMValueRef color[8][4] = {};
3226 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3227 LLVMValueRef ret;
3228
3229 /* Read the output values. */
3230 for (i = 0; i < info->num_outputs; i++) {
3231 unsigned semantic_name = info->output_semantic_name[i];
3232 unsigned semantic_index = info->output_semantic_index[i];
3233
3234 switch (semantic_name) {
3235 case TGSI_SEMANTIC_COLOR:
3236 assert(semantic_index < 8);
3237 for (j = 0; j < 4; j++) {
3238 LLVMValueRef ptr = ctx->outputs[i][j];
3239 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3240 color[semantic_index][j] = result;
3241 }
3242 break;
3243 case TGSI_SEMANTIC_POSITION:
3244 depth = LLVMBuildLoad(builder,
3245 ctx->outputs[i][2], "");
3246 break;
3247 case TGSI_SEMANTIC_STENCIL:
3248 stencil = LLVMBuildLoad(builder,
3249 ctx->outputs[i][1], "");
3250 break;
3251 case TGSI_SEMANTIC_SAMPLEMASK:
3252 samplemask = LLVMBuildLoad(builder,
3253 ctx->outputs[i][0], "");
3254 break;
3255 default:
3256 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3257 semantic_name);
3258 }
3259 }
3260
3261 /* Fill the return structure. */
3262 ret = ctx->return_value;
3263
3264 /* Set SGPRs. */
3265 ret = LLVMBuildInsertValue(builder, ret,
3266 bitcast(bld_base, TGSI_TYPE_SIGNED,
3267 LLVMGetParam(ctx->main_fn,
3268 SI_PARAM_ALPHA_REF)),
3269 SI_SGPR_ALPHA_REF, "");
3270
3271 /* Set VGPRs */
3272 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3273 for (i = 0; i < ARRAY_SIZE(color); i++) {
3274 if (!color[i][0])
3275 continue;
3276
3277 for (j = 0; j < 4; j++)
3278 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3279 }
3280 if (depth)
3281 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3282 if (stencil)
3283 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3284 if (samplemask)
3285 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3286
3287 /* Add the input sample mask for smoothing at the end. */
3288 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3289 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3290 ret = LLVMBuildInsertValue(builder, ret,
3291 LLVMGetParam(ctx->main_fn,
3292 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3293
3294 ctx->return_value = ret;
3295 }
3296
3297 /**
3298 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3299 * buffer in number of elements and return it as an i32.
3300 */
3301 static LLVMValueRef get_buffer_size(
3302 struct lp_build_tgsi_context *bld_base,
3303 LLVMValueRef descriptor)
3304 {
3305 struct si_shader_context *ctx = si_shader_context(bld_base);
3306 struct gallivm_state *gallivm = &ctx->gallivm;
3307 LLVMBuilderRef builder = gallivm->builder;
3308 LLVMValueRef size =
3309 LLVMBuildExtractElement(builder, descriptor,
3310 LLVMConstInt(ctx->i32, 2, 0), "");
3311
3312 if (ctx->screen->b.chip_class == VI) {
3313 /* On VI, the descriptor contains the size in bytes,
3314 * but TXQ must return the size in elements.
3315 * The stride is always non-zero for resources using TXQ.
3316 */
3317 LLVMValueRef stride =
3318 LLVMBuildExtractElement(builder, descriptor,
3319 ctx->i32_1, "");
3320 stride = LLVMBuildLShr(builder, stride,
3321 LLVMConstInt(ctx->i32, 16, 0), "");
3322 stride = LLVMBuildAnd(builder, stride,
3323 LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3324
3325 size = LLVMBuildUDiv(builder, size, stride, "");
3326 }
3327
3328 return size;
3329 }
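/* Example: on VI, if dword 2 of the descriptor holds 256 (bytes) and the
 * stride field in dword 1 bits [29:16] is 16, TXQ must report
 * 256 / 16 = 16 elements.
 */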
3330
3331 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3332 struct lp_build_tgsi_context *bld_base,
3333 struct lp_build_emit_data *emit_data);
3334
3335 /* Prevent optimizations (at least of memory accesses) across the current
3336 * point in the program by emitting empty inline assembly that is marked as
3337 * having side effects.
3338 *
3339 * Optionally, a value can be passed through the inline assembly to prevent
3340 * LLVM from hoisting calls to ReadNone functions.
3341 */
3342 static void emit_optimization_barrier(struct si_shader_context *ctx,
3343 LLVMValueRef *pvgpr)
3344 {
3345 static int counter = 0;
3346
3347 LLVMBuilderRef builder = ctx->gallivm.builder;
3348 char code[16];
3349
3350 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3351
3352 if (!pvgpr) {
3353 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3354 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3355 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3356 } else {
3357 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3358 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3359 LLVMValueRef vgpr = *pvgpr;
3360 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3361 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3362 LLVMValueRef vgpr0;
3363
3364 assert(vgpr_size % 4 == 0);
3365
3366 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3367 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3368 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3369 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3370 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3371
3372 *pvgpr = vgpr;
3373 }
3374 }
3375
3376 /* Combine these with & instead of |. */
3377 #define NOOP_WAITCNT 0xf7f
3378 #define LGKM_CNT 0x07f
3379 #define VM_CNT 0xf70
3380
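/* Emit s_waitcnt with the given simm16. In this encoding vmcnt lives in
 * bits [3:0] and lgkmcnt in bits [11:8]; a field at its maximum value
 * means "don't wait on that counter", which is why the masks above are
 * combined with & (each mask clears the field it waits on).
 */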
3381 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3382 {
3383 struct gallivm_state *gallivm = &ctx->gallivm;
3384 LLVMBuilderRef builder = gallivm->builder;
3385 LLVMValueRef args[1] = {
3386 LLVMConstInt(ctx->i32, simm16, 0)
3387 };
3388 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3389 ctx->voidt, args, 1, 0);
3390 }
3391
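/* TGSI MEMBAR: translate the barrier flags into the counters that must
 * drain. Buffer and image traffic goes through the VM counter, shared
 * (LDS) traffic through the LGKM counter, and a thread-group barrier
 * waits on both. If no flag applies, NOOP_WAITCNT suppresses the wait.
 */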
3392 static void membar_emit(
3393 const struct lp_build_tgsi_action *action,
3394 struct lp_build_tgsi_context *bld_base,
3395 struct lp_build_emit_data *emit_data)
3396 {
3397 struct si_shader_context *ctx = si_shader_context(bld_base);
3398 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3399 unsigned flags = LLVMConstIntGetZExtValue(src0);
3400 unsigned waitcnt = NOOP_WAITCNT;
3401
3402 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3403 waitcnt &= VM_CNT & LGKM_CNT;
3404
3405 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3406 TGSI_MEMBAR_SHADER_BUFFER |
3407 TGSI_MEMBAR_SHADER_IMAGE))
3408 waitcnt &= VM_CNT;
3409
3410 if (flags & TGSI_MEMBAR_SHARED)
3411 waitcnt &= LGKM_CNT;
3412
3413 if (waitcnt != NOOP_WAITCNT)
3414 emit_waitcnt(ctx, waitcnt);
3415 }
3416
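/* TGSI CLOCK: read the 64-bit cycle counter with llvm.readcyclecounter
 * and split it into the two i32 channels TGSI expects.
 */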
3417 static void clock_emit(
3418 const struct lp_build_tgsi_action *action,
3419 struct lp_build_tgsi_context *bld_base,
3420 struct lp_build_emit_data *emit_data)
3421 {
3422 struct si_shader_context *ctx = si_shader_context(bld_base);
3423 struct gallivm_state *gallivm = &ctx->gallivm;
3424 LLVMValueRef tmp;
3425
3426 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3427 ctx->i64, NULL, 0, 0);
3428 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3429
3430 emit_data->output[0] =
3431 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3432 emit_data->output[1] =
3433 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3434 }
3435
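/* Fetch the v4i32 resource descriptor of a shader buffer (SSBO),
 * bounding indirect indices to SI_NUM_SHADER_BUFFERS.
 */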
3436 static LLVMValueRef
3437 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3438 const struct tgsi_full_src_register *reg)
3439 {
3440 LLVMValueRef index;
3441 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3442 ctx->param_shader_buffers);
3443
3444 if (!reg->Register.Indirect)
3445 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3446 else
3447 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3448 reg->Register.Index,
3449 SI_NUM_SHADER_BUFFERS);
3450
3451 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3452 }
3453
3454 static bool tgsi_is_array_sampler(unsigned target)
3455 {
3456 return target == TGSI_TEXTURE_1D_ARRAY ||
3457 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3458 target == TGSI_TEXTURE_2D_ARRAY ||
3459 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3460 target == TGSI_TEXTURE_CUBE_ARRAY ||
3461 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3462 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3463 }
3464
3465 static bool tgsi_is_array_image(unsigned target)
3466 {
3467 return target == TGSI_TEXTURE_3D ||
3468 target == TGSI_TEXTURE_CUBE ||
3469 target == TGSI_TEXTURE_1D_ARRAY ||
3470 target == TGSI_TEXTURE_2D_ARRAY ||
3471 target == TGSI_TEXTURE_CUBE_ARRAY ||
3472 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3473 }
3474
3475 /**
3476 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3477 *
3478 * At least on Tonga, executing image stores on images with DCC enabled and
3479 * a non-trivial DCC state can eventually lead to lockups. This can occur when an
3480 * application binds an image as read-only but then uses a shader that writes
3481 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3482 * program termination) in this case, but it doesn't cost much to be a bit
3483 * nicer: disabling DCC in the shader still leads to undefined results but
3484 * avoids the lockup.
3485 */
3486 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3487 LLVMValueRef rsrc)
3488 {
3489 if (ctx->screen->b.chip_class <= CIK) {
3490 return rsrc;
3491 } else {
3492 LLVMBuilderRef builder = ctx->gallivm.builder;
3493 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3494 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3495 LLVMValueRef tmp;
3496
3497 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3498 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3499 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3500 }
3501 }
3502
3503 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3504 {
3505 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3506 CONST_ADDR_SPACE);
3507 }
3508
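/* Load an image descriptor from the image list. Buffer images only use
 * the v4i32 in the second half of their 8-dword slot, hence the
 * index * 2 + 1 addressing on a v4i32-typed view of the list.
 */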
3509 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3510 LLVMValueRef list, LLVMValueRef index,
3511 unsigned target)
3512 {
3513 LLVMBuilderRef builder = ctx->gallivm.builder;
3514
3515 if (target == TGSI_TEXTURE_BUFFER) {
3516 index = LLVMBuildMul(builder, index,
3517 LLVMConstInt(ctx->i32, 2, 0), "");
3518 index = LLVMBuildAdd(builder, index,
3519 ctx->i32_1, "");
3520 list = LLVMBuildPointerCast(builder, list,
3521 const_array(ctx->v4i32, 0), "");
3522 }
3523
3524 return ac_build_indexed_load_const(&ctx->ac, list, index);
3525 }
3526
3527 /**
3528 * Load the resource descriptor for \p image.
3529 */
3530 static void
3531 image_fetch_rsrc(
3532 struct lp_build_tgsi_context *bld_base,
3533 const struct tgsi_full_src_register *image,
3534 bool is_store, unsigned target,
3535 LLVMValueRef *rsrc)
3536 {
3537 struct si_shader_context *ctx = si_shader_context(bld_base);
3538 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3539 ctx->param_images);
3540 LLVMValueRef index;
3541 bool dcc_off = is_store;
3542
3543 assert(image->Register.File == TGSI_FILE_IMAGE);
3544
3545 if (!image->Register.Indirect) {
3546 const struct tgsi_shader_info *info = bld_base->info;
3547 unsigned images_writemask = info->images_store |
3548 info->images_atomic;
3549
3550 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3551
3552 if (images_writemask & (1 << image->Register.Index))
3553 dcc_off = true;
3554 } else {
3555 /* From the GL_ARB_shader_image_load_store extension spec:
3556 *
3557 * If a shader performs an image load, store, or atomic
3558 * operation using an image variable declared as an array,
3559 * and if the index used to select an individual element is
3560 * negative or greater than or equal to the size of the
3561 * array, the results of the operation are undefined but may
3562 * not lead to termination.
3563 */
3564 index = get_bounded_indirect_index(ctx, &image->Indirect,
3565 image->Register.Index,
3566 SI_NUM_IMAGES);
3567 }
3568
3569 *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3570 if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3571 *rsrc = force_dcc_off(ctx, *rsrc);
3572 }
3573
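/* Fetch image coordinates as i32s, applying the GFX9 fixups below:
 * 1D targets get a dummy Y coordinate and plain 2D targets get
 * BASE_ARRAY from the descriptor appended as a third coordinate.
 */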
3574 static LLVMValueRef image_fetch_coords(
3575 struct lp_build_tgsi_context *bld_base,
3576 const struct tgsi_full_instruction *inst,
3577 unsigned src, LLVMValueRef desc)
3578 {
3579 struct si_shader_context *ctx = si_shader_context(bld_base);
3580 struct gallivm_state *gallivm = &ctx->gallivm;
3581 LLVMBuilderRef builder = gallivm->builder;
3582 unsigned target = inst->Memory.Texture;
3583 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3584 LLVMValueRef coords[4];
3585 LLVMValueRef tmp;
3586 int chan;
3587
3588 for (chan = 0; chan < num_coords; ++chan) {
3589 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3590 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3591 coords[chan] = tmp;
3592 }
3593
3594 if (ctx->screen->b.chip_class >= GFX9) {
3595 /* 1D textures are allocated and used as 2D on GFX9. */
3596 if (target == TGSI_TEXTURE_1D) {
3597 coords[1] = ctx->i32_0;
3598 num_coords++;
3599 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3600 coords[2] = coords[1];
3601 coords[1] = ctx->i32_0;
3602 num_coords++;
3603 } else if (target == TGSI_TEXTURE_2D) {
3604 /* The hw can't bind a slice of a 3D image as a 2D
3605 * image, because it ignores BASE_ARRAY if the target
3606 * is 3D. The workaround is to read BASE_ARRAY and set
3607 * it as the 3rd address operand for all 2D images.
3608 */
3609 LLVMValueRef first_layer, const5, mask;
3610
3611 const5 = LLVMConstInt(ctx->i32, 5, 0);
3612 mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3613 first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3614 first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3615
3616 coords[2] = first_layer;
3617 num_coords++;
3618 }
3619 }
3620
3621 if (num_coords == 1)
3622 return coords[0];
3623
3624 if (num_coords == 3) {
3625 /* LLVM has difficulties lowering 3-element vectors. */
3626 coords[3] = bld_base->uint_bld.undef;
3627 num_coords = 4;
3628 }
3629
3630 return lp_build_gather_values(gallivm, coords, num_coords);
3631 }
3632
3633 /**
3634 * Append the extra mode bits that are used by image load and store.
3635 */
3636 static void image_append_args(
3637 struct si_shader_context *ctx,
3638 struct lp_build_emit_data * emit_data,
3639 unsigned target,
3640 bool atomic,
3641 bool force_glc)
3642 {
3643 const struct tgsi_full_instruction *inst = emit_data->inst;
3644 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3645 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3646 LLVMValueRef r128 = i1false;
3647 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3648 LLVMValueRef glc =
3649 force_glc ||
3650 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3651 i1true : i1false;
3652 LLVMValueRef slc = i1false;
3653 LLVMValueRef lwe = i1false;
3654
3655 if (atomic || (HAVE_LLVM <= 0x0309)) {
3656 emit_data->args[emit_data->arg_count++] = r128;
3657 emit_data->args[emit_data->arg_count++] = da;
3658 if (!atomic) {
3659 emit_data->args[emit_data->arg_count++] = glc;
3660 }
3661 emit_data->args[emit_data->arg_count++] = slc;
3662 return;
3663 }
3664
3665 /* HAVE_LLVM >= 0x0400 */
3666 emit_data->args[emit_data->arg_count++] = glc;
3667 emit_data->args[emit_data->arg_count++] = slc;
3668 emit_data->args[emit_data->arg_count++] = lwe;
3669 emit_data->args[emit_data->arg_count++] = da;
3670 }
3671
3672 /**
3673 * Append the resource and indexing arguments for buffer intrinsics.
3674 *
3675 * \param rsrc the v4i32 buffer resource
3676 * \param index index into the buffer (stride-based)
3677 * \param offset byte offset into the buffer
3678 */
3679 static void buffer_append_args(
3680 struct si_shader_context *ctx,
3681 struct lp_build_emit_data *emit_data,
3682 LLVMValueRef rsrc,
3683 LLVMValueRef index,
3684 LLVMValueRef offset,
3685 bool atomic,
3686 bool force_glc)
3687 {
3688 const struct tgsi_full_instruction *inst = emit_data->inst;
3689 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3690 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3691
3692 emit_data->args[emit_data->arg_count++] = rsrc;
3693 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3694 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3695 if (!atomic) {
3696 emit_data->args[emit_data->arg_count++] =
3697 force_glc ||
3698 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3699 i1true : i1false; /* glc */
3700 }
3701 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3702 }
3703
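/* Fetch the arguments of TGSI LOAD: SSBOs and buffer images go through
 * buffer_append_args, other images get (coords, rsrc, dmask) followed
 * by the mode bits from image_append_args.
 */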
3704 static void load_fetch_args(
3705 struct lp_build_tgsi_context * bld_base,
3706 struct lp_build_emit_data * emit_data)
3707 {
3708 struct si_shader_context *ctx = si_shader_context(bld_base);
3709 struct gallivm_state *gallivm = &ctx->gallivm;
3710 const struct tgsi_full_instruction * inst = emit_data->inst;
3711 unsigned target = inst->Memory.Texture;
3712 LLVMValueRef rsrc;
3713
3714 emit_data->dst_type = ctx->v4f32;
3715
3716 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3717 LLVMBuilderRef builder = gallivm->builder;
3718 LLVMValueRef offset;
3719 LLVMValueRef tmp;
3720
3721 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3722
3723 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3724 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3725
3726 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3727 offset, false, false);
3728 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3729 LLVMValueRef coords;
3730
3731 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3732 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3733
3734 if (target == TGSI_TEXTURE_BUFFER) {
3735 buffer_append_args(ctx, emit_data, rsrc, coords,
3736 ctx->i32_0, false, false);
3737 } else {
3738 emit_data->args[0] = coords;
3739 emit_data->args[1] = rsrc;
3740 emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3741 emit_data->arg_count = 3;
3742
3743 image_append_args(ctx, emit_data, target, false, false);
3744 }
3745 }
3746 }
3747
3748 static unsigned get_load_intr_attribs(bool readonly_memory)
3749 {
3750 /* READNONE means writes can't affect the result, while READONLY
3751 * merely means the function doesn't write; writes can still affect it. */
3752 return readonly_memory && HAVE_LLVM >= 0x0400 ?
3753 LP_FUNC_ATTR_READNONE :
3754 LP_FUNC_ATTR_READONLY;
3755 }
3756
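/* Likewise for stores: INACCESSIBLE_MEM_ONLY tells LLVM the call touches
 * no memory visible to the module, while WRITEONLY only promises that
 * the call doesn't read.
 */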
3757 static unsigned get_store_intr_attribs(bool writeonly_memory)
3758 {
3759 return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3760 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3761 LP_FUNC_ATTR_WRITEONLY;
3762 }
3763
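/* Emit llvm.amdgcn.buffer.load for an SSBO read, picking the 1-, 2- or
 * 4-channel variant based on the destination writemask (3 channels are
 * rounded up to 4).
 */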
3764 static void load_emit_buffer(struct si_shader_context *ctx,
3765 struct lp_build_emit_data *emit_data,
3766 bool readonly_memory)
3767 {
3768 const struct tgsi_full_instruction *inst = emit_data->inst;
3769 struct gallivm_state *gallivm = &ctx->gallivm;
3770 LLVMBuilderRef builder = gallivm->builder;
3771 uint writemask = inst->Dst[0].Register.WriteMask;
3772 uint count = util_last_bit(writemask);
3773 const char *intrinsic_name;
3774 LLVMTypeRef dst_type;
3775
3776 switch (count) {
3777 case 1:
3778 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3779 dst_type = ctx->f32;
3780 break;
3781 case 2:
3782 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3783 dst_type = LLVMVectorType(ctx->f32, 2);
3784 break;
3785 default: /* 3 and 4 */
3786 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3787 dst_type = ctx->v4f32;
3788 count = 4;
3789 }
3790
3791 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3792 builder, intrinsic_name, dst_type,
3793 emit_data->args, emit_data->arg_count,
3794 get_load_intr_attribs(readonly_memory));
3795 }
3796
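/* Compute a typed pointer into shared (LDS) memory from the byte offset
 * fetched from source operand \p arg.
 */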
3797 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3798 const struct tgsi_full_instruction *inst,
3799 LLVMTypeRef type, int arg)
3800 {
3801 struct gallivm_state *gallivm = &ctx->gallivm;
3802 LLVMBuilderRef builder = gallivm->builder;
3803 LLVMValueRef offset, ptr;
3804 int addr_space;
3805
3806 offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3807 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3808
3809 ptr = ctx->shared_memory;
3810 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3811 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3812 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3813
3814 return ptr;
3815 }
3816
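/* Load one dword per channel selected by the destination writemask;
 * unselected channels become undef.
 */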
3817 static void load_emit_memory(
3818 struct si_shader_context *ctx,
3819 struct lp_build_emit_data *emit_data)
3820 {
3821 const struct tgsi_full_instruction *inst = emit_data->inst;
3822 struct gallivm_state *gallivm = &ctx->gallivm;
3823 LLVMBuilderRef builder = gallivm->builder;
3824 unsigned writemask = inst->Dst[0].Register.WriteMask;
3825 LLVMValueRef channels[4], ptr, derived_ptr, index;
3826 int chan;
3827
3828 ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3829
3830 for (chan = 0; chan < 4; ++chan) {
3831 if (!(writemask & (1 << chan))) {
3832 channels[chan] = LLVMGetUndef(ctx->f32);
3833 continue;
3834 }
3835
3836 index = LLVMConstInt(ctx->i32, chan, 0);
3837 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3838 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3839 }
3840 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3841 }
3842
3843 /**
3844 * Return true if the memory accessed by a LOAD or STORE instruction is
3845 * read-only or write-only, respectively.
3846 *
3847 * \param shader_buffers_reverse_access_mask
3848 * For LOAD, set this to (store | atomic) slot usage in the shader.
3849 * For STORE, set this to (load | atomic) slot usage in the shader.
3850 * \param images_reverse_access_mask Same as above, but for images.
3851 */
3852 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3853 const struct tgsi_shader_info *info,
3854 unsigned shader_buffers_reverse_access_mask,
3855 unsigned images_reverse_access_mask)
3856 {
3857 /* RESTRICT means NOALIAS.
3858 * If there are no writes, we can assume the accessed memory is read-only.
3859 * If there are no reads, we can assume the accessed memory is write-only.
3860 */
3861 if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3862 unsigned reverse_access_mask;
3863
3864 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3865 reverse_access_mask = shader_buffers_reverse_access_mask;
3866 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3867 reverse_access_mask = info->images_buffers &
3868 images_reverse_access_mask;
3869 } else {
3870 reverse_access_mask = ~info->images_buffers &
3871 images_reverse_access_mask;
3872 }
3873
3874 if (inst->Src[0].Register.Indirect) {
3875 if (!reverse_access_mask)
3876 return true;
3877 } else {
3878 if (!(reverse_access_mask &
3879 (1u << inst->Src[0].Register.Index)))
3880 return true;
3881 }
3882 }
3883
3884 /* If there are no buffer writes (for both shader buffers & image
3885 * buffers), it implies that buffer memory is read-only.
3886 * If there are no buffer reads (for both shader buffers & image
3887 * buffers), it implies that buffer memory is write-only.
3888 *
3889 * Same for the case when there are no writes/reads for non-buffer
3890 * images.
3891 */
3892 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3893 (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3894 inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3895 if (!shader_buffers_reverse_access_mask &&
3896 !(info->images_buffers & images_reverse_access_mask))
3897 return true;
3898 } else {
3899 if (!(~info->images_buffers & images_reverse_access_mask))
3900 return true;
3901 }
3902 return false;
3903 }
3904
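/* Emit TGSI LOAD for LDS, buffer and image resources. Volatile loads
 * drain the VM counter first, and provably read-only memory lets LLVM
 * use stronger attributes (see get_load_intr_attribs).
 */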
3905 static void load_emit(
3906 const struct lp_build_tgsi_action *action,
3907 struct lp_build_tgsi_context *bld_base,
3908 struct lp_build_emit_data *emit_data)
3909 {
3910 struct si_shader_context *ctx = si_shader_context(bld_base);
3911 struct gallivm_state *gallivm = &ctx->gallivm;
3912 LLVMBuilderRef builder = gallivm->builder;
3913 const struct tgsi_full_instruction * inst = emit_data->inst;
3914 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3915 char intrinsic_name[64];
3916 bool readonly_memory = false;
3917
3918 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3919 load_emit_memory(ctx, emit_data);
3920 return;
3921 }
3922
3923 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3924 emit_waitcnt(ctx, VM_CNT);
3925
3926 readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3927 is_oneway_access_only(inst, info,
3928 info->shader_buffers_store |
3929 info->shader_buffers_atomic,
3930 info->images_store |
3931 info->images_atomic);
3932
3933 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3934 load_emit_buffer(ctx, emit_data, readonly_memory);
3935 return;
3936 }
3937
3938 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3939 emit_data->output[emit_data->chan] =
3940 lp_build_intrinsic(
3941 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3942 emit_data->args, emit_data->arg_count,
3943 get_load_intr_attribs(readonly_memory));
3944 } else {
3945 ac_get_image_intr_name("llvm.amdgcn.image.load",
3946 emit_data->dst_type, /* vdata */
3947 LLVMTypeOf(emit_data->args[0]), /* coords */
3948 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3949 intrinsic_name, sizeof(intrinsic_name));
3950
3951 emit_data->output[emit_data->chan] =
3952 lp_build_intrinsic(
3953 builder, intrinsic_name, emit_data->dst_type,
3954 emit_data->args, emit_data->arg_count,
3955 get_load_intr_attribs(readonly_memory));
3956 }
3957 }
3958
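/* Fetch the arguments of TGSI STORE: the data to store comes first,
 * followed by the buffer offset or image coordinates. See below for the
 * SI-specific GLC workaround on image stores.
 */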
3959 static void store_fetch_args(
3960 struct lp_build_tgsi_context * bld_base,
3961 struct lp_build_emit_data * emit_data)
3962 {
3963 struct si_shader_context *ctx = si_shader_context(bld_base);
3964 struct gallivm_state *gallivm = &ctx->gallivm;
3965 LLVMBuilderRef builder = gallivm->builder;
3966 const struct tgsi_full_instruction * inst = emit_data->inst;
3967 struct tgsi_full_src_register memory;
3968 LLVMValueRef chans[4];
3969 LLVMValueRef data;
3970 LLVMValueRef rsrc;
3971 unsigned chan;
3972
3973 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3974
3975 for (chan = 0; chan < 4; ++chan) {
3976 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3977 }
3978 data = lp_build_gather_values(gallivm, chans, 4);
3979
3980 emit_data->args[emit_data->arg_count++] = data;
3981
3982 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3983
3984 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3985 LLVMValueRef offset;
3986 LLVMValueRef tmp;
3987
3988 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3989
3990 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3991 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3992
3993 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3994 offset, false, false);
3995 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3996 unsigned target = inst->Memory.Texture;
3997 LLVMValueRef coords;
3998
3999 /* 8bit/16bit TC L1 write corruption bug on SI.
4000 * All store opcodes not aligned to a dword are affected.
4001 *
4002 * The only way to get unaligned stores in radeonsi is through
4003 * shader images.
4004 */
4005 bool force_glc = ctx->screen->b.chip_class == SI;
4006
4007 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
4008 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
4009
4010 if (target == TGSI_TEXTURE_BUFFER) {
4011 buffer_append_args(ctx, emit_data, rsrc, coords,
4012 ctx->i32_0, false, force_glc);
4013 } else {
4014 emit_data->args[1] = coords;
4015 emit_data->args[2] = rsrc;
4016 emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
4017 emit_data->arg_count = 4;
4018
4019 image_append_args(ctx, emit_data, target, false, force_glc);
4020 }
4021 }
4022 }
4023
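/* Emit llvm.amdgcn.buffer.store for an SSBO write. The writemask is
 * scanned in runs of consecutive bits, with voffset advanced by
 * start * 4 bytes for each run.
 */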
4024 static void store_emit_buffer(
4025 struct si_shader_context *ctx,
4026 struct lp_build_emit_data *emit_data,
4027 bool writeonly_memory)
4028 {
4029 const struct tgsi_full_instruction *inst = emit_data->inst;
4030 struct gallivm_state *gallivm = &ctx->gallivm;
4031 LLVMBuilderRef builder = gallivm->builder;
4032 LLVMValueRef base_data = emit_data->args[0];
4033 LLVMValueRef base_offset = emit_data->args[3];
4034 unsigned writemask = inst->Dst[0].Register.WriteMask;
4035
4036 while (writemask) {
4037 int start, count;
4038 const char *intrinsic_name;
4039 LLVMValueRef data;
4040 LLVMValueRef offset;
4041 LLVMValueRef tmp;
4042
4043 u_bit_scan_consecutive_range(&writemask, &start, &count);
4044
4045 /* Due to an LLVM limitation, split 3-element writes
4046 * into a 2-element and a 1-element write. */
4047 if (count == 3) {
4048 writemask |= 1 << (start + 2);
4049 count = 2;
4050 }
4051
4052 if (count == 4) {
4053 data = base_data;
4054 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
4055 } else if (count == 2) {
4056 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
4057
4058 tmp = LLVMBuildExtractElement(
4059 builder, base_data,
4060 LLVMConstInt(ctx->i32, start, 0), "");
4061 data = LLVMBuildInsertElement(
4062 builder, LLVMGetUndef(v2f32), tmp,
4063 ctx->i32_0, "");
4064
4065 tmp = LLVMBuildExtractElement(
4066 builder, base_data,
4067 LLVMConstInt(ctx->i32, start + 1, 0), "");
4068 data = LLVMBuildInsertElement(
4069 builder, data, tmp, ctx->i32_1, "");
4070
4071 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
4072 } else {
4073 assert(count == 1);
4074 data = LLVMBuildExtractElement(
4075 builder, base_data,
4076 LLVMConstInt(ctx->i32, start, 0), "");
4077 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
4078 }
4079
4080 offset = base_offset;
4081 if (start != 0) {
4082 offset = LLVMBuildAdd(
4083 builder, offset,
4084 LLVMConstInt(ctx->i32, start * 4, 0), "");
4085 }
4086
4087 emit_data->args[0] = data;
4088 emit_data->args[3] = offset;
4089
4090 lp_build_intrinsic(
4091 builder, intrinsic_name, emit_data->dst_type,
4092 emit_data->args, emit_data->arg_count,
4093 get_store_intr_attribs(writeonly_memory));
4094 }
4095 }
4096
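/* Store the channels selected by the writemask into LDS one dword at
 * a time.
 */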
4097 static void store_emit_memory(
4098 struct si_shader_context *ctx,
4099 struct lp_build_emit_data *emit_data)
4100 {
4101 const struct tgsi_full_instruction *inst = emit_data->inst;
4102 struct gallivm_state *gallivm = &ctx->gallivm;
4103 LLVMBuilderRef builder = gallivm->builder;
4104 unsigned writemask = inst->Dst[0].Register.WriteMask;
4105 LLVMValueRef ptr, derived_ptr, data, index;
4106 int chan;
4107
4108 ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
4109
4110 for (chan = 0; chan < 4; ++chan) {
4111 if (!(writemask & (1 << chan))) {
4112 continue;
4113 }
4114 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
4115 index = LLVMConstInt(ctx->i32, chan, 0);
4116 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
4117 LLVMBuildStore(builder, data, derived_ptr);
4118 }
4119 }
4120
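/* Emit TGSI STORE, mirroring load_emit: LDS, buffer and image stores,
 * with provably write-only memory relaxing the intrinsic attributes.
 */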
4121 static void store_emit(
4122 const struct lp_build_tgsi_action *action,
4123 struct lp_build_tgsi_context *bld_base,
4124 struct lp_build_emit_data *emit_data)
4125 {
4126 struct si_shader_context *ctx = si_shader_context(bld_base);
4127 struct gallivm_state *gallivm = &ctx->gallivm;
4128 LLVMBuilderRef builder = gallivm->builder;
4129 const struct tgsi_full_instruction * inst = emit_data->inst;
4130 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
4131 unsigned target = inst->Memory.Texture;
4132 char intrinsic_name[64];
4133 bool writeonly_memory = false;
4134
4135 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
4136 store_emit_memory(ctx, emit_data);
4137 return;
4138 }
4139
4140 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
4141 emit_waitcnt(ctx, VM_CNT);
4142
4143 writeonly_memory = is_oneway_access_only(inst, info,
4144 info->shader_buffers_load |
4145 info->shader_buffers_atomic,
4146 info->images_load |
4147 info->images_atomic);
4148
4149 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4150 store_emit_buffer(ctx, emit_data, writeonly_memory);
4151 return;
4152 }
4153
4154 if (target == TGSI_TEXTURE_BUFFER) {
4155 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4156 builder, "llvm.amdgcn.buffer.store.format.v4f32",
4157 emit_data->dst_type, emit_data->args,
4158 emit_data->arg_count,
4159 get_store_intr_attribs(writeonly_memory));
4160 } else {
4161 ac_get_image_intr_name("llvm.amdgcn.image.store",
4162 LLVMTypeOf(emit_data->args[0]), /* vdata */
4163 LLVMTypeOf(emit_data->args[1]), /* coords */
4164 LLVMTypeOf(emit_data->args[2]), /* rsrc */
4165 intrinsic_name, sizeof(intrinsic_name));
4166
4167 emit_data->output[emit_data->chan] =
4168 lp_build_intrinsic(
4169 builder, intrinsic_name, emit_data->dst_type,
4170 emit_data->args, emit_data->arg_count,
4171 get_store_intr_attribs(writeonly_memory));
4172 }
4173 }
4174
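/* Fetch the arguments of a TGSI atomic: the data operand(s) come first
 * (for ATOMCAS, the new value precedes the comparison value per the
 * hardware order noted below), then the buffer offset or image coords.
 */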
4175 static void atomic_fetch_args(
4176 struct lp_build_tgsi_context * bld_base,
4177 struct lp_build_emit_data * emit_data)
4178 {
4179 struct si_shader_context *ctx = si_shader_context(bld_base);
4180 struct gallivm_state *gallivm = &ctx->gallivm;
4181 LLVMBuilderRef builder = gallivm->builder;
4182 const struct tgsi_full_instruction * inst = emit_data->inst;
4183 LLVMValueRef data1, data2;
4184 LLVMValueRef rsrc;
4185 LLVMValueRef tmp;
4186
4187 emit_data->dst_type = ctx->f32;
4188
4189 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4190 data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4191
4192 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4193 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4194 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4195 }
4196
4197 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflects the hardware order
4198 * of arguments, which is reversed relative to TGSI (and GLSL).
4199 */
4200 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4201 emit_data->args[emit_data->arg_count++] = data2;
4202 emit_data->args[emit_data->arg_count++] = data1;
4203
4204 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4205 LLVMValueRef offset;
4206
4207 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4208
4209 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4210 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4211
4212 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4213 offset, true, false);
4214 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4215 unsigned target = inst->Memory.Texture;
4216 LLVMValueRef coords;
4217
4218 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4219 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4220
4221 if (target == TGSI_TEXTURE_BUFFER) {
4222 buffer_append_args(ctx, emit_data, rsrc, coords,
4223 ctx->i32_0, true, false);
4224 } else {
4225 emit_data->args[emit_data->arg_count++] = coords;
4226 emit_data->args[emit_data->arg_count++] = rsrc;
4227
4228 image_append_args(ctx, emit_data, target, true, false);
4229 }
4230 }
4231 }
4232
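/* Emit a TGSI atomic on shared (LDS) memory with native LLVM atomics:
 * ATOMCAS becomes cmpxchg and the rest become atomicrmw.
 */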
4233 static void atomic_emit_memory(struct si_shader_context *ctx,
4234 struct lp_build_emit_data *emit_data)
{
4235 struct gallivm_state *gallivm = &ctx->gallivm;
4236 LLVMBuilderRef builder = gallivm->builder;
4237 const struct tgsi_full_instruction * inst = emit_data->inst;
4238 LLVMValueRef ptr, result, arg;
4239
4240 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4241
4242 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4243 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4244
4245 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4246 LLVMValueRef new_data;
4247 new_data = lp_build_emit_fetch(&ctx->bld_base,
4248 inst, 3, 0);
4249
4250 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4251
4252 #if HAVE_LLVM >= 0x0309
4253 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4254 LLVMAtomicOrderingSequentiallyConsistent,
4255 LLVMAtomicOrderingSequentiallyConsistent,
4256 false);
4257 #endif
4258
4259 result = LLVMBuildExtractValue(builder, result, 0, "");
4260 } else {
4261 LLVMAtomicRMWBinOp op;
4262
4263 switch(inst->Instruction.Opcode) {
4264 case TGSI_OPCODE_ATOMUADD:
4265 op = LLVMAtomicRMWBinOpAdd;
4266 break;
4267 case TGSI_OPCODE_ATOMXCHG:
4268 op = LLVMAtomicRMWBinOpXchg;
4269 break;
4270 case TGSI_OPCODE_ATOMAND:
4271 op = LLVMAtomicRMWBinOpAnd;
4272 break;
4273 case TGSI_OPCODE_ATOMOR:
4274 op = LLVMAtomicRMWBinOpOr;
4275 break;
4276 case TGSI_OPCODE_ATOMXOR:
4277 op = LLVMAtomicRMWBinOpXor;
4278 break;
4279 case TGSI_OPCODE_ATOMUMIN:
4280 op = LLVMAtomicRMWBinOpUMin;
4281 break;
4282 case TGSI_OPCODE_ATOMUMAX:
4283 op = LLVMAtomicRMWBinOpUMax;
4284 break;
4285 case TGSI_OPCODE_ATOMIMIN:
4286 op = LLVMAtomicRMWBinOpMin;
4287 break;
4288 case TGSI_OPCODE_ATOMIMAX:
4289 op = LLVMAtomicRMWBinOpMax;
4290 break;
4291 default:
4292 unreachable("unknown atomic opcode");
4293 }
4294
4295 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4296 LLVMAtomicOrderingSequentiallyConsistent,
4297 false);
4298 }
4299 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4300 }
4301
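/* Emit a TGSI atomic on buffer or image memory via the
 * llvm.amdgcn.{buffer,image}.atomic.* intrinsics; LDS atomics are
 * handled by atomic_emit_memory above.
 */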
4302 static void atomic_emit(
4303 const struct lp_build_tgsi_action *action,
4304 struct lp_build_tgsi_context *bld_base,
4305 struct lp_build_emit_data *emit_data)
4306 {
4307 struct si_shader_context *ctx = si_shader_context(bld_base);
4308 struct gallivm_state *gallivm = &ctx->gallivm;
4309 LLVMBuilderRef builder = gallivm->builder;
4310 const struct tgsi_full_instruction * inst = emit_data->inst;
4311 char intrinsic_name[40];
4312 LLVMValueRef tmp;
4313
4314 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4315 atomic_emit_memory(ctx, emit_data);
4316 return;
4317 }
4318
4319 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4320 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4321 snprintf(intrinsic_name, sizeof(intrinsic_name),
4322 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4323 } else {
4324 LLVMValueRef coords;
4325 char coords_type[8];
4326
4327 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4328 coords = emit_data->args[2];
4329 else
4330 coords = emit_data->args[1];
4331
4332 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4333 snprintf(intrinsic_name, sizeof(intrinsic_name),
4334 "llvm.amdgcn.image.atomic.%s.%s",
4335 action->intr_name, coords_type);
4336 }
4337
4338 tmp = lp_build_intrinsic(
4339 builder, intrinsic_name, ctx->i32,
4340 emit_data->args, emit_data->arg_count, 0);
4341 emit_data->output[emit_data->chan] =
4342 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4343 }
4344
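/* Pack the texture fetch parameters into an ac_image_args and stash it
 * in emit_data->args; the address is padded with undefs to the next
 * power-of-two vector length.
 */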
4345 static void set_tex_fetch_args(struct si_shader_context *ctx,
4346 struct lp_build_emit_data *emit_data,
4347 unsigned target,
4348 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4349 LLVMValueRef *param, unsigned count,
4350 unsigned dmask)
4351 {
4352 struct gallivm_state *gallivm = &ctx->gallivm;
4353 struct ac_image_args args = {};
4354
4355 /* Pad to power of two vector */
4356 while (count < util_next_power_of_two(count))
4357 param[count++] = LLVMGetUndef(ctx->i32);
4358
4359 if (count > 1)
4360 args.addr = lp_build_gather_values(gallivm, param, count);
4361 else
4362 args.addr = param[0];
4363
4364 args.resource = res_ptr;
4365 args.sampler = samp_ptr;
4366 args.dmask = dmask;
4367 args.unorm = target == TGSI_TEXTURE_RECT ||
4368 target == TGSI_TEXTURE_SHADOWRECT;
4369 args.da = tgsi_is_array_sampler(target);
4370
4371 /* Ugly, but we seem to have no other choice right now. */
4372 STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4373 memcpy(emit_data->args, &args, sizeof(args));
4374 }
4375
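/* Fix up a resinfo result: on GFX9 the layer count of 1D arrays is
 * moved from Z back to Y, and for cube arrays the layer count is
 * divided by 6 to yield the number of cubes.
 */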
4376 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4377 unsigned target, LLVMValueRef out)
4378 {
4379 LLVMBuilderRef builder = ctx->gallivm.builder;
4380
4381 /* 1D textures are allocated and used as 2D on GFX9. */
4382 if (ctx->screen->b.chip_class >= GFX9 &&
4383 (target == TGSI_TEXTURE_1D_ARRAY ||
4384 target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4385 LLVMValueRef layers =
4386 LLVMBuildExtractElement(builder, out,
4387 LLVMConstInt(ctx->i32, 2, 0), "");
4388 out = LLVMBuildInsertElement(builder, out, layers,
4389 ctx->i32_1, "");
4390 }
4391
4392 /* Divide the number of layers by 6 to get the number of cubes. */
4393 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4394 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4395 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4396
4397 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4398 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4399
4400 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4401 }
4402 return out;
4403 }
4404
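/* RESQ: query the size of a buffer or image. 3D images are queried
 * with a 2D-array target so that the DA bit matches how image ops
 * address them (tgsi_is_array_image treats 3D as an array).
 */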
4405 static void resq_fetch_args(
4406 struct lp_build_tgsi_context * bld_base,
4407 struct lp_build_emit_data * emit_data)
4408 {
4409 struct si_shader_context *ctx = si_shader_context(bld_base);
4410 const struct tgsi_full_instruction *inst = emit_data->inst;
4411 const struct tgsi_full_src_register *reg = &inst->Src[0];
4412
4413 emit_data->dst_type = ctx->v4i32;
4414
4415 if (reg->Register.File == TGSI_FILE_BUFFER) {
4416 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4417 emit_data->arg_count = 1;
4418 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4419 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4420 &emit_data->args[0]);
4421 emit_data->arg_count = 1;
4422 } else {
4423 LLVMValueRef res_ptr;
4424 unsigned image_target;
4425
4426 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4427 image_target = TGSI_TEXTURE_2D_ARRAY;
4428 else
4429 image_target = inst->Memory.Texture;
4430
4431 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4432 &res_ptr);
4433 set_tex_fetch_args(ctx, emit_data, image_target,
4434 res_ptr, NULL, &ctx->i32_0, 1,
4435 0xf);
4436 }
4437 }
4438
4439 static void resq_emit(
4440 const struct lp_build_tgsi_action *action,
4441 struct lp_build_tgsi_context *bld_base,
4442 struct lp_build_emit_data *emit_data)
4443 {
4444 struct si_shader_context *ctx = si_shader_context(bld_base);
4445 struct gallivm_state *gallivm = &ctx->gallivm;
4446 LLVMBuilderRef builder = gallivm->builder;
4447 const struct tgsi_full_instruction *inst = emit_data->inst;
4448 LLVMValueRef out;
4449
4450 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4451 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4452 LLVMConstInt(ctx->i32, 2, 0), "");
4453 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4454 out = get_buffer_size(bld_base, emit_data->args[0]);
4455 } else {
4456 struct ac_image_args args;
4457
4458 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4459 args.opcode = ac_image_get_resinfo;
4460 out = ac_build_image_opcode(&ctx->ac, &args);
4461
4462 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4463 }
4464
4465 emit_data->output[emit_data->chan] = out;
4466 }
4467
4468 static const struct lp_build_tgsi_action tex_action;
4469
4470 enum desc_type {
4471 DESC_IMAGE,
4472 DESC_BUFFER,
4473 DESC_FMASK,
4474 DESC_SAMPLER,
4475 };
4476
4477 /**
4478 * Load an image view, fmask view, or sampler state descriptor.
4479 */
4480 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4481 LLVMValueRef list, LLVMValueRef index,
4482 enum desc_type type)
4483 {
4484 struct gallivm_state *gallivm = &ctx->gallivm;
4485 LLVMBuilderRef builder = gallivm->builder;
4486
4487 switch (type) {
4488 case DESC_IMAGE:
4489 /* The image is at [0:7]. */
4490 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4491 break;
4492 case DESC_BUFFER:
4493 /* The buffer is at [4:7]. */
4494 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4495 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4496 list = LLVMBuildPointerCast(builder, list,
4497 const_array(ctx->v4i32, 0), "");
4498 break;
4499 case DESC_FMASK:
4500 /* The FMASK is at [8:15]. */
4501 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4502 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4503 break;
4504 case DESC_SAMPLER:
4505 /* The sampler state is at [12:15]. */
4506 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4507 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4508 list = LLVMBuildPointerCast(builder, list,
4509 const_array(ctx->v4i32, 0), "");
4510 break;
4511 }
4512
4513 return ac_build_indexed_load_const(&ctx->ac, list, index);
4514 }
4515
4516 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4517 *
4518 * SI-CI:
4519 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4520 * filtering manually. The driver sets img7 to a mask clearing
4521 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4522 * s_and_b32 samp0, samp0, img7
4523 *
4524 * VI:
4525 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4526 */
4527 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4528 LLVMValueRef res, LLVMValueRef samp)
4529 {
4530 LLVMBuilderRef builder = ctx->gallivm.builder;
4531 LLVMValueRef img7, samp0;
4532
4533 if (ctx->screen->b.chip_class >= VI)
4534 return samp;
4535
4536 img7 = LLVMBuildExtractElement(builder, res,
4537 LLVMConstInt(ctx->i32, 7, 0), "");
4538 samp0 = LLVMBuildExtractElement(builder, samp,
4539 ctx->i32_0, "");
4540 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4541 return LLVMBuildInsertElement(builder, samp, samp0,
4542 ctx->i32_0, "");
4543 }
4544
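/* Load the resource, sampler and (for MSAA targets) FMASK descriptors
 * for a texture instruction; the sampler register is always the last
 * source. Buffer targets have no sampler state.
 */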
4545 static void tex_fetch_ptrs(
4546 struct lp_build_tgsi_context *bld_base,
4547 struct lp_build_emit_data *emit_data,
4548 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4549 {
4550 struct si_shader_context *ctx = si_shader_context(bld_base);
4551 LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4552 const struct tgsi_full_instruction *inst = emit_data->inst;
4553 const struct tgsi_full_src_register *reg;
4554 unsigned target = inst->Texture.Texture;
4555 unsigned sampler_src;
4556 LLVMValueRef index;
4557
4558 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4559 reg = &emit_data->inst->Src[sampler_src];
4560
4561 if (reg->Register.Indirect) {
4562 index = get_bounded_indirect_index(ctx,
4563 &reg->Indirect,
4564 reg->Register.Index,
4565 SI_NUM_SAMPLERS);
4566 } else {
4567 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4568 }
4569
4570 if (target == TGSI_TEXTURE_BUFFER)
4571 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4572 else
4573 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4574
4575 if (samp_ptr)
4576 *samp_ptr = NULL;
4577 if (fmask_ptr)
4578 *fmask_ptr = NULL;
4579
4580 if (target == TGSI_TEXTURE_2D_MSAA ||
4581 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4582 if (fmask_ptr)
4583 *fmask_ptr = load_sampler_desc(ctx, list, index,
4584 DESC_FMASK);
4585 } else if (target != TGSI_TEXTURE_BUFFER) {
4586 if (samp_ptr) {
4587 *samp_ptr = load_sampler_desc(ctx, list, index,
4588 DESC_SAMPLER);
4589 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4590 }
4591 }
4592 }
4593
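/* TXQ: buffer sizes are read straight from the descriptor; other
 * targets query resinfo at the mip level given in src0.x.
 */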
4594 static void txq_fetch_args(
4595 struct lp_build_tgsi_context *bld_base,
4596 struct lp_build_emit_data *emit_data)
4597 {
4598 struct si_shader_context *ctx = si_shader_context(bld_base);
4599 const struct tgsi_full_instruction *inst = emit_data->inst;
4600 unsigned target = inst->Texture.Texture;
4601 LLVMValueRef res_ptr;
4602 LLVMValueRef address;
4603
4604 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4605
4606 if (target == TGSI_TEXTURE_BUFFER) {
4607 /* Read the size from the buffer descriptor directly. */
4608 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4609 return;
4610 }
4611
4612 /* Textures - set the mip level. */
4613 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4614
4615 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4616 NULL, &address, 1, 0xf);
4617 }
4618
4619 static void txq_emit(const struct lp_build_tgsi_action *action,
4620 struct lp_build_tgsi_context *bld_base,
4621 struct lp_build_emit_data *emit_data)
4622 {
4623 struct si_shader_context *ctx = si_shader_context(bld_base);
4624 struct ac_image_args args;
4625 unsigned target = emit_data->inst->Texture.Texture;
4626
4627 if (target == TGSI_TEXTURE_BUFFER) {
4628 /* Just return the buffer size. */
4629 emit_data->output[emit_data->chan] = emit_data->args[0];
4630 return;
4631 }
4632
4633 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4634
4635 args.opcode = ac_image_get_resinfo;
4636 LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4637
4638 emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4639 }
4640
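/* Build the address vector of a texture opcode. The packing order is:
 * offsets, LOD bias, depth comparison, derivatives, coordinates (with
 * GFX9 1D fixups), then LOD or sample index, all bitcast to i32.
 */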
4641 static void tex_fetch_args(
4642 struct lp_build_tgsi_context *bld_base,
4643 struct lp_build_emit_data *emit_data)
4644 {
4645 struct si_shader_context *ctx = si_shader_context(bld_base);
4646 struct gallivm_state *gallivm = &ctx->gallivm;
4647 const struct tgsi_full_instruction *inst = emit_data->inst;
4648 unsigned opcode = inst->Instruction.Opcode;
4649 unsigned target = inst->Texture.Texture;
4650 LLVMValueRef coords[5], derivs[6];
4651 LLVMValueRef address[16];
4652 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4653 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4654 unsigned count = 0;
4655 unsigned chan;
4656 unsigned num_deriv_channels = 0;
4657 bool has_offset = inst->Texture.NumOffsets > 0;
4658 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4659 unsigned dmask = 0xf;
4660
4661 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4662
4663 if (target == TGSI_TEXTURE_BUFFER) {
4664 emit_data->dst_type = ctx->v4f32;
4665 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
4666 ctx->v16i8, "");
4667 emit_data->args[1] = ctx->i32_0;
4668 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4669 emit_data->arg_count = 3;
4670 return;
4671 }
4672
4673 /* Fetch and project texture coordinates */
4674 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4675 for (chan = 0; chan < 3; chan++ ) {
4676 coords[chan] = lp_build_emit_fetch(bld_base,
4677 emit_data->inst, 0,
4678 chan);
4679 if (opcode == TGSI_OPCODE_TXP)
4680 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4681 TGSI_OPCODE_DIV,
4682 coords[chan],
4683 coords[3]);
4684 }
4685
4686 if (opcode == TGSI_OPCODE_TXP)
4687 coords[3] = bld_base->base.one;
4688
4689 /* Pack offsets. */
4690 if (has_offset &&
4691 opcode != TGSI_OPCODE_TXF &&
4692 opcode != TGSI_OPCODE_TXF_LZ) {
4693 /* The offsets are six-bit signed integers packed like this:
4694 * X=[5:0], Y=[13:8], and Z=[21:16].
4695 */
4696 LLVMValueRef offset[3], pack;
4697
4698 assert(inst->Texture.NumOffsets == 1);
4699
4700 for (chan = 0; chan < 3; chan++) {
4701 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4702 emit_data->inst, 0, chan);
4703 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4704 LLVMConstInt(ctx->i32, 0x3f, 0), "");
4705 if (chan)
4706 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4707 LLVMConstInt(ctx->i32, chan*8, 0), "");
4708 }
4709
4710 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4711 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4712 address[count++] = pack;
4713 }
4714
4715 /* Pack LOD bias value */
4716 if (opcode == TGSI_OPCODE_TXB)
4717 address[count++] = coords[3];
4718 if (opcode == TGSI_OPCODE_TXB2)
4719 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4720
4721 /* Pack depth comparison value */
4722 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4723 LLVMValueRef z;
4724
4725 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4726 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4727 } else {
4728 assert(ref_pos >= 0);
4729 z = coords[ref_pos];
4730 }
4731
4732 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4733 * so the depth comparison value isn't clamped for Z16 and
4734 * Z24 anymore. Do it manually here.
4735 *
4736 * It's unnecessary if the original texture format was
4737 * Z32_FLOAT, but we don't know that here.
4738 */
4739 if (ctx->screen->b.chip_class == VI)
4740 z = ac_build_clamp(&ctx->ac, z);
4741
4742 address[count++] = z;
4743 }
4744
4745 /* Pack user derivatives */
4746 if (opcode == TGSI_OPCODE_TXD) {
4747 int param, num_src_deriv_channels, num_dst_deriv_channels;
4748
4749 switch (target) {
4750 case TGSI_TEXTURE_3D:
4751 num_src_deriv_channels = 3;
4752 num_dst_deriv_channels = 3;
4753 num_deriv_channels = 3;
4754 break;
4755 case TGSI_TEXTURE_2D:
4756 case TGSI_TEXTURE_SHADOW2D:
4757 case TGSI_TEXTURE_RECT:
4758 case TGSI_TEXTURE_SHADOWRECT:
4759 case TGSI_TEXTURE_2D_ARRAY:
4760 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4761 num_src_deriv_channels = 2;
4762 num_dst_deriv_channels = 2;
4763 num_deriv_channels = 2;
4764 break;
4765 case TGSI_TEXTURE_CUBE:
4766 case TGSI_TEXTURE_SHADOWCUBE:
4767 case TGSI_TEXTURE_CUBE_ARRAY:
4768 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4769 /* Cube derivatives will be converted to 2D. */
4770 num_src_deriv_channels = 3;
4771 num_dst_deriv_channels = 3;
4772 num_deriv_channels = 2;
4773 break;
4774 case TGSI_TEXTURE_1D:
4775 case TGSI_TEXTURE_SHADOW1D:
4776 case TGSI_TEXTURE_1D_ARRAY:
4777 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4778 num_src_deriv_channels = 1;
4779
4780 /* 1D textures are allocated and used as 2D on GFX9. */
4781 if (ctx->screen->b.chip_class >= GFX9) {
4782 num_dst_deriv_channels = 2;
4783 num_deriv_channels = 2;
4784 } else {
4785 num_dst_deriv_channels = 1;
4786 num_deriv_channels = 1;
4787 }
4788 break;
4789 default:
4790 unreachable("invalid target");
4791 }
4792
4793 for (param = 0; param < 2; param++) {
4794 for (chan = 0; chan < num_src_deriv_channels; chan++)
4795 derivs[param * num_dst_deriv_channels + chan] =
4796 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4797
4798 /* Fill in the rest with zeros. */
4799 for (chan = num_src_deriv_channels;
4800 chan < num_dst_deriv_channels; chan++)
4801 derivs[param * num_dst_deriv_channels + chan] =
4802 bld_base->base.zero;
4803 }
4804 }
4805
4806 if (target == TGSI_TEXTURE_CUBE ||
4807 target == TGSI_TEXTURE_CUBE_ARRAY ||
4808 target == TGSI_TEXTURE_SHADOWCUBE ||
4809 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4810 ac_prepare_cube_coords(&ctx->ac,
4811 opcode == TGSI_OPCODE_TXD,
4812 target == TGSI_TEXTURE_CUBE_ARRAY ||
4813 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4814 coords, derivs);
4815
4816 if (opcode == TGSI_OPCODE_TXD)
4817 for (int i = 0; i < num_deriv_channels * 2; i++)
4818 address[count++] = derivs[i];
4819
4820 /* Pack texture coordinates */
4821 address[count++] = coords[0];
4822 if (num_coords > 1)
4823 address[count++] = coords[1];
4824 if (num_coords > 2)
4825 address[count++] = coords[2];
4826
4827 /* 1D textures are allocated and used as 2D on GFX9. */
4828 if (ctx->screen->b.chip_class >= GFX9) {
4829 LLVMValueRef filler;
4830
4831 /* Use 0.5, so that we don't sample the border color. */
4832 if (opcode == TGSI_OPCODE_TXF)
4833 filler = ctx->i32_0;
4834 else
4835 filler = LLVMConstReal(ctx->f32, 0.5);
4836
4837 if (target == TGSI_TEXTURE_1D ||
4838 target == TGSI_TEXTURE_SHADOW1D) {
4839 address[count++] = filler;
4840 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4841 target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4842 address[count] = address[count - 1];
4843 address[count - 1] = filler;
4844 count++;
4845 }
4846 }
4847
4848 /* Pack LOD or sample index */
4849 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4850 address[count++] = coords[3];
4851 else if (opcode == TGSI_OPCODE_TXL2)
4852 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4853
4854 if (count > 16) {
4855 assert(!"Cannot handle more than 16 texture address parameters");
4856 count = 16;
4857 }
4858
4859 for (chan = 0; chan < count; chan++ ) {
4860 address[chan] = LLVMBuildBitCast(gallivm->builder,
4861 address[chan], ctx->i32, "");
4862 }
4863
4864 /* Adjust the sample index according to FMASK.
4865 *
4866 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4867 * which is the identity mapping. Each nibble says which physical sample
4868 * should be fetched to get that sample.
4869 *
4870 * For example, 0x11111100 means there are only 2 samples stored and
4871 * the second sample covers 3/4 of the pixel. When reading samples 0
4872 * and 1, return physical sample 0 (determined by the first two 0s
4873 * in FMASK), otherwise return physical sample 1.
4874 *
4875 * The sample index should be adjusted as follows:
4876 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4877 */
4878 if (target == TGSI_TEXTURE_2D_MSAA ||
4879 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4880 struct lp_build_emit_data txf_emit_data = *emit_data;
4881 LLVMValueRef txf_address[4];
4882 /* We only need .xy for non-arrays, and .xyz for arrays. */
4883 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4884 struct tgsi_full_instruction inst = {};
4885
4886 memcpy(txf_address, address, sizeof(txf_address));
4887
4888 /* Read FMASK using TXF_LZ. */
4889 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4890 inst.Texture.Texture = target;
4891 txf_emit_data.inst = &inst;
4892 txf_emit_data.chan = 0;
4893 set_tex_fetch_args(ctx, &txf_emit_data,
4894 target, fmask_ptr, NULL,
4895 txf_address, txf_count, 0xf);
4896 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4897
4898 /* Initialize some constants. */
4899 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4900 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4901
4902 /* Apply the formula. */
4903 LLVMValueRef fmask =
4904 LLVMBuildExtractElement(gallivm->builder,
4905 txf_emit_data.output[0],
4906 ctx->i32_0, "");
4907
4908 unsigned sample_chan = txf_count; /* the sample index is last */
4909
4910 LLVMValueRef sample_index4 =
4911 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4912
4913 LLVMValueRef shifted_fmask =
4914 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4915
4916 LLVMValueRef final_sample =
4917 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4918
4919 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4920 * resource descriptor is 0 (invalid).
4921 */
4922 LLVMValueRef fmask_desc =
4923 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4924 ctx->v8i32, "");
4925
4926 LLVMValueRef fmask_word1 =
4927 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4928 ctx->i32_1, "");
4929
4930 LLVMValueRef word1_is_nonzero =
4931 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4932 fmask_word1, ctx->i32_0, "");
4933
4934 /* Replace the MSAA sample index. */
4935 address[sample_chan] =
4936 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4937 final_sample, address[sample_chan], "");
4938 }
4939
4940 if (opcode == TGSI_OPCODE_TXF ||
4941 opcode == TGSI_OPCODE_TXF_LZ) {
4942 /* add tex offsets */
4943 if (inst->Texture.NumOffsets) {
4944 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4945 const struct tgsi_texture_offset *off = inst->TexOffsets;
4946
4947 assert(inst->Texture.NumOffsets == 1);
4948
4949 switch (target) {
4950 case TGSI_TEXTURE_3D:
4951 address[2] = lp_build_add(uint_bld, address[2],
4952 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4953 /* fall through */
4954 case TGSI_TEXTURE_2D:
4955 case TGSI_TEXTURE_SHADOW2D:
4956 case TGSI_TEXTURE_RECT:
4957 case TGSI_TEXTURE_SHADOWRECT:
4958 case TGSI_TEXTURE_2D_ARRAY:
4959 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4960 address[1] =
4961 lp_build_add(uint_bld, address[1],
4962 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4963 /* fall through */
4964 case TGSI_TEXTURE_1D:
4965 case TGSI_TEXTURE_SHADOW1D:
4966 case TGSI_TEXTURE_1D_ARRAY:
4967 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4968 address[0] =
4969 lp_build_add(uint_bld, address[0],
4970 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4971 break;
4972 /* texture offsets do not apply to other texture targets */
4973 }
4974 }
4975 }
4976
4977 if (opcode == TGSI_OPCODE_TG4) {
4978 unsigned gather_comp = 0;
4979
4980 /* DMASK was repurposed for GATHER4. 4 components are always
4981 * returned and DMASK works like a swizzle - it selects
4982 * the component to fetch. The only valid DMASK values are
4983 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4984 * (red,red,red,red) etc.) The ISA document doesn't mention
4985 * this.
4986 */
4987
4988 /* Get the component index from src1.x for Gather4. */
4989 if (!tgsi_is_shadow_target(target)) {
4990 LLVMValueRef comp_imm;
4991 struct tgsi_src_register src1 = inst->Src[1].Register;
4992
4993 assert(src1.File == TGSI_FILE_IMMEDIATE);
4994
4995 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
4996 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4997 gather_comp = CLAMP(gather_comp, 0, 3);
4998 }
4999
5000 dmask = 1 << gather_comp;
5001 }
5002
5003 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
5004 samp_ptr, address, count, dmask);
5005 }
5006
5007 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
5008 * incorrectly forces nearest filtering if the texture format is integer.
5009 * The only effect it has on Gather4, which always returns 4 texels for
5010 * bilinear filtering, is that the final coordinates are off by 0.5 of
5011 * the texel size.
5012 *
5013 * The workaround is to subtract 0.5 from the unnormalized coordinates,
5014 * or (0.5 / size) from the normalized coordinates.
5015 */
5016 static void si_lower_gather4_integer(struct si_shader_context *ctx,
5017 struct ac_image_args *args,
5018 unsigned target)
5019 {
5020 LLVMBuilderRef builder = ctx->gallivm.builder;
5021 LLVMValueRef coord = args->addr;
5022 LLVMValueRef half_texel[2];
5023 /* Texture coordinates start after:
5024 * {offset, bias, z-compare, derivatives}
5025 * Only the offset and z-compare can occur here.
5026 */
5027 unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
5028 int c;
5029
5030 if (target == TGSI_TEXTURE_RECT ||
5031 target == TGSI_TEXTURE_SHADOWRECT) {
5032 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
5033 } else {
5034 struct tgsi_full_instruction txq_inst = {};
5035 struct lp_build_emit_data txq_emit_data = {};
5036
5037 /* Query the texture size. */
5038 txq_inst.Texture.Texture = target;
5039 txq_emit_data.inst = &txq_inst;
5040 txq_emit_data.dst_type = ctx->v4i32;
5041 set_tex_fetch_args(ctx, &txq_emit_data, target,
5042 args->resource, NULL, &ctx->i32_0,
5043 1, 0xf);
5044 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
5045
5046 /* Compute -0.5 / size. */
5047 for (c = 0; c < 2; c++) {
5048 half_texel[c] =
5049 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
5050 LLVMConstInt(ctx->i32, c, 0), "");
5051 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
5052 half_texel[c] =
5053 lp_build_emit_llvm_unary(&ctx->bld_base,
5054 TGSI_OPCODE_RCP, half_texel[c]);
5055 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
5056 LLVMConstReal(ctx->f32, -0.5), "");
5057 }
5058 }
5059
5060 for (c = 0; c < 2; c++) {
5061 LLVMValueRef tmp;
5062 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
5063
5064 tmp = LLVMBuildExtractElement(builder, coord, index, "");
5065 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
5066 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
5067 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
5068 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
5069 }
5070
5071 args->addr = coord;
5072 }
5073
5074 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
5075 struct lp_build_tgsi_context *bld_base,
5076 struct lp_build_emit_data *emit_data)
5077 {
5078 struct si_shader_context *ctx = si_shader_context(bld_base);
5079 const struct tgsi_full_instruction *inst = emit_data->inst;
5080 struct ac_image_args args;
5081 unsigned opcode = inst->Instruction.Opcode;
5082 unsigned target = inst->Texture.Texture;
5083
5084 if (target == TGSI_TEXTURE_BUFFER) {
5085 emit_data->output[emit_data->chan] =
5086 ac_build_buffer_load_format(&ctx->ac,
5087 emit_data->args[0],
5088 emit_data->args[2],
5089 emit_data->args[1],
5090 true);
5091 return;
5092 }
5093
5094 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
5095
5096 args.opcode = ac_image_sample;
5097 args.compare = tgsi_is_shadow_target(target);
5098 args.offset = inst->Texture.NumOffsets > 0;
5099
5100 switch (opcode) {
5101 case TGSI_OPCODE_TXF:
5102 case TGSI_OPCODE_TXF_LZ:
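/* MSAA resources have no mip levels, and TXF_LZ always fetches
 * level 0, so those use plain image_load; everything else fetches
 * from an explicit mip level via image_load_mip.
 */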
5103 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
5104 target == TGSI_TEXTURE_2D_MSAA ||
5105 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
5106 ac_image_load : ac_image_load_mip;
5107 args.compare = false;
5108 args.offset = false;
5109 break;
5110 case TGSI_OPCODE_LODQ:
5111 args.opcode = ac_image_get_lod;
5112 args.compare = false;
5113 args.offset = false;
5114 break;
5115 case TGSI_OPCODE_TEX:
5116 case TGSI_OPCODE_TEX2:
5117 case TGSI_OPCODE_TXP:
5118 if (ctx->type != PIPE_SHADER_FRAGMENT)
5119 args.level_zero = true;
5120 break;
5121 case TGSI_OPCODE_TEX_LZ:
5122 args.level_zero = true;
5123 break;
5124 case TGSI_OPCODE_TXB:
5125 case TGSI_OPCODE_TXB2:
5126 assert(ctx->type == PIPE_SHADER_FRAGMENT);
5127 args.bias = true;
5128 break;
5129 case TGSI_OPCODE_TXL:
5130 case TGSI_OPCODE_TXL2:
5131 args.lod = true;
5132 break;
5133 case TGSI_OPCODE_TXD:
5134 args.deriv = true;
5135 break;
5136 case TGSI_OPCODE_TG4:
5137 args.opcode = ac_image_gather4;
5138 args.level_zero = true;
5139 break;
5140 default:
5141 assert(0);
5142 return;
5143 }
5144
5145 /* The hardware needs special lowering for Gather4 with integer formats. */
5146 if (ctx->screen->b.chip_class <= VI &&
5147 opcode == TGSI_OPCODE_TG4) {
5148 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5149 /* This will also work with non-constant indexing because of how
5150 * glsl_to_tgsi works, and we intend to preserve that behavior.
5151 */
5152 const unsigned src_idx = 2;
5153 unsigned sampler = inst->Src[src_idx].Register.Index;
5154
5155 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
5156
5157 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
5158 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
5159 si_lower_gather4_integer(ctx, &args, target);
5160 }
5161
5162 emit_data->output[emit_data->chan] =
5163 ac_build_image_opcode(&ctx->ac, &args);
5164 }
5165
5166 static void si_llvm_emit_txqs(
5167 const struct lp_build_tgsi_action *action,
5168 struct lp_build_tgsi_context *bld_base,
5169 struct lp_build_emit_data *emit_data)
5170 {
5171 struct si_shader_context *ctx = si_shader_context(bld_base);
5172 struct gallivm_state *gallivm = &ctx->gallivm;
5173 LLVMBuilderRef builder = gallivm->builder;
5174 LLVMValueRef res, samples;
5175 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5176
5177 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5178
5179
5180 /* Read the samples from the descriptor directly. */
5181 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5182 samples = LLVMBuildExtractElement(
5183 builder, res,
5184 LLVMConstInt(ctx->i32, 3, 0), "");
5185 samples = LLVMBuildLShr(builder, samples,
5186 LLVMConstInt(ctx->i32, 16, 0), "");
5187 samples = LLVMBuildAnd(builder, samples,
5188 LLVMConstInt(ctx->i32, 0xf, 0), "");
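/* For MSAA resources, bits [19:16] of dword 3 (LAST_LEVEL) hold
 * log2(sample count); convert that to a sample count.
 */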
5189 samples = LLVMBuildShl(builder, ctx->i32_1,
5190 samples, "");
5191
5192 emit_data->output[emit_data->chan] = samples;
5193 }
5194
5195 static void si_llvm_emit_ddxy(
5196 const struct lp_build_tgsi_action *action,
5197 struct lp_build_tgsi_context *bld_base,
5198 struct lp_build_emit_data *emit_data)
5199 {
5200 struct si_shader_context *ctx = si_shader_context(bld_base);
5201 struct gallivm_state *gallivm = &ctx->gallivm;
5202 unsigned opcode = emit_data->info->opcode;
5203 LLVMValueRef val;
5204 int idx;
5205 unsigned mask;
5206
5207 if (opcode == TGSI_OPCODE_DDX_FINE)
5208 mask = AC_TID_MASK_LEFT;
5209 else if (opcode == TGSI_OPCODE_DDY_FINE)
5210 mask = AC_TID_MASK_TOP;
5211 else
5212 mask = AC_TID_MASK_TOP_LEFT;
5213
5214 /* For DDX we want the next X pixel, for DDY the next Y pixel. */
5215 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5216
5217 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5218 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5219 mask, idx, ctx->lds, val);
5220 emit_data->output[emit_data->chan] = val;
5221 }
5222
5223 /*
5224 * This takes an (I, J) coordinate pair
5225 * and works out the X and Y derivatives.
5226 * It returns DDX(I), DDX(J), DDY(I), DDY(J).
5227 */
5228 static LLVMValueRef si_llvm_emit_ddxy_interp(
5229 struct lp_build_tgsi_context *bld_base,
5230 LLVMValueRef interp_ij)
5231 {
5232 struct si_shader_context *ctx = si_shader_context(bld_base);
5233 struct gallivm_state *gallivm = &ctx->gallivm;
5234 LLVMValueRef result[4], a;
5235 unsigned i;
5236
5237 for (i = 0; i < 2; i++) {
5238 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5239 LLVMConstInt(ctx->i32, i, 0), "");
5240 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5241 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5242 }
5243
5244 return lp_build_gather_values(gallivm, result, 4);
5245 }
5246
5247 static void interp_fetch_args(
5248 struct lp_build_tgsi_context *bld_base,
5249 struct lp_build_emit_data *emit_data)
5250 {
5251 struct si_shader_context *ctx = si_shader_context(bld_base);
5252 struct gallivm_state *gallivm = &ctx->gallivm;
5253 const struct tgsi_full_instruction *inst = emit_data->inst;
5254
5255 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5256 /* offset is in second src, first two channels */
5257 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5258 emit_data->inst, 1,
5259 TGSI_CHAN_X);
5260 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5261 emit_data->inst, 1,
5262 TGSI_CHAN_Y);
5263 emit_data->arg_count = 2;
5264 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5265 LLVMValueRef sample_position;
5266 LLVMValueRef sample_id;
5267 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5268
5269 /* fetch sample ID, then fetch its sample position,
5270 * and place it into the first two channels.
5271 */
5272 sample_id = lp_build_emit_fetch(bld_base,
5273 emit_data->inst, 1, TGSI_CHAN_X);
5274 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5275 ctx->i32, "");
5276 sample_position = load_sample_position(ctx, sample_id);
5277
5278 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5279 sample_position,
5280 ctx->i32_0, "");
5281
5282 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5283 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5284 sample_position,
5285 ctx->i32_1, "");
5286 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5287 emit_data->arg_count = 2;
5288 }
5289 }
5290
5291 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5292 struct lp_build_tgsi_context *bld_base,
5293 struct lp_build_emit_data *emit_data)
5294 {
5295 struct si_shader_context *ctx = si_shader_context(bld_base);
5296 struct si_shader *shader = ctx->shader;
5297 struct gallivm_state *gallivm = &ctx->gallivm;
5298 LLVMValueRef interp_param;
5299 const struct tgsi_full_instruction *inst = emit_data->inst;
5300 int input_index = inst->Src[0].Register.Index;
5301 int chan;
5302 int i;
5303 LLVMValueRef attr_number;
5304 LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5305 int interp_param_idx;
5306 unsigned interp = shader->selector->info.input_interpolate[input_index];
5307 unsigned location;
5308
5309 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5310
5311 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5312 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5313 location = TGSI_INTERPOLATE_LOC_CENTER;
5314 else
5315 location = TGSI_INTERPOLATE_LOC_CENTROID;
5316
5317 interp_param_idx = lookup_interp_param_index(interp, location);
5318 if (interp_param_idx == -1)
5319 return;
5320 else if (interp_param_idx)
5321 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5322 else
5323 interp_param = NULL;
5324
5325 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5326
5327 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5328 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5329 LLVMValueRef ij_out[2];
5330 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5331
5332 /*
5333 * Take the I and J parameters and their DDX/DDY derivatives, and
5334 * compute the IJ inputs for the interpolator:
5335 * temp1 = ddx * offset/sample.x + I;
5336 * interp_param.I = ddy * offset/sample.y + temp1;
5337 * temp1 = ddx * offset/sample.x + J;
5338 * interp_param.J = ddy * offset/sample.y + temp1;
5339 */
5340 for (i = 0; i < 2; i++) {
5341 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5342 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5343 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5344 ddxy_out, ix_ll, "");
5345 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5346 ddxy_out, iy_ll, "");
5347 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5348 interp_param, ix_ll, "");
5349 LLVMValueRef temp1, temp2;
5350
5351 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5352 ctx->f32, "");
5353
5354 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5355
5356 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5357
5358 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5359
5360 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5361 }
5362 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5363 }
5364
5365 for (chan = 0; chan < 4; chan++) {
5366 LLVMValueRef llvm_chan;
5367 unsigned schan;
5368
5369 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5370 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5371
5372 if (interp_param) {
5373 interp_param = LLVMBuildBitCast(gallivm->builder,
5374 interp_param, LLVMVectorType(ctx->f32, 2), "");
5375 LLVMValueRef i = LLVMBuildExtractElement(
5376 gallivm->builder, interp_param, ctx->i32_0, "");
5377 LLVMValueRef j = LLVMBuildExtractElement(
5378 gallivm->builder, interp_param, ctx->i32_1, "");
5379 emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5380 llvm_chan, attr_number, params,
5381 i, j);
5382 } else {
5383 emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5384 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5385 llvm_chan, attr_number, params);
5386 }
5387 }
5388 }
5389
5390 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5391 LLVMValueRef value)
5392 {
5393 struct gallivm_state *gallivm = &ctx->gallivm;
5394 LLVMValueRef args[3] = {
5395 value,
5396 ctx->i32_0,
5397 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5398 };
5399
5400 /* We currently have no other way to prevent LLVM from lifting the icmp
5401 * calls to a dominating basic block.
5402 */
5403 emit_optimization_barrier(ctx, &args[0]);
5404
5405 if (LLVMTypeOf(args[0]) != ctx->i32)
5406 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5407
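/* llvm.amdgcn.icmp returns a 64-bit lane mask: bit N is set iff
 * the comparison (here: value != 0) holds in active lane N.
 */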
5408 return lp_build_intrinsic(gallivm->builder,
5409 "llvm.amdgcn.icmp.i32",
5410 ctx->i64, args, 3,
5411 LP_FUNC_ATTR_NOUNWIND |
5412 LP_FUNC_ATTR_READNONE |
5413 LP_FUNC_ATTR_CONVERGENT);
5414 }
5415
5416 static void vote_all_emit(
5417 const struct lp_build_tgsi_action *action,
5418 struct lp_build_tgsi_context *bld_base,
5419 struct lp_build_emit_data *emit_data)
5420 {
5421 struct si_shader_context *ctx = si_shader_context(bld_base);
5422 struct gallivm_state *gallivm = &ctx->gallivm;
5423 LLVMValueRef active_set, vote_set;
5424 LLVMValueRef tmp;
5425
5426 active_set = si_emit_ballot(ctx, ctx->i32_1);
5427 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5428
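/* All active lanes voted true iff the vote mask equals the mask of
 * all active lanes.
 */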
5429 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5430 emit_data->output[emit_data->chan] =
5431 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5432 }
5433
5434 static void vote_any_emit(
5435 const struct lp_build_tgsi_action *action,
5436 struct lp_build_tgsi_context *bld_base,
5437 struct lp_build_emit_data *emit_data)
5438 {
5439 struct si_shader_context *ctx = si_shader_context(bld_base);
5440 struct gallivm_state *gallivm = &ctx->gallivm;
5441 LLVMValueRef vote_set;
5442 LLVMValueRef tmp;
5443
5444 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5445
5446 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5447 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5448 emit_data->output[emit_data->chan] =
5449 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5450 }
5451
5452 static void vote_eq_emit(
5453 const struct lp_build_tgsi_action *action,
5454 struct lp_build_tgsi_context *bld_base,
5455 struct lp_build_emit_data *emit_data)
5456 {
5457 struct si_shader_context *ctx = si_shader_context(bld_base);
5458 struct gallivm_state *gallivm = &ctx->gallivm;
5459 LLVMValueRef active_set, vote_set;
5460 LLVMValueRef all, none, tmp;
5461
5462 active_set = si_emit_ballot(ctx, ctx->i32_1);
5463 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5464
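/* The vote is unanimous when either every active lane voted true
 * (vote_set == active_set) or no lane did (vote_set == 0).
 */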
5465 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5466 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5467 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5468 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5469 emit_data->output[emit_data->chan] =
5470 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5471 }
5472
5473 static void ballot_emit(
5474 const struct lp_build_tgsi_action *action,
5475 struct lp_build_tgsi_context *bld_base,
5476 struct lp_build_emit_data *emit_data)
5477 {
5478 struct si_shader_context *ctx = si_shader_context(bld_base);
5479 LLVMBuilderRef builder = ctx->gallivm.builder;
5480 LLVMValueRef tmp;
5481
5482 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5483 tmp = si_emit_ballot(ctx, tmp);
5484 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5485
5486 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5487 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5488 }
5489
5490 static void read_invoc_fetch_args(
5491 struct lp_build_tgsi_context *bld_base,
5492 struct lp_build_emit_data *emit_data)
5493 {
5494 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5495 0, emit_data->src_chan);
5496
5497 /* Always read the source invocation (= lane) from the X channel. */
5498 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5499 1, TGSI_CHAN_X);
5500 emit_data->arg_count = 2;
5501 }
5502
5503 static void read_lane_emit(
5504 const struct lp_build_tgsi_action *action,
5505 struct lp_build_tgsi_context *bld_base,
5506 struct lp_build_emit_data *emit_data)
5507 {
5508 struct si_shader_context *ctx = si_shader_context(bld_base);
5509 LLVMBuilderRef builder = ctx->gallivm.builder;
5510
5511 /* We currently have no other way to prevent LLVM from lifting the
5512 * readlane calls to a dominating basic block.
5513 */
5514 emit_optimization_barrier(ctx, &emit_data->args[0]);
5515
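/* The readlane intrinsics operate on i32, so bitcast all operands
 * (the value and, for readlane, the lane index) to i32.
 */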
5516 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5517 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5518 ctx->i32, "");
5519 }
5520
5521 emit_data->output[emit_data->chan] =
5522 ac_build_intrinsic(&ctx->ac, action->intr_name,
5523 ctx->i32, emit_data->args, emit_data->arg_count,
5524 AC_FUNC_ATTR_READNONE |
5525 AC_FUNC_ATTR_CONVERGENT);
5526 }
5527
5528 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5529 struct lp_build_emit_data *emit_data)
5530 {
5531 struct si_shader_context *ctx = si_shader_context(bld_base);
5532 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5533 LLVMValueRef imm;
5534 unsigned stream;
5535
5536 assert(src0.File == TGSI_FILE_IMMEDIATE);
5537
5538 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5539 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5540 return stream;
5541 }
5542
5543 /* Emit one vertex from the geometry shader */
5544 static void si_llvm_emit_vertex(
5545 const struct lp_build_tgsi_action *action,
5546 struct lp_build_tgsi_context *bld_base,
5547 struct lp_build_emit_data *emit_data)
5548 {
5549 struct si_shader_context *ctx = si_shader_context(bld_base);
5550 struct lp_build_context *uint = &bld_base->uint_bld;
5551 struct si_shader *shader = ctx->shader;
5552 struct tgsi_shader_info *info = &shader->selector->info;
5553 struct gallivm_state *gallivm = &ctx->gallivm;
5554 struct lp_build_if_state if_state;
5555 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5556 ctx->param_gs2vs_offset);
5557 LLVMValueRef gs_next_vertex;
5558 LLVMValueRef can_emit, kill;
5559 unsigned chan, offset;
5560 int i;
5561 unsigned stream;
5562
5563 stream = si_llvm_get_stream(bld_base, emit_data);
5564
5565 /* Write vertex attribute values to GSVS ring */
5566 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5567 ctx->gs_next_vertex[stream],
5568 "");
5569
5570 /* If this thread has already emitted the declared maximum number of
5571 * vertices, skip the write: excessive vertex emissions are not
5572 * supposed to have any effect.
5573 *
5574 * If the shader has no writes to memory, kill it instead. This skips
5575 * further memory loads and may allow LLVM to skip to the end
5576 * altogether.
5577 */
5578 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5579 LLVMConstInt(ctx->i32,
5580 shader->selector->gs_max_out_vertices, 0), "");
5581
5582 bool use_kill = !info->writes_memory;
5583 if (use_kill) {
5584 kill = lp_build_select(&bld_base->base, can_emit,
5585 LLVMConstReal(ctx->f32, 1.0f),
5586 LLVMConstReal(ctx->f32, -1.0f));
5587
5588 ac_build_kill(&ctx->ac, kill);
5589 } else {
5590 lp_build_if(&if_state, gallivm, can_emit);
5591 }
5592
5593 offset = 0;
5594 for (i = 0; i < info->num_outputs; i++) {
5595 LLVMValueRef *out_ptr = ctx->outputs[i];
5596
5597 for (chan = 0; chan < 4; chan++) {
5598 if (!(info->output_usagemask[i] & (1 << chan)) ||
5599 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5600 continue;
5601
5602 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
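/* Each enabled output component owns a slice of gs_max_out_vertices
 * dwords in the GSVS ring, so the dword index is
 * component_slot * gs_max_out_vertices + vertex_index; it is scaled
 * by 4 below to get the byte offset.
 */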
5603 LLVMValueRef voffset =
5604 LLVMConstInt(ctx->i32, offset *
5605 shader->selector->gs_max_out_vertices, 0);
5606 offset++;
5607
5608 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5609 voffset = lp_build_mul_imm(uint, voffset, 4);
5610
5611 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5612
5613 ac_build_buffer_store_dword(&ctx->ac,
5614 ctx->gsvs_ring[stream],
5615 out_val, 1,
5616 voffset, soffset, 0,
5617 1, 1, true, true);
5618 }
5619 }
5620
5621 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5622 ctx->i32_1);
5623
5624 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5625
5626 /* Signal vertex emission */
5627 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5628 si_get_gs_wave_id(ctx));
5629 if (!use_kill)
5630 lp_build_endif(&if_state);
5631 }
5632
5633 /* Cut one primitive from the geometry shader */
5634 static void si_llvm_emit_primitive(
5635 const struct lp_build_tgsi_action *action,
5636 struct lp_build_tgsi_context *bld_base,
5637 struct lp_build_emit_data *emit_data)
5638 {
5639 struct si_shader_context *ctx = si_shader_context(bld_base);
5640 unsigned stream;
5641
5642 /* Signal primitive cut */
5643 stream = si_llvm_get_stream(bld_base, emit_data);
5644 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5645 si_get_gs_wave_id(ctx));
5646 }
5647
5648 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5649 struct lp_build_tgsi_context *bld_base,
5650 struct lp_build_emit_data *emit_data)
5651 {
5652 struct si_shader_context *ctx = si_shader_context(bld_base);
5653 struct gallivm_state *gallivm = &ctx->gallivm;
5654
5655 /* SI only (thanks to a hw bug workaround):
5656 * The real barrier instruction isn't needed, because an entire patch
5657 * always fits into a single wave.
5658 */
5659 if (HAVE_LLVM >= 0x0309 &&
5660 ctx->screen->b.chip_class == SI &&
5661 ctx->type == PIPE_SHADER_TESS_CTRL) {
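/* The s_waitcnt counter fields mean "wait until at most N ops are
 * outstanding", so a wait is requested by zeroing a field; ANDing
 * the masks thus waits on the LGKM and VM counters at once.
 */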
5662 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5663 return;
5664 }
5665
5666 lp_build_intrinsic(gallivm->builder,
5667 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5668 : "llvm.AMDGPU.barrier.local",
5669 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5670 }
5671
5672 static const struct lp_build_tgsi_action tex_action = {
5673 .fetch_args = tex_fetch_args,
5674 .emit = build_tex_intrinsic,
5675 };
5676
5677 static const struct lp_build_tgsi_action interp_action = {
5678 .fetch_args = interp_fetch_args,
5679 .emit = build_interp_intrinsic,
5680 };
5681
5682 static void si_create_function(struct si_shader_context *ctx,
5683 const char *name,
5684 LLVMTypeRef *returns, unsigned num_returns,
5685 LLVMTypeRef *params, unsigned num_params,
5686 int last_sgpr)
5687 {
5688 int i;
5689
5690 si_llvm_create_func(ctx, name, returns, num_returns,
5691 params, num_params);
5692 si_llvm_shader_type(ctx->main_fn, ctx->type);
5693 ctx->return_value = LLVMGetUndef(ctx->return_type);
5694
5695 for (i = 0; i <= last_sgpr; ++i) {
5696 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5697
5698 /* The combination of:
5699 * - ByVal
5700 * - dereferenceable
5701 * - invariant.load
5702 * allows the optimization passes to move loads and reduces
5703 * SGPR spilling significantly.
5704 */
5705 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5706 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5707 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5708 ac_add_attr_dereferenceable(P, UINT64_MAX);
5709 } else
5710 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5711 }
5712
5713 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5714 "no-signed-zeros-fp-math",
5715 "true");
5716
5717 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5718 /* These were copied from some LLVM test. */
5719 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5720 "less-precise-fpmad",
5721 "true");
5722 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5723 "no-infs-fp-math",
5724 "true");
5725 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5726 "no-nans-fp-math",
5727 "true");
5728 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5729 "unsafe-fp-math",
5730 "true");
5731 }
5732 }
5733
5734 static void declare_streamout_params(struct si_shader_context *ctx,
5735 struct pipe_stream_output_info *so,
5736 LLVMTypeRef *params, LLVMTypeRef i32,
5737 unsigned *num_params)
5738 {
5739 int i;
5740
5741 /* Streamout SGPRs. */
5742 if (so->num_outputs) {
5743 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5744 params[ctx->param_streamout_config = (*num_params)++] = i32;
5745 else
5746 ctx->param_streamout_config = *num_params - 1;
5747
5748 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5749 }
5750 /* A streamout buffer offset is loaded if the stride is non-zero. */
5751 for (i = 0; i < 4; i++) {
5752 if (!so->stride[i])
5753 continue;
5754
5755 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5756 }
5757 }
5758
5759 static unsigned llvm_get_type_size(LLVMTypeRef type)
5760 {
5761 LLVMTypeKind kind = LLVMGetTypeKind(type);
5762
5763 switch (kind) {
5764 case LLVMIntegerTypeKind:
5765 return LLVMGetIntTypeWidth(type) / 8;
5766 case LLVMFloatTypeKind:
5767 return 4;
5768 case LLVMPointerTypeKind:
5769 return 8;
5770 case LLVMVectorTypeKind:
5771 return LLVMGetVectorSize(type) *
5772 llvm_get_type_size(LLVMGetElementType(type));
5773 case LLVMArrayTypeKind:
5774 return LLVMGetArrayLength(type) *
5775 llvm_get_type_size(LLVMGetElementType(type));
5776 default:
5777 assert(0);
5778 return 0;
5779 }
5780 }
5781
5782 static void declare_lds_as_pointer(struct si_shader_context *ctx)
5783 {
5784 struct gallivm_state *gallivm = &ctx->gallivm;
5785
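/* LDS is addressed from offset 0, so materialize a pointer into the
 * local address space spanning the whole per-workgroup allocation:
 * 64KB on CIK and later, 32KB on SI.
 */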
5786 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5787 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5788 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5789 "lds");
5790 }
5791
5792 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5793 {
5794 const unsigned *properties = shader->selector->info.properties;
5795 unsigned max_work_group_size =
5796 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5797 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5798 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5799
5800 if (!max_work_group_size) {
5801 /* This is a variable group size compute shader;
5802 * compile it for the maximum possible group size.
5803 */
5804 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5805 }
5806 return max_work_group_size;
5807 }
5808
5809 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5810 LLVMTypeRef *params,
5811 unsigned *num_params,
5812 bool assign_params)
5813 {
5814 params[(*num_params)++] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5815 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5816 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5817 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5818
5819 if (assign_params) {
5820 ctx->param_const_buffers = *num_params - 4;
5821 ctx->param_samplers = *num_params - 3;
5822 ctx->param_images = *num_params - 2;
5823 ctx->param_shader_buffers = *num_params - 1;
5824 }
5825 }
5826
5827 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5828 LLVMTypeRef *params,
5829 unsigned *num_params)
5830 {
5831 params[ctx->param_rw_buffers = (*num_params)++] =
5832 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5833 declare_per_stage_desc_pointers(ctx, params, num_params, true);
5834 }
5835
5836 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
5837 LLVMTypeRef *params,
5838 unsigned *num_params)
5839 {
5840 params[ctx->param_vertex_buffers = (*num_params)++] =
5841 const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5842 params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
5843 params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
5844 params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
5845 params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
5846 }
5847
5848 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
5849 LLVMTypeRef *params, unsigned *num_params,
5850 unsigned *num_prolog_vgprs)
5851 {
5852 struct si_shader *shader = ctx->shader;
5853
5854 params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
5855 if (shader->key.as_ls) {
5856 params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
5857 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5858 } else {
5859 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5860 params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
5861 }
5862 params[(*num_params)++] = ctx->i32; /* unused */
5863
5864 if (!shader->is_gs_copy_shader) {
5865 /* Vertex load indices. */
5866 ctx->param_vertex_index0 = (*num_params);
5867 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
5868 params[(*num_params)++] = ctx->i32;
5869 *num_prolog_vgprs += shader->selector->info.num_inputs;
5870 }
5871 }
5872
5873 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
5874 LLVMTypeRef *params, unsigned *num_params)
5875 {
5876 params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
5877 params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
5878 params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
5879 params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
5880 }
5881
5882 enum {
5883 /* Convenient merged shader definitions. */
5884 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
5885 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
5886 };
5887
5888 static void create_function(struct si_shader_context *ctx)
5889 {
5890 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5891 struct gallivm_state *gallivm = &ctx->gallivm;
5892 struct si_shader *shader = ctx->shader;
5893 LLVMTypeRef params[100]; /* just make it large enough */
5894 LLVMTypeRef returns[16+32*4];
5895 unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5896 unsigned num_returns = 0;
5897 unsigned num_prolog_vgprs = 0;
5898 unsigned type = ctx->type;
5899
5900 /* Set MERGED shaders. */
5901 if (ctx->screen->b.chip_class >= GFX9) {
5902 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
5903 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
5904 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
5905 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
5906 }
5907
5908 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
5909
5910 switch (type) {
5911 case PIPE_SHADER_VERTEX:
5912 declare_default_desc_pointers(ctx, params, &num_params);
5913 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5914
5915 if (shader->key.as_es) {
5916 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5917 } else if (shader->key.as_ls) {
5918 /* no extra parameters */
5919 } else {
5920 if (shader->is_gs_copy_shader)
5921 num_params = ctx->param_rw_buffers + 1;
5922
5923 /* The locations of the other parameters are assigned dynamically. */
5924 declare_streamout_params(ctx, &shader->selector->so,
5925 params, ctx->i32, &num_params);
5926 }
5927
5928 last_sgpr = num_params-1;
5929
5930 /* VGPRs */
5931 declare_vs_input_vgprs(ctx, params, &num_params,
5932 &num_prolog_vgprs);
5933 break;
5934
5935 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
5936 declare_default_desc_pointers(ctx, params, &num_params);
5937 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5938 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5939 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5940 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5941 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
5942 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
5943 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5944 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5945 last_sgpr = num_params - 1;
5946
5947 /* VGPRs */
5948 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5949 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5950
5951 /* param_tcs_offchip_offset and param_tcs_factor_offset are
5952 * placed after the user SGPRs.
5953 */
5954 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
5955 returns[num_returns++] = ctx->i32; /* SGPRs */
5956 for (i = 0; i < 3; i++)
5957 returns[num_returns++] = ctx->f32; /* VGPRs */
5958 break;
5959
5960 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
5961 /* Merged stages have 8 system SGPRs at the beginning. */
5962 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
5963 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5964 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5965 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
5966 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5967 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
5968 params[num_params++] = ctx->i32; /* unused */
5969 params[num_params++] = ctx->i32; /* unused */
5970
5971 params[num_params++] = ctx->i32; /* unused */
5972 params[num_params++] = ctx->i32; /* unused */
5973 declare_per_stage_desc_pointers(ctx, params, &num_params,
5974 ctx->type == PIPE_SHADER_VERTEX);
5975 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5976
5977 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5978 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5979 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5980 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
5981 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
5982 params[num_params++] = ctx->i32; /* unused */
5983
5984 declare_per_stage_desc_pointers(ctx, params, &num_params,
5985 ctx->type == PIPE_SHADER_TESS_CTRL);
5986 last_sgpr = num_params - 1;
5987
5988 /* VGPRs (first TCS, then VS) */
5989 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5990 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5991
5992 if (ctx->type == PIPE_SHADER_VERTEX) {
5993 declare_vs_input_vgprs(ctx, params, &num_params,
5994 &num_prolog_vgprs);
5995
5996 /* LS return values are inputs to the TCS main shader part. */
5997 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
5998 returns[num_returns++] = ctx->i32; /* SGPRs */
5999 for (i = 0; i < 2; i++)
6000 returns[num_returns++] = ctx->f32; /* VGPRs */
6001 } else {
6002 /* TCS return values are inputs to the TCS epilog.
6003 *
6004 * param_tcs_offchip_offset, param_tcs_factor_offset,
6005 * param_tcs_offchip_layout, and param_rw_buffers
6006 * should be passed to the epilog.
6007 */
6008 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
6009 returns[num_returns++] = ctx->i32; /* SGPRs */
6010 for (i = 0; i < 3; i++)
6011 returns[num_returns++] = ctx->f32; /* VGPRs */
6012 }
6013 break;
6014
6015 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
6016 /* Merged stages have 8 system SGPRs at the beginning. */
6017 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
6018 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
6019 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6020 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
6021 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6022 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
6023 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
6024 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
6025
6026 params[num_params++] = ctx->i32; /* unused */
6027 params[num_params++] = ctx->i32; /* unused */
6028 declare_per_stage_desc_pointers(ctx, params, &num_params,
6029 (ctx->type == PIPE_SHADER_VERTEX ||
6030 ctx->type == PIPE_SHADER_TESS_EVAL));
6031 if (ctx->type == PIPE_SHADER_VERTEX) {
6032 declare_vs_specific_input_sgprs(ctx, params, &num_params);
6033 } else {
6034 /* TESS_EVAL (and also GEOMETRY):
6035 * Declare as many input SGPRs as the VS has. */
6036 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6037 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6038 params[num_params++] = ctx->i32; /* unused */
6039 params[num_params++] = ctx->i32; /* unused */
6040 params[num_params++] = ctx->i32; /* unused */
6041 params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
6042 }
6043
6044 declare_per_stage_desc_pointers(ctx, params, &num_params,
6045 ctx->type == PIPE_SHADER_GEOMETRY);
6046 last_sgpr = num_params - 1;
6047
6048 /* VGPRs (first GS, then VS/TES) */
6049 params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
6050 params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
6051 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6052 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6053 params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
6054
6055 if (ctx->type == PIPE_SHADER_VERTEX) {
6056 declare_vs_input_vgprs(ctx, params, &num_params,
6057 &num_prolog_vgprs);
6058 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
6059 declare_tes_input_vgprs(ctx, params, &num_params);
6060 }
6061
6062 if (ctx->type == PIPE_SHADER_VERTEX ||
6063 ctx->type == PIPE_SHADER_TESS_EVAL) {
6064 /* ES return values are inputs to GS. */
6065 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
6066 returns[num_returns++] = ctx->i32; /* SGPRs */
6067 for (i = 0; i < 5; i++)
6068 returns[num_returns++] = ctx->f32; /* VGPRs */
6069 }
6070 break;
6071
6072 case PIPE_SHADER_TESS_EVAL:
6073 declare_default_desc_pointers(ctx, params, &num_params);
6074 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6075 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6076
6077 if (shader->key.as_es) {
6078 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6079 params[num_params++] = ctx->i32;
6080 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
6081 } else {
6082 params[num_params++] = ctx->i32;
6083 declare_streamout_params(ctx, &shader->selector->so,
6084 params, ctx->i32, &num_params);
6085 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6086 }
6087 last_sgpr = num_params - 1;
6088
6089 /* VGPRs */
6090 declare_tes_input_vgprs(ctx, params, &num_params);
6091 break;
6092
6093 case PIPE_SHADER_GEOMETRY:
6094 declare_default_desc_pointers(ctx, params, &num_params);
6095 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6096 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
6097 last_sgpr = num_params - 1;
6098
6099 /* VGPRs */
6100 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
6101 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
6102 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6103 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
6104 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
6105 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
6106 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
6107 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6108 break;
6109
6110 case PIPE_SHADER_FRAGMENT:
6111 declare_default_desc_pointers(ctx, params, &num_params);
6112 params[SI_PARAM_ALPHA_REF] = ctx->f32;
6113 params[SI_PARAM_PRIM_MASK] = ctx->i32;
6114 last_sgpr = SI_PARAM_PRIM_MASK;
6115 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
6116 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
6117 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
6118 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
6119 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
6120 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
6121 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
6122 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
6123 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
6124 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
6125 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
6126 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
6127 params[SI_PARAM_FRONT_FACE] = ctx->i32;
6128 shader->info.face_vgpr_index = 20;
6129 params[SI_PARAM_ANCILLARY] = ctx->i32;
6130 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
6131 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
6132 num_params = SI_PARAM_POS_FIXED_PT+1;
6133
6134 /* Color inputs from the prolog. */
6135 if (shader->selector->info.colors_read) {
6136 unsigned num_color_elements =
6137 util_bitcount(shader->selector->info.colors_read);
6138
6139 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
6140 for (i = 0; i < num_color_elements; i++)
6141 params[num_params++] = ctx->f32;
6142
6143 num_prolog_vgprs += num_color_elements;
6144 }
6145
6146 /* Outputs for the epilog. */
6147 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
6148 num_returns =
6149 num_return_sgprs +
6150 util_bitcount(shader->selector->info.colors_written) * 4 +
6151 shader->selector->info.writes_z +
6152 shader->selector->info.writes_stencil +
6153 shader->selector->info.writes_samplemask +
6154 1 /* SampleMaskIn */;
6155
6156 num_returns = MAX2(num_returns,
6157 num_return_sgprs +
6158 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6159
6160 for (i = 0; i < num_return_sgprs; i++)
6161 returns[i] = ctx->i32;
6162 for (; i < num_returns; i++)
6163 returns[i] = ctx->f32;
6164 break;
6165
6166 case PIPE_SHADER_COMPUTE:
6167 declare_default_desc_pointers(ctx, params, &num_params);
6168 params[SI_PARAM_GRID_SIZE] = v3i32;
6169 params[SI_PARAM_BLOCK_SIZE] = v3i32;
6170 params[SI_PARAM_BLOCK_ID] = v3i32;
6171 last_sgpr = SI_PARAM_BLOCK_ID;
6172
6173 params[SI_PARAM_THREAD_ID] = v3i32;
6174 num_params = SI_PARAM_THREAD_ID + 1;
6175 break;
6176 default:
6177 assert(0 && "unimplemented shader");
6178 return;
6179 }
6180
6181 assert(num_params <= ARRAY_SIZE(params));
6182
6183 si_create_function(ctx, "main", returns, num_returns, params,
6184 num_params, last_sgpr);
6185
6186 /* Reserve register locations for VGPR inputs the PS prolog may need. */
6187 if (ctx->type == PIPE_SHADER_FRAGMENT &&
6188 ctx->separate_prolog) {
6189 si_llvm_add_attribute(ctx->main_fn,
6190 "InitialPSInputAddr",
6191 S_0286D0_PERSP_SAMPLE_ENA(1) |
6192 S_0286D0_PERSP_CENTER_ENA(1) |
6193 S_0286D0_PERSP_CENTROID_ENA(1) |
6194 S_0286D0_LINEAR_SAMPLE_ENA(1) |
6195 S_0286D0_LINEAR_CENTER_ENA(1) |
6196 S_0286D0_LINEAR_CENTROID_ENA(1) |
6197 S_0286D0_FRONT_FACE_ENA(1) |
6198 S_0286D0_POS_FIXED_PT_ENA(1));
6199 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
6200 si_llvm_add_attribute(ctx->main_fn,
6201 "amdgpu-max-work-group-size",
6202 si_get_max_workgroup_size(shader));
6203 }
6204
6205 shader->info.num_input_sgprs = 0;
6206 shader->info.num_input_vgprs = 0;
6207
6208 for (i = 0; i <= last_sgpr; ++i)
6209 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
6210
6211 for (; i < num_params; ++i)
6212 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
6213
6214 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
6215 shader->info.num_input_vgprs -= num_prolog_vgprs;
6216
6217 if (!ctx->screen->has_ds_bpermute &&
6218 bld_base->info &&
6219 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
6220 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
6221 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
6222 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
6223 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
6224 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
6225 ctx->lds =
6226 LLVMAddGlobalInAddressSpace(gallivm->module,
6227 LLVMArrayType(ctx->i32, 64),
6228 "ddxy_lds",
6229 LOCAL_ADDR_SPACE);
6230
6231 if (shader->key.as_ls ||
6232 ctx->type == PIPE_SHADER_TESS_CTRL ||
6233 /* GFX9 has the ESGS ring buffer in LDS. */
6234 (ctx->screen->b.chip_class >= GFX9 &&
6235 (shader->key.as_es ||
6236 ctx->type == PIPE_SHADER_GEOMETRY)))
6237 declare_lds_as_pointer(ctx);
6238 }
6239
6240 /**
6241 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
6242 * for later use.
6243 */
6244 static void preload_ring_buffers(struct si_shader_context *ctx)
6245 {
6246 struct gallivm_state *gallivm = &ctx->gallivm;
6247 LLVMBuilderRef builder = gallivm->builder;
6248
6249 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
6250 ctx->param_rw_buffers);
6251
6252 if (ctx->screen->b.chip_class <= VI &&
6253 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
6254 unsigned ring =
6255 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
6256 : SI_ES_RING_ESGS;
6257 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
6258
6259 ctx->esgs_ring =
6260 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6261 }
6262
6263 if (ctx->shader->is_gs_copy_shader) {
6264 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6265
6266 ctx->gsvs_ring[0] =
6267 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6268 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
6269 const struct si_shader_selector *sel = ctx->shader->selector;
6270 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6271 LLVMValueRef base_ring;
6272
6273 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6274
6275 /* The conceptual layout of the GSVS ring is
6276 * v0c0 .. vLc0 v0c1 .. vLc1 ..
6277 * but the real memory layout is swizzled across
6278 * threads:
6279 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
6280 * t16v0c0 ..
6281 * Override the buffer descriptor accordingly.
6282 */
6283 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
6284 uint64_t stream_offset = 0;
6285
6286 for (unsigned stream = 0; stream < 4; ++stream) {
6287 unsigned num_components;
6288 unsigned stride;
6289 unsigned num_records;
6290 LLVMValueRef ring, tmp;
6291
6292 num_components = sel->info.num_stream_output_components[stream];
6293 if (!num_components)
6294 continue;
6295
6296 stride = 4 * num_components * sel->gs_max_out_vertices;
6297
6298 /* Limit on the stride field for <= CIK. */
6299 assert(stride < (1 << 14));
6300
6301 num_records = 64;
6302
6303 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
6304 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
6305 tmp = LLVMBuildAdd(builder, tmp,
6306 LLVMConstInt(ctx->i64,
6307 stream_offset, 0), "");
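/* Each stream's slice of the ring holds stride bytes per lane for
 * all 64 lanes of a wave.
 */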
6308 stream_offset += stride * 64;
6309
6310 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
6311 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
6312 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
6313 tmp = LLVMBuildOr(builder, tmp,
6314 LLVMConstInt(ctx->i32,
6315 S_008F04_STRIDE(stride) |
6316 S_008F04_SWIZZLE_ENABLE(1), 0), "");
6317 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
6318 ring = LLVMBuildInsertElement(builder, ring,
6319 LLVMConstInt(ctx->i32, num_records, 0),
6320 LLVMConstInt(ctx->i32, 2, 0), "");
6321 ring = LLVMBuildInsertElement(builder, ring,
6322 LLVMConstInt(ctx->i32,
6323 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
6324 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6325 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
6326 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
6327 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6328 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
6329 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
6330 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
6331 S_008F0C_ADD_TID_ENABLE(1),
6332 0),
6333 LLVMConstInt(ctx->i32, 3, 0), "");
6334 ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
6335
6336 ctx->gsvs_ring[stream] = ring;
6337 }
6338 }
6339 }
6340
6341 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6342 LLVMValueRef param_rw_buffers,
6343 unsigned param_pos_fixed_pt)
6344 {
6345 struct gallivm_state *gallivm = &ctx->gallivm;
6346 LLVMBuilderRef builder = gallivm->builder;
6347 LLVMValueRef slot, desc, offset, row, bit, address[2];
6348
6349 /* Use the fixed-point gl_FragCoord input.
6350 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6351 * per coordinate to get the repeating effect.
6352 */
6353 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6354 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6355
6356 /* Load the buffer descriptor. */
6357 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6358 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6359
6360 /* The stipple pattern is 32x32; each row has 32 bits. */
6361 offset = LLVMBuildMul(builder, address[1],
6362 LLVMConstInt(ctx->i32, 4, 0), "");
6363 row = buffer_load_const(ctx, desc, offset);
6364 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6365 bit = LLVMBuildLShr(builder, row, address[0], "");
6366 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6367
6368 /* The intrinsic kills the thread if arg < 0. */
6369 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6370 LLVMConstReal(ctx->f32, -1), "");
6371 ac_build_kill(&ctx->ac, bit);
6372 }
6373
6374 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6375 struct si_shader_config *conf,
6376 unsigned symbol_offset)
6377 {
6378 unsigned i;
6379 const unsigned char *config =
6380 ac_shader_binary_config_start(binary, symbol_offset);
6381 bool really_needs_scratch = false;
6382
6383 /* LLVM adds SGPR spills to the scratch size.
6384 * Find out if we really need the scratch buffer.
6385 */
6386 for (i = 0; i < binary->reloc_count; i++) {
6387 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6388
6389 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6390 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6391 really_needs_scratch = true;
6392 break;
6393 }
6394 }
6395
6396 /* XXX: We may be able to emit some of these values directly rather than
6397 * extracting fields to be emitted later.
6398 */
6399
6400 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
6401 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6402 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6403 switch (reg) {
6404 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6405 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6406 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6407 case R_00B848_COMPUTE_PGM_RSRC1:
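/* SGPRs and VGPRs are allocated in granules: the fields encode
 * (granules - 1), with 8 SGPRs resp. 4 VGPRs per granule.
 */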
6408 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6409 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6410 conf->float_mode = G_00B028_FLOAT_MODE(value);
6411 conf->rsrc1 = value;
6412 break;
6413 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6414 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6415 break;
6416 case R_00B84C_COMPUTE_PGM_RSRC2:
6417 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6418 conf->rsrc2 = value;
6419 break;
6420 case R_0286CC_SPI_PS_INPUT_ENA:
6421 conf->spi_ps_input_ena = value;
6422 break;
6423 case R_0286D0_SPI_PS_INPUT_ADDR:
6424 conf->spi_ps_input_addr = value;
6425 break;
6426 case R_0286E8_SPI_TMPRING_SIZE:
6427 case R_00B860_COMPUTE_TMPRING_SIZE:
6428 /* WAVESIZE is in units of 256 dwords. */
6429 if (really_needs_scratch)
6430 conf->scratch_bytes_per_wave =
6431 G_00B860_WAVESIZE(value) * 256 * 4;
6432 break;
6433 case 0x4: /* SPILLED_SGPRS */
6434 conf->spilled_sgprs = value;
6435 break;
6436 case 0x8: /* SPILLED_VGPRS */
6437 conf->spilled_vgprs = value;
6438 break;
6439 default:
6440 {
6441 static bool printed;
6442
6443 if (!printed) {
6444 fprintf(stderr, "Warning: LLVM emitted unknown "
6445 "config register: 0x%x\n", reg);
6446 printed = true;
6447 }
6448 }
6449 break;
6450 }
6451 }
6452
6453 if (!conf->spi_ps_input_addr)
6454 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6455 }
6456
6457 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6458 struct si_shader *shader,
6459 struct si_shader_config *config,
6460 uint64_t scratch_va)
6461 {
6462 unsigned i;
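/* The 64-bit scratch base address is split across the first two
 * dwords of the scratch buffer descriptor; relocations patch both
 * with the actual VA.
 */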
6463 uint32_t scratch_rsrc_dword0 = scratch_va;
6464 uint32_t scratch_rsrc_dword1 =
6465 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6466
6467 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6468 * correctly.
6469 */
6470 if (HAVE_LLVM >= 0x0309)
6471 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6472 else
6473 scratch_rsrc_dword1 |=
6474 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6475
6476 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6477 const struct ac_shader_reloc *reloc =
6478 &shader->binary.relocs[i];
6479 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6480 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6481 &scratch_rsrc_dword0, 4);
6482 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6483 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6484 &scratch_rsrc_dword1, 4);
6485 }
6486 }
6487 }
6488
6489 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6490 {
6491 unsigned size = shader->binary.code_size;
6492
6493 if (shader->prolog)
6494 size += shader->prolog->binary.code_size;
6495 if (shader->previous_stage)
6496 size += shader->previous_stage->binary.code_size;
6497 if (shader->prolog2)
6498 size += shader->prolog2->binary.code_size;
6499 if (shader->epilog)
6500 size += shader->epilog->binary.code_size;
6501 return size;
6502 }
6503
6504 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6505 {
6506 const struct ac_shader_binary *prolog =
6507 shader->prolog ? &shader->prolog->binary : NULL;
6508 const struct ac_shader_binary *previous_stage =
6509 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6510 const struct ac_shader_binary *prolog2 =
6511 shader->prolog2 ? &shader->prolog2->binary : NULL;
6512 const struct ac_shader_binary *epilog =
6513 shader->epilog ? &shader->epilog->binary : NULL;
6514 const struct ac_shader_binary *mainb = &shader->binary;
6515 unsigned bo_size = si_get_shader_binary_size(shader) +
6516 (!epilog ? mainb->rodata_size : 0);
6517 unsigned char *ptr;
6518
6519 assert(!prolog || !prolog->rodata_size);
6520 assert(!previous_stage || !previous_stage->rodata_size);
6521 assert(!prolog2 || !prolog2->rodata_size);
6522 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
6523 !mainb->rodata_size);
6524 assert(!epilog || !epilog->rodata_size);
6525
6526 /* GFX9 can fetch at most 128 bytes past the end of the shader.
6527 * Prevent VM faults.
6528 */
6529 if (sscreen->b.chip_class >= GFX9)
6530 bo_size += 128;
6531
6532 r600_resource_reference(&shader->bo, NULL);
6533 shader->bo = (struct r600_resource*)
6534 pipe_buffer_create(&sscreen->b.b, 0,
6535 PIPE_USAGE_IMMUTABLE,
6536 align(bo_size, SI_CPDMA_ALIGNMENT));
6537 if (!shader->bo)
6538 return -ENOMEM;
6539
6540 /* Upload. */
6541 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6542 PIPE_TRANSFER_READ_WRITE |
6543 PIPE_TRANSFER_UNSYNCHRONIZED);
6544
6545 if (prolog) {
6546 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6547 ptr += prolog->code_size;
6548 }
6549 if (previous_stage) {
6550 util_memcpy_cpu_to_le32(ptr, previous_stage->code,
6551 previous_stage->code_size);
6552 ptr += previous_stage->code_size;
6553 }
6554 if (prolog2) {
6555 util_memcpy_cpu_to_le32(ptr, prolog2->code, prolog2->code_size);
6556 ptr += prolog2->code_size;
6557 }
6558
6559 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6560 ptr += mainb->code_size;
6561
6562 if (epilog)
6563 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6564 else if (mainb->rodata_size > 0)
6565 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6566
6567 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6568 return 0;
6569 }
6570
6571 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6572 struct pipe_debug_callback *debug,
6573 const char *name, FILE *file)
6574 {
6575 char *line, *p;
6576 unsigned i, count;
6577
6578 if (binary->disasm_string) {
6579 fprintf(file, "Shader %s disassembly:\n", name);
6580 fprintf(file, "%s", binary->disasm_string);
6581
6582 if (debug && debug->debug_message) {
6583 /* Very long debug messages are cut off, so send the
6584 * disassembly one line at a time. This causes more
6585 * overhead, but on the plus side it simplifies
6586 * parsing of resulting logs.
6587 */
6588 pipe_debug_message(debug, SHADER_INFO,
6589 "Shader Disassembly Begin");
6590
6591 line = binary->disasm_string;
6592 while (*line) {
6593 p = util_strchrnul(line, '\n');
6594 count = p - line;
6595
6596 if (count) {
6597 pipe_debug_message(debug, SHADER_INFO,
6598 "%.*s", count, line);
6599 }
6600
6601 if (!*p)
6602 break;
6603 line = p + 1;
6604 }
6605
6606 pipe_debug_message(debug, SHADER_INFO,
6607 "Shader Disassembly End");
6608 }
6609 } else {
6610 fprintf(file, "Shader %s binary:\n", name);
6611 for (i = 0; i < binary->code_size; i += 4) {
6612 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6613 binary->code[i + 3], binary->code[i + 2],
6614 binary->code[i + 1], binary->code[i]);
6615 }
6616 }
6617 }
6618
6619 static void si_shader_dump_stats(struct si_screen *sscreen,
6620 struct si_shader *shader,
6621 struct pipe_debug_callback *debug,
6622 unsigned processor,
6623 FILE *file,
6624 bool check_debug_option)
6625 {
6626 struct si_shader_config *conf = &shader->config;
6627 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6628 unsigned code_size = si_get_shader_binary_size(shader);
6629 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6630 unsigned lds_per_wave = 0;
6631 unsigned max_simd_waves = 10;
6632
6633 /* Compute LDS usage for PS. */
6634 switch (processor) {
6635 case PIPE_SHADER_FRAGMENT:
6636 /* The minimum usage per wave is (num_inputs * 48) bytes. The
6637 * maximum usage is (num_inputs * 48 * 16) bytes.
6638 * We can get anything in between and it varies between waves.
6639 *
6640 * The 48 bytes per input for a single primitive is equal to
6641 * 4 bytes/component * 4 components/input * 3 points.
6642 *
6643 * Other stages don't know the size at compile time or don't
6644 * allocate LDS per wave, but instead they do it per thread group.
6645 */
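/* Worked example (illustrative numbers only): with lds_size = 0,
 * num_inputs = 8 and lds_increment = 512 (CIK+), this assumes
 * align(8 * 48, 512) = 512 bytes of LDS per wave.
 */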
6646 lds_per_wave = conf->lds_size * lds_increment +
6647 align(num_inputs * 48, lds_increment);
6648 break;
6649 case PIPE_SHADER_COMPUTE:
6650 if (shader->selector) {
6651 unsigned max_workgroup_size =
6652 si_get_max_workgroup_size(shader);
6653 lds_per_wave = (conf->lds_size * lds_increment) /
6654 DIV_ROUND_UP(max_workgroup_size, 64);
6655 }
6656 break;
6657 }
6658
6659 /* Compute the per-SIMD wave counts. */
6660 if (conf->num_sgprs) {
6661 if (sscreen->b.chip_class >= VI)
6662 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6663 else
6664 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6665 }
6666
6667 if (conf->num_vgprs)
6668 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6669
6670 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6671 * 16KB makes some SIMDs unoccupied). */
6672 if (lds_per_wave)
6673 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
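/* Worked example (illustrative numbers only): on VI with
 * num_sgprs = 40, num_vgprs = 48 and no LDS, SGPRs allow
 * MIN2(10, 800 / 40) = 10 waves and VGPRs allow
 * MIN2(10, 256 / 48) = 5 waves, so max_simd_waves = 5.
 */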
6674
6675 if (!check_debug_option ||
6676 r600_can_dump_shader(&sscreen->b, processor)) {
6677 if (processor == PIPE_SHADER_FRAGMENT) {
6678 fprintf(file, "*** SHADER CONFIG ***\n"
6679 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6680 "SPI_PS_INPUT_ENA = 0x%04x\n",
6681 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6682 }
6683
6684 fprintf(file, "*** SHADER STATS ***\n"
6685 "SGPRS: %d\n"
6686 "VGPRS: %d\n"
6687 "Spilled SGPRs: %d\n"
6688 "Spilled VGPRs: %d\n"
6689 "Private memory VGPRs: %d\n"
6690 "Code Size: %d bytes\n"
6691 "LDS: %d blocks\n"
6692 "Scratch: %d bytes per wave\n"
6693 "Max Waves: %d\n"
6694 "********************\n\n\n",
6695 conf->num_sgprs, conf->num_vgprs,
6696 conf->spilled_sgprs, conf->spilled_vgprs,
6697 conf->private_mem_vgprs, code_size,
6698 conf->lds_size, conf->scratch_bytes_per_wave,
6699 max_simd_waves);
6700 }
6701
6702 pipe_debug_message(debug, SHADER_INFO,
6703 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6704 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6705 "Spilled VGPRs: %d PrivMem VGPRs: %d",
6706 conf->num_sgprs, conf->num_vgprs, code_size,
6707 conf->lds_size, conf->scratch_bytes_per_wave,
6708 max_simd_waves, conf->spilled_sgprs,
6709 conf->spilled_vgprs, conf->private_mem_vgprs);
6710 }
6711
6712 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6713 {
6714 switch (processor) {
6715 case PIPE_SHADER_VERTEX:
6716 if (shader->key.as_es)
6717 return "Vertex Shader as ES";
6718 else if (shader->key.as_ls)
6719 return "Vertex Shader as LS";
6720 else
6721 return "Vertex Shader as VS";
6722 case PIPE_SHADER_TESS_CTRL:
6723 return "Tessellation Control Shader";
6724 case PIPE_SHADER_TESS_EVAL:
6725 if (shader->key.as_es)
6726 return "Tessellation Evaluation Shader as ES";
6727 else
6728 return "Tessellation Evaluation Shader as VS";
6729 case PIPE_SHADER_GEOMETRY:
6730 if (shader->is_gs_copy_shader)
6731 return "GS Copy Shader as VS";
6732 else
6733 return "Geometry Shader";
6734 case PIPE_SHADER_FRAGMENT:
6735 return "Pixel Shader";
6736 case PIPE_SHADER_COMPUTE:
6737 return "Compute Shader";
6738 default:
6739 return "Unknown Shader";
6740 }
6741 }
6742
6743 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6744 struct pipe_debug_callback *debug, unsigned processor,
6745 FILE *file, bool check_debug_option)
6746 {
6747 if (!check_debug_option ||
6748 r600_can_dump_shader(&sscreen->b, processor))
6749 si_dump_shader_key(processor, shader, file);
6750
6751 if (!check_debug_option && shader->binary.llvm_ir_string) {
6752 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6753 si_get_shader_name(shader, processor));
6754 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6755 }
6756
6757 if (!check_debug_option ||
6758 (r600_can_dump_shader(&sscreen->b, processor) &&
6759 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6760 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6761
6762 if (shader->prolog)
6763 si_shader_dump_disassembly(&shader->prolog->binary,
6764 debug, "prolog", file);
6765 if (shader->previous_stage)
6766 si_shader_dump_disassembly(&shader->previous_stage->binary,
6767 debug, "previous stage", file);
6768 if (shader->prolog2)
6769 si_shader_dump_disassembly(&shader->prolog2->binary,
6770 debug, "prolog2", file);
6771
6772 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6773
6774 if (shader->epilog)
6775 si_shader_dump_disassembly(&shader->epilog->binary,
6776 debug, "epilog", file);
6777 fprintf(file, "\n");
6778 }
6779
6780 si_shader_dump_stats(sscreen, shader, debug, processor, file,
6781 check_debug_option);
6782 }
6783
6784 int si_compile_llvm(struct si_screen *sscreen,
6785 struct ac_shader_binary *binary,
6786 struct si_shader_config *conf,
6787 LLVMTargetMachineRef tm,
6788 LLVMModuleRef mod,
6789 struct pipe_debug_callback *debug,
6790 unsigned processor,
6791 const char *name)
6792 {
6793 int r = 0;
6794 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6795
6796 if (r600_can_dump_shader(&sscreen->b, processor)) {
6797 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6798
6799 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6800 fprintf(stderr, "%s LLVM IR:\n\n", name);
6801 ac_dump_module(mod);
6802 fprintf(stderr, "\n");
6803 }
6804 }
6805
6806 if (sscreen->record_llvm_ir) {
6807 char *ir = LLVMPrintModuleToString(mod);
6808 binary->llvm_ir_string = strdup(ir);
6809 LLVMDisposeMessage(ir);
6810 }
6811
6812 if (!si_replace_shader(count, binary)) {
6813 r = si_llvm_compile(mod, binary, tm, debug);
6814 if (r)
6815 return r;
6816 }
6817
6818 si_shader_binary_read_config(binary, conf, 0);
6819
6820 /* Enable 64-bit and 16-bit denormals, because there is no performance
6821 * cost.
6822 *
6823 * If denormals are enabled, all floating-point output modifiers are
6824 * ignored.
6825 *
6826 * Don't enable denormals for 32-bit floats, because:
6827 * - Floating-point output modifiers would be ignored by the hw.
6828 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6829 * have to stop using those.
6830 * - SI & CI would be very slow.
6831 */
6832 conf->float_mode |= V_00B028_FP_64_DENORMS;
6833
6834 FREE(binary->config);
6835 FREE(binary->global_symbol_offsets);
6836 binary->config = NULL;
6837 binary->global_symbol_offsets = NULL;
6838
6839 /* Some shaders can't have rodata because their binaries can be
6840 * concatenated.
6841 */
6842 if (binary->rodata_size &&
6843 (processor == PIPE_SHADER_VERTEX ||
6844 processor == PIPE_SHADER_TESS_CTRL ||
6845 processor == PIPE_SHADER_TESS_EVAL ||
6846 processor == PIPE_SHADER_FRAGMENT)) {
6847 fprintf(stderr, "radeonsi: The shader can't have rodata.\n");
6848 return -EINVAL;
6849 }
6850
6851 return r;
6852 }
6853
6854 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6855 {
6856 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6857 LLVMBuildRetVoid(ctx->gallivm.builder);
6858 else
6859 LLVMBuildRet(ctx->gallivm.builder, ret);
6860 }
6861
6862 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6863 struct si_shader *
6864 si_generate_gs_copy_shader(struct si_screen *sscreen,
6865 LLVMTargetMachineRef tm,
6866 struct si_shader_selector *gs_selector,
6867 struct pipe_debug_callback *debug)
6868 {
6869 struct si_shader_context ctx;
6870 struct si_shader *shader;
6871 struct gallivm_state *gallivm = &ctx.gallivm;
6872 LLVMBuilderRef builder;
6873 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6874 struct lp_build_context *uint = &bld_base->uint_bld;
6875 struct si_shader_output_values *outputs;
6876 struct tgsi_shader_info *gsinfo = &gs_selector->info;
6877 int i, r;
6878
6879 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6880
6881 if (!outputs)
6882 return NULL;
6883
6884 shader = CALLOC_STRUCT(si_shader);
6885 if (!shader) {
6886 FREE(outputs);
6887 return NULL;
6888 }
6889
6891 shader->selector = gs_selector;
6892 shader->is_gs_copy_shader = true;
6893
6894 si_init_shader_ctx(&ctx, sscreen, tm);
6895 ctx.shader = shader;
6896 ctx.type = PIPE_SHADER_VERTEX;
6897
6898 builder = gallivm->builder;
6899
6900 create_function(&ctx);
6901 preload_ring_buffers(&ctx);
6902
6903 LLVMValueRef voffset =
6904 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6905 ctx.param_vertex_id), 4);
6906
6907 /* Fetch the vertex stream ID. */
6908 LLVMValueRef stream_id;
6909
6910 if (gs_selector->so.num_outputs)
6911 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6912 else
6913 stream_id = ctx.i32_0;
6914
6915 /* Fill in output information. */
6916 for (i = 0; i < gsinfo->num_outputs; ++i) {
6917 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6918 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6919
6920 for (int chan = 0; chan < 4; chan++) {
6921 outputs[i].vertex_stream[chan] =
6922 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6923 }
6924 }
6925
6926 LLVMBasicBlockRef end_bb;
6927 LLVMValueRef switch_inst;
6928
6929 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6930 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6931
6932 for (int stream = 0; stream < 4; stream++) {
6933 LLVMBasicBlockRef bb;
6934 unsigned offset;
6935
6936 if (!gsinfo->num_stream_output_components[stream])
6937 continue;
6938
6939 if (stream > 0 && !gs_selector->so.num_outputs)
6940 continue;
6941
6942 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6943 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
6944 LLVMPositionBuilderAtEnd(builder, bb);
6945
6946 /* Fetch vertex data from GSVS ring */
6947 offset = 0;
6948 for (i = 0; i < gsinfo->num_outputs; ++i) {
6949 for (unsigned chan = 0; chan < 4; chan++) {
6950 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6951 outputs[i].vertex_stream[chan] != stream) {
6952 outputs[i].values[chan] = ctx.bld_base.base.undef;
6953 continue;
6954 }
6955
6956 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
6957 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
6958 offset++;
6959
6960 outputs[i].values[chan] =
6961 ac_build_buffer_load(&ctx.ac,
6962 ctx.gsvs_ring[0], 1,
6963 ctx.i32_0, voffset,
6964 soffset, 0, 1, 1, true);
6965 }
6966 }
6967
6968 /* Streamout and exports. */
6969 if (gs_selector->so.num_outputs) {
6970 si_llvm_emit_streamout(&ctx, outputs,
6971 gsinfo->num_outputs,
6972 stream);
6973 }
6974
6975 if (stream == 0)
6976 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6977
6978 LLVMBuildBr(builder, end_bb);
6979 }
6980
6981 LLVMPositionBuilderAtEnd(builder, end_bb);
6982
6983 LLVMBuildRetVoid(gallivm->builder);
6984
6985 /* Dump LLVM IR before any optimization passes */
6986 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6987 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6988 ac_dump_module(ctx.gallivm.module);
6989
6990 si_llvm_finalize_module(&ctx,
6991 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6992
6993 r = si_compile_llvm(sscreen, &ctx.shader->binary,
6994 &ctx.shader->config, ctx.tm,
6995 ctx.gallivm.module,
6996 debug, PIPE_SHADER_GEOMETRY,
6997 "GS Copy Shader");
6998 if (!r) {
6999 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
7000 fprintf(stderr, "GS Copy Shader:\n");
7001 si_shader_dump(sscreen, ctx.shader, debug,
7002 PIPE_SHADER_GEOMETRY, stderr, true);
7003 r = si_shader_binary_upload(sscreen, ctx.shader);
7004 }
7005
7006 si_llvm_dispose(&ctx);
7007
7008 FREE(outputs);
7009
7010 if (r != 0) {
7011 FREE(shader);
7012 shader = NULL;
7013 }
7014 return shader;
7015 }
7016
7017 static void si_dump_shader_key_vs(struct si_shader_key *key,
7018 struct si_vs_prolog_bits *prolog,
7019 const char *prefix, FILE *f)
7020 {
7021 fprintf(f, " %s.instance_divisors = {", prefix);
7022 for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
7023 fprintf(f, !i ? "%u" : ", %u",
7024 prolog->instance_divisors[i]);
7025 }
7026 fprintf(f, "}\n");
7027
7028 fprintf(f, " mono.vs.fix_fetch = {");
7029 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
7030 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
7031 fprintf(f, "}\n");
7032 }
7033
7034 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
7035 FILE *f)
7036 {
7037 struct si_shader_key *key = &shader->key;
7038
7039 fprintf(f, "SHADER KEY\n");
7040
7041 switch (processor) {
7042 case PIPE_SHADER_VERTEX:
7043 si_dump_shader_key_vs(key, &key->part.vs.prolog,
7044 "part.vs.prolog", f);
7045 fprintf(f, " as_es = %u\n", key->as_es);
7046 fprintf(f, " as_ls = %u\n", key->as_ls);
7047 fprintf(f, " mono.vs_export_prim_id = %u\n",
7048 key->mono.vs_export_prim_id);
7049 break;
7050
7051 case PIPE_SHADER_TESS_CTRL:
7052 if (shader->selector->screen->b.chip_class >= GFX9) {
7053 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
7054 "part.tcs.ls_prolog", f);
7055 }
7056 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
7057 fprintf(f, " mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
7058 break;
7059
7060 case PIPE_SHADER_TESS_EVAL:
7061 fprintf(f, " as_es = %u\n", key->as_es);
7062 fprintf(f, " mono.vs_export_prim_id = %u\n",
7063 key->mono.vs_export_prim_id);
7064 break;
7065
7066 case PIPE_SHADER_GEOMETRY:
7067 if (shader->is_gs_copy_shader)
7068 break;
7069
7070 if (shader->selector->screen->b.chip_class >= GFX9 &&
7071 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
7072 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
7073 "part.gs.vs_prolog", f);
7074 }
7075 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
7076 break;
7077
7078 case PIPE_SHADER_COMPUTE:
7079 break;
7080
7081 case PIPE_SHADER_FRAGMENT:
7082 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
7083 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
7084 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
7085 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
7086 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
7087 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
7088 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
7089 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
7090 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
7091 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
7092 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
7093 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
7094 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
7095 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
7096 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
7097 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
7098 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
7099 break;
7100
7101 default:
7102 assert(0);
7103 }
7104
7105 if ((processor == PIPE_SHADER_GEOMETRY ||
7106 processor == PIPE_SHADER_TESS_EVAL ||
7107 processor == PIPE_SHADER_VERTEX) &&
7108 !key->as_es && !key->as_ls) {
7109 fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
7110 fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
7111 fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
7112 }
7113 }
7114
7115 static void si_init_shader_ctx(struct si_shader_context *ctx,
7116 struct si_screen *sscreen,
7117 LLVMTargetMachineRef tm)
7118 {
7119 struct lp_build_tgsi_context *bld_base;
7120 struct lp_build_tgsi_action tmpl = {};
7121
7122 si_llvm_context_init(ctx, sscreen, tm);
7123
7124 bld_base = &ctx->bld_base;
7125 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
7126
7127 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
7128 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
7129 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
7130
7131 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
7132 bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
7133 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
7134 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
7135 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
7136 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
7137 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
7138 bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
7139 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
7140 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
7141 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
7142 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
7143 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
7144 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
7145 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
7146 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
7147
7148 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
7149 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
7150 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
7151 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
7152 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
7153 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
7154
7155 tmpl.fetch_args = atomic_fetch_args;
7156 tmpl.emit = atomic_emit;
7157 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
7158 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
7159 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
7160 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
7161 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
7162 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
7163 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
7164 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
7165 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
7166 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
7167 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
7168 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
7169 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
7170 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
7171 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
7172 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
7173 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
7174 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
7175 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
7176 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
7177
7178 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
7179
7180 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
7181
7182 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
7183 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
7184 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
7185 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
7186
7187 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
7188 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
7189 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
7190 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
7191 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
7192 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
7193 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
7194 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
7195 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
7196
7197 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
7198 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
7199 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
7200 }
7201
7202 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
7203 {
7204 struct si_shader *shader = ctx->shader;
7205 struct tgsi_shader_info *info = &shader->selector->info;
7206
7207 if (ctx->type == PIPE_SHADER_FRAGMENT ||
7208 ctx->type == PIPE_SHADER_COMPUTE ||
7209 shader->key.as_es ||
7210 shader->key.as_ls)
7211 return;
7212
7213 ac_eliminate_const_vs_outputs(&ctx->ac,
7214 ctx->main_fn,
7215 shader->info.vs_output_param_offset,
7216 info->num_outputs,
7217 &shader->info.nr_param_exports);
7218 }
7219
7220 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
7221 {
7222 ctx->shader->config.private_mem_vgprs = 0;
7223
7224 /* Process all LLVM instructions. */
7225 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
7226 while (bb) {
7227 LLVMValueRef next = LLVMGetFirstInstruction(bb);
7228
7229 while (next) {
7230 LLVMValueRef inst = next;
7231 next = LLVMGetNextInstruction(next);
7232
7233 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
7234 continue;
7235
7236 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
7237 /* No idea why LLVM aligns allocas to 4 elements. */
7238 unsigned alignment = LLVMGetAlignment(inst);
7239 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
7240 ctx->shader->config.private_mem_vgprs += dw_size;
7241 }
7242 bb = LLVMGetNextBasicBlock(bb);
7243 }
7244 }
7245
7246 static void si_init_exec_full_mask(struct si_shader_context *ctx)
7247 {
7248 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
7249 lp_build_intrinsic(ctx->gallivm.builder,
7250 "llvm.amdgcn.init.exec", ctx->voidt,
7251 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
7252 }
7253
7254 static void si_init_exec_from_input(struct si_shader_context *ctx,
7255 unsigned param, unsigned bitoffset)
7256 {
7257 LLVMValueRef args[] = {
7258 LLVMGetParam(ctx->main_fn, param),
7259 LLVMConstInt(ctx->i32, bitoffset, 0),
7260 };
7261 lp_build_intrinsic(ctx->gallivm.builder,
7262 "llvm.amdgcn.init.exec.from.input",
7263 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
7264 }
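/* Note: the call sites below pass bitoffset 0 for the first shader of
 * a merged pair and 8 for the second one, matching the thread counts
 * packed at 8-bit offsets into the merged wave info SGPR.
 */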
7265
7266 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
7267 bool is_monolithic)
7268 {
7269 struct si_shader *shader = ctx->shader;
7270 struct si_shader_selector *sel = shader->selector;
7271 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7272
7273 switch (ctx->type) {
7274 case PIPE_SHADER_VERTEX:
7275 ctx->load_input = declare_input_vs;
7276 if (shader->key.as_ls)
7277 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
7278 else if (shader->key.as_es)
7279 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7280 else
7281 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7282 break;
7283 case PIPE_SHADER_TESS_CTRL:
7284 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
7285 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
7286 bld_base->emit_store = store_output_tcs;
7287 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
7288 break;
7289 case PIPE_SHADER_TESS_EVAL:
7290 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
7291 if (shader->key.as_es)
7292 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7293 else
7294 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7295 break;
7296 case PIPE_SHADER_GEOMETRY:
7297 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
7298 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
7299 break;
7300 case PIPE_SHADER_FRAGMENT:
7301 ctx->load_input = declare_input_fs;
7302 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
7303 break;
7304 case PIPE_SHADER_COMPUTE:
7305 ctx->declare_memory_region = declare_compute_memory;
7306 break;
7307 default:
7308 assert(!"Unsupported shader type");
7309 return false;
7310 }
7311
7312 create_function(ctx);
7313 preload_ring_buffers(ctx);
7314
7315 /* For GFX9 merged shaders:
7316 * - Set EXEC. If the prolog is present, set EXEC there instead.
7317 * - Add a barrier before the second shader.
7318 *
7319 * The same thing for monolithic shaders is done in
7320 * si_build_wrapper_function.
7321 */
7322 if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
7323 if (sel->info.num_instructions > 1 && /* not empty shader */
7324 (shader->key.as_es || shader->key.as_ls) &&
7325 (ctx->type == PIPE_SHADER_TESS_EVAL ||
7326 (ctx->type == PIPE_SHADER_VERTEX &&
7327 !sel->vs_needs_prolog))) {
7328 si_init_exec_from_input(ctx,
7329 ctx->param_merged_wave_info, 0);
7330 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
7331 ctx->type == PIPE_SHADER_GEOMETRY) {
7332 si_init_exec_from_input(ctx,
7333 ctx->param_merged_wave_info, 8);
7334 si_llvm_emit_barrier(NULL, bld_base, NULL);
7335 }
7336 }
7337
7338 if (ctx->type == PIPE_SHADER_GEOMETRY) {
7339 int i;
7340 for (i = 0; i < 4; i++) {
7341 ctx->gs_next_vertex[i] =
7342 lp_build_alloca(&ctx->gallivm,
7343 ctx->i32, "");
7344 }
7345 }
7346
7347 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
7348 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
7349 return false;
7350 }
7351
7352 si_llvm_build_ret(ctx, ctx->return_value);
7353 return true;
7354 }
7355
7356 /**
7357 * Compute the VS prolog key, which contains all the information needed to
7358 * build the VS prolog function, and set shader->info bits where needed.
7359 *
7360 * \param info Shader info of the vertex shader.
7361 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
7362 * \param prolog_key Key of the VS prolog
7363 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
7364 * \param key Output shader part key.
7365 */
7366 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7367 unsigned num_input_sgprs,
7368 const struct si_vs_prolog_bits *prolog_key,
7369 struct si_shader *shader_out,
7370 union si_shader_part_key *key)
7371 {
7372 memset(key, 0, sizeof(*key));
7373 key->vs_prolog.states = *prolog_key;
7374 key->vs_prolog.num_input_sgprs = num_input_sgprs;
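/* MAX2 guards against num_inputs == 0, so last_input never underflows. */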
7375 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7376 key->vs_prolog.as_ls = shader_out->key.as_ls;
7377
7378 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
7379 key->vs_prolog.as_ls = 1;
7380 key->vs_prolog.num_merged_next_stage_vgprs = 2;
7381 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
7382 key->vs_prolog.num_merged_next_stage_vgprs = 5;
7383 }
7384
7385 /* Set the instanceID flag. */
7386 for (unsigned i = 0; i < info->num_inputs; i++)
7387 if (key->vs_prolog.states.instance_divisors[i])
7388 shader_out->info.uses_instanceid = true;
7389 }
7390
7391 /**
7392 * Compute the PS prolog key, which contains all the information needed to
7393 * build the PS prolog function, and set related bits in shader->config.
7394 */
7395 static void si_get_ps_prolog_key(struct si_shader *shader,
7396 union si_shader_part_key *key,
7397 bool separate_prolog)
7398 {
7399 struct tgsi_shader_info *info = &shader->selector->info;
7400
7401 memset(key, 0, sizeof(*key));
7402 key->ps_prolog.states = shader->key.part.ps.prolog;
7403 key->ps_prolog.colors_read = info->colors_read;
7404 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7405 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7406 key->ps_prolog.wqm = info->uses_derivatives &&
7407 (key->ps_prolog.colors_read ||
7408 key->ps_prolog.states.force_persp_sample_interp ||
7409 key->ps_prolog.states.force_linear_sample_interp ||
7410 key->ps_prolog.states.force_persp_center_interp ||
7411 key->ps_prolog.states.force_linear_center_interp ||
7412 key->ps_prolog.states.bc_optimize_for_persp ||
7413 key->ps_prolog.states.bc_optimize_for_linear);
7414
7415 if (info->colors_read) {
7416 unsigned *color = shader->selector->color_attr_index;
7417
7418 if (shader->key.part.ps.prolog.color_two_side) {
7419 /* BCOLORs are stored after the last input. */
7420 key->ps_prolog.num_interp_inputs = info->num_inputs;
7421 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7422 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7423 }
7424
7425 for (unsigned i = 0; i < 2; i++) {
7426 unsigned interp = info->input_interpolate[color[i]];
7427 unsigned location = info->input_interpolate_loc[color[i]];
7428
7429 if (!(info->colors_read & (0xf << i*4)))
7430 continue;
7431
7432 key->ps_prolog.color_attr_index[i] = color[i];
7433
7434 if (shader->key.part.ps.prolog.flatshade_colors &&
7435 interp == TGSI_INTERPOLATE_COLOR)
7436 interp = TGSI_INTERPOLATE_CONSTANT;
7437
7438 switch (interp) {
7439 case TGSI_INTERPOLATE_CONSTANT:
7440 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7441 break;
7442 case TGSI_INTERPOLATE_PERSPECTIVE:
7443 case TGSI_INTERPOLATE_COLOR:
7444 /* Force the interpolation location for colors here. */
7445 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7446 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7447 if (shader->key.part.ps.prolog.force_persp_center_interp)
7448 location = TGSI_INTERPOLATE_LOC_CENTER;
7449
7450 switch (location) {
7451 case TGSI_INTERPOLATE_LOC_SAMPLE:
7452 key->ps_prolog.color_interp_vgpr_index[i] = 0;
7453 shader->config.spi_ps_input_ena |=
7454 S_0286CC_PERSP_SAMPLE_ENA(1);
7455 break;
7456 case TGSI_INTERPOLATE_LOC_CENTER:
7457 key->ps_prolog.color_interp_vgpr_index[i] = 2;
7458 shader->config.spi_ps_input_ena |=
7459 S_0286CC_PERSP_CENTER_ENA(1);
7460 break;
7461 case TGSI_INTERPOLATE_LOC_CENTROID:
7462 key->ps_prolog.color_interp_vgpr_index[i] = 4;
7463 shader->config.spi_ps_input_ena |=
7464 S_0286CC_PERSP_CENTROID_ENA(1);
7465 break;
7466 default:
7467 assert(0);
7468 }
7469 break;
7470 case TGSI_INTERPOLATE_LINEAR:
7471 /* Force the interpolation location for colors here. */
7472 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7473 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7474 if (shader->key.part.ps.prolog.force_linear_center_interp)
7475 location = TGSI_INTERPOLATE_LOC_CENTER;
7476
7477 /* The VGPR assignment for non-monolithic shaders
7478 * works because InitialPSInputAddr is set on the
7479 * main shader and PERSP_PULL_MODEL is never used.
7480 */
7481 switch (location) {
7482 case TGSI_INTERPOLATE_LOC_SAMPLE:
7483 key->ps_prolog.color_interp_vgpr_index[i] =
7484 separate_prolog ? 6 : 9;
7485 shader->config.spi_ps_input_ena |=
7486 S_0286CC_LINEAR_SAMPLE_ENA(1);
7487 break;
7488 case TGSI_INTERPOLATE_LOC_CENTER:
7489 key->ps_prolog.color_interp_vgpr_index[i] =
7490 separate_prolog ? 8 : 11;
7491 shader->config.spi_ps_input_ena |=
7492 S_0286CC_LINEAR_CENTER_ENA(1);
7493 break;
7494 case TGSI_INTERPOLATE_LOC_CENTROID:
7495 key->ps_prolog.color_interp_vgpr_index[i] =
7496 separate_prolog ? 10 : 13;
7497 shader->config.spi_ps_input_ena |=
7498 S_0286CC_LINEAR_CENTROID_ENA(1);
7499 break;
7500 default:
7501 assert(0);
7502 }
7503 break;
7504 default:
7505 assert(0);
7506 }
7507 }
7508 }
7509 }
7510
7511 /**
7512 * Check whether a PS prolog is required based on the key.
7513 */
7514 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7515 {
7516 return key->ps_prolog.colors_read ||
7517 key->ps_prolog.states.force_persp_sample_interp ||
7518 key->ps_prolog.states.force_linear_sample_interp ||
7519 key->ps_prolog.states.force_persp_center_interp ||
7520 key->ps_prolog.states.force_linear_center_interp ||
7521 key->ps_prolog.states.bc_optimize_for_persp ||
7522 key->ps_prolog.states.bc_optimize_for_linear ||
7523 key->ps_prolog.states.poly_stipple;
7524 }
7525
7526 /**
7527 * Compute the PS epilog key, which contains all the information needed to
7528 * build the PS epilog function.
7529 */
7530 static void si_get_ps_epilog_key(struct si_shader *shader,
7531 union si_shader_part_key *key)
7532 {
7533 struct tgsi_shader_info *info = &shader->selector->info;
7534 memset(key, 0, sizeof(*key));
7535 key->ps_epilog.colors_written = info->colors_written;
7536 key->ps_epilog.writes_z = info->writes_z;
7537 key->ps_epilog.writes_stencil = info->writes_stencil;
7538 key->ps_epilog.writes_samplemask = info->writes_samplemask;
7539 key->ps_epilog.states = shader->key.part.ps.epilog;
7540 }
7541
7542 /**
7543 * Build the GS prolog function. Rotate the input vertices for triangle strips
7544 * with adjacency.
7545 */
7546 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7547 union si_shader_part_key *key)
7548 {
7549 unsigned num_sgprs, num_vgprs;
7550 struct gallivm_state *gallivm = &ctx->gallivm;
7551 LLVMBuilderRef builder = gallivm->builder;
7552 LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
7553 LLVMTypeRef returns[48];
7554 LLVMValueRef func, ret;
7555
7556 if (ctx->screen->b.chip_class >= GFX9) {
7557 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
7558 num_vgprs = 5; /* ES inputs are not needed by GS */
7559 } else {
7560 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
7561 num_vgprs = 8;
7562 }
7563
7564 for (unsigned i = 0; i < num_sgprs; ++i) {
7565 params[i] = ctx->i32;
7566 returns[i] = ctx->i32;
7567 }
7568
7569 for (unsigned i = 0; i < num_vgprs; ++i) {
7570 params[num_sgprs + i] = ctx->i32;
7571 returns[num_sgprs + i] = ctx->f32;
7572 }
7573
7574 /* Create the function. */
7575 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7576 params, num_sgprs + num_vgprs, num_sgprs - 1);
7577 func = ctx->main_fn;
7578
7579 /* Set the full EXEC mask for the prolog, because we are only fiddling
7580 * with registers here. The main shader part will set the correct EXEC
7581 * mask.
7582 */
7583 if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
7584 si_init_exec_full_mask(ctx);
7585
7586 /* Copy inputs to outputs. This should be a no-op, as the registers match,
7587 * but it will prevent the compiler from overwriting them unintentionally.
7588 */
7589 ret = ctx->return_value;
7590 for (unsigned i = 0; i < num_sgprs; i++) {
7591 LLVMValueRef p = LLVMGetParam(func, i);
7592 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7593 }
7594 for (unsigned i = 0; i < num_vgprs; i++) {
7595 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7596 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7597 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7598 }
7599
7600 if (key->gs_prolog.states.tri_strip_adj_fix) {
7601 /* Remap the input vertices for every other primitive. */
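/* With the fix enabled, the 6 input vertices of every odd primitive
 * are rotated by 4: (v0,v1,v2,v3,v4,v5) -> (v4,v5,v0,v1,v2,v3).
 * The low bit of the primitive ID selects whether to rotate.
 */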
7602 const unsigned gfx6_vtx_params[6] = {
7603 num_sgprs,
7604 num_sgprs + 1,
7605 num_sgprs + 3,
7606 num_sgprs + 4,
7607 num_sgprs + 5,
7608 num_sgprs + 6
7609 };
7610 const unsigned gfx9_vtx_params[3] = {
7611 num_sgprs,
7612 num_sgprs + 1,
7613 num_sgprs + 4,
7614 };
7615 LLVMValueRef vtx_in[6], vtx_out[6];
7616 LLVMValueRef prim_id, rotate;
7617
7618 if (ctx->screen->b.chip_class >= GFX9) {
7619 for (unsigned i = 0; i < 3; i++) {
7620 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
7621 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
7622 }
7623 } else {
7624 for (unsigned i = 0; i < 6; i++)
7625 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
7626 }
7627
7628 prim_id = LLVMGetParam(func, num_sgprs + 2);
7629 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7630
7631 for (unsigned i = 0; i < 6; ++i) {
7632 LLVMValueRef base, rotated;
7633 base = vtx_in[i];
7634 rotated = vtx_in[(i + 4) % 6];
7635 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
7636 }
7637
7638 if (ctx->screen->b.chip_class >= GFX9) {
7639 for (unsigned i = 0; i < 3; i++) {
7640 LLVMValueRef hi, out;
7641
7642 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
7643 LLVMConstInt(ctx->i32, 16, 0), "");
7644 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
7645 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
7646 ret = LLVMBuildInsertValue(builder, ret, out,
7647 gfx9_vtx_params[i], "");
7648 }
7649 } else {
7650 for (unsigned i = 0; i < 6; i++) {
7651 LLVMValueRef out;
7652
7653 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
7654 ret = LLVMBuildInsertValue(builder, ret, out,
7655 gfx6_vtx_params[i], "");
7656 }
7657 }
7658 }
7659
7660 LLVMBuildRet(builder, ret);
7661 }
7662
7663 /**
7664 * Given a list of shader part functions, build a wrapper function that
7665 * runs them in sequence to form a monolithic shader.
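 *
 * \param parts array of handles to the part functions, in execution order
 * \param num_parts number of entries in \p parts
 * \param main_part index of the main shader part in \p parts, whose
 *        parameter types define the wrapper's signature
 * \param next_shader_first_part index in \p parts of the first part of the
 *        second shader when two merged shaders are combined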
7666 */
7667 static void si_build_wrapper_function(struct si_shader_context *ctx,
7668 LLVMValueRef *parts,
7669 unsigned num_parts,
7670 unsigned main_part,
7671 unsigned next_shader_first_part)
7672 {
7673 struct gallivm_state *gallivm = &ctx->gallivm;
7674 LLVMBuilderRef builder = ctx->gallivm.builder;
7675 /* PS epilog has one arg per color component */
7676 LLVMTypeRef param_types[48];
7677 LLVMValueRef initial[48], out[48];
7678 LLVMTypeRef function_type;
7679 unsigned num_params;
7680 unsigned num_out, initial_num_out;
7681 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7682 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
7683 unsigned num_sgprs, num_vgprs;
7684 unsigned last_sgpr_param;
7685 unsigned gprs;
7686 struct lp_build_if_state if_state;
7687
7688 for (unsigned i = 0; i < num_parts; ++i) {
7689 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7690 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7691 }
7692
7693 /* The parameters of the wrapper function correspond to those of the
7694 * first part in terms of SGPRs and VGPRs, but we use the types of the
7695 * main part to get the right types. This is relevant for the
7696 * dereferenceable attribute on descriptor table pointers.
7697 */
7698 num_sgprs = 0;
7699 num_vgprs = 0;
7700
7701 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7702 num_params = LLVMCountParamTypes(function_type);
7703
7704 for (unsigned i = 0; i < num_params; ++i) {
7705 LLVMValueRef param = LLVMGetParam(parts[0], i);
7706
7707 if (ac_is_sgpr_param(param)) {
7708 assert(num_vgprs == 0);
7709 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7710 } else {
7711 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7712 }
7713 }
7714 assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7715
7716 num_params = 0;
7717 last_sgpr_param = 0;
7718 gprs = 0;
7719 while (gprs < num_sgprs + num_vgprs) {
7720 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7721 unsigned size;
7722
7723 param_types[num_params] = LLVMTypeOf(param);
7724 if (gprs < num_sgprs)
7725 last_sgpr_param = num_params;
7726 size = llvm_get_type_size(param_types[num_params]) / 4;
7727 num_params++;
7728
7729 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7730 assert(gprs + size <= num_sgprs + num_vgprs &&
7731 (gprs >= num_sgprs || gprs + size <= num_sgprs));
7732
7733 gprs += size;
7734 }
7735
7736 si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
7737
7738 if (is_merged_shader(ctx->shader))
7739 si_init_exec_full_mask(ctx);
7740
7741 /* Record the arguments of the function as if they were an output of
7742 * a previous part.
7743 */
7744 num_out = 0;
7745 num_out_sgpr = 0;
7746
7747 for (unsigned i = 0; i < num_params; ++i) {
7748 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7749 LLVMTypeRef param_type = LLVMTypeOf(param);
7750 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7751 unsigned size = llvm_get_type_size(param_type) / 4;
7752
7753 if (size == 1) {
7754 if (param_type != out_type)
7755 param = LLVMBuildBitCast(builder, param, out_type, "");
7756 out[num_out++] = param;
7757 } else {
7758 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7759
7760 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7761 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7762 param_type = ctx->i64;
7763 }
7764
7765 if (param_type != vector_type)
7766 param = LLVMBuildBitCast(builder, param, vector_type, "");
7767
7768 for (unsigned j = 0; j < size; ++j)
7769 out[num_out++] = LLVMBuildExtractElement(
7770 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7771 }
7772
7773 if (i <= last_sgpr_param)
7774 num_out_sgpr = num_out;
7775 }
7776
7777 memcpy(initial, out, sizeof(out));
7778 initial_num_out = num_out;
7779 initial_num_out_sgpr = num_out_sgpr;
7780
7781 /* Now chain the parts. */
7782 for (unsigned part = 0; part < num_parts; ++part) {
7783 LLVMValueRef in[48];
7784 LLVMValueRef ret;
7785 LLVMTypeRef ret_type;
7786 unsigned out_idx = 0;
7787
7788 num_params = LLVMCountParams(parts[part]);
7789 assert(num_params <= ARRAY_SIZE(param_types));
7790
7791 /* Merged shaders are executed conditionally depending
7792 * on the number of enabled threads passed in the input SGPRs. */
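/* The thread counts live in the merged wave info SGPR (the fourth
 * input SGPR, hence initial[3]): bits 0..7 for the first shader and
 * bits 8..15 for the second, each masked to 7 bits below.
 */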
7793 if (is_merged_shader(ctx->shader) &&
7794 (part == 0 || part == next_shader_first_part)) {
7795 LLVMValueRef ena, count = initial[3];
7796
7797 /* The thread count for the 2nd shader is at bit-offset 8. */
7798 if (part == next_shader_first_part) {
7799 count = LLVMBuildLShr(builder, count,
7800 LLVMConstInt(ctx->i32, 8, 0), "");
7801 }
7802 count = LLVMBuildAnd(builder, count,
7803 LLVMConstInt(ctx->i32, 0x7f, 0), "");
7804 ena = LLVMBuildICmp(builder, LLVMIntULT,
7805 ac_get_thread_id(&ctx->ac), count, "");
7806 lp_build_if(&if_state, &ctx->gallivm, ena);
7807 }
7808
7809 /* Derive arguments for the next part from outputs of the
7810 * previous one.
7811 */
7812 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7813 LLVMValueRef param;
7814 LLVMTypeRef param_type;
7815 bool is_sgpr;
7816 unsigned param_size;
7817 LLVMValueRef arg = NULL;
7818
7819 param = LLVMGetParam(parts[part], param_idx);
7820 param_type = LLVMTypeOf(param);
7821 param_size = llvm_get_type_size(param_type) / 4;
7822 is_sgpr = ac_is_sgpr_param(param);
7823
7824 if (is_sgpr) {
7825 #if HAVE_LLVM < 0x0400
7826 LLVMRemoveAttribute(param, LLVMByValAttribute);
7827 #else
7828 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7829 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7830 #endif
7831 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7832 }
7833
7834 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7835 assert(is_sgpr || out_idx >= num_out_sgpr);
7836
7837 if (param_size == 1)
7838 arg = out[out_idx];
7839 else
7840 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7841
7842 if (LLVMTypeOf(arg) != param_type) {
7843 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7844 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7845 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7846 } else {
7847 arg = LLVMBuildBitCast(builder, arg, param_type, "");
7848 }
7849 }
7850
7851 in[param_idx] = arg;
7852 out_idx += param_size;
7853 }
7854
7855 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7856
7857 if (is_merged_shader(ctx->shader) &&
7858 (part + 1 == next_shader_first_part ||
7859 part + 1 == num_parts)) {
7860 lp_build_endif(&if_state);
7861
7862 if (part + 1 == next_shader_first_part) {
7863 /* A barrier is required between 2 merged shaders. */
7864 si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
7865
7866 /* The second half of the merged shader should use
7867 * the inputs from the top-level (wrapper) function,
7868 * not the return value from the last call.
7869 *
7870 * That's because the last call was executed
7871 * conditionally, so we can't consume it in the main
7872 * block.
7873 */
7874 memcpy(out, initial, sizeof(initial));
7875 num_out = initial_num_out;
7876 num_out_sgpr = initial_num_out_sgpr;
7877 }
7878 continue;
7879 }
7880
7881 /* Extract the returned GPRs. */
7882 ret_type = LLVMTypeOf(ret);
7883 num_out = 0;
7884 num_out_sgpr = 0;
7885
7886 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7887 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7888
7889 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7890
7891 for (unsigned i = 0; i < ret_size; ++i) {
7892 LLVMValueRef val =
7893 LLVMBuildExtractValue(builder, ret, i, "");
7894
7895 out[num_out++] = val;
7896
7897 if (LLVMTypeOf(val) == ctx->i32) {
7898 assert(num_out_sgpr + 1 == num_out);
7899 num_out_sgpr = num_out;
7900 }
7901 }
7902 }
7903 }
7904
7905 LLVMBuildRetVoid(builder);
7906 }
7907
7908 int si_compile_tgsi_shader(struct si_screen *sscreen,
7909 LLVMTargetMachineRef tm,
7910 struct si_shader *shader,
7911 bool is_monolithic,
7912 struct pipe_debug_callback *debug)
7913 {
7914 struct si_shader_selector *sel = shader->selector;
7915 struct si_shader_context ctx;
7916 int r = -1;
7917
7918 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7919 * conversion fails. */
7920 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7921 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7922 tgsi_dump(sel->tokens, 0);
7923 si_dump_streamout(&sel->so);
7924 }
7925
7926 si_init_shader_ctx(&ctx, sscreen, tm);
7927 si_llvm_context_set_tgsi(&ctx, shader);
7928 ctx.separate_prolog = !is_monolithic;
7929
7930 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7931 sizeof(shader->info.vs_output_param_offset));
7932
7933 shader->info.uses_instanceid = sel->info.uses_instanceid;
7934
7935 ctx.load_system_value = declare_system_value;
7936
7937 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
7938 si_llvm_dispose(&ctx);
7939 return -1;
7940 }
7941
7942 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7943 LLVMValueRef parts[2];
7944 bool need_prolog = sel->vs_needs_prolog;
7945
7946 parts[1] = ctx.main_fn;
7947
7948 if (need_prolog) {
7949 union si_shader_part_key prolog_key;
7950 si_get_vs_prolog_key(&sel->info,
7951 shader->info.num_input_sgprs,
7952 &shader->key.part.vs.prolog,
7953 shader, &prolog_key);
7954 si_build_vs_prolog_function(&ctx, &prolog_key);
7955 parts[0] = ctx.main_fn;
7956 }
7957
7958 si_build_wrapper_function(&ctx, parts + !need_prolog,
7959 1 + need_prolog, need_prolog, 0);
7960 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7961 if (sscreen->b.chip_class >= GFX9) {
7962 struct si_shader_selector *ls = shader->key.part.tcs.ls;
7963 LLVMValueRef parts[4];
7964
7965 /* TCS main part */
7966 parts[2] = ctx.main_fn;
7967
7968 /* TCS epilog */
7969 union si_shader_part_key tcs_epilog_key;
7970 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
7971 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7972 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
7973 parts[3] = ctx.main_fn;
7974
7975 /* VS prolog */
7976 if (ls->vs_needs_prolog) {
7977 union si_shader_part_key vs_prolog_key;
7978 si_get_vs_prolog_key(&ls->info,
7979 shader->info.num_input_sgprs,
7980 &shader->key.part.tcs.ls_prolog,
7981 shader, &vs_prolog_key);
7982 vs_prolog_key.vs_prolog.is_monolithic = true;
7983 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
7984 parts[0] = ctx.main_fn;
7985 }
7986
7987 /* VS as LS main part */
7988 struct si_shader shader_ls = {};
7989 shader_ls.selector = ls;
7990 shader_ls.key.as_ls = 1;
7991 shader_ls.key.mono = shader->key.mono;
7992 shader_ls.key.opt = shader->key.opt;
7993 si_llvm_context_set_tgsi(&ctx, &shader_ls);
7994
7995 if (!si_compile_tgsi_main(&ctx, true)) {
7996 si_llvm_dispose(&ctx);
7997 return -1;
7998 }
7999 shader->info.uses_instanceid |= ls->info.uses_instanceid;
8000 parts[1] = ctx.main_fn;
8001
8002 /* Reset the shader context. */
8003 ctx.shader = shader;
8004 ctx.type = PIPE_SHADER_TESS_CTRL;
8005
8006 si_build_wrapper_function(&ctx,
8007 parts + !ls->vs_needs_prolog,
8008 4 - !ls->vs_needs_prolog, 0,
8009 ls->vs_needs_prolog ? 2 : 1);
8010 } else {
8011 LLVMValueRef parts[2];
8012 union si_shader_part_key epilog_key;
8013
8014 parts[0] = ctx.main_fn;
8015
8016 memset(&epilog_key, 0, sizeof(epilog_key));
8017 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8018 si_build_tcs_epilog_function(&ctx, &epilog_key);
8019 parts[1] = ctx.main_fn;
8020
8021 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
8022 }
8023 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
8024 if (ctx.screen->b.chip_class >= GFX9) {
8025 struct si_shader_selector *es = shader->key.part.gs.es;
8026 LLVMValueRef es_prolog = NULL;
8027 LLVMValueRef es_main = NULL;
8028 LLVMValueRef gs_prolog = NULL;
8029 LLVMValueRef gs_main = ctx.main_fn;
8030
8031 /* GS prolog */
8032 union si_shader_part_key gs_prolog_key;
8033 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
8034 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8035 gs_prolog_key.gs_prolog.is_monolithic = true;
8036 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
8037 gs_prolog = ctx.main_fn;
8038
8039 /* ES prolog */
8040 if (es->vs_needs_prolog) {
8041 union si_shader_part_key vs_prolog_key;
8042 si_get_vs_prolog_key(&es->info,
8043 shader->info.num_input_sgprs,
8044 &shader->key.part.gs.vs_prolog,
8045 shader, &vs_prolog_key);
8046 vs_prolog_key.vs_prolog.is_monolithic = true;
8047 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8048 es_prolog = ctx.main_fn;
8049 }
8050
8051 /* ES main part */
8052 struct si_shader shader_es = {};
8053 shader_es.selector = es;
8054 shader_es.key.as_es = 1;
8055 shader_es.key.mono = shader->key.mono;
8056 shader_es.key.opt = shader->key.opt;
8057 si_llvm_context_set_tgsi(&ctx, &shader_es);
8058
8059 if (!si_compile_tgsi_main(&ctx, true)) {
8060 si_llvm_dispose(&ctx);
8061 return -1;
8062 }
8063 shader->info.uses_instanceid |= es->info.uses_instanceid;
8064 es_main = ctx.main_fn;
8065
8066 /* Reset the shader context. */
8067 ctx.shader = shader;
8068 ctx.type = PIPE_SHADER_GEOMETRY;
8069
8070 /* Prepare the array of shader parts. */
8071 LLVMValueRef parts[4];
8072 unsigned num_parts = 0, main_part, next_first_part;
8073
8074 if (es_prolog)
8075 parts[num_parts++] = es_prolog;
8076
8077 parts[main_part = num_parts++] = es_main;
8078 parts[next_first_part = num_parts++] = gs_prolog;
8079 parts[num_parts++] = gs_main;
8080
8081 si_build_wrapper_function(&ctx, parts, num_parts,
8082 main_part, next_first_part);
8083 } else {
8084 LLVMValueRef parts[2];
8085 union si_shader_part_key prolog_key;
8086
8087 parts[1] = ctx.main_fn;
8088
8089 memset(&prolog_key, 0, sizeof(prolog_key));
8090 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8091 si_build_gs_prolog_function(&ctx, &prolog_key);
8092 parts[0] = ctx.main_fn;
8093
8094 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
8095 }
8096 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
8097 LLVMValueRef parts[3];
8098 union si_shader_part_key prolog_key;
8099 union si_shader_part_key epilog_key;
8100 bool need_prolog;
8101
8102 si_get_ps_prolog_key(shader, &prolog_key, false);
8103 need_prolog = si_need_ps_prolog(&prolog_key);
8104
8105 parts[need_prolog ? 1 : 0] = ctx.main_fn;
8106
8107 if (need_prolog) {
8108 si_build_ps_prolog_function(&ctx, &prolog_key);
8109 parts[0] = ctx.main_fn;
8110 }
8111
8112 si_get_ps_epilog_key(shader, &epilog_key);
8113 si_build_ps_epilog_function(&ctx, &epilog_key);
8114 parts[need_prolog ? 2 : 1] = ctx.main_fn;
8115
8116 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
8117 need_prolog ? 1 : 0, 0);
8118 }
8119
8120 /* Dump LLVM IR before any optimization passes */
8121 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
8122 r600_can_dump_shader(&sscreen->b, ctx.type))
8123 LLVMDumpModule(ctx.gallivm.module);
8124
8125 si_llvm_finalize_module(&ctx,
8126 r600_extra_shader_checks(&sscreen->b, ctx.type));
8127
8128 /* Post-optimization transformations and analysis. */
8129 si_eliminate_const_vs_outputs(&ctx);
8130
8131 if ((debug && debug->debug_message) ||
8132 r600_can_dump_shader(&sscreen->b, ctx.type))
8133 si_count_scratch_private_memory(&ctx);
8134
8135 /* Compile to bytecode. */
8136 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
8137 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
8138 si_llvm_dispose(&ctx);
8139 if (r) {
8140 fprintf(stderr, "LLVM failed to compile shader\n");
8141 return r;
8142 }
8143
8144 /* Validate SGPR and VGPR usage for compute shaders to detect compiler
8145 * bugs where usage exceeds the hw limits; LLVM 3.9svn has such a bug.
8146 */
8147 if (sel->type == PIPE_SHADER_COMPUTE) {
8148 unsigned wave_size = 64;
8149 unsigned max_vgprs = 256;
8150 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
8151 unsigned max_sgprs_per_wave = 128;
8152 unsigned max_block_threads = si_get_max_workgroup_size(shader);
8153 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
8154 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
8155
8156 max_vgprs = max_vgprs / min_waves_per_simd;
8157 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
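/* Worked example (illustrative numbers only): a 1024-thread block on
 * VI needs DIV_ROUND_UP(1024, 64) = 16 waves per CU, i.e. 4 waves per
 * SIMD, leaving 256 / 4 = 64 VGPRs and MIN2(800 / 4, 128) = 128 SGPRs
 * per wave.
 */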
8158
8159 if (shader->config.num_sgprs > max_sgprs ||
8160 shader->config.num_vgprs > max_vgprs) {
8161 fprintf(stderr, "LLVM failed to compile a shader correctly: "
8162 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
8163 shader->config.num_sgprs, shader->config.num_vgprs,
8164 max_sgprs, max_vgprs);
8165
8166 /* Just terminate the process, because dependent
8167 * shaders can hang due to bad input data, but use
8168 * the env var to allow shader-db to work.
8169 */
8170 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
8171 abort();
8172 }
8173 }
8174
8175 /* Add the scratch offset to input SGPRs. */
8176 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
8177 shader->info.num_input_sgprs += 1; /* scratch byte offset */
8178
8179 /* Calculate the number of fragment input VGPRs. */
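/* The checks below mirror the order in which the hardware assigns
 * PS input VGPRs based on the enabled SPI_PS_INPUT_ADDR bits.
 */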
8180 if (ctx.type == PIPE_SHADER_FRAGMENT) {
8181 shader->info.num_input_vgprs = 0;
8182 shader->info.face_vgpr_index = -1;
8183
8184 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8185 shader->info.num_input_vgprs += 2;
8186 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
8187 shader->info.num_input_vgprs += 2;
8188 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
8189 shader->info.num_input_vgprs += 2;
8190 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
8191 shader->info.num_input_vgprs += 3;
8192 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8193 shader->info.num_input_vgprs += 2;
8194 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
8195 shader->info.num_input_vgprs += 2;
8196 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
8197 shader->info.num_input_vgprs += 2;
8198 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
8199 shader->info.num_input_vgprs += 1;
8200 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
8201 shader->info.num_input_vgprs += 1;
8202 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
8203 shader->info.num_input_vgprs += 1;
8204 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
8205 shader->info.num_input_vgprs += 1;
8206 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
8207 shader->info.num_input_vgprs += 1;
8208 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
8209 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
8210 shader->info.num_input_vgprs += 1;
8211 }
8212 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
8213 shader->info.num_input_vgprs += 1;
8214 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
8215 shader->info.num_input_vgprs += 1;
8216 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
8217 shader->info.num_input_vgprs += 1;
8218 }
8219
8220 return 0;
8221 }
8222
8223 /**
8224 * Create, compile and return a shader part (prolog or epilog).
8225 *
8226 * \param sscreen screen
8227 * \param list list of shader parts of the same category
8228 * \param type shader type
8229 * \param prolog whether the part being requested is a prolog
8230 * \param key shader part key
8231 * \param tm LLVM target machine
8232 * \param debug debug callback
8233 * \param build the callback responsible for building the main function
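* \param name human-readable part name, used for debug dumps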
8234 * \return non-NULL on success
8235 */
8236 static struct si_shader_part *
8237 si_get_shader_part(struct si_screen *sscreen,
8238 struct si_shader_part **list,
8239 enum pipe_shader_type type,
8240 bool prolog,
8241 union si_shader_part_key *key,
8242 LLVMTargetMachineRef tm,
8243 struct pipe_debug_callback *debug,
8244 void (*build)(struct si_shader_context *,
8245 union si_shader_part_key *),
8246 const char *name)
8247 {
8248 struct si_shader_part *result;
8249
8250 mtx_lock(&sscreen->shader_parts_mutex);
8251
8252 /* Find existing. */
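/* Keys are compared with memcmp, so callers must zero-initialize the
 * whole key union (including padding) before filling it in.
 */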
8253 for (result = *list; result; result = result->next) {
8254 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
8255 mtx_unlock(&sscreen->shader_parts_mutex);
8256 return result;
8257 }
8258 }
8259
8260 /* Compile a new one. */
8261 result = CALLOC_STRUCT(si_shader_part);
8262 result->key = *key;
8263
8264 struct si_shader shader = {};
8265 struct si_shader_context ctx;
8266 struct gallivm_state *gallivm = &ctx.gallivm;
8267
8268 si_init_shader_ctx(&ctx, sscreen, tm);
8269 ctx.shader = &shader;
8270 ctx.type = type;
8271
8272 switch (type) {
8273 case PIPE_SHADER_VERTEX:
8274 break;
8275 case PIPE_SHADER_TESS_CTRL:
8276 assert(!prolog);
8277 shader.key.part.tcs.epilog = key->tcs_epilog.states;
8278 break;
8279 case PIPE_SHADER_GEOMETRY:
8280 assert(prolog);
8281 break;
8282 case PIPE_SHADER_FRAGMENT:
8283 if (prolog)
8284 shader.key.part.ps.prolog = key->ps_prolog.states;
8285 else
8286 shader.key.part.ps.epilog = key->ps_epilog.states;
8287 break;
8288 default:
8289 unreachable("bad shader part");
8290 }
8291
8292 build(&ctx, key);
8293
8294 /* Compile. */
8295 si_llvm_finalize_module(&ctx,
8296 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
8297
8298 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
8299 gallivm->module, debug, ctx.type, name)) {
8300 FREE(result);
8301 result = NULL;
8302 goto out;
8303 }
8304
8305 result->next = *list;
8306 *list = result;
8307
8308 out:
8309 si_llvm_dispose(&ctx);
8310 mtx_unlock(&sscreen->shader_parts_mutex);
8311 return result;
8312 }
8313
8314 /**
8315 * Build the vertex shader prolog function.
8316 *
8317 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
8318 * All inputs are returned unmodified. The vertex load indices are
8319 * stored after them; the API VS uses them to fetch its inputs.
8320 *
8321 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
8322 * input_v0,
8323 * input_v1,
8324 * input_v2,
8325 * input_v3,
8326 * (VertexID + BaseVertex),
8327 * (InstanceID + StartInstance),
8328 * (InstanceID / 2 + StartInstance)
8329 */
8330 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
8331 union si_shader_part_key *key)
8332 {
8333 struct gallivm_state *gallivm = &ctx->gallivm;
8334 LLVMTypeRef *params, *returns;
8335 LLVMValueRef ret, func;
8336 int last_sgpr, num_params, num_returns, i;
8337 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
8338 key->vs_prolog.num_merged_next_stage_vgprs;
8339 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
8340 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
8341 num_input_vgprs;
8342 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
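/* Merged GFX9 shaders have 8 system SGPRs in front of the user
 * SGPRs, so user-SGPR indices must be shifted by 8 below.
 */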
8343
8344 ctx->param_vertex_id = first_vs_vgpr;
8345 ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
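/* In LS mode an extra system VGPR sits between VertexID and
 * InstanceID, hence +2 instead of +1.
 */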
8346
8347 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
8348 params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
8349 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
8350 sizeof(LLVMTypeRef));
8351 num_params = 0;
8352 num_returns = 0;
8353
8354 /* Declare input and output SGPRs. */
8356 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8357 params[num_params++] = ctx->i32;
8358 returns[num_returns++] = ctx->i32;
8359 }
8360 last_sgpr = num_params - 1;
8361
8362 /* Preloaded VGPRs (outputs must be floats) */
8363 for (i = 0; i < num_input_vgprs; i++) {
8364 params[num_params++] = ctx->i32;
8365 returns[num_returns++] = ctx->f32;
8366 }
8367
8368 /* Vertex load indices. */
8369 for (i = 0; i <= key->vs_prolog.last_input; i++)
8370 returns[num_returns++] = ctx->f32;
8371
8372 /* Create the function. */
8373 si_create_function(ctx, "vs_prolog", returns, num_returns, params,
8374 num_params, last_sgpr);
8375 func = ctx->main_fn;
8376
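/* A non-monolithic prolog of a merged GFX9 shader must set EXEC
 * itself: derive it from the merged wave info SGPR (input 3) so
 * that only this stage's lanes are enabled.
 */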
8377 if (key->vs_prolog.num_merged_next_stage_vgprs &&
8378 !key->vs_prolog.is_monolithic)
8379 si_init_exec_from_input(ctx, 3, 0);
8380
8381 /* Copy inputs to outputs. This should be a no-op, as the registers match,
8382 * but it will prevent the compiler from overwriting them unintentionally.
8383 */
8384 ret = ctx->return_value;
8385 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8386 LLVMValueRef p = LLVMGetParam(func, i);
8387 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8388 }
8389 for (; i < num_params; i++) {
8390 LLVMValueRef p = LLVMGetParam(func, i);
8391 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
8392 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8393 }
8394
8395 /* Compute vertex load indices from instance divisors. */
8396 for (i = 0; i <= key->vs_prolog.last_input; i++) {
8397 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
8398 LLVMValueRef index;
8399
8400 if (divisor) {
8401 /* InstanceID / Divisor + StartInstance */
8402 index = get_instance_index_for_fetch(ctx,
8403 user_sgpr_base +
8404 SI_SGPR_START_INSTANCE,
8405 divisor);
8406 } else {
8407 /* VertexID + BaseVertex */
8408 index = LLVMBuildAdd(gallivm->builder,
8409 LLVMGetParam(func, ctx->param_vertex_id),
8410 LLVMGetParam(func, user_sgpr_base +
8411 SI_SGPR_BASE_VERTEX), "");
8412 }
8413
8414 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
8415 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
8416 num_params++, "");
8417 }
8418
8419 si_llvm_build_ret(ctx, ret);
8420 }
8421
8422 static bool si_get_vs_prolog(struct si_screen *sscreen,
8423 LLVMTargetMachineRef tm,
8424 struct si_shader *shader,
8425 struct pipe_debug_callback *debug,
8426 struct si_shader *main_part,
8427 const struct si_vs_prolog_bits *key)
8428 {
8429 struct si_shader_selector *vs = main_part->selector;
8430
8431 /* The prolog is a no-op if there are no inputs. */
8432 if (!vs->vs_needs_prolog)
8433 return true;
8434
8435 /* Get the prolog. */
8436 union si_shader_part_key prolog_key;
8437 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
8438 key, shader, &prolog_key);
8439
8440 shader->prolog =
8441 si_get_shader_part(sscreen, &sscreen->vs_prologs,
8442 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
8443 debug, si_build_vs_prolog_function,
8444 "Vertex Shader Prolog");
8445 return shader->prolog != NULL;
8446 }
8447
8448 /**
8449 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
8450 */
8451 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
8452 LLVMTargetMachineRef tm,
8453 struct si_shader *shader,
8454 struct pipe_debug_callback *debug)
8455 {
8456 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
8457 &shader->key.part.vs.prolog);
8458 }
8459
8460 /**
8461 * Compile the TCS epilog function. This writes tessellation factors to memory
8462 * based on the output primitive type of the tessellator (determined by TES).
8463 */
8464 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
8465 union si_shader_part_key *key)
8466 {
8467 struct gallivm_state *gallivm = &ctx->gallivm;
8468 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8469 LLVMTypeRef params[32];
8470 LLVMValueRef func;
8471 int last_sgpr, num_params = 0;
8472
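/* This list must mirror the merged LS-HS main function's argument
 * list exactly; only the SGPRs captured in ctx->param_* are read
 * here, the rest are placeholders that keep the indices in sync.
 */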
8473 if (ctx->screen->b.chip_class >= GFX9) {
8474 params[num_params++] = ctx->i64;
8475 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8476 params[num_params++] = ctx->i32; /* wave info */
8477 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8478 params[num_params++] = ctx->i32;
8479 params[num_params++] = ctx->i32;
8480 params[num_params++] = ctx->i32;
8481 params[num_params++] = ctx->i64;
8482 params[num_params++] = ctx->i64;
8483 params[num_params++] = ctx->i64;
8484 params[num_params++] = ctx->i64;
8485 params[num_params++] = ctx->i64;
8486 params[num_params++] = ctx->i64;
8487 params[num_params++] = ctx->i32;
8488 params[num_params++] = ctx->i32;
8489 params[num_params++] = ctx->i32;
8490 params[num_params++] = ctx->i32;
8491 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8492 params[num_params++] = ctx->i32;
8493 params[num_params++] = ctx->i32;
8494 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
8495 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
8496 } else {
8497 params[num_params++] = ctx->i64;
8498 params[num_params++] = ctx->i64;
8499 params[num_params++] = ctx->i64;
8500 params[num_params++] = ctx->i64;
8501 params[num_params++] = ctx->i64;
8502 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8503 params[num_params++] = ctx->i32;
8504 params[num_params++] = ctx->i32;
8505 params[num_params++] = ctx->i32;
8506 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
8507 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
8508 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8509 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8510 }
8511 last_sgpr = num_params - 1;
8512
8513 params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
8514 params[num_params++] = ctx->i32; /* invocation ID within the patch */
8515 params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
8516
8517 /* Create the function. */
8518 si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
8519 declare_lds_as_pointer(ctx);
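/* The main TCS stages the tess factors in LDS; si_write_tess_factors
 * loads them from the LDS offset in the last VGPR and writes them
 * out to memory.
 */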
8520 func = ctx->main_fn;
8521
8522 si_write_tess_factors(bld_base,
8523 LLVMGetParam(func, last_sgpr + 1),
8524 LLVMGetParam(func, last_sgpr + 2),
8525 LLVMGetParam(func, last_sgpr + 3));
8526
8527 LLVMBuildRetVoid(gallivm->builder);
8528 }
8529
8530 /**
8531 * Select and compile (or reuse) TCS parts (epilog).
8532 */
8533 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8534 LLVMTargetMachineRef tm,
8535 struct si_shader *shader,
8536 struct pipe_debug_callback *debug)
8537 {
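/* On GFX9, LS and HS run as a single merged hardware stage, so the
 * TCS variant also pulls in the VS (LS) main part and a matching
 * prolog for it.
 */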
8538 if (sscreen->b.chip_class >= GFX9) {
8539 struct si_shader *ls_main_part =
8540 shader->key.part.tcs.ls->main_shader_part_ls;
8541
8542 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8543 &shader->key.part.tcs.ls_prolog))
8544 return false;
8545
8546 shader->previous_stage = ls_main_part;
8547 }
8548
8549 /* Get the epilog. */
8550 union si_shader_part_key epilog_key;
8551 memset(&epilog_key, 0, sizeof(epilog_key));
8552 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8553
8554 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8555 PIPE_SHADER_TESS_CTRL, false,
8556 &epilog_key, tm, debug,
8557 si_build_tcs_epilog_function,
8558 "Tessellation Control Shader Epilog");
8559 return shader->epilog != NULL;
8560 }
8561
8562 /**
8563 * Select and compile (or reuse) GS parts (prolog).
8564 */
8565 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8566 LLVMTargetMachineRef tm,
8567 struct si_shader *shader,
8568 struct pipe_debug_callback *debug)
8569 {
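/* Likewise merged on GFX9: ES and GS form one hardware stage, so
 * pull in the ES main part (plus a VS prolog if ES is a vertex
 * shader).
 */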
8570 if (sscreen->b.chip_class >= GFX9) {
8571 struct si_shader *es_main_part =
8572 shader->key.part.gs.es->main_shader_part_es;
8573
8574 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
8575 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
8576 &shader->key.part.gs.vs_prolog))
8577 return false;
8578
8579 shader->previous_stage = es_main_part;
8580 }
8581
8582 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8583 return true;
8584
8585 union si_shader_part_key prolog_key;
8586 memset(&prolog_key, 0, sizeof(prolog_key));
8587 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8588
8589 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8590 PIPE_SHADER_GEOMETRY, true,
8591 &prolog_key, tm, debug,
8592 si_build_gs_prolog_function,
8593 "Geometry Shader Prolog");
8594 return shader->prolog2 != NULL;
8595 }
8596
8597 /**
8598 * Build the pixel shader prolog function. This handles:
8599 * - two-side color selection and interpolation
8600 * - overriding interpolation parameters for the API PS
8601 * - polygon stippling
8602 *
8603 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
8604 * overridden by other states (e.g. per-sample interpolation).
8605 * Interpolated colors are stored after the preloaded VGPRs.
8606 */
8607 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
8608 union si_shader_part_key *key)
8609 {
8610 struct gallivm_state *gallivm = &ctx->gallivm;
8611 LLVMTypeRef *params;
8612 LLVMValueRef ret, func;
8613 int last_sgpr, num_params, num_returns, i, num_color_channels;
8614
8615 assert(si_need_ps_prolog(key));
8616
8617 /* Number of inputs + 8 color elements. */
8618 params = alloca((key->ps_prolog.num_input_sgprs +
8619 key->ps_prolog.num_input_vgprs + 8) *
8620 sizeof(LLVMTypeRef));
8621
8622 /* Declare inputs. */
8623 num_params = 0;
8624 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8625 params[num_params++] = ctx->i32;
8626 last_sgpr = num_params - 1;
8627
8628 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8629 params[num_params++] = ctx->f32;
8630
8631 /* Declare outputs (same as inputs + add colors if needed) */
8632 num_returns = num_params;
8633 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8634 for (i = 0; i < num_color_channels; i++)
8635 params[num_returns++] = ctx->f32;
8636
8637 /* Create the function. The "returns" array is params itself: the
outputs are the inputs plus the color channels appended above. */
8638 si_create_function(ctx, "ps_prolog", params, num_returns, params,
8639 num_params, last_sgpr);
8640 func = ctx->main_fn;
8641
8642 /* Copy inputs to outputs. This should be a no-op, as the registers match,
8643 * but it will prevent the compiler from overwriting them unintentionally.
8644 */
8645 ret = ctx->return_value;
8646 for (i = 0; i < num_params; i++) {
8647 LLVMValueRef p = LLVMGetParam(func, i);
8648 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8649 }
8650
8651 /* Polygon stippling. */
8652 if (key->ps_prolog.states.poly_stipple) {
8653 /* POS_FIXED_PT is always last. */
8654 unsigned pos = key->ps_prolog.num_input_sgprs +
8655 key->ps_prolog.num_input_vgprs - 1;
8656 LLVMValueRef ptr[2], list;
8657
8658 /* Get the pointer to rw buffers. */
8659 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8660 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8661 list = lp_build_gather_values(gallivm, ptr, 2);
8662 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8663 list = LLVMBuildIntToPtr(gallivm->builder, list,
8664 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
8665
8666 si_llvm_emit_polygon_stipple(ctx, list, pos);
8667 }
8668
8669 if (key->ps_prolog.states.bc_optimize_for_persp ||
8670 key->ps_prolog.states.bc_optimize_for_linear) {
8671 unsigned i, base = key->ps_prolog.num_input_sgprs;
8672 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8673
8674 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8675 * The hw doesn't compute CENTROID if the whole wave only
8676 * contains fully-covered quads.
8677 *
8678 * PRIM_MASK is after user SGPRs.
8679 */
8680 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8681 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8682 LLVMConstInt(ctx->i32, 31, 0), "");
8683 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8684 ctx->i1, "");
8685
8686 if (key->ps_prolog.states.bc_optimize_for_persp) {
8687 /* Read PERSP_CENTER. */
8688 for (i = 0; i < 2; i++)
8689 center[i] = LLVMGetParam(func, base + 2 + i);
8690 /* Read PERSP_CENTROID. */
8691 for (i = 0; i < 2; i++)
8692 centroid[i] = LLVMGetParam(func, base + 4 + i);
8693 /* Select PERSP_CENTROID. */
8694 for (i = 0; i < 2; i++) {
8695 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8696 center[i], centroid[i], "");
8697 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8698 tmp, base + 4 + i, "");
8699 }
8700 }
8701 if (key->ps_prolog.states.bc_optimize_for_linear) {
8702 /* Read LINEAR_CENTER. */
8703 for (i = 0; i < 2; i++)
8704 center[i] = LLVMGetParam(func, base + 8 + i);
8705 /* Read LINEAR_CENTROID. */
8706 for (i = 0; i < 2; i++)
8707 centroid[i] = LLVMGetParam(func, base + 10 + i);
8708 /* Select LINEAR_CENTROID. */
8709 for (i = 0; i < 2; i++) {
8710 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8711 center[i], centroid[i], "");
8712 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8713 tmp, base + 10 + i, "");
8714 }
8715 }
8716 }
8717
8718 /* Force per-sample interpolation. */
8719 if (key->ps_prolog.states.force_persp_sample_interp) {
8720 unsigned i, base = key->ps_prolog.num_input_sgprs;
8721 LLVMValueRef persp_sample[2];
8722
8723 /* Read PERSP_SAMPLE. */
8724 for (i = 0; i < 2; i++)
8725 persp_sample[i] = LLVMGetParam(func, base + i);
8726 /* Overwrite PERSP_CENTER. */
8727 for (i = 0; i < 2; i++)
8728 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8729 persp_sample[i], base + 2 + i, "");
8730 /* Overwrite PERSP_CENTROID. */
8731 for (i = 0; i < 2; i++)
8732 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8733 persp_sample[i], base + 4 + i, "");
8734 }
8735 if (key->ps_prolog.states.force_linear_sample_interp) {
8736 unsigned i, base = key->ps_prolog.num_input_sgprs;
8737 LLVMValueRef linear_sample[2];
8738
8739 /* Read LINEAR_SAMPLE. */
8740 for (i = 0; i < 2; i++)
8741 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8742 /* Overwrite LINEAR_CENTER. */
8743 for (i = 0; i < 2; i++)
8744 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8745 linear_sample[i], base + 8 + i, "");
8746 /* Overwrite LINEAR_CENTROID. */
8747 for (i = 0; i < 2; i++)
8748 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8749 linear_sample[i], base + 10 + i, "");
8750 }
8751
8752 /* Force center interpolation. */
8753 if (key->ps_prolog.states.force_persp_center_interp) {
8754 unsigned i, base = key->ps_prolog.num_input_sgprs;
8755 LLVMValueRef persp_center[2];
8756
8757 /* Read PERSP_CENTER. */
8758 for (i = 0; i < 2; i++)
8759 persp_center[i] = LLVMGetParam(func, base + 2 + i);
8760 /* Overwrite PERSP_SAMPLE. */
8761 for (i = 0; i < 2; i++)
8762 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8763 persp_center[i], base + i, "");
8764 /* Overwrite PERSP_CENTROID. */
8765 for (i = 0; i < 2; i++)
8766 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8767 persp_center[i], base + 4 + i, "");
8768 }
8769 if (key->ps_prolog.states.force_linear_center_interp) {
8770 unsigned i, base = key->ps_prolog.num_input_sgprs;
8771 LLVMValueRef linear_center[2];
8772
8773 /* Read LINEAR_CENTER. */
8774 for (i = 0; i < 2; i++)
8775 linear_center[i] = LLVMGetParam(func, base + 8 + i);
8776 /* Overwrite LINEAR_SAMPLE. */
8777 for (i = 0; i < 2; i++)
8778 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8779 linear_center[i], base + 6 + i, "");
8780 /* Overwrite LINEAR_CENTROID. */
8781 for (i = 0; i < 2; i++)
8782 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8783 linear_center[i], base + 10 + i, "");
8784 }
8785
8786 /* Interpolate colors. */
8787 for (i = 0; i < 2; i++) {
8788 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8789 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8790 key->ps_prolog.face_vgpr_index;
8791 LLVMValueRef interp[2], color[4];
8792 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8793
8794 if (!writemask)
8795 continue;
8796
8797 /* If the interpolation qualifier is not CONSTANT (-1). */
8798 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8799 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8800 key->ps_prolog.color_interp_vgpr_index[i];
8801
8802 /* Get the (i,j) updated by bc_optimize handling. */
8803 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8804 interp_vgpr, "");
8805 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8806 interp_vgpr + 1, "");
8807 interp_ij = lp_build_gather_values(gallivm, interp, 2);
8808 }
8809
8810 /* Use the absolute location of the input. */
8811 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8812
8813 if (key->ps_prolog.states.color_two_side) {
8814 face = LLVMGetParam(func, face_vgpr);
8815 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8816 }
8817
8818 interp_fs_input(ctx,
8819 key->ps_prolog.color_attr_index[i],
8820 TGSI_SEMANTIC_COLOR, i,
8821 key->ps_prolog.num_interp_inputs,
8822 key->ps_prolog.colors_read, interp_ij,
8823 prim_mask, face, color);
8824
8825 while (writemask) {
8826 unsigned chan = u_bit_scan(&writemask);
8827 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8828 num_params++, "");
8829 }
8830 }
8831
8832 /* Tell LLVM to insert the WQM instruction sequence when needed. */
8833 if (key->ps_prolog.wqm) {
8834 LLVMAddTargetDependentFunctionAttr(func,
8835 "amdgpu-ps-wqm-outputs", "");
8836 }
8837
8838 si_llvm_build_ret(ctx, ret);
8839 }
8840
8841 /**
8842 * Build the pixel shader epilog function. This handles everything that must be
8843 * emulated for pixel shader exports (alpha test, format conversions, etc.).
8844 */
8845 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8846 union si_shader_part_key *key)
8847 {
8848 struct gallivm_state *gallivm = &ctx->gallivm;
8849 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
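/* 16 slots for the input SGPRs + 8 MRTs * 4 color channels + depth,
 * stencil and sample mask. */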
8850 LLVMTypeRef params[16+8*4+3];
8851 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8852 int last_sgpr, num_params = 0, i;
8853 struct si_ps_exports exp = {};
8854
8855 /* Declare input SGPRs. */
8856 params[ctx->param_rw_buffers = num_params++] = ctx->i64;
8857 params[ctx->param_const_buffers = num_params++] = ctx->i64;
8858 params[ctx->param_samplers = num_params++] = ctx->i64;
8859 params[ctx->param_images = num_params++] = ctx->i64;
8860 params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8861 assert(num_params == SI_PARAM_ALPHA_REF);
8862 params[SI_PARAM_ALPHA_REF] = ctx->f32;
8863 last_sgpr = SI_PARAM_ALPHA_REF;
8864
8865 /* Declare input VGPRs. */
8866 num_params = (last_sgpr + 1) +
8867 util_bitcount(key->ps_epilog.colors_written) * 4 +
8868 key->ps_epilog.writes_z +
8869 key->ps_epilog.writes_stencil +
8870 key->ps_epilog.writes_samplemask;
8871
8872 num_params = MAX2(num_params,
8873 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
8874
8875 assert(num_params <= ARRAY_SIZE(params));
8876
8877 for (i = last_sgpr + 1; i < num_params; i++)
8878 params[i] = ctx->f32;
8879
8880 /* Create the function. */
8881 si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
8882 /* Disable elimination of unused inputs. */
8883 si_llvm_add_attribute(ctx->main_fn,
8884 "InitialPSInputAddr", 0xffffff);
8885
8886 /* Process colors. */
8887 unsigned vgpr = last_sgpr + 1;
8888 unsigned colors_written = key->ps_epilog.colors_written;
8889 int last_color_export = -1;
8890
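/* The final export of a shader must have the "done" bit set, so
 * determine up front which color export (if any) will be last;
 * when Z/stencil/samplemask are written, the depth export takes
 * that role instead.
 */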
8891 /* Find the last color export. */
8892 if (!key->ps_epilog.writes_z &&
8893 !key->ps_epilog.writes_stencil &&
8894 !key->ps_epilog.writes_samplemask) {
8895 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
8896
8897 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
8898 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
8899 /* Just set this if any of the colorbuffers are enabled. */
8900 if (spi_format &
8901 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
8902 last_color_export = 0;
8903 } else {
8904 for (i = 0; i < 8; i++)
8905 if (colors_written & (1 << i) &&
8906 (spi_format >> (i * 4)) & 0xf)
8907 last_color_export = i;
8908 }
8909 }
8910
8911 while (colors_written) {
8912 LLVMValueRef color[4];
8913 int mrt = u_bit_scan(&colors_written);
8914
8915 for (i = 0; i < 4; i++)
8916 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
8917
8918 si_export_mrt_color(bld_base, color, mrt,
8919 num_params - 1,
8920 mrt == last_color_export, &exp);
8921 }
8922
8923 /* Process depth, stencil, samplemask. */
8924 if (key->ps_epilog.writes_z)
8925 depth = LLVMGetParam(ctx->main_fn, vgpr++);
8926 if (key->ps_epilog.writes_stencil)
8927 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
8928 if (key->ps_epilog.writes_samplemask)
8929 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
8930
8931 if (depth || stencil || samplemask)
8932 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
8933 else if (last_color_export == -1)
8934 si_export_null(bld_base);
8935
8936 if (exp.num)
8937 si_emit_ps_exports(ctx, &exp);
8938
8939 /* Compile. */
8940 LLVMBuildRetVoid(gallivm->builder);
8941 }
8942
8943 /**
8944 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
8945 */
8946 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
8947 LLVMTargetMachineRef tm,
8948 struct si_shader *shader,
8949 struct pipe_debug_callback *debug)
8950 {
8951 union si_shader_part_key prolog_key;
8952 union si_shader_part_key epilog_key;
8953
8954 /* Get the prolog. */
8955 si_get_ps_prolog_key(shader, &prolog_key, true);
8956
8957 /* The prolog is a no-op if these aren't set. */
8958 if (si_need_ps_prolog(&prolog_key)) {
8959 shader->prolog =
8960 si_get_shader_part(sscreen, &sscreen->ps_prologs,
8961 PIPE_SHADER_FRAGMENT, true,
8962 &prolog_key, tm, debug,
8963 si_build_ps_prolog_function,
8964 "Fragment Shader Prolog");
8965 if (!shader->prolog)
8966 return false;
8967 }
8968
8969 /* Get the epilog. */
8970 si_get_ps_epilog_key(shader, &epilog_key);
8971
8972 shader->epilog =
8973 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
8974 PIPE_SHADER_FRAGMENT, false,
8975 &epilog_key, tm, debug,
8976 si_build_ps_epilog_function,
8977 "Fragment Shader Epilog");
8978 if (!shader->epilog)
8979 return false;
8980
8981 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
8982 if (shader->key.part.ps.prolog.poly_stipple) {
8983 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
8984 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
8985 }
8986
8987 /* Set up the enable bits for per-sample shading if needed. */
8988 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
8989 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8990 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8991 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
8992 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8993 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
8994 }
8995 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
8996 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8997 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8998 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
8999 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9000 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
9001 }
9002 if (shader->key.part.ps.prolog.force_persp_center_interp &&
9003 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9004 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9005 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
9006 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
9007 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9008 }
9009 if (shader->key.part.ps.prolog.force_linear_center_interp &&
9010 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
9011 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
9012 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
9013 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
9014 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9015 }
9016
9017 /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
9018 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
9019 !(shader->config.spi_ps_input_ena & 0xf)) {
9020 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9021 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
9022 }
9023
9024 /* At least one pair of interpolation weights must be enabled. */
9025 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
9026 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9027 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
9028 }
9029
9030 /* The sample mask input is always enabled, because the API shader always
9031 * passes it through to the epilog. Disable it here if it's unused.
9032 */
9033 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
9034 !shader->selector->info.reads_samplemask)
9035 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
9036
9037 return true;
9038 }
9039
9040 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
9041 unsigned *lds_size)
9042 {
9043 /* SPI barrier management bug:
9044 * Make sure at least 4K of LDS is in use to avoid it.
9045 * It only affects workgroups larger than one wavefront.
9046 */
9047 if (sscreen->b.family == CHIP_BONAIRE ||
9048 sscreen->b.family == CHIP_KABINI ||
9049 sscreen->b.family == CHIP_MULLINS)
9050 *lds_size = MAX2(*lds_size, 8);
9051 }
9052
9053 static void si_fix_resource_usage(struct si_screen *sscreen,
9054 struct si_shader *shader)
9055 {
9056 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
9057
9058 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
9059
9060 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
9061 si_get_max_workgroup_size(shader) > 64) {
9062 si_multiwave_lds_size_workaround(sscreen,
9063 &shader->config.lds_size);
9064 }
9065 }
9066
9067 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
9068 struct si_shader *shader,
9069 struct pipe_debug_callback *debug)
9070 {
9071 struct si_shader_selector *sel = shader->selector;
9072 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
9073 int r;
9074
9075 /* LS, ES, VS are compiled on demand if the main part hasn't been
9076 * compiled for that stage.
9077 *
9078 * Vertex shaders are compiled on demand when a vertex fetch
9079 * workaround must be applied.
9080 */
9081 if (shader->is_monolithic) {
9082 /* Monolithic shader (compiled as a whole, has many variants,
9083 * may take a long time to compile).
9084 */
9085 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
9086 if (r)
9087 return r;
9088 } else {
9089 /* The shader consists of 2-3 parts:
9090 *
9091 * - the middle part is the user shader, it has 1 variant only
9092 * and it was compiled during the creation of the shader
9093 * selector
9094 * - the prolog part is inserted at the beginning
9095 * - the epilog part is inserted at the end
9096 *
9097 * The prolog and epilog have many (but simple) variants.
9098 */
9099
9100 /* Copy the compiled TGSI shader data over. */
9101 shader->is_binary_shared = true;
9102 shader->binary = mainp->binary;
9103 shader->config = mainp->config;
9104 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
9105 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
9106 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
9107 memcpy(shader->info.vs_output_param_offset,
9108 mainp->info.vs_output_param_offset,
9109 sizeof(mainp->info.vs_output_param_offset));
9110 shader->info.uses_instanceid = mainp->info.uses_instanceid;
9111 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
9112 shader->info.nr_param_exports = mainp->info.nr_param_exports;
9113
9114 /* Select prologs and/or epilogs. */
9115 switch (sel->type) {
9116 case PIPE_SHADER_VERTEX:
9117 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
9118 return -1;
9119 break;
9120 case PIPE_SHADER_TESS_CTRL:
9121 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
9122 return -1;
9123 break;
9124 case PIPE_SHADER_TESS_EVAL:
9125 break;
9126 case PIPE_SHADER_GEOMETRY:
9127 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
9128 return -1;
9129 break;
9130 case PIPE_SHADER_FRAGMENT:
9131 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
9132 return -1;
9133
9134 /* Make sure we have at least as many VGPRs as there
9135 * are allocated inputs.
9136 */
9137 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9138 shader->info.num_input_vgprs);
9139 break;
9140 }
9141
9142 /* Update SGPR and VGPR counts. */
9143 if (shader->prolog) {
9144 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9145 shader->prolog->config.num_sgprs);
9146 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9147 shader->prolog->config.num_vgprs);
9148 }
9149 if (shader->previous_stage) {
9150 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9151 shader->previous_stage->config.num_sgprs);
9152 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9153 shader->previous_stage->config.num_vgprs);
9154 shader->config.spilled_sgprs =
9155 MAX2(shader->config.spilled_sgprs,
9156 shader->previous_stage->config.spilled_sgprs);
9157 shader->config.spilled_vgprs =
9158 MAX2(shader->config.spilled_vgprs,
9159 shader->previous_stage->config.spilled_vgprs);
9160 shader->config.private_mem_vgprs =
9161 MAX2(shader->config.private_mem_vgprs,
9162 shader->previous_stage->config.private_mem_vgprs);
9163 shader->config.scratch_bytes_per_wave =
9164 MAX2(shader->config.scratch_bytes_per_wave,
9165 shader->previous_stage->config.scratch_bytes_per_wave);
9166 shader->info.uses_instanceid |=
9167 shader->previous_stage->info.uses_instanceid;
9168 }
9169 if (shader->prolog2) {
9170 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9171 shader->prolog2->config.num_sgprs);
9172 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9173 shader->prolog2->config.num_vgprs);
9174 }
9175 if (shader->epilog) {
9176 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9177 shader->epilog->config.num_sgprs);
9178 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9179 shader->epilog->config.num_vgprs);
9180 }
9181 }
9182
9183 si_fix_resource_usage(sscreen, shader);
9184 si_shader_dump(sscreen, shader, debug, sel->info.processor,
9185 stderr, true);
9186
9187 /* Upload. */
9188 r = si_shader_binary_upload(sscreen, shader);
9189 if (r) {
9190 fprintf(stderr, "failed to upload shader binary\n");
9191 return r;
9192 }
9193
9194 return 0;
9195 }
9196
9197 void si_shader_destroy(struct si_shader *shader)
9198 {
9199 if (shader->scratch_bo)
9200 r600_resource_reference(&shader->scratch_bo, NULL);
9201
9202 r600_resource_reference(&shader->bo, NULL);
9203
9204 if (!shader->is_binary_shared)
9205 radeon_shader_binary_clean(&shader->binary);
9206
9207 free(shader->shader_log);
9208 }