radeonsi: drop support for LLVM 3.8
mesa.git: src/gallium/drivers/radeonsi/si_shader.c
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *	Tom Stellard <thomas.stellard@amd.com>
 *	Michel Dänzer <michel.daenzer@amd.com>
 *	Christian König <christian.koenig@amd.com>
 */

#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_misc.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"

static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};

static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};
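/* LLVM address spaces used by the AMDGPU target: 2 = constant memory,
 * 3 = LDS. LOCAL_ADDR_SPACE is used below, e.g. by
 * declare_compute_memory().
 */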

static bool is_merged_shader(struct si_shader *shader)
{
	if (shader->selector->screen->b.chip_class <= VI)
		return false;

	return shader->key.as_ls ||
	       shader->key.as_es ||
	       shader->selector->type == PIPE_SHADER_TESS_CTRL ||
	       shader->selector->type == PIPE_SHADER_GEOMETRY;
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
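 *
 * For example, GENERIC[5] maps to 4 + 5 = 9, while PATCH[5] maps to
 * 2 + 5 = 7 in the separate per-patch index space.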
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-4)
			return 4 + index;

		assert(!"invalid generic index");
		return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
{
	switch (name) {
	case TGSI_SEMANTIC_FOG:
		return 0;
	case TGSI_SEMANTIC_LAYER:
		return 1;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return 2;
	case TGSI_SEMANTIC_PRIMID:
		return 3;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		return 4 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 6 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
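/* For example, unpack_param(ctx, param, 8, 5) computes
 * (value >> 8) & 0x1f. The masking AND is skipped when
 * rshift + bitwidth == 32, because the shift alone isolates the field.
 */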
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = LLVMGetParam(ctx->main_fn,
					  param);

	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
		value = bitcast(&ctx->bld_base,
				TGSI_TYPE_UNSIGNED, value);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      LLVMConstInt(ctx->i32, rshift, 0), "");

	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     LLVMConstInt(ctx->i32, mask, 0), "");
	}

	return value;
}

static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
{
	switch (ctx->type) {
	case PIPE_SHADER_TESS_CTRL:
		return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);

	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_rel_patch_id);

	default:
		assert(0);
		return NULL;
	}
}

/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
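/* In dword units (see the helpers below), this means that for
 * RelPatchID == i:
 *    tcs_in_patch_offset  = i * in_patch_stride
 *    tcs_out_patch_offset = out_patch0_offset + i * out_patch_stride
 */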

static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}

static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}

static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}

static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}

static LLVMValueRef
get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
}

static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

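/* Compute the instance index for a vertex fetch:
 * InstanceID / divisor + StartInstance. E.g. with a divisor of 3,
 * InstanceID 0..2 all resolve to StartInstance + 0.
 */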
static LLVMValueRef get_instance_index_for_fetch(
	struct si_shader_context *ctx,
	unsigned param_start_instance, unsigned divisor)
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMValueRef result = LLVMGetParam(ctx->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				       LLVMConstInt(ctx->i32, divisor, 0), "");

	return LLVMBuildAdd(gallivm->builder, result,
			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
}

/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 * to float. */
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
					    LLVMValueRef vec4,
					    unsigned double_index)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
					      LLVMVectorType(f64, 2), "");
	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
	return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
}

static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}

static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
				     unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	if (swizzle > 0)
		return ctx->i32_0;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tcs_patch_id);
	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_patch_id);
	case PIPE_SHADER_GEOMETRY:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_gs_prim_id);
	default:
		assert(0);
		return ctx->i32_0;
	}
}

/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 */
static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
				       const struct tgsi_ind_register *ind,
				       int rel_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef result;

	result = ctx->addrs[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	result = LLVMBuildAdd(gallivm->builder, result,
			      LLVMConstInt(ctx->i32, rel_index, 0), "");
	return result;
}

/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);

	return si_llvm_bound_index(ctx, result, num);
}


/**
 * Calculate a dword address given an input or output register and a stride.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}

/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
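/* With this layout, the byte offsets computed below are:
 *    per-vertex: 16 * (attr * verts_per_patch * num_patches +
 *                      patch * verts_per_patch + vertex)
 *    per-patch:  patch_data_offset + 16 * (attr * num_patches + patch)
 */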
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}

static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
					struct si_shader_context *ctx,
					const struct tgsi_full_dst_register *dst,
					const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}

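/* Load up to a vec4 from a TCS->TES offchip buffer. A swizzle of ~0
 * loads all four components; 64-bit types are loaded as two dwords
 * and recombined by si_llvm_emit_fetch_64bit.
 */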
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool readonly_memory)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, readonly_memory);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, readonly_memory);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}

/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}

/**
 * Store to LDS.
 *
 * \param dw_offset_imm	immediate offset in dwords (typically 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}

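/* Build a buffer resource descriptor from a 64K-aligned base address
 * passed in a single 32-bit user SGPR: dwords 0-1 hold the address
 * (addr << 16), dword 2 is the 0xffffffff size, and dword 3 selects
 * a 32-bit float format with XYZW destination swizzles.
 */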
static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
					   unsigned param)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;

	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
	addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
	addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");

	uint64_t desc2 = 0xffffffff;
	uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
	LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);

	LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
	desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
	desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
	return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
}

static LLVMValueRef fetch_input_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	dw_addr = get_tcs_in_current_patch_offset(ctx);
	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_output_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef buffer, base, addr;

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}

static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}

static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}

static int lookup_interp_param_index(unsigned interpolate, unsigned location)
{
	switch (interpolate) {
	case TGSI_INTERPOLATE_CONSTANT:
		return 0;

	case TGSI_INTERPOLATE_LINEAR:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_LINEAR_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_LINEAR_CENTROID;
		else
			return SI_PARAM_LINEAR_CENTER;
		break;
	case TGSI_INTERPOLATE_COLOR:
	case TGSI_INTERPOLATE_PERSPECTIVE:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_PERSP_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_PERSP_CENTROID;
		else
			return SI_PARAM_PERSP_CENTER;
		break;
	default:
		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
		return -1;
	}
}

/**
 * Interpolate a fragment shader input.
 *
 * @param ctx			context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j)
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef attr_number;
	LLVMValueRef i, j;

	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
1230 * amdgcn.interp.mov.
1231 */
1232 bool interp = interp_param != NULL;
1233
1234 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
1235
1236 if (interp) {
1237 interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
1238 LLVMVectorType(ctx->f32, 2), "");
1239
1240 i = LLVMBuildExtractElement(gallivm->builder, interp_param,
1241 ctx->i32_0, "");
1242 j = LLVMBuildExtractElement(gallivm->builder, interp_param,
1243 ctx->i32_1, "");
1244 }
1245
1246 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1247 ctx->shader->key.part.ps.prolog.color_two_side) {
1248 LLVMValueRef is_face_positive;
1249 LLVMValueRef back_attr_number;
1250
1251 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1252 * otherwise it's at offset "num_inputs".
1253 */
1254 unsigned back_attr_offset = num_interp_inputs;
1255 if (semantic_index == 1 && colors_read_mask & 0xf)
1256 back_attr_offset += 1;
1257
1258 back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);
1259
1260 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1261 face, ctx->i32_0, "");
1262
1263 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1264 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1265 LLVMValueRef front, back;
1266
1267 if (interp) {
1268 front = ac_build_fs_interp(&ctx->ac, llvm_chan,
1269 attr_number, prim_mask,
1270 i, j);
1271 back = ac_build_fs_interp(&ctx->ac, llvm_chan,
1272 back_attr_number, prim_mask,
1273 i, j);
1274 } else {
1275 front = ac_build_fs_interp_mov(&ctx->ac,
1276 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1277 llvm_chan, attr_number, prim_mask);
1278 back = ac_build_fs_interp_mov(&ctx->ac,
1279 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1280 llvm_chan, back_attr_number, prim_mask);
1281 }
1282
1283 result[chan] = LLVMBuildSelect(gallivm->builder,
1284 is_face_positive,
1285 front,
1286 back,
1287 "");
1288 }
1289 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1290 if (interp) {
1291 result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
1292 attr_number, prim_mask, i, j);
1293 } else {
1294 result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
1295 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1296 attr_number, prim_mask);
1297 }
1298 result[1] =
1299 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1300 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1301 } else {
1302 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1303 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
1304
1305 if (interp) {
1306 result[chan] = ac_build_fs_interp(&ctx->ac,
1307 llvm_chan, attr_number, prim_mask, i, j);
1308 } else {
1309 result[chan] = ac_build_fs_interp_mov(&ctx->ac,
1310 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1311 llvm_chan, attr_number, prim_mask);
1312 }
1313 }
1314 }
1315 }
1316
1317 static void declare_input_fs(
1318 struct si_shader_context *ctx,
1319 unsigned input_index,
1320 const struct tgsi_full_declaration *decl,
1321 LLVMValueRef out[4])
1322 {
1323 struct lp_build_context *base = &ctx->bld_base.base;
1324 struct si_shader *shader = ctx->shader;
1325 LLVMValueRef main_fn = ctx->main_fn;
1326 LLVMValueRef interp_param = NULL;
1327 int interp_param_idx;
1328
1329 /* Get colors from input VGPRs (set by the prolog). */
1330 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1331 unsigned i = decl->Semantic.Index;
1332 unsigned colors_read = shader->selector->info.colors_read;
1333 unsigned mask = colors_read >> (i * 4);
1334 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1335 (i ? util_bitcount(colors_read & 0xf) : 0);
1336
1337 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1338 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1339 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1340 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1341 return;
1342 }
1343
1344 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1345 decl->Interp.Location);
1346 if (interp_param_idx == -1)
1347 return;
1348 else if (interp_param_idx) {
1349 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1350 }
1351
1352 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1353 decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1354 ctx->shader->key.part.ps.prolog.flatshade_colors)
1355 interp_param = NULL; /* load the constant color */
1356
1357 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1358 decl->Semantic.Index, shader->selector->info.num_inputs,
1359 shader->selector->info.colors_read, interp_param,
1360 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1361 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1362 &out[0]);
1363 }
1364
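/* The 4-bit sample ID is stored in bits 8..11 of the ANCILLARY VGPR. */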
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}


/**
 * Load a dword from a constant buffer.
 */
static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
				      LLVMValueRef resource,
				      LLVMValueRef offset)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef args[2] = {resource, offset};

	return lp_build_intrinsic(builder, "llvm.SI.load.const.v4i32", ctx->f32, args, 2,
				  LP_FUNC_ATTR_READNONE |
				  LP_FUNC_ATTR_LEGACY);
}

static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
{
	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(ctx, resource, offset0),
		buffer_load_const(ctx, resource, offset1),
		LLVMConstReal(ctx->f32, 0),
		LLVMConstReal(ctx->f32, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}

static void declare_system_value(struct si_shader_context *ctx,
				 unsigned index,
				 const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(ctx->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_base_vertex), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
					ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&ctx->bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		LLVMValueRef values[3];

		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1689
1690 static void declare_compute_memory(struct si_shader_context *ctx,
1691 const struct tgsi_full_declaration *decl)
1692 {
1693 struct si_shader_selector *sel = ctx->shader->selector;
1694 struct gallivm_state *gallivm = &ctx->gallivm;
1695
1696 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1697 LLVMValueRef var;
1698
1699 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1700 assert(decl->Range.First == decl->Range.Last);
1701 assert(!ctx->shared_memory);
1702
1703 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1704 LLVMArrayType(ctx->i8, sel->local_size),
1705 "compute_lds",
1706 LOCAL_ADDR_SPACE);
1707 LLVMSetAlignment(var, 4);
1708
1709 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1710 }
1711
1712 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1713 {
1714 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1715 ctx->param_const_buffers);
1716
1717 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1718 LLVMConstInt(ctx->i32, i, 0));
1719 }
1720
1721 static LLVMValueRef fetch_constant(
1722 struct lp_build_tgsi_context *bld_base,
1723 const struct tgsi_full_src_register *reg,
1724 enum tgsi_opcode_type type,
1725 unsigned swizzle)
1726 {
1727 struct si_shader_context *ctx = si_shader_context(bld_base);
1728 struct lp_build_context *base = &bld_base->base;
1729 const struct tgsi_ind_register *ireg = &reg->Indirect;
1730 unsigned buf, idx;
1731
1732 LLVMValueRef addr, bufp;
1733 LLVMValueRef result;
1734
1735 if (swizzle == LP_CHAN_ALL) {
1736 unsigned chan;
1737 LLVMValueRef values[4];
1738 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1739 values[chan] = fetch_constant(bld_base, reg, type, chan);
1740
1741 return lp_build_gather_values(&ctx->gallivm, values, 4);
1742 }
1743
1744 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1745 idx = reg->Register.Index * 4 + swizzle;
1746
1747 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1748 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
1749 LLVMValueRef index;
1750 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1751 reg->Dimension.Index,
1752 SI_NUM_CONST_BUFFERS);
1753 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1754 } else
1755 bufp = load_const_buffer_desc(ctx, buf);
1756
1757 if (reg->Register.Indirect) {
1758 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1759 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1760 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1761 addr = lp_build_add(&bld_base->uint_bld, addr,
1762 LLVMConstInt(ctx->i32, idx * 4, 0));
1763 } else {
1764 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1765 }
1766
1767 result = buffer_load_const(ctx, bufp, addr);
1768
1769 if (!tgsi_type_is_64bit(type))
1770 result = bitcast(bld_base, type, result);
1771 else {
1772 LLVMValueRef addr2, result2;
1773
1774 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1775 LLVMConstInt(ctx->i32, 4, 0));
1776 result2 = buffer_load_const(ctx, bufp, addr2);
1777
1778 result = si_llvm_emit_fetch_64bit(bld_base, type,
1779 result, result2);
1780 }
1781 return result;
1782 }
1783
1784 /* The upper 16 bits of both inputs must be zero. */
1785 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1786 LLVMValueRef val[2])
1787 {
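	/* E.g. val[0] = 0x1234, val[1] = 0xABCD -> 0xABCD1234. */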
1788 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1789 LLVMBuildShl(ctx->gallivm.builder, val[1],
1790 LLVMConstInt(ctx->i32, 16, 0),
1791 ""), "");
1792 }
1793
1794 /* The upper 16 bits of the inputs are ignored and dropped. */
1795 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1796 LLVMValueRef val[2])
1797 {
1798 LLVMValueRef v[2] = {
1799 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1800 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1801 val[1],
1802 };
1803 return si_llvm_pack_two_int16(ctx, v);
1804 }
1805
1806 /* Initialize arguments for the shader export intrinsic */
1807 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1808 LLVMValueRef *values,
1809 unsigned target,
1810 struct ac_export_args *args)
1811 {
1812 struct si_shader_context *ctx = si_shader_context(bld_base);
1813 struct lp_build_context *base = &bld_base->base;
1814 LLVMBuilderRef builder = ctx->gallivm.builder;
1815 LLVMValueRef val[4];
1816 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1817 unsigned chan;
1818 	bool is_int8 = false, is_int10 = false; /* only set for fragment shaders */
1819
1820 /* Default is 0xf. Adjusted below depending on the format. */
1821 args->enabled_channels = 0xf; /* writemask */
1822
1823 /* Specify whether the EXEC mask represents the valid mask */
1824 args->valid_mask = 0;
1825
1826 /* Specify whether this is the last export */
1827 args->done = 0;
1828
1829 /* Specify the target we are exporting */
1830 args->target = target;
1831
1832 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1833 const struct si_shader_key *key = &ctx->shader->key;
1834 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1835 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1836
1837 assert(cbuf >= 0 && cbuf < 8);
1838 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1839 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1840 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1841 }
1842
1843 args->compr = false;
1844 args->out[0] = base->undef;
1845 args->out[1] = base->undef;
1846 args->out[2] = base->undef;
1847 args->out[3] = base->undef;
1848
1849 switch (spi_shader_col_format) {
1850 case V_028714_SPI_SHADER_ZERO:
1851 args->enabled_channels = 0; /* writemask */
1852 args->target = V_008DFC_SQ_EXP_NULL;
1853 break;
1854
1855 case V_028714_SPI_SHADER_32_R:
1856 args->enabled_channels = 1; /* writemask */
1857 args->out[0] = values[0];
1858 break;
1859
1860 case V_028714_SPI_SHADER_32_GR:
1861 args->enabled_channels = 0x3; /* writemask */
1862 args->out[0] = values[0];
1863 args->out[1] = values[1];
1864 break;
1865
1866 case V_028714_SPI_SHADER_32_AR:
1867 args->enabled_channels = 0x9; /* writemask */
1868 args->out[0] = values[0];
1869 args->out[3] = values[3];
1870 break;
1871
1872 case V_028714_SPI_SHADER_FP16_ABGR:
1873 args->compr = 1; /* COMPR flag */
1874
1875 for (chan = 0; chan < 2; chan++) {
1876 LLVMValueRef pack_args[2] = {
1877 values[2 * chan],
1878 values[2 * chan + 1]
1879 };
1880 LLVMValueRef packed;
1881
1882 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1883 args->out[chan] =
1884 LLVMBuildBitCast(ctx->gallivm.builder,
1885 packed, ctx->f32, "");
1886 }
1887 break;
1888
1889 case V_028714_SPI_SHADER_UNORM16_ABGR:
1890 for (chan = 0; chan < 4; chan++) {
1891 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1892 val[chan] = LLVMBuildFMul(builder, val[chan],
1893 LLVMConstReal(ctx->f32, 65535), "");
1894 val[chan] = LLVMBuildFAdd(builder, val[chan],
1895 LLVMConstReal(ctx->f32, 0.5), "");
1896 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1897 ctx->i32, "");
1898 }
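		/* E.g. 1.0 -> 65535 (0xffff) and 0.5 -> 32768 (0x8000); pairs
		 * of channels are then packed into one 32-bit export word each. */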
1899
1900 args->compr = 1; /* COMPR flag */
1901 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1902 si_llvm_pack_two_int16(ctx, val));
1903 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1904 si_llvm_pack_two_int16(ctx, val+2));
1905 break;
1906
1907 case V_028714_SPI_SHADER_SNORM16_ABGR:
1908 for (chan = 0; chan < 4; chan++) {
1909 /* Clamp between [-1, 1]. */
1910 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1911 values[chan],
1912 LLVMConstReal(ctx->f32, 1));
1913 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1914 val[chan],
1915 LLVMConstReal(ctx->f32, -1));
1916 /* Convert to a signed integer in [-32767, 32767]. */
1917 val[chan] = LLVMBuildFMul(builder, val[chan],
1918 LLVMConstReal(ctx->f32, 32767), "");
1919 /* If positive, add 0.5, else add -0.5. */
1920 val[chan] = LLVMBuildFAdd(builder, val[chan],
1921 LLVMBuildSelect(builder,
1922 LLVMBuildFCmp(builder, LLVMRealOGE,
1923 val[chan], base->zero, ""),
1924 LLVMConstReal(ctx->f32, 0.5),
1925 LLVMConstReal(ctx->f32, -0.5), ""), "");
1926 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1927 }
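		/* E.g. 1.0 -> 32767 and -1.0 -> -32767 (0x8001 as uint16);
		 * pairs of channels are then packed as for UNORM16 above. */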
1928
1929 args->compr = 1; /* COMPR flag */
1930 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1931 si_llvm_pack_two_int32_as_int16(ctx, val));
1932 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1933 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1934 break;
1935
1936 case V_028714_SPI_SHADER_UINT16_ABGR: {
1937 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1938 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1939 LLVMValueRef max_alpha =
1940 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1941
1942 /* Clamp. */
1943 for (chan = 0; chan < 4; chan++) {
1944 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1945 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1946 val[chan],
1947 chan == 3 ? max_alpha : max_rgb);
1948 }
1949
1950 args->compr = 1; /* COMPR flag */
1951 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1952 si_llvm_pack_two_int16(ctx, val));
1953 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1954 si_llvm_pack_two_int16(ctx, val+2));
1955 break;
1956 }
1957
1958 case V_028714_SPI_SHADER_SINT16_ABGR: {
1959 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1960 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1961 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1962 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1963 LLVMValueRef max_alpha =
1964 !is_int10 ? max_rgb : ctx->i32_1;
1965 LLVMValueRef min_alpha =
1966 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1967
1968 /* Clamp. */
1969 for (chan = 0; chan < 4; chan++) {
1970 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1971 val[chan] = lp_build_emit_llvm_binary(bld_base,
1972 TGSI_OPCODE_IMIN,
1973 val[chan], chan == 3 ? max_alpha : max_rgb);
1974 val[chan] = lp_build_emit_llvm_binary(bld_base,
1975 TGSI_OPCODE_IMAX,
1976 val[chan], chan == 3 ? min_alpha : min_rgb);
1977 }
1978
1979 args->compr = 1; /* COMPR flag */
1980 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1981 si_llvm_pack_two_int32_as_int16(ctx, val));
1982 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1983 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1984 break;
1985 }
1986
1987 case V_028714_SPI_SHADER_32_ABGR:
1988 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1989 break;
1990 }
1991 }
1992
1993 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1994 LLVMValueRef alpha)
1995 {
1996 struct si_shader_context *ctx = si_shader_context(bld_base);
1997
1998 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1999 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2000 SI_PARAM_ALPHA_REF);
2001
2002 LLVMValueRef alpha_pass =
2003 lp_build_cmp(&bld_base->base,
2004 ctx->shader->key.part.ps.epilog.alpha_func,
2005 alpha, alpha_ref);
2006 LLVMValueRef arg =
2007 lp_build_select(&bld_base->base,
2008 alpha_pass,
2009 LLVMConstReal(ctx->f32, 1.0f),
2010 LLVMConstReal(ctx->f32, -1.0f));
2011
2012 ac_build_kill(&ctx->ac, arg);
2013 } else {
2014 ac_build_kill(&ctx->ac, NULL);
2015 }
2016 }
2017
2018 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2019 LLVMValueRef alpha,
2020 unsigned samplemask_param)
2021 {
2022 struct si_shader_context *ctx = si_shader_context(bld_base);
2023 struct gallivm_state *gallivm = &ctx->gallivm;
2024 LLVMValueRef coverage;
2025
2026 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
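	/* E.g. assuming SI_NUM_SMOOTH_AA_SAMPLES is 8: with 4 of the 8
	 * coverage bits set, alpha is scaled by 4/8 = 0.5. */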
2027 coverage = LLVMGetParam(ctx->main_fn,
2028 samplemask_param);
2029 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2030
2031 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2032 ctx->i32,
2033 &coverage, 1, LP_FUNC_ATTR_READNONE);
2034
2035 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2036 ctx->f32, "");
2037
2038 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2039 LLVMConstReal(ctx->f32,
2040 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2041
2042 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2043 }
2044
2045 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2046 struct ac_export_args *pos, LLVMValueRef *out_elts)
2047 {
2048 struct si_shader_context *ctx = si_shader_context(bld_base);
2049 struct lp_build_context *base = &bld_base->base;
2050 unsigned reg_index;
2051 unsigned chan;
2052 unsigned const_chan;
2053 LLVMValueRef base_elt;
2054 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2055 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2056 SI_VS_CONST_CLIP_PLANES, 0);
2057 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2058
2059 for (reg_index = 0; reg_index < 2; reg_index ++) {
2060 struct ac_export_args *args = &pos[2 + reg_index];
2061
2062 args->out[0] =
2063 args->out[1] =
2064 args->out[2] =
2065 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2066
2067 /* Compute dot products of position and user clip plane vectors */
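		/* The buffer holds the 8 user clip planes (2 pos exports x 4
		 * channels) as consecutive vec4s; the address below selects
		 * plane (reg_index*4 + chan), component const_chan, 4 bytes
		 * per float. */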
2068 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2069 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2070 LLVMValueRef addr =
2071 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2072 const_chan) * 4, 0);
2073 base_elt = buffer_load_const(ctx, const_resource,
2074 addr);
2075 args->out[chan] =
2076 lp_build_add(base, args->out[chan],
2077 lp_build_mul(base, base_elt,
2078 out_elts[const_chan]));
2079 }
2080 }
2081
2082 args->enabled_channels = 0xf;
2083 args->valid_mask = 0;
2084 args->done = 0;
2085 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2086 args->compr = 0;
2087 }
2088 }
2089
2090 static void si_dump_streamout(struct pipe_stream_output_info *so)
2091 {
2092 unsigned i;
2093
2094 if (so->num_outputs)
2095 fprintf(stderr, "STREAMOUT\n");
2096
2097 for (i = 0; i < so->num_outputs; i++) {
2098 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2099 so->output[i].start_component;
2100 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2101 i, so->output[i].output_buffer,
2102 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2103 so->output[i].register_index,
2104 mask & 1 ? "x" : "",
2105 mask & 2 ? "y" : "",
2106 mask & 4 ? "z" : "",
2107 mask & 8 ? "w" : "");
2108 }
2109 }
2110
2111 static void emit_streamout_output(struct si_shader_context *ctx,
2112 LLVMValueRef const *so_buffers,
2113 LLVMValueRef const *so_write_offsets,
2114 struct pipe_stream_output *stream_out,
2115 struct si_shader_output_values *shader_out)
2116 {
2117 struct gallivm_state *gallivm = &ctx->gallivm;
2118 LLVMBuilderRef builder = gallivm->builder;
2119 unsigned buf_idx = stream_out->output_buffer;
2120 unsigned start = stream_out->start_component;
2121 unsigned num_comps = stream_out->num_components;
2122 LLVMValueRef out[4];
2123
2124 assert(num_comps && num_comps <= 4);
2125 if (!num_comps || num_comps > 4)
2126 return;
2127
2128 /* Load the output as int. */
2129 for (int j = 0; j < num_comps; j++) {
2130 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2131
2132 out[j] = LLVMBuildBitCast(builder,
2133 shader_out->values[start + j],
2134 ctx->i32, "");
2135 }
2136
2137 /* Pack the output. */
2138 LLVMValueRef vdata = NULL;
2139
2140 switch (num_comps) {
2141 case 1: /* as i32 */
2142 vdata = out[0];
2143 break;
2144 case 2: /* as v2i32 */
2145 case 3: /* as v4i32 (aligned to 4) */
2146 case 4: /* as v4i32 */
2147 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2148 for (int j = 0; j < num_comps; j++) {
2149 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2150 LLVMConstInt(ctx->i32, j, 0), "");
2151 }
2152 break;
2153 }
2154
2155 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2156 vdata, num_comps,
2157 so_write_offsets[buf_idx],
2158 ctx->i32_0,
2159 stream_out->dst_offset * 4, 1, 1, true, false);
2160 }
2161
2162 /**
2163 * Write streamout data to buffers for vertex stream @p stream (different
2164 * vertex streams can occur for GS copy shaders).
2165 */
2166 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2167 struct si_shader_output_values *outputs,
2168 unsigned noutput, unsigned stream)
2169 {
2170 struct si_shader_selector *sel = ctx->shader->selector;
2171 struct pipe_stream_output_info *so = &sel->so;
2172 struct gallivm_state *gallivm = &ctx->gallivm;
2173 LLVMBuilderRef builder = gallivm->builder;
2174 int i;
2175 struct lp_build_if_state if_ctx;
2176
2177 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2178 LLVMValueRef so_vtx_count =
2179 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2180
2181 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2182
2183 /* can_emit = tid < so_vtx_count; */
2184 LLVMValueRef can_emit =
2185 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2186
2187 	/* Emit the streamout code conditionally to avoid out-of-bounds
2188 	 * buffer accesses. The hw tells us via the SGPR (so_vtx_count)
2189 	 * which threads are allowed to emit streamout data. */
2190 lp_build_if(&if_ctx, gallivm, can_emit);
2191 {
2192 /* The buffer offset is computed as follows:
2193 * ByteOffset = streamout_offset[buffer_id]*4 +
2194 * (streamout_write_index + thread_id)*stride[buffer_id] +
2195 * attrib_offset
2196 */
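		/* Hypothetical example: streamout_offset[0] = 16 (dwords),
		 * stride[0] = 4 dwords, write_index + thread_id = 5 and
		 * dst_offset = 2 dwords give:
		 *   ByteOffset = 16*4 + 5*(4*4) + 2*4 = 152
		 */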
2197
2198 LLVMValueRef so_write_index =
2199 LLVMGetParam(ctx->main_fn,
2200 ctx->param_streamout_write_index);
2201
2202 /* Compute (streamout_write_index + thread_id). */
2203 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2204
2205 /* Load the descriptor and compute the write offset for each
2206 * enabled buffer. */
2207 LLVMValueRef so_write_offset[4] = {};
2208 LLVMValueRef so_buffers[4];
2209 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2210 ctx->param_rw_buffers);
2211
2212 for (i = 0; i < 4; i++) {
2213 if (!so->stride[i])
2214 continue;
2215
2216 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2217 SI_VS_STREAMOUT_BUF0 + i, 0);
2218
2219 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2220
2221 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2222 ctx->param_streamout_offset[i]);
2223 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2224
2225 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2226 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2227 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2228 }
2229
2230 /* Write streamout data. */
2231 for (i = 0; i < so->num_outputs; i++) {
2232 unsigned reg = so->output[i].register_index;
2233
2234 if (reg >= noutput)
2235 continue;
2236
2237 if (stream != so->output[i].stream)
2238 continue;
2239
2240 emit_streamout_output(ctx, so_buffers, so_write_offset,
2241 &so->output[i], &outputs[reg]);
2242 }
2243 }
2244 lp_build_endif(&if_ctx);
2245 }
2246
2247
2248 /* Generate export instructions for hardware VS shader stage */
2249 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2250 struct si_shader_output_values *outputs,
2251 unsigned noutput)
2252 {
2253 struct si_shader_context *ctx = si_shader_context(bld_base);
2254 struct si_shader *shader = ctx->shader;
2255 struct lp_build_context *base = &bld_base->base;
2256 struct ac_export_args args, pos_args[4] = {};
2257 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2258 unsigned semantic_name, semantic_index;
2259 unsigned target;
2260 unsigned param_count = 0;
2261 unsigned pos_idx;
2262 int i;
2263
2264 for (i = 0; i < noutput; i++) {
2265 semantic_name = outputs[i].semantic_name;
2266 semantic_index = outputs[i].semantic_index;
2267 bool export_param = true;
2268
2269 switch (semantic_name) {
2270 case TGSI_SEMANTIC_POSITION: /* ignore these */
2271 case TGSI_SEMANTIC_PSIZE:
2272 case TGSI_SEMANTIC_CLIPVERTEX:
2273 case TGSI_SEMANTIC_EDGEFLAG:
2274 break;
2275 case TGSI_SEMANTIC_GENERIC:
2276 case TGSI_SEMANTIC_CLIPDIST:
2277 if (shader->key.opt.hw_vs.kill_outputs &
2278 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2279 export_param = false;
2280 break;
2281 default:
2282 if (shader->key.opt.hw_vs.kill_outputs2 &
2283 (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2284 export_param = false;
2285 break;
2286 }
2287
2288 if (outputs[i].vertex_stream[0] != 0 &&
2289 outputs[i].vertex_stream[1] != 0 &&
2290 outputs[i].vertex_stream[2] != 0 &&
2291 outputs[i].vertex_stream[3] != 0)
2292 export_param = false;
2293
2294 handle_semantic:
2295 /* Select the correct target */
2296 switch(semantic_name) {
2297 case TGSI_SEMANTIC_PSIZE:
2298 psize_value = outputs[i].values[0];
2299 continue;
2300 case TGSI_SEMANTIC_EDGEFLAG:
2301 edgeflag_value = outputs[i].values[0];
2302 continue;
2303 case TGSI_SEMANTIC_LAYER:
2304 layer_value = outputs[i].values[0];
2305 semantic_name = TGSI_SEMANTIC_GENERIC;
2306 goto handle_semantic;
2307 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2308 viewport_index_value = outputs[i].values[0];
2309 semantic_name = TGSI_SEMANTIC_GENERIC;
2310 goto handle_semantic;
2311 case TGSI_SEMANTIC_POSITION:
2312 target = V_008DFC_SQ_EXP_POS;
2313 break;
2314 case TGSI_SEMANTIC_CLIPDIST:
2315 if (shader->key.opt.hw_vs.clip_disable) {
2316 semantic_name = TGSI_SEMANTIC_GENERIC;
2317 goto handle_semantic;
2318 }
2319 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2320 break;
2321 case TGSI_SEMANTIC_CLIPVERTEX:
2322 if (shader->key.opt.hw_vs.clip_disable)
2323 continue;
2324 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2325 continue;
2326 case TGSI_SEMANTIC_COLOR:
2327 case TGSI_SEMANTIC_BCOLOR:
2328 case TGSI_SEMANTIC_PRIMID:
2329 case TGSI_SEMANTIC_FOG:
2330 case TGSI_SEMANTIC_TEXCOORD:
2331 case TGSI_SEMANTIC_GENERIC:
2332 if (!export_param)
2333 continue;
2334 target = V_008DFC_SQ_EXP_PARAM + param_count;
2335 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2336 shader->info.vs_output_param_offset[i] = param_count;
2337 param_count++;
2338 break;
2339 default:
2340 target = 0;
2341 fprintf(stderr,
2342 "Warning: SI unhandled vs output type:%d\n",
2343 semantic_name);
2344 }
2345
2346 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2347
2348 if (target >= V_008DFC_SQ_EXP_POS &&
2349 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2350 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2351 &args, sizeof(args));
2352 } else {
2353 ac_build_export(&ctx->ac, &args);
2354 }
2355
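		/* Clip distances are additionally exported as a generic
		 * parameter, so that a pixel shader reading gl_ClipDistance
		 * can interpolate them. */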
2356 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2357 semantic_name = TGSI_SEMANTIC_GENERIC;
2358 goto handle_semantic;
2359 }
2360 }
2361
2362 shader->info.nr_param_exports = param_count;
2363
2364 /* We need to add the position output manually if it's missing. */
2365 if (!pos_args[0].out[0]) {
2366 pos_args[0].enabled_channels = 0xf; /* writemask */
2367 pos_args[0].valid_mask = 0; /* EXEC mask */
2368 pos_args[0].done = 0; /* last export? */
2369 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2370 pos_args[0].compr = 0; /* COMPR flag */
2371 pos_args[0].out[0] = base->zero; /* X */
2372 pos_args[0].out[1] = base->zero; /* Y */
2373 pos_args[0].out[2] = base->zero; /* Z */
2374 pos_args[0].out[3] = base->one; /* W */
2375 }
2376
2377 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2378 if (shader->selector->info.writes_psize ||
2379 shader->selector->info.writes_edgeflag ||
2380 shader->selector->info.writes_viewport_index ||
2381 shader->selector->info.writes_layer) {
2382 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2383 (shader->selector->info.writes_edgeflag << 1) |
2384 (shader->selector->info.writes_layer << 2);
2385
2386 pos_args[1].valid_mask = 0; /* EXEC mask */
2387 pos_args[1].done = 0; /* last export? */
2388 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2389 pos_args[1].compr = 0; /* COMPR flag */
2390 pos_args[1].out[0] = base->zero; /* X */
2391 pos_args[1].out[1] = base->zero; /* Y */
2392 pos_args[1].out[2] = base->zero; /* Z */
2393 pos_args[1].out[3] = base->zero; /* W */
2394
2395 if (shader->selector->info.writes_psize)
2396 pos_args[1].out[0] = psize_value;
2397
2398 if (shader->selector->info.writes_edgeflag) {
2399 /* The output is a float, but the hw expects an integer
2400 * with the first bit containing the edge flag. */
2401 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2402 edgeflag_value,
2403 ctx->i32, "");
2404 edgeflag_value = lp_build_min(&bld_base->int_bld,
2405 edgeflag_value,
2406 ctx->i32_1);
2407
2408 /* The LLVM intrinsic expects a float. */
2409 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2410 edgeflag_value,
2411 ctx->f32, "");
2412 }
2413
2414 if (ctx->screen->b.chip_class >= GFX9) {
2415 /* GFX9 has the layer in out.z[10:0] and the viewport
2416 * index in out.z[19:16].
2417 */
2418 if (shader->selector->info.writes_layer)
2419 pos_args[1].out[2] = layer_value;
2420
2421 if (shader->selector->info.writes_viewport_index) {
2422 LLVMValueRef v = viewport_index_value;
2423
2424 v = bitcast(bld_base, TGSI_TYPE_UNSIGNED, v);
2425 v = LLVMBuildShl(ctx->gallivm.builder, v,
2426 LLVMConstInt(ctx->i32, 16, 0), "");
2427 v = LLVMBuildOr(ctx->gallivm.builder, v,
2428 bitcast(bld_base, TGSI_TYPE_UNSIGNED,
2429 pos_args[1].out[2]), "");
2430 pos_args[1].out[2] = bitcast(bld_base, TGSI_TYPE_FLOAT, v);
2431 pos_args[1].enabled_channels |= 1 << 2;
2432 }
2433 } else {
2434 if (shader->selector->info.writes_layer)
2435 pos_args[1].out[2] = layer_value;
2436
2437 if (shader->selector->info.writes_viewport_index) {
2438 pos_args[1].out[3] = viewport_index_value;
2439 pos_args[1].enabled_channels |= 1 << 3;
2440 }
2441 }
2442 }
2443
2444 for (i = 0; i < 4; i++)
2445 if (pos_args[i].out[0])
2446 shader->info.nr_pos_exports++;
2447
2448 pos_idx = 0;
2449 for (i = 0; i < 4; i++) {
2450 if (!pos_args[i].out[0])
2451 continue;
2452
2453 /* Specify the target we are exporting */
2454 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2455
2456 if (pos_idx == shader->info.nr_pos_exports)
2457 /* Specify that this is the last export */
2458 pos_args[i].done = 1;
2459
2460 ac_build_export(&ctx->ac, &pos_args[i]);
2461 }
2462 }
2463
2464 /**
2465 * Forward all outputs from the vertex shader to the TES. This is only used
2466 * for the fixed function TCS.
2467 */
2468 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2469 {
2470 struct si_shader_context *ctx = si_shader_context(bld_base);
2471 struct gallivm_state *gallivm = &ctx->gallivm;
2472 LLVMValueRef invocation_id, buffer, buffer_offset;
2473 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2474 uint64_t inputs;
2475
2476 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2477 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2478 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2479
2480 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2481 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2482 lds_vertex_stride, "");
2483 lds_base = get_tcs_in_current_patch_offset(ctx);
2484 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2485
2486 inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2487 while (inputs) {
2488 unsigned i = u_bit_scan64(&inputs);
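		/* u_bit_scan64 pops the lowest set bit and returns its index;
		 * each set bit is the unique index of one VS output to copy. */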
2489
2490 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2491 LLVMConstInt(ctx->i32, 4 * i, 0),
2492 "");
2493
2494 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2495 get_rel_patch_id(ctx),
2496 invocation_id,
2497 LLVMConstInt(ctx->i32, i, 0));
2498
2499 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2500 lds_ptr);
2501
2502 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2503 buffer_offset, 0, 1, 0, true, false);
2504 }
2505 }
2506
2507 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2508 LLVMValueRef rel_patch_id,
2509 LLVMValueRef invocation_id,
2510 LLVMValueRef tcs_out_current_patch_data_offset)
2511 {
2512 struct si_shader_context *ctx = si_shader_context(bld_base);
2513 struct gallivm_state *gallivm = &ctx->gallivm;
2514 struct si_shader *shader = ctx->shader;
2515 unsigned tess_inner_index, tess_outer_index;
2516 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2517 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
2518 unsigned stride, outer_comps, inner_comps, i, offset;
2519 struct lp_build_if_state if_ctx, inner_if_ctx;
2520
2521 si_llvm_emit_barrier(NULL, bld_base, NULL);
2522
2523 /* Do this only for invocation 0, because the tess levels are per-patch,
2524 * not per-vertex.
2525 *
2526 	 * The hardware can't jump over this block, because invocation 0
2527 	 * always executes it; it merely masks loads/stores of other invocations.
2528 */
2529 lp_build_if(&if_ctx, gallivm,
2530 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2531 invocation_id, ctx->i32_0, ""));
2532
2533 /* Determine the layout of one tess factor element in the buffer. */
2534 switch (shader->key.part.tcs.epilog.prim_mode) {
2535 case PIPE_PRIM_LINES:
2536 stride = 2; /* 2 dwords, 1 vec2 store */
2537 outer_comps = 2;
2538 inner_comps = 0;
2539 break;
2540 case PIPE_PRIM_TRIANGLES:
2541 stride = 4; /* 4 dwords, 1 vec4 store */
2542 outer_comps = 3;
2543 inner_comps = 1;
2544 break;
2545 case PIPE_PRIM_QUADS:
2546 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2547 outer_comps = 4;
2548 inner_comps = 2;
2549 break;
2550 default:
2551 assert(0);
2552 return;
2553 }
2554
2555 /* Load tess_inner and tess_outer from LDS.
2556 	 * Any invocation can write them, so we can't read them from temporaries.
2557 */
2558 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2559 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2560
2561 lds_base = tcs_out_current_patch_data_offset;
2562 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2563 LLVMConstInt(ctx->i32,
2564 tess_inner_index * 4, 0), "");
2565 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2566 LLVMConstInt(ctx->i32,
2567 tess_outer_index * 4, 0), "");
2568
2569 for (i = 0; i < 4; i++) {
2570 inner[i] = LLVMGetUndef(ctx->i32);
2571 outer[i] = LLVMGetUndef(ctx->i32);
2572 }
2573
2574 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2575 /* For isolines, the hardware expects tess factors in the
2576 * reverse order from what GLSL / TGSI specify.
2577 */
2578 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2579 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2580 } else {
2581 for (i = 0; i < outer_comps; i++) {
2582 outer[i] = out[i] =
2583 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2584 }
2585 for (i = 0; i < inner_comps; i++) {
2586 inner[i] = out[outer_comps+i] =
2587 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2588 }
2589 }
2590
2591 /* Convert the outputs to vectors for stores. */
2592 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2593 vec1 = NULL;
2594
2595 if (stride > 4)
2596 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2597
2598 /* Get the buffer. */
2599 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);
2600
2601 /* Get the offset. */
2602 tf_base = LLVMGetParam(ctx->main_fn,
2603 ctx->param_tcs_factor_offset);
2604 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2605 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2606
2607 lp_build_if(&inner_if_ctx, gallivm,
2608 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2609 rel_patch_id, ctx->i32_0, ""));
2610
2611 /* Store the dynamic HS control word. */
2612 offset = 0;
2613 if (ctx->screen->b.chip_class <= VI) {
2614 ac_build_buffer_store_dword(&ctx->ac, buffer,
2615 LLVMConstInt(ctx->i32, 0x80000000, 0),
2616 1, ctx->i32_0, tf_base,
2617 offset, 1, 0, true, false);
2618 offset += 4;
2619 }
2620
2621 lp_build_endif(&inner_if_ctx);
2622
2623 /* Store the tessellation factors. */
2624 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2625 MIN2(stride, 4), byteoffset, tf_base,
2626 offset, 1, 0, true, false);
2627 offset += 16;
2628 if (vec1)
2629 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2630 stride - 4, byteoffset, tf_base,
2631 offset, 1, 0, true, false);
2632
2633 /* Store the tess factors into the offchip buffer if TES reads them. */
2634 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2635 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2636 LLVMValueRef tf_inner_offset;
2637 unsigned param_outer, param_inner;
2638
2639 buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2640 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2641
2642 param_outer = si_shader_io_get_unique_index(
2643 TGSI_SEMANTIC_TESSOUTER, 0);
2644 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2645 LLVMConstInt(ctx->i32, param_outer, 0));
2646
2647 outer_vec = lp_build_gather_values(gallivm, outer,
2648 util_next_power_of_two(outer_comps));
2649
2650 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2651 outer_comps, tf_outer_offset,
2652 base, 0, 1, 0, true, false);
2653 if (inner_comps) {
2654 param_inner = si_shader_io_get_unique_index(
2655 TGSI_SEMANTIC_TESSINNER, 0);
2656 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2657 LLVMConstInt(ctx->i32, param_inner, 0));
2658
2659 inner_vec = inner_comps == 1 ? inner[0] :
2660 lp_build_gather_values(gallivm, inner, inner_comps);
2661 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2662 inner_comps, tf_inner_offset,
2663 base, 0, 1, 0, true, false);
2664 }
2665 }
2666
2667 lp_build_endif(&if_ctx);
2668 }
2669
2670 static LLVMValueRef
2671 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2672 unsigned param, unsigned return_index)
2673 {
2674 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2675 LLVMGetParam(ctx->main_fn, param),
2676 return_index, "");
2677 }
2678
2679 static LLVMValueRef
2680 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2681 unsigned param, unsigned return_index)
2682 {
2683 LLVMBuilderRef builder = ctx->gallivm.builder;
2684 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2685
2686 return LLVMBuildInsertValue(builder, ret,
2687 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2688 return_index, "");
2689 }
2690
2691 static LLVMValueRef
2692 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2693 unsigned param, unsigned return_index)
2694 {
2695 LLVMBuilderRef builder = ctx->gallivm.builder;
2696 LLVMValueRef ptr, lo, hi;
2697
2698 ptr = LLVMGetParam(ctx->main_fn, param);
2699 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2700 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2701 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2702 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2703 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2704 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2705 }
2706
2707 /* This only writes the tessellation factor levels. */
2708 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2709 {
2710 struct si_shader_context *ctx = si_shader_context(bld_base);
2711 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2712
2713 si_copy_tcs_inputs(bld_base);
2714
2715 rel_patch_id = get_rel_patch_id(ctx);
2716 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2717 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2718
2719 /* Return epilog parameters from this function. */
2720 LLVMBuilderRef builder = ctx->gallivm.builder;
2721 LLVMValueRef ret = ctx->return_value;
2722 unsigned vgpr;
2723
2724 if (ctx->screen->b.chip_class >= GFX9) {
2725 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2726 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2727 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2728 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2729 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2730 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2731 /* Tess offchip and tess factor offsets are at the beginning. */
2732 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2733 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2734 vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
2735 } else {
2736 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2737 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
2738 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2739 GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2740 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2741 GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
2742 /* Tess offchip and tess factor offsets are after user SGPRs. */
2743 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
2744 GFX6_TCS_NUM_USER_SGPR);
2745 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
2746 GFX6_TCS_NUM_USER_SGPR + 1);
2747 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2748 }
2749
2750 /* VGPRs */
2751 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2752 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2753 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2754
2755 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2756 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2757 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2758 ctx->return_value = ret;
2759 }
2760
2761 /* Pass TCS inputs from LS to TCS on GFX9. */
2762 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2763 {
2764 LLVMValueRef ret = ctx->return_value;
2765
2766 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2767 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2768 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2769 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2770 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2771
2772 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2773 8 + SI_SGPR_VS_STATE_BITS);
2774 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2775 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2776 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2777 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2778 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2779 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2780 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
2781 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
2782 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
2783 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
2784
2785 unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
2786 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2787 8 + GFX9_SGPR_TCS_CONST_BUFFERS);
2788 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2789 8 + GFX9_SGPR_TCS_SAMPLERS);
2790 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2791 8 + GFX9_SGPR_TCS_IMAGES);
2792 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2793 8 + GFX9_SGPR_TCS_SHADER_BUFFERS);
2794
2795 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2796 ret = si_insert_input_ret_float(ctx, ret,
2797 ctx->param_tcs_patch_id, vgpr++);
2798 ret = si_insert_input_ret_float(ctx, ret,
2799 ctx->param_tcs_rel_ids, vgpr++);
2800 ctx->return_value = ret;
2801 }
2802
2803 /* Pass GS inputs from ES to GS on GFX9. */
2804 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
2805 {
2806 LLVMValueRef ret = ctx->return_value;
2807
2808 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2809 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
2810 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2811
2812 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2813
2814 unsigned desc_param = ctx->param_vs_state_bits + 1;
2815 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2816 8 + GFX9_SGPR_GS_CONST_BUFFERS);
2817 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2818 8 + GFX9_SGPR_GS_SAMPLERS);
2819 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2820 8 + GFX9_SGPR_GS_IMAGES);
2821 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2822 8 + GFX9_SGPR_GS_SHADER_BUFFERS);
2823
2824 unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
2825 for (unsigned i = 0; i < 5; i++) {
2826 unsigned param = ctx->param_gs_vtx01_offset + i;
2827 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
2828 }
2829 ctx->return_value = ret;
2830 }
2831
2832 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2833 {
2834 struct si_shader_context *ctx = si_shader_context(bld_base);
2835 struct si_shader *shader = ctx->shader;
2836 struct tgsi_shader_info *info = &shader->selector->info;
2837 struct gallivm_state *gallivm = &ctx->gallivm;
2838 unsigned i, chan;
2839 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2840 ctx->param_rel_auto_id);
2841 LLVMValueRef vertex_dw_stride =
2842 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2843 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2844 vertex_dw_stride, "");
2845
2846 	/* Write outputs to LDS. The next shader (TCS aka HS) will read
2847 	 * its inputs from there. */
2848 for (i = 0; i < info->num_outputs; i++) {
2849 LLVMValueRef *out_ptr = ctx->outputs[i];
2850 unsigned name = info->output_semantic_name[i];
2851 unsigned index = info->output_semantic_index[i];
2852
2853 /* The ARB_shader_viewport_layer_array spec contains the
2854 * following issue:
2855 *
2856 * 2) What happens if gl_ViewportIndex or gl_Layer is
2857 * written in the vertex shader and a geometry shader is
2858 * present?
2859 *
2860 * RESOLVED: The value written by the last vertex processing
2861 * stage is used. If the last vertex processing stage
2862 * (vertex, tessellation evaluation or geometry) does not
2863 * statically assign to gl_ViewportIndex or gl_Layer, index
2864 * or layer zero is assumed.
2865 *
2866 * So writes to those outputs in VS-as-LS are simply ignored.
2867 */
2868 if (name == TGSI_SEMANTIC_LAYER ||
2869 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2870 continue;
2871
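		/* Each output occupies one vec4 (4 dwords) in LDS, at dword
		 * offset param*4 within this vertex's slice. */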
2872 int param = si_shader_io_get_unique_index(name, index);
2873 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2874 LLVMConstInt(ctx->i32, param * 4, 0), "");
2875
2876 for (chan = 0; chan < 4; chan++) {
2877 lds_store(bld_base, chan, dw_addr,
2878 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2879 }
2880 }
2881
2882 if (ctx->screen->b.chip_class >= GFX9)
2883 si_set_ls_return_value_for_tcs(ctx);
2884 }
2885
2886 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2887 {
2888 struct si_shader_context *ctx = si_shader_context(bld_base);
2889 struct gallivm_state *gallivm = &ctx->gallivm;
2890 struct si_shader *es = ctx->shader;
2891 struct tgsi_shader_info *info = &es->selector->info;
2892 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2893 ctx->param_es2gs_offset);
2894 LLVMValueRef lds_base = NULL;
2895 unsigned chan;
2896 int i;
2897
2898 if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
2899 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
2900 lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
2901 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
2902 }
2903
2904 for (i = 0; i < info->num_outputs; i++) {
2905 LLVMValueRef *out_ptr = ctx->outputs[i];
2906 int param;
2907
2908 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2909 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2910 continue;
2911
2912 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
2913 info->output_semantic_index[i]);
2914
2915 for (chan = 0; chan < 4; chan++) {
2916 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2917 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2918
2919 /* GFX9 has the ESGS ring in LDS. */
2920 if (ctx->screen->b.chip_class >= GFX9) {
2921 lds_store(bld_base, param * 4 + chan, lds_base, out_val);
2922 continue;
2923 }
2924
2925 ac_build_buffer_store_dword(&ctx->ac,
2926 ctx->esgs_ring,
2927 out_val, 1, NULL, soffset,
2928 (4 * param + chan) * 4,
2929 1, 1, true, true);
2930 }
2931 }
2932
2933 if (ctx->screen->b.chip_class >= GFX9)
2934 si_set_es_return_value_for_gs(ctx);
2935 }
2936
2937 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2938 {
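	/* On GFX9 the GS wave id is packed into bits [23:16] of the merged
	 * wave info SGPR; older chips pass it in a dedicated SGPR. */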
2939 if (ctx->screen->b.chip_class >= GFX9)
2940 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2941 else
2942 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2943 }
2944
2945 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2946 {
2947 struct si_shader_context *ctx = si_shader_context(bld_base);
2948
2949 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2950 si_get_gs_wave_id(ctx));
2951 }
2952
2953 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2954 {
2955 struct si_shader_context *ctx = si_shader_context(bld_base);
2956 struct gallivm_state *gallivm = &ctx->gallivm;
2957 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2958 struct si_shader_output_values *outputs = NULL;
2959 int i,j;
2960
2961 assert(!ctx->shader->is_gs_copy_shader);
2962
2963 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2964
2965 /* Vertex color clamping.
2966 *
2967 * This uses a state constant loaded in a user data SGPR and
2968 * an IF statement is added that clamps all colors if the constant
2969 * is true.
2970 */
2971 if (ctx->type == PIPE_SHADER_VERTEX) {
2972 struct lp_build_if_state if_ctx;
2973 LLVMValueRef cond = NULL;
2974 LLVMValueRef addr, val;
2975
2976 for (i = 0; i < info->num_outputs; i++) {
2977 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2978 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2979 continue;
2980
2981 /* We've found a color. */
2982 if (!cond) {
2983 /* The state is in the first bit of the user SGPR. */
2984 cond = LLVMGetParam(ctx->main_fn,
2985 ctx->param_vs_state_bits);
2986 cond = LLVMBuildTrunc(gallivm->builder, cond,
2987 ctx->i1, "");
2988 lp_build_if(&if_ctx, gallivm, cond);
2989 }
2990
2991 for (j = 0; j < 4; j++) {
2992 addr = ctx->outputs[i][j];
2993 val = LLVMBuildLoad(gallivm->builder, addr, "");
2994 val = ac_build_clamp(&ctx->ac, val);
2995 LLVMBuildStore(gallivm->builder, val, addr);
2996 }
2997 }
2998
2999 if (cond)
3000 lp_build_endif(&if_ctx);
3001 }
3002
3003 for (i = 0; i < info->num_outputs; i++) {
3004 outputs[i].semantic_name = info->output_semantic_name[i];
3005 outputs[i].semantic_index = info->output_semantic_index[i];
3006
3007 for (j = 0; j < 4; j++) {
3008 outputs[i].values[j] =
3009 LLVMBuildLoad(gallivm->builder,
3010 ctx->outputs[i][j],
3011 "");
3012 outputs[i].vertex_stream[j] =
3013 (info->output_streams[i] >> (2 * j)) & 3;
3014 }
3015 }
3016
3017 if (ctx->shader->selector->so.num_outputs)
3018 si_llvm_emit_streamout(ctx, outputs, i, 0);
3019
3020 /* Export PrimitiveID. */
3021 if (ctx->shader->key.mono.vs_export_prim_id) {
3022 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3023 outputs[i].semantic_index = 0;
3024 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
3025 get_primitive_id(bld_base, 0));
3026 for (j = 1; j < 4; j++)
3027 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3028
3029 memset(outputs[i].vertex_stream, 0,
3030 sizeof(outputs[i].vertex_stream));
3031 i++;
3032 }
3033
3034 si_llvm_export_vs(bld_base, outputs, i);
3035 FREE(outputs);
3036 }
3037
3038 struct si_ps_exports {
3039 unsigned num;
3040 struct ac_export_args args[10];
3041 };
3042
3043 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3044 bool writes_samplemask)
3045 {
3046 if (writes_z) {
3047 /* Z needs 32 bits. */
3048 if (writes_samplemask)
3049 return V_028710_SPI_SHADER_32_ABGR;
3050 else if (writes_stencil)
3051 return V_028710_SPI_SHADER_32_GR;
3052 else
3053 return V_028710_SPI_SHADER_32_R;
3054 } else if (writes_stencil || writes_samplemask) {
3055 /* Both stencil and sample mask need only 16 bits. */
3056 return V_028710_SPI_SHADER_UINT16_ABGR;
3057 } else {
3058 return V_028710_SPI_SHADER_ZERO;
3059 }
3060 }
3061
3062 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3063 LLVMValueRef depth, LLVMValueRef stencil,
3064 LLVMValueRef samplemask, struct si_ps_exports *exp)
3065 {
3066 struct si_shader_context *ctx = si_shader_context(bld_base);
3067 struct lp_build_context *base = &bld_base->base;
3068 struct ac_export_args args;
3069 unsigned mask = 0;
3070 unsigned format = si_get_spi_shader_z_format(depth != NULL,
3071 stencil != NULL,
3072 samplemask != NULL);
3073
3074 assert(depth || stencil || samplemask);
3075
3076 args.valid_mask = 1; /* whether the EXEC mask is valid */
3077 args.done = 1; /* DONE bit */
3078
3079 /* Specify the target we are exporting */
3080 args.target = V_008DFC_SQ_EXP_MRTZ;
3081
3082 	args.compr = 0; /* COMPR flag */
3083 args.out[0] = base->undef; /* R, depth */
3084 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3085 args.out[2] = base->undef; /* B, sample mask */
3086 args.out[3] = base->undef; /* A, alpha to mask */
3087
3088 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3089 assert(!depth);
3090 args.compr = 1; /* COMPR flag */
3091
3092 if (stencil) {
3093 /* Stencil should be in X[23:16]. */
3094 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3095 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3096 LLVMConstInt(ctx->i32, 16, 0), "");
3097 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3098 mask |= 0x3;
3099 }
3100 if (samplemask) {
3101 /* SampleMask should be in Y[15:0]. */
3102 args.out[1] = samplemask;
3103 mask |= 0xc;
3104 }
3105 } else {
3106 if (depth) {
3107 args.out[0] = depth;
3108 mask |= 0x1;
3109 }
3110 if (stencil) {
3111 args.out[1] = stencil;
3112 mask |= 0x2;
3113 }
3114 if (samplemask) {
3115 args.out[2] = samplemask;
3116 mask |= 0x4;
3117 }
3118 }
3119
3120 	/* SI (except OLAND and HAINAN) has a bug where it only looks
3121 	 * at the X writemask component, so X must always be enabled. */
3122 if (ctx->screen->b.chip_class == SI &&
3123 ctx->screen->b.family != CHIP_OLAND &&
3124 ctx->screen->b.family != CHIP_HAINAN)
3125 mask |= 0x1;
3126
3127 /* Specify which components to enable */
3128 args.enabled_channels = mask;
3129
3130 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3131 }
3132
3133 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3134 LLVMValueRef *color, unsigned index,
3135 unsigned samplemask_param,
3136 bool is_last, struct si_ps_exports *exp)
3137 {
3138 struct si_shader_context *ctx = si_shader_context(bld_base);
3139 struct lp_build_context *base = &bld_base->base;
3140 int i;
3141
3142 /* Clamp color */
3143 if (ctx->shader->key.part.ps.epilog.clamp_color)
3144 for (i = 0; i < 4; i++)
3145 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3146
3147 /* Alpha to one */
3148 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3149 color[3] = base->one;
3150
3151 /* Alpha test */
3152 if (index == 0 &&
3153 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3154 si_alpha_test(bld_base, color[3]);
3155
3156 /* Line & polygon smoothing */
3157 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3158 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3159 samplemask_param);
3160
3161 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3162 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3163 struct ac_export_args args[8];
3164 int c, last = -1;
3165
3166 /* Get the export arguments, also find out what the last one is. */
3167 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3168 si_llvm_init_export_args(bld_base, color,
3169 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3170 if (args[c].enabled_channels)
3171 last = c;
3172 }
3173
3174 /* Emit all exports. */
3175 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3176 if (is_last && last == c) {
3177 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3178 args[c].done = 1; /* DONE bit */
3179 } else if (!args[c].enabled_channels)
3180 continue; /* unnecessary NULL export */
3181
3182 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3183 }
3184 } else {
3185 struct ac_export_args args;
3186
3187 /* Export */
3188 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3189 &args);
3190 if (is_last) {
3191 args.valid_mask = 1; /* whether the EXEC mask is valid */
3192 args.done = 1; /* DONE bit */
3193 } else if (!args.enabled_channels)
3194 return; /* unnecessary NULL export */
3195
3196 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3197 }
3198 }
3199
3200 static void si_emit_ps_exports(struct si_shader_context *ctx,
3201 struct si_ps_exports *exp)
3202 {
3203 for (unsigned i = 0; i < exp->num; i++)
3204 ac_build_export(&ctx->ac, &exp->args[i]);
3205 }
3206
3207 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3208 {
3209 struct si_shader_context *ctx = si_shader_context(bld_base);
3210 struct lp_build_context *base = &bld_base->base;
3211 struct ac_export_args args;
3212
3213 args.enabled_channels = 0x0; /* enabled channels */
3214 args.valid_mask = 1; /* whether the EXEC mask is valid */
3215 args.done = 1; /* DONE bit */
3216 args.target = V_008DFC_SQ_EXP_NULL;
3217 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3218 args.out[0] = base->undef; /* R */
3219 args.out[1] = base->undef; /* G */
3220 args.out[2] = base->undef; /* B */
3221 args.out[3] = base->undef; /* A */
3222
3223 ac_build_export(&ctx->ac, &args);
3224 }
3225
3226 /**
3227 * Return PS outputs in this order:
3228 *
3229 * v[0:3] = color0.xyzw
3230 * v[4:7] = color1.xyzw
3231 * ...
3232 * vN+0 = Depth
3233 * vN+1 = Stencil
3234 * vN+2 = SampleMask
3235 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3236 *
3237 * The alpha-ref SGPR is returned via its original location.
3238 */
3239 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3240 {
3241 struct si_shader_context *ctx = si_shader_context(bld_base);
3242 struct si_shader *shader = ctx->shader;
3243 struct tgsi_shader_info *info = &shader->selector->info;
3244 LLVMBuilderRef builder = ctx->gallivm.builder;
3245 unsigned i, j, first_vgpr, vgpr;
3246
3247 LLVMValueRef color[8][4] = {};
3248 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3249 LLVMValueRef ret;
3250
3251 /* Read the output values. */
3252 for (i = 0; i < info->num_outputs; i++) {
3253 unsigned semantic_name = info->output_semantic_name[i];
3254 unsigned semantic_index = info->output_semantic_index[i];
3255
3256 switch (semantic_name) {
3257 case TGSI_SEMANTIC_COLOR:
3258 assert(semantic_index < 8);
3259 for (j = 0; j < 4; j++) {
3260 LLVMValueRef ptr = ctx->outputs[i][j];
3261 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3262 color[semantic_index][j] = result;
3263 }
3264 break;
3265 case TGSI_SEMANTIC_POSITION:
3266 depth = LLVMBuildLoad(builder,
3267 ctx->outputs[i][2], "");
3268 break;
3269 case TGSI_SEMANTIC_STENCIL:
3270 stencil = LLVMBuildLoad(builder,
3271 ctx->outputs[i][1], "");
3272 break;
3273 case TGSI_SEMANTIC_SAMPLEMASK:
3274 samplemask = LLVMBuildLoad(builder,
3275 ctx->outputs[i][0], "");
3276 break;
3277 default:
3278 			fprintf(stderr, "Warning: SI unhandled fs output type: %d\n",
3279 semantic_name);
3280 }
3281 }
3282
3283 /* Fill the return structure. */
3284 ret = ctx->return_value;
3285
3286 /* Set SGPRs. */
3287 ret = LLVMBuildInsertValue(builder, ret,
3288 bitcast(bld_base, TGSI_TYPE_SIGNED,
3289 LLVMGetParam(ctx->main_fn,
3290 SI_PARAM_ALPHA_REF)),
3291 SI_SGPR_ALPHA_REF, "");
3292
3293 /* Set VGPRs */
3294 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3295 for (i = 0; i < ARRAY_SIZE(color); i++) {
3296 if (!color[i][0])
3297 continue;
3298
3299 for (j = 0; j < 4; j++)
3300 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3301 }
3302 if (depth)
3303 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3304 if (stencil)
3305 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3306 if (samplemask)
3307 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3308
3309 /* Add the input sample mask for smoothing at the end. */
3310 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3311 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3312 ret = LLVMBuildInsertValue(builder, ret,
3313 LLVMGetParam(ctx->main_fn,
3314 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3315
3316 ctx->return_value = ret;
3317 }
3318
3319 /**
3320 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3321 * buffer in number of elements and return it as an i32.
3322 */
3323 static LLVMValueRef get_buffer_size(
3324 struct lp_build_tgsi_context *bld_base,
3325 LLVMValueRef descriptor)
3326 {
3327 struct si_shader_context *ctx = si_shader_context(bld_base);
3328 struct gallivm_state *gallivm = &ctx->gallivm;
3329 LLVMBuilderRef builder = gallivm->builder;
3330 LLVMValueRef size =
3331 LLVMBuildExtractElement(builder, descriptor,
3332 LLVMConstInt(ctx->i32, 2, 0), "");
3333
3334 if (ctx->screen->b.chip_class == VI) {
3335 /* On VI, the descriptor contains the size in bytes,
3336 * but TXQ must return the size in elements.
3337 * The stride is always non-zero for resources using TXQ.
3338 */
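		/* For instance, size = 256 bytes with a 16-byte stride taken
		 * from dword1[29:16] yields 256/16 = 16 elements. */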
3339 LLVMValueRef stride =
3340 LLVMBuildExtractElement(builder, descriptor,
3341 ctx->i32_1, "");
3342 stride = LLVMBuildLShr(builder, stride,
3343 LLVMConstInt(ctx->i32, 16, 0), "");
3344 stride = LLVMBuildAnd(builder, stride,
3345 LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3346
3347 size = LLVMBuildUDiv(builder, size, stride, "");
3348 }
3349
3350 return size;
3351 }
3352
3353 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3354 struct lp_build_tgsi_context *bld_base,
3355 struct lp_build_emit_data *emit_data);
3356
3357 /* Prevent optimizations (at least of memory accesses) across the current
3358 * point in the program by emitting empty inline assembly that is marked as
3359 * having side effects.
3360 *
3361 * Optionally, a value can be passed through the inline assembly to prevent
3362 * LLVM from hoisting calls to ReadNone functions.
3363 */
3364 static void emit_optimization_barrier(struct si_shader_context *ctx,
3365 LLVMValueRef *pvgpr)
3366 {
3367 static int counter = 0;
3368
3369 LLVMBuilderRef builder = ctx->gallivm.builder;
3370 char code[16];
3371
3372 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3373
3374 if (!pvgpr) {
3375 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3376 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3377 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3378 } else {
3379 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3380 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3381 LLVMValueRef vgpr = *pvgpr;
3382 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3383 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3384 LLVMValueRef vgpr0;
3385
3386 assert(vgpr_size % 4 == 0);
3387
3388 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3389 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3390 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3391 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3392 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3393
3394 *pvgpr = vgpr;
3395 }
3396 }
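/* Illustrative IR for the two forms above (a sketch, not taken from
 * actual compiler output): the void form emits
 *     call void asm sideeffect "; 42", ""()
 * and the VGPR form threads the value through the asm, so LLVM cannot
 * hoist a ReadNone call past this point:
 *     %v = call i32 asm sideeffect "; 42", "=v,0"(i32 %v)
 */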
3397
3398 /* Combine these with & instead of |. */
3399 #define NOOP_WAITCNT 0xf7f
3400 #define LGKM_CNT 0x07f
3401 #define VM_CNT 0xf70
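/* A sketch of the simm16 encoding these masks assume (SI..VI):
 * vmcnt in bits [3:0], expcnt in [6:4], lgkmcnt in [11:8]. A cleared
 * field asks the hardware to wait until that counter drains, hence
 * the & combination, e.g. VM_CNT & LGKM_CNT = 0x070 waits for both
 * vmcnt == 0 and lgkmcnt == 0.
 */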
3402
3403 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3404 {
3405 struct gallivm_state *gallivm = &ctx->gallivm;
3406 LLVMBuilderRef builder = gallivm->builder;
3407 LLVMValueRef args[1] = {
3408 LLVMConstInt(ctx->i32, simm16, 0)
3409 };
3410 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3411 ctx->voidt, args, 1, 0);
3412 }
3413
3414 static void membar_emit(
3415 const struct lp_build_tgsi_action *action,
3416 struct lp_build_tgsi_context *bld_base,
3417 struct lp_build_emit_data *emit_data)
3418 {
3419 struct si_shader_context *ctx = si_shader_context(bld_base);
3420 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3421 unsigned flags = LLVMConstIntGetZExtValue(src0);
3422 unsigned waitcnt = NOOP_WAITCNT;
3423
3424 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3425 waitcnt &= VM_CNT & LGKM_CNT;
3426
3427 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3428 TGSI_MEMBAR_SHADER_BUFFER |
3429 TGSI_MEMBAR_SHADER_IMAGE))
3430 waitcnt &= VM_CNT;
3431
3432 if (flags & TGSI_MEMBAR_SHARED)
3433 waitcnt &= LGKM_CNT;
3434
3435 if (waitcnt != NOOP_WAITCNT)
3436 emit_waitcnt(ctx, waitcnt);
3437 }
3438
3439 static void clock_emit(
3440 const struct lp_build_tgsi_action *action,
3441 struct lp_build_tgsi_context *bld_base,
3442 struct lp_build_emit_data *emit_data)
3443 {
3444 struct si_shader_context *ctx = si_shader_context(bld_base);
3445 struct gallivm_state *gallivm = &ctx->gallivm;
3446 LLVMValueRef tmp;
3447
3448 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3449 ctx->i64, NULL, 0, 0);
3450 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3451
3452 emit_data->output[0] =
3453 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3454 emit_data->output[1] =
3455 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3456 }
3457
3458 static LLVMValueRef
3459 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3460 const struct tgsi_full_src_register *reg)
3461 {
3462 LLVMValueRef index;
3463 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3464 ctx->param_shader_buffers);
3465
3466 if (!reg->Register.Indirect)
3467 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3468 else
3469 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3470 reg->Register.Index,
3471 SI_NUM_SHADER_BUFFERS);
3472
3473 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3474 }
3475
3476 static bool tgsi_is_array_sampler(unsigned target)
3477 {
3478 return target == TGSI_TEXTURE_1D_ARRAY ||
3479 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3480 target == TGSI_TEXTURE_2D_ARRAY ||
3481 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3482 target == TGSI_TEXTURE_CUBE_ARRAY ||
3483 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3484 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3485 }
3486
3487 static bool tgsi_is_array_image(unsigned target)
3488 {
3489 return target == TGSI_TEXTURE_3D ||
3490 target == TGSI_TEXTURE_CUBE ||
3491 target == TGSI_TEXTURE_1D_ARRAY ||
3492 target == TGSI_TEXTURE_2D_ARRAY ||
3493 target == TGSI_TEXTURE_CUBE_ARRAY ||
3494 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3495 }
3496
3497 /**
3498 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3499 *
3500 * At least on Tonga, executing image stores on images with DCC enabled and
3501 * in a non-trivial state can eventually lead to lockups. This can occur when an
3502 * application binds an image as read-only but then uses a shader that writes
3503 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3504 * program termination) in this case, but it doesn't cost much to be a bit
3505 * nicer: disabling DCC in the shader still leads to undefined results but
3506 * avoids the lockup.
3507 */
3508 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3509 LLVMValueRef rsrc)
3510 {
3511 if (ctx->screen->b.chip_class <= CIK) {
3512 return rsrc;
3513 } else {
3514 LLVMBuilderRef builder = ctx->gallivm.builder;
3515 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3516 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3517 LLVMValueRef tmp;
3518
3519 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3520 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3521 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3522 }
3523 }
3524
3525 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3526 {
3527 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3528 CONST_ADDR_SPACE);
3529 }
3530
3531 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3532 LLVMValueRef list, LLVMValueRef index,
3533 unsigned target)
3534 {
3535 LLVMBuilderRef builder = ctx->gallivm.builder;
3536
3537 if (target == TGSI_TEXTURE_BUFFER) {
3538 index = LLVMBuildMul(builder, index,
3539 LLVMConstInt(ctx->i32, 2, 0), "");
3540 index = LLVMBuildAdd(builder, index,
3541 ctx->i32_1, "");
3542 list = LLVMBuildPointerCast(builder, list,
3543 const_array(ctx->v4i32, 0), "");
3544 }
3545
3546 return ac_build_indexed_load_const(&ctx->ac, list, index);
3547 }
3548
3549 /**
3550 * Load the resource descriptor for \p image.
3551 */
3552 static void
3553 image_fetch_rsrc(
3554 struct lp_build_tgsi_context *bld_base,
3555 const struct tgsi_full_src_register *image,
3556 bool is_store, unsigned target,
3557 LLVMValueRef *rsrc)
3558 {
3559 struct si_shader_context *ctx = si_shader_context(bld_base);
3560 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3561 ctx->param_images);
3562 LLVMValueRef index;
3563 bool dcc_off = is_store;
3564
3565 assert(image->Register.File == TGSI_FILE_IMAGE);
3566
3567 if (!image->Register.Indirect) {
3568 const struct tgsi_shader_info *info = bld_base->info;
3569 unsigned images_writemask = info->images_store |
3570 info->images_atomic;
3571
3572 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3573
3574 if (images_writemask & (1 << image->Register.Index))
3575 dcc_off = true;
3576 } else {
3577 /* From the GL_ARB_shader_image_load_store extension spec:
3578 *
3579 * If a shader performs an image load, store, or atomic
3580 * operation using an image variable declared as an array,
3581 * and if the index used to select an individual element is
3582 * negative or greater than or equal to the size of the
3583 * array, the results of the operation are undefined but may
3584 * not lead to termination.
3585 */
3586 index = get_bounded_indirect_index(ctx, &image->Indirect,
3587 image->Register.Index,
3588 SI_NUM_IMAGES);
3589 }
3590
3591 *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3592 if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3593 *rsrc = force_dcc_off(ctx, *rsrc);
3594 }
3595
3596 static LLVMValueRef image_fetch_coords(
3597 struct lp_build_tgsi_context *bld_base,
3598 const struct tgsi_full_instruction *inst,
3599 unsigned src, LLVMValueRef desc)
3600 {
3601 struct si_shader_context *ctx = si_shader_context(bld_base);
3602 struct gallivm_state *gallivm = &ctx->gallivm;
3603 LLVMBuilderRef builder = gallivm->builder;
3604 unsigned target = inst->Memory.Texture;
3605 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3606 LLVMValueRef coords[4];
3607 LLVMValueRef tmp;
3608 int chan;
3609
3610 for (chan = 0; chan < num_coords; ++chan) {
3611 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3612 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3613 coords[chan] = tmp;
3614 }
3615
3616 if (ctx->screen->b.chip_class >= GFX9) {
3617 /* 1D textures are allocated and used as 2D on GFX9. */
3618 if (target == TGSI_TEXTURE_1D) {
3619 coords[1] = ctx->i32_0;
3620 num_coords++;
3621 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3622 coords[2] = coords[1];
3623 coords[1] = ctx->i32_0;
3624 num_coords++;
3625 } else if (target == TGSI_TEXTURE_2D) {
3626 /* The hw can't bind a slice of a 3D image as a 2D
3627 * image, because it ignores BASE_ARRAY if the target
3628 * is 3D. The workaround is to read BASE_ARRAY and set
3629 * it as the 3rd address operand for all 2D images.
3630 */
3631 LLVMValueRef first_layer, const5, mask;
3632
3633 const5 = LLVMConstInt(ctx->i32, 5, 0);
3634 mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3635 first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3636 first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3637
3638 coords[2] = first_layer;
3639 num_coords++;
3640 }
3641 }
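/* E.g. on GFX9 a 1D array access with coords (x, layer) is issued
 * with the 2D array coords (x, 0, layer) after the remapping above.
 */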
3642
3643 if (num_coords == 1)
3644 return coords[0];
3645
3646 if (num_coords == 3) {
3647 /* LLVM has difficulties lowering 3-element vectors. */
3648 coords[3] = bld_base->uint_bld.undef;
3649 num_coords = 4;
3650 }
3651
3652 return lp_build_gather_values(gallivm, coords, num_coords);
3653 }
3654
3655 /**
3656 * Append the extra mode bits that are used by image load and store.
3657 */
3658 static void image_append_args(
3659 struct si_shader_context *ctx,
3660 struct lp_build_emit_data * emit_data,
3661 unsigned target,
3662 bool atomic,
3663 bool force_glc)
3664 {
3665 const struct tgsi_full_instruction *inst = emit_data->inst;
3666 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3667 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3668 LLVMValueRef r128 = i1false;
3669 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3670 LLVMValueRef glc =
3671 force_glc ||
3672 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3673 i1true : i1false;
3674 LLVMValueRef slc = i1false;
3675 LLVMValueRef lwe = i1false;
3676
3677 if (atomic || (HAVE_LLVM <= 0x0309)) {
3678 emit_data->args[emit_data->arg_count++] = r128;
3679 emit_data->args[emit_data->arg_count++] = da;
3680 if (!atomic) {
3681 emit_data->args[emit_data->arg_count++] = glc;
3682 }
3683 emit_data->args[emit_data->arg_count++] = slc;
3684 return;
3685 }
3686
3687 /* HAVE_LLVM >= 0x0400 */
3688 emit_data->args[emit_data->arg_count++] = glc;
3689 emit_data->args[emit_data->arg_count++] = slc;
3690 emit_data->args[emit_data->arg_count++] = lwe;
3691 emit_data->args[emit_data->arg_count++] = da;
3692 }
3693
3694 /**
3695 * Append the resource and indexing arguments for buffer intrinsics.
3696 *
3697 * \param rsrc the v4i32 buffer resource
3698 * \param index index into the buffer (stride-based)
3699 * \param offset byte offset into the buffer
3700 */
3701 static void buffer_append_args(
3702 struct si_shader_context *ctx,
3703 struct lp_build_emit_data *emit_data,
3704 LLVMValueRef rsrc,
3705 LLVMValueRef index,
3706 LLVMValueRef offset,
3707 bool atomic,
3708 bool force_glc)
3709 {
3710 const struct tgsi_full_instruction *inst = emit_data->inst;
3711 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3712 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3713
3714 emit_data->args[emit_data->arg_count++] = rsrc;
3715 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3716 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3717 if (!atomic) {
3718 emit_data->args[emit_data->arg_count++] =
3719 force_glc ||
3720 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3721 i1true : i1false; /* glc */
3722 }
3723 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3724 }
3725
3726 static void load_fetch_args(
3727 struct lp_build_tgsi_context * bld_base,
3728 struct lp_build_emit_data * emit_data)
3729 {
3730 struct si_shader_context *ctx = si_shader_context(bld_base);
3731 struct gallivm_state *gallivm = &ctx->gallivm;
3732 const struct tgsi_full_instruction * inst = emit_data->inst;
3733 unsigned target = inst->Memory.Texture;
3734 LLVMValueRef rsrc;
3735
3736 emit_data->dst_type = ctx->v4f32;
3737
3738 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3739 LLVMBuilderRef builder = gallivm->builder;
3740 LLVMValueRef offset;
3741 LLVMValueRef tmp;
3742
3743 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3744
3745 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3746 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3747
3748 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3749 offset, false, false);
3750 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3751 LLVMValueRef coords;
3752
3753 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3754 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3755
3756 if (target == TGSI_TEXTURE_BUFFER) {
3757 buffer_append_args(ctx, emit_data, rsrc, coords,
3758 ctx->i32_0, false, false);
3759 } else {
3760 emit_data->args[0] = coords;
3761 emit_data->args[1] = rsrc;
3762 emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3763 emit_data->arg_count = 3;
3764
3765 image_append_args(ctx, emit_data, target, false, false);
3766 }
3767 }
3768 }
3769
3770 static unsigned get_load_intr_attribs(bool readonly_memory)
3771 {
3772 /* READNONE means no writes can affect the result, so the call may be
3773 * reordered freely; READONLY only means the intrinsic itself doesn't write. */
3774 return readonly_memory && HAVE_LLVM >= 0x0400 ?
3775 LP_FUNC_ATTR_READNONE :
3776 LP_FUNC_ATTR_READONLY;
3777 }
3778
3779 static unsigned get_store_intr_attribs(bool writeonly_memory)
3780 {
3781 return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3782 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3783 LP_FUNC_ATTR_WRITEONLY;
3784 }
3785
3786 static void load_emit_buffer(struct si_shader_context *ctx,
3787 struct lp_build_emit_data *emit_data,
3788 bool readonly_memory)
3789 {
3790 const struct tgsi_full_instruction *inst = emit_data->inst;
3791 struct gallivm_state *gallivm = &ctx->gallivm;
3792 LLVMBuilderRef builder = gallivm->builder;
3793 uint writemask = inst->Dst[0].Register.WriteMask;
3794 uint count = util_last_bit(writemask);
3795 const char *intrinsic_name;
3796 LLVMTypeRef dst_type;
3797
3798 switch (count) {
3799 case 1:
3800 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3801 dst_type = ctx->f32;
3802 break;
3803 case 2:
3804 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3805 dst_type = LLVMVectorType(ctx->f32, 2);
3806 break;
3807 default: // 3 & 4
3808 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3809 dst_type = ctx->v4f32;
3810 count = 4;
3811 }
3812
3813 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3814 builder, intrinsic_name, dst_type,
3815 emit_data->args, emit_data->arg_count,
3816 get_load_intr_attribs(readonly_memory));
3817 }
3818
3819 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3820 const struct tgsi_full_instruction *inst,
3821 LLVMTypeRef type, int arg)
3822 {
3823 struct gallivm_state *gallivm = &ctx->gallivm;
3824 LLVMBuilderRef builder = gallivm->builder;
3825 LLVMValueRef offset, ptr;
3826 int addr_space;
3827
3828 offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3829 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3830
3831 ptr = ctx->shared_memory;
3832 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3833 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3834 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3835
3836 return ptr;
3837 }
3838
3839 static void load_emit_memory(
3840 struct si_shader_context *ctx,
3841 struct lp_build_emit_data *emit_data)
3842 {
3843 const struct tgsi_full_instruction *inst = emit_data->inst;
3844 struct gallivm_state *gallivm = &ctx->gallivm;
3845 LLVMBuilderRef builder = gallivm->builder;
3846 unsigned writemask = inst->Dst[0].Register.WriteMask;
3847 LLVMValueRef channels[4], ptr, derived_ptr, index;
3848 int chan;
3849
3850 ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3851
3852 for (chan = 0; chan < 4; ++chan) {
3853 if (!(writemask & (1 << chan))) {
3854 channels[chan] = LLVMGetUndef(ctx->f32);
3855 continue;
3856 }
3857
3858 index = LLVMConstInt(ctx->i32, chan, 0);
3859 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3860 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3861 }
3862 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3863 }
3864
3865 /**
3866 * Return true if the memory accessed by a LOAD or STORE instruction is
3867 * read-only or write-only, respectively.
3868 *
3869 * \param shader_buffers_reverse_access_mask
3870 * For LOAD, set this to (store | atomic) slot usage in the shader.
3871 * For STORE, set this to (load | atomic) slot usage in the shader.
3872 * \param images_reverse_access_mask Same as above, but for images.
3873 */
3874 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3875 const struct tgsi_shader_info *info,
3876 unsigned shader_buffers_reverse_access_mask,
3877 unsigned images_reverse_access_mask)
3878 {
3879 /* RESTRICT means NOALIAS.
3880 * If there are no writes, we can assume the accessed memory is read-only.
3881 * If there are no reads, we can assume the accessed memory is write-only.
3882 */
3883 if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3884 unsigned reverse_access_mask;
3885
3886 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3887 reverse_access_mask = shader_buffers_reverse_access_mask;
3888 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3889 reverse_access_mask = info->images_buffers &
3890 images_reverse_access_mask;
3891 } else {
3892 reverse_access_mask = ~info->images_buffers &
3893 images_reverse_access_mask;
3894 }
3895
3896 if (inst->Src[0].Register.Indirect) {
3897 if (!reverse_access_mask)
3898 return true;
3899 } else {
3900 if (!(reverse_access_mask &
3901 (1u << inst->Src[0].Register.Index)))
3902 return true;
3903 }
3904 }
3905
3906 /* If there are no buffer writes (for both shader buffers & image
3907 * buffers), it implies that buffer memory is read-only.
3908 * If there are no buffer reads (for both shader buffers & image
3909 * buffers), it implies that buffer memory is write-only.
3910 *
3911 * Same for the case when there are no writes/reads for non-buffer
3912 * images.
3913 */
3914 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3915 (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3916 inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3917 if (!shader_buffers_reverse_access_mask &&
3918 !(info->images_buffers & images_reverse_access_mask))
3919 return true;
3920 } else {
3921 if (!(~info->images_buffers & images_reverse_access_mask))
3922 return true;
3923 }
3924 return false;
3925 }
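/* Worked example (hypothetical shader): a LOAD from buffer slot 3
 * declared "restrict" is proven read-only iff bit 3 is clear in
 * shader_buffers_reverse_access_mask, i.e. no store or atomic in the
 * shader touches that slot.
 */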
3926
3927 static void load_emit(
3928 const struct lp_build_tgsi_action *action,
3929 struct lp_build_tgsi_context *bld_base,
3930 struct lp_build_emit_data *emit_data)
3931 {
3932 struct si_shader_context *ctx = si_shader_context(bld_base);
3933 struct gallivm_state *gallivm = &ctx->gallivm;
3934 LLVMBuilderRef builder = gallivm->builder;
3935 const struct tgsi_full_instruction * inst = emit_data->inst;
3936 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3937 char intrinsic_name[64];
3938 bool readonly_memory = false;
3939
3940 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3941 load_emit_memory(ctx, emit_data);
3942 return;
3943 }
3944
3945 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3946 emit_waitcnt(ctx, VM_CNT);
3947
3948 readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3949 is_oneway_access_only(inst, info,
3950 info->shader_buffers_store |
3951 info->shader_buffers_atomic,
3952 info->images_store |
3953 info->images_atomic);
3954
3955 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3956 load_emit_buffer(ctx, emit_data, readonly_memory);
3957 return;
3958 }
3959
3960 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3961 emit_data->output[emit_data->chan] =
3962 lp_build_intrinsic(
3963 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3964 emit_data->args, emit_data->arg_count,
3965 get_load_intr_attribs(readonly_memory));
3966 } else {
3967 ac_get_image_intr_name("llvm.amdgcn.image.load",
3968 emit_data->dst_type, /* vdata */
3969 LLVMTypeOf(emit_data->args[0]), /* coords */
3970 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3971 intrinsic_name, sizeof(intrinsic_name));
3972
3973 emit_data->output[emit_data->chan] =
3974 lp_build_intrinsic(
3975 builder, intrinsic_name, emit_data->dst_type,
3976 emit_data->args, emit_data->arg_count,
3977 get_load_intr_attribs(readonly_memory));
3978 }
3979 }
3980
3981 static void store_fetch_args(
3982 struct lp_build_tgsi_context * bld_base,
3983 struct lp_build_emit_data * emit_data)
3984 {
3985 struct si_shader_context *ctx = si_shader_context(bld_base);
3986 struct gallivm_state *gallivm = &ctx->gallivm;
3987 LLVMBuilderRef builder = gallivm->builder;
3988 const struct tgsi_full_instruction * inst = emit_data->inst;
3989 struct tgsi_full_src_register memory;
3990 LLVMValueRef chans[4];
3991 LLVMValueRef data;
3992 LLVMValueRef rsrc;
3993 unsigned chan;
3994
3995 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3996
3997 for (chan = 0; chan < 4; ++chan) {
3998 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3999 }
4000 data = lp_build_gather_values(gallivm, chans, 4);
4001
4002 emit_data->args[emit_data->arg_count++] = data;
4003
4004 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
4005
4006 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4007 LLVMValueRef offset;
4008 LLVMValueRef tmp;
4009
4010 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
4011
4012 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
4013 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4014
4015 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4016 offset, false, false);
4017 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
4018 unsigned target = inst->Memory.Texture;
4019 LLVMValueRef coords;
4020
4021 /* 8bit/16bit TC L1 write corruption bug on SI.
4022 * All store opcodes not aligned to a dword are affected.
4023 *
4024 * The only way to get unaligned stores in radeonsi is through
4025 * shader images.
4026 */
4027 bool force_glc = ctx->screen->b.chip_class == SI;
4028
4029 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
4030 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
4031
4032 if (target == TGSI_TEXTURE_BUFFER) {
4033 buffer_append_args(ctx, emit_data, rsrc, coords,
4034 ctx->i32_0, false, force_glc);
4035 } else {
4036 emit_data->args[1] = coords;
4037 emit_data->args[2] = rsrc;
4038 emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
4039 emit_data->arg_count = 4;
4040
4041 image_append_args(ctx, emit_data, target, false, force_glc);
4042 }
4043 }
4044 }
4045
4046 static void store_emit_buffer(
4047 struct si_shader_context *ctx,
4048 struct lp_build_emit_data *emit_data,
4049 bool writeonly_memory)
4050 {
4051 const struct tgsi_full_instruction *inst = emit_data->inst;
4052 struct gallivm_state *gallivm = &ctx->gallivm;
4053 LLVMBuilderRef builder = gallivm->builder;
4054 LLVMValueRef base_data = emit_data->args[0];
4055 LLVMValueRef base_offset = emit_data->args[3];
4056 unsigned writemask = inst->Dst[0].Register.WriteMask;
4057
4058 while (writemask) {
4059 int start, count;
4060 const char *intrinsic_name;
4061 LLVMValueRef data;
4062 LLVMValueRef offset;
4063 LLVMValueRef tmp;
4064
4065 u_bit_scan_consecutive_range(&writemask, &start, &count);
4066
4067 /* Due to an LLVM limitation, split 3-element writes
4068 * into a 2-element and a 1-element write. */
4069 if (count == 3) {
4070 writemask |= 1 << (start + 2);
4071 count = 2;
4072 }
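/* E.g. an xyz write (writemask 0b0111) becomes a v2f32 store of xy
 * at +0 bytes and, on the next loop iteration, an f32 store of z at
 * +8 bytes.
 */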
4073
4074 if (count == 4) {
4075 data = base_data;
4076 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
4077 } else if (count == 2) {
4078 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
4079
4080 tmp = LLVMBuildExtractElement(
4081 builder, base_data,
4082 LLVMConstInt(ctx->i32, start, 0), "");
4083 data = LLVMBuildInsertElement(
4084 builder, LLVMGetUndef(v2f32), tmp,
4085 ctx->i32_0, "");
4086
4087 tmp = LLVMBuildExtractElement(
4088 builder, base_data,
4089 LLVMConstInt(ctx->i32, start + 1, 0), "");
4090 data = LLVMBuildInsertElement(
4091 builder, data, tmp, ctx->i32_1, "");
4092
4093 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
4094 } else {
4095 assert(count == 1);
4096 data = LLVMBuildExtractElement(
4097 builder, base_data,
4098 LLVMConstInt(ctx->i32, start, 0), "");
4099 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
4100 }
4101
4102 offset = base_offset;
4103 if (start != 0) {
4104 offset = LLVMBuildAdd(
4105 builder, offset,
4106 LLVMConstInt(ctx->i32, start * 4, 0), "");
4107 }
4108
4109 emit_data->args[0] = data;
4110 emit_data->args[3] = offset;
4111
4112 lp_build_intrinsic(
4113 builder, intrinsic_name, emit_data->dst_type,
4114 emit_data->args, emit_data->arg_count,
4115 get_store_intr_attribs(writeonly_memory));
4116 }
4117 }
4118
4119 static void store_emit_memory(
4120 struct si_shader_context *ctx,
4121 struct lp_build_emit_data *emit_data)
4122 {
4123 const struct tgsi_full_instruction *inst = emit_data->inst;
4124 struct gallivm_state *gallivm = &ctx->gallivm;
4125 LLVMBuilderRef builder = gallivm->builder;
4126 unsigned writemask = inst->Dst[0].Register.WriteMask;
4127 LLVMValueRef ptr, derived_ptr, data, index;
4128 int chan;
4129
4130 ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
4131
4132 for (chan = 0; chan < 4; ++chan) {
4133 if (!(writemask & (1 << chan))) {
4134 continue;
4135 }
4136 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
4137 index = LLVMConstInt(ctx->i32, chan, 0);
4138 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
4139 LLVMBuildStore(builder, data, derived_ptr);
4140 }
4141 }
4142
4143 static void store_emit(
4144 const struct lp_build_tgsi_action *action,
4145 struct lp_build_tgsi_context *bld_base,
4146 struct lp_build_emit_data *emit_data)
4147 {
4148 struct si_shader_context *ctx = si_shader_context(bld_base);
4149 struct gallivm_state *gallivm = &ctx->gallivm;
4150 LLVMBuilderRef builder = gallivm->builder;
4151 const struct tgsi_full_instruction * inst = emit_data->inst;
4152 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
4153 unsigned target = inst->Memory.Texture;
4154 char intrinsic_name[64];
4155 bool writeonly_memory = false;
4156
4157 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
4158 store_emit_memory(ctx, emit_data);
4159 return;
4160 }
4161
4162 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
4163 emit_waitcnt(ctx, VM_CNT);
4164
4165 writeonly_memory = is_oneway_access_only(inst, info,
4166 info->shader_buffers_load |
4167 info->shader_buffers_atomic,
4168 info->images_load |
4169 info->images_atomic);
4170
4171 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4172 store_emit_buffer(ctx, emit_data, writeonly_memory);
4173 return;
4174 }
4175
4176 if (target == TGSI_TEXTURE_BUFFER) {
4177 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4178 builder, "llvm.amdgcn.buffer.store.format.v4f32",
4179 emit_data->dst_type, emit_data->args,
4180 emit_data->arg_count,
4181 get_store_intr_attribs(writeonly_memory));
4182 } else {
4183 ac_get_image_intr_name("llvm.amdgcn.image.store",
4184 LLVMTypeOf(emit_data->args[0]), /* vdata */
4185 LLVMTypeOf(emit_data->args[1]), /* coords */
4186 LLVMTypeOf(emit_data->args[2]), /* rsrc */
4187 intrinsic_name, sizeof(intrinsic_name));
4188
4189 emit_data->output[emit_data->chan] =
4190 lp_build_intrinsic(
4191 builder, intrinsic_name, emit_data->dst_type,
4192 emit_data->args, emit_data->arg_count,
4193 get_store_intr_attribs(writeonly_memory));
4194 }
4195 }
4196
4197 static void atomic_fetch_args(
4198 struct lp_build_tgsi_context * bld_base,
4199 struct lp_build_emit_data * emit_data)
4200 {
4201 struct si_shader_context *ctx = si_shader_context(bld_base);
4202 struct gallivm_state *gallivm = &ctx->gallivm;
4203 LLVMBuilderRef builder = gallivm->builder;
4204 const struct tgsi_full_instruction * inst = emit_data->inst;
4205 LLVMValueRef data1, data2;
4206 LLVMValueRef rsrc;
4207 LLVMValueRef tmp;
4208
4209 emit_data->dst_type = ctx->f32;
4210
4211 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4212 data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4213
4214 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4215 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4216 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4217 }
4218
4219 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4220 * of arguments, which is reversed relative to TGSI (and GLSL).
4221 */
4222 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4223 emit_data->args[emit_data->arg_count++] = data2;
4224 emit_data->args[emit_data->arg_count++] = data1;
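/* I.e. TGSI's ATOMCAS operand order (cmp in Src[2], new value in
 * Src[3]) is emitted here as (new value, cmp), which is the order the
 * hardware cmpswap expects.
 */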
4225
4226 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4227 LLVMValueRef offset;
4228
4229 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4230
4231 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4232 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4233
4234 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4235 offset, true, false);
4236 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4237 unsigned target = inst->Memory.Texture;
4238 LLVMValueRef coords;
4239
4240 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4241 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4242
4243 if (target == TGSI_TEXTURE_BUFFER) {
4244 buffer_append_args(ctx, emit_data, rsrc, coords,
4245 ctx->i32_0, true, false);
4246 } else {
4247 emit_data->args[emit_data->arg_count++] = coords;
4248 emit_data->args[emit_data->arg_count++] = rsrc;
4249
4250 image_append_args(ctx, emit_data, target, true, false);
4251 }
4252 }
4253 }
4254
4255 static void atomic_emit_memory(struct si_shader_context *ctx,
4256 struct lp_build_emit_data *emit_data) {
4257 struct gallivm_state *gallivm = &ctx->gallivm;
4258 LLVMBuilderRef builder = gallivm->builder;
4259 const struct tgsi_full_instruction * inst = emit_data->inst;
4260 LLVMValueRef ptr, result, arg;
4261
4262 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4263
4264 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4265 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4266
4267 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4268 LLVMValueRef new_data;
4269 new_data = lp_build_emit_fetch(&ctx->bld_base,
4270 inst, 3, 0);
4271
4272 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4273
4274 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4275 LLVMAtomicOrderingSequentiallyConsistent,
4276 LLVMAtomicOrderingSequentiallyConsistent,
4277 false);
4278
4279 result = LLVMBuildExtractValue(builder, result, 0, "");
4280 } else {
4281 LLVMAtomicRMWBinOp op;
4282
4283 switch(inst->Instruction.Opcode) {
4284 case TGSI_OPCODE_ATOMUADD:
4285 op = LLVMAtomicRMWBinOpAdd;
4286 break;
4287 case TGSI_OPCODE_ATOMXCHG:
4288 op = LLVMAtomicRMWBinOpXchg;
4289 break;
4290 case TGSI_OPCODE_ATOMAND:
4291 op = LLVMAtomicRMWBinOpAnd;
4292 break;
4293 case TGSI_OPCODE_ATOMOR:
4294 op = LLVMAtomicRMWBinOpOr;
4295 break;
4296 case TGSI_OPCODE_ATOMXOR:
4297 op = LLVMAtomicRMWBinOpXor;
4298 break;
4299 case TGSI_OPCODE_ATOMUMIN:
4300 op = LLVMAtomicRMWBinOpUMin;
4301 break;
4302 case TGSI_OPCODE_ATOMUMAX:
4303 op = LLVMAtomicRMWBinOpUMax;
4304 break;
4305 case TGSI_OPCODE_ATOMIMIN:
4306 op = LLVMAtomicRMWBinOpMin;
4307 break;
4308 case TGSI_OPCODE_ATOMIMAX:
4309 op = LLVMAtomicRMWBinOpMax;
4310 break;
4311 default:
4312 unreachable("unknown atomic opcode");
4313 }
4314
4315 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4316 LLVMAtomicOrderingSequentiallyConsistent,
4317 false);
4318 }
4319 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4320 }
4321
4322 static void atomic_emit(
4323 const struct lp_build_tgsi_action *action,
4324 struct lp_build_tgsi_context *bld_base,
4325 struct lp_build_emit_data *emit_data)
4326 {
4327 struct si_shader_context *ctx = si_shader_context(bld_base);
4328 struct gallivm_state *gallivm = &ctx->gallivm;
4329 LLVMBuilderRef builder = gallivm->builder;
4330 const struct tgsi_full_instruction * inst = emit_data->inst;
4331 char intrinsic_name[40];
4332 LLVMValueRef tmp;
4333
4334 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4335 atomic_emit_memory(ctx, emit_data);
4336 return;
4337 }
4338
4339 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4340 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4341 snprintf(intrinsic_name, sizeof(intrinsic_name),
4342 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4343 } else {
4344 LLVMValueRef coords;
4345 char coords_type[8];
4346
4347 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4348 coords = emit_data->args[2];
4349 else
4350 coords = emit_data->args[1];
4351
4352 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4353 snprintf(intrinsic_name, sizeof(intrinsic_name),
4354 "llvm.amdgcn.image.atomic.%s.%s",
4355 action->intr_name, coords_type);
4356 }
4357
4358 tmp = lp_build_intrinsic(
4359 builder, intrinsic_name, ctx->i32,
4360 emit_data->args, emit_data->arg_count, 0);
4361 emit_data->output[emit_data->chan] =
4362 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4363 }
4364
4365 static void set_tex_fetch_args(struct si_shader_context *ctx,
4366 struct lp_build_emit_data *emit_data,
4367 unsigned target,
4368 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4369 LLVMValueRef *param, unsigned count,
4370 unsigned dmask)
4371 {
4372 struct gallivm_state *gallivm = &ctx->gallivm;
4373 struct ac_image_args args = {};
4374
4375 /* Pad to power of two vector */
4376 while (count < util_next_power_of_two(count))
4377 param[count++] = LLVMGetUndef(ctx->i32);
4378
4379 if (count > 1)
4380 args.addr = lp_build_gather_values(gallivm, param, count);
4381 else
4382 args.addr = param[0];
4383
4384 args.resource = res_ptr;
4385 args.sampler = samp_ptr;
4386 args.dmask = dmask;
4387 args.unorm = target == TGSI_TEXTURE_RECT ||
4388 target == TGSI_TEXTURE_SHADOWRECT;
4389 args.da = tgsi_is_array_sampler(target);
4390
4391 /* Ugly, but we seem to have no other choice right now. */
4392 STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4393 memcpy(emit_data->args, &args, sizeof(args));
4394 }
4395
4396 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4397 unsigned target, LLVMValueRef out)
4398 {
4399 LLVMBuilderRef builder = ctx->gallivm.builder;
4400
4401 /* 1D textures are allocated and used as 2D on GFX9. */
4402 if (ctx->screen->b.chip_class >= GFX9 &&
4403 (target == TGSI_TEXTURE_1D_ARRAY ||
4404 target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4405 LLVMValueRef layers =
4406 LLVMBuildExtractElement(builder, out,
4407 LLVMConstInt(ctx->i32, 2, 0), "");
4408 out = LLVMBuildInsertElement(builder, out, layers,
4409 ctx->i32_1, "");
4410 }
4411
4412 /* Divide the number of layers by 6 to get the number of cubes. */
4413 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4414 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4415 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4416
4417 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4418 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4419
4420 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4421 }
4422 return out;
4423 }
4424
4425 static void resq_fetch_args(
4426 struct lp_build_tgsi_context * bld_base,
4427 struct lp_build_emit_data * emit_data)
4428 {
4429 struct si_shader_context *ctx = si_shader_context(bld_base);
4430 const struct tgsi_full_instruction *inst = emit_data->inst;
4431 const struct tgsi_full_src_register *reg = &inst->Src[0];
4432
4433 emit_data->dst_type = ctx->v4i32;
4434
4435 if (reg->Register.File == TGSI_FILE_BUFFER) {
4436 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4437 emit_data->arg_count = 1;
4438 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4439 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4440 &emit_data->args[0]);
4441 emit_data->arg_count = 1;
4442 } else {
4443 LLVMValueRef res_ptr;
4444 unsigned image_target;
4445
4446 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4447 image_target = TGSI_TEXTURE_2D_ARRAY;
4448 else
4449 image_target = inst->Memory.Texture;
4450
4451 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4452 &res_ptr);
4453 set_tex_fetch_args(ctx, emit_data, image_target,
4454 res_ptr, NULL, &ctx->i32_0, 1,
4455 0xf);
4456 }
4457 }
4458
4459 static void resq_emit(
4460 const struct lp_build_tgsi_action *action,
4461 struct lp_build_tgsi_context *bld_base,
4462 struct lp_build_emit_data *emit_data)
4463 {
4464 struct si_shader_context *ctx = si_shader_context(bld_base);
4465 struct gallivm_state *gallivm = &ctx->gallivm;
4466 LLVMBuilderRef builder = gallivm->builder;
4467 const struct tgsi_full_instruction *inst = emit_data->inst;
4468 LLVMValueRef out;
4469
4470 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4471 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4472 LLVMConstInt(ctx->i32, 2, 0), "");
4473 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4474 out = get_buffer_size(bld_base, emit_data->args[0]);
4475 } else {
4476 struct ac_image_args args;
4477
4478 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4479 args.opcode = ac_image_get_resinfo;
4480 out = ac_build_image_opcode(&ctx->ac, &args);
4481
4482 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4483 }
4484
4485 emit_data->output[emit_data->chan] = out;
4486 }
4487
4488 static const struct lp_build_tgsi_action tex_action;
4489
4490 enum desc_type {
4491 DESC_IMAGE,
4492 DESC_BUFFER,
4493 DESC_FMASK,
4494 DESC_SAMPLER,
4495 };
4496
4497 /**
4498 * Load an image view, fmask view, or sampler state descriptor.
4499 */
4500 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4501 LLVMValueRef list, LLVMValueRef index,
4502 enum desc_type type)
4503 {
4504 struct gallivm_state *gallivm = &ctx->gallivm;
4505 LLVMBuilderRef builder = gallivm->builder;
4506
4507 switch (type) {
4508 case DESC_IMAGE:
4509 /* The image is at [0:7]. */
4510 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4511 break;
4512 case DESC_BUFFER:
4513 /* The buffer is in [4:7]. */
4514 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4515 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4516 list = LLVMBuildPointerCast(builder, list,
4517 const_array(ctx->v4i32, 0), "");
4518 break;
4519 case DESC_FMASK:
4520 /* The FMASK is at [8:15]. */
4521 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4522 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4523 break;
4524 case DESC_SAMPLER:
4525 /* The sampler state is at [12:15]. */
4526 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4527 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4528 list = LLVMBuildPointerCast(builder, list,
4529 const_array(ctx->v4i32, 0), "");
4530 break;
4531 }
4532
4533 return ac_build_indexed_load_const(&ctx->ac, list, index);
4534 }
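/* Layout sketch of one 16-dword sampler-view slot implied by the
 * offsets above:
 *   dwords [0:7]   image/texture descriptor (v8i32)
 *   dwords [4:7]   buffer view (v4i32, aliases the image slot)
 *   dwords [8:15]  FMASK descriptor (v8i32)
 *   dwords [12:15] sampler state (v4i32, aliases the FMASK slot)
 */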
4535
4536 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4537 *
4538 * SI-CI:
4539 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4540 * filtering manually. The driver sets img7 to a mask clearing
4541 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4542 * s_and_b32 samp0, samp0, img7
4543 *
4544 * VI:
4545 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4546 */
4547 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4548 LLVMValueRef res, LLVMValueRef samp)
4549 {
4550 LLVMBuilderRef builder = ctx->gallivm.builder;
4551 LLVMValueRef img7, samp0;
4552
4553 if (ctx->screen->b.chip_class >= VI)
4554 return samp;
4555
4556 img7 = LLVMBuildExtractElement(builder, res,
4557 LLVMConstInt(ctx->i32, 7, 0), "");
4558 samp0 = LLVMBuildExtractElement(builder, samp,
4559 ctx->i32_0, "");
4560 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4561 return LLVMBuildInsertElement(builder, samp, samp0,
4562 ctx->i32_0, "");
4563 }
4564
4565 static void tex_fetch_ptrs(
4566 struct lp_build_tgsi_context *bld_base,
4567 struct lp_build_emit_data *emit_data,
4568 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4569 {
4570 struct si_shader_context *ctx = si_shader_context(bld_base);
4571 LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4572 const struct tgsi_full_instruction *inst = emit_data->inst;
4573 const struct tgsi_full_src_register *reg;
4574 unsigned target = inst->Texture.Texture;
4575 unsigned sampler_src;
4576 LLVMValueRef index;
4577
4578 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4579 reg = &emit_data->inst->Src[sampler_src];
4580
4581 if (reg->Register.Indirect) {
4582 index = get_bounded_indirect_index(ctx,
4583 &reg->Indirect,
4584 reg->Register.Index,
4585 SI_NUM_SAMPLERS);
4586 } else {
4587 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4588 }
4589
4590 if (target == TGSI_TEXTURE_BUFFER)
4591 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4592 else
4593 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4594
4595 if (samp_ptr)
4596 *samp_ptr = NULL;
4597 if (fmask_ptr)
4598 *fmask_ptr = NULL;
4599
4600 if (target == TGSI_TEXTURE_2D_MSAA ||
4601 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4602 if (fmask_ptr)
4603 *fmask_ptr = load_sampler_desc(ctx, list, index,
4604 DESC_FMASK);
4605 } else if (target != TGSI_TEXTURE_BUFFER) {
4606 if (samp_ptr) {
4607 *samp_ptr = load_sampler_desc(ctx, list, index,
4608 DESC_SAMPLER);
4609 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4610 }
4611 }
4612 }
4613
4614 static void txq_fetch_args(
4615 struct lp_build_tgsi_context *bld_base,
4616 struct lp_build_emit_data *emit_data)
4617 {
4618 struct si_shader_context *ctx = si_shader_context(bld_base);
4619 const struct tgsi_full_instruction *inst = emit_data->inst;
4620 unsigned target = inst->Texture.Texture;
4621 LLVMValueRef res_ptr;
4622 LLVMValueRef address;
4623
4624 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4625
4626 if (target == TGSI_TEXTURE_BUFFER) {
4627 /* Read the size from the buffer descriptor directly. */
4628 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4629 return;
4630 }
4631
4632 /* Textures - set the mip level. */
4633 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4634
4635 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4636 NULL, &address, 1, 0xf);
4637 }
4638
4639 static void txq_emit(const struct lp_build_tgsi_action *action,
4640 struct lp_build_tgsi_context *bld_base,
4641 struct lp_build_emit_data *emit_data)
4642 {
4643 struct si_shader_context *ctx = si_shader_context(bld_base);
4644 struct ac_image_args args;
4645 unsigned target = emit_data->inst->Texture.Texture;
4646
4647 if (target == TGSI_TEXTURE_BUFFER) {
4648 /* Just return the buffer size. */
4649 emit_data->output[emit_data->chan] = emit_data->args[0];
4650 return;
4651 }
4652
4653 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4654
4655 args.opcode = ac_image_get_resinfo;
4656 LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4657
4658 emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4659 }
4660
4661 static void tex_fetch_args(
4662 struct lp_build_tgsi_context *bld_base,
4663 struct lp_build_emit_data *emit_data)
4664 {
4665 struct si_shader_context *ctx = si_shader_context(bld_base);
4666 struct gallivm_state *gallivm = &ctx->gallivm;
4667 const struct tgsi_full_instruction *inst = emit_data->inst;
4668 unsigned opcode = inst->Instruction.Opcode;
4669 unsigned target = inst->Texture.Texture;
4670 LLVMValueRef coords[5], derivs[6];
4671 LLVMValueRef address[16];
4672 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4673 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4674 unsigned count = 0;
4675 unsigned chan;
4676 unsigned num_deriv_channels = 0;
4677 bool has_offset = inst->Texture.NumOffsets > 0;
4678 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4679 unsigned dmask = 0xf;
4680
4681 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4682
4683 if (target == TGSI_TEXTURE_BUFFER) {
4684 emit_data->dst_type = ctx->v4f32;
4685 emit_data->args[0] = res_ptr;
4686 emit_data->args[1] = ctx->i32_0;
4687 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4688 emit_data->arg_count = 3;
4689 return;
4690 }
4691
4692 /* Fetch and project texture coordinates */
4693 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4694 for (chan = 0; chan < 3; chan++ ) {
4695 coords[chan] = lp_build_emit_fetch(bld_base,
4696 emit_data->inst, 0,
4697 chan);
4698 if (opcode == TGSI_OPCODE_TXP)
4699 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4700 TGSI_OPCODE_DIV,
4701 coords[chan],
4702 coords[3]);
4703 }
4704
4705 if (opcode == TGSI_OPCODE_TXP)
4706 coords[3] = bld_base->base.one;
4707
4708 /* Pack offsets. */
4709 if (has_offset &&
4710 opcode != TGSI_OPCODE_TXF &&
4711 opcode != TGSI_OPCODE_TXF_LZ) {
4712 /* The offsets are six-bit signed integers packed like this:
4713 * X=[5:0], Y=[13:8], and Z=[21:16].
4714 */
4715 LLVMValueRef offset[3], pack;
4716
4717 assert(inst->Texture.NumOffsets == 1);
4718
4719 for (chan = 0; chan < 3; chan++) {
4720 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4721 emit_data->inst, 0, chan);
4722 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4723 LLVMConstInt(ctx->i32, 0x3f, 0), "");
4724 if (chan)
4725 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4726 LLVMConstInt(ctx->i32, chan*8, 0), "");
4727 }
4728
4729 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4730 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4731 address[count++] = pack;
4732 }
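/* E.g. (hypothetical) offsets (1, -2, 3) pack as
 * 0x01 | (0x3e << 8) | (0x03 << 16) = 0x33e01, where -2 wraps to its
 * 6-bit two's complement 0x3e.
 */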
4733
4734 /* Pack LOD bias value */
4735 if (opcode == TGSI_OPCODE_TXB)
4736 address[count++] = coords[3];
4737 if (opcode == TGSI_OPCODE_TXB2)
4738 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4739
4740 /* Pack depth comparison value */
4741 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4742 LLVMValueRef z;
4743
4744 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4745 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4746 } else {
4747 assert(ref_pos >= 0);
4748 z = coords[ref_pos];
4749 }
4750
4751 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4752 * so the depth comparison value isn't clamped for Z16 and
4753 * Z24 anymore. Do it manually here.
4754 *
4755 * It's unnecessary if the original texture format was
4756 * Z32_FLOAT, but we don't know that here.
4757 */
4758 if (ctx->screen->b.chip_class == VI)
4759 z = ac_build_clamp(&ctx->ac, z);
4760
4761 address[count++] = z;
4762 }
4763
4764 /* Pack user derivatives */
4765 if (opcode == TGSI_OPCODE_TXD) {
4766 int param, num_src_deriv_channels, num_dst_deriv_channels;
4767
4768 switch (target) {
4769 case TGSI_TEXTURE_3D:
4770 num_src_deriv_channels = 3;
4771 num_dst_deriv_channels = 3;
4772 num_deriv_channels = 3;
4773 break;
4774 case TGSI_TEXTURE_2D:
4775 case TGSI_TEXTURE_SHADOW2D:
4776 case TGSI_TEXTURE_RECT:
4777 case TGSI_TEXTURE_SHADOWRECT:
4778 case TGSI_TEXTURE_2D_ARRAY:
4779 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4780 num_src_deriv_channels = 2;
4781 num_dst_deriv_channels = 2;
4782 num_deriv_channels = 2;
4783 break;
4784 case TGSI_TEXTURE_CUBE:
4785 case TGSI_TEXTURE_SHADOWCUBE:
4786 case TGSI_TEXTURE_CUBE_ARRAY:
4787 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4788 /* Cube derivatives will be converted to 2D. */
4789 num_src_deriv_channels = 3;
4790 num_dst_deriv_channels = 3;
4791 num_deriv_channels = 2;
4792 break;
4793 case TGSI_TEXTURE_1D:
4794 case TGSI_TEXTURE_SHADOW1D:
4795 case TGSI_TEXTURE_1D_ARRAY:
4796 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4797 num_src_deriv_channels = 1;
4798
4799 /* 1D textures are allocated and used as 2D on GFX9. */
4800 if (ctx->screen->b.chip_class >= GFX9) {
4801 num_dst_deriv_channels = 2;
4802 num_deriv_channels = 2;
4803 } else {
4804 num_dst_deriv_channels = 1;
4805 num_deriv_channels = 1;
4806 }
4807 break;
4808 default:
4809 unreachable("invalid target");
4810 }
4811
4812 for (param = 0; param < 2; param++) {
4813 for (chan = 0; chan < num_src_deriv_channels; chan++)
4814 derivs[param * num_dst_deriv_channels + chan] =
4815 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4816
4817 /* Fill in the rest with zeros. */
4818 for (chan = num_src_deriv_channels;
4819 chan < num_dst_deriv_channels; chan++)
4820 derivs[param * num_dst_deriv_channels + chan] =
4821 bld_base->base.zero;
4822 }
4823 }
4824
4825 if (target == TGSI_TEXTURE_CUBE ||
4826 target == TGSI_TEXTURE_CUBE_ARRAY ||
4827 target == TGSI_TEXTURE_SHADOWCUBE ||
4828 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4829 ac_prepare_cube_coords(&ctx->ac,
4830 opcode == TGSI_OPCODE_TXD,
4831 target == TGSI_TEXTURE_CUBE_ARRAY ||
4832 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4833 coords, derivs);
4834
4835 if (opcode == TGSI_OPCODE_TXD)
4836 for (int i = 0; i < num_deriv_channels * 2; i++)
4837 address[count++] = derivs[i];
4838
4839 /* Pack texture coordinates */
4840 address[count++] = coords[0];
4841 if (num_coords > 1)
4842 address[count++] = coords[1];
4843 if (num_coords > 2)
4844 address[count++] = coords[2];
4845
4846 /* 1D textures are allocated and used as 2D on GFX9. */
4847 if (ctx->screen->b.chip_class >= GFX9) {
4848 LLVMValueRef filler;
4849
4850 /* Use 0.5, so that we don't sample the border color. */
4851 if (opcode == TGSI_OPCODE_TXF)
4852 filler = ctx->i32_0;
4853 else
4854 filler = LLVMConstReal(ctx->f32, 0.5);
4855
4856 if (target == TGSI_TEXTURE_1D ||
4857 target == TGSI_TEXTURE_SHADOW1D) {
4858 address[count++] = filler;
4859 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4860 target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4861 address[count] = address[count - 1];
4862 address[count - 1] = filler;
4863 count++;
4864 }
4865 }
4866
4867 /* Pack LOD or sample index */
4868 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4869 address[count++] = coords[3];
4870 else if (opcode == TGSI_OPCODE_TXL2)
4871 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4872
4873 if (count > 16) {
4874 assert(!"Cannot handle more than 16 texture address parameters");
4875 count = 16;
4876 }
4877
4878 for (chan = 0; chan < count; chan++ ) {
4879 address[chan] = LLVMBuildBitCast(gallivm->builder,
4880 address[chan], ctx->i32, "");
4881 }
4882
4883 /* Adjust the sample index according to FMASK.
4884 *
4885 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4886 * which is the identity mapping. Each nibble says which physical sample
4887 * should be fetched to get that sample.
4888 *
4889 * For example, 0x11111100 means there are only 2 samples stored and
4890 * the second sample covers 3/4 of the pixel. When reading samples 0
4891 * and 1, return physical sample 0 (determined by the first two 0s
4892 * in FMASK), otherwise return physical sample 1.
4893 *
4894 * The sample index should be adjusted as follows:
4895 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4896 */
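/* With the example FMASK 0x11111100 above: reading sample 1 gives
 * (0x11111100 >> 4) & 0xF = 0 (physical sample 0), while sample 2
 * gives (0x11111100 >> 8) & 0xF = 1.
 */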
4897 if (target == TGSI_TEXTURE_2D_MSAA ||
4898 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4899 struct lp_build_emit_data txf_emit_data = *emit_data;
4900 LLVMValueRef txf_address[4];
4901 /* We only need .xy for non-arrays, and .xyz for arrays. */
4902 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4903 struct tgsi_full_instruction inst = {};
4904
4905 memcpy(txf_address, address, sizeof(txf_address));
4906
4907 /* Read FMASK using TXF_LZ. */
4908 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4909 inst.Texture.Texture = target;
4910 txf_emit_data.inst = &inst;
4911 txf_emit_data.chan = 0;
4912 set_tex_fetch_args(ctx, &txf_emit_data,
4913 target, fmask_ptr, NULL,
4914 txf_address, txf_count, 0xf);
4915 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4916
4917 /* Initialize some constants. */
4918 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4919 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4920
4921 /* Apply the formula. */
4922 LLVMValueRef fmask =
4923 LLVMBuildExtractElement(gallivm->builder,
4924 txf_emit_data.output[0],
4925 ctx->i32_0, "");
4926
4927 unsigned sample_chan = txf_count; /* the sample index is last */
4928
4929 LLVMValueRef sample_index4 =
4930 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4931
4932 LLVMValueRef shifted_fmask =
4933 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4934
4935 LLVMValueRef final_sample =
4936 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4937
4938 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4939 * resource descriptor is 0 (invalid).
4940 */
4941 LLVMValueRef fmask_desc =
4942 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4943 ctx->v8i32, "");
4944
4945 LLVMValueRef fmask_word1 =
4946 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4947 ctx->i32_1, "");
4948
4949 LLVMValueRef word1_is_nonzero =
4950 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4951 fmask_word1, ctx->i32_0, "");
4952
4953 /* Replace the MSAA sample index. */
4954 address[sample_chan] =
4955 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4956 final_sample, address[sample_chan], "");
4957 }
4958
4959 if (opcode == TGSI_OPCODE_TXF ||
4960 opcode == TGSI_OPCODE_TXF_LZ) {
4961 /* add tex offsets */
4962 if (inst->Texture.NumOffsets) {
4963 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4964 const struct tgsi_texture_offset *off = inst->TexOffsets;
4965
4966 assert(inst->Texture.NumOffsets == 1);
4967
4968 switch (target) {
4969 case TGSI_TEXTURE_3D:
4970 address[2] = lp_build_add(uint_bld, address[2],
4971 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4972 /* fall through */
4973 case TGSI_TEXTURE_2D:
4974 case TGSI_TEXTURE_SHADOW2D:
4975 case TGSI_TEXTURE_RECT:
4976 case TGSI_TEXTURE_SHADOWRECT:
4977 case TGSI_TEXTURE_2D_ARRAY:
4978 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4979 address[1] =
4980 lp_build_add(uint_bld, address[1],
4981 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4982 /* fall through */
4983 case TGSI_TEXTURE_1D:
4984 case TGSI_TEXTURE_SHADOW1D:
4985 case TGSI_TEXTURE_1D_ARRAY:
4986 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4987 address[0] =
4988 lp_build_add(uint_bld, address[0],
4989 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4990 break;
4991 /* texture offsets do not apply to other texture targets */
4992 }
4993 }
4994 }
4995
4996 if (opcode == TGSI_OPCODE_TG4) {
4997 unsigned gather_comp = 0;
4998
4999 /* DMASK was repurposed for GATHER4. 4 components are always
5000 * returned and DMASK works like a swizzle - it selects
5001 * the component to fetch. The only valid DMASK values are
5002 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
5003 * (red,red,red,red) etc.) The ISA document doesn't mention
5004 * this.
5005 */
5006
5007 /* Get the component index from src1.x for Gather4. */
5008 if (!tgsi_is_shadow_target(target)) {
5009 LLVMValueRef comp_imm;
5010 struct tgsi_src_register src1 = inst->Src[1].Register;
5011
5012 assert(src1.File == TGSI_FILE_IMMEDIATE);
5013
5014 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
5015 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
5016 gather_comp = CLAMP(gather_comp, 0, 3);
5017 }
5018
5019 dmask = 1 << gather_comp;
5020 }
5021
5022 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
5023 samp_ptr, address, count, dmask);
5024 }
5025
5026 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
5027 * incorrectly forces nearest filtering if the texture format is integer.
5028 * The only effect it has on Gather4, which always returns 4 texels for
5029 * bilinear filtering, is that the final coordinates are off by 0.5 of
5030 * the texel size.
5031 *
5032 * The workaround is to subtract 0.5 from the unnormalized coordinates,
5033 * or (0.5 / size) from the normalized coordinates.
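 * For example, a 256-texel wide dimension gets a normalized
 * correction of -0.5 / 256, roughly -0.002.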
5034 */
5035 static void si_lower_gather4_integer(struct si_shader_context *ctx,
5036 struct ac_image_args *args,
5037 unsigned target)
5038 {
5039 LLVMBuilderRef builder = ctx->gallivm.builder;
5040 LLVMValueRef coord = args->addr;
5041 LLVMValueRef half_texel[2];
5042 /* Texture coordinates start after:
5043 * {offset, bias, z-compare, derivatives}
5044 * Only the offset and z-compare can occur here.
5045 */
5046 unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
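/* e.g. with both an offset and a z-compare value present, the first
 * texture coordinate starts at element 2 of the address vector.
 */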
5047 int c;
5048
5049 if (target == TGSI_TEXTURE_RECT ||
5050 target == TGSI_TEXTURE_SHADOWRECT) {
5051 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
5052 } else {
5053 struct tgsi_full_instruction txq_inst = {};
5054 struct lp_build_emit_data txq_emit_data = {};
5055
5056 /* Query the texture size. */
5057 txq_inst.Texture.Texture = target;
5058 txq_emit_data.inst = &txq_inst;
5059 txq_emit_data.dst_type = ctx->v4i32;
5060 set_tex_fetch_args(ctx, &txq_emit_data, target,
5061 args->resource, NULL, &ctx->i32_0,
5062 1, 0xf);
5063 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
5064
5065 /* Compute -0.5 / size. */
5066 for (c = 0; c < 2; c++) {
5067 half_texel[c] =
5068 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
5069 LLVMConstInt(ctx->i32, c, 0), "");
5070 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
5071 half_texel[c] =
5072 lp_build_emit_llvm_unary(&ctx->bld_base,
5073 TGSI_OPCODE_RCP, half_texel[c]);
5074 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
5075 LLVMConstReal(ctx->f32, -0.5), "");
5076 }
5077 }
5078
5079 for (c = 0; c < 2; c++) {
5080 LLVMValueRef tmp;
5081 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
5082
5083 tmp = LLVMBuildExtractElement(builder, coord, index, "");
5084 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
5085 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
5086 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
5087 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
5088 }
5089
5090 args->addr = coord;
5091 }
5092
5093 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
5094 struct lp_build_tgsi_context *bld_base,
5095 struct lp_build_emit_data *emit_data)
5096 {
5097 struct si_shader_context *ctx = si_shader_context(bld_base);
5098 const struct tgsi_full_instruction *inst = emit_data->inst;
5099 struct ac_image_args args;
5100 unsigned opcode = inst->Instruction.Opcode;
5101 unsigned target = inst->Texture.Texture;
5102
5103 if (target == TGSI_TEXTURE_BUFFER) {
5104 emit_data->output[emit_data->chan] =
5105 ac_build_buffer_load_format(&ctx->ac,
5106 emit_data->args[0],
5107 emit_data->args[2],
5108 emit_data->args[1],
5109 true);
5110 return;
5111 }
5112
5113 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
5114
5115 args.opcode = ac_image_sample;
5116 args.compare = tgsi_is_shadow_target(target);
5117 args.offset = inst->Texture.NumOffsets > 0;
5118
5119 switch (opcode) {
5120 case TGSI_OPCODE_TXF:
5121 case TGSI_OPCODE_TXF_LZ:
5122 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
5123 target == TGSI_TEXTURE_2D_MSAA ||
5124 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
5125 ac_image_load : ac_image_load_mip;
5126 args.compare = false;
5127 args.offset = false;
5128 break;
5129 case TGSI_OPCODE_LODQ:
5130 args.opcode = ac_image_get_lod;
5131 args.compare = false;
5132 args.offset = false;
5133 break;
5134 case TGSI_OPCODE_TEX:
5135 case TGSI_OPCODE_TEX2:
5136 case TGSI_OPCODE_TXP:
5137 if (ctx->type != PIPE_SHADER_FRAGMENT)
5138 args.level_zero = true;
5139 break;
5140 case TGSI_OPCODE_TEX_LZ:
5141 args.level_zero = true;
5142 break;
5143 case TGSI_OPCODE_TXB:
5144 case TGSI_OPCODE_TXB2:
5145 assert(ctx->type == PIPE_SHADER_FRAGMENT);
5146 args.bias = true;
5147 break;
5148 case TGSI_OPCODE_TXL:
5149 case TGSI_OPCODE_TXL2:
5150 args.lod = true;
5151 break;
5152 case TGSI_OPCODE_TXD:
5153 args.deriv = true;
5154 break;
5155 case TGSI_OPCODE_TG4:
5156 args.opcode = ac_image_gather4;
5157 args.level_zero = true;
5158 break;
5159 default:
5160 assert(0);
5161 return;
5162 }
5163
5164 /* The hardware needs special lowering for Gather4 with integer formats. */
5165 if (ctx->screen->b.chip_class <= VI &&
5166 opcode == TGSI_OPCODE_TG4) {
5167 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5168 /* This will also work with non-constant indexing because of how
5169 * glsl_to_tgsi works, and we intend to preserve that behavior.
5170 */
5171 const unsigned src_idx = 2;
5172 unsigned sampler = inst->Src[src_idx].Register.Index;
5173
5174 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
5175
5176 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
5177 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
5178 si_lower_gather4_integer(ctx, &args, target);
5179 }
5180
5181 emit_data->output[emit_data->chan] =
5182 ac_build_image_opcode(&ctx->ac, &args);
5183 }
5184
5185 static void si_llvm_emit_txqs(
5186 const struct lp_build_tgsi_action *action,
5187 struct lp_build_tgsi_context *bld_base,
5188 struct lp_build_emit_data *emit_data)
5189 {
5190 struct si_shader_context *ctx = si_shader_context(bld_base);
5191 struct gallivm_state *gallivm = &ctx->gallivm;
5192 LLVMBuilderRef builder = gallivm->builder;
5193 LLVMValueRef res, samples;
5194 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5195
5196 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5197
5199 /* Read the samples from the descriptor directly. */
5200 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5201 samples = LLVMBuildExtractElement(
5202 builder, res,
5203 LLVMConstInt(ctx->i32, 3, 0), "");
5204 samples = LLVMBuildLShr(builder, samples,
5205 LLVMConstInt(ctx->i32, 16, 0), "");
5206 samples = LLVMBuildAnd(builder, samples,
5207 LLVMConstInt(ctx->i32, 0xf, 0), "");
5208 samples = LLVMBuildShl(builder, ctx->i32_1,
5209 samples, "");
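/* For MSAA resources, bits 19:16 of descriptor dword 3 hold
 * log2(sample count), so e.g. a field value of 3 yields
 * 1 << 3 = 8 samples.
 */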
5210
5211 emit_data->output[emit_data->chan] = samples;
5212 }
5213
5214 static void si_llvm_emit_ddxy(
5215 const struct lp_build_tgsi_action *action,
5216 struct lp_build_tgsi_context *bld_base,
5217 struct lp_build_emit_data *emit_data)
5218 {
5219 struct si_shader_context *ctx = si_shader_context(bld_base);
5220 struct gallivm_state *gallivm = &ctx->gallivm;
5221 unsigned opcode = emit_data->info->opcode;
5222 LLVMValueRef val;
5223 int idx;
5224 unsigned mask;
5225
5226 if (opcode == TGSI_OPCODE_DDX_FINE)
5227 mask = AC_TID_MASK_LEFT;
5228 else if (opcode == TGSI_OPCODE_DDY_FINE)
5229 mask = AC_TID_MASK_TOP;
5230 else
5231 mask = AC_TID_MASK_TOP_LEFT;
5232
5233 /* For DDX we want the next X pixel, for DDY the next Y pixel. */
5234 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5235
5236 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5237 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5238 mask, idx, ctx->lds, val);
5239 emit_data->output[emit_data->chan] = val;
5240 }
5241
5242 /*
5243 * This takes an (I, J) coordinate pair and works out
5244 * the X and Y derivatives.
5245 * It returns DDX(I), DDX(J), DDY(I), DDY(J).
5246 */
5247 static LLVMValueRef si_llvm_emit_ddxy_interp(
5248 struct lp_build_tgsi_context *bld_base,
5249 LLVMValueRef interp_ij)
5250 {
5251 struct si_shader_context *ctx = si_shader_context(bld_base);
5252 struct gallivm_state *gallivm = &ctx->gallivm;
5253 LLVMValueRef result[4], a;
5254 unsigned i;
5255
5256 for (i = 0; i < 2; i++) {
5257 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5258 LLVMConstInt(ctx->i32, i, 0), "");
5259 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5260 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5261 }
5262
5263 return lp_build_gather_values(gallivm, result, 4);
5264 }
5265
5266 static void interp_fetch_args(
5267 struct lp_build_tgsi_context *bld_base,
5268 struct lp_build_emit_data *emit_data)
5269 {
5270 struct si_shader_context *ctx = si_shader_context(bld_base);
5271 struct gallivm_state *gallivm = &ctx->gallivm;
5272 const struct tgsi_full_instruction *inst = emit_data->inst;
5273
5274 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5275 /* offset is in second src, first two channels */
5276 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5277 emit_data->inst, 1,
5278 TGSI_CHAN_X);
5279 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5280 emit_data->inst, 1,
5281 TGSI_CHAN_Y);
5282 emit_data->arg_count = 2;
5283 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5284 LLVMValueRef sample_position;
5285 LLVMValueRef sample_id;
5286 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5287
5288 /* Fetch the sample ID, then fetch its sample position,
5289 * and place it in the first two channels.
5290 */
5291 sample_id = lp_build_emit_fetch(bld_base,
5292 emit_data->inst, 1, TGSI_CHAN_X);
5293 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5294 ctx->i32, "");
5295 sample_position = load_sample_position(ctx, sample_id);
5296
5297 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5298 sample_position,
5299 ctx->i32_0, "");
5300
5301 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5302 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5303 sample_position,
5304 ctx->i32_1, "");
5305 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5306 emit_data->arg_count = 2;
5307 }
5308 }
5309
5310 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5311 struct lp_build_tgsi_context *bld_base,
5312 struct lp_build_emit_data *emit_data)
5313 {
5314 struct si_shader_context *ctx = si_shader_context(bld_base);
5315 struct si_shader *shader = ctx->shader;
5316 struct gallivm_state *gallivm = &ctx->gallivm;
5317 LLVMValueRef interp_param;
5318 const struct tgsi_full_instruction *inst = emit_data->inst;
5319 int input_index = inst->Src[0].Register.Index;
5320 int chan;
5321 int i;
5322 LLVMValueRef attr_number;
5323 LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5324 int interp_param_idx;
5325 unsigned interp = shader->selector->info.input_interpolate[input_index];
5326 unsigned location;
5327
5328 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5329
5330 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5331 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5332 location = TGSI_INTERPOLATE_LOC_CENTER;
5333 else
5334 location = TGSI_INTERPOLATE_LOC_CENTROID;
5335
5336 interp_param_idx = lookup_interp_param_index(interp, location);
5337 if (interp_param_idx == -1)
5338 return;
5339 else if (interp_param_idx)
5340 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5341 else
5342 interp_param = NULL;
5343
5344 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5345
5346 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5347 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5348 LLVMValueRef ij_out[2];
5349 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5350
5351 /*
5352 * Take the I then J parameters, and the DDX/Y for them, and
5353 * calculate the IJ inputs for the interpolator:
5354 * temp1 = ddx * offset/sample.x + I;
5355 * interp_param.I = ddy * offset/sample.y + temp1;
5356 * temp1 = ddx * offset/sample.x + J;
5357 * interp_param.J = ddy * offset/sample.y + temp1;
5358 */
5359 for (i = 0; i < 2; i++) {
5360 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5361 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5362 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5363 ddxy_out, ix_ll, "");
5364 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5365 ddxy_out, iy_ll, "");
5366 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5367 interp_param, ix_ll, "");
5368 LLVMValueRef temp1, temp2;
5369
5370 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5371 ctx->f32, "");
5372
5373 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5374
5375 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5376
5377 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5378
5379 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5380 }
5381 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5382 }
5383
5384 for (chan = 0; chan < 4; chan++) {
5385 LLVMValueRef llvm_chan;
5386 unsigned schan;
5387
5388 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5389 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5390
5391 if (interp_param) {
5392 interp_param = LLVMBuildBitCast(gallivm->builder,
5393 interp_param, LLVMVectorType(ctx->f32, 2), "");
5394 LLVMValueRef i = LLVMBuildExtractElement(
5395 gallivm->builder, interp_param, ctx->i32_0, "");
5396 LLVMValueRef j = LLVMBuildExtractElement(
5397 gallivm->builder, interp_param, ctx->i32_1, "");
5398 emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5399 llvm_chan, attr_number, params,
5400 i, j);
5401 } else {
5402 emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5403 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5404 llvm_chan, attr_number, params);
5405 }
5406 }
5407 }
5408
5409 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5410 LLVMValueRef value)
5411 {
5412 struct gallivm_state *gallivm = &ctx->gallivm;
5413 LLVMValueRef args[3] = {
5414 value,
5415 ctx->i32_0,
5416 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5417 };
5418
5419 /* We currently have no other way to prevent LLVM from lifting the icmp
5420 * calls to a dominating basic block.
5421 */
5422 emit_optimization_barrier(ctx, &args[0]);
5423
5424 if (LLVMTypeOf(args[0]) != ctx->i32)
5425 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5426
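/* llvm.amdgcn.icmp.i32 with the NE predicate compares the value
 * against 0 in every active lane and returns a 64-bit mask with one
 * bit set per lane where the comparison holds, i.e. a ballot.
 */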
5427 return lp_build_intrinsic(gallivm->builder,
5428 "llvm.amdgcn.icmp.i32",
5429 ctx->i64, args, 3,
5430 LP_FUNC_ATTR_NOUNWIND |
5431 LP_FUNC_ATTR_READNONE |
5432 LP_FUNC_ATTR_CONVERGENT);
5433 }
5434
5435 static void vote_all_emit(
5436 const struct lp_build_tgsi_action *action,
5437 struct lp_build_tgsi_context *bld_base,
5438 struct lp_build_emit_data *emit_data)
5439 {
5440 struct si_shader_context *ctx = si_shader_context(bld_base);
5441 struct gallivm_state *gallivm = &ctx->gallivm;
5442 LLVMValueRef active_set, vote_set;
5443 LLVMValueRef tmp;
5444
5445 active_set = si_emit_ballot(ctx, ctx->i32_1);
5446 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5447
5448 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5449 emit_data->output[emit_data->chan] =
5450 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5451 }
5452
5453 static void vote_any_emit(
5454 const struct lp_build_tgsi_action *action,
5455 struct lp_build_tgsi_context *bld_base,
5456 struct lp_build_emit_data *emit_data)
5457 {
5458 struct si_shader_context *ctx = si_shader_context(bld_base);
5459 struct gallivm_state *gallivm = &ctx->gallivm;
5460 LLVMValueRef vote_set;
5461 LLVMValueRef tmp;
5462
5463 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5464
5465 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5466 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5467 emit_data->output[emit_data->chan] =
5468 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5469 }
5470
5471 static void vote_eq_emit(
5472 const struct lp_build_tgsi_action *action,
5473 struct lp_build_tgsi_context *bld_base,
5474 struct lp_build_emit_data *emit_data)
5475 {
5476 struct si_shader_context *ctx = si_shader_context(bld_base);
5477 struct gallivm_state *gallivm = &ctx->gallivm;
5478 LLVMValueRef active_set, vote_set;
5479 LLVMValueRef all, none, tmp;
5480
5481 active_set = si_emit_ballot(ctx, ctx->i32_1);
5482 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5483
5484 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5485 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5486 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5487 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5488 emit_data->output[emit_data->chan] =
5489 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5490 }
5491
5492 static void ballot_emit(
5493 const struct lp_build_tgsi_action *action,
5494 struct lp_build_tgsi_context *bld_base,
5495 struct lp_build_emit_data *emit_data)
5496 {
5497 struct si_shader_context *ctx = si_shader_context(bld_base);
5498 LLVMBuilderRef builder = ctx->gallivm.builder;
5499 LLVMValueRef tmp;
5500
5501 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5502 tmp = si_emit_ballot(ctx, tmp);
5503 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5504
5505 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5506 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5507 }
5508
5509 static void read_invoc_fetch_args(
5510 struct lp_build_tgsi_context *bld_base,
5511 struct lp_build_emit_data *emit_data)
5512 {
5513 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5514 0, emit_data->src_chan);
5515
5516 /* Always read the source invocation (= lane) from the X channel. */
5517 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5518 1, TGSI_CHAN_X);
5519 emit_data->arg_count = 2;
5520 }
5521
5522 static void read_lane_emit(
5523 const struct lp_build_tgsi_action *action,
5524 struct lp_build_tgsi_context *bld_base,
5525 struct lp_build_emit_data *emit_data)
5526 {
5527 struct si_shader_context *ctx = si_shader_context(bld_base);
5528 LLVMBuilderRef builder = ctx->gallivm.builder;
5529
5530 /* We currently have no other way to prevent LLVM from lifting the icmp
5531 * calls to a dominating basic block.
5532 */
5533 emit_optimization_barrier(ctx, &emit_data->args[0]);
5534
5535 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5536 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5537 ctx->i32, "");
5538 }
5539
5540 emit_data->output[emit_data->chan] =
5541 ac_build_intrinsic(&ctx->ac, action->intr_name,
5542 ctx->i32, emit_data->args, emit_data->arg_count,
5543 AC_FUNC_ATTR_READNONE |
5544 AC_FUNC_ATTR_CONVERGENT);
5545 }
5546
5547 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5548 struct lp_build_emit_data *emit_data)
5549 {
5550 struct si_shader_context *ctx = si_shader_context(bld_base);
5551 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5552 LLVMValueRef imm;
5553 unsigned stream;
5554
5555 assert(src0.File == TGSI_FILE_IMMEDIATE);
5556
5557 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5558 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5559 return stream;
5560 }
5561
5562 /* Emit one vertex from the geometry shader */
5563 static void si_llvm_emit_vertex(
5564 const struct lp_build_tgsi_action *action,
5565 struct lp_build_tgsi_context *bld_base,
5566 struct lp_build_emit_data *emit_data)
5567 {
5568 struct si_shader_context *ctx = si_shader_context(bld_base);
5569 struct lp_build_context *uint = &bld_base->uint_bld;
5570 struct si_shader *shader = ctx->shader;
5571 struct tgsi_shader_info *info = &shader->selector->info;
5572 struct gallivm_state *gallivm = &ctx->gallivm;
5573 struct lp_build_if_state if_state;
5574 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5575 ctx->param_gs2vs_offset);
5576 LLVMValueRef gs_next_vertex;
5577 LLVMValueRef can_emit, kill;
5578 unsigned chan, offset;
5579 int i;
5580 unsigned stream;
5581
5582 stream = si_llvm_get_stream(bld_base, emit_data);
5583
5584 /* Write vertex attribute values to GSVS ring */
5585 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5586 ctx->gs_next_vertex[stream],
5587 "");
5588
5589 /* If this thread has already emitted the declared maximum number of
5590 * vertices, skip the write: excessive vertex emissions are not
5591 * supposed to have any effect.
5592 *
5593 * If the shader has no writes to memory, kill it instead. This skips
5594 * further memory loads and may allow LLVM to skip to the end
5595 * altogether.
5596 */
5597 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5598 LLVMConstInt(ctx->i32,
5599 shader->selector->gs_max_out_vertices, 0), "");
5600
5601 bool use_kill = !info->writes_memory;
5602 if (use_kill) {
5603 kill = lp_build_select(&bld_base->base, can_emit,
5604 LLVMConstReal(ctx->f32, 1.0f),
5605 LLVMConstReal(ctx->f32, -1.0f));
5606
5607 ac_build_kill(&ctx->ac, kill);
5608 } else {
5609 lp_build_if(&if_state, gallivm, can_emit);
5610 }
5611
5612 offset = 0;
5613 for (i = 0; i < info->num_outputs; i++) {
5614 LLVMValueRef *out_ptr = ctx->outputs[i];
5615
5616 for (chan = 0; chan < 4; chan++) {
5617 if (!(info->output_usagemask[i] & (1 << chan)) ||
5618 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5619 continue;
5620
5621 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5622 LLVMValueRef voffset =
5623 LLVMConstInt(ctx->i32, offset *
5624 shader->selector->gs_max_out_vertices, 0);
5625 offset++;
5626
5627 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5628 voffset = lp_build_mul_imm(uint, voffset, 4);
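/* The ring is component-major: each output component occupies
 * gs_max_out_vertices dwords, so the dword index is
 * (component_slot * gs_max_out_vertices + vertex_index), scaled by 4
 * to get the byte offset.
 */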
5629
5630 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5631
5632 ac_build_buffer_store_dword(&ctx->ac,
5633 ctx->gsvs_ring[stream],
5634 out_val, 1,
5635 voffset, soffset, 0,
5636 1, 1, true, true);
5637 }
5638 }
5639
5640 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5641 ctx->i32_1);
5642
5643 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5644
5645 /* Signal vertex emission */
5646 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5647 si_get_gs_wave_id(ctx));
5648 if (!use_kill)
5649 lp_build_endif(&if_state);
5650 }
5651
5652 /* Cut one primitive from the geometry shader */
5653 static void si_llvm_emit_primitive(
5654 const struct lp_build_tgsi_action *action,
5655 struct lp_build_tgsi_context *bld_base,
5656 struct lp_build_emit_data *emit_data)
5657 {
5658 struct si_shader_context *ctx = si_shader_context(bld_base);
5659 unsigned stream;
5660
5661 /* Signal primitive cut */
5662 stream = si_llvm_get_stream(bld_base, emit_data);
5663 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5664 si_get_gs_wave_id(ctx));
5665 }
5666
5667 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5668 struct lp_build_tgsi_context *bld_base,
5669 struct lp_build_emit_data *emit_data)
5670 {
5671 struct si_shader_context *ctx = si_shader_context(bld_base);
5672 struct gallivm_state *gallivm = &ctx->gallivm;
5673
5674 /* SI only (thanks to a hw bug workaround):
5675 * The real barrier instruction isn't needed, because an entire patch
5676 * always fits into a single wave.
5677 */
5678 if (ctx->screen->b.chip_class == SI &&
5679 ctx->type == PIPE_SHADER_TESS_CTRL) {
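/* The *_CNT masks presumably have their own counter field zeroed,
 * so combining them with & requests a wait until both the LGKM and
 * VM counters reach zero.
 */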
5680 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5681 return;
5682 }
5683
5684 lp_build_intrinsic(gallivm->builder,
5685 "llvm.amdgcn.s.barrier",
5686 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5687 }
5688
5689 static const struct lp_build_tgsi_action tex_action = {
5690 .fetch_args = tex_fetch_args,
5691 .emit = build_tex_intrinsic,
5692 };
5693
5694 static const struct lp_build_tgsi_action interp_action = {
5695 .fetch_args = interp_fetch_args,
5696 .emit = build_interp_intrinsic,
5697 };
5698
5699 static void si_create_function(struct si_shader_context *ctx,
5700 const char *name,
5701 LLVMTypeRef *returns, unsigned num_returns,
5702 LLVMTypeRef *params, unsigned num_params,
5703 int last_sgpr, unsigned max_workgroup_size)
5704 {
5705 int i;
5706
5707 si_llvm_create_func(ctx, name, returns, num_returns,
5708 params, num_params);
5709 si_llvm_shader_type(ctx->main_fn, ctx->type);
5710 ctx->return_value = LLVMGetUndef(ctx->return_type);
5711
5712 for (i = 0; i <= last_sgpr; ++i) {
5713 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5714
5715 /* The combination of:
5716 * - ByVal
5717 * - dereferenceable
5718 * - invariant.load
5719 * allows the optimization passes to move loads and reduces
5720 * SGPR spilling significantly.
5721 */
5722 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5723 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5724 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5725 ac_add_attr_dereferenceable(P, UINT64_MAX);
5726 } else
5727 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5728 }
5729
5730 if (max_workgroup_size) {
5731 si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
5732 max_workgroup_size);
5733 }
5734 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5735 "no-signed-zeros-fp-math",
5736 "true");
5737
5738 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5739 /* These were copied from some LLVM test. */
5740 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5741 "less-precise-fpmad",
5742 "true");
5743 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5744 "no-infs-fp-math",
5745 "true");
5746 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5747 "no-nans-fp-math",
5748 "true");
5749 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5750 "unsafe-fp-math",
5751 "true");
5752 }
5753 }
5754
5755 static void declare_streamout_params(struct si_shader_context *ctx,
5756 struct pipe_stream_output_info *so,
5757 LLVMTypeRef *params, LLVMTypeRef i32,
5758 unsigned *num_params)
5759 {
5760 int i;
5761
5762 /* Streamout SGPRs. */
5763 if (so->num_outputs) {
5764 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5765 params[ctx->param_streamout_config = (*num_params)++] = i32;
5766 else
5767 ctx->param_streamout_config = *num_params - 1;
5768
5769 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5770 }
5771 /* A streamout buffer offset is loaded if the stride is non-zero. */
5772 for (i = 0; i < 4; i++) {
5773 if (!so->stride[i])
5774 continue;
5775
5776 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5777 }
5778 }
5779
5780 static unsigned llvm_get_type_size(LLVMTypeRef type)
5781 {
5782 LLVMTypeKind kind = LLVMGetTypeKind(type);
5783
5784 switch (kind) {
5785 case LLVMIntegerTypeKind:
5786 return LLVMGetIntTypeWidth(type) / 8;
5787 case LLVMFloatTypeKind:
5788 return 4;
5789 case LLVMPointerTypeKind:
5790 return 8;
5791 case LLVMVectorTypeKind:
5792 return LLVMGetVectorSize(type) *
5793 llvm_get_type_size(LLVMGetElementType(type));
5794 case LLVMArrayTypeKind:
5795 return LLVMGetArrayLength(type) *
5796 llvm_get_type_size(LLVMGetElementType(type));
5797 default:
5798 assert(0);
5799 return 0;
5800 }
5801 }
5802
5803 static void declare_lds_as_pointer(struct si_shader_context *ctx)
5804 {
5805 struct gallivm_state *gallivm = &ctx->gallivm;
5806
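/* The maximum LDS allocation per thread group is assumed to be
 * 32 KB on SI and 64 KB on CIK and newer.
 */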
5807 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5808 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5809 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5810 "lds");
5811 }
5812
5813 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5814 {
5815 switch (shader->selector->type) {
5816 case PIPE_SHADER_TESS_CTRL:
5817 /* Return this so that LLVM doesn't remove s_barrier
5818 * instructions on chips where we use s_barrier. */
5819 return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
5820
5821 case PIPE_SHADER_GEOMETRY:
5822 return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64;
5823
5824 case PIPE_SHADER_COMPUTE:
5825 break; /* see below */
5826
5827 default:
5828 return 0;
5829 }
5830
5831 const unsigned *properties = shader->selector->info.properties;
5832 unsigned max_work_group_size =
5833 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5834 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5835 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5836
5837 if (!max_work_group_size) {
5838 /* This is a variable group size compute shader;
5839 * compile it for the maximum possible group size.
5840 */
5841 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5842 }
5843 return max_work_group_size;
5844 }
5845
5846 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5847 LLVMTypeRef *params,
5848 unsigned *num_params,
5849 bool assign_params)
5850 {
5851 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_CONST_BUFFERS);
5852 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5853 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5854 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5855
5856 if (assign_params) {
5857 ctx->param_const_buffers = *num_params - 4;
5858 ctx->param_samplers = *num_params - 3;
5859 ctx->param_images = *num_params - 2;
5860 ctx->param_shader_buffers = *num_params - 1;
5861 }
5862 }
5863
5864 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5865 LLVMTypeRef *params,
5866 unsigned *num_params)
5867 {
5868 params[ctx->param_rw_buffers = (*num_params)++] =
5869 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
5870 declare_per_stage_desc_pointers(ctx, params, num_params, true);
5871 }
5872
5873 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
5874 LLVMTypeRef *params,
5875 unsigned *num_params)
5876 {
5877 params[ctx->param_vertex_buffers = (*num_params)++] =
5878 const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS);
5879 params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
5880 params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
5881 params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
5882 params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
5883 }
5884
5885 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
5886 LLVMTypeRef *params, unsigned *num_params,
5887 unsigned *num_prolog_vgprs)
5888 {
5889 struct si_shader *shader = ctx->shader;
5890
5891 params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
5892 if (shader->key.as_ls) {
5893 params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
5894 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5895 } else {
5896 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5897 params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
5898 }
5899 params[(*num_params)++] = ctx->i32; /* unused */
5900
5901 if (!shader->is_gs_copy_shader) {
5902 /* Vertex load indices. */
5903 ctx->param_vertex_index0 = (*num_params);
5904 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
5905 params[(*num_params)++] = ctx->i32;
5906 *num_prolog_vgprs += shader->selector->info.num_inputs;
5907 }
5908 }
5909
5910 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
5911 LLVMTypeRef *params, unsigned *num_params)
5912 {
5913 params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
5914 params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
5915 params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
5916 params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
5917 }
5918
5919 enum {
5920 /* Convenient merged shader definitions. */
5921 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
5922 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
5923 };
5924
5925 static void create_function(struct si_shader_context *ctx)
5926 {
5927 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5928 struct gallivm_state *gallivm = &ctx->gallivm;
5929 struct si_shader *shader = ctx->shader;
5930 LLVMTypeRef params[100]; /* just make it large enough */
5931 LLVMTypeRef returns[16+32*4];
5932 unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5933 unsigned num_returns = 0;
5934 unsigned num_prolog_vgprs = 0;
5935 unsigned type = ctx->type;
5936
5937 /* Set MERGED shaders. */
5938 if (ctx->screen->b.chip_class >= GFX9) {
5939 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
5940 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
5941 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
5942 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
5943 }
5944
5945 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
5946
5947 switch (type) {
5948 case PIPE_SHADER_VERTEX:
5949 declare_default_desc_pointers(ctx, params, &num_params);
5950 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5951
5952 if (shader->key.as_es) {
5953 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5954 } else if (shader->key.as_ls) {
5955 /* no extra parameters */
5956 } else {
5957 if (shader->is_gs_copy_shader)
5958 num_params = ctx->param_rw_buffers + 1;
5959
5960 /* The locations of the other parameters are assigned dynamically. */
5961 declare_streamout_params(ctx, &shader->selector->so,
5962 params, ctx->i32, &num_params);
5963 }
5964
5965 last_sgpr = num_params-1;
5966
5967 /* VGPRs */
5968 declare_vs_input_vgprs(ctx, params, &num_params,
5969 &num_prolog_vgprs);
5970 break;
5971
5972 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
5973 declare_default_desc_pointers(ctx, params, &num_params);
5974 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5975 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5976 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5977 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5978 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
5979 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
5980 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5981 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5982 last_sgpr = num_params - 1;
5983
5984 /* VGPRs */
5985 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5986 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5987
5988 /* param_tcs_offchip_offset and param_tcs_factor_offset are
5989 * placed after the user SGPRs.
5990 */
5991 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
5992 returns[num_returns++] = ctx->i32; /* SGPRs */
5993 for (i = 0; i < 3; i++)
5994 returns[num_returns++] = ctx->f32; /* VGPRs */
5995 break;
5996
5997 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
5998 /* Merged stages have 8 system SGPRs at the beginning. */
5999 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
6000 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
6001 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6002 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
6003 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
6004 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
6005 params[num_params++] = ctx->i32; /* unused */
6006 params[num_params++] = ctx->i32; /* unused */
6007
6008 params[num_params++] = ctx->i32; /* unused */
6009 params[num_params++] = ctx->i32; /* unused */
6010 declare_per_stage_desc_pointers(ctx, params, &num_params,
6011 ctx->type == PIPE_SHADER_VERTEX);
6012 declare_vs_specific_input_sgprs(ctx, params, &num_params);
6013
6014 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6015 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
6016 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
6017 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6018 params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
6019 params[num_params++] = ctx->i32; /* unused */
6020
6021 declare_per_stage_desc_pointers(ctx, params, &num_params,
6022 ctx->type == PIPE_SHADER_TESS_CTRL);
6023 last_sgpr = num_params - 1;
6024
6025 /* VGPRs (first TCS, then VS) */
6026 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
6027 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
6028
6029 if (ctx->type == PIPE_SHADER_VERTEX) {
6030 declare_vs_input_vgprs(ctx, params, &num_params,
6031 &num_prolog_vgprs);
6032
6033 /* LS return values are inputs to the TCS main shader part. */
6034 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
6035 returns[num_returns++] = ctx->i32; /* SGPRs */
6036 for (i = 0; i < 2; i++)
6037 returns[num_returns++] = ctx->f32; /* VGPRs */
6038 } else {
6039 /* TCS return values are inputs to the TCS epilog.
6040 *
6041 * param_tcs_offchip_offset, param_tcs_factor_offset,
6042 * param_tcs_offchip_layout, and param_rw_buffers
6043 * should be passed to the epilog.
6044 */
6045 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
6046 returns[num_returns++] = ctx->i32; /* SGPRs */
6047 for (i = 0; i < 3; i++)
6048 returns[num_returns++] = ctx->f32; /* VGPRs */
6049 }
6050 break;
6051
6052 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
6053 /* Merged stages have 8 system SGPRs at the beginning. */
6054 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
6055 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS);
6056 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6057 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
6058 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6059 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
6060 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
6061 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
6062
6063 params[num_params++] = ctx->i32; /* unused */
6064 params[num_params++] = ctx->i32; /* unused */
6065 declare_per_stage_desc_pointers(ctx, params, &num_params,
6066 (ctx->type == PIPE_SHADER_VERTEX ||
6067 ctx->type == PIPE_SHADER_TESS_EVAL));
6068 if (ctx->type == PIPE_SHADER_VERTEX) {
6069 declare_vs_specific_input_sgprs(ctx, params, &num_params);
6070 } else {
6071 /* TESS_EVAL (and also GEOMETRY):
6072 * Declare as many input SGPRs as the VS has. */
6073 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6074 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6075 params[num_params++] = ctx->i32; /* unused */
6076 params[num_params++] = ctx->i32; /* unused */
6077 params[num_params++] = ctx->i32; /* unused */
6078 params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
6079 }
6080
6081 declare_per_stage_desc_pointers(ctx, params, &num_params,
6082 ctx->type == PIPE_SHADER_GEOMETRY);
6083 last_sgpr = num_params - 1;
6084
6085 /* VGPRs (first GS, then VS/TES) */
6086 params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
6087 params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
6088 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6089 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6090 params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
6091
6092 if (ctx->type == PIPE_SHADER_VERTEX) {
6093 declare_vs_input_vgprs(ctx, params, &num_params,
6094 &num_prolog_vgprs);
6095 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
6096 declare_tes_input_vgprs(ctx, params, &num_params);
6097 }
6098
6099 if (ctx->type == PIPE_SHADER_VERTEX ||
6100 ctx->type == PIPE_SHADER_TESS_EVAL) {
6101 /* ES return values are inputs to GS. */
6102 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
6103 returns[num_returns++] = ctx->i32; /* SGPRs */
6104 for (i = 0; i < 5; i++)
6105 returns[num_returns++] = ctx->f32; /* VGPRs */
6106 }
6107 break;
6108
6109 case PIPE_SHADER_TESS_EVAL:
6110 declare_default_desc_pointers(ctx, params, &num_params);
6111 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6112 params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
6113
6114 if (shader->key.as_es) {
6115 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6116 params[num_params++] = ctx->i32;
6117 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
6118 } else {
6119 params[num_params++] = ctx->i32;
6120 declare_streamout_params(ctx, &shader->selector->so,
6121 params, ctx->i32, &num_params);
6122 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6123 }
6124 last_sgpr = num_params - 1;
6125
6126 /* VGPRs */
6127 declare_tes_input_vgprs(ctx, params, &num_params);
6128 break;
6129
6130 case PIPE_SHADER_GEOMETRY:
6131 declare_default_desc_pointers(ctx, params, &num_params);
6132 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6133 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
6134 last_sgpr = num_params - 1;
6135
6136 /* VGPRs */
6137 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
6138 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
6139 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6140 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
6141 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
6142 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
6143 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
6144 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6145 break;
6146
6147 case PIPE_SHADER_FRAGMENT:
6148 declare_default_desc_pointers(ctx, params, &num_params);
6149 params[SI_PARAM_ALPHA_REF] = ctx->f32;
6150 params[SI_PARAM_PRIM_MASK] = ctx->i32;
6151 last_sgpr = SI_PARAM_PRIM_MASK;
6152 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
6153 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
6154 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
6155 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
6156 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
6157 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
6158 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
6159 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
6160 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
6161 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
6162 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
6163 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
6164 params[SI_PARAM_FRONT_FACE] = ctx->i32;
6165 shader->info.face_vgpr_index = 20;
6166 params[SI_PARAM_ANCILLARY] = ctx->i32;
6167 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
6168 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
6169 num_params = SI_PARAM_POS_FIXED_PT+1;
6170
6171 /* Color inputs from the prolog. */
6172 if (shader->selector->info.colors_read) {
6173 unsigned num_color_elements =
6174 util_bitcount(shader->selector->info.colors_read);
6175
6176 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
6177 for (i = 0; i < num_color_elements; i++)
6178 params[num_params++] = ctx->f32;
6179
6180 num_prolog_vgprs += num_color_elements;
6181 }
6182
6183 /* Outputs for the epilog. */
6184 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
6185 num_returns =
6186 num_return_sgprs +
6187 util_bitcount(shader->selector->info.colors_written) * 4 +
6188 shader->selector->info.writes_z +
6189 shader->selector->info.writes_stencil +
6190 shader->selector->info.writes_samplemask +
6191 1 /* SampleMaskIn */;
6192
6193 num_returns = MAX2(num_returns,
6194 num_return_sgprs +
6195 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6196
6197 for (i = 0; i < num_return_sgprs; i++)
6198 returns[i] = ctx->i32;
6199 for (; i < num_returns; i++)
6200 returns[i] = ctx->f32;
6201 break;
6202
6203 case PIPE_SHADER_COMPUTE:
6204 declare_default_desc_pointers(ctx, params, &num_params);
6205 if (shader->selector->info.uses_grid_size)
6206 params[ctx->param_grid_size = num_params++] = v3i32;
6207 if (shader->selector->info.uses_block_size)
6208 params[ctx->param_block_size = num_params++] = v3i32;
6209
6210 for (i = 0; i < 3; i++) {
6211 ctx->param_block_id[i] = -1;
6212 if (shader->selector->info.uses_block_id[i])
6213 params[ctx->param_block_id[i] = num_params++] = ctx->i32;
6214 }
6215 last_sgpr = num_params - 1;
6216
6217 params[ctx->param_thread_id = num_params++] = v3i32;
6218 break;
6219 default:
6220 assert(0 && "unimplemented shader");
6221 return;
6222 }
6223
6224 assert(num_params <= ARRAY_SIZE(params));
6225
6226 si_create_function(ctx, "main", returns, num_returns, params,
6227 num_params, last_sgpr,
6228 si_get_max_workgroup_size(shader));
6229
6230 /* Reserve register locations for VGPR inputs the PS prolog may need. */
6231 if (ctx->type == PIPE_SHADER_FRAGMENT &&
6232 ctx->separate_prolog) {
6233 si_llvm_add_attribute(ctx->main_fn,
6234 "InitialPSInputAddr",
6235 S_0286D0_PERSP_SAMPLE_ENA(1) |
6236 S_0286D0_PERSP_CENTER_ENA(1) |
6237 S_0286D0_PERSP_CENTROID_ENA(1) |
6238 S_0286D0_LINEAR_SAMPLE_ENA(1) |
6239 S_0286D0_LINEAR_CENTER_ENA(1) |
6240 S_0286D0_LINEAR_CENTROID_ENA(1) |
6241 S_0286D0_FRONT_FACE_ENA(1) |
6242 S_0286D0_POS_FIXED_PT_ENA(1));
6243 }
6244
6245 shader->info.num_input_sgprs = 0;
6246 shader->info.num_input_vgprs = 0;
6247
6248 for (i = 0; i <= last_sgpr; ++i)
6249 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
6250
6251 for (; i < num_params; ++i)
6252 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
6253
6254 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
6255 shader->info.num_input_vgprs -= num_prolog_vgprs;
6256
6257 if (!ctx->screen->has_ds_bpermute &&
6258 bld_base->info &&
6259 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
6260 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
6261 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
6262 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
6263 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
6264 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
6265 ctx->lds =
6266 LLVMAddGlobalInAddressSpace(gallivm->module,
6267 LLVMArrayType(ctx->i32, 64),
6268 "ddxy_lds",
6269 LOCAL_ADDR_SPACE);
6270
6271 if (shader->key.as_ls ||
6272 ctx->type == PIPE_SHADER_TESS_CTRL ||
6273 /* GFX9 has the ESGS ring buffer in LDS. */
6274 (ctx->screen->b.chip_class >= GFX9 &&
6275 (shader->key.as_es ||
6276 ctx->type == PIPE_SHADER_GEOMETRY)))
6277 declare_lds_as_pointer(ctx);
6278 }
6279
6280 /**
6281 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
6282 * for later use.
6283 */
6284 static void preload_ring_buffers(struct si_shader_context *ctx)
6285 {
6286 struct gallivm_state *gallivm = &ctx->gallivm;
6287 LLVMBuilderRef builder = gallivm->builder;
6288
6289 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
6290 ctx->param_rw_buffers);
6291
6292 if (ctx->screen->b.chip_class <= VI &&
6293 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
6294 unsigned ring =
6295 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
6296 : SI_ES_RING_ESGS;
6297 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
6298
6299 ctx->esgs_ring =
6300 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6301 }
6302
6303 if (ctx->shader->is_gs_copy_shader) {
6304 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6305
6306 ctx->gsvs_ring[0] =
6307 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6308 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
6309 const struct si_shader_selector *sel = ctx->shader->selector;
6310 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6311 LLVMValueRef base_ring;
6312
6313 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6314
6315 /* The conceptual layout of the GSVS ring is
6316 * v0c0 .. vLc0 v0c1 .. vLc1 ..
6317 * but the real memory layout is swizzled across
6318 * threads:
6319 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
6320 * t16v0c0 ..
6321 * Override the buffer descriptor accordingly.
6322 */
6323 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
6324 uint64_t stream_offset = 0;
6325
6326 for (unsigned stream = 0; stream < 4; ++stream) {
6327 unsigned num_components;
6328 unsigned stride;
6329 unsigned num_records;
6330 LLVMValueRef ring, tmp;
6331
6332 num_components = sel->info.num_stream_output_components[stream];
6333 if (!num_components)
6334 continue;
6335
6336 stride = 4 * num_components * sel->gs_max_out_vertices;
6337
6338 /* Limit on the stride field for <= CIK. */
6339 assert(stride < (1 << 14));
6340
6341 num_records = 64;
6342
6343 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
6344 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
6345 tmp = LLVMBuildAdd(builder, tmp,
6346 LLVMConstInt(ctx->i64,
6347 stream_offset, 0), "");
6348 stream_offset += stride * 64;
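/* Each stream's region covers one fully swizzled wave:
 * 64 threads * stride bytes per thread.
 */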
6349
6350 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
6351 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
6352 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
6353 tmp = LLVMBuildOr(builder, tmp,
6354 LLVMConstInt(ctx->i32,
6355 S_008F04_STRIDE(stride) |
6356 S_008F04_SWIZZLE_ENABLE(1), 0), "");
6357 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
6358 ring = LLVMBuildInsertElement(builder, ring,
6359 LLVMConstInt(ctx->i32, num_records, 0),
6360 LLVMConstInt(ctx->i32, 2, 0), "");
6361 ring = LLVMBuildInsertElement(builder, ring,
6362 LLVMConstInt(ctx->i32,
6363 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
6364 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6365 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
6366 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
6367 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6368 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
6369 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
6370 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
6371 S_008F0C_ADD_TID_ENABLE(1),
6372 0),
6373 LLVMConstInt(ctx->i32, 3, 0), "");
6374
6375 ctx->gsvs_ring[stream] = ring;
6376 }
6377 }
6378 }
6379
6380 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6381 LLVMValueRef param_rw_buffers,
6382 unsigned param_pos_fixed_pt)
6383 {
6384 struct gallivm_state *gallivm = &ctx->gallivm;
6385 LLVMBuilderRef builder = gallivm->builder;
6386 LLVMValueRef slot, desc, offset, row, bit, address[2];
6387
6388 /* Use the fixed-point gl_FragCoord input.
6389 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6390 * per coordinate to get the repeating effect.
6391 */
6392 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6393 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6394
6395 /* Load the buffer descriptor. */
6396 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6397 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6398
6399 /* The stipple pattern is 32x32, each row has 32 bits. */
6400 offset = LLVMBuildMul(builder, address[1],
6401 LLVMConstInt(ctx->i32, 4, 0), "");
6402 row = buffer_load_const(ctx, desc, offset);
6403 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6404 bit = LLVMBuildLShr(builder, row, address[0], "");
6405 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
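/* e.g. a fragment at (35, 2) wraps to (3, 2) and tests bit 3 of
 * pattern row 2.
 */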
6406
6407 /* The intrinsic kills the thread if arg < 0. */
6408 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6409 LLVMConstReal(ctx->f32, -1), "");
6410 ac_build_kill(&ctx->ac, bit);
6411 }
6412
6413 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6414 struct si_shader_config *conf,
6415 unsigned symbol_offset)
6416 {
6417 unsigned i;
6418 const unsigned char *config =
6419 ac_shader_binary_config_start(binary, symbol_offset);
6420 bool really_needs_scratch = false;
6421
6422 /* LLVM adds SGPR spills to the scratch size.
6423 * Find out if we really need the scratch buffer.
6424 */
6425 for (i = 0; i < binary->reloc_count; i++) {
6426 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6427
6428 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6429 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6430 really_needs_scratch = true;
6431 break;
6432 }
6433 }
6434
6435 /* XXX: We may be able to emit some of these values directly rather than
6436 * extracting fields to be emitted later.
6437 */
6438
6439 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
6440 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6441 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6442 switch (reg) {
6443 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6444 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6445 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6446 case R_00B848_COMPUTE_PGM_RSRC1:
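/* The packed fields are biased allocation granules:
 * (SGPRS + 1) * 8 registers and (VGPRS + 1) * 4 registers.
 */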
6447 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6448 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6449 conf->float_mode = G_00B028_FLOAT_MODE(value);
6450 conf->rsrc1 = value;
6451 break;
6452 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6453 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6454 break;
6455 case R_00B84C_COMPUTE_PGM_RSRC2:
6456 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6457 conf->rsrc2 = value;
6458 break;
6459 case R_0286CC_SPI_PS_INPUT_ENA:
6460 conf->spi_ps_input_ena = value;
6461 break;
6462 case R_0286D0_SPI_PS_INPUT_ADDR:
6463 conf->spi_ps_input_addr = value;
6464 break;
6465 case R_0286E8_SPI_TMPRING_SIZE:
6466 case R_00B860_COMPUTE_TMPRING_SIZE:
6467 /* WAVESIZE is in units of 256 dwords. */
6468 if (really_needs_scratch)
6469 conf->scratch_bytes_per_wave =
6470 G_00B860_WAVESIZE(value) * 256 * 4;
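/* e.g. WAVESIZE = 2 means 2 * 256 dwords = 2 KB of scratch
 * per wave.
 */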
6471 break;
6472 case 0x4: /* SPILLED_SGPRS */
6473 conf->spilled_sgprs = value;
6474 break;
6475 case 0x8: /* SPILLED_VGPRS */
6476 conf->spilled_vgprs = value;
6477 break;
6478 default:
6479 {
6480 static bool printed;
6481
6482 if (!printed) {
6483 fprintf(stderr, "Warning: LLVM emitted unknown "
6484 "config register: 0x%x\n", reg);
6485 printed = true;
6486 }
6487 }
6488 break;
6489 }
6490 }
6491
6492 if (!conf->spi_ps_input_addr)
6493 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6494 }
6495
6496 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6497 struct si_shader *shader,
6498 struct si_shader_config *config,
6499 uint64_t scratch_va)
6500 {
6501 unsigned i;
6502 uint32_t scratch_rsrc_dword0 = scratch_va;
6503 uint32_t scratch_rsrc_dword1 =
6504 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6505
6506 /* Enable scratch coalescing. */
6507 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6508
6509 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6510 const struct ac_shader_reloc *reloc =
6511 &shader->binary.relocs[i];
6512 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6513 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6514 &scratch_rsrc_dword0, 4);
6515 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6516 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6517 &scratch_rsrc_dword1, 4);
6518 }
6519 }
6520 }
6521
6522 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6523 {
6524 unsigned size = shader->binary.code_size;
6525
6526 if (shader->prolog)
6527 size += shader->prolog->binary.code_size;
6528 if (shader->previous_stage)
6529 size += shader->previous_stage->binary.code_size;
6530 if (shader->prolog2)
6531 size += shader->prolog2->binary.code_size;
6532 if (shader->epilog)
6533 size += shader->epilog->binary.code_size;
6534 return size;
6535 }
6536
6537 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6538 {
6539 const struct ac_shader_binary *prolog =
6540 shader->prolog ? &shader->prolog->binary : NULL;
6541 const struct ac_shader_binary *previous_stage =
6542 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6543 const struct ac_shader_binary *prolog2 =
6544 shader->prolog2 ? &shader->prolog2->binary : NULL;
6545 const struct ac_shader_binary *epilog =
6546 shader->epilog ? &shader->epilog->binary : NULL;
6547 const struct ac_shader_binary *mainb = &shader->binary;
6548 unsigned bo_size = si_get_shader_binary_size(shader) +
6549 (!epilog ? mainb->rodata_size : 0);
6550 unsigned char *ptr;
6551
6552 assert(!prolog || !prolog->rodata_size);
6553 assert(!previous_stage || !previous_stage->rodata_size);
6554 assert(!prolog2 || !prolog2->rodata_size);
6555 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
6556 !mainb->rodata_size);
6557 assert(!epilog || !epilog->rodata_size);
6558
6559 /* GFX9 can fetch at most 128 bytes past the end of the shader.
6560 * Prevent VM faults.
6561 */
6562 if (sscreen->b.chip_class >= GFX9)
6563 bo_size += 128;
6564
6565 r600_resource_reference(&shader->bo, NULL);
6566 shader->bo = (struct r600_resource*)
6567 pipe_buffer_create(&sscreen->b.b, 0,
6568 PIPE_USAGE_IMMUTABLE,
6569 align(bo_size, SI_CPDMA_ALIGNMENT));
6570 if (!shader->bo)
6571 return -ENOMEM;
6572
6573 /* Upload. */
6574 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6575 PIPE_TRANSFER_READ_WRITE |
6576 PIPE_TRANSFER_UNSYNCHRONIZED);
6577
6578 if (prolog) {
6579 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6580 ptr += prolog->code_size;
6581 }
6582 if (previous_stage) {
6583 util_memcpy_cpu_to_le32(ptr, previous_stage->code,
6584 previous_stage->code_size);
6585 ptr += previous_stage->code_size;
6586 }
6587 if (prolog2) {
6588 util_memcpy_cpu_to_le32(ptr, prolog2->code, prolog2->code_size);
6589 ptr += prolog2->code_size;
6590 }
6591
6592 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6593 ptr += mainb->code_size;
6594
6595 if (epilog)
6596 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6597 else if (mainb->rodata_size > 0)
6598 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6599
6600 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6601 return 0;
6602 }
6603
6604 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6605 struct pipe_debug_callback *debug,
6606 const char *name, FILE *file)
6607 {
6608 char *line, *p;
6609 unsigned i, count;
6610
6611 if (binary->disasm_string) {
6612 fprintf(file, "Shader %s disassembly:\n", name);
6613 fprintf(file, "%s", binary->disasm_string);
6614
6615 if (debug && debug->debug_message) {
6616 /* Very long debug messages are cut off, so send the
6617 * disassembly one line at a time. This causes more
6618 * overhead, but on the plus side it simplifies
6619 * parsing of resulting logs.
6620 */
6621 pipe_debug_message(debug, SHADER_INFO,
6622 "Shader Disassembly Begin");
6623
6624 line = binary->disasm_string;
6625 while (*line) {
6626 p = util_strchrnul(line, '\n');
6627 count = p - line;
6628
6629 if (count) {
6630 pipe_debug_message(debug, SHADER_INFO,
6631 "%.*s", count, line);
6632 }
6633
6634 if (!*p)
6635 break;
6636 line = p + 1;
6637 }
6638
6639 pipe_debug_message(debug, SHADER_INFO,
6640 "Shader Disassembly End");
6641 }
6642 } else {
6643 fprintf(file, "Shader %s binary:\n", name);
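		/* No disassembly available; fall back to a hex dump, one
		 * dword per line with the most significant byte first. */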
6644 for (i = 0; i < binary->code_size; i += 4) {
6645 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6646 binary->code[i + 3], binary->code[i + 2],
6647 binary->code[i + 1], binary->code[i]);
6648 }
6649 }
6650 }
6651
6652 static void si_shader_dump_stats(struct si_screen *sscreen,
6653 struct si_shader *shader,
6654 struct pipe_debug_callback *debug,
6655 unsigned processor,
6656 FILE *file,
6657 bool check_debug_option)
6658 {
6659 struct si_shader_config *conf = &shader->config;
6660 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6661 unsigned code_size = si_get_shader_binary_size(shader);
6662 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6663 unsigned lds_per_wave = 0;
6664 unsigned max_simd_waves = 10;
6665
6666 /* Compute LDS usage for PS. */
6667 switch (processor) {
6668 case PIPE_SHADER_FRAGMENT:
6669 /* The minimum usage per wave is (num_inputs * 48). The maximum
6670 * usage is (num_inputs * 48 * 16).
6671 * We can get anything in between and it varies between waves.
6672 *
6673 * The 48 bytes per input for a single primitive is equal to
6674 * 4 bytes/component * 4 components/input * 3 points.
6675 *
6676 	 * Other stages don't know the size at compile time or don't
6677 	 * allocate LDS per wave; they allocate it per thread group instead.
6678 */
6679 lds_per_wave = conf->lds_size * lds_increment +
6680 align(num_inputs * 48, lds_increment);
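		/* Example (illustrative): 8 inputs and conf->lds_size == 0 on
		 * CIK+ give lds_per_wave = align(8 * 48, 512) = 512 bytes. */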
6681 break;
6682 case PIPE_SHADER_COMPUTE:
6683 if (shader->selector) {
6684 unsigned max_workgroup_size =
6685 si_get_max_workgroup_size(shader);
6686 lds_per_wave = (conf->lds_size * lds_increment) /
6687 DIV_ROUND_UP(max_workgroup_size, 64);
6688 }
6689 break;
6690 }
6691
6692 /* Compute the per-SIMD wave counts. */
6693 if (conf->num_sgprs) {
6694 if (sscreen->b.chip_class >= VI)
6695 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6696 else
6697 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6698 }
6699
6700 if (conf->num_vgprs)
6701 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6702
6703 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6704 * 16KB makes some SIMDs unoccupied). */
6705 if (lds_per_wave)
6706 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
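	/* Example (illustrative): 40 SGPRs and 96 VGPRs on VI give
	 * MIN2(10, 800 / 40) = 10 and MIN2(10, 256 / 96) = 2 waves per SIMD,
	 * i.e. such a shader would be VGPR-limited to 2 waves. */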
6707
6708 if (!check_debug_option ||
6709 r600_can_dump_shader(&sscreen->b, processor)) {
6710 if (processor == PIPE_SHADER_FRAGMENT) {
6711 fprintf(file, "*** SHADER CONFIG ***\n"
6712 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6713 "SPI_PS_INPUT_ENA = 0x%04x\n",
6714 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6715 }
6716
6717 fprintf(file, "*** SHADER STATS ***\n"
6718 "SGPRS: %d\n"
6719 "VGPRS: %d\n"
6720 "Spilled SGPRs: %d\n"
6721 "Spilled VGPRs: %d\n"
6722 "Private memory VGPRs: %d\n"
6723 "Code Size: %d bytes\n"
6724 "LDS: %d blocks\n"
6725 "Scratch: %d bytes per wave\n"
6726 "Max Waves: %d\n"
6727 "********************\n\n\n",
6728 conf->num_sgprs, conf->num_vgprs,
6729 conf->spilled_sgprs, conf->spilled_vgprs,
6730 conf->private_mem_vgprs, code_size,
6731 conf->lds_size, conf->scratch_bytes_per_wave,
6732 max_simd_waves);
6733 }
6734
6735 pipe_debug_message(debug, SHADER_INFO,
6736 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6737 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6738 "Spilled VGPRs: %d PrivMem VGPRs: %d",
6739 conf->num_sgprs, conf->num_vgprs, code_size,
6740 conf->lds_size, conf->scratch_bytes_per_wave,
6741 max_simd_waves, conf->spilled_sgprs,
6742 conf->spilled_vgprs, conf->private_mem_vgprs);
6743 }
6744
6745 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6746 {
6747 switch (processor) {
6748 case PIPE_SHADER_VERTEX:
6749 if (shader->key.as_es)
6750 return "Vertex Shader as ES";
6751 else if (shader->key.as_ls)
6752 return "Vertex Shader as LS";
6753 else
6754 return "Vertex Shader as VS";
6755 case PIPE_SHADER_TESS_CTRL:
6756 return "Tessellation Control Shader";
6757 case PIPE_SHADER_TESS_EVAL:
6758 if (shader->key.as_es)
6759 return "Tessellation Evaluation Shader as ES";
6760 else
6761 return "Tessellation Evaluation Shader as VS";
6762 case PIPE_SHADER_GEOMETRY:
6763 if (shader->is_gs_copy_shader)
6764 return "GS Copy Shader as VS";
6765 else
6766 return "Geometry Shader";
6767 case PIPE_SHADER_FRAGMENT:
6768 return "Pixel Shader";
6769 case PIPE_SHADER_COMPUTE:
6770 return "Compute Shader";
6771 default:
6772 return "Unknown Shader";
6773 }
6774 }
6775
6776 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6777 struct pipe_debug_callback *debug, unsigned processor,
6778 FILE *file, bool check_debug_option)
6779 {
6780 if (!check_debug_option ||
6781 r600_can_dump_shader(&sscreen->b, processor))
6782 si_dump_shader_key(processor, shader, file);
6783
6784 if (!check_debug_option && shader->binary.llvm_ir_string) {
6785 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6786 si_get_shader_name(shader, processor));
6787 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6788 }
6789
6790 if (!check_debug_option ||
6791 (r600_can_dump_shader(&sscreen->b, processor) &&
6792 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6793 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6794
6795 if (shader->prolog)
6796 si_shader_dump_disassembly(&shader->prolog->binary,
6797 debug, "prolog", file);
6798 if (shader->previous_stage)
6799 si_shader_dump_disassembly(&shader->previous_stage->binary,
6800 debug, "previous stage", file);
6801 if (shader->prolog2)
6802 si_shader_dump_disassembly(&shader->prolog2->binary,
6803 debug, "prolog2", file);
6804
6805 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6806
6807 if (shader->epilog)
6808 si_shader_dump_disassembly(&shader->epilog->binary,
6809 debug, "epilog", file);
6810 fprintf(file, "\n");
6811 }
6812
6813 si_shader_dump_stats(sscreen, shader, debug, processor, file,
6814 check_debug_option);
6815 }
6816
6817 int si_compile_llvm(struct si_screen *sscreen,
6818 struct ac_shader_binary *binary,
6819 struct si_shader_config *conf,
6820 LLVMTargetMachineRef tm,
6821 LLVMModuleRef mod,
6822 struct pipe_debug_callback *debug,
6823 unsigned processor,
6824 const char *name)
6825 {
6826 int r = 0;
6827 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6828
6829 if (r600_can_dump_shader(&sscreen->b, processor)) {
6830 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6831
6832 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6833 fprintf(stderr, "%s LLVM IR:\n\n", name);
6834 ac_dump_module(mod);
6835 fprintf(stderr, "\n");
6836 }
6837 }
6838
6839 if (sscreen->record_llvm_ir) {
6840 char *ir = LLVMPrintModuleToString(mod);
6841 binary->llvm_ir_string = strdup(ir);
6842 LLVMDisposeMessage(ir);
6843 }
6844
6845 if (!si_replace_shader(count, binary)) {
6846 r = si_llvm_compile(mod, binary, tm, debug);
6847 if (r)
6848 return r;
6849 }
6850
6851 si_shader_binary_read_config(binary, conf, 0);
6852
6853 /* Enable 64-bit and 16-bit denormals, because there is no performance
6854 * cost.
6855 *
6856 * If denormals are enabled, all floating-point output modifiers are
6857 * ignored.
6858 *
6859 * Don't enable denormals for 32-bit floats, because:
6860 * - Floating-point output modifiers would be ignored by the hw.
6861 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6862 * have to stop using those.
6863 * - SI & CI would be very slow.
6864 */
6865 conf->float_mode |= V_00B028_FP_64_DENORMS;
6866
6867 FREE(binary->config);
6868 FREE(binary->global_symbol_offsets);
6869 binary->config = NULL;
6870 binary->global_symbol_offsets = NULL;
6871
6872 /* Some shaders can't have rodata because their binaries can be
6873 * concatenated.
6874 */
6875 if (binary->rodata_size &&
6876 (processor == PIPE_SHADER_VERTEX ||
6877 processor == PIPE_SHADER_TESS_CTRL ||
6878 processor == PIPE_SHADER_TESS_EVAL ||
6879 processor == PIPE_SHADER_FRAGMENT)) {
6880 		fprintf(stderr, "radeonsi: The shader can't have rodata.\n");
6881 return -EINVAL;
6882 }
6883
6884 return r;
6885 }
6886
6887 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6888 {
6889 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6890 LLVMBuildRetVoid(ctx->gallivm.builder);
6891 else
6892 LLVMBuildRet(ctx->gallivm.builder, ret);
6893 }
6894
6895 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6896 struct si_shader *
6897 si_generate_gs_copy_shader(struct si_screen *sscreen,
6898 LLVMTargetMachineRef tm,
6899 struct si_shader_selector *gs_selector,
6900 struct pipe_debug_callback *debug)
6901 {
6902 struct si_shader_context ctx;
6903 struct si_shader *shader;
6904 struct gallivm_state *gallivm = &ctx.gallivm;
6905 LLVMBuilderRef builder;
6906 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6907 struct lp_build_context *uint = &bld_base->uint_bld;
6908 struct si_shader_output_values *outputs;
6909 struct tgsi_shader_info *gsinfo = &gs_selector->info;
6910 int i, r;
6911
6912 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6913
6914 if (!outputs)
6915 return NULL;
6916
6917 shader = CALLOC_STRUCT(si_shader);
6918 if (!shader) {
6919 FREE(outputs);
6920 return NULL;
6921 }
6922
6923
6924 shader->selector = gs_selector;
6925 shader->is_gs_copy_shader = true;
6926
6927 si_init_shader_ctx(&ctx, sscreen, tm);
6928 ctx.shader = shader;
6929 ctx.type = PIPE_SHADER_VERTEX;
6930
6931 builder = gallivm->builder;
6932
6933 create_function(&ctx);
6934 preload_ring_buffers(&ctx);
6935
6936 LLVMValueRef voffset =
6937 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6938 ctx.param_vertex_id), 4);
6939
6940 	/* Fetch the vertex stream ID (packed in bits [25:24] of the streamout config). */
6941 LLVMValueRef stream_id;
6942
6943 if (gs_selector->so.num_outputs)
6944 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6945 else
6946 stream_id = ctx.i32_0;
6947
6948 /* Fill in output information. */
6949 for (i = 0; i < gsinfo->num_outputs; ++i) {
6950 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6951 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6952
6953 for (int chan = 0; chan < 4; chan++) {
6954 outputs[i].vertex_stream[chan] =
6955 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6956 }
6957 }
6958
6959 LLVMBasicBlockRef end_bb;
6960 LLVMValueRef switch_inst;
6961
6962 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6963 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6964
6965 for (int stream = 0; stream < 4; stream++) {
6966 LLVMBasicBlockRef bb;
6967 unsigned offset;
6968
6969 if (!gsinfo->num_stream_output_components[stream])
6970 continue;
6971
6972 if (stream > 0 && !gs_selector->so.num_outputs)
6973 continue;
6974
6975 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6976 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
6977 LLVMPositionBuilderAtEnd(builder, bb);
6978
6979 /* Fetch vertex data from GSVS ring */
6980 offset = 0;
6981 for (i = 0; i < gsinfo->num_outputs; ++i) {
6982 for (unsigned chan = 0; chan < 4; chan++) {
6983 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6984 outputs[i].vertex_stream[chan] != stream) {
6985 outputs[i].values[chan] = ctx.bld_base.base.undef;
6986 continue;
6987 }
6988
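			/* Each enabled output component is assumed to occupy a
			 * contiguous gs_max_out_vertices * 16 * 4 byte slice of
			 * the GSVS ring; soffset selects the slice and voffset
			 * the vertex within it. */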
6989 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
6990 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
6991 offset++;
6992
6993 outputs[i].values[chan] =
6994 ac_build_buffer_load(&ctx.ac,
6995 ctx.gsvs_ring[0], 1,
6996 ctx.i32_0, voffset,
6997 soffset, 0, 1, 1, true);
6998 }
6999 }
7000
7001 /* Streamout and exports. */
7002 if (gs_selector->so.num_outputs) {
7003 si_llvm_emit_streamout(&ctx, outputs,
7004 gsinfo->num_outputs,
7005 stream);
7006 }
7007
7008 if (stream == 0)
7009 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
7010
7011 LLVMBuildBr(builder, end_bb);
7012 }
7013
7014 LLVMPositionBuilderAtEnd(builder, end_bb);
7015
7016 LLVMBuildRetVoid(gallivm->builder);
7017
7018 /* Dump LLVM IR before any optimization passes */
7019 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
7020 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
7021 ac_dump_module(ctx.gallivm.module);
7022
7023 si_llvm_finalize_module(&ctx,
7024 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
7025
7026 r = si_compile_llvm(sscreen, &ctx.shader->binary,
7027 &ctx.shader->config, ctx.tm,
7028 ctx.gallivm.module,
7029 debug, PIPE_SHADER_GEOMETRY,
7030 "GS Copy Shader");
7031 if (!r) {
7032 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
7033 fprintf(stderr, "GS Copy Shader:\n");
7034 si_shader_dump(sscreen, ctx.shader, debug,
7035 PIPE_SHADER_GEOMETRY, stderr, true);
7036 r = si_shader_binary_upload(sscreen, ctx.shader);
7037 }
7038
7039 si_llvm_dispose(&ctx);
7040
7041 FREE(outputs);
7042
7043 if (r != 0) {
7044 FREE(shader);
7045 shader = NULL;
7046 }
7047 return shader;
7048 }
7049
7050 static void si_dump_shader_key_vs(struct si_shader_key *key,
7051 struct si_vs_prolog_bits *prolog,
7052 const char *prefix, FILE *f)
7053 {
7054 fprintf(f, " %s.instance_divisors = {", prefix);
7055 for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
7056 fprintf(f, !i ? "%u" : ", %u",
7057 prolog->instance_divisors[i]);
7058 }
7059 fprintf(f, "}\n");
7060
7061 fprintf(f, " mono.vs.fix_fetch = {");
7062 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
7063 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
7064 fprintf(f, "}\n");
7065 }
7066
7067 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
7068 FILE *f)
7069 {
7070 struct si_shader_key *key = &shader->key;
7071
7072 fprintf(f, "SHADER KEY\n");
7073
7074 switch (processor) {
7075 case PIPE_SHADER_VERTEX:
7076 si_dump_shader_key_vs(key, &key->part.vs.prolog,
7077 "part.vs.prolog", f);
7078 fprintf(f, " as_es = %u\n", key->as_es);
7079 fprintf(f, " as_ls = %u\n", key->as_ls);
7080 fprintf(f, " mono.vs_export_prim_id = %u\n",
7081 key->mono.vs_export_prim_id);
7082 break;
7083
7084 case PIPE_SHADER_TESS_CTRL:
7085 if (shader->selector->screen->b.chip_class >= GFX9) {
7086 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
7087 "part.tcs.ls_prolog", f);
7088 }
7089 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
7090 fprintf(f, " mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
7091 break;
7092
7093 case PIPE_SHADER_TESS_EVAL:
7094 fprintf(f, " as_es = %u\n", key->as_es);
7095 fprintf(f, " mono.vs_export_prim_id = %u\n",
7096 key->mono.vs_export_prim_id);
7097 break;
7098
7099 case PIPE_SHADER_GEOMETRY:
7100 if (shader->is_gs_copy_shader)
7101 break;
7102
7103 if (shader->selector->screen->b.chip_class >= GFX9 &&
7104 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
7105 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
7106 "part.gs.vs_prolog", f);
7107 }
7108 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
7109 break;
7110
7111 case PIPE_SHADER_COMPUTE:
7112 break;
7113
7114 case PIPE_SHADER_FRAGMENT:
7115 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
7116 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
7117 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
7118 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
7119 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
7120 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
7121 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
7122 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
7123 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
7124 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
7125 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
7126 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
7127 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
7128 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
7129 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
7130 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
7131 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
7132 break;
7133
7134 default:
7135 assert(0);
7136 }
7137
7138 if ((processor == PIPE_SHADER_GEOMETRY ||
7139 processor == PIPE_SHADER_TESS_EVAL ||
7140 processor == PIPE_SHADER_VERTEX) &&
7141 !key->as_es && !key->as_ls) {
7142 fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
7143 fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
7144 fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
7145 }
7146 }
7147
7148 static void si_init_shader_ctx(struct si_shader_context *ctx,
7149 struct si_screen *sscreen,
7150 LLVMTargetMachineRef tm)
7151 {
7152 struct lp_build_tgsi_context *bld_base;
7153 struct lp_build_tgsi_action tmpl = {};
7154
7155 si_llvm_context_init(ctx, sscreen, tm);
7156
7157 bld_base = &ctx->bld_base;
7158 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
7159
7160 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
7161 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
7162 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
7163
7164 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
7165 bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
7166 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
7167 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
7168 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
7169 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
7170 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
7171 bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
7172 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
7173 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
7174 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
7175 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
7176 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
7177 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
7178 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
7179 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
7180
7181 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
7182 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
7183 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
7184 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
7185 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
7186 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
7187
7188 tmpl.fetch_args = atomic_fetch_args;
7189 tmpl.emit = atomic_emit;
7190 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
7191 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
7192 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
7193 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
7194 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
7195 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
7196 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
7197 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
7198 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
7199 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
7200 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
7201 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
7202 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
7203 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
7204 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
7205 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
7206 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
7207 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
7208 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
7209 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
7210
7211 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
7212
7213 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
7214
7215 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
7216 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
7217 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
7218 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
7219
7220 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
7221 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
7222 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
7223 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
7224 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
7225 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
7226 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
7227 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
7228 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
7229
7230 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
7231 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
7232 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
7233 }
7234
7235 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
7236 {
7237 struct si_shader *shader = ctx->shader;
7238 struct tgsi_shader_info *info = &shader->selector->info;
7239
7240 if (ctx->type == PIPE_SHADER_FRAGMENT ||
7241 ctx->type == PIPE_SHADER_COMPUTE ||
7242 shader->key.as_es ||
7243 shader->key.as_ls)
7244 return;
7245
7246 ac_optimize_vs_outputs(&ctx->ac,
7247 ctx->main_fn,
7248 shader->info.vs_output_param_offset,
7249 info->num_outputs,
7250 &shader->info.nr_param_exports);
7251 }
7252
7253 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
7254 {
7255 ctx->shader->config.private_mem_vgprs = 0;
7256
7257 /* Process all LLVM instructions. */
7258 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
7259 while (bb) {
7260 LLVMValueRef next = LLVMGetFirstInstruction(bb);
7261
7262 while (next) {
7263 LLVMValueRef inst = next;
7264 next = LLVMGetNextInstruction(next);
7265
7266 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
7267 continue;
7268
7269 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
7270 /* No idea why LLVM aligns allocas to 4 elements. */
7271 unsigned alignment = LLVMGetAlignment(inst);
7272 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
7273 ctx->shader->config.private_mem_vgprs += dw_size;
7274 }
7275 bb = LLVMGetNextBasicBlock(bb);
7276 }
7277 }
7278
7279 static void si_init_exec_full_mask(struct si_shader_context *ctx)
7280 {
7281 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
7282 lp_build_intrinsic(ctx->gallivm.builder,
7283 "llvm.amdgcn.init.exec", ctx->voidt,
7284 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
7285 }
7286
7287 static void si_init_exec_from_input(struct si_shader_context *ctx,
7288 unsigned param, unsigned bitoffset)
7289 {
7290 LLVMValueRef args[] = {
7291 LLVMGetParam(ctx->main_fn, param),
7292 LLVMConstInt(ctx->i32, bitoffset, 0),
7293 };
7294 lp_build_intrinsic(ctx->gallivm.builder,
7295 "llvm.amdgcn.init.exec.from.input",
7296 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
7297 }
7298
7299 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
7300 bool is_monolithic)
7301 {
7302 struct si_shader *shader = ctx->shader;
7303 struct si_shader_selector *sel = shader->selector;
7304 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7305
7306 switch (ctx->type) {
7307 case PIPE_SHADER_VERTEX:
7308 ctx->load_input = declare_input_vs;
7309 if (shader->key.as_ls)
7310 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
7311 else if (shader->key.as_es)
7312 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7313 else
7314 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7315 break;
7316 case PIPE_SHADER_TESS_CTRL:
7317 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
7318 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
7319 bld_base->emit_store = store_output_tcs;
7320 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
7321 break;
7322 case PIPE_SHADER_TESS_EVAL:
7323 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
7324 if (shader->key.as_es)
7325 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7326 else
7327 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7328 break;
7329 case PIPE_SHADER_GEOMETRY:
7330 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
7331 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
7332 break;
7333 case PIPE_SHADER_FRAGMENT:
7334 ctx->load_input = declare_input_fs;
7335 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
7336 break;
7337 case PIPE_SHADER_COMPUTE:
7338 ctx->declare_memory_region = declare_compute_memory;
7339 break;
7340 default:
7341 assert(!"Unsupported shader type");
7342 return false;
7343 }
7344
7345 create_function(ctx);
7346 preload_ring_buffers(ctx);
7347
7348 /* For GFX9 merged shaders:
7349 * - Set EXEC. If the prolog is present, set EXEC there instead.
7350 * - Add a barrier before the second shader.
7351 *
7352 * The same thing for monolithic shaders is done in
7353 * si_build_wrapper_function.
7354 */
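	/* The merged wave info SGPR is assumed to pack the thread count of
	 * the first stage in bits [7:0] and of the second stage in bits
	 * [15:8], matching the bit offsets 0 and 8 used here and in
	 * si_build_wrapper_function.
	 */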
7355 if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
7356 if (sel->info.num_instructions > 1 && /* not empty shader */
7357 (shader->key.as_es || shader->key.as_ls) &&
7358 (ctx->type == PIPE_SHADER_TESS_EVAL ||
7359 (ctx->type == PIPE_SHADER_VERTEX &&
7360 !sel->vs_needs_prolog))) {
7361 si_init_exec_from_input(ctx,
7362 ctx->param_merged_wave_info, 0);
7363 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
7364 ctx->type == PIPE_SHADER_GEOMETRY) {
7365 si_init_exec_from_input(ctx,
7366 ctx->param_merged_wave_info, 8);
7367 si_llvm_emit_barrier(NULL, bld_base, NULL);
7368 }
7369 }
7370
7371 if (ctx->type == PIPE_SHADER_GEOMETRY) {
7372 int i;
7373 for (i = 0; i < 4; i++) {
7374 ctx->gs_next_vertex[i] =
7375 lp_build_alloca(&ctx->gallivm,
7376 ctx->i32, "");
7377 }
7378 }
7379
7380 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
7381 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
7382 return false;
7383 }
7384
7385 si_llvm_build_ret(ctx, ctx->return_value);
7386 return true;
7387 }
7388
7389 /**
7390 * Compute the VS prolog key, which contains all the information needed to
7391 * build the VS prolog function, and set shader->info bits where needed.
7392 *
7393 * \param info Shader info of the vertex shader.
7394 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
7395 * \param prolog_key Key of the VS prolog
7396 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
7397 * \param key Output shader part key.
7398 */
7399 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7400 unsigned num_input_sgprs,
7401 const struct si_vs_prolog_bits *prolog_key,
7402 struct si_shader *shader_out,
7403 union si_shader_part_key *key)
7404 {
7405 memset(key, 0, sizeof(*key));
7406 key->vs_prolog.states = *prolog_key;
7407 key->vs_prolog.num_input_sgprs = num_input_sgprs;
7408 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7409 key->vs_prolog.as_ls = shader_out->key.as_ls;
7410
7411 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
7412 key->vs_prolog.as_ls = 1;
7413 key->vs_prolog.num_merged_next_stage_vgprs = 2;
7414 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
7415 key->vs_prolog.num_merged_next_stage_vgprs = 5;
7416 }
7417
7418 	/* Set the instanceID flag: a non-zero divisor means the attribute index is derived from the instance ID. */
7419 for (unsigned i = 0; i < info->num_inputs; i++)
7420 if (key->vs_prolog.states.instance_divisors[i])
7421 shader_out->info.uses_instanceid = true;
7422 }
7423
7424 /**
7425 * Compute the PS prolog key, which contains all the information needed to
7426 * build the PS prolog function, and set related bits in shader->config.
7427 */
7428 static void si_get_ps_prolog_key(struct si_shader *shader,
7429 union si_shader_part_key *key,
7430 bool separate_prolog)
7431 {
7432 struct tgsi_shader_info *info = &shader->selector->info;
7433
7434 memset(key, 0, sizeof(*key));
7435 key->ps_prolog.states = shader->key.part.ps.prolog;
7436 key->ps_prolog.colors_read = info->colors_read;
7437 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7438 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7439 key->ps_prolog.wqm = info->uses_derivatives &&
7440 (key->ps_prolog.colors_read ||
7441 key->ps_prolog.states.force_persp_sample_interp ||
7442 key->ps_prolog.states.force_linear_sample_interp ||
7443 key->ps_prolog.states.force_persp_center_interp ||
7444 key->ps_prolog.states.force_linear_center_interp ||
7445 key->ps_prolog.states.bc_optimize_for_persp ||
7446 key->ps_prolog.states.bc_optimize_for_linear);
7447
7448 if (info->colors_read) {
7449 unsigned *color = shader->selector->color_attr_index;
7450
7451 if (shader->key.part.ps.prolog.color_two_side) {
7452 /* BCOLORs are stored after the last input. */
7453 key->ps_prolog.num_interp_inputs = info->num_inputs;
7454 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7455 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7456 }
7457
7458 for (unsigned i = 0; i < 2; i++) {
7459 unsigned interp = info->input_interpolate[color[i]];
7460 unsigned location = info->input_interpolate_loc[color[i]];
7461
7462 if (!(info->colors_read & (0xf << i*4)))
7463 continue;
7464
7465 key->ps_prolog.color_attr_index[i] = color[i];
7466
7467 if (shader->key.part.ps.prolog.flatshade_colors &&
7468 interp == TGSI_INTERPOLATE_COLOR)
7469 interp = TGSI_INTERPOLATE_CONSTANT;
7470
7471 switch (interp) {
7472 case TGSI_INTERPOLATE_CONSTANT:
7473 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7474 break;
7475 case TGSI_INTERPOLATE_PERSPECTIVE:
7476 case TGSI_INTERPOLATE_COLOR:
7477 /* Force the interpolation location for colors here. */
7478 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7479 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7480 if (shader->key.part.ps.prolog.force_persp_center_interp)
7481 location = TGSI_INTERPOLATE_LOC_CENTER;
7482
7483 switch (location) {
7484 case TGSI_INTERPOLATE_LOC_SAMPLE:
7485 key->ps_prolog.color_interp_vgpr_index[i] = 0;
7486 shader->config.spi_ps_input_ena |=
7487 S_0286CC_PERSP_SAMPLE_ENA(1);
7488 break;
7489 case TGSI_INTERPOLATE_LOC_CENTER:
7490 key->ps_prolog.color_interp_vgpr_index[i] = 2;
7491 shader->config.spi_ps_input_ena |=
7492 S_0286CC_PERSP_CENTER_ENA(1);
7493 break;
7494 case TGSI_INTERPOLATE_LOC_CENTROID:
7495 key->ps_prolog.color_interp_vgpr_index[i] = 4;
7496 shader->config.spi_ps_input_ena |=
7497 S_0286CC_PERSP_CENTROID_ENA(1);
7498 break;
7499 default:
7500 assert(0);
7501 }
7502 break;
7503 case TGSI_INTERPOLATE_LINEAR:
7504 /* Force the interpolation location for colors here. */
7505 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7506 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7507 if (shader->key.part.ps.prolog.force_linear_center_interp)
7508 location = TGSI_INTERPOLATE_LOC_CENTER;
7509
7510 /* The VGPR assignment for non-monolithic shaders
7511 * works because InitialPSInputAddr is set on the
7512 * main shader and PERSP_PULL_MODEL is never used.
7513 */
7514 switch (location) {
7515 case TGSI_INTERPOLATE_LOC_SAMPLE:
7516 key->ps_prolog.color_interp_vgpr_index[i] =
7517 separate_prolog ? 6 : 9;
7518 shader->config.spi_ps_input_ena |=
7519 S_0286CC_LINEAR_SAMPLE_ENA(1);
7520 break;
7521 case TGSI_INTERPOLATE_LOC_CENTER:
7522 key->ps_prolog.color_interp_vgpr_index[i] =
7523 separate_prolog ? 8 : 11;
7524 shader->config.spi_ps_input_ena |=
7525 S_0286CC_LINEAR_CENTER_ENA(1);
7526 break;
7527 case TGSI_INTERPOLATE_LOC_CENTROID:
7528 key->ps_prolog.color_interp_vgpr_index[i] =
7529 separate_prolog ? 10 : 13;
7530 shader->config.spi_ps_input_ena |=
7531 S_0286CC_LINEAR_CENTROID_ENA(1);
7532 break;
7533 default:
7534 assert(0);
7535 }
7536 break;
7537 default:
7538 assert(0);
7539 }
7540 }
7541 }
7542 }
7543
7544 /**
7545 * Check whether a PS prolog is required based on the key.
7546 */
7547 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7548 {
7549 return key->ps_prolog.colors_read ||
7550 key->ps_prolog.states.force_persp_sample_interp ||
7551 key->ps_prolog.states.force_linear_sample_interp ||
7552 key->ps_prolog.states.force_persp_center_interp ||
7553 key->ps_prolog.states.force_linear_center_interp ||
7554 key->ps_prolog.states.bc_optimize_for_persp ||
7555 key->ps_prolog.states.bc_optimize_for_linear ||
7556 key->ps_prolog.states.poly_stipple;
7557 }
7558
7559 /**
7560 * Compute the PS epilog key, which contains all the information needed to
7561 * build the PS epilog function.
7562 */
7563 static void si_get_ps_epilog_key(struct si_shader *shader,
7564 union si_shader_part_key *key)
7565 {
7566 struct tgsi_shader_info *info = &shader->selector->info;
7567 memset(key, 0, sizeof(*key));
7568 key->ps_epilog.colors_written = info->colors_written;
7569 key->ps_epilog.writes_z = info->writes_z;
7570 key->ps_epilog.writes_stencil = info->writes_stencil;
7571 key->ps_epilog.writes_samplemask = info->writes_samplemask;
7572 key->ps_epilog.states = shader->key.part.ps.epilog;
7573 }
7574
7575 /**
7576 * Build the GS prolog function. Rotate the input vertices for triangle strips
7577 * with adjacency.
7578 */
7579 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7580 union si_shader_part_key *key)
7581 {
7582 unsigned num_sgprs, num_vgprs;
7583 struct gallivm_state *gallivm = &ctx->gallivm;
7584 LLVMBuilderRef builder = gallivm->builder;
7585 LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
7586 LLVMTypeRef returns[48];
7587 LLVMValueRef func, ret;
7588
7589 if (ctx->screen->b.chip_class >= GFX9) {
7590 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
7591 num_vgprs = 5; /* ES inputs are not needed by GS */
7592 } else {
7593 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
7594 num_vgprs = 8;
7595 }
7596
7597 for (unsigned i = 0; i < num_sgprs; ++i) {
7598 params[i] = ctx->i32;
7599 returns[i] = ctx->i32;
7600 }
7601
7602 for (unsigned i = 0; i < num_vgprs; ++i) {
7603 params[num_sgprs + i] = ctx->i32;
7604 returns[num_sgprs + i] = ctx->f32;
7605 }
7606
7607 /* Create the function. */
7608 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7609 params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
7610 func = ctx->main_fn;
7611
7612 /* Set the full EXEC mask for the prolog, because we are only fiddling
7613 * with registers here. The main shader part will set the correct EXEC
7614 * mask.
7615 */
7616 if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
7617 si_init_exec_full_mask(ctx);
7618
7619 	/* Copy inputs to outputs. This should be a no-op, as the registers match,
7620 	 * but it prevents the compiler from overwriting them unintentionally.
7621 */
7622 ret = ctx->return_value;
7623 for (unsigned i = 0; i < num_sgprs; i++) {
7624 LLVMValueRef p = LLVMGetParam(func, i);
7625 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7626 }
7627 for (unsigned i = 0; i < num_vgprs; i++) {
7628 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7629 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7630 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7631 }
7632
7633 if (key->gs_prolog.states.tri_strip_adj_fix) {
7634 /* Remap the input vertices for every other primitive. */
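		/* The select below picks vtx_in[(i + 4) % 6], i.e. it maps
		 * (v0,v1,v2,v3,v4,v5) -> (v4,v5,v0,v1,v2,v3) whenever the
		 * primitive ID is odd (rotate = bit 0 of prim_id). */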
7635 const unsigned gfx6_vtx_params[6] = {
7636 num_sgprs,
7637 num_sgprs + 1,
7638 num_sgprs + 3,
7639 num_sgprs + 4,
7640 num_sgprs + 5,
7641 num_sgprs + 6
7642 };
7643 const unsigned gfx9_vtx_params[3] = {
7644 num_sgprs,
7645 num_sgprs + 1,
7646 num_sgprs + 4,
7647 };
7648 LLVMValueRef vtx_in[6], vtx_out[6];
7649 LLVMValueRef prim_id, rotate;
7650
7651 if (ctx->screen->b.chip_class >= GFX9) {
7652 for (unsigned i = 0; i < 3; i++) {
7653 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
7654 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
7655 }
7656 } else {
7657 for (unsigned i = 0; i < 6; i++)
7658 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
7659 }
7660
7661 prim_id = LLVMGetParam(func, num_sgprs + 2);
7662 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
7663
7664 for (unsigned i = 0; i < 6; ++i) {
7665 LLVMValueRef base, rotated;
7666 base = vtx_in[i];
7667 rotated = vtx_in[(i + 4) % 6];
7668 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
7669 }
7670
7671 if (ctx->screen->b.chip_class >= GFX9) {
7672 for (unsigned i = 0; i < 3; i++) {
7673 LLVMValueRef hi, out;
7674
7675 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
7676 LLVMConstInt(ctx->i32, 16, 0), "");
7677 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
7678 out = LLVMBuildBitCast(builder, out, ctx->f32, "");
7679 ret = LLVMBuildInsertValue(builder, ret, out,
7680 gfx9_vtx_params[i], "");
7681 }
7682 } else {
7683 for (unsigned i = 0; i < 6; i++) {
7684 LLVMValueRef out;
7685
7686 out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
7687 ret = LLVMBuildInsertValue(builder, ret, out,
7688 gfx6_vtx_params[i], "");
7689 }
7690 }
7691 }
7692
7693 LLVMBuildRet(builder, ret);
7694 }
7695
7696 /**
7697 * Given a list of shader part functions, build a wrapper function that
7698 * runs them in sequence to form a monolithic shader.
7699 */
7700 static void si_build_wrapper_function(struct si_shader_context *ctx,
7701 LLVMValueRef *parts,
7702 unsigned num_parts,
7703 unsigned main_part,
7704 unsigned next_shader_first_part)
7705 {
7706 struct gallivm_state *gallivm = &ctx->gallivm;
7707 LLVMBuilderRef builder = ctx->gallivm.builder;
7708 /* PS epilog has one arg per color component */
7709 LLVMTypeRef param_types[48];
7710 LLVMValueRef initial[48], out[48];
7711 LLVMTypeRef function_type;
7712 unsigned num_params;
7713 unsigned num_out, initial_num_out;
7714 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7715 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
7716 unsigned num_sgprs, num_vgprs;
7717 unsigned last_sgpr_param;
7718 unsigned gprs;
7719 struct lp_build_if_state if_state;
7720
7721 for (unsigned i = 0; i < num_parts; ++i) {
7722 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7723 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7724 }
7725
7726 /* The parameters of the wrapper function correspond to those of the
7727 	 * first part in terms of SGPRs and VGPRs, but the parameter types are
7728 	 * taken from the main part. This matters for the dereferenceable
7729 	 * attribute on descriptor table pointers.
7730 */
7731 num_sgprs = 0;
7732 num_vgprs = 0;
7733
7734 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7735 num_params = LLVMCountParamTypes(function_type);
7736
7737 for (unsigned i = 0; i < num_params; ++i) {
7738 LLVMValueRef param = LLVMGetParam(parts[0], i);
7739
7740 if (ac_is_sgpr_param(param)) {
7741 assert(num_vgprs == 0);
7742 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7743 } else {
7744 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7745 }
7746 }
7747 assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7748
7749 num_params = 0;
7750 last_sgpr_param = 0;
7751 gprs = 0;
7752 while (gprs < num_sgprs + num_vgprs) {
7753 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7754 unsigned size;
7755
7756 param_types[num_params] = LLVMTypeOf(param);
7757 if (gprs < num_sgprs)
7758 last_sgpr_param = num_params;
7759 size = llvm_get_type_size(param_types[num_params]) / 4;
7760 num_params++;
7761
7762 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7763 assert(gprs + size <= num_sgprs + num_vgprs &&
7764 (gprs >= num_sgprs || gprs + size <= num_sgprs));
7765
7766 gprs += size;
7767 }
7768
7769 si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
7770 last_sgpr_param,
7771 si_get_max_workgroup_size(ctx->shader));
7772
7773 if (is_merged_shader(ctx->shader))
7774 si_init_exec_full_mask(ctx);
7775
7776 /* Record the arguments of the function as if they were an output of
7777 * a previous part.
7778 */
7779 num_out = 0;
7780 num_out_sgpr = 0;
7781
7782 for (unsigned i = 0; i < num_params; ++i) {
7783 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7784 LLVMTypeRef param_type = LLVMTypeOf(param);
7785 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7786 unsigned size = llvm_get_type_size(param_type) / 4;
7787
7788 if (size == 1) {
7789 if (param_type != out_type)
7790 param = LLVMBuildBitCast(builder, param, out_type, "");
7791 out[num_out++] = param;
7792 } else {
7793 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7794
7795 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7796 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7797 param_type = ctx->i64;
7798 }
7799
7800 if (param_type != vector_type)
7801 param = LLVMBuildBitCast(builder, param, vector_type, "");
7802
7803 for (unsigned j = 0; j < size; ++j)
7804 out[num_out++] = LLVMBuildExtractElement(
7805 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7806 }
7807
7808 if (i <= last_sgpr_param)
7809 num_out_sgpr = num_out;
7810 }
7811
7812 memcpy(initial, out, sizeof(out));
7813 initial_num_out = num_out;
7814 initial_num_out_sgpr = num_out_sgpr;
7815
7816 /* Now chain the parts. */
7817 for (unsigned part = 0; part < num_parts; ++part) {
7818 LLVMValueRef in[48];
7819 LLVMValueRef ret;
7820 LLVMTypeRef ret_type;
7821 unsigned out_idx = 0;
7822
7823 num_params = LLVMCountParams(parts[part]);
7824 assert(num_params <= ARRAY_SIZE(param_types));
7825
7826 /* Merged shaders are executed conditionally depending
7827 * on the number of enabled threads passed in the input SGPRs. */
7828 if (is_merged_shader(ctx->shader) &&
7829 (part == 0 || part == next_shader_first_part)) {
7830 LLVMValueRef ena, count = initial[3];
7831
7832 /* The thread count for the 2nd shader is at bit-offset 8. */
7833 if (part == next_shader_first_part) {
7834 count = LLVMBuildLShr(builder, count,
7835 LLVMConstInt(ctx->i32, 8, 0), "");
7836 }
7837 count = LLVMBuildAnd(builder, count,
7838 LLVMConstInt(ctx->i32, 0x7f, 0), "");
7839 ena = LLVMBuildICmp(builder, LLVMIntULT,
7840 ac_get_thread_id(&ctx->ac), count, "");
7841 lp_build_if(&if_state, &ctx->gallivm, ena);
7842 }
7843
7844 /* Derive arguments for the next part from outputs of the
7845 * previous one.
7846 */
7847 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7848 LLVMValueRef param;
7849 LLVMTypeRef param_type;
7850 bool is_sgpr;
7851 unsigned param_size;
7852 LLVMValueRef arg = NULL;
7853
7854 param = LLVMGetParam(parts[part], param_idx);
7855 param_type = LLVMTypeOf(param);
7856 param_size = llvm_get_type_size(param_type) / 4;
7857 is_sgpr = ac_is_sgpr_param(param);
7858
7859 if (is_sgpr) {
7860 #if HAVE_LLVM < 0x0400
7861 LLVMRemoveAttribute(param, LLVMByValAttribute);
7862 #else
7863 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7864 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7865 #endif
7866 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7867 }
7868
7869 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7870 assert(is_sgpr || out_idx >= num_out_sgpr);
7871
7872 if (param_size == 1)
7873 arg = out[out_idx];
7874 else
7875 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7876
7877 if (LLVMTypeOf(arg) != param_type) {
7878 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7879 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7880 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7881 } else {
7882 arg = LLVMBuildBitCast(builder, arg, param_type, "");
7883 }
7884 }
7885
7886 in[param_idx] = arg;
7887 out_idx += param_size;
7888 }
7889
7890 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7891
7892 if (is_merged_shader(ctx->shader) &&
7893 (part + 1 == next_shader_first_part ||
7894 part + 1 == num_parts)) {
7895 lp_build_endif(&if_state);
7896
7897 if (part + 1 == next_shader_first_part) {
7898 /* A barrier is required between 2 merged shaders. */
7899 si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
7900
7901 /* The second half of the merged shader should use
7902 				 * the inputs from the top-level (wrapper) function,
7903 				 * not the return value from the last call.
7904 				 *
7905 				 * That's because the last call was executed
7906 				 * conditionally, so we can't consume it in the
7907 				 * main block.
7908 */
7909 memcpy(out, initial, sizeof(initial));
7910 num_out = initial_num_out;
7911 num_out_sgpr = initial_num_out_sgpr;
7912 }
7913 continue;
7914 }
7915
7916 /* Extract the returned GPRs. */
7917 ret_type = LLVMTypeOf(ret);
7918 num_out = 0;
7919 num_out_sgpr = 0;
7920
7921 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7922 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7923
7924 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7925
7926 for (unsigned i = 0; i < ret_size; ++i) {
7927 LLVMValueRef val =
7928 LLVMBuildExtractValue(builder, ret, i, "");
7929
7930 out[num_out++] = val;
7931
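					/* Parts return SGPRs as i32 and VGPRs
					 * as f32, so an i32 element marks an
					 * SGPR. */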
7932 if (LLVMTypeOf(val) == ctx->i32) {
7933 assert(num_out_sgpr + 1 == num_out);
7934 num_out_sgpr = num_out;
7935 }
7936 }
7937 }
7938 }
7939
7940 LLVMBuildRetVoid(builder);
7941 }
7942
7943 int si_compile_tgsi_shader(struct si_screen *sscreen,
7944 LLVMTargetMachineRef tm,
7945 struct si_shader *shader,
7946 bool is_monolithic,
7947 struct pipe_debug_callback *debug)
7948 {
7949 struct si_shader_selector *sel = shader->selector;
7950 struct si_shader_context ctx;
7951 int r = -1;
7952
7953 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7954 * conversion fails. */
7955 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7956 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7957 tgsi_dump(sel->tokens, 0);
7958 si_dump_streamout(&sel->so);
7959 }
7960
7961 si_init_shader_ctx(&ctx, sscreen, tm);
7962 si_llvm_context_set_tgsi(&ctx, shader);
7963 ctx.separate_prolog = !is_monolithic;
7964
7965 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7966 sizeof(shader->info.vs_output_param_offset));
7967
7968 shader->info.uses_instanceid = sel->info.uses_instanceid;
7969
7970 ctx.load_system_value = declare_system_value;
7971
7972 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
7973 si_llvm_dispose(&ctx);
7974 return -1;
7975 }
7976
7977 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7978 LLVMValueRef parts[2];
7979 bool need_prolog = sel->vs_needs_prolog;
7980
7981 parts[1] = ctx.main_fn;
7982
7983 if (need_prolog) {
7984 union si_shader_part_key prolog_key;
7985 si_get_vs_prolog_key(&sel->info,
7986 shader->info.num_input_sgprs,
7987 &shader->key.part.vs.prolog,
7988 shader, &prolog_key);
7989 si_build_vs_prolog_function(&ctx, &prolog_key);
7990 parts[0] = ctx.main_fn;
7991 }
7992
7993 si_build_wrapper_function(&ctx, parts + !need_prolog,
7994 1 + need_prolog, need_prolog, 0);
7995 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7996 if (sscreen->b.chip_class >= GFX9) {
7997 struct si_shader_selector *ls = shader->key.part.tcs.ls;
7998 LLVMValueRef parts[4];
7999
8000 /* TCS main part */
8001 parts[2] = ctx.main_fn;
8002
8003 /* TCS epilog */
8004 union si_shader_part_key tcs_epilog_key;
8005 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
8006 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8007 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
8008 parts[3] = ctx.main_fn;
8009
8010 /* VS prolog */
8011 if (ls->vs_needs_prolog) {
8012 union si_shader_part_key vs_prolog_key;
8013 si_get_vs_prolog_key(&ls->info,
8014 shader->info.num_input_sgprs,
8015 &shader->key.part.tcs.ls_prolog,
8016 shader, &vs_prolog_key);
8017 vs_prolog_key.vs_prolog.is_monolithic = true;
8018 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8019 parts[0] = ctx.main_fn;
8020 }
8021
8022 /* VS as LS main part */
8023 struct si_shader shader_ls = {};
8024 shader_ls.selector = ls;
8025 shader_ls.key.as_ls = 1;
8026 shader_ls.key.mono = shader->key.mono;
8027 shader_ls.key.opt = shader->key.opt;
8028 si_llvm_context_set_tgsi(&ctx, &shader_ls);
8029
8030 if (!si_compile_tgsi_main(&ctx, true)) {
8031 si_llvm_dispose(&ctx);
8032 return -1;
8033 }
8034 shader->info.uses_instanceid |= ls->info.uses_instanceid;
8035 parts[1] = ctx.main_fn;
8036
8037 /* Reset the shader context. */
8038 ctx.shader = shader;
8039 ctx.type = PIPE_SHADER_TESS_CTRL;
8040
8041 si_build_wrapper_function(&ctx,
8042 parts + !ls->vs_needs_prolog,
8043 4 - !ls->vs_needs_prolog, 0,
8044 ls->vs_needs_prolog ? 2 : 1);
8045 } else {
8046 LLVMValueRef parts[2];
8047 union si_shader_part_key epilog_key;
8048
8049 parts[0] = ctx.main_fn;
8050
8051 memset(&epilog_key, 0, sizeof(epilog_key));
8052 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8053 si_build_tcs_epilog_function(&ctx, &epilog_key);
8054 parts[1] = ctx.main_fn;
8055
8056 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
8057 }
8058 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
8059 if (ctx.screen->b.chip_class >= GFX9) {
8060 struct si_shader_selector *es = shader->key.part.gs.es;
8061 LLVMValueRef es_prolog = NULL;
8062 LLVMValueRef es_main = NULL;
8063 LLVMValueRef gs_prolog = NULL;
8064 LLVMValueRef gs_main = ctx.main_fn;
8065
8066 /* GS prolog */
8067 union si_shader_part_key gs_prolog_key;
8068 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
8069 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8070 gs_prolog_key.gs_prolog.is_monolithic = true;
8071 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
8072 gs_prolog = ctx.main_fn;
8073
8074 /* ES prolog */
8075 if (es->vs_needs_prolog) {
8076 union si_shader_part_key vs_prolog_key;
8077 si_get_vs_prolog_key(&es->info,
8078 shader->info.num_input_sgprs,
8079 					     &shader->key.part.gs.vs_prolog,
8080 shader, &vs_prolog_key);
8081 vs_prolog_key.vs_prolog.is_monolithic = true;
8082 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
8083 es_prolog = ctx.main_fn;
8084 }
8085
8086 /* ES main part */
8087 struct si_shader shader_es = {};
8088 shader_es.selector = es;
8089 shader_es.key.as_es = 1;
8090 shader_es.key.mono = shader->key.mono;
8091 shader_es.key.opt = shader->key.opt;
8092 si_llvm_context_set_tgsi(&ctx, &shader_es);
8093
8094 if (!si_compile_tgsi_main(&ctx, true)) {
8095 si_llvm_dispose(&ctx);
8096 return -1;
8097 }
8098 shader->info.uses_instanceid |= es->info.uses_instanceid;
8099 es_main = ctx.main_fn;
8100
8101 /* Reset the shader context. */
8102 ctx.shader = shader;
8103 ctx.type = PIPE_SHADER_GEOMETRY;
8104
8105 /* Prepare the array of shader parts. */
8106 LLVMValueRef parts[4];
8107 unsigned num_parts = 0, main_part, next_first_part;
8108
8109 if (es_prolog)
8110 parts[num_parts++] = es_prolog;
8111
8112 parts[main_part = num_parts++] = es_main;
8113 parts[next_first_part = num_parts++] = gs_prolog;
8114 parts[num_parts++] = gs_main;
8115
8116 si_build_wrapper_function(&ctx, parts, num_parts,
8117 main_part, next_first_part);
8118 } else {
8119 LLVMValueRef parts[2];
8120 union si_shader_part_key prolog_key;
8121
8122 parts[1] = ctx.main_fn;
8123
8124 memset(&prolog_key, 0, sizeof(prolog_key));
8125 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8126 si_build_gs_prolog_function(&ctx, &prolog_key);
8127 parts[0] = ctx.main_fn;
8128
8129 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
8130 }
8131 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
8132 LLVMValueRef parts[3];
8133 union si_shader_part_key prolog_key;
8134 union si_shader_part_key epilog_key;
8135 bool need_prolog;
8136
8137 si_get_ps_prolog_key(shader, &prolog_key, false);
8138 need_prolog = si_need_ps_prolog(&prolog_key);
8139
8140 parts[need_prolog ? 1 : 0] = ctx.main_fn;
8141
8142 if (need_prolog) {
8143 si_build_ps_prolog_function(&ctx, &prolog_key);
8144 parts[0] = ctx.main_fn;
8145 }
8146
8147 si_get_ps_epilog_key(shader, &epilog_key);
8148 si_build_ps_epilog_function(&ctx, &epilog_key);
8149 parts[need_prolog ? 2 : 1] = ctx.main_fn;
8150
8151 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
8152 need_prolog ? 1 : 0, 0);
8153 }
8154
8155 /* Dump LLVM IR before any optimization passes */
8156 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
8157 r600_can_dump_shader(&sscreen->b, ctx.type))
8158 LLVMDumpModule(ctx.gallivm.module);
8159
8160 si_llvm_finalize_module(&ctx,
8161 r600_extra_shader_checks(&sscreen->b, ctx.type));
8162
8163 /* Post-optimization transformations and analysis. */
8164 si_eliminate_const_vs_outputs(&ctx);
8165
8166 if ((debug && debug->debug_message) ||
8167 r600_can_dump_shader(&sscreen->b, ctx.type))
8168 si_count_scratch_private_memory(&ctx);
8169
8170 /* Compile to bytecode. */
8171 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
8172 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
8173 si_llvm_dispose(&ctx);
8174 if (r) {
8175 fprintf(stderr, "LLVM failed to compile shader\n");
8176 return r;
8177 }
8178
8179 	/* Validate SGPR and VGPR usage for compute shaders to detect compiler
8180 	 * bugs that make them exceed the hardware limits (seen with LLVM 3.9svn).
8181 	 */
8182 if (sel->type == PIPE_SHADER_COMPUTE) {
8183 unsigned wave_size = 64;
8184 unsigned max_vgprs = 256;
8185 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
8186 unsigned max_sgprs_per_wave = 128;
8187 unsigned max_block_threads = si_get_max_workgroup_size(shader);
8188 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
8189 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
8190
8191 max_vgprs = max_vgprs / min_waves_per_simd;
8192 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
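		/* Example (illustrative): a 1024-thread block needs 16 waves
		 * per CU (4 per SIMD), leaving 256 / 4 = 64 VGPRs and
		 * MIN2(800 / 4, 128) = 128 SGPRs per wave on VI. */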

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = 0;
		shader->info.face_vgpr_index = -1;

		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
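		/* E.g. an input address with only PERSP_CENTER and
		 * POS_FIXED_PT set yields 2 + 1 = 3 input VGPRs.
		 */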
	}

	return 0;
}

/**
 * Create, compile and return a shader part (prolog or epilog).
 *
 * \param sscreen	screen
 * \param list		list of shader parts of the same category
 * \param type		shader type
 * \param prolog	whether the part being requested is a prolog
 * \param key		shader part key
 * \param tm		LLVM target machine
 * \param debug		debug callback
 * \param build		the callback responsible for building the main function
 * \param name		readable name used for dumps and debug output
 * \return		non-NULL on success
 */
static struct si_shader_part *
si_get_shader_part(struct si_screen *sscreen,
		   struct si_shader_part **list,
		   enum pipe_shader_type type,
		   bool prolog,
		   union si_shader_part_key *key,
		   LLVMTargetMachineRef tm,
		   struct pipe_debug_callback *debug,
		   void (*build)(struct si_shader_context *,
				 union si_shader_part_key *),
		   const char *name)
{
	struct si_shader_part *result;

	mtx_lock(&sscreen->shader_parts_mutex);

	/* Find existing. */
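	/* Keys are assumed to be fully zero-initialized by callers (see the
	 * memsets at the call sites), so memcmp is a valid equality test
	 * despite any union padding.
	 */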
	for (result = *list; result; result = result->next) {
		if (memcmp(&result->key, key, sizeof(*key)) == 0) {
			mtx_unlock(&sscreen->shader_parts_mutex);
			return result;
		}
	}

	/* Compile a new one. */
	result = CALLOC_STRUCT(si_shader_part);
	result->key = *key;

	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.gallivm;

	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = &shader;
	ctx.type = type;

	switch (type) {
	case PIPE_SHADER_VERTEX:
		break;
	case PIPE_SHADER_TESS_CTRL:
		assert(!prolog);
		shader.key.part.tcs.epilog = key->tcs_epilog.states;
		break;
	case PIPE_SHADER_GEOMETRY:
		assert(prolog);
		break;
	case PIPE_SHADER_FRAGMENT:
		if (prolog)
			shader.key.part.ps.prolog = key->ps_prolog.states;
		else
			shader.key.part.ps.epilog = key->ps_epilog.states;
		break;
	default:
		unreachable("bad shader part");
	}

	build(&ctx, key);

	/* Compile. */
	si_llvm_finalize_module(&ctx,
				r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));

	if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
			    gallivm->module, debug, ctx.type, name)) {
		FREE(result);
		result = NULL;
		goto out;
	}

	result->next = *list;
	*list = result;

out:
	si_llvm_dispose(&ctx);
	mtx_unlock(&sscreen->shader_parts_mutex);
	return result;
}

/**
 * Build the vertex shader prolog function.
 *
 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
 * All inputs are returned unmodified. The vertex load indices, which the
 * API VS uses to fetch its inputs, are stored after them.
 *
 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
 *   input_v0,
 *   input_v1,
 *   input_v2,
 *   input_v3,
 *   (VertexID + BaseVertex),
 *   (InstanceID + StartInstance),
 *   (InstanceID / 2 + StartInstance)
 */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMTypeRef *params, *returns;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i;
	unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
				 key->vs_prolog.num_merged_next_stage_vgprs;
	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
				      num_input_vgprs;
	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
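	/* For merged shaders (GFX9), the 8 system SGPRs that precede the
	 * user SGPRs shift the user-SGPR base.
	 */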

	ctx->param_vertex_id = first_vs_vgpr;
	ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
			 sizeof(LLVMTypeRef));
	num_params = 0;
	num_returns = 0;

	/* Declare input and output SGPRs. */
	num_params = 0;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		params[num_params++] = ctx->i32;
		returns[num_returns++] = ctx->i32;
	}
	last_sgpr = num_params - 1;

	/* Preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < num_input_vgprs; i++) {
		params[num_params++] = ctx->i32;
		returns[num_returns++] = ctx->f32;
	}

	/* Vertex load indices. */
	for (i = 0; i <= key->vs_prolog.last_input; i++)
		returns[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "vs_prolog", returns, num_returns, params,
			   num_params, last_sgpr, 0);
	func = ctx->main_fn;

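	/* When this prolog is a separate part of a merged (GFX9) shader and
	 * EXEC has not been set up yet, initialize it from bits of input
	 * SGPR 3 (presumably the merged wave info).
	 */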
	if (key->vs_prolog.num_merged_next_stage_vgprs &&
	    !key->vs_prolog.is_monolithic)
		si_init_exec_from_input(ctx, 3, 0);

	/* Copy inputs to outputs. This should be a no-op, as the registers
	 * match, but it will prevent the compiler from overwriting them
	 * unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}
	for (; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		unsigned divisor = key->vs_prolog.states.instance_divisors[i];
		LLVMValueRef index;

		if (divisor) {
			/* InstanceID / Divisor + StartInstance */
			index = get_instance_index_for_fetch(ctx,
							     user_sgpr_base +
							     SI_SGPR_START_INSTANCE,
							     divisor);
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(gallivm->builder,
					     LLVMGetParam(func, ctx->param_vertex_id),
					     LLVMGetParam(func, user_sgpr_base +
							  SI_SGPR_BASE_VERTEX), "");
		}

		index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
					   num_params++, "");
	}

	si_llvm_build_ret(ctx, ret);
}

static bool si_get_vs_prolog(struct si_screen *sscreen,
			     LLVMTargetMachineRef tm,
			     struct si_shader *shader,
			     struct pipe_debug_callback *debug,
			     struct si_shader *main_part,
			     const struct si_vs_prolog_bits *key)
{
	struct si_shader_selector *vs = main_part->selector;

	/* The prolog is a no-op if there are no inputs. */
	if (!vs->vs_needs_prolog)
		return true;

	/* Get the prolog. */
	union si_shader_part_key prolog_key;
	si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
			     key, shader, &prolog_key);

	shader->prolog =
		si_get_shader_part(sscreen, &sscreen->vs_prologs,
				   PIPE_SHADER_VERTEX, true, &prolog_key, tm,
				   debug, si_build_vs_prolog_function,
				   "Vertex Shader Prolog");
	return shader->prolog != NULL;
}

/**
 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
 */
static bool si_shader_select_vs_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
				&shader->key.part.vs.prolog);
}

/**
 * Compile the TCS epilog function. This writes tessellation factors to memory
 * based on the output primitive type of the tessellator (determined by TES).
 */
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	LLVMTypeRef params[32];
	LLVMValueRef func;
	int last_sgpr, num_params = 0;

	if (ctx->screen->b.chip_class >= GFX9) {
		params[num_params++] = ctx->i64;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* wave info */
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
	} else {
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_addr_base64k = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
	}
	last_sgpr = num_params - 1;

	params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx->i32; /* invocation ID within the patch */
	params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr,
			   ctx->screen->b.chip_class >= CIK ? 128 : 64);
	declare_lds_as_pointer(ctx);
	func = ctx->main_fn;

	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	LLVMBuildRetVoid(gallivm->builder);
}

/**
 * Select and compile (or reuse) TCS parts (epilog).
 */
static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
				       LLVMTargetMachineRef tm,
				       struct si_shader *shader,
				       struct pipe_debug_callback *debug)
{
	if (sscreen->b.chip_class >= GFX9) {
		struct si_shader *ls_main_part =
			shader->key.part.tcs.ls->main_shader_part_ls;

		if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
				      &shader->key.part.tcs.ls_prolog))
			return false;

		shader->previous_stage = ls_main_part;
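		/* On GFX9, LS and HS run as one merged wave; the LS main
		 * part recorded here executes before the TCS main part.
		 */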
	}

	/* Get the epilog. */
	union si_shader_part_key epilog_key;
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;

	shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
					    PIPE_SHADER_TESS_CTRL, false,
					    &epilog_key, tm, debug,
					    si_build_tcs_epilog_function,
					    "Tessellation Control Shader Epilog");
	return shader->epilog != NULL;
}

/**
 * Select and compile (or reuse) GS parts (prolog).
 */
static bool si_shader_select_gs_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	if (sscreen->b.chip_class >= GFX9) {
		struct si_shader *es_main_part =
			shader->key.part.gs.es->main_shader_part_es;

		if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
		    !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
				      &shader->key.part.gs.vs_prolog))
			return false;

		shader->previous_stage = es_main_part;
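		/* Likewise, on GFX9 the ES main part is merged in front of
		 * the GS main part and executes first.
		 */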
	}

	if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
		return true;

	union si_shader_part_key prolog_key;
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.gs_prolog.states = shader->key.part.gs.prolog;

	shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
					     PIPE_SHADER_GEOMETRY, true,
					     &prolog_key, tm, debug,
					     si_build_gs_prolog_function,
					     "Geometry Shader Prolog");
	return shader->prolog2 != NULL;
}

/**
 * Build the pixel shader prolog function. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they
 * are overridden by other states (e.g. per-sample interpolation).
 * Interpolated colors are stored after the preloaded VGPRs.
 */
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;

	assert(si_need_ps_prolog(key));

	/* Number of inputs + 8 color elements. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx->i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx->f32;

	/* Declare outputs (same as inputs + add colors if needed) */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_prolog", params, num_returns, params,
			   num_params, last_sgpr, 0);
	func = ctx->main_fn;

	/* Copy inputs to outputs. This should be a no-op, as the registers
	 * match, but it will prevent the compiler from overwriting them
	 * unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], list;

		/* Get the pointer to rw buffers. */
		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
		list = lp_build_gather_values(gallivm, ptr, 2);
		list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
		list = LLVMBuildIntToPtr(gallivm->builder, list,
					 const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
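		/* The two input SGPRs hold the low/high halves of the 64-bit
		 * address of the RW-buffer descriptor list; the gather and
		 * casts above rebuild a pointer to the v4i32 array from them.
		 */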

		si_llvm_emit_polygon_stipple(ctx, list, pos);
	}

	if (key->ps_prolog.states.bc_optimize_for_persp ||
	    key->ps_prolog.states.bc_optimize_for_linear) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;

		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 *
		 * PRIM_MASK is after user SGPRs.
		 */
		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
					     ctx->i1, "");
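		/* bc_optimize is now an i1 that is true when the hw skipped
		 * the CENTROID computation; the selects below then substitute
		 * CENTER for CENTROID.
		 */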

		if (key->ps_prolog.states.bc_optimize_for_persp) {
			/* Read PERSP_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 2 + i);
			/* Read PERSP_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 4 + i);
			/* Select PERSP_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 4 + i, "");
			}
		}
		if (key->ps_prolog.states.bc_optimize_for_linear) {
			/* Read LINEAR_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 8 + i);
			/* Read LINEAR_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 10 + i);
			/* Select LINEAR_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 10 + i, "");
			}
		}
	}

	/* Force per-sample interpolation. */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Force center interpolation. */
	if (key->ps_prolog.states.force_persp_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_center[2];

		/* Read PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			persp_center[i] = LLVMGetParam(func, base + 2 + i);
		/* Overwrite PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_center[2];

		/* Read LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			linear_center[i] = LLVMGetParam(func, base + 8 + i);
		/* Overwrite LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 6 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 10 + i, "");
	}

	/* Interpolate colors. */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Get the (i,j) updated by bc_optimize handling. */
			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr, "");
			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr + 1, "");
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
		}

		interp_fs_input(ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	si_llvm_build_ret(ctx, ret);
}

/**
 * Build the pixel shader epilog function. This handles everything that must
 * be emulated for pixel shader exports (alpha test, format conversions, etc.).
 */
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params = 0, i;
	struct si_ps_exports exp = {};

	/* Declare input SGPRs. */
	params[ctx->param_rw_buffers = num_params++] = ctx->i64;
	params[ctx->param_const_buffers = num_params++] = ctx->i64;
	params[ctx->param_samplers = num_params++] = ctx->i64;
	params[ctx->param_images = num_params++] = ctx->i64;
	params[ctx->param_shader_buffers = num_params++] = ctx->i64;
	assert(num_params == SI_PARAM_ALPHA_REF);
	params[SI_PARAM_ALPHA_REF] = ctx->f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
			   last_sgpr, 0);
	/* Disable elimination of unused inputs. */
	si_llvm_add_attribute(ctx->main_fn,
			      "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}
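	/* Illustrative: colors_written = 0x5 with non-zero spi_format
	 * nibbles for both MRT0 and MRT2 gives last_color_export = 2, so
	 * the MRT2 export is marked as the final one.
	 */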

	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
}

/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, tm, debug,
					   si_build_ps_prolog_function,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, tm, debug,
				   si_build_ps_epilog_function,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed. */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POS_W_FLOAT requires that one of the perspective weights be enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}

void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
				      unsigned *lds_size)
{
	/* SPI barrier management bug:
	 * Make sure we have at least 4k of LDS in use to avoid the bug.
	 * It applies to workgroup sizes of more than one wavefront.
	 */
	if (sscreen->b.family == CHIP_BONAIRE ||
	    sscreen->b.family == CHIP_KABINI ||
	    sscreen->b.family == CHIP_MULLINS)
		*lds_size = MAX2(*lds_size, 8);
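	/* With the 512-byte allocation granularity implied by the 4k
	 * requirement above, 8 units correspond to 4 KiB.
	 */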
}

static void si_fix_resource_usage(struct si_screen *sscreen,
				  struct si_shader *shader)
{
	unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */

	shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);

	if (shader->selector->type == PIPE_SHADER_COMPUTE &&
	    si_get_max_workgroup_size(shader) > 64) {
		si_multiwave_lds_size_workaround(sscreen,
						 &shader->config.lds_size);
	}
}

int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */
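		/* E.g. a non-monolithic PS is prolog + main + epilog, and on
		 * GFX9 a TCS additionally links the LS main part in front of
		 * it via shader->previous_stage.
		 */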

		/* Copy the compiled TGSI shader data over. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}

void si_shader_destroy(struct si_shader *shader)
{
	if (shader->scratch_bo)
		r600_resource_reference(&shader->scratch_bo, NULL);

	r600_resource_reference(&shader->bo, NULL);

	if (!shader->is_binary_shared)
		radeon_shader_binary_clean(&shader->binary);

	free(shader->shader_log);
}