radeonsi/gfx9: load GS inputs from LDS
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Tom Stellard <thomas.stellard@amd.com>
 *      Michel Dänzer <michel.daenzer@amd.com>
 *      Christian König <christian.koenig@amd.com>
 */

#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_misc.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "si_shader_internal.h"
#include "si_pipe.h"
#include "sid.h"


static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";

struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};

static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
			       FILE *f);

static unsigned llvm_get_type_size(LLVMTypeRef type);

static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_vs_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};

static bool is_merged_shader(struct si_shader *shader)
{
	if (shader->selector->screen->b.chip_class <= VI)
		return false;

	return shader->key.as_ls ||
	       shader->key.as_es ||
	       shader->selector->type == PIPE_SHADER_TESS_CTRL ||
	       shader->selector->type == PIPE_SHADER_GEOMETRY;
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-4)
			return 4 + index;

		assert(!"invalid generic index");
		return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		assert(!"invalid semantic name");
		return 0;
	}
}
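
/* Worked example of the mapping above (assumed shader, not from the
 * original code): a VS writing POSITION, PSIZE and GENERIC[2] gets the
 * unique indices 0, 1 and 4 + 2 = 6, so its 64-bit mask of written
 * outputs would be
 *
 *   (1ull << 0) | (1ull << 1) | (1ull << 6) = 0x43
 *
 * which is why generic indices are capped at 63 - 4.
 */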

unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
{
	switch (name) {
	case TGSI_SEMANTIC_FOG:
		return 0;
	case TGSI_SEMANTIC_LAYER:
		return 1;
	case TGSI_SEMANTIC_VIEWPORT_INDEX:
		return 2;
	case TGSI_SEMANTIC_PRIMID:
		return 3;
	case TGSI_SEMANTIC_COLOR: /* these alias */
	case TGSI_SEMANTIC_BCOLOR:
		return 4 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 6 + index;
	default:
		assert(!"invalid semantic name");
		return 0;
	}
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = LLVMGetParam(ctx->main_fn,
					  param);

	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
		value = bitcast(&ctx->bld_base,
				TGSI_TYPE_UNSIGNED, value);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      LLVMConstInt(ctx->i32, rshift, 0), "");

	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     LLVMConstInt(ctx->i32, mask, 0), "");
	}

	return value;
}
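
/* Minimal sketch of what unpack_param() emits (illustrative, not part
 * of the original code): it is the standard bitfield extraction
 *
 *   (value >> rshift) & ((1u << bitwidth) - 1)
 *
 * built as IR. E.g. get_rel_patch_id() below calls
 * unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8), which in plain C
 * would read bits [7:0] of the input register:
 *
 *   unsigned rel_patch_id = tcs_rel_ids & 0xff;
 */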

static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
{
	switch (ctx->type) {
	case PIPE_SHADER_TESS_CTRL:
		return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);

	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_rel_patch_id);

	default:
		assert(0);
		return NULL;
	}
}

/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS inputs
 * TCS outputs = TES inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0		= get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0	= get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2		= get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2	= get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
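
/* Illustrative example of the layout above (assumed numbers, not from
 * the original code): with an input patch stride of 32 dwords and
 * RelPatchID == 2, the current patch's TCS inputs start at
 *
 *   get_tcs_in_current_patch_offset = 2 * 32 = 64 dwords
 *
 * and the helpers below derive the corresponding output offsets from
 * the strides and offsets packed into the layout user SGPRs.
 */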

static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}

static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}

static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}

static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}

static LLVMValueRef
get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
}

static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}

static LLVMValueRef get_instance_index_for_fetch(
	struct si_shader_context *ctx,
	unsigned param_start_instance, unsigned divisor)
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	LLVMValueRef result = LLVMGetParam(ctx->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				       LLVMConstInt(ctx->i32, divisor, 0), "");

	return LLVMBuildAdd(gallivm->builder, result,
			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
}
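
/* Worked example for the ordering constraint above (assumed numbers,
 * not from the original code): with instance_id = 7, divisor = 4 and
 * START_INSTANCE = 10, the correct index is 7 / 4 + 10 = 11, whereas
 * dividing after the add would yield (7 + 10) / 4 = 4.
 */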

/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
 * to float. */
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
					    LLVMValueRef vec4,
					    unsigned double_index)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
					      LLVMVectorType(f64, 2), "");
	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
	return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
}

static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000;
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}

static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
				     unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	if (swizzle > 0)
		return ctx->i32_0;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tcs_patch_id);
	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_tes_patch_id);
	case PIPE_SHADER_GEOMETRY:
		return LLVMGetParam(ctx->main_fn,
				    ctx->param_gs_prim_id);
	default:
		assert(0);
		return ctx->i32_0;
	}
}

/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 */
static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
				       const struct tgsi_ind_register *ind,
				       int rel_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef result;

	result = ctx->addrs[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	result = LLVMBuildAdd(gallivm->builder, result,
			      LLVMConstInt(ctx->i32, rel_index, 0), "");
	return result;
}

/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);

	/* LLVM 3.8: If indirect resource indexing is used:
	 * - SI & CIK hang
	 * - VI crashes
	 */
	if (HAVE_LLVM == 0x0308)
		return LLVMGetUndef(ctx->i32);

	return si_llvm_bound_index(ctx, result, num);
}


/**
 * Calculate a dword address given an input or output register and a stride.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				    LLVMBuildMul(gallivm->builder, ind_index,
						 LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}

/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
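
/* Worked example of the address math above (assumed numbers, not from
 * the original code): for a per-vertex attribute with
 * vertices_per_patch = 3, num_patches = 16, rel_patch_id = 2,
 * vertex_index = 1 and param_index = 5:
 *
 *   base = rel_patch_id * vertices_per_patch + vertex_index = 7
 *   addr = (base + param_index * total_vertices) * 16
 *        = (7 + 5 * 48) * 16 = 3952 bytes
 *
 * i.e. the buffer is attribute-major with one 16-byte vec4 per vertex,
 * matching the layout comment above.
 */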

static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
				       struct si_shader_context *ctx,
				       const struct tgsi_full_dst_register *dst,
				       const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}

static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool readonly_memory)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, readonly_memory);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, readonly_memory);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}

/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
/**
 * Store to LDS.
 *
 * \param dw_offset_imm	offset in dwords (typically 0..3)
 * \param dw_addr	address in dwords
 * \param value		value to store
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	ac_build_indexed_store(&ctx->ac, ctx->lds,
			       dw_addr, value);
}
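
/* Minimal usage sketch (assumed values, not from the original code):
 * a vec4 output stored at dword address dw_addr puts channel c at
 * dw_addr + c, and lds_load() reads it back the same way:
 *
 *   lds_store(bld_base, 2, dw_addr, value_z);  // writes dword dw_addr + 2
 *   z = lds_load(bld_base, TGSI_TYPE_FLOAT, 2, dw_addr);
 *
 * 64-bit types simply occupy two consecutive dwords.
 */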

static LLVMValueRef fetch_input_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	dw_addr = get_tcs_in_current_patch_offset(ctx);
	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_output_tcs(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_src_register *reg,
		enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}

static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rw_buffers, buffer, base, addr;

	rw_buffers = LLVMGetParam(ctx->main_fn,
				  ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}

static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	rw_buffers = LLVMGetParam(ctx->main_fn,
				  ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}

static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
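
/* Illustrative sketch of the GFX9 LDS addressing above (assumed
 * values, not from the original code): the six per-vertex ES offsets
 * arrive packed two to a 32-bit parameter, 16 bits each, so for
 * vertex index 3 the code unpacks bits [31:16] of gs_vtx23_offset.
 * The dword address of one input channel is then effectively
 *
 *   dw_addr = vtx_offset + param * 4 + swizzle;
 *
 * (lds_load() adds the swizzle), which replaces the GFX6 ESGS ring
 * buffer load below it.
 */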

static int lookup_interp_param_index(unsigned interpolate, unsigned location)
{
	switch (interpolate) {
	case TGSI_INTERPOLATE_CONSTANT:
		return 0;

	case TGSI_INTERPOLATE_LINEAR:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_LINEAR_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_LINEAR_CENTROID;
		else
			return SI_PARAM_LINEAR_CENTER;
		break;
	case TGSI_INTERPOLATE_COLOR:
	case TGSI_INTERPOLATE_PERSPECTIVE:
		if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
			return SI_PARAM_PERSP_SAMPLE;
		else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
			return SI_PARAM_PERSP_CENTROID;
		else
			return SI_PARAM_PERSP_CENTER;
		break;
	default:
		fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
		return -1;
	}
}

/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j)
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef attr_number;
	LLVMValueRef i, j;

	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	attr_number = LLVMConstInt(ctx->i32, input_index, 0);

	if (interp) {
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
			LLVMValueRef front, back;

			if (interp) {
				front = ac_build_fs_interp(&ctx->ac, llvm_chan,
							   attr_number, prim_mask,
							   i, j);
				back = ac_build_fs_interp(&ctx->ac, llvm_chan,
							  back_attr_number, prim_mask,
							  i, j);
			} else {
				front = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
				back = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, back_attr_number, prim_mask);
			}

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		if (interp) {
			result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
						       attr_number, prim_mask, i, j);
		} else {
			result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
				LLVMConstInt(ctx->i32, 2, 0), /* P0 */
				attr_number, prim_mask);
		}
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);

			if (interp) {
				result[chan] = ac_build_fs_interp(&ctx->ac,
					llvm_chan, attr_number, prim_mask, i, j);
			} else {
				result[chan] = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
			}
		}
	}
}

static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &ctx->bld_base.base;
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = ctx->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.part.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}

static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}


/**
 * Load a dword from a constant buffer.
 */
static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
				      LLVMValueRef resource,
				      LLVMValueRef offset)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef args[2] = {resource, offset};

	return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
				  LP_FUNC_ATTR_READNONE |
				  LP_FUNC_ATTR_LEGACY);
}

static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
{
	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
	LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(ctx, resource, offset0),
		buffer_load_const(ctx, resource, offset1),
		LLVMConstReal(ctx->f32, 0),
		LLVMConstReal(ctx->f32, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}
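
/* Worked example (assumed numbers, not from the original code): for
 * sample_id = 2, offset0 = 2 * 8 = 16 and offset1 = 20 bytes, i.e. the
 * x and y floats of the third 8-byte samplepos entry in the constant
 * buffer.
 */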
1404
1405 static void declare_system_value(struct si_shader_context *ctx,
1406 unsigned index,
1407 const struct tgsi_full_declaration *decl)
1408 {
1409 struct lp_build_context *bld = &ctx->bld_base.base;
1410 struct gallivm_state *gallivm = &ctx->gallivm;
1411 LLVMValueRef value = 0;
1412
1413 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
1414
1415 switch (decl->Semantic.Name) {
1416 case TGSI_SEMANTIC_INSTANCEID:
1417 value = LLVMGetParam(ctx->main_fn,
1418 ctx->param_instance_id);
1419 break;
1420
1421 case TGSI_SEMANTIC_VERTEXID:
1422 value = LLVMBuildAdd(gallivm->builder,
1423 LLVMGetParam(ctx->main_fn,
1424 ctx->param_vertex_id),
1425 LLVMGetParam(ctx->main_fn,
1426 ctx->param_base_vertex), "");
1427 break;
1428
1429 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1430 /* Unused. Clarify the meaning in indexed vs. non-indexed
1431 * draws if this is ever used again. */
1432 assert(false);
1433 break;
1434
1435 case TGSI_SEMANTIC_BASEVERTEX:
1436 {
1437 /* For non-indexed draws, the base vertex set by the driver
1438 * (for direct draws) or the CP (for indirect draws) is the
1439 * first vertex ID, but GLSL expects 0 to be returned.
1440 */
1441 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
1442 LLVMValueRef indexed;
1443
1444 indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
1445 indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");
1446
1447 value = LLVMBuildSelect(gallivm->builder, indexed,
1448 LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
1449 ctx->i32_0, "");
1450 break;
1451 }
1452
1453 case TGSI_SEMANTIC_BASEINSTANCE:
1454 value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
1455 break;
1456
1457 case TGSI_SEMANTIC_DRAWID:
1458 value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
1459 break;
1460
1461 case TGSI_SEMANTIC_INVOCATIONID:
1462 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1463 value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
1464 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1465 value = LLVMGetParam(ctx->main_fn,
1466 ctx->param_gs_instance_id);
1467 else
1468 assert(!"INVOCATIONID not implemented");
1469 break;
1470
1471 case TGSI_SEMANTIC_POSITION:
1472 {
1473 LLVMValueRef pos[4] = {
1474 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1475 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1476 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
1477 lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
1478 LLVMGetParam(ctx->main_fn,
1479 SI_PARAM_POS_W_FLOAT)),
1480 };
1481 value = lp_build_gather_values(gallivm, pos, 4);
1482 break;
1483 }
1484
1485 case TGSI_SEMANTIC_FACE:
1486 value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
1487 break;
1488
1489 case TGSI_SEMANTIC_SAMPLEID:
1490 value = get_sample_id(ctx);
1491 break;
1492
1493 case TGSI_SEMANTIC_SAMPLEPOS: {
1494 LLVMValueRef pos[4] = {
1495 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
1496 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
1497 LLVMConstReal(ctx->f32, 0),
1498 LLVMConstReal(ctx->f32, 0)
1499 };
1500 pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
1501 TGSI_OPCODE_FRC, pos[0]);
1502 pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
1503 TGSI_OPCODE_FRC, pos[1]);
1504 value = lp_build_gather_values(gallivm, pos, 4);
1505 break;
1506 }
1507
1508 case TGSI_SEMANTIC_SAMPLEMASK:
1509 /* This can only occur with the OpenGL Core profile, which
1510 * doesn't support smoothing.
1511 */
1512 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1513 break;
1514
1515 case TGSI_SEMANTIC_TESSCOORD:
1516 {
1517 LLVMValueRef coord[4] = {
1518 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
1519 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
1520 bld->zero,
1521 bld->zero
1522 };
1523
1524 /* For triangles, the vector should be (u, v, 1-u-v). */
1525 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1526 PIPE_PRIM_TRIANGLES)
1527 coord[2] = lp_build_sub(bld, bld->one,
1528 lp_build_add(bld, coord[0], coord[1]));
1529
1530 value = lp_build_gather_values(gallivm, coord, 4);
1531 break;
1532 }
1533
1534 case TGSI_SEMANTIC_VERTICESIN:
1535 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1536 value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
1537 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1538 value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 7);
1539 else
1540 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1541 break;
1542
1543 case TGSI_SEMANTIC_TESSINNER:
1544 case TGSI_SEMANTIC_TESSOUTER:
1545 {
1546 LLVMValueRef rw_buffers, buffer, base, addr;
1547 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1548
1549 rw_buffers = LLVMGetParam(ctx->main_fn,
1550 ctx->param_rw_buffers);
1551 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
1552 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
1553
1554 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1555 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
1556 LLVMConstInt(ctx->i32, param, 0));
1557
1558 value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
1559 ~0, buffer, base, addr, true);
1560
1561 break;
1562 }
1563
1564 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1565 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1566 {
1567 LLVMValueRef buf, slot, val[4];
1568 int i, offset;
1569
1570 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
1571 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1572 buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
1573 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1574
1575 for (i = 0; i < 4; i++)
1576 val[i] = buffer_load_const(ctx, buf,
1577 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
1578 value = lp_build_gather_values(gallivm, val, 4);
1579 break;
1580 }
1581
1582 case TGSI_SEMANTIC_PRIMID:
1583 value = get_primitive_id(&ctx->bld_base, 0);
1584 break;
1585
1586 case TGSI_SEMANTIC_GRID_SIZE:
1587 value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
1588 break;
1589
1590 case TGSI_SEMANTIC_BLOCK_SIZE:
1591 {
1592 LLVMValueRef values[3];
1593 unsigned i;
1594 unsigned *properties = ctx->shader->selector->info.properties;
1595
1596 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1597 unsigned sizes[3] = {
1598 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1599 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1600 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1601 };
1602
1603 for (i = 0; i < 3; ++i)
1604 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1605
1606 value = lp_build_gather_values(gallivm, values, 3);
1607 } else {
1608 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
1609 }
1610 break;
1611 }
1612
1613 case TGSI_SEMANTIC_BLOCK_ID:
1614 value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
1615 break;
1616
1617 case TGSI_SEMANTIC_THREAD_ID:
1618 value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
1619 break;
1620
1621 case TGSI_SEMANTIC_HELPER_INVOCATION:
1622 if (HAVE_LLVM >= 0x0309) {
1623 value = lp_build_intrinsic(gallivm->builder,
1624 "llvm.amdgcn.ps.live",
1625 ctx->i1, NULL, 0,
1626 LP_FUNC_ATTR_READNONE);
1627 value = LLVMBuildNot(gallivm->builder, value, "");
1628 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1629 } else {
1630 assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
1631 return;
1632 }
1633 break;
1634
1635 case TGSI_SEMANTIC_SUBGROUP_SIZE:
1636 value = LLVMConstInt(ctx->i32, 64, 0);
1637 break;
1638
1639 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
1640 value = ac_get_thread_id(&ctx->ac);
1641 break;
1642
1643 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
1644 {
1645 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1646 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1647 value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
1648 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1649 break;
1650 }
1651
1652 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
1653 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
1654 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
1655 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
1656 {
1657 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
1658 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
1659 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
1660 /* All bits set except LSB */
1661 value = LLVMConstInt(ctx->i64, -2, 0);
1662 } else {
1663 /* All bits set */
1664 value = LLVMConstInt(ctx->i64, -1, 0);
1665 }
1666 id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
1667 value = LLVMBuildShl(gallivm->builder, value, id, "");
1668 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
1669 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
1670 value = LLVMBuildNot(gallivm->builder, value, "");
1671 value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
1672 break;
1673 }
1674
1675 default:
1676 assert(!"unknown system value");
1677 return;
1678 }
1679
1680 ctx->system_values[index] = value;
1681 }
1682
1683 static void declare_compute_memory(struct si_shader_context *ctx,
1684 const struct tgsi_full_declaration *decl)
1685 {
1686 struct si_shader_selector *sel = ctx->shader->selector;
1687 struct gallivm_state *gallivm = &ctx->gallivm;
1688
1689 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1690 LLVMValueRef var;
1691
1692 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1693 assert(decl->Range.First == decl->Range.Last);
1694 assert(!ctx->shared_memory);
1695
1696 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1697 LLVMArrayType(ctx->i8, sel->local_size),
1698 "compute_lds",
1699 LOCAL_ADDR_SPACE);
1700 LLVMSetAlignment(var, 4);
1701
1702 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1703 }
1704
1705 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1706 {
1707 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1708 ctx->param_const_buffers);
1709
1710 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1711 LLVMConstInt(ctx->i32, i, 0));
1712 }
1713
1714 static LLVMValueRef fetch_constant(
1715 struct lp_build_tgsi_context *bld_base,
1716 const struct tgsi_full_src_register *reg,
1717 enum tgsi_opcode_type type,
1718 unsigned swizzle)
1719 {
1720 struct si_shader_context *ctx = si_shader_context(bld_base);
1721 struct lp_build_context *base = &bld_base->base;
1722 const struct tgsi_ind_register *ireg = &reg->Indirect;
1723 unsigned buf, idx;
1724
1725 LLVMValueRef addr, bufp;
1726 LLVMValueRef result;
1727
1728 if (swizzle == LP_CHAN_ALL) {
1729 unsigned chan;
1730 LLVMValueRef values[4];
1731 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1732 values[chan] = fetch_constant(bld_base, reg, type, chan);
1733
1734 return lp_build_gather_values(&ctx->gallivm, values, 4);
1735 }
1736
1737 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1738 idx = reg->Register.Index * 4 + swizzle;
1739
1740 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1741 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
1742 LLVMValueRef index;
1743 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1744 reg->Dimension.Index,
1745 SI_NUM_CONST_BUFFERS);
1746 bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
1747 } else {
1748 bufp = load_const_buffer_desc(ctx, buf);
}
1749
1750 if (reg->Register.Indirect) {
1751 addr = ctx->addrs[ireg->Index][ireg->Swizzle];
1752 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1753 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1754 addr = lp_build_add(&bld_base->uint_bld, addr,
1755 LLVMConstInt(ctx->i32, idx * 4, 0));
1756 } else {
1757 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
1758 }
1759
1760 result = buffer_load_const(ctx, bufp, addr);
1761
1762 if (!tgsi_type_is_64bit(type))
1763 result = bitcast(bld_base, type, result);
1764 else {
1765 LLVMValueRef addr2, result2;
1766
1767 addr2 = lp_build_add(&bld_base->uint_bld, addr,
1768 LLVMConstInt(ctx->i32, 4, 0));
1769 result2 = buffer_load_const(ctx, bufp, addr2);
1770
1771 result = si_llvm_emit_fetch_64bit(bld_base, type,
1772 result, result2);
1773 }
1774 return result;
1775 }
1776
1777 /* val[0]'s upper 16 bits must be zero; the shift drops val[1]'s upper bits. */
1778 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1779 LLVMValueRef val[2])
1780 {
1781 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1782 LLVMBuildShl(ctx->gallivm.builder, val[1],
1783 LLVMConstInt(ctx->i32, 16, 0),
1784 ""), "");
1785 }
1786
1787 /* Upper 16 bits are ignored and will be dropped. */
1788 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1789 LLVMValueRef val[2])
1790 {
1791 LLVMValueRef v[2] = {
1792 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1793 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1794 val[1],
1795 };
1796 return si_llvm_pack_two_int16(ctx, v);
1797 }
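/* Packing example: val = {0x00001234, 0x0000ABCD} yields 0xABCD1234.
 * Only val[0]'s upper half must be pre-cleared, since the left shift
 * already drops val[1]'s upper bits; that is why the int32 variant
 * above masks val[0] alone.
 */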
1798
1799 /* Initialize arguments for the shader export intrinsic */
1800 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1801 LLVMValueRef *values,
1802 unsigned target,
1803 struct ac_export_args *args)
1804 {
1805 struct si_shader_context *ctx = si_shader_context(bld_base);
1806 struct lp_build_context *base = &bld_base->base;
1807 LLVMBuilderRef builder = ctx->gallivm.builder;
1808 LLVMValueRef val[4];
1809 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1810 unsigned chan;
1811 bool is_int8, is_int10;
1812
1813 /* Default is 0xf. Adjusted below depending on the format. */
1814 args->enabled_channels = 0xf; /* writemask */
1815
1816 /* Specify whether the EXEC mask represents the valid mask */
1817 args->valid_mask = 0;
1818
1819 /* Specify whether this is the last export */
1820 args->done = 0;
1821
1822 /* Specify the target we are exporting */
1823 args->target = target;
1824
1825 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1826 const struct si_shader_key *key = &ctx->shader->key;
1827 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1828 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1829
1830 assert(cbuf >= 0 && cbuf < 8);
1831 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1832 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1833 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1834 }
1835
1836 args->compr = false;
1837 args->out[0] = base->undef;
1838 args->out[1] = base->undef;
1839 args->out[2] = base->undef;
1840 args->out[3] = base->undef;
1841
1842 switch (spi_shader_col_format) {
1843 case V_028714_SPI_SHADER_ZERO:
1844 args->enabled_channels = 0; /* writemask */
1845 args->target = V_008DFC_SQ_EXP_NULL;
1846 break;
1847
1848 case V_028714_SPI_SHADER_32_R:
1849 args->enabled_channels = 1; /* writemask */
1850 args->out[0] = values[0];
1851 break;
1852
1853 case V_028714_SPI_SHADER_32_GR:
1854 args->enabled_channels = 0x3; /* writemask */
1855 args->out[0] = values[0];
1856 args->out[1] = values[1];
1857 break;
1858
1859 case V_028714_SPI_SHADER_32_AR:
1860 args->enabled_channels = 0x9; /* writemask */
1861 args->out[0] = values[0];
1862 args->out[3] = values[3];
1863 break;
1864
1865 case V_028714_SPI_SHADER_FP16_ABGR:
1866 args->compr = 1; /* COMPR flag */
1867
1868 for (chan = 0; chan < 2; chan++) {
1869 LLVMValueRef pack_args[2] = {
1870 values[2 * chan],
1871 values[2 * chan + 1]
1872 };
1873 LLVMValueRef packed;
1874
1875 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1876 args->out[chan] =
1877 LLVMBuildBitCast(ctx->gallivm.builder,
1878 packed, ctx->f32, "");
1879 }
1880 break;
1881
1882 case V_028714_SPI_SHADER_UNORM16_ABGR:
1883 for (chan = 0; chan < 4; chan++) {
1884 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1885 val[chan] = LLVMBuildFMul(builder, val[chan],
1886 LLVMConstReal(ctx->f32, 65535), "");
1887 val[chan] = LLVMBuildFAdd(builder, val[chan],
1888 LLVMConstReal(ctx->f32, 0.5), "");
1889 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1890 ctx->i32, "");
1891 }
1892
1893 args->compr = 1; /* COMPR flag */
1894 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1895 si_llvm_pack_two_int16(ctx, val));
1896 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1897 si_llvm_pack_two_int16(ctx, val+2));
1898 break;
1899
1900 case V_028714_SPI_SHADER_SNORM16_ABGR:
1901 for (chan = 0; chan < 4; chan++) {
1902 /* Clamp between [-1, 1]. */
1903 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1904 values[chan],
1905 LLVMConstReal(ctx->f32, 1));
1906 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1907 val[chan],
1908 LLVMConstReal(ctx->f32, -1));
1909 /* Convert to a signed integer in [-32767, 32767]. */
1910 val[chan] = LLVMBuildFMul(builder, val[chan],
1911 LLVMConstReal(ctx->f32, 32767), "");
1912 /* If positive, add 0.5, else add -0.5. */
1913 val[chan] = LLVMBuildFAdd(builder, val[chan],
1914 LLVMBuildSelect(builder,
1915 LLVMBuildFCmp(builder, LLVMRealOGE,
1916 val[chan], base->zero, ""),
1917 LLVMConstReal(ctx->f32, 0.5),
1918 LLVMConstReal(ctx->f32, -0.5), ""), "");
1919 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1920 }
1921
1922 args->compr = 1; /* COMPR flag */
1923 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1924 si_llvm_pack_two_int32_as_int16(ctx, val));
1925 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1926 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1927 break;
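/* A scalar sketch of the two compressed conversions above (hypothetical
 * helper notation, matching the emitted IR):
 *   unorm16(x) = (uint32_t)(CLAMP(x,  0.0f, 1.0f) * 65535.0f + 0.5f)
 *   snorm16(x) = (int32_t) (CLAMP(x, -1.0f, 1.0f) * 32767.0f
 *                           + (x >= 0.0f ? 0.5f : -0.5f))
 * i.e. round half away from zero, then pack two results per dword.
 */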
1928
1929 case V_028714_SPI_SHADER_UINT16_ABGR: {
1930 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1931 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1932 LLVMValueRef max_alpha =
1933 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1934
1935 /* Clamp. */
1936 for (chan = 0; chan < 4; chan++) {
1937 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1938 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1939 val[chan],
1940 chan == 3 ? max_alpha : max_rgb);
1941 }
1942
1943 args->compr = 1; /* COMPR flag */
1944 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1945 si_llvm_pack_two_int16(ctx, val));
1946 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1947 si_llvm_pack_two_int16(ctx, val+2));
1948 break;
1949 }
1950
1951 case V_028714_SPI_SHADER_SINT16_ABGR: {
1952 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1953 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1954 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1955 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1956 LLVMValueRef max_alpha =
1957 !is_int10 ? max_rgb : ctx->i32_1;
1958 LLVMValueRef min_alpha =
1959 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1960
1961 /* Clamp. */
1962 for (chan = 0; chan < 4; chan++) {
1963 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1964 val[chan] = lp_build_emit_llvm_binary(bld_base,
1965 TGSI_OPCODE_IMIN,
1966 val[chan], chan == 3 ? max_alpha : max_rgb);
1967 val[chan] = lp_build_emit_llvm_binary(bld_base,
1968 TGSI_OPCODE_IMAX,
1969 val[chan], chan == 3 ? min_alpha : min_rgb);
1970 }
1971
1972 args->compr = 1; /* COMPR flag */
1973 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1974 si_llvm_pack_two_int32_as_int16(ctx, val));
1975 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1976 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1977 break;
1978 }
1979
1980 case V_028714_SPI_SHADER_32_ABGR:
1981 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1982 break;
1983 }
1984 }
1985
1986 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1987 LLVMValueRef alpha)
1988 {
1989 struct si_shader_context *ctx = si_shader_context(bld_base);
1990
1991 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1992 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
1993 SI_PARAM_ALPHA_REF);
1994
1995 LLVMValueRef alpha_pass =
1996 lp_build_cmp(&bld_base->base,
1997 ctx->shader->key.part.ps.epilog.alpha_func,
1998 alpha, alpha_ref);
1999 LLVMValueRef arg =
2000 lp_build_select(&bld_base->base,
2001 alpha_pass,
2002 LLVMConstReal(ctx->f32, 1.0f),
2003 LLVMConstReal(ctx->f32, -1.0f));
2004
2005 ac_build_kill(&ctx->ac, arg);
2006 } else {
2007 ac_build_kill(&ctx->ac, NULL);
2008 }
2009 }
2010
2011 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2012 LLVMValueRef alpha,
2013 unsigned samplemask_param)
2014 {
2015 struct si_shader_context *ctx = si_shader_context(bld_base);
2016 struct gallivm_state *gallivm = &ctx->gallivm;
2017 LLVMValueRef coverage;
2018
2019 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2020 coverage = LLVMGetParam(ctx->main_fn,
2021 samplemask_param);
2022 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2023
2024 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2025 ctx->i32,
2026 &coverage, 1, LP_FUNC_ATTR_READNONE);
2027
2028 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2029 ctx->f32, "");
2030
2031 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2032 LLVMConstReal(ctx->f32,
2033 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2034
2035 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2036 }
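/* E.g. if half of the SI_NUM_SMOOTH_AA_SAMPLES coverage bits are set,
 * popcount halves alpha, fading edge pixels proportionally to their
 * sample coverage.
 */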
2037
2038 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2039 struct ac_export_args *pos, LLVMValueRef *out_elts)
2040 {
2041 struct si_shader_context *ctx = si_shader_context(bld_base);
2042 struct lp_build_context *base = &bld_base->base;
2043 unsigned reg_index;
2044 unsigned chan;
2045 unsigned const_chan;
2046 LLVMValueRef base_elt;
2047 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2048 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2049 SI_VS_CONST_CLIP_PLANES, 0);
2050 LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);
2051
2052 for (reg_index = 0; reg_index < 2; reg_index ++) {
2053 struct ac_export_args *args = &pos[2 + reg_index];
2054
2055 args->out[0] =
2056 args->out[1] =
2057 args->out[2] =
2058 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2059
2060 /* Compute dot products of position and user clip plane vectors */
2061 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2062 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2063 LLVMValueRef addr =
2064 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2065 const_chan) * 4, 0);
2066 base_elt = buffer_load_const(ctx, const_resource,
2067 addr);
2068 args->out[chan] =
2069 lp_build_add(base, args->out[chan],
2070 lp_build_mul(base, base_elt,
2071 out_elts[const_chan]));
2072 }
2073 }
2074
2075 args->enabled_channels = 0xf;
2076 args->valid_mask = 0;
2077 args->done = 0;
2078 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2079 args->compr = 0;
2080 }
2081 }
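/* Constant buffer layout assumed above: 8 user clip planes stored as
 * consecutive vec4s, so plane p, component c sits at byte offset
 * (p * 4 + c) * 4.  With p = reg_index * 4 + chan, the expression
 * ((reg_index * 4 + chan) * 4 + const_chan) * 4 selects exactly that
 * element for the dot product.
 */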
2082
2083 static void si_dump_streamout(struct pipe_stream_output_info *so)
2084 {
2085 unsigned i;
2086
2087 if (so->num_outputs)
2088 fprintf(stderr, "STREAMOUT\n");
2089
2090 for (i = 0; i < so->num_outputs; i++) {
2091 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2092 so->output[i].start_component;
2093 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2094 i, so->output[i].output_buffer,
2095 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2096 so->output[i].register_index,
2097 mask & 1 ? "x" : "",
2098 mask & 2 ? "y" : "",
2099 mask & 4 ? "z" : "",
2100 mask & 8 ? "w" : "");
2101 }
2102 }
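/* Example line printed above:  0: BUF0[0..3] <- OUT[2].xyzw */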
2103
2104 static void emit_streamout_output(struct si_shader_context *ctx,
2105 LLVMValueRef const *so_buffers,
2106 LLVMValueRef const *so_write_offsets,
2107 struct pipe_stream_output *stream_out,
2108 struct si_shader_output_values *shader_out)
2109 {
2110 struct gallivm_state *gallivm = &ctx->gallivm;
2111 LLVMBuilderRef builder = gallivm->builder;
2112 unsigned buf_idx = stream_out->output_buffer;
2113 unsigned start = stream_out->start_component;
2114 unsigned num_comps = stream_out->num_components;
2115 LLVMValueRef out[4];
2116
2117 assert(num_comps && num_comps <= 4);
2118 if (!num_comps || num_comps > 4)
2119 return;
2120
2121 /* Load the output as int. */
2122 for (int j = 0; j < num_comps; j++) {
2123 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2124
2125 out[j] = LLVMBuildBitCast(builder,
2126 shader_out->values[start + j],
2127 ctx->i32, "");
2128 }
2129
2130 /* Pack the output. */
2131 LLVMValueRef vdata = NULL;
2132
2133 switch (num_comps) {
2134 case 1: /* as i32 */
2135 vdata = out[0];
2136 break;
2137 case 2: /* as v2i32 */
2138 case 3: /* as v4i32 (aligned to 4) */
2139 case 4: /* as v4i32 */
2140 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2141 for (int j = 0; j < num_comps; j++) {
2142 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2143 LLVMConstInt(ctx->i32, j, 0), "");
2144 }
2145 break;
2146 }
2147
2148 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2149 vdata, num_comps,
2150 so_write_offsets[buf_idx],
2151 ctx->i32_0,
2152 stream_out->dst_offset * 4, 1, 1, true, false);
2153 }
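/* Note: the 3-component case above rounds the vector up to v4i32 (hence
 * the "aligned to 4" comment), since the vector types come in 1-, 2- and
 * 4-dword widths; the extra lane is left undef.
 */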
2154
2155 /**
2156 * Write streamout data to buffers for vertex stream @p stream (different
2157 * vertex streams can occur for GS copy shaders).
2158 */
2159 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2160 struct si_shader_output_values *outputs,
2161 unsigned noutput, unsigned stream)
2162 {
2163 struct si_shader_selector *sel = ctx->shader->selector;
2164 struct pipe_stream_output_info *so = &sel->so;
2165 struct gallivm_state *gallivm = &ctx->gallivm;
2166 LLVMBuilderRef builder = gallivm->builder;
2167 int i;
2168 struct lp_build_if_state if_ctx;
2169
2170 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2171 LLVMValueRef so_vtx_count =
2172 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2173
2174 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2175
2176 /* can_emit = tid < so_vtx_count; */
2177 LLVMValueRef can_emit =
2178 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2179
2180 /* Emit the streamout code conditionally. This actually avoids
2181 * out-of-bounds buffer access. The hw tells us via the SGPR
2182 * (so_vtx_count) which threads are allowed to emit streamout data. */
2183 lp_build_if(&if_ctx, gallivm, can_emit);
2184 {
2185 /* The buffer offset is computed as follows:
2186 * ByteOffset = streamout_offset[buffer_id]*4 +
2187 * (streamout_write_index + thread_id)*stride[buffer_id] +
2188 * attrib_offset
2189 */
2190
2191 LLVMValueRef so_write_index =
2192 LLVMGetParam(ctx->main_fn,
2193 ctx->param_streamout_write_index);
2194
2195 /* Compute (streamout_write_index + thread_id). */
2196 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2197
2198 /* Load the descriptor and compute the write offset for each
2199 * enabled buffer. */
2200 LLVMValueRef so_write_offset[4] = {};
2201 LLVMValueRef so_buffers[4];
2202 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2203 ctx->param_rw_buffers);
2204
2205 for (i = 0; i < 4; i++) {
2206 if (!so->stride[i])
2207 continue;
2208
2209 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2210 SI_VS_STREAMOUT_BUF0 + i, 0);
2211
2212 so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
2213
2214 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2215 ctx->param_streamout_offset[i]);
2216 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2217
2218 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2219 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2220 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2221 }
2222
2223 /* Write streamout data. */
2224 for (i = 0; i < so->num_outputs; i++) {
2225 unsigned reg = so->output[i].register_index;
2226
2227 if (reg >= noutput)
2228 continue;
2229
2230 if (stream != so->output[i].stream)
2231 continue;
2232
2233 emit_streamout_output(ctx, so_buffers, so_write_offset,
2234 &so->output[i], &outputs[reg]);
2235 }
2236 }
2237 lp_build_endif(&if_ctx);
2238 }
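/* Offset arithmetic example for the code above: with stride[0] = 4 dwords
 * (16 bytes), streamout_offset[0] = 8 dwords and write_index + tid = 5,
 * the byte offset of an attrib with dst_offset 0 is
 * 8 * 4 + 5 * 16 = 112 bytes into BUF0; a non-zero dst_offset adds
 * dst_offset * 4 bytes in the store itself.
 */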
2239
2240
2241 /* Generate export instructions for hardware VS shader stage */
2242 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2243 struct si_shader_output_values *outputs,
2244 unsigned noutput)
2245 {
2246 struct si_shader_context *ctx = si_shader_context(bld_base);
2247 struct si_shader *shader = ctx->shader;
2248 struct lp_build_context *base = &bld_base->base;
2249 struct ac_export_args args, pos_args[4] = {};
2250 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2251 unsigned semantic_name, semantic_index;
2252 unsigned target;
2253 unsigned param_count = 0;
2254 unsigned pos_idx;
2255 int i;
2256
2257 for (i = 0; i < noutput; i++) {
2258 semantic_name = outputs[i].semantic_name;
2259 semantic_index = outputs[i].semantic_index;
2260 bool export_param = true;
2261
2262 switch (semantic_name) {
2263 case TGSI_SEMANTIC_POSITION: /* ignore these */
2264 case TGSI_SEMANTIC_PSIZE:
2265 case TGSI_SEMANTIC_CLIPVERTEX:
2266 case TGSI_SEMANTIC_EDGEFLAG:
2267 break;
2268 case TGSI_SEMANTIC_GENERIC:
2269 case TGSI_SEMANTIC_CLIPDIST:
2270 if (shader->key.opt.hw_vs.kill_outputs &
2271 (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
2272 export_param = false;
2273 break;
2274 default:
2275 if (shader->key.opt.hw_vs.kill_outputs2 &
2276 (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
2277 export_param = false;
2278 break;
2279 }
2280
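/* An output whose every component belongs to a non-zero vertex stream
 * never reaches the rasterizer (only stream 0 is rasterized), so don't
 * waste a param export on it.
 */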
2281 if (outputs[i].vertex_stream[0] != 0 &&
2282 outputs[i].vertex_stream[1] != 0 &&
2283 outputs[i].vertex_stream[2] != 0 &&
2284 outputs[i].vertex_stream[3] != 0)
2285 export_param = false;
2286
2287 handle_semantic:
2288 /* Select the correct target */
2289 switch(semantic_name) {
2290 case TGSI_SEMANTIC_PSIZE:
2291 psize_value = outputs[i].values[0];
2292 continue;
2293 case TGSI_SEMANTIC_EDGEFLAG:
2294 edgeflag_value = outputs[i].values[0];
2295 continue;
2296 case TGSI_SEMANTIC_LAYER:
2297 layer_value = outputs[i].values[0];
2298 semantic_name = TGSI_SEMANTIC_GENERIC;
2299 goto handle_semantic;
2300 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2301 viewport_index_value = outputs[i].values[0];
2302 semantic_name = TGSI_SEMANTIC_GENERIC;
2303 goto handle_semantic;
2304 case TGSI_SEMANTIC_POSITION:
2305 target = V_008DFC_SQ_EXP_POS;
2306 break;
2307 case TGSI_SEMANTIC_CLIPDIST:
2308 if (shader->key.opt.hw_vs.clip_disable) {
2309 semantic_name = TGSI_SEMANTIC_GENERIC;
2310 goto handle_semantic;
2311 }
2312 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2313 break;
2314 case TGSI_SEMANTIC_CLIPVERTEX:
2315 if (shader->key.opt.hw_vs.clip_disable)
2316 continue;
2317 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2318 continue;
2319 case TGSI_SEMANTIC_COLOR:
2320 case TGSI_SEMANTIC_BCOLOR:
2321 case TGSI_SEMANTIC_PRIMID:
2322 case TGSI_SEMANTIC_FOG:
2323 case TGSI_SEMANTIC_TEXCOORD:
2324 case TGSI_SEMANTIC_GENERIC:
2325 if (!export_param)
2326 continue;
2327 target = V_008DFC_SQ_EXP_PARAM + param_count;
2328 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2329 shader->info.vs_output_param_offset[i] = param_count;
2330 param_count++;
2331 break;
2332 default:
2333 target = 0;
2334 fprintf(stderr,
2335 "Warning: SI unhandled vs output type:%d\n",
2336 semantic_name);
2337 }
2338
2339 si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);
2340
2341 if (target >= V_008DFC_SQ_EXP_POS &&
2342 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2343 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
2344 &args, sizeof(args));
2345 } else {
2346 ac_build_export(&ctx->ac, &args);
2347 }
2348
2349 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2350 semantic_name = TGSI_SEMANTIC_GENERIC;
2351 goto handle_semantic;
2352 }
2353 }
2354
2355 shader->info.nr_param_exports = param_count;
2356
2357 /* We need to add the position output manually if it's missing. */
2358 if (!pos_args[0].out[0]) {
2359 pos_args[0].enabled_channels = 0xf; /* writemask */
2360 pos_args[0].valid_mask = 0; /* EXEC mask */
2361 pos_args[0].done = 0; /* last export? */
2362 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2363 pos_args[0].compr = 0; /* COMPR flag */
2364 pos_args[0].out[0] = base->zero; /* X */
2365 pos_args[0].out[1] = base->zero; /* Y */
2366 pos_args[0].out[2] = base->zero; /* Z */
2367 pos_args[0].out[3] = base->one; /* W */
2368 }
2369
2370 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2371 if (shader->selector->info.writes_psize ||
2372 shader->selector->info.writes_edgeflag ||
2373 shader->selector->info.writes_viewport_index ||
2374 shader->selector->info.writes_layer) {
2375 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2376 (shader->selector->info.writes_edgeflag << 1) |
2377 (shader->selector->info.writes_layer << 2) |
2378 (shader->selector->info.writes_viewport_index << 3);
2379 pos_args[1].valid_mask = 0; /* EXEC mask */
2380 pos_args[1].done = 0; /* last export? */
2381 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2382 pos_args[1].compr = 0; /* COMPR flag */
2383 pos_args[1].out[0] = base->zero; /* X */
2384 pos_args[1].out[1] = base->zero; /* Y */
2385 pos_args[1].out[2] = base->zero; /* Z */
2386 pos_args[1].out[3] = base->zero; /* W */
2387
2388 if (shader->selector->info.writes_psize)
2389 pos_args[1].out[0] = psize_value;
2390
2391 if (shader->selector->info.writes_edgeflag) {
2392 /* The output is a float, but the hw expects an integer
2393 * with the first bit containing the edge flag. */
2394 edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
2395 edgeflag_value,
2396 ctx->i32, "");
2397 edgeflag_value = lp_build_min(&bld_base->int_bld,
2398 edgeflag_value,
2399 ctx->i32_1);
2400
2401 /* The LLVM intrinsic expects a float. */
2402 pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
2403 edgeflag_value,
2404 ctx->f32, "");
2405 }
2406
2407 if (shader->selector->info.writes_layer)
2408 pos_args[1].out[2] = layer_value;
2409
2410 if (shader->selector->info.writes_viewport_index)
2411 pos_args[1].out[3] = viewport_index_value;
2412 }
2413
2414 for (i = 0; i < 4; i++)
2415 if (pos_args[i].out[0])
2416 shader->info.nr_pos_exports++;
2417
2418 pos_idx = 0;
2419 for (i = 0; i < 4; i++) {
2420 if (!pos_args[i].out[0])
2421 continue;
2422
2423 /* Specify the target we are exporting */
2424 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
2425
2426 if (pos_idx == shader->info.nr_pos_exports)
2427 /* Specify that this is the last export */
2428 pos_args[i].done = 1;
2429
2430 ac_build_export(&ctx->ac, &pos_args[i]);
2431 }
2432 }
2433
2434 /**
2435 * Forward all outputs from the vertex shader to the TES. This is only used
2436 * for the fixed function TCS.
2437 */
2438 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2439 {
2440 struct si_shader_context *ctx = si_shader_context(bld_base);
2441 struct gallivm_state *gallivm = &ctx->gallivm;
2442 LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2443 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2444 uint64_t inputs;
2445
2446 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2447
2448 rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2449 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2450 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2451
2452 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2453
2454 lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2455 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2456 lds_vertex_stride, "");
2457 lds_base = get_tcs_in_current_patch_offset(ctx);
2458 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2459
2460 inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
2461 while (inputs) {
2462 unsigned i = u_bit_scan64(&inputs);
2463
2464 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2465 LLVMConstInt(ctx->i32, 4 * i, 0),
2466 "");
2467
2468 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2469 get_rel_patch_id(ctx),
2470 invocation_id,
2471 LLVMConstInt(ctx->i32, i, 0));
2472
2473 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2474 lds_ptr);
2475
2476 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2477 buffer_offset, 0, 1, 0, true, false);
2478 }
2479 }
2480
2481 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2482 LLVMValueRef rel_patch_id,
2483 LLVMValueRef invocation_id,
2484 LLVMValueRef tcs_out_current_patch_data_offset)
2485 {
2486 struct si_shader_context *ctx = si_shader_context(bld_base);
2487 struct gallivm_state *gallivm = &ctx->gallivm;
2488 struct si_shader *shader = ctx->shader;
2489 unsigned tess_inner_index, tess_outer_index;
2490 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2491 LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
2492 unsigned stride, outer_comps, inner_comps, i, offset;
2493 struct lp_build_if_state if_ctx, inner_if_ctx;
2494
2495 si_llvm_emit_barrier(NULL, bld_base, NULL);
2496
2497 /* Do this only for invocation 0, because the tess levels are per-patch,
2498 * not per-vertex.
2499 *
2500 * The conditional can never skip the block entirely, because invocation 0
2501 * always executes it; at minimum it masks out the loads and stores for the
2501 * other invocations.
2502 */
2503 lp_build_if(&if_ctx, gallivm,
2504 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2505 invocation_id, ctx->i32_0, ""));
2506
2507 /* Determine the layout of one tess factor element in the buffer. */
2508 switch (shader->key.part.tcs.epilog.prim_mode) {
2509 case PIPE_PRIM_LINES:
2510 stride = 2; /* 2 dwords, 1 vec2 store */
2511 outer_comps = 2;
2512 inner_comps = 0;
2513 break;
2514 case PIPE_PRIM_TRIANGLES:
2515 stride = 4; /* 4 dwords, 1 vec4 store */
2516 outer_comps = 3;
2517 inner_comps = 1;
2518 break;
2519 case PIPE_PRIM_QUADS:
2520 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2521 outer_comps = 4;
2522 inner_comps = 2;
2523 break;
2524 default:
2525 assert(0);
2526 return;
2527 }
2528
2529 /* Load tess_inner and tess_outer from LDS.
2530 * Any invocation can write them, so we can't get them from a temporary.
2531 */
2532 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2533 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2534
2535 lds_base = tcs_out_current_patch_data_offset;
2536 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2537 LLVMConstInt(ctx->i32,
2538 tess_inner_index * 4, 0), "");
2539 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2540 LLVMConstInt(ctx->i32,
2541 tess_outer_index * 4, 0), "");
2542
2543 for (i = 0; i < 4; i++) {
2544 inner[i] = LLVMGetUndef(ctx->i32);
2545 outer[i] = LLVMGetUndef(ctx->i32);
2546 }
2547
2548 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
2549 /* For isolines, the hardware expects tess factors in the
2550 * reverse order from what GLSL / TGSI specify.
2551 */
2552 outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
2553 outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
2554 } else {
2555 for (i = 0; i < outer_comps; i++) {
2556 outer[i] = out[i] =
2557 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2558 }
2559 for (i = 0; i < inner_comps; i++) {
2560 inner[i] = out[outer_comps+i] =
2561 lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2562 }
2563 }
2564
2565 /* Convert the outputs to vectors for stores. */
2566 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2567 vec1 = NULL;
2568
2569 if (stride > 4)
2570 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2571
2572 /* Get the buffer. */
2573 rw_buffers = LLVMGetParam(ctx->main_fn,
2574 ctx->param_rw_buffers);
2575 buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2576 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));
2577
2578 /* Get the offset. */
2579 tf_base = LLVMGetParam(ctx->main_fn,
2580 ctx->param_tcs_factor_offset);
2581 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2582 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
2583
2584 lp_build_if(&inner_if_ctx, gallivm,
2585 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2586 rel_patch_id, ctx->i32_0, ""));
2587
2588 /* Store the dynamic HS control word. */
2589 offset = 0;
2590 if (ctx->screen->b.chip_class <= VI) {
2591 ac_build_buffer_store_dword(&ctx->ac, buffer,
2592 LLVMConstInt(ctx->i32, 0x80000000, 0),
2593 1, ctx->i32_0, tf_base,
2594 offset, 1, 0, true, false);
2595 offset += 4;
2596 }
2597
2598 lp_build_endif(&inner_if_ctx);
2599
2600 /* Store the tessellation factors. */
2601 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
2602 MIN2(stride, 4), byteoffset, tf_base,
2603 offset, 1, 0, true, false);
2604 offset += 16;
2605 if (vec1)
2606 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
2607 stride - 4, byteoffset, tf_base,
2608 offset, 1, 0, true, false);
2609
2610 /* Store the tess factors into the offchip buffer if TES reads them. */
2611 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
2612 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
2613 LLVMValueRef tf_inner_offset;
2614 unsigned param_outer, param_inner;
2615
2616 buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
2617 LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
2618 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2619
2620 param_outer = si_shader_io_get_unique_index(
2621 TGSI_SEMANTIC_TESSOUTER, 0);
2622 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2623 LLVMConstInt(ctx->i32, param_outer, 0));
2624
2625 outer_vec = lp_build_gather_values(gallivm, outer,
2626 util_next_power_of_two(outer_comps));
2627
2628 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
2629 outer_comps, tf_outer_offset,
2630 base, 0, 1, 0, true, false);
2631 if (inner_comps) {
2632 param_inner = si_shader_io_get_unique_index(
2633 TGSI_SEMANTIC_TESSINNER, 0);
2634 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
2635 LLVMConstInt(ctx->i32, param_inner, 0));
2636
2637 inner_vec = inner_comps == 1 ? inner[0] :
2638 lp_build_gather_values(gallivm, inner, inner_comps);
2639 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
2640 inner_comps, tf_inner_offset,
2641 base, 0, 1, 0, true, false);
2642 }
2643 }
2644
2645 lp_build_endif(&if_ctx);
2646 }
2647
2648 static LLVMValueRef
2649 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2650 unsigned param, unsigned return_index)
2651 {
2652 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2653 LLVMGetParam(ctx->main_fn, param),
2654 return_index, "");
2655 }
2656
2657 static LLVMValueRef
2658 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2659 unsigned param, unsigned return_index)
2660 {
2661 LLVMBuilderRef builder = ctx->gallivm.builder;
2662 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2663
2664 return LLVMBuildInsertValue(builder, ret,
2665 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2666 return_index, "");
2667 }
2668
2669 static LLVMValueRef
2670 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2671 unsigned param, unsigned return_index)
2672 {
2673 LLVMBuilderRef builder = ctx->gallivm.builder;
2674 LLVMValueRef ptr, lo, hi;
2675
2676 ptr = LLVMGetParam(ctx->main_fn, param);
2677 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2678 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2679 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2680 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2681 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2682 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2683 }
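/* E.g. a 64-bit pointer 0x0000800012340000 returns lo = 0x12340000 in
 * return_index and hi = 0x00008000 in return_index + 1 (little-endian
 * element order of the v2i32 bitcast).
 */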
2684
2685 /* This only writes the tessellation factor levels. */
2686 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2687 {
2688 struct si_shader_context *ctx = si_shader_context(bld_base);
2689 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2690 LLVMValueRef offchip_soffset, offchip_layout;
2691
2692 si_copy_tcs_inputs(bld_base);
2693
2694 rel_patch_id = get_rel_patch_id(ctx);
2695 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2696 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2697
2698 /* Return epilog parameters from this function. */
2699 LLVMBuilderRef builder = ctx->gallivm.builder;
2700 LLVMValueRef ret = ctx->return_value;
2701 LLVMValueRef tf_soffset;
2702 unsigned vgpr;
2703
2704 offchip_layout = LLVMGetParam(ctx->main_fn,
2705 ctx->param_tcs_offchip_layout);
2706 offchip_soffset = LLVMGetParam(ctx->main_fn,
2707 ctx->param_tcs_offchip_offset);
2708 tf_soffset = LLVMGetParam(ctx->main_fn,
2709 ctx->param_tcs_factor_offset);
2710
2711 ret = si_insert_input_ptr_as_2xi32(ctx, ret,
2712 ctx->param_rw_buffers, 0);
2713
2714 if (ctx->screen->b.chip_class >= GFX9) {
2715 ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2716 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT, "");
2717 /* Tess offchip and tess factor offsets are at the beginning. */
2718 ret = LLVMBuildInsertValue(builder, ret, offchip_soffset, 2, "");
2719 ret = LLVMBuildInsertValue(builder, ret, tf_soffset, 4, "");
2720 vgpr = 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT + 1;
2721 } else {
2722 ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
2723 GFX6_SGPR_TCS_OFFCHIP_LAYOUT, "");
2724 /* Tess offchip and tess factor offsets are after user SGPRs. */
2725 ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
2726 GFX6_TCS_NUM_USER_SGPR, "");
2727 ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2728 GFX6_TCS_NUM_USER_SGPR + 1, "");
2729 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
2730 }
2731
2732 /* VGPRs */
2733 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2734 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2735 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2736
2737 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2738 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2739 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2740 ctx->return_value = ret;
2741 }
2742
2743 /* Pass TCS inputs from LS to TCS on GFX9. */
2744 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
2745 {
2746 LLVMValueRef ret = ctx->return_value;
2747
2748 ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
2749 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
2750 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
2751 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
2752 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
2753
2754 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
2755 8 + SI_SGPR_VS_STATE_BITS);
2756 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
2757 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
2758 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
2759 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
2760 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
2761 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
2762
2763 unsigned desc_param = ctx->param_tcs_out_lds_layout + 2;
2764 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
2765 8 + GFX9_SGPR_TCS_CONST_BUFFERS);
2766 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
2767 8 + GFX9_SGPR_TCS_SAMPLERS);
2768 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
2769 8 + GFX9_SGPR_TCS_IMAGES);
2770 ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
2771 8 + GFX9_SGPR_TCS_SHADER_BUFFERS);
2772
2773 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
2774 ret = si_insert_input_ret_float(ctx, ret,
2775 ctx->param_tcs_patch_id, vgpr++);
2776 ret = si_insert_input_ret_float(ctx, ret,
2777 ctx->param_tcs_rel_ids, vgpr++);
2778 ctx->return_value = ret;
2779 }
2780
2781 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2782 {
2783 struct si_shader_context *ctx = si_shader_context(bld_base);
2784 struct si_shader *shader = ctx->shader;
2785 struct tgsi_shader_info *info = &shader->selector->info;
2786 struct gallivm_state *gallivm = &ctx->gallivm;
2787 unsigned i, chan;
2788 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
2789 ctx->param_rel_auto_id);
2790 LLVMValueRef vertex_dw_stride =
2791 unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
2792 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2793 vertex_dw_stride, "");
2794
2795 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2796 * its inputs from it. */
2797 for (i = 0; i < info->num_outputs; i++) {
2798 LLVMValueRef *out_ptr = ctx->outputs[i];
2799 unsigned name = info->output_semantic_name[i];
2800 unsigned index = info->output_semantic_index[i];
2801
2802 /* The ARB_shader_viewport_layer_array spec contains the
2803 * following issue:
2804 *
2805 * 2) What happens if gl_ViewportIndex or gl_Layer is
2806 * written in the vertex shader and a geometry shader is
2807 * present?
2808 *
2809 * RESOLVED: The value written by the last vertex processing
2810 * stage is used. If the last vertex processing stage
2811 * (vertex, tessellation evaluation or geometry) does not
2812 * statically assign to gl_ViewportIndex or gl_Layer, index
2813 * or layer zero is assumed.
2814 *
2815 * So writes to those outputs in VS-as-LS are simply ignored.
2816 */
2817 if (name == TGSI_SEMANTIC_LAYER ||
2818 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
2819 continue;
2820
2821 int param = si_shader_io_get_unique_index(name, index);
2822 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2823 LLVMConstInt(ctx->i32, param * 4, 0), "");
2824
2825 for (chan = 0; chan < 4; chan++) {
2826 lds_store(bld_base, chan, dw_addr,
2827 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2828 }
2829 }
2830
2831 if (ctx->screen->b.chip_class >= GFX9)
2832 si_set_ls_return_value_for_tcs(ctx);
2833 }
2834
2835 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2836 {
2837 struct si_shader_context *ctx = si_shader_context(bld_base);
2838 struct gallivm_state *gallivm = &ctx->gallivm;
2839 struct si_shader *es = ctx->shader;
2840 struct tgsi_shader_info *info = &es->selector->info;
2841 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
2842 ctx->param_es2gs_offset);
2843 unsigned chan;
2844 int i;
2845
2846 for (i = 0; i < info->num_outputs; i++) {
2847 LLVMValueRef *out_ptr = ctx->outputs[i];
2848 int param_index;
2849
2850 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2851 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2852 continue;
2853
2854 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2855 info->output_semantic_index[i]);
2856
2857 for (chan = 0; chan < 4; chan++) {
2858 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2859 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2860
2861 ac_build_buffer_store_dword(&ctx->ac,
2862 ctx->esgs_ring,
2863 out_val, 1, NULL, soffset,
2864 (4 * param_index + chan) * 4,
2865 1, 1, true, true);
2866 }
2867 }
2868 }
2869
2870 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2871 {
2872 if (ctx->screen->b.chip_class >= GFX9)
2873 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2874 else
2875 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2876 }
2877
2878 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2879 {
2880 struct si_shader_context *ctx = si_shader_context(bld_base);
2881
2882 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2883 si_get_gs_wave_id(ctx));
2884 }
2885
2886 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2887 {
2888 struct si_shader_context *ctx = si_shader_context(bld_base);
2889 struct gallivm_state *gallivm = &ctx->gallivm;
2890 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2891 struct si_shader_output_values *outputs = NULL;
2892 int i,j;
2893
2894 assert(!ctx->shader->is_gs_copy_shader);
2895
2896 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2897
2898 /* Vertex color clamping.
2899 *
2900 * A state constant is loaded from a user data SGPR, and an IF
2901 * statement clamps all colors when the constant's first bit
2902 * is set.
2903 */
2904 if (ctx->type == PIPE_SHADER_VERTEX) {
2905 struct lp_build_if_state if_ctx;
2906 LLVMValueRef cond = NULL;
2907 LLVMValueRef addr, val;
2908
2909 for (i = 0; i < info->num_outputs; i++) {
2910 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2911 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2912 continue;
2913
2914 /* We've found a color. */
2915 if (!cond) {
2916 /* The state is in the first bit of the user SGPR. */
2917 cond = LLVMGetParam(ctx->main_fn,
2918 ctx->param_vs_state_bits);
2919 cond = LLVMBuildTrunc(gallivm->builder, cond,
2920 ctx->i1, "");
2921 lp_build_if(&if_ctx, gallivm, cond);
2922 }
2923
2924 for (j = 0; j < 4; j++) {
2925 addr = ctx->outputs[i][j];
2926 val = LLVMBuildLoad(gallivm->builder, addr, "");
2927 val = ac_build_clamp(&ctx->ac, val);
2928 LLVMBuildStore(gallivm->builder, val, addr);
2929 }
2930 }
2931
2932 if (cond)
2933 lp_build_endif(&if_ctx);
2934 }
2935
2936 for (i = 0; i < info->num_outputs; i++) {
2937 outputs[i].semantic_name = info->output_semantic_name[i];
2938 outputs[i].semantic_index = info->output_semantic_index[i];
2939
2940 for (j = 0; j < 4; j++) {
2941 outputs[i].values[j] =
2942 LLVMBuildLoad(gallivm->builder,
2943 ctx->outputs[i][j],
2944 "");
2945 outputs[i].vertex_stream[j] =
2946 (info->output_streams[i] >> (2 * j)) & 3;
2947 }
2949 }
2950
2951 /* Return the primitive ID from the LLVM function. */
2952 ctx->return_value =
2953 LLVMBuildInsertValue(gallivm->builder,
2954 ctx->return_value,
2955 bitcast(bld_base, TGSI_TYPE_FLOAT,
2956 get_primitive_id(bld_base, 0)),
2957 VS_EPILOG_PRIMID_LOC, "");
2958
2959 if (ctx->shader->selector->so.num_outputs)
2960 si_llvm_emit_streamout(ctx, outputs, i, 0);
2961 si_llvm_export_vs(bld_base, outputs, i);
2962 FREE(outputs);
2963 }
2964
2965 struct si_ps_exports {
2966 unsigned num;
2967 struct ac_export_args args[10];
2968 };
2969
2970 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
2971 bool writes_samplemask)
2972 {
2973 if (writes_z) {
2974 /* Z needs 32 bits. */
2975 if (writes_samplemask)
2976 return V_028710_SPI_SHADER_32_ABGR;
2977 else if (writes_stencil)
2978 return V_028710_SPI_SHADER_32_GR;
2979 else
2980 return V_028710_SPI_SHADER_32_R;
2981 } else if (writes_stencil || writes_samplemask) {
2982 /* Both stencil and sample mask need only 16 bits. */
2983 return V_028710_SPI_SHADER_UINT16_ABGR;
2984 } else {
2985 return V_028710_SPI_SHADER_ZERO;
2986 }
2987 }
2988
2989 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2990 LLVMValueRef depth, LLVMValueRef stencil,
2991 LLVMValueRef samplemask, struct si_ps_exports *exp)
2992 {
2993 struct si_shader_context *ctx = si_shader_context(bld_base);
2994 struct lp_build_context *base = &bld_base->base;
2995 struct ac_export_args args;
2996 unsigned mask = 0;
2997 unsigned format = si_get_spi_shader_z_format(depth != NULL,
2998 stencil != NULL,
2999 samplemask != NULL);
3000
3001 assert(depth || stencil || samplemask);
3002
3003 args.valid_mask = 1; /* whether the EXEC mask is valid */
3004 args.done = 1; /* DONE bit */
3005
3006 /* Specify the target we are exporting */
3007 args.target = V_008DFC_SQ_EXP_MRTZ;
3008
3009 args.compr = 0; /* COMPR flag */
3010 args.out[0] = base->undef; /* R, depth */
3011 args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
3012 args.out[2] = base->undef; /* B, sample mask */
3013 args.out[3] = base->undef; /* A, alpha to mask */
3014
3015 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
3016 assert(!depth);
3017 args.compr = 1; /* COMPR flag */
3018
3019 if (stencil) {
3020 /* Stencil should be in X[23:16]. */
3021 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
3022 stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
3023 LLVMConstInt(ctx->i32, 16, 0), "");
3024 args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
3025 mask |= 0x3;
3026 }
3027 if (samplemask) {
3028 /* SampleMask should be in Y[15:0]. */
3029 args.out[1] = samplemask;
3030 mask |= 0xc;
3031 }
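		/* Presumably because COMPR exports enable channels in 16-bit
		 * pairs, each used 32-bit word sets two mask bits: 0x3 for X
		 * (stencil), 0xc for Y (sample mask).  This is an
		 * interpretation of the masks above, not taken from hardware
		 * docs.
		 */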
3032 } else {
3033 if (depth) {
3034 args.out[0] = depth;
3035 mask |= 0x1;
3036 }
3037 if (stencil) {
3038 args.out[1] = stencil;
3039 mask |= 0x2;
3040 }
3041 if (samplemask) {
3042 args.out[2] = samplemask;
3043 mask |= 0x4;
3044 }
3045 }
3046
3047 /* SI (except OLAND and HAINAN) has a bug where it only looks
3048 * at the X writemask component. */
3049 if (ctx->screen->b.chip_class == SI &&
3050 ctx->screen->b.family != CHIP_OLAND &&
3051 ctx->screen->b.family != CHIP_HAINAN)
3052 mask |= 0x1;
3053
3054 /* Specify which components to enable */
3055 args.enabled_channels = mask;
3056
3057 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3058 }
3059
3060 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3061 LLVMValueRef *color, unsigned index,
3062 unsigned samplemask_param,
3063 bool is_last, struct si_ps_exports *exp)
3064 {
3065 struct si_shader_context *ctx = si_shader_context(bld_base);
3066 struct lp_build_context *base = &bld_base->base;
3067 int i;
3068
3069 /* Clamp color */
3070 if (ctx->shader->key.part.ps.epilog.clamp_color)
3071 for (i = 0; i < 4; i++)
3072 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3073
3074 /* Alpha to one */
3075 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3076 color[3] = base->one;
3077
3078 /* Alpha test */
3079 if (index == 0 &&
3080 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3081 si_alpha_test(bld_base, color[3]);
3082
3083 /* Line & polygon smoothing */
3084 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3085 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3086 samplemask_param);
3087
3088 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3089 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3090 struct ac_export_args args[8];
3091 int c, last = -1;
3092
3093 /* Get the export arguments, also find out what the last one is. */
3094 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3095 si_llvm_init_export_args(bld_base, color,
3096 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3097 if (args[c].enabled_channels)
3098 last = c;
3099 }
3100
3101 /* Emit all exports. */
3102 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3103 if (is_last && last == c) {
3104 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3105 args[c].done = 1; /* DONE bit */
3106 } else if (!args[c].enabled_channels)
3107 continue; /* unnecessary NULL export */
3108
3109 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3110 }
3111 } else {
3112 struct ac_export_args args;
3113
3114 /* Export */
3115 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3116 &args);
3117 if (is_last) {
3118 args.valid_mask = 1; /* whether the EXEC mask is valid */
3119 args.done = 1; /* DONE bit */
3120 } else if (!args.enabled_channels)
3121 return; /* unnecessary NULL export */
3122
3123 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3124 }
3125 }
3126
3127 static void si_emit_ps_exports(struct si_shader_context *ctx,
3128 struct si_ps_exports *exp)
3129 {
3130 for (unsigned i = 0; i < exp->num; i++)
3131 ac_build_export(&ctx->ac, &exp->args[i]);
3132 }
3133
3134 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3135 {
3136 struct si_shader_context *ctx = si_shader_context(bld_base);
3137 struct lp_build_context *base = &bld_base->base;
3138 struct ac_export_args args;
3139
3140 args.enabled_channels = 0x0; /* enabled channels */
3141 args.valid_mask = 1; /* whether the EXEC mask is valid */
3142 args.done = 1; /* DONE bit */
3143 args.target = V_008DFC_SQ_EXP_NULL;
3144 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3145 args.out[0] = base->undef; /* R */
3146 args.out[1] = base->undef; /* G */
3147 args.out[2] = base->undef; /* B */
3148 args.out[3] = base->undef; /* A */
3149
3150 ac_build_export(&ctx->ac, &args);
3151 }
3152
3153 /**
3154 * Return PS outputs in this order:
3155 *
3156 * v[0:3] = color0.xyzw
3157 * v[4:7] = color1.xyzw
3158 * ...
3159 * vN+0 = Depth
3160 * vN+1 = Stencil
3161 * vN+2 = SampleMask
3162 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3163 *
3164 * The alpha-ref SGPR is returned via its original location.
3165 */
3166 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3167 {
3168 struct si_shader_context *ctx = si_shader_context(bld_base);
3169 struct si_shader *shader = ctx->shader;
3170 struct tgsi_shader_info *info = &shader->selector->info;
3171 LLVMBuilderRef builder = ctx->gallivm.builder;
3172 unsigned i, j, first_vgpr, vgpr;
3173
3174 LLVMValueRef color[8][4] = {};
3175 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3176 LLVMValueRef ret;
3177
3178 /* Read the output values. */
3179 for (i = 0; i < info->num_outputs; i++) {
3180 unsigned semantic_name = info->output_semantic_name[i];
3181 unsigned semantic_index = info->output_semantic_index[i];
3182
3183 switch (semantic_name) {
3184 case TGSI_SEMANTIC_COLOR:
3185 assert(semantic_index < 8);
3186 for (j = 0; j < 4; j++) {
3187 LLVMValueRef ptr = ctx->outputs[i][j];
3188 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3189 color[semantic_index][j] = result;
3190 }
3191 break;
3192 case TGSI_SEMANTIC_POSITION:
3193 depth = LLVMBuildLoad(builder,
3194 ctx->outputs[i][2], "");
3195 break;
3196 case TGSI_SEMANTIC_STENCIL:
3197 stencil = LLVMBuildLoad(builder,
3198 ctx->outputs[i][1], "");
3199 break;
3200 case TGSI_SEMANTIC_SAMPLEMASK:
3201 samplemask = LLVMBuildLoad(builder,
3202 ctx->outputs[i][0], "");
3203 break;
3204 default:
3205 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3206 semantic_name);
3207 }
3208 }
3209
3210 /* Fill the return structure. */
3211 ret = ctx->return_value;
3212
3213 /* Set SGPRs. */
3214 ret = LLVMBuildInsertValue(builder, ret,
3215 bitcast(bld_base, TGSI_TYPE_SIGNED,
3216 LLVMGetParam(ctx->main_fn,
3217 SI_PARAM_ALPHA_REF)),
3218 SI_SGPR_ALPHA_REF, "");
3219
3220 /* Set VGPRs */
3221 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3222 for (i = 0; i < ARRAY_SIZE(color); i++) {
3223 if (!color[i][0])
3224 continue;
3225
3226 for (j = 0; j < 4; j++)
3227 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3228 }
3229 if (depth)
3230 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3231 if (stencil)
3232 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3233 if (samplemask)
3234 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3235
3236 /* Add the input sample mask for smoothing at the end. */
3237 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3238 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3239 ret = LLVMBuildInsertValue(builder, ret,
3240 LLVMGetParam(ctx->main_fn,
3241 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3242
3243 ctx->return_value = ret;
3244 }
3245
3246 /**
3247 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3248 * buffer in number of elements and return it as an i32.
3249 */
3250 static LLVMValueRef get_buffer_size(
3251 struct lp_build_tgsi_context *bld_base,
3252 LLVMValueRef descriptor)
3253 {
3254 struct si_shader_context *ctx = si_shader_context(bld_base);
3255 struct gallivm_state *gallivm = &ctx->gallivm;
3256 LLVMBuilderRef builder = gallivm->builder;
3257 LLVMValueRef size =
3258 LLVMBuildExtractElement(builder, descriptor,
3259 LLVMConstInt(ctx->i32, 2, 0), "");
3260
3261 if (ctx->screen->b.chip_class == VI) {
3262 /* On VI, the descriptor contains the size in bytes,
3263 * but TXQ must return the size in elements.
3264 * The stride is always non-zero for resources using TXQ.
3265 */
3266 LLVMValueRef stride =
3267 LLVMBuildExtractElement(builder, descriptor,
3268 ctx->i32_1, "");
3269 stride = LLVMBuildLShr(builder, stride,
3270 LLVMConstInt(ctx->i32, 16, 0), "");
3271 stride = LLVMBuildAnd(builder, stride,
3272 LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3273
3274 size = LLVMBuildUDiv(builder, size, stride, "");
3275 }
3276
3277 return size;
3278 }
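/* VI example for the code above: a buffer of 1024 bytes has dword2 = 1024,
 * and a 16-byte stride sits in dword1 bits [29:16]; TXQ then returns
 * 1024 / 16 = 64 elements.
 */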
3279
3280 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3281 struct lp_build_tgsi_context *bld_base,
3282 struct lp_build_emit_data *emit_data);
3283
3284 /* Prevent optimizations (at least of memory accesses) across the current
3285 * point in the program by emitting empty inline assembly that is marked as
3286 * having side effects.
3287 *
3288 * Optionally, a value can be passed through the inline assembly to prevent
3289 * LLVM from hoisting calls to ReadNone functions.
3290 */
3291 static void emit_optimization_barrier(struct si_shader_context *ctx,
3292 LLVMValueRef *pvgpr)
3293 {
3294 static int counter = 0;
3295
3296 LLVMBuilderRef builder = ctx->gallivm.builder;
3297 char code[16];
3298
3299 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
3300
3301 if (!pvgpr) {
3302 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3303 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
3304 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3305 } else {
3306 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
3307 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
3308 LLVMValueRef vgpr = *pvgpr;
3309 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
3310 unsigned vgpr_size = llvm_get_type_size(vgpr_type);
3311 LLVMValueRef vgpr0;
3312
3313 assert(vgpr_size % 4 == 0);
3314
3315 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
3316 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
3317 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
3318 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
3319 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
3320
3321 *pvgpr = vgpr;
3322 }
3323 }
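/* The atomic counter makes every emitted asm string unique (e.g. "; 42"),
 * so LLVM cannot merge two barriers or CSE one away.  The "=v,0"
 * constraint in the VGPR variant ties the value through a VGPR, which
 * keeps ReadNone calls feeding it from being hoisted across the barrier.
 */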
3324
3325 /* Combine these with & instead of |. */
3326 #define NOOP_WAITCNT 0xf7f
3327 #define LGKM_CNT 0x07f
3328 #define VM_CNT 0xf70
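/* Sketch of the s_waitcnt simm16 encoding assumed here (SI..VI): vmcnt in
 * bits [3:0], expcnt in [6:4], lgkmcnt in [11:8]; a zeroed field means
 * "wait until that counter reaches 0".  ANDing the masks above zeroes
 * exactly the counters to wait on, e.g. VM_CNT & LGKM_CNT = 0x070 waits
 * for both memory and LDS/constant traffic.
 */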
3329
3330 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3331 {
3332 struct gallivm_state *gallivm = &ctx->gallivm;
3333 LLVMBuilderRef builder = gallivm->builder;
3334 LLVMValueRef args[1] = {
3335 LLVMConstInt(ctx->i32, simm16, 0)
3336 };
3337 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3338 ctx->voidt, args, 1, 0);
3339 }
3340
3341 static void membar_emit(
3342 const struct lp_build_tgsi_action *action,
3343 struct lp_build_tgsi_context *bld_base,
3344 struct lp_build_emit_data *emit_data)
3345 {
3346 struct si_shader_context *ctx = si_shader_context(bld_base);
3347 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3348 unsigned flags = LLVMConstIntGetZExtValue(src0);
3349 unsigned waitcnt = NOOP_WAITCNT;
3350
3351 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3352 waitcnt &= VM_CNT & LGKM_CNT;
3353
3354 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3355 TGSI_MEMBAR_SHADER_BUFFER |
3356 TGSI_MEMBAR_SHADER_IMAGE))
3357 waitcnt &= VM_CNT;
3358
3359 if (flags & TGSI_MEMBAR_SHARED)
3360 waitcnt &= LGKM_CNT;
3361
3362 if (waitcnt != NOOP_WAITCNT)
3363 emit_waitcnt(ctx, waitcnt);
3364 }
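/* E.g. MEMBAR with TGSI_MEMBAR_THREAD_GROUP ends up emitting s_waitcnt
 * with simm16 0x070, i.e. vmcnt(0) and lgkmcnt(0), leaving expcnt at
 * no-wait.
 */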
3365
3366 static void clock_emit(
3367 const struct lp_build_tgsi_action *action,
3368 struct lp_build_tgsi_context *bld_base,
3369 struct lp_build_emit_data *emit_data)
3370 {
3371 struct si_shader_context *ctx = si_shader_context(bld_base);
3372 struct gallivm_state *gallivm = &ctx->gallivm;
3373 LLVMValueRef tmp;
3374
3375 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3376 ctx->i64, NULL, 0, 0);
3377 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3378
3379 emit_data->output[0] =
3380 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3381 emit_data->output[1] =
3382 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3383 }
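/* Note: TGSI_OPCODE_CLOCK returns the 64-bit cycle counter as two 32-bit
 * halves, which is why the i64 result is bitcast to v2i32 above: .x gets
 * the low dword and .y the high dword.
 */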
3384
3385 static LLVMValueRef
3386 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3387 const struct tgsi_full_src_register *reg)
3388 {
3389 LLVMValueRef index;
3390 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3391 ctx->param_shader_buffers);
3392
3393 if (!reg->Register.Indirect)
3394 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3395 else
3396 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3397 reg->Register.Index,
3398 SI_NUM_SHADER_BUFFERS);
3399
3400 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3401 }
3402
3403 static bool tgsi_is_array_sampler(unsigned target)
3404 {
3405 return target == TGSI_TEXTURE_1D_ARRAY ||
3406 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3407 target == TGSI_TEXTURE_2D_ARRAY ||
3408 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3409 target == TGSI_TEXTURE_CUBE_ARRAY ||
3410 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3411 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3412 }
3413
3414 static bool tgsi_is_array_image(unsigned target)
3415 {
3416 return target == TGSI_TEXTURE_3D ||
3417 target == TGSI_TEXTURE_CUBE ||
3418 target == TGSI_TEXTURE_1D_ARRAY ||
3419 target == TGSI_TEXTURE_2D_ARRAY ||
3420 target == TGSI_TEXTURE_CUBE_ARRAY ||
3421 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3422 }
3423
3424 /**
3425 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3426 *
3427  * At least on Tonga, executing image stores on images with non-trivial
3428  * DCC enabled can eventually lead to lockups. This can occur when an
3429 * application binds an image as read-only but then uses a shader that writes
3430 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3431 * program termination) in this case, but it doesn't cost much to be a bit
3432 * nicer: disabling DCC in the shader still leads to undefined results but
3433 * avoids the lockup.
3434 */
3435 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3436 LLVMValueRef rsrc)
3437 {
3438 if (ctx->screen->b.chip_class <= CIK) {
3439 return rsrc;
3440 } else {
3441 LLVMBuilderRef builder = ctx->gallivm.builder;
3442 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3443 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3444 LLVMValueRef tmp;
3445
3446 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3447 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3448 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3449 }
3450 }
3451
3452 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3453 {
3454 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3455 CONST_ADDR_SPACE);
3456 }
3457
3458 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3459 LLVMValueRef list, LLVMValueRef index,
3460 unsigned target)
3461 {
3462 LLVMBuilderRef builder = ctx->gallivm.builder;
3463
3464 if (target == TGSI_TEXTURE_BUFFER) {
3465 index = LLVMBuildMul(builder, index,
3466 LLVMConstInt(ctx->i32, 2, 0), "");
3467 index = LLVMBuildAdd(builder, index,
3468 ctx->i32_1, "");
3469 list = LLVMBuildPointerCast(builder, list,
3470 const_array(ctx->v4i32, 0), "");
3471 }
3472
3473 return ac_build_indexed_load_const(&ctx->ac, list, index);
3474 }
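/* Index math sketch for the buffer case above: each image slot is 8 dwords
 * (one v8i32), and a buffer view's v4i32 descriptor lives in the upper half
 * of its slot, dwords [4:7]. Recasting the list to v4i32 and loading element
 * 2*i+1 therefore lands exactly on those dwords; e.g. image slot 3 ->
 * v4i32 element 7 -> dwords [28:31].
 */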
3475
3476 /**
3477 * Load the resource descriptor for \p image.
3478 */
3479 static void
3480 image_fetch_rsrc(
3481 struct lp_build_tgsi_context *bld_base,
3482 const struct tgsi_full_src_register *image,
3483 bool is_store, unsigned target,
3484 LLVMValueRef *rsrc)
3485 {
3486 struct si_shader_context *ctx = si_shader_context(bld_base);
3487 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3488 ctx->param_images);
3489 LLVMValueRef index;
3490 bool dcc_off = is_store;
3491
3492 assert(image->Register.File == TGSI_FILE_IMAGE);
3493
3494 if (!image->Register.Indirect) {
3495 const struct tgsi_shader_info *info = bld_base->info;
3496 unsigned images_writemask = info->images_store |
3497 info->images_atomic;
3498
3499 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3500
3501 if (images_writemask & (1 << image->Register.Index))
3502 dcc_off = true;
3503 } else {
3504 /* From the GL_ARB_shader_image_load_store extension spec:
3505 *
3506 * If a shader performs an image load, store, or atomic
3507 * operation using an image variable declared as an array,
3508 * and if the index used to select an individual element is
3509 * negative or greater than or equal to the size of the
3510 * array, the results of the operation are undefined but may
3511 * not lead to termination.
3512 */
3513 index = get_bounded_indirect_index(ctx, &image->Indirect,
3514 image->Register.Index,
3515 SI_NUM_IMAGES);
3516 }
3517
3518 *rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
3519 if (dcc_off && target != TGSI_TEXTURE_BUFFER)
3520 *rsrc = force_dcc_off(ctx, *rsrc);
3521 }
3522
3523 static LLVMValueRef image_fetch_coords(
3524 struct lp_build_tgsi_context *bld_base,
3525 const struct tgsi_full_instruction *inst,
3526 unsigned src, LLVMValueRef desc)
3527 {
3528 struct si_shader_context *ctx = si_shader_context(bld_base);
3529 struct gallivm_state *gallivm = &ctx->gallivm;
3530 LLVMBuilderRef builder = gallivm->builder;
3531 unsigned target = inst->Memory.Texture;
3532 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3533 LLVMValueRef coords[4];
3534 LLVMValueRef tmp;
3535 int chan;
3536
3537 for (chan = 0; chan < num_coords; ++chan) {
3538 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3539 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3540 coords[chan] = tmp;
3541 }
3542
3543 if (ctx->screen->b.chip_class >= GFX9) {
3544 /* 1D textures are allocated and used as 2D on GFX9. */
3545 if (target == TGSI_TEXTURE_1D) {
3546 coords[1] = ctx->i32_0;
3547 num_coords++;
3548 } else if (target == TGSI_TEXTURE_1D_ARRAY) {
3549 coords[2] = coords[1];
3550 coords[1] = ctx->i32_0;
3551 num_coords++;
3552 } else if (target == TGSI_TEXTURE_2D) {
3553 /* The hw can't bind a slice of a 3D image as a 2D
3554 * image, because it ignores BASE_ARRAY if the target
3555 * is 3D. The workaround is to read BASE_ARRAY and set
3556 * it as the 3rd address operand for all 2D images.
3557 */
3558 LLVMValueRef first_layer, const5, mask;
3559
3560 const5 = LLVMConstInt(ctx->i32, 5, 0);
3561 mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
3562 first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
3563 first_layer = LLVMBuildAnd(builder, first_layer, mask, "");
3564
3565 coords[2] = first_layer;
3566 num_coords++;
3567 }
3568 }
3569
3570 if (num_coords == 1)
3571 return coords[0];
3572
3573 if (num_coords == 3) {
3574 /* LLVM has difficulties lowering 3-element vectors. */
3575 coords[3] = bld_base->uint_bld.undef;
3576 num_coords = 4;
3577 }
3578
3579 return lp_build_gather_values(gallivm, coords, num_coords);
3580 }
3581
3582 /**
3583 * Append the extra mode bits that are used by image load and store.
3584 */
3585 static void image_append_args(
3586 struct si_shader_context *ctx,
3587 struct lp_build_emit_data * emit_data,
3588 unsigned target,
3589 bool atomic,
3590 bool force_glc)
3591 {
3592 const struct tgsi_full_instruction *inst = emit_data->inst;
3593 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3594 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3595 LLVMValueRef r128 = i1false;
3596 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3597 LLVMValueRef glc =
3598 force_glc ||
3599 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3600 i1true : i1false;
3601 LLVMValueRef slc = i1false;
3602 LLVMValueRef lwe = i1false;
3603
3604 if (atomic || (HAVE_LLVM <= 0x0309)) {
3605 emit_data->args[emit_data->arg_count++] = r128;
3606 emit_data->args[emit_data->arg_count++] = da;
3607 if (!atomic) {
3608 emit_data->args[emit_data->arg_count++] = glc;
3609 }
3610 emit_data->args[emit_data->arg_count++] = slc;
3611 return;
3612 }
3613
3614 /* HAVE_LLVM >= 0x0400 */
3615 emit_data->args[emit_data->arg_count++] = glc;
3616 emit_data->args[emit_data->arg_count++] = slc;
3617 emit_data->args[emit_data->arg_count++] = lwe;
3618 emit_data->args[emit_data->arg_count++] = da;
3619 }
3620
3621 /**
3622 * Append the resource and indexing arguments for buffer intrinsics.
3623 *
3624 * \param rsrc the v4i32 buffer resource
3625 * \param index index into the buffer (stride-based)
3626 * \param offset byte offset into the buffer
3627 */
3628 static void buffer_append_args(
3629 struct si_shader_context *ctx,
3630 struct lp_build_emit_data *emit_data,
3631 LLVMValueRef rsrc,
3632 LLVMValueRef index,
3633 LLVMValueRef offset,
3634 bool atomic,
3635 bool force_glc)
3636 {
3637 const struct tgsi_full_instruction *inst = emit_data->inst;
3638 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3639 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3640
3641 emit_data->args[emit_data->arg_count++] = rsrc;
3642 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3643 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3644 if (!atomic) {
3645 emit_data->args[emit_data->arg_count++] =
3646 force_glc ||
3647 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3648 i1true : i1false; /* glc */
3649 }
3650 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3651 }
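/* Two call patterns appear below: shader buffers pass vindex = 0 and put the
 * byte offset in voffset, while texel buffers (TGSI_TEXTURE_BUFFER) pass the
 * texel index in vindex and voffset = 0, letting the hardware apply the
 * stride from the descriptor.
 */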
3652
3653 static void load_fetch_args(
3654 struct lp_build_tgsi_context * bld_base,
3655 struct lp_build_emit_data * emit_data)
3656 {
3657 struct si_shader_context *ctx = si_shader_context(bld_base);
3658 struct gallivm_state *gallivm = &ctx->gallivm;
3659 const struct tgsi_full_instruction * inst = emit_data->inst;
3660 unsigned target = inst->Memory.Texture;
3661 LLVMValueRef rsrc;
3662
3663 emit_data->dst_type = ctx->v4f32;
3664
3665 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3666 LLVMBuilderRef builder = gallivm->builder;
3667 LLVMValueRef offset;
3668 LLVMValueRef tmp;
3669
3670 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3671
3672 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3673 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3674
3675 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3676 offset, false, false);
3677 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3678 LLVMValueRef coords;
3679
3680 image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
3681 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
3682
3683 if (target == TGSI_TEXTURE_BUFFER) {
3684 buffer_append_args(ctx, emit_data, rsrc, coords,
3685 ctx->i32_0, false, false);
3686 } else {
3687 emit_data->args[0] = coords;
3688 emit_data->args[1] = rsrc;
3689 emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3690 emit_data->arg_count = 3;
3691
3692 image_append_args(ctx, emit_data, target, false, false);
3693 }
3694 }
3695 }
3696
3697 static unsigned get_load_intr_attribs(bool readonly_memory)
3698 {
3699 /* READNONE means the intrinsic reads no memory, so writes can't affect
3700  * its result; READONLY only means the intrinsic itself doesn't write. */
3701 return readonly_memory && HAVE_LLVM >= 0x0400 ?
3702 LP_FUNC_ATTR_READNONE :
3703 LP_FUNC_ATTR_READONLY;
3704 }
3705
3706 static unsigned get_store_intr_attribs(bool writeonly_memory)
3707 {
3708 return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3709 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3710 LP_FUNC_ATTR_WRITEONLY;
3711 }
3712
3713 static void load_emit_buffer(struct si_shader_context *ctx,
3714 struct lp_build_emit_data *emit_data,
3715 bool readonly_memory)
3716 {
3717 const struct tgsi_full_instruction *inst = emit_data->inst;
3718 struct gallivm_state *gallivm = &ctx->gallivm;
3719 LLVMBuilderRef builder = gallivm->builder;
3720 uint writemask = inst->Dst[0].Register.WriteMask;
3721 uint count = util_last_bit(writemask);
3722 const char *intrinsic_name;
3723 LLVMTypeRef dst_type;
3724
3725 switch (count) {
3726 case 1:
3727 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3728 dst_type = ctx->f32;
3729 break;
3730 case 2:
3731 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3732 dst_type = LLVMVectorType(ctx->f32, 2);
3733 break;
3734 default: // 3 & 4
3735 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3736 dst_type = ctx->v4f32;
3737 count = 4;
3738 }
3739
3740 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3741 builder, intrinsic_name, dst_type,
3742 emit_data->args, emit_data->arg_count,
3743 get_load_intr_attribs(readonly_memory));
3744 }
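/* Example of the widening above: a LOAD with writemask .xyz has
 * util_last_bit() == 3, which falls into the default case and is issued as
 * a single llvm.amdgcn.buffer.load.v4f32; the unused .w channel is simply
 * never stored to the destination.
 */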
3745
3746 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3747 const struct tgsi_full_instruction *inst,
3748 LLVMTypeRef type, int arg)
3749 {
3750 struct gallivm_state *gallivm = &ctx->gallivm;
3751 LLVMBuilderRef builder = gallivm->builder;
3752 LLVMValueRef offset, ptr;
3753 int addr_space;
3754
3755 offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3756 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3757
3758 ptr = ctx->shared_memory;
3759 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3760 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3761 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3762
3763 return ptr;
3764 }
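/* A sketch of the resulting address arithmetic, assuming ctx->shared_memory
 * is a byte-addressed LDS pointer: the returned value is effectively
 * (type *)&shared_memory[offset], and callers then GEP per channel, so
 * channel c of an f32 access touches bytes offset + 4*c .. offset + 4*c + 3.
 */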
3765
3766 static void load_emit_memory(
3767 struct si_shader_context *ctx,
3768 struct lp_build_emit_data *emit_data)
3769 {
3770 const struct tgsi_full_instruction *inst = emit_data->inst;
3771 struct gallivm_state *gallivm = &ctx->gallivm;
3772 LLVMBuilderRef builder = gallivm->builder;
3773 unsigned writemask = inst->Dst[0].Register.WriteMask;
3774 LLVMValueRef channels[4], ptr, derived_ptr, index;
3775 int chan;
3776
3777 ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3778
3779 for (chan = 0; chan < 4; ++chan) {
3780 if (!(writemask & (1 << chan))) {
3781 channels[chan] = LLVMGetUndef(ctx->f32);
3782 continue;
3783 }
3784
3785 index = LLVMConstInt(ctx->i32, chan, 0);
3786 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3787 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3788 }
3789 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3790 }
3791
3792 /**
3793 * Return true if the memory accessed by a LOAD or STORE instruction is
3794 * read-only or write-only, respectively.
3795 *
3796 * \param shader_buffers_reverse_access_mask
3797 * For LOAD, set this to (store | atomic) slot usage in the shader.
3798 * For STORE, set this to (load | atomic) slot usage in the shader.
3799 * \param images_reverse_access_mask Same as above, but for images.
3800 */
3801 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3802 const struct tgsi_shader_info *info,
3803 unsigned shader_buffers_reverse_access_mask,
3804 unsigned images_reverse_access_mask)
3805 {
3806 /* RESTRICT means NOALIAS.
3807 * If there are no writes, we can assume the accessed memory is read-only.
3808 * If there are no reads, we can assume the accessed memory is write-only.
3809 */
3810 if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3811 unsigned reverse_access_mask;
3812
3813 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3814 reverse_access_mask = shader_buffers_reverse_access_mask;
3815 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3816 reverse_access_mask = info->images_buffers &
3817 images_reverse_access_mask;
3818 } else {
3819 reverse_access_mask = ~info->images_buffers &
3820 images_reverse_access_mask;
3821 }
3822
3823 if (inst->Src[0].Register.Indirect) {
3824 if (!reverse_access_mask)
3825 return true;
3826 } else {
3827 if (!(reverse_access_mask &
3828 (1u << inst->Src[0].Register.Index)))
3829 return true;
3830 }
3831 }
3832
3833 /* If there are no buffer writes (for both shader buffers & image
3834 * buffers), it implies that buffer memory is read-only.
3835 * If there are no buffer reads (for both shader buffers & image
3836 * buffers), it implies that buffer memory is write-only.
3837 *
3838 * Same for the case when there are no writes/reads for non-buffer
3839 * images.
3840 */
3841 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3842 (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3843 inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3844 if (!shader_buffers_reverse_access_mask &&
3845 !(info->images_buffers & images_reverse_access_mask))
3846 return true;
3847 } else {
3848 if (!(~info->images_buffers & images_reverse_access_mask))
3849 return true;
3850 }
3851 return false;
3852 }
3853
3854 static void load_emit(
3855 const struct lp_build_tgsi_action *action,
3856 struct lp_build_tgsi_context *bld_base,
3857 struct lp_build_emit_data *emit_data)
3858 {
3859 struct si_shader_context *ctx = si_shader_context(bld_base);
3860 struct gallivm_state *gallivm = &ctx->gallivm;
3861 LLVMBuilderRef builder = gallivm->builder;
3862 const struct tgsi_full_instruction * inst = emit_data->inst;
3863 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
3864 char intrinsic_name[64];
3865 bool readonly_memory = false;
3866
3867 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3868 load_emit_memory(ctx, emit_data);
3869 return;
3870 }
3871
3872 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3873 emit_waitcnt(ctx, VM_CNT);
3874
3875 readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
3876 is_oneway_access_only(inst, info,
3877 info->shader_buffers_store |
3878 info->shader_buffers_atomic,
3879 info->images_store |
3880 info->images_atomic);
3881
3882 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3883 load_emit_buffer(ctx, emit_data, readonly_memory);
3884 return;
3885 }
3886
3887 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3888 emit_data->output[emit_data->chan] =
3889 lp_build_intrinsic(
3890 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3891 emit_data->args, emit_data->arg_count,
3892 get_load_intr_attribs(readonly_memory));
3893 } else {
3894 ac_get_image_intr_name("llvm.amdgcn.image.load",
3895 emit_data->dst_type, /* vdata */
3896 LLVMTypeOf(emit_data->args[0]), /* coords */
3897 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3898 intrinsic_name, sizeof(intrinsic_name));
3899
3900 emit_data->output[emit_data->chan] =
3901 lp_build_intrinsic(
3902 builder, intrinsic_name, emit_data->dst_type,
3903 emit_data->args, emit_data->arg_count,
3904 get_load_intr_attribs(readonly_memory));
3905 }
3906 }
3907
3908 static void store_fetch_args(
3909 struct lp_build_tgsi_context * bld_base,
3910 struct lp_build_emit_data * emit_data)
3911 {
3912 struct si_shader_context *ctx = si_shader_context(bld_base);
3913 struct gallivm_state *gallivm = &ctx->gallivm;
3914 LLVMBuilderRef builder = gallivm->builder;
3915 const struct tgsi_full_instruction * inst = emit_data->inst;
3916 struct tgsi_full_src_register memory;
3917 LLVMValueRef chans[4];
3918 LLVMValueRef data;
3919 LLVMValueRef rsrc;
3920 unsigned chan;
3921
3922 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3923
3924 for (chan = 0; chan < 4; ++chan) {
3925 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3926 }
3927 data = lp_build_gather_values(gallivm, chans, 4);
3928
3929 emit_data->args[emit_data->arg_count++] = data;
3930
3931 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3932
3933 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3934 LLVMValueRef offset;
3935 LLVMValueRef tmp;
3936
3937 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3938
3939 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3940 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
3941
3942 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
3943 offset, false, false);
3944 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3945 unsigned target = inst->Memory.Texture;
3946 LLVMValueRef coords;
3947
3948 /* 8bit/16bit TC L1 write corruption bug on SI.
3949 * All store opcodes not aligned to a dword are affected.
3950 *
3951 * The only way to get unaligned stores in radeonsi is through
3952 * shader images.
3953 */
3954 bool force_glc = ctx->screen->b.chip_class == SI;
3955
3956 image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
3957 coords = image_fetch_coords(bld_base, inst, 0, rsrc);
3958
3959 if (target == TGSI_TEXTURE_BUFFER) {
3960 buffer_append_args(ctx, emit_data, rsrc, coords,
3961 ctx->i32_0, false, force_glc);
3962 } else {
3963 emit_data->args[1] = coords;
3964 emit_data->args[2] = rsrc;
3965 emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
3966 emit_data->arg_count = 4;
3967
3968 image_append_args(ctx, emit_data, target, false, force_glc);
3969 }
3970 }
3971 }
3972
3973 static void store_emit_buffer(
3974 struct si_shader_context *ctx,
3975 struct lp_build_emit_data *emit_data,
3976 bool writeonly_memory)
3977 {
3978 const struct tgsi_full_instruction *inst = emit_data->inst;
3979 struct gallivm_state *gallivm = &ctx->gallivm;
3980 LLVMBuilderRef builder = gallivm->builder;
3981 LLVMValueRef base_data = emit_data->args[0];
3982 LLVMValueRef base_offset = emit_data->args[3];
3983 unsigned writemask = inst->Dst[0].Register.WriteMask;
3984
3985 while (writemask) {
3986 int start, count;
3987 const char *intrinsic_name;
3988 LLVMValueRef data;
3989 LLVMValueRef offset;
3990 LLVMValueRef tmp;
3991
3992 u_bit_scan_consecutive_range(&writemask, &start, &count);
3993
3994 /* Due to an LLVM limitation, split 3-element writes
3995 * into a 2-element and a 1-element write. */
3996 if (count == 3) {
3997 writemask |= 1 << (start + 2);
3998 count = 2;
3999 }
4000
4001 if (count == 4) {
4002 data = base_data;
4003 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
4004 } else if (count == 2) {
4005 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
4006
4007 tmp = LLVMBuildExtractElement(
4008 builder, base_data,
4009 LLVMConstInt(ctx->i32, start, 0), "");
4010 data = LLVMBuildInsertElement(
4011 builder, LLVMGetUndef(v2f32), tmp,
4012 ctx->i32_0, "");
4013
4014 tmp = LLVMBuildExtractElement(
4015 builder, base_data,
4016 LLVMConstInt(ctx->i32, start + 1, 0), "");
4017 data = LLVMBuildInsertElement(
4018 builder, data, tmp, ctx->i32_1, "");
4019
4020 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
4021 } else {
4022 assert(count == 1);
4023 data = LLVMBuildExtractElement(
4024 builder, base_data,
4025 LLVMConstInt(ctx->i32, start, 0), "");
4026 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
4027 }
4028
4029 offset = base_offset;
4030 if (start != 0) {
4031 offset = LLVMBuildAdd(
4032 builder, offset,
4033 LLVMConstInt(ctx->i32, start * 4, 0), "");
4034 }
4035
4036 emit_data->args[0] = data;
4037 emit_data->args[3] = offset;
4038
4039 lp_build_intrinsic(
4040 builder, intrinsic_name, emit_data->dst_type,
4041 emit_data->args, emit_data->arg_count,
4042 get_store_intr_attribs(writeonly_memory));
4043 }
4044 }
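/* Worked example of the splitting above: writemask 0x7 (.xyz) is first
 * scanned as one range of 3, split into a v2f32 store at base_offset with
 * the third bit re-queued, which then issues an f32 store at
 * base_offset + 8. Writemask 0x9 (.x and .w) similarly becomes two f32
 * stores at base_offset and base_offset + 12.
 */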
4045
4046 static void store_emit_memory(
4047 struct si_shader_context *ctx,
4048 struct lp_build_emit_data *emit_data)
4049 {
4050 const struct tgsi_full_instruction *inst = emit_data->inst;
4051 struct gallivm_state *gallivm = &ctx->gallivm;
4052 LLVMBuilderRef builder = gallivm->builder;
4053 unsigned writemask = inst->Dst[0].Register.WriteMask;
4054 LLVMValueRef ptr, derived_ptr, data, index;
4055 int chan;
4056
4057 ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
4058
4059 for (chan = 0; chan < 4; ++chan) {
4060 if (!(writemask & (1 << chan))) {
4061 continue;
4062 }
4063 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
4064 index = LLVMConstInt(ctx->i32, chan, 0);
4065 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
4066 LLVMBuildStore(builder, data, derived_ptr);
4067 }
4068 }
4069
4070 static void store_emit(
4071 const struct lp_build_tgsi_action *action,
4072 struct lp_build_tgsi_context *bld_base,
4073 struct lp_build_emit_data *emit_data)
4074 {
4075 struct si_shader_context *ctx = si_shader_context(bld_base);
4076 struct gallivm_state *gallivm = &ctx->gallivm;
4077 LLVMBuilderRef builder = gallivm->builder;
4078 const struct tgsi_full_instruction * inst = emit_data->inst;
4079 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
4080 unsigned target = inst->Memory.Texture;
4081 char intrinsic_name[64];
4082 bool writeonly_memory = false;
4083
4084 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
4085 store_emit_memory(ctx, emit_data);
4086 return;
4087 }
4088
4089 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
4090 emit_waitcnt(ctx, VM_CNT);
4091
4092 writeonly_memory = is_oneway_access_only(inst, info,
4093 info->shader_buffers_load |
4094 info->shader_buffers_atomic,
4095 info->images_load |
4096 info->images_atomic);
4097
4098 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4099 store_emit_buffer(ctx, emit_data, writeonly_memory);
4100 return;
4101 }
4102
4103 if (target == TGSI_TEXTURE_BUFFER) {
4104 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4105 builder, "llvm.amdgcn.buffer.store.format.v4f32",
4106 emit_data->dst_type, emit_data->args,
4107 emit_data->arg_count,
4108 get_store_intr_attribs(writeonly_memory));
4109 } else {
4110 ac_get_image_intr_name("llvm.amdgcn.image.store",
4111 LLVMTypeOf(emit_data->args[0]), /* vdata */
4112 LLVMTypeOf(emit_data->args[1]), /* coords */
4113 LLVMTypeOf(emit_data->args[2]), /* rsrc */
4114 intrinsic_name, sizeof(intrinsic_name));
4115
4116 emit_data->output[emit_data->chan] =
4117 lp_build_intrinsic(
4118 builder, intrinsic_name, emit_data->dst_type,
4119 emit_data->args, emit_data->arg_count,
4120 get_store_intr_attribs(writeonly_memory));
4121 }
4122 }
4123
4124 static void atomic_fetch_args(
4125 struct lp_build_tgsi_context * bld_base,
4126 struct lp_build_emit_data * emit_data)
4127 {
4128 struct si_shader_context *ctx = si_shader_context(bld_base);
4129 struct gallivm_state *gallivm = &ctx->gallivm;
4130 LLVMBuilderRef builder = gallivm->builder;
4131 const struct tgsi_full_instruction * inst = emit_data->inst;
4132 LLVMValueRef data1, data2;
4133 LLVMValueRef rsrc;
4134 LLVMValueRef tmp;
4135
4136 emit_data->dst_type = ctx->f32;
4137
4138 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4139 data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4140
4141 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4142 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4143 data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4144 }
4145
4146 	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4147 	 * of arguments, which is reversed relative to TGSI (and GLSL): the
4148 	 * exchange value is passed first, then the compare value. */
4149 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4150 emit_data->args[emit_data->arg_count++] = data2;
4151 emit_data->args[emit_data->arg_count++] = data1;
4152
4153 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4154 LLVMValueRef offset;
4155
4156 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4157
4158 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4159 offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4160
4161 buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
4162 offset, true, false);
4163 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4164 unsigned target = inst->Memory.Texture;
4165 LLVMValueRef coords;
4166
4167 image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
4168 coords = image_fetch_coords(bld_base, inst, 1, rsrc);
4169
4170 if (target == TGSI_TEXTURE_BUFFER) {
4171 buffer_append_args(ctx, emit_data, rsrc, coords,
4172 ctx->i32_0, true, false);
4173 } else {
4174 emit_data->args[emit_data->arg_count++] = coords;
4175 emit_data->args[emit_data->arg_count++] = rsrc;
4176
4177 image_append_args(ctx, emit_data, target, true, false);
4178 }
4179 }
4180 }
4181
4182 static void atomic_emit_memory(struct si_shader_context *ctx,
4183 struct lp_build_emit_data *emit_data) {
4184 struct gallivm_state *gallivm = &ctx->gallivm;
4185 LLVMBuilderRef builder = gallivm->builder;
4186 const struct tgsi_full_instruction * inst = emit_data->inst;
4187 LLVMValueRef ptr, result, arg;
4188
4189 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4190
4191 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4192 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4193
4194 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4195 LLVMValueRef new_data;
4196 new_data = lp_build_emit_fetch(&ctx->bld_base,
4197 inst, 3, 0);
4198
4199 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4200
4201 #if HAVE_LLVM >= 0x309
4202 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4203 LLVMAtomicOrderingSequentiallyConsistent,
4204 LLVMAtomicOrderingSequentiallyConsistent,
4205 false);
4206 #endif
4207
4208 result = LLVMBuildExtractValue(builder, result, 0, "");
4209 } else {
4210 LLVMAtomicRMWBinOp op;
4211
4212 switch(inst->Instruction.Opcode) {
4213 case TGSI_OPCODE_ATOMUADD:
4214 op = LLVMAtomicRMWBinOpAdd;
4215 break;
4216 case TGSI_OPCODE_ATOMXCHG:
4217 op = LLVMAtomicRMWBinOpXchg;
4218 break;
4219 case TGSI_OPCODE_ATOMAND:
4220 op = LLVMAtomicRMWBinOpAnd;
4221 break;
4222 case TGSI_OPCODE_ATOMOR:
4223 op = LLVMAtomicRMWBinOpOr;
4224 break;
4225 case TGSI_OPCODE_ATOMXOR:
4226 op = LLVMAtomicRMWBinOpXor;
4227 break;
4228 case TGSI_OPCODE_ATOMUMIN:
4229 op = LLVMAtomicRMWBinOpUMin;
4230 break;
4231 case TGSI_OPCODE_ATOMUMAX:
4232 op = LLVMAtomicRMWBinOpUMax;
4233 break;
4234 case TGSI_OPCODE_ATOMIMIN:
4235 op = LLVMAtomicRMWBinOpMin;
4236 break;
4237 case TGSI_OPCODE_ATOMIMAX:
4238 op = LLVMAtomicRMWBinOpMax;
4239 break;
4240 default:
4241 unreachable("unknown atomic opcode");
4242 }
4243
4244 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4245 LLVMAtomicOrderingSequentiallyConsistent,
4246 false);
4247 }
4248 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4249 }
4250
4251 static void atomic_emit(
4252 const struct lp_build_tgsi_action *action,
4253 struct lp_build_tgsi_context *bld_base,
4254 struct lp_build_emit_data *emit_data)
4255 {
4256 struct si_shader_context *ctx = si_shader_context(bld_base);
4257 struct gallivm_state *gallivm = &ctx->gallivm;
4258 LLVMBuilderRef builder = gallivm->builder;
4259 const struct tgsi_full_instruction * inst = emit_data->inst;
4260 char intrinsic_name[40];
4261 LLVMValueRef tmp;
4262
4263 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4264 atomic_emit_memory(ctx, emit_data);
4265 return;
4266 }
4267
4268 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4269 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4270 snprintf(intrinsic_name, sizeof(intrinsic_name),
4271 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4272 } else {
4273 LLVMValueRef coords;
4274 char coords_type[8];
4275
4276 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4277 coords = emit_data->args[2];
4278 else
4279 coords = emit_data->args[1];
4280
4281 ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4282 snprintf(intrinsic_name, sizeof(intrinsic_name),
4283 "llvm.amdgcn.image.atomic.%s.%s",
4284 action->intr_name, coords_type);
4285 }
4286
4287 tmp = lp_build_intrinsic(
4288 builder, intrinsic_name, ctx->i32,
4289 emit_data->args, emit_data->arg_count, 0);
4290 emit_data->output[emit_data->chan] =
4291 LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4292 }
4293
4294 static void set_tex_fetch_args(struct si_shader_context *ctx,
4295 struct lp_build_emit_data *emit_data,
4296 unsigned target,
4297 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4298 LLVMValueRef *param, unsigned count,
4299 unsigned dmask)
4300 {
4301 struct gallivm_state *gallivm = &ctx->gallivm;
4302 struct ac_image_args args = {};
4303
4304 /* Pad to power of two vector */
4305 while (count < util_next_power_of_two(count))
4306 param[count++] = LLVMGetUndef(ctx->i32);
4307
4308 if (count > 1)
4309 args.addr = lp_build_gather_values(gallivm, param, count);
4310 else
4311 args.addr = param[0];
4312
4313 args.resource = res_ptr;
4314 args.sampler = samp_ptr;
4315 args.dmask = dmask;
4316 args.unorm = target == TGSI_TEXTURE_RECT ||
4317 target == TGSI_TEXTURE_SHADOWRECT;
4318 args.da = tgsi_is_array_sampler(target);
4319
4320 /* Ugly, but we seem to have no other choice right now. */
4321 STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
4322 memcpy(emit_data->args, &args, sizeof(args));
4323 }
4324
4325 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4326 unsigned target, LLVMValueRef out)
4327 {
4328 LLVMBuilderRef builder = ctx->gallivm.builder;
4329
4330 /* 1D textures are allocated and used as 2D on GFX9. */
4331 if (ctx->screen->b.chip_class >= GFX9 &&
4332 (target == TGSI_TEXTURE_1D_ARRAY ||
4333 target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4334 LLVMValueRef layers =
4335 LLVMBuildExtractElement(builder, out,
4336 LLVMConstInt(ctx->i32, 2, 0), "");
4337 out = LLVMBuildInsertElement(builder, out, layers,
4338 ctx->i32_1, "");
4339 }
4340
4341 /* Divide the number of layers by 6 to get the number of cubes. */
4342 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4343 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4344 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4345
4346 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4347 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4348
4349 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4350 }
4351 return out;
4352 }
4353
4354 static void resq_fetch_args(
4355 struct lp_build_tgsi_context * bld_base,
4356 struct lp_build_emit_data * emit_data)
4357 {
4358 struct si_shader_context *ctx = si_shader_context(bld_base);
4359 const struct tgsi_full_instruction *inst = emit_data->inst;
4360 const struct tgsi_full_src_register *reg = &inst->Src[0];
4361
4362 emit_data->dst_type = ctx->v4i32;
4363
4364 if (reg->Register.File == TGSI_FILE_BUFFER) {
4365 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4366 emit_data->arg_count = 1;
4367 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4368 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4369 &emit_data->args[0]);
4370 emit_data->arg_count = 1;
4371 } else {
4372 LLVMValueRef res_ptr;
4373 unsigned image_target;
4374
4375 if (inst->Memory.Texture == TGSI_TEXTURE_3D)
4376 image_target = TGSI_TEXTURE_2D_ARRAY;
4377 else
4378 image_target = inst->Memory.Texture;
4379
4380 image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
4381 &res_ptr);
4382 set_tex_fetch_args(ctx, emit_data, image_target,
4383 res_ptr, NULL, &ctx->i32_0, 1,
4384 0xf);
4385 }
4386 }
4387
4388 static void resq_emit(
4389 const struct lp_build_tgsi_action *action,
4390 struct lp_build_tgsi_context *bld_base,
4391 struct lp_build_emit_data *emit_data)
4392 {
4393 struct si_shader_context *ctx = si_shader_context(bld_base);
4394 struct gallivm_state *gallivm = &ctx->gallivm;
4395 LLVMBuilderRef builder = gallivm->builder;
4396 const struct tgsi_full_instruction *inst = emit_data->inst;
4397 LLVMValueRef out;
4398
4399 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4400 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4401 LLVMConstInt(ctx->i32, 2, 0), "");
4402 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4403 out = get_buffer_size(bld_base, emit_data->args[0]);
4404 } else {
4405 struct ac_image_args args;
4406
4407 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4408 args.opcode = ac_image_get_resinfo;
4409 out = ac_build_image_opcode(&ctx->ac, &args);
4410
4411 out = fix_resinfo(ctx, inst->Memory.Texture, out);
4412 }
4413
4414 emit_data->output[emit_data->chan] = out;
4415 }
4416
4417 static const struct lp_build_tgsi_action tex_action;
4418
4419 enum desc_type {
4420 DESC_IMAGE,
4421 DESC_BUFFER,
4422 DESC_FMASK,
4423 DESC_SAMPLER,
4424 };
4425
4426 /**
4427  * Load an image view, fmask view, or sampler state descriptor.
4428 */
4429 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4430 LLVMValueRef list, LLVMValueRef index,
4431 enum desc_type type)
4432 {
4433 struct gallivm_state *gallivm = &ctx->gallivm;
4434 LLVMBuilderRef builder = gallivm->builder;
4435
4436 switch (type) {
4437 case DESC_IMAGE:
4438 /* The image is at [0:7]. */
4439 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4440 break;
4441 case DESC_BUFFER:
4442 /* The buffer is in [4:7]. */
4443 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4444 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4445 list = LLVMBuildPointerCast(builder, list,
4446 const_array(ctx->v4i32, 0), "");
4447 break;
4448 case DESC_FMASK:
4449 /* The FMASK is at [8:15]. */
4450 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4451 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4452 break;
4453 case DESC_SAMPLER:
4454 /* The sampler state is at [12:15]. */
4455 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4456 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4457 list = LLVMBuildPointerCast(builder, list,
4458 const_array(ctx->v4i32, 0), "");
4459 break;
4460 }
4461
4462 return ac_build_indexed_load_const(&ctx->ac, list, index);
4463 }
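/* Layout sketch implied by the index math above: each sampler slot is 16
 * dwords. In v8i32 units, the image view is element 2*i and the FMASK view
 * element 2*i+1; recast to v4i32, the buffer view is element 4*i+1 (dwords
 * [4:7] of the slot) and the sampler state element 4*i+3 (dwords [12:15]).
 */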
4464
4465 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4466 *
4467 * SI-CI:
4468 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4469 * filtering manually. The driver sets img7 to a mask clearing
4470 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4471 * s_and_b32 samp0, samp0, img7
4472 *
4473 * VI:
4474 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4475 */
4476 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4477 LLVMValueRef res, LLVMValueRef samp)
4478 {
4479 LLVMBuilderRef builder = ctx->gallivm.builder;
4480 LLVMValueRef img7, samp0;
4481
4482 if (ctx->screen->b.chip_class >= VI)
4483 return samp;
4484
4485 img7 = LLVMBuildExtractElement(builder, res,
4486 LLVMConstInt(ctx->i32, 7, 0), "");
4487 samp0 = LLVMBuildExtractElement(builder, samp,
4488 ctx->i32_0, "");
4489 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4490 return LLVMBuildInsertElement(builder, samp, samp0,
4491 ctx->i32_0, "");
4492 }
4493
4494 static void tex_fetch_ptrs(
4495 struct lp_build_tgsi_context *bld_base,
4496 struct lp_build_emit_data *emit_data,
4497 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4498 {
4499 struct si_shader_context *ctx = si_shader_context(bld_base);
4500 LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
4501 const struct tgsi_full_instruction *inst = emit_data->inst;
4502 const struct tgsi_full_src_register *reg;
4503 unsigned target = inst->Texture.Texture;
4504 unsigned sampler_src;
4505 LLVMValueRef index;
4506
4507 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4508 reg = &emit_data->inst->Src[sampler_src];
4509
4510 if (reg->Register.Indirect) {
4511 index = get_bounded_indirect_index(ctx,
4512 &reg->Indirect,
4513 reg->Register.Index,
4514 SI_NUM_SAMPLERS);
4515 } else {
4516 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
4517 }
4518
4519 if (target == TGSI_TEXTURE_BUFFER)
4520 *res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
4521 else
4522 *res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);
4523
4524 if (samp_ptr)
4525 *samp_ptr = NULL;
4526 if (fmask_ptr)
4527 *fmask_ptr = NULL;
4528
4529 if (target == TGSI_TEXTURE_2D_MSAA ||
4530 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4531 if (fmask_ptr)
4532 *fmask_ptr = load_sampler_desc(ctx, list, index,
4533 DESC_FMASK);
4534 } else if (target != TGSI_TEXTURE_BUFFER) {
4535 if (samp_ptr) {
4536 *samp_ptr = load_sampler_desc(ctx, list, index,
4537 DESC_SAMPLER);
4538 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4539 }
4540 }
4541 }
4542
4543 static void txq_fetch_args(
4544 struct lp_build_tgsi_context *bld_base,
4545 struct lp_build_emit_data *emit_data)
4546 {
4547 struct si_shader_context *ctx = si_shader_context(bld_base);
4548 const struct tgsi_full_instruction *inst = emit_data->inst;
4549 unsigned target = inst->Texture.Texture;
4550 LLVMValueRef res_ptr;
4551 LLVMValueRef address;
4552
4553 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4554
4555 if (target == TGSI_TEXTURE_BUFFER) {
4556 /* Read the size from the buffer descriptor directly. */
4557 emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
4558 return;
4559 }
4560
4561 /* Textures - set the mip level. */
4562 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4563
4564 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4565 NULL, &address, 1, 0xf);
4566 }
4567
4568 static void txq_emit(const struct lp_build_tgsi_action *action,
4569 struct lp_build_tgsi_context *bld_base,
4570 struct lp_build_emit_data *emit_data)
4571 {
4572 struct si_shader_context *ctx = si_shader_context(bld_base);
4573 struct ac_image_args args;
4574 unsigned target = emit_data->inst->Texture.Texture;
4575
4576 if (target == TGSI_TEXTURE_BUFFER) {
4577 /* Just return the buffer size. */
4578 emit_data->output[emit_data->chan] = emit_data->args[0];
4579 return;
4580 }
4581
4582 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4583
4584 args.opcode = ac_image_get_resinfo;
4585 LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4586
4587 emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4588 }
4589
4590 static void tex_fetch_args(
4591 struct lp_build_tgsi_context *bld_base,
4592 struct lp_build_emit_data *emit_data)
4593 {
4594 struct si_shader_context *ctx = si_shader_context(bld_base);
4595 struct gallivm_state *gallivm = &ctx->gallivm;
4596 const struct tgsi_full_instruction *inst = emit_data->inst;
4597 unsigned opcode = inst->Instruction.Opcode;
4598 unsigned target = inst->Texture.Texture;
4599 LLVMValueRef coords[5], derivs[6];
4600 LLVMValueRef address[16];
4601 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4602 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4603 unsigned count = 0;
4604 unsigned chan;
4605 unsigned num_deriv_channels = 0;
4606 bool has_offset = inst->Texture.NumOffsets > 0;
4607 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4608 unsigned dmask = 0xf;
4609
4610 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4611
4612 if (target == TGSI_TEXTURE_BUFFER) {
4613 emit_data->dst_type = ctx->v4f32;
4614 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
4615 ctx->v16i8, "");
4616 emit_data->args[1] = ctx->i32_0;
4617 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4618 emit_data->arg_count = 3;
4619 return;
4620 }
4621
4622 /* Fetch and project texture coordinates */
4623 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4624 for (chan = 0; chan < 3; chan++ ) {
4625 coords[chan] = lp_build_emit_fetch(bld_base,
4626 emit_data->inst, 0,
4627 chan);
4628 if (opcode == TGSI_OPCODE_TXP)
4629 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4630 TGSI_OPCODE_DIV,
4631 coords[chan],
4632 coords[3]);
4633 }
4634
4635 if (opcode == TGSI_OPCODE_TXP)
4636 coords[3] = bld_base->base.one;
4637
4638 /* Pack offsets. */
4639 if (has_offset &&
4640 opcode != TGSI_OPCODE_TXF &&
4641 opcode != TGSI_OPCODE_TXF_LZ) {
4642 /* The offsets are six-bit signed integers packed like this:
4643 * X=[5:0], Y=[13:8], and Z=[21:16].
4644 */
4645 LLVMValueRef offset[3], pack;
4646
4647 assert(inst->Texture.NumOffsets == 1);
4648
4649 for (chan = 0; chan < 3; chan++) {
4650 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4651 emit_data->inst, 0, chan);
4652 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4653 LLVMConstInt(ctx->i32, 0x3f, 0), "");
4654 if (chan)
4655 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4656 LLVMConstInt(ctx->i32, chan*8, 0), "");
4657 }
4658
4659 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4660 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4661 address[count++] = pack;
4662 }
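/* Worked example of the packing above, with offsets (1, -2, 3):
 * x = 1 & 0x3f = 0x01, y = (-2 & 0x3f) << 8 = 0x3e00,
 * z = (3 & 0x3f) << 16 = 0x30000, so pack = 0x33e01.
 */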
4663
4664 /* Pack LOD bias value */
4665 if (opcode == TGSI_OPCODE_TXB)
4666 address[count++] = coords[3];
4667 if (opcode == TGSI_OPCODE_TXB2)
4668 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4669
4670 /* Pack depth comparison value */
4671 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4672 LLVMValueRef z;
4673
4674 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4675 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4676 } else {
4677 assert(ref_pos >= 0);
4678 z = coords[ref_pos];
4679 }
4680
4681 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4682 * so the depth comparison value isn't clamped for Z16 and
4683 * Z24 anymore. Do it manually here.
4684 *
4685 * It's unnecessary if the original texture format was
4686 * Z32_FLOAT, but we don't know that here.
4687 */
4688 if (ctx->screen->b.chip_class == VI)
4689 z = ac_build_clamp(&ctx->ac, z);
4690
4691 address[count++] = z;
4692 }
4693
4694 /* Pack user derivatives */
4695 if (opcode == TGSI_OPCODE_TXD) {
4696 int param, num_src_deriv_channels, num_dst_deriv_channels;
4697
4698 switch (target) {
4699 case TGSI_TEXTURE_3D:
4700 num_src_deriv_channels = 3;
4701 num_dst_deriv_channels = 3;
4702 num_deriv_channels = 3;
4703 break;
4704 case TGSI_TEXTURE_2D:
4705 case TGSI_TEXTURE_SHADOW2D:
4706 case TGSI_TEXTURE_RECT:
4707 case TGSI_TEXTURE_SHADOWRECT:
4708 case TGSI_TEXTURE_2D_ARRAY:
4709 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4710 num_src_deriv_channels = 2;
4711 num_dst_deriv_channels = 2;
4712 num_deriv_channels = 2;
4713 break;
4714 case TGSI_TEXTURE_CUBE:
4715 case TGSI_TEXTURE_SHADOWCUBE:
4716 case TGSI_TEXTURE_CUBE_ARRAY:
4717 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4718 /* Cube derivatives will be converted to 2D. */
4719 num_src_deriv_channels = 3;
4720 num_dst_deriv_channels = 3;
4721 num_deriv_channels = 2;
4722 break;
4723 case TGSI_TEXTURE_1D:
4724 case TGSI_TEXTURE_SHADOW1D:
4725 case TGSI_TEXTURE_1D_ARRAY:
4726 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4727 num_src_deriv_channels = 1;
4728
4729 /* 1D textures are allocated and used as 2D on GFX9. */
4730 if (ctx->screen->b.chip_class >= GFX9) {
4731 num_dst_deriv_channels = 2;
4732 num_deriv_channels = 2;
4733 } else {
4734 num_dst_deriv_channels = 1;
4735 num_deriv_channels = 1;
4736 }
4737 break;
4738 default:
4739 unreachable("invalid target");
4740 }
4741
4742 for (param = 0; param < 2; param++) {
4743 for (chan = 0; chan < num_src_deriv_channels; chan++)
4744 derivs[param * num_dst_deriv_channels + chan] =
4745 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4746
4747 /* Fill in the rest with zeros. */
4748 for (chan = num_src_deriv_channels;
4749 chan < num_dst_deriv_channels; chan++)
4750 derivs[param * num_dst_deriv_channels + chan] =
4751 bld_base->base.zero;
4752 }
4753 }
4754
4755 if (target == TGSI_TEXTURE_CUBE ||
4756 target == TGSI_TEXTURE_CUBE_ARRAY ||
4757 target == TGSI_TEXTURE_SHADOWCUBE ||
4758 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4759 ac_prepare_cube_coords(&ctx->ac,
4760 opcode == TGSI_OPCODE_TXD,
4761 target == TGSI_TEXTURE_CUBE_ARRAY ||
4762 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4763 coords, derivs);
4764
4765 if (opcode == TGSI_OPCODE_TXD)
4766 for (int i = 0; i < num_deriv_channels * 2; i++)
4767 address[count++] = derivs[i];
4768
4769 /* Pack texture coordinates */
4770 address[count++] = coords[0];
4771 if (num_coords > 1)
4772 address[count++] = coords[1];
4773 if (num_coords > 2)
4774 address[count++] = coords[2];
4775
4776 /* 1D textures are allocated and used as 2D on GFX9. */
4777 if (ctx->screen->b.chip_class >= GFX9) {
4778 LLVMValueRef filler;
4779
4780 /* Use 0.5, so that we don't sample the border color. */
4781 if (opcode == TGSI_OPCODE_TXF)
4782 filler = ctx->i32_0;
4783 else
4784 filler = LLVMConstReal(ctx->f32, 0.5);
4785
4786 if (target == TGSI_TEXTURE_1D ||
4787 target == TGSI_TEXTURE_SHADOW1D) {
4788 address[count++] = filler;
4789 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4790 target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4791 address[count] = address[count - 1];
4792 address[count - 1] = filler;
4793 count++;
4794 }
4795 }
4796
4797 /* Pack LOD or sample index */
4798 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4799 address[count++] = coords[3];
4800 else if (opcode == TGSI_OPCODE_TXL2)
4801 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4802
4803 if (count > 16) {
4804 assert(!"Cannot handle more than 16 texture address parameters");
4805 count = 16;
4806 }
4807
4808 for (chan = 0; chan < count; chan++ ) {
4809 address[chan] = LLVMBuildBitCast(gallivm->builder,
4810 address[chan], ctx->i32, "");
4811 }
4812
4813 /* Adjust the sample index according to FMASK.
4814 *
4815 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4816 * which is the identity mapping. Each nibble says which physical sample
4817 * should be fetched to get that sample.
4818 *
4819 * For example, 0x11111100 means there are only 2 samples stored and
4820 * the second sample covers 3/4 of the pixel. When reading samples 0
4821 * and 1, return physical sample 0 (determined by the first two 0s
4822 * in FMASK), otherwise return physical sample 1.
4823 *
4824 * The sample index should be adjusted as follows:
4825 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4826 */
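/* Continuing the example above: with fmask = 0x11111100, a request for
 * sample 1 computes (0x11111100 >> 4) & 0xF = 0, while a request for
 * sample 2 computes (0x11111100 >> 8) & 0xF = 1.
 */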
4827 if (target == TGSI_TEXTURE_2D_MSAA ||
4828 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4829 struct lp_build_emit_data txf_emit_data = *emit_data;
4830 LLVMValueRef txf_address[4];
4831 /* We only need .xy for non-arrays, and .xyz for arrays. */
4832 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4833 struct tgsi_full_instruction inst = {};
4834
4835 memcpy(txf_address, address, sizeof(txf_address));
4836
4837 /* Read FMASK using TXF_LZ. */
4838 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4839 inst.Texture.Texture = target;
4840 txf_emit_data.inst = &inst;
4841 txf_emit_data.chan = 0;
4842 set_tex_fetch_args(ctx, &txf_emit_data,
4843 target, fmask_ptr, NULL,
4844 txf_address, txf_count, 0xf);
4845 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4846
4847 /* Initialize some constants. */
4848 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4849 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4850
4851 /* Apply the formula. */
4852 LLVMValueRef fmask =
4853 LLVMBuildExtractElement(gallivm->builder,
4854 txf_emit_data.output[0],
4855 ctx->i32_0, "");
4856
4857 unsigned sample_chan = txf_count; /* the sample index is last */
4858
4859 LLVMValueRef sample_index4 =
4860 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4861
4862 LLVMValueRef shifted_fmask =
4863 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4864
4865 LLVMValueRef final_sample =
4866 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4867
4868 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4869 		 * resource descriptor is 0 (invalid).
4870 */
4871 LLVMValueRef fmask_desc =
4872 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4873 ctx->v8i32, "");
4874
4875 LLVMValueRef fmask_word1 =
4876 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4877 ctx->i32_1, "");
4878
4879 LLVMValueRef word1_is_nonzero =
4880 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4881 fmask_word1, ctx->i32_0, "");
4882
4883 /* Replace the MSAA sample index. */
4884 address[sample_chan] =
4885 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4886 final_sample, address[sample_chan], "");
4887 }
4888
4889 if (opcode == TGSI_OPCODE_TXF ||
4890 opcode == TGSI_OPCODE_TXF_LZ) {
4891 /* add tex offsets */
4892 if (inst->Texture.NumOffsets) {
4893 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4894 const struct tgsi_texture_offset *off = inst->TexOffsets;
4895
4896 assert(inst->Texture.NumOffsets == 1);
4897
4898 switch (target) {
4899 case TGSI_TEXTURE_3D:
4900 address[2] = lp_build_add(uint_bld, address[2],
4901 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4902 /* fall through */
4903 case TGSI_TEXTURE_2D:
4904 case TGSI_TEXTURE_SHADOW2D:
4905 case TGSI_TEXTURE_RECT:
4906 case TGSI_TEXTURE_SHADOWRECT:
4907 case TGSI_TEXTURE_2D_ARRAY:
4908 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4909 address[1] =
4910 lp_build_add(uint_bld, address[1],
4911 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4912 /* fall through */
4913 case TGSI_TEXTURE_1D:
4914 case TGSI_TEXTURE_SHADOW1D:
4915 case TGSI_TEXTURE_1D_ARRAY:
4916 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4917 address[0] =
4918 lp_build_add(uint_bld, address[0],
4919 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4920 break;
4921 /* texture offsets do not apply to other texture targets */
4922 }
4923 }
4924 }
4925
4926 if (opcode == TGSI_OPCODE_TG4) {
4927 unsigned gather_comp = 0;
4928
4929 /* DMASK was repurposed for GATHER4. 4 components are always
4930 * returned and DMASK works like a swizzle - it selects
4931 * the component to fetch. The only valid DMASK values are
4932 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4933 * (red,red,red,red) etc.) The ISA document doesn't mention
4934 * this.
4935 */
4936
4937 /* Get the component index from src1.x for Gather4. */
4938 if (!tgsi_is_shadow_target(target)) {
4939 LLVMValueRef comp_imm;
4940 struct tgsi_src_register src1 = inst->Src[1].Register;
4941
4942 assert(src1.File == TGSI_FILE_IMMEDIATE);
4943
4944 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
4945 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4946 gather_comp = CLAMP(gather_comp, 0, 3);
4947 }
4948
4949 dmask = 1 << gather_comp;
4950 }
4951
4952 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4953 samp_ptr, address, count, dmask);
4954 }
4955
4956 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
4957 * incorrectly forces nearest filtering if the texture format is integer.
4958 * The only effect it has on Gather4, which always returns 4 texels for
4959 * bilinear filtering, is that the final coordinates are off by 0.5 of
4960 * the texel size.
4961 *
4962 * The workaround is to subtract 0.5 from the unnormalized coordinates,
4963 * or (0.5 / size) from the normalized coordinates.
4964 */
4965 static void si_lower_gather4_integer(struct si_shader_context *ctx,
4966 struct ac_image_args *args,
4967 unsigned target)
4968 {
4969 LLVMBuilderRef builder = ctx->gallivm.builder;
4970 LLVMValueRef coord = args->addr;
4971 LLVMValueRef half_texel[2];
4972 /* Texture coordinates start after:
4973 * {offset, bias, z-compare, derivatives}
4974 * Only the offset and z-compare can occur here.
4975 */
4976 unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
4977 int c;
4978
4979 if (target == TGSI_TEXTURE_RECT ||
4980 target == TGSI_TEXTURE_SHADOWRECT) {
4981 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
4982 } else {
4983 struct tgsi_full_instruction txq_inst = {};
4984 struct lp_build_emit_data txq_emit_data = {};
4985
4986 /* Query the texture size. */
4987 txq_inst.Texture.Texture = target;
4988 txq_emit_data.inst = &txq_inst;
4989 txq_emit_data.dst_type = ctx->v4i32;
4990 set_tex_fetch_args(ctx, &txq_emit_data, target,
4991 args->resource, NULL, &ctx->i32_0,
4992 1, 0xf);
4993 txq_emit(NULL, &ctx->bld_base, &txq_emit_data);
4994
4995 /* Compute -0.5 / size. */
4996 for (c = 0; c < 2; c++) {
4997 half_texel[c] =
4998 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
4999 LLVMConstInt(ctx->i32, c, 0), "");
5000 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
5001 half_texel[c] =
5002 lp_build_emit_llvm_unary(&ctx->bld_base,
5003 TGSI_OPCODE_RCP, half_texel[c]);
5004 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
5005 LLVMConstReal(ctx->f32, -0.5), "");
5006 }
5007 }
5008
5009 for (c = 0; c < 2; c++) {
5010 LLVMValueRef tmp;
5011 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
5012
5013 tmp = LLVMBuildExtractElement(builder, coord, index, "");
5014 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
5015 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
5016 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
5017 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
5018 }
5019
5020 args->addr = coord;
5021 }
5022
5023 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
5024 struct lp_build_tgsi_context *bld_base,
5025 struct lp_build_emit_data *emit_data)
5026 {
5027 struct si_shader_context *ctx = si_shader_context(bld_base);
5028 const struct tgsi_full_instruction *inst = emit_data->inst;
5029 struct ac_image_args args;
5030 unsigned opcode = inst->Instruction.Opcode;
5031 unsigned target = inst->Texture.Texture;
5032
5033 if (target == TGSI_TEXTURE_BUFFER) {
5034 emit_data->output[emit_data->chan] =
5035 ac_build_buffer_load_format(&ctx->ac,
5036 emit_data->args[0],
5037 emit_data->args[2],
5038 emit_data->args[1],
5039 true);
5040 return;
5041 }
5042
5043 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
5044
5045 args.opcode = ac_image_sample;
5046 args.compare = tgsi_is_shadow_target(target);
5047 args.offset = inst->Texture.NumOffsets > 0;
5048
5049 switch (opcode) {
5050 case TGSI_OPCODE_TXF:
5051 case TGSI_OPCODE_TXF_LZ:
5052 args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
5053 target == TGSI_TEXTURE_2D_MSAA ||
5054 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
5055 ac_image_load : ac_image_load_mip;
5056 args.compare = false;
5057 args.offset = false;
5058 break;
5059 case TGSI_OPCODE_LODQ:
5060 args.opcode = ac_image_get_lod;
5061 args.compare = false;
5062 args.offset = false;
5063 break;
5064 case TGSI_OPCODE_TEX:
5065 case TGSI_OPCODE_TEX2:
5066 case TGSI_OPCODE_TXP:
5067 if (ctx->type != PIPE_SHADER_FRAGMENT)
5068 args.level_zero = true;
5069 break;
5070 case TGSI_OPCODE_TEX_LZ:
5071 args.level_zero = true;
5072 break;
5073 case TGSI_OPCODE_TXB:
5074 case TGSI_OPCODE_TXB2:
5075 assert(ctx->type == PIPE_SHADER_FRAGMENT);
5076 args.bias = true;
5077 break;
5078 case TGSI_OPCODE_TXL:
5079 case TGSI_OPCODE_TXL2:
5080 args.lod = true;
5081 break;
5082 case TGSI_OPCODE_TXD:
5083 args.deriv = true;
5084 break;
5085 case TGSI_OPCODE_TG4:
5086 args.opcode = ac_image_gather4;
5087 args.level_zero = true;
5088 break;
5089 default:
5090 assert(0);
5091 return;
5092 }
5093
5094 /* The hardware needs special lowering for Gather4 with integer formats. */
5095 if (ctx->screen->b.chip_class <= VI &&
5096 opcode == TGSI_OPCODE_TG4) {
5097 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5098 /* This will also work with non-constant indexing because of how
5099 * glsl_to_tgsi works and we intend to preserve that behavior.
5100 */
5101 const unsigned src_idx = 2;
5102 unsigned sampler = inst->Src[src_idx].Register.Index;
5103
5104 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
5105
5106 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
5107 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
5108 si_lower_gather4_integer(ctx, &args, target);
5109 }
5110
5111 emit_data->output[emit_data->chan] =
5112 ac_build_image_opcode(&ctx->ac, &args);
5113 }
5114
5115 static void si_llvm_emit_txqs(
5116 const struct lp_build_tgsi_action *action,
5117 struct lp_build_tgsi_context *bld_base,
5118 struct lp_build_emit_data *emit_data)
5119 {
5120 struct si_shader_context *ctx = si_shader_context(bld_base);
5121 struct gallivm_state *gallivm = &ctx->gallivm;
5122 LLVMBuilderRef builder = gallivm->builder;
5123 LLVMValueRef res, samples;
5124 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5125
5126 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5127
5129 /* Read the samples from the descriptor directly. */
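/* Layout note (per the SI image descriptor format, stated here as an
 * explanatory assumption): dword 3 of the resource descriptor carries
 * LAST_LEVEL in bits [19:16], and for MSAA textures the driver stores
 * log2(sample count) in that field, so extracting those 4 bits and
 * computing 1 << value yields the sample count.
 */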
5130 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5131 samples = LLVMBuildExtractElement(
5132 builder, res,
5133 LLVMConstInt(ctx->i32, 3, 0), "");
5134 samples = LLVMBuildLShr(builder, samples,
5135 LLVMConstInt(ctx->i32, 16, 0), "");
5136 samples = LLVMBuildAnd(builder, samples,
5137 LLVMConstInt(ctx->i32, 0xf, 0), "");
5138 samples = LLVMBuildShl(builder, ctx->i32_1,
5139 samples, "");
5140
5141 emit_data->output[emit_data->chan] = samples;
5142 }
5143
5144 static void si_llvm_emit_ddxy(
5145 const struct lp_build_tgsi_action *action,
5146 struct lp_build_tgsi_context *bld_base,
5147 struct lp_build_emit_data *emit_data)
5148 {
5149 struct si_shader_context *ctx = si_shader_context(bld_base);
5150 struct gallivm_state *gallivm = &ctx->gallivm;
5151 unsigned opcode = emit_data->info->opcode;
5152 LLVMValueRef val;
5153 int idx;
5154 unsigned mask;
5155
5156 if (opcode == TGSI_OPCODE_DDX_FINE)
5157 mask = AC_TID_MASK_LEFT;
5158 else if (opcode == TGSI_OPCODE_DDY_FINE)
5159 mask = AC_TID_MASK_TOP;
5160 else
5161 mask = AC_TID_MASK_TOP_LEFT;
5162
5163 /* For DDX we want the next X pixel, for DDY the next Y pixel. */
5164 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5165
5166 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5167 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5168 mask, idx, ctx->lds, val);
5169 emit_data->output[emit_data->chan] = val;
5170 }
5171
5172 /*
5173 * This takes an I,J coordinate pair,
5174 * and works out the X and Y derivatives.
5175 * It returns DDX(I), DDX(J), DDY(I), DDY(J).
5176 */
5177 static LLVMValueRef si_llvm_emit_ddxy_interp(
5178 struct lp_build_tgsi_context *bld_base,
5179 LLVMValueRef interp_ij)
5180 {
5181 struct si_shader_context *ctx = si_shader_context(bld_base);
5182 struct gallivm_state *gallivm = &ctx->gallivm;
5183 LLVMValueRef result[4], a;
5184 unsigned i;
5185
5186 for (i = 0; i < 2; i++) {
5187 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5188 LLVMConstInt(ctx->i32, i, 0), "");
5189 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5190 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5191 }
5192
5193 return lp_build_gather_values(gallivm, result, 4);
5194 }
5195
5196 static void interp_fetch_args(
5197 struct lp_build_tgsi_context *bld_base,
5198 struct lp_build_emit_data *emit_data)
5199 {
5200 struct si_shader_context *ctx = si_shader_context(bld_base);
5201 struct gallivm_state *gallivm = &ctx->gallivm;
5202 const struct tgsi_full_instruction *inst = emit_data->inst;
5203
5204 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5205 /* offset is in second src, first two channels */
5206 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5207 emit_data->inst, 1,
5208 TGSI_CHAN_X);
5209 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5210 emit_data->inst, 1,
5211 TGSI_CHAN_Y);
5212 emit_data->arg_count = 2;
5213 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5214 LLVMValueRef sample_position;
5215 LLVMValueRef sample_id;
5216 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5217
5218 /* fetch sample ID, then fetch its sample position,
5219 * and place into first two channels.
5220 */
5221 sample_id = lp_build_emit_fetch(bld_base,
5222 emit_data->inst, 1, TGSI_CHAN_X);
5223 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5224 ctx->i32, "");
5225 sample_position = load_sample_position(ctx, sample_id);
5226
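/* Sample positions are in [0, 1) relative to the pixel corner, while
 * the interpolation offset is expected relative to the pixel center,
 * hence the -0.5 adjustments below.
 */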
5227 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5228 sample_position,
5229 ctx->i32_0, "");
5230
5231 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5232 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5233 sample_position,
5234 ctx->i32_1, "");
5235 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5236 emit_data->arg_count = 2;
5237 }
5238 }
5239
5240 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5241 struct lp_build_tgsi_context *bld_base,
5242 struct lp_build_emit_data *emit_data)
5243 {
5244 struct si_shader_context *ctx = si_shader_context(bld_base);
5245 struct si_shader *shader = ctx->shader;
5246 struct gallivm_state *gallivm = &ctx->gallivm;
5247 LLVMValueRef interp_param;
5248 const struct tgsi_full_instruction *inst = emit_data->inst;
5249 int input_index = inst->Src[0].Register.Index;
5250 int chan;
5251 int i;
5252 LLVMValueRef attr_number;
5253 LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
5254 int interp_param_idx;
5255 unsigned interp = shader->selector->info.input_interpolate[input_index];
5256 unsigned location;
5257
5258 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5259
5260 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5261 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5262 location = TGSI_INTERPOLATE_LOC_CENTER;
5263 else
5264 location = TGSI_INTERPOLATE_LOC_CENTROID;
5265
5266 interp_param_idx = lookup_interp_param_index(interp, location);
5267 if (interp_param_idx == -1)
5268 return;
5269 else if (interp_param_idx)
5270 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
5271 else
5272 interp_param = NULL;
5273
5274 attr_number = LLVMConstInt(ctx->i32, input_index, 0);
5275
5276 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5277 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5278 LLVMValueRef ij_out[2];
5279 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5280
5281 /*
5282 * Take the I then J parameters, and the DDX/Y for them, and
5283 * calculate the IJ inputs for the interpolator.
5284 * temp1 = ddx * offset/sample.x + I;
5285 * interp_param.I = ddy * offset/sample.y + temp1;
5286 * temp1 = ddx * offset/sample.x + J;
5287 * interp_param.J = ddy * offset/sample.y + temp1;
5288 */
5289 for (i = 0; i < 2; i++) {
5290 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
5291 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
5292 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5293 ddxy_out, ix_ll, "");
5294 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5295 ddxy_out, iy_ll, "");
5296 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5297 interp_param, ix_ll, "");
5298 LLVMValueRef temp1, temp2;
5299
5300 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5301 ctx->f32, "");
5302
5303 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5304
5305 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5306
5307 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5308
5309 ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5310 }
5311 interp_param = lp_build_gather_values(gallivm, ij_out, 2);
5312 }
5313
5314 for (chan = 0; chan < 4; chan++) {
5315 LLVMValueRef llvm_chan;
5316 unsigned schan;
5317
5318 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5319 llvm_chan = LLVMConstInt(ctx->i32, schan, 0);
5320
5321 if (interp_param) {
5322 interp_param = LLVMBuildBitCast(gallivm->builder,
5323 interp_param, LLVMVectorType(ctx->f32, 2), "");
5324 LLVMValueRef i = LLVMBuildExtractElement(
5325 gallivm->builder, interp_param, ctx->i32_0, "");
5326 LLVMValueRef j = LLVMBuildExtractElement(
5327 gallivm->builder, interp_param, ctx->i32_1, "");
5328 emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
5329 llvm_chan, attr_number, params,
5330 i, j);
5331 } else {
5332 emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
5333 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
5334 llvm_chan, attr_number, params);
5335 }
5336 }
5337 }
5338
5339 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5340 LLVMValueRef value)
5341 {
5342 struct gallivm_state *gallivm = &ctx->gallivm;
5343 LLVMValueRef args[3] = {
5344 value,
5345 ctx->i32_0,
5346 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5347 };
5348
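/* Behavior note: llvm.amdgcn.icmp.i32 evaluates the comparison
 * (args[2] = NE against 0) in every active lane and returns the
 * per-lane results packed into an i64 mask, i.e. a wave-wide ballot.
 */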
5349 /* We currently have no other way to prevent LLVM from lifting the icmp
5350 * calls to a dominating basic block.
5351 */
5352 emit_optimization_barrier(ctx, &args[0]);
5353
5354 if (LLVMTypeOf(args[0]) != ctx->i32)
5355 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5356
5357 return lp_build_intrinsic(gallivm->builder,
5358 "llvm.amdgcn.icmp.i32",
5359 ctx->i64, args, 3,
5360 LP_FUNC_ATTR_NOUNWIND |
5361 LP_FUNC_ATTR_READNONE |
5362 LP_FUNC_ATTR_CONVERGENT);
5363 }
5364
5365 static void vote_all_emit(
5366 const struct lp_build_tgsi_action *action,
5367 struct lp_build_tgsi_context *bld_base,
5368 struct lp_build_emit_data *emit_data)
5369 {
5370 struct si_shader_context *ctx = si_shader_context(bld_base);
5371 struct gallivm_state *gallivm = &ctx->gallivm;
5372 LLVMValueRef active_set, vote_set;
5373 LLVMValueRef tmp;
5374
5375 active_set = si_emit_ballot(ctx, ctx->i32_1);
5376 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5377
5378 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5379 emit_data->output[emit_data->chan] =
5380 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5381 }
5382
5383 static void vote_any_emit(
5384 const struct lp_build_tgsi_action *action,
5385 struct lp_build_tgsi_context *bld_base,
5386 struct lp_build_emit_data *emit_data)
5387 {
5388 struct si_shader_context *ctx = si_shader_context(bld_base);
5389 struct gallivm_state *gallivm = &ctx->gallivm;
5390 LLVMValueRef vote_set;
5391 LLVMValueRef tmp;
5392
5393 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5394
5395 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5396 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5397 emit_data->output[emit_data->chan] =
5398 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5399 }
5400
5401 static void vote_eq_emit(
5402 const struct lp_build_tgsi_action *action,
5403 struct lp_build_tgsi_context *bld_base,
5404 struct lp_build_emit_data *emit_data)
5405 {
5406 struct si_shader_context *ctx = si_shader_context(bld_base);
5407 struct gallivm_state *gallivm = &ctx->gallivm;
5408 LLVMValueRef active_set, vote_set;
5409 LLVMValueRef all, none, tmp;
5410
5411 active_set = si_emit_ballot(ctx, ctx->i32_1);
5412 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5413
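/* VOTE_EQ is true when the ballot is unanimous: either every active
 * lane voted true (vote_set == active_set) or none did (vote_set == 0).
 */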
5414 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5415 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5416 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5417 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5418 emit_data->output[emit_data->chan] =
5419 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5420 }
5421
5422 static void ballot_emit(
5423 const struct lp_build_tgsi_action *action,
5424 struct lp_build_tgsi_context *bld_base,
5425 struct lp_build_emit_data *emit_data)
5426 {
5427 struct si_shader_context *ctx = si_shader_context(bld_base);
5428 LLVMBuilderRef builder = ctx->gallivm.builder;
5429 LLVMValueRef tmp;
5430
5431 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5432 tmp = si_emit_ballot(ctx, tmp);
5433 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5434
5435 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5436 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5437 }
5438
5439 static void read_invoc_fetch_args(
5440 struct lp_build_tgsi_context *bld_base,
5441 struct lp_build_emit_data *emit_data)
5442 {
5443 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5444 0, emit_data->src_chan);
5445
5446 /* Always read the source invocation (= lane) from the X channel. */
5447 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5448 1, TGSI_CHAN_X);
5449 emit_data->arg_count = 2;
5450 }
5451
5452 static void read_lane_emit(
5453 const struct lp_build_tgsi_action *action,
5454 struct lp_build_tgsi_context *bld_base,
5455 struct lp_build_emit_data *emit_data)
5456 {
5457 struct si_shader_context *ctx = si_shader_context(bld_base);
5458 LLVMBuilderRef builder = ctx->gallivm.builder;
5459
5460 /* We currently have no other way to prevent LLVM from lifting the icmp
5461 * calls to a dominating basic block.
5462 */
5463 emit_optimization_barrier(ctx, &emit_data->args[0]);
5464
5465 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5466 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5467 ctx->i32, "");
5468 }
5469
5470 emit_data->output[emit_data->chan] =
5471 ac_build_intrinsic(&ctx->ac, action->intr_name,
5472 ctx->i32, emit_data->args, emit_data->arg_count,
5473 AC_FUNC_ATTR_READNONE |
5474 AC_FUNC_ATTR_CONVERGENT);
5475 }
5476
5477 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5478 struct lp_build_emit_data *emit_data)
5479 {
5480 struct si_shader_context *ctx = si_shader_context(bld_base);
5481 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5482 LLVMValueRef imm;
5483 unsigned stream;
5484
5485 assert(src0.File == TGSI_FILE_IMMEDIATE);
5486
5487 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5488 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5489 return stream;
5490 }
5491
5492 /* Emit one vertex from the geometry shader */
5493 static void si_llvm_emit_vertex(
5494 const struct lp_build_tgsi_action *action,
5495 struct lp_build_tgsi_context *bld_base,
5496 struct lp_build_emit_data *emit_data)
5497 {
5498 struct si_shader_context *ctx = si_shader_context(bld_base);
5499 struct lp_build_context *uint = &bld_base->uint_bld;
5500 struct si_shader *shader = ctx->shader;
5501 struct tgsi_shader_info *info = &shader->selector->info;
5502 struct gallivm_state *gallivm = &ctx->gallivm;
5503 struct lp_build_if_state if_state;
5504 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
5505 ctx->param_gs2vs_offset);
5506 LLVMValueRef gs_next_vertex;
5507 LLVMValueRef can_emit, kill;
5508 unsigned chan, offset;
5509 int i;
5510 unsigned stream;
5511
5512 stream = si_llvm_get_stream(bld_base, emit_data);
5513
5514 /* Write vertex attribute values to GSVS ring */
5515 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5516 ctx->gs_next_vertex[stream],
5517 "");
5518
5519 /* If this thread has already emitted the declared maximum number of
5520 * vertices, skip the write: excessive vertex emissions are not
5521 * supposed to have any effect.
5522 *
5523 * If the shader has no writes to memory, kill it instead. This skips
5524 * further memory loads and may allow LLVM to skip to the end
5525 * altogether.
5526 */
5527 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
5528 LLVMConstInt(ctx->i32,
5529 shader->selector->gs_max_out_vertices, 0), "");
5530
5531 bool use_kill = !info->writes_memory;
5532 if (use_kill) {
5533 kill = lp_build_select(&bld_base->base, can_emit,
5534 LLVMConstReal(ctx->f32, 1.0f),
5535 LLVMConstReal(ctx->f32, -1.0f));
5536
5537 ac_build_kill(&ctx->ac, kill);
5538 } else {
5539 lp_build_if(&if_state, gallivm, can_emit);
5540 }
5541
5542 offset = 0;
5543 for (i = 0; i < info->num_outputs; i++) {
5544 LLVMValueRef *out_ptr = ctx->outputs[i];
5545
5546 for (chan = 0; chan < 4; chan++) {
5547 if (!(info->output_usagemask[i] & (1 << chan)) ||
5548 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
5549 continue;
5550
5551 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
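/* GSVS ring addressing sketch: each output component slot occupies a
 * block of gs_max_out_vertices dwords, so the dword index is
 * offset * gs_max_out_vertices + current_vertex, and the multiply by 4
 * below converts dwords to a byte offset.
 */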
5552 LLVMValueRef voffset =
5553 LLVMConstInt(ctx->i32, offset *
5554 shader->selector->gs_max_out_vertices, 0);
5555 offset++;
5556
5557 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5558 voffset = lp_build_mul_imm(uint, voffset, 4);
5559
5560 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5561
5562 ac_build_buffer_store_dword(&ctx->ac,
5563 ctx->gsvs_ring[stream],
5564 out_val, 1,
5565 voffset, soffset, 0,
5566 1, 1, true, true);
5567 }
5568 }
5569
5570 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5571 ctx->i32_1);
5572
5573 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5574
5575 /* Signal vertex emission */
5576 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
5577 si_get_gs_wave_id(ctx));
5578 if (!use_kill)
5579 lp_build_endif(&if_state);
5580 }
5581
5582 /* Cut one primitive from the geometry shader */
5583 static void si_llvm_emit_primitive(
5584 const struct lp_build_tgsi_action *action,
5585 struct lp_build_tgsi_context *bld_base,
5586 struct lp_build_emit_data *emit_data)
5587 {
5588 struct si_shader_context *ctx = si_shader_context(bld_base);
5589 unsigned stream;
5590
5591 /* Signal primitive cut */
5592 stream = si_llvm_get_stream(bld_base, emit_data);
5593 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5594 si_get_gs_wave_id(ctx));
5595 }
5596
5597 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5598 struct lp_build_tgsi_context *bld_base,
5599 struct lp_build_emit_data *emit_data)
5600 {
5601 struct si_shader_context *ctx = si_shader_context(bld_base);
5602 struct gallivm_state *gallivm = &ctx->gallivm;
5603
5604 /* SI only (thanks to a hw bug workaround):
5605 * The real barrier instruction isn't needed, because an entire patch
5606 * always fits into a single wave.
5607 */
5608 if (HAVE_LLVM >= 0x0309 &&
5609 ctx->screen->b.chip_class == SI &&
5610 ctx->type == PIPE_SHADER_TESS_CTRL) {
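/* Encoding note (an assumption about the *_CNT masks): each mask keeps
 * the other counter fields at their maximum, so ANDing LGKM_CNT with
 * VM_CNT requests a wait on both counters rather than neither.
 */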
5611 emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
5612 return;
5613 }
5614
5615 lp_build_intrinsic(gallivm->builder,
5616 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5617 : "llvm.AMDGPU.barrier.local",
5618 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
5619 }
5620
5621 static const struct lp_build_tgsi_action tex_action = {
5622 .fetch_args = tex_fetch_args,
5623 .emit = build_tex_intrinsic,
5624 };
5625
5626 static const struct lp_build_tgsi_action interp_action = {
5627 .fetch_args = interp_fetch_args,
5628 .emit = build_interp_intrinsic,
5629 };
5630
5631 static void si_create_function(struct si_shader_context *ctx,
5632 const char *name,
5633 LLVMTypeRef *returns, unsigned num_returns,
5634 LLVMTypeRef *params, unsigned num_params,
5635 int last_sgpr)
5636 {
5637 int i;
5638
5639 si_llvm_create_func(ctx, name, returns, num_returns,
5640 params, num_params);
5641 si_llvm_shader_type(ctx->main_fn, ctx->type);
5642 ctx->return_value = LLVMGetUndef(ctx->return_type);
5643
5644 for (i = 0; i <= last_sgpr; ++i) {
5645 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
5646
5647 /* The combination of:
5648 * - ByVal
5649 * - dereferenceable
5650 * - invariant.load
5651 * allows the optimization passes to move loads and reduces
5652 * SGPR spilling significantly.
5653 */
5654 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5655 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
5656 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
5657 ac_add_attr_dereferenceable(P, UINT64_MAX);
5658 } else
5659 lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
5660 }
5661
5662 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5663 "no-signed-zeros-fp-math",
5664 "true");
5665
5666 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5667 /* These were copied from some LLVM test. */
5668 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5669 "less-precise-fpmad",
5670 "true");
5671 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5672 "no-infs-fp-math",
5673 "true");
5674 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5675 "no-nans-fp-math",
5676 "true");
5677 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
5678 "unsafe-fp-math",
5679 "true");
5680 }
5681 }
5682
5683 static void declare_streamout_params(struct si_shader_context *ctx,
5684 struct pipe_stream_output_info *so,
5685 LLVMTypeRef *params, LLVMTypeRef i32,
5686 unsigned *num_params)
5687 {
5688 int i;
5689
5690 /* Streamout SGPRs. */
5691 if (so->num_outputs) {
5692 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5693 params[ctx->param_streamout_config = (*num_params)++] = i32;
5694 else
5695 ctx->param_streamout_config = *num_params - 1;
5696
5697 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5698 }
5699 /* A streamout buffer offset is loaded if the stride is non-zero. */
5700 for (i = 0; i < 4; i++) {
5701 if (!so->stride[i])
5702 continue;
5703
5704 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5705 }
5706 }
5707
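/* Returns the size of an LLVM type in bytes, e.g. i32 -> 4 bytes,
 * <4 x i32> -> 16 bytes, [2 x float] -> 8 bytes; pointers are assumed
 * to be 64-bit (8 bytes).
 */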
5708 static unsigned llvm_get_type_size(LLVMTypeRef type)
5709 {
5710 LLVMTypeKind kind = LLVMGetTypeKind(type);
5711
5712 switch (kind) {
5713 case LLVMIntegerTypeKind:
5714 return LLVMGetIntTypeWidth(type) / 8;
5715 case LLVMFloatTypeKind:
5716 return 4;
5717 case LLVMPointerTypeKind:
5718 return 8;
5719 case LLVMVectorTypeKind:
5720 return LLVMGetVectorSize(type) *
5721 llvm_get_type_size(LLVMGetElementType(type));
5722 case LLVMArrayTypeKind:
5723 return LLVMGetArrayLength(type) *
5724 llvm_get_type_size(LLVMGetElementType(type));
5725 default:
5726 assert(0);
5727 return 0;
5728 }
5729 }
5730
5731 static void declare_lds_as_pointer(struct si_shader_context *ctx)
5732 {
5733 struct gallivm_state *gallivm = &ctx->gallivm;
5734
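/* LDS allocation limit per workgroup: 64KB on CIK and later, 32KB on SI
 * (the values selected below).
 */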
5735 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5736 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5737 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5738 "lds");
5739 }
5740
5741 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5742 {
5743 const unsigned *properties = shader->selector->info.properties;
5744 unsigned max_work_group_size =
5745 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5746 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5747 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5748
5749 if (!max_work_group_size) {
5750 /* This is a variable group size compute shader;
5751 * compile it for the maximum possible group size.
5752 */
5753 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5754 }
5755 return max_work_group_size;
5756 }
5757
5758 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5759 LLVMTypeRef *params,
5760 unsigned *num_params,
5761 bool assign_params)
5762 {
5763 params[(*num_params)++] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5764 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5765 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5766 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5767
5768 if (assign_params) {
5769 ctx->param_const_buffers = *num_params - 4;
5770 ctx->param_samplers = *num_params - 3;
5771 ctx->param_images = *num_params - 2;
5772 ctx->param_shader_buffers = *num_params - 1;
5773 }
5774 }
5775
5776 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5777 LLVMTypeRef *params,
5778 unsigned *num_params)
5779 {
5780 params[ctx->param_rw_buffers = (*num_params)++] =
5781 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5782 declare_per_stage_desc_pointers(ctx, params, num_params, true);
5783 }
5784
5785 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
5786 LLVMTypeRef *params,
5787 unsigned *num_params)
5788 {
5789 params[ctx->param_vertex_buffers = (*num_params)++] =
5790 const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5791 params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
5792 params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
5793 params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
5794 params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
5795 }
5796
5797 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
5798 LLVMTypeRef *params, unsigned *num_params,
5799 unsigned *num_prolog_vgprs)
5800 {
5801 struct si_shader *shader = ctx->shader;
5802
5803 params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
5804 params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
5805 params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
5806 params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
5807
5808 if (!shader->is_gs_copy_shader) {
5809 /* Vertex load indices. */
5810 ctx->param_vertex_index0 = (*num_params);
5811 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
5812 params[(*num_params)++] = ctx->i32;
5813 *num_prolog_vgprs += shader->selector->info.num_inputs;
5814 }
5815 }
5816
5817 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
5818 LLVMTypeRef *params, unsigned *num_params)
5819 {
5820 params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
5821 params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
5822 params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
5823 params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
5824 }
5825
5826 enum {
5827 /* Convenient merged shader definitions. */
5828 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
5829 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
5830 };
5831
5832 static void create_function(struct si_shader_context *ctx)
5833 {
5834 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5835 struct gallivm_state *gallivm = &ctx->gallivm;
5836 struct si_shader *shader = ctx->shader;
5837 LLVMTypeRef params[100]; /* just make it large enough */
5838 LLVMTypeRef returns[16+32*4];
5839 unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
5840 unsigned num_returns = 0;
5841 unsigned num_prolog_vgprs = 0;
5842 unsigned type = ctx->type;
5843
5844 /* Set MERGED shaders. */
5845 if (ctx->screen->b.chip_class >= GFX9) {
5846 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
5847 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
5848 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
5849 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
5850 }
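/* On GFX9, LS+HS and ES+GS execute as single merged shaders, so the
 * merged cases below declare the combined SGPR/VGPR argument lists of
 * both stages.
 */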
5851
5852 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
5853
5854 switch (type) {
5855 case PIPE_SHADER_VERTEX:
5856 declare_default_desc_pointers(ctx, params, &num_params);
5857 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5858
5859 if (shader->key.as_es) {
5860 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5861 } else if (shader->key.as_ls) {
5862 /* no extra parameters */
5863 } else {
5864 if (shader->is_gs_copy_shader)
5865 num_params = ctx->param_rw_buffers + 1;
5866
5867 /* The locations of the other parameters are assigned dynamically. */
5868 declare_streamout_params(ctx, &shader->selector->so,
5869 params, ctx->i32, &num_params);
5870 }
5871
5872 last_sgpr = num_params-1;
5873
5874 /* VGPRs */
5875 declare_vs_input_vgprs(ctx, params, &num_params,
5876 &num_prolog_vgprs);
5877
5878 /* PrimitiveID output. */
5879 if (!shader->is_gs_copy_shader &&
5880 !shader->key.as_es && !shader->key.as_ls) {
5881 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5882 returns[num_returns++] = ctx->f32;
5883 }
5884 break;
5885
5886 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
5887 declare_default_desc_pointers(ctx, params, &num_params);
5888 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5889 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5890 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5891 params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
5892 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5893 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5894 last_sgpr = num_params - 1;
5895
5896 /* VGPRs */
5897 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5898 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5899
5900 /* param_tcs_offchip_offset and param_tcs_factor_offset are
5901 * placed after the user SGPRs.
5902 */
5903 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
5904 returns[num_returns++] = ctx->i32; /* SGPRs */
5905 for (i = 0; i < 3; i++)
5906 returns[num_returns++] = ctx->f32; /* VGPRs */
5907 break;
5908
5909 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
5910 /* Merged stages have 8 system SGPRs at the beginning. */
5911 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
5912 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5913 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5914 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
5915 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
5916 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
5917 params[num_params++] = ctx->i32; /* unused */
5918 params[num_params++] = ctx->i32; /* unused */
5919
5920 params[num_params++] = ctx->i32; /* unused */
5921 params[num_params++] = ctx->i32; /* unused */
5922 declare_per_stage_desc_pointers(ctx, params, &num_params,
5923 ctx->type == PIPE_SHADER_VERTEX);
5924 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5925
5926 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5927 params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
5928 params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
5929 params[num_params++] = ctx->i32; /* unused */
5930
5931 declare_per_stage_desc_pointers(ctx, params, &num_params,
5932 ctx->type == PIPE_SHADER_TESS_CTRL);
5933 last_sgpr = num_params - 1;
5934
5935 /* VGPRs (first TCS, then VS) */
5936 params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
5937 params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;
5938
5939 if (ctx->type == PIPE_SHADER_VERTEX) {
5940 declare_vs_input_vgprs(ctx, params, &num_params,
5941 &num_prolog_vgprs);
5942
5943 /* LS return values are inputs to the TCS main shader part. */
5944 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
5945 returns[num_returns++] = ctx->i32; /* SGPRs */
5946 for (i = 0; i < 2; i++)
5947 returns[num_returns++] = ctx->f32; /* VGPRs */
5948 } else {
5949 /* TCS return values are inputs to the TCS epilog.
5950 *
5951 * param_tcs_offchip_offset, param_tcs_factor_offset,
5952 * param_tcs_offchip_layout, and param_rw_buffers
5953 * should be passed to the epilog.
5954 */
5955 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT; i++)
5956 returns[num_returns++] = ctx->i32; /* SGPRs */
5957 for (i = 0; i < 3; i++)
5958 returns[num_returns++] = ctx->f32; /* VGPRs */
5959 }
5960 break;
5961
5962 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
5963 /* Merged stages have 8 system SGPRs at the beginning. */
5964 params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
5965 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5966 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
5967 params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
5968 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
5969 params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
5970 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
5971 params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
5972
5973 params[num_params++] = ctx->i32; /* unused */
5974 params[num_params++] = ctx->i32; /* unused */
5975 declare_per_stage_desc_pointers(ctx, params, &num_params,
5976 (ctx->type == PIPE_SHADER_VERTEX ||
5977 ctx->type == PIPE_SHADER_TESS_EVAL));
5978 if (ctx->type == PIPE_SHADER_VERTEX) {
5979 declare_vs_specific_input_sgprs(ctx, params, &num_params);
5980 } else {
5981 /* TESS_EVAL (and also GEOMETRY):
5982 * Declare as many input SGPRs as the VS has. */
5983 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
5984 params[num_params++] = ctx->i32; /* unused */
5985 params[num_params++] = ctx->i32; /* unused */
5986 params[num_params++] = ctx->i32; /* unused */
5987 params[num_params++] = ctx->i32; /* unused */
5988 params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
5989 }
5990
5991 declare_per_stage_desc_pointers(ctx, params, &num_params,
5992 ctx->type == PIPE_SHADER_GEOMETRY);
5993 last_sgpr = num_params - 1;
5994
5995 /* VGPRs (first GS, then VS/TES) */
5996 params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
5997 params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
5998 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
5999 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6000 params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;
6001
6002 if (ctx->type == PIPE_SHADER_VERTEX) {
6003 declare_vs_input_vgprs(ctx, params, &num_params,
6004 &num_prolog_vgprs);
6005 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
6006 declare_tes_input_vgprs(ctx, params, &num_params);
6007 }
6008
6009 if (ctx->type == PIPE_SHADER_VERTEX ||
6010 ctx->type == PIPE_SHADER_TESS_EVAL) {
6011 /* ES return values are inputs to GS. */
6012 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
6013 returns[num_returns++] = ctx->i32; /* SGPRs */
6014 for (i = 0; i < 5; i++)
6015 returns[num_returns++] = ctx->f32; /* VGPRs */
6016 }
6017 break;
6018
6019 case PIPE_SHADER_TESS_EVAL:
6020 declare_default_desc_pointers(ctx, params, &num_params);
6021 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
6022
6023 if (shader->key.as_es) {
6024 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6025 params[num_params++] = ctx->i32;
6026 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
6027 } else {
6028 params[num_params++] = ctx->i32;
6029 declare_streamout_params(ctx, &shader->selector->so,
6030 params, ctx->i32, &num_params);
6031 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
6032 }
6033 last_sgpr = num_params - 1;
6034
6035 /* VGPRs */
6036 declare_tes_input_vgprs(ctx, params, &num_params);
6037
6038 /* PrimitiveID output. */
6039 if (!shader->key.as_es)
6040 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
6041 returns[num_returns++] = ctx->f32;
6042 break;
6043
6044 case PIPE_SHADER_GEOMETRY:
6045 declare_default_desc_pointers(ctx, params, &num_params);
6046 params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
6047 params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
6048 last_sgpr = num_params - 1;
6049
6050 /* VGPRs */
6051 params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
6052 params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
6053 params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
6054 params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
6055 params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
6056 params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
6057 params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
6058 params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
6059 break;
6060
6061 case PIPE_SHADER_FRAGMENT:
6062 declare_default_desc_pointers(ctx, params, &num_params);
6063 params[SI_PARAM_ALPHA_REF] = ctx->f32;
6064 params[SI_PARAM_PRIM_MASK] = ctx->i32;
6065 last_sgpr = SI_PARAM_PRIM_MASK;
6066 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
6067 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
6068 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
6069 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
6070 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
6071 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
6072 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
6073 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
6074 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
6075 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
6076 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
6077 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
6078 params[SI_PARAM_FRONT_FACE] = ctx->i32;
6079 shader->info.face_vgpr_index = 20;
6080 params[SI_PARAM_ANCILLARY] = ctx->i32;
6081 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
6082 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
6083 num_params = SI_PARAM_POS_FIXED_PT+1;
6084
6085 /* Color inputs from the prolog. */
6086 if (shader->selector->info.colors_read) {
6087 unsigned num_color_elements =
6088 util_bitcount(shader->selector->info.colors_read);
6089
6090 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
6091 for (i = 0; i < num_color_elements; i++)
6092 params[num_params++] = ctx->f32;
6093
6094 num_prolog_vgprs += num_color_elements;
6095 }
6096
6097 /* Outputs for the epilog. */
6098 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
6099 num_returns =
6100 num_return_sgprs +
6101 util_bitcount(shader->selector->info.colors_written) * 4 +
6102 shader->selector->info.writes_z +
6103 shader->selector->info.writes_stencil +
6104 shader->selector->info.writes_samplemask +
6105 1 /* SampleMaskIn */;
6106
6107 num_returns = MAX2(num_returns,
6108 num_return_sgprs +
6109 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6110
6111 for (i = 0; i < num_return_sgprs; i++)
6112 returns[i] = ctx->i32;
6113 for (; i < num_returns; i++)
6114 returns[i] = ctx->f32;
6115 break;
6116
6117 case PIPE_SHADER_COMPUTE:
6118 declare_default_desc_pointers(ctx, params, &num_params);
6119 params[SI_PARAM_GRID_SIZE] = v3i32;
6120 params[SI_PARAM_BLOCK_SIZE] = v3i32;
6121 params[SI_PARAM_BLOCK_ID] = v3i32;
6122 last_sgpr = SI_PARAM_BLOCK_ID;
6123
6124 params[SI_PARAM_THREAD_ID] = v3i32;
6125 num_params = SI_PARAM_THREAD_ID + 1;
6126 break;
6127 default:
6128 assert(0 && "unimplemented shader");
6129 return;
6130 }
6131
6132 assert(num_params <= ARRAY_SIZE(params));
6133
6134 si_create_function(ctx, "main", returns, num_returns, params,
6135 num_params, last_sgpr);
6136
6137 /* Reserve register locations for VGPR inputs the PS prolog may need. */
6138 if (ctx->type == PIPE_SHADER_FRAGMENT &&
6139 ctx->separate_prolog) {
6140 si_llvm_add_attribute(ctx->main_fn,
6141 "InitialPSInputAddr",
6142 S_0286D0_PERSP_SAMPLE_ENA(1) |
6143 S_0286D0_PERSP_CENTER_ENA(1) |
6144 S_0286D0_PERSP_CENTROID_ENA(1) |
6145 S_0286D0_LINEAR_SAMPLE_ENA(1) |
6146 S_0286D0_LINEAR_CENTER_ENA(1) |
6147 S_0286D0_LINEAR_CENTROID_ENA(1) |
6148 S_0286D0_FRONT_FACE_ENA(1) |
6149 S_0286D0_POS_FIXED_PT_ENA(1));
6150 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
6151 si_llvm_add_attribute(ctx->main_fn,
6152 "amdgpu-max-work-group-size",
6153 si_get_max_workgroup_size(shader));
6154 }
6155
6156 shader->info.num_input_sgprs = 0;
6157 shader->info.num_input_vgprs = 0;
6158
6159 for (i = 0; i <= last_sgpr; ++i)
6160 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
6161
6162 for (; i < num_params; ++i)
6163 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
6164
6165 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
6166 shader->info.num_input_vgprs -= num_prolog_vgprs;
6167
6168 if (!ctx->screen->has_ds_bpermute &&
6169 bld_base->info &&
6170 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
6171 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
6172 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
6173 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
6174 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
6175 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
6176 ctx->lds =
6177 LLVMAddGlobalInAddressSpace(gallivm->module,
6178 LLVMArrayType(ctx->i32, 64),
6179 "ddxy_lds",
6180 LOCAL_ADDR_SPACE);
6181
6182 if (shader->key.as_ls ||
6183 ctx->type == PIPE_SHADER_TESS_CTRL ||
6184 /* GFX9 has the ESGS ring buffer in LDS. */
6185 (ctx->screen->b.chip_class >= GFX9 &&
6186 (shader->key.as_es ||
6187 ctx->type == PIPE_SHADER_GEOMETRY)))
6188 declare_lds_as_pointer(ctx);
6189 }
6190
6191 /**
6192 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
6193 * for later use.
6194 */
6195 static void preload_ring_buffers(struct si_shader_context *ctx)
6196 {
6197 struct gallivm_state *gallivm = &ctx->gallivm;
6198 LLVMBuilderRef builder = gallivm->builder;
6199
6200 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
6201 ctx->param_rw_buffers);
6202
6203 if (ctx->screen->b.chip_class <= VI &&
6204 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
6205 unsigned ring =
6206 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
6207 : SI_ES_RING_ESGS;
6208 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
6209
6210 ctx->esgs_ring =
6211 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6212 }
6213
6214 if (ctx->shader->is_gs_copy_shader) {
6215 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6216
6217 ctx->gsvs_ring[0] =
6218 ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6219 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
6220 const struct si_shader_selector *sel = ctx->shader->selector;
6221 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
6222 LLVMValueRef base_ring;
6223
6224 base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
6225
6226 /* The conceptual layout of the GSVS ring is
6227 * v0c0 .. vLc0 v0c1 .. vLc1 ..
6228 * but the real memory layout is swizzled across
6229 * threads:
6230 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
6231 * t16v0c0 ..
6232 * Override the buffer descriptor accordingly.
6233 */
6234 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
6235 uint64_t stream_offset = 0;
6236
6237 for (unsigned stream = 0; stream < 4; ++stream) {
6238 unsigned num_components;
6239 unsigned stride;
6240 unsigned num_records;
6241 LLVMValueRef ring, tmp;
6242
6243 num_components = sel->info.num_stream_output_components[stream];
6244 if (!num_components)
6245 continue;
6246
6247 stride = 4 * num_components * sel->gs_max_out_vertices;
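/* E.g. a stream with 2 output components and 16 max output vertices
 * gives stride = 4 * 2 * 16 = 128 bytes (illustrative numbers).
 */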
6248
6249 /* Limit on the stride field for <= CIK. */
6250 assert(stride < (1 << 14));
6251
6252 num_records = 64;
6253
6254 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
6255 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
6256 tmp = LLVMBuildAdd(builder, tmp,
6257 LLVMConstInt(ctx->i64,
6258 stream_offset, 0), "");
6259 stream_offset += stride * 64;
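/* Each stream's slice covers one wave of 64 threads (num_records = 64),
 * hence the stride * 64 advance to the next stream's base address.
 */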
6260
6261 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
6262 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
6263 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
6264 tmp = LLVMBuildOr(builder, tmp,
6265 LLVMConstInt(ctx->i32,
6266 S_008F04_STRIDE(stride) |
6267 S_008F04_SWIZZLE_ENABLE(1), 0), "");
6268 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
6269 ring = LLVMBuildInsertElement(builder, ring,
6270 LLVMConstInt(ctx->i32, num_records, 0),
6271 LLVMConstInt(ctx->i32, 2, 0), "");
6272 ring = LLVMBuildInsertElement(builder, ring,
6273 LLVMConstInt(ctx->i32,
6274 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
6275 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6276 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
6277 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
6278 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6279 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
6280 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
6281 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
6282 S_008F0C_ADD_TID_ENABLE(1),
6283 0),
6284 LLVMConstInt(ctx->i32, 3, 0), "");
6285 ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");
6286
6287 ctx->gsvs_ring[stream] = ring;
6288 }
6289 }
6290 }
6291
6292 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6293 LLVMValueRef param_rw_buffers,
6294 unsigned param_pos_fixed_pt)
6295 {
6296 struct gallivm_state *gallivm = &ctx->gallivm;
6297 LLVMBuilderRef builder = gallivm->builder;
6298 LLVMValueRef slot, desc, offset, row, bit, address[2];
6299
6300 /* Use the fixed-point gl_FragCoord input.
6301 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6302 * per coordinate to get the repeating effect.
6303 */
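/* Worked example (illustrative values): a fragment at x = 35, y = 7
 * keeps the low 5 bits of each coordinate, so it samples bit 35 & 31 = 3
 * of stipple row 7.
 */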
6304 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6305 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6306
6307 /* Load the buffer descriptor. */
6308 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
6309 desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);
6310
6311 /* The stipple pattern is 32x32, each row has 32 bits. */
6312 offset = LLVMBuildMul(builder, address[1],
6313 LLVMConstInt(ctx->i32, 4, 0), "");
6314 row = buffer_load_const(ctx, desc, offset);
6315 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6316 bit = LLVMBuildLShr(builder, row, address[0], "");
6317 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6318
6319 /* The intrinsic kills the thread if arg < 0. */
6320 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6321 LLVMConstReal(ctx->f32, -1), "");
6322 ac_build_kill(&ctx->ac, bit);
6323 }
6324
6325 void si_shader_binary_read_config(struct ac_shader_binary *binary,
6326 struct si_shader_config *conf,
6327 unsigned symbol_offset)
6328 {
6329 unsigned i;
6330 const unsigned char *config =
6331 ac_shader_binary_config_start(binary, symbol_offset);
6332 bool really_needs_scratch = false;
6333
6334 /* LLVM adds SGPR spills to the scratch size.
6335 * Find out if we really need the scratch buffer.
6336 */
6337 for (i = 0; i < binary->reloc_count; i++) {
6338 const struct ac_shader_reloc *reloc = &binary->relocs[i];
6339
6340 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
6341 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6342 really_needs_scratch = true;
6343 break;
6344 }
6345 }
6346
6347 /* XXX: We may be able to emit some of these values directly rather than
6348 * extracting fields to be emitted later.
6349 */
6350
6351 for (i = 0; i < binary->config_size_per_symbol; i += 8) {
6352 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
6353 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
6354 switch (reg) {
6355 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
6356 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
6357 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
6358 case R_00B848_COMPUTE_PGM_RSRC1:
6359 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
6360 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
6361 conf->float_mode = G_00B028_FLOAT_MODE(value);
6362 conf->rsrc1 = value;
6363 break;
6364 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
6365 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
6366 break;
6367 case R_00B84C_COMPUTE_PGM_RSRC2:
6368 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
6369 conf->rsrc2 = value;
6370 break;
6371 case R_0286CC_SPI_PS_INPUT_ENA:
6372 conf->spi_ps_input_ena = value;
6373 break;
6374 case R_0286D0_SPI_PS_INPUT_ADDR:
6375 conf->spi_ps_input_addr = value;
6376 break;
6377 case R_0286E8_SPI_TMPRING_SIZE:
6378 case R_00B860_COMPUTE_TMPRING_SIZE:
6379 /* WAVESIZE is in units of 256 dwords. */
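/* E.g. WAVESIZE = 2 would mean 2 * 256 dwords * 4 bytes = 2048 scratch
 * bytes per wave (illustrative value).
 */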
6380 if (really_needs_scratch)
6381 conf->scratch_bytes_per_wave =
6382 G_00B860_WAVESIZE(value) * 256 * 4;
6383 break;
6384 case 0x4: /* SPILLED_SGPRS */
6385 conf->spilled_sgprs = value;
6386 break;
6387 case 0x8: /* SPILLED_VGPRS */
6388 conf->spilled_vgprs = value;
6389 break;
6390 default:
6391 {
6392 static bool printed;
6393
6394 if (!printed) {
6395 fprintf(stderr, "Warning: LLVM emitted unknown "
6396 "config register: 0x%x\n", reg);
6397 printed = true;
6398 }
6399 }
6400 break;
6401 }
6402 }
6403
6404 if (!conf->spi_ps_input_addr)
6405 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6406 }
6407
6408 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6409 struct si_shader *shader,
6410 struct si_shader_config *config,
6411 uint64_t scratch_va)
6412 {
6413 unsigned i;
6414 uint32_t scratch_rsrc_dword0 = scratch_va;
6415 uint32_t scratch_rsrc_dword1 =
6416 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6417
6418 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6419 * correctly.
6420 */
6421 if (HAVE_LLVM >= 0x0309)
6422 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6423 else
6424 scratch_rsrc_dword1 |=
6425 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6426
6427 for (i = 0; i < shader->binary.reloc_count; i++) {
6428 const struct ac_shader_reloc *reloc =
6429 &shader->binary.relocs[i];
6430 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6431 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6432 &scratch_rsrc_dword0, 4);
6433 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6434 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6435 &scratch_rsrc_dword1, 4);
6436 }
6437 }
6438 }
6439
6440 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6441 {
6442 unsigned size = shader->binary.code_size;
6443
6444 if (shader->prolog)
6445 size += shader->prolog->binary.code_size;
6446 if (shader->previous_stage)
6447 size += shader->previous_stage->binary.code_size;
6448 if (shader->prolog2)
6449 size += shader->prolog2->binary.code_size;
6450 if (shader->epilog)
6451 size += shader->epilog->binary.code_size;
6452 return size;
6453 }
6454
6455 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6456 {
6457 const struct ac_shader_binary *prolog =
6458 shader->prolog ? &shader->prolog->binary : NULL;
6459 const struct ac_shader_binary *previous_stage =
6460 shader->previous_stage ? &shader->previous_stage->binary : NULL;
6461 const struct ac_shader_binary *prolog2 =
6462 shader->prolog2 ? &shader->prolog2->binary : NULL;
6463 const struct ac_shader_binary *epilog =
6464 shader->epilog ? &shader->epilog->binary : NULL;
6465 const struct ac_shader_binary *mainb = &shader->binary;
6466 unsigned bo_size = si_get_shader_binary_size(shader) +
6467 (!epilog ? mainb->rodata_size : 0);
6468 unsigned char *ptr;
6469
6470 assert(!prolog || !prolog->rodata_size);
6471 assert(!previous_stage || !previous_stage->rodata_size);
6472 assert(!prolog2 || !prolog2->rodata_size);
6473 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
6474 !mainb->rodata_size);
6475 assert(!epilog || !epilog->rodata_size);
6476
6477 /* GFX9 can fetch at most 128 bytes past the end of the shader.
6478 * Prevent VM faults.
6479 */
6480 if (sscreen->b.chip_class >= GFX9)
6481 bo_size += 128;
6482
6483 r600_resource_reference(&shader->bo, NULL);
6484 shader->bo = (struct r600_resource*)
6485 pipe_buffer_create(&sscreen->b.b, 0,
6486 PIPE_USAGE_IMMUTABLE,
6487 align(bo_size, SI_CPDMA_ALIGNMENT));
6488 if (!shader->bo)
6489 return -ENOMEM;
6490
6491 /* Upload. */
6492 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6493 PIPE_TRANSFER_READ_WRITE |
6494 PIPE_TRANSFER_UNSYNCHRONIZED);
6495
6496 if (prolog) {
6497 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6498 ptr += prolog->code_size;
6499 }
6500 if (previous_stage) {
6501 util_memcpy_cpu_to_le32(ptr, previous_stage->code,
6502 previous_stage->code_size);
6503 ptr += previous_stage->code_size;
6504 }
6505 if (prolog2) {
6506 util_memcpy_cpu_to_le32(ptr, prolog2->code, prolog2->code_size);
6507 ptr += prolog2->code_size;
6508 }
6509
6510 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6511 ptr += mainb->code_size;
6512
6513 if (epilog)
6514 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6515 else if (mainb->rodata_size > 0)
6516 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6517
6518 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6519 return 0;
6520 }
6521
6522 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6523 struct pipe_debug_callback *debug,
6524 const char *name, FILE *file)
6525 {
6526 char *line, *p;
6527 unsigned i, count;
6528
6529 if (binary->disasm_string) {
6530 fprintf(file, "Shader %s disassembly:\n", name);
6531 fprintf(file, "%s", binary->disasm_string);
6532
6533 if (debug && debug->debug_message) {
6534 /* Very long debug messages are cut off, so send the
6535 * disassembly one line at a time. This causes more
6536 * overhead, but on the plus side it simplifies
6537 * parsing of resulting logs.
6538 */
6539 pipe_debug_message(debug, SHADER_INFO,
6540 "Shader Disassembly Begin");
6541
6542 line = binary->disasm_string;
6543 while (*line) {
6544 p = util_strchrnul(line, '\n');
6545 count = p - line;
6546
6547 if (count) {
6548 pipe_debug_message(debug, SHADER_INFO,
6549 "%.*s", count, line);
6550 }
6551
6552 if (!*p)
6553 break;
6554 line = p + 1;
6555 }
6556
6557 pipe_debug_message(debug, SHADER_INFO,
6558 "Shader Disassembly End");
6559 }
6560 } else {
6561 fprintf(file, "Shader %s binary:\n", name);
6562 for (i = 0; i < binary->code_size; i += 4) {
6563 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6564 binary->code[i + 3], binary->code[i + 2],
6565 binary->code[i + 1], binary->code[i]);
6566 }
6567 }
6568 }
6569
6570 static void si_shader_dump_stats(struct si_screen *sscreen,
6571 struct si_shader *shader,
6572 struct pipe_debug_callback *debug,
6573 unsigned processor,
6574 FILE *file,
6575 bool check_debug_option)
6576 {
6577 struct si_shader_config *conf = &shader->config;
6578 unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
6579 unsigned code_size = si_get_shader_binary_size(shader);
6580 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6581 unsigned lds_per_wave = 0;
6582 unsigned max_simd_waves = 10;
6583
6584 /* Compute LDS usage for PS. */
6585 switch (processor) {
6586 case PIPE_SHADER_FRAGMENT:
6587 /* The minimum usage per wave is (num_inputs * 48). The maximum
6588 * usage is (num_inputs * 48 * 16).
6589 * We can get anything in between and it varies between waves.
6590 *
6591 * The 48 bytes per input for a single primitive is equal to
6592 * 4 bytes/component * 4 components/input * 3 points.
6593 *
6594 * Other stages don't know the size at compile time or don't
6595 * allocate LDS per wave, but instead they do it per thread group.
6596 */
6597 lds_per_wave = conf->lds_size * lds_increment +
6598 align(num_inputs * 48, lds_increment);
6599 break;
6600 case PIPE_SHADER_COMPUTE:
6601 if (shader->selector) {
6602 unsigned max_workgroup_size =
6603 si_get_max_workgroup_size(shader);
6604 lds_per_wave = (conf->lds_size * lds_increment) /
6605 DIV_ROUND_UP(max_workgroup_size, 64);
6606 }
6607 break;
6608 }
6609
6610 /* Compute the per-SIMD wave counts. */
6611 if (conf->num_sgprs) {
6612 if (sscreen->b.chip_class >= VI)
6613 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6614 else
6615 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6616 }
6617
6618 if (conf->num_vgprs)
6619 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6620
6621 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
6622 * 16KB makes some SIMDs unoccupied). */
6623 if (lds_per_wave)
6624 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
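	/* Illustrative example: with num_sgprs = 40 on VI the SGPR cap is
	 * 800 / 40 = 20 (above the initial 10), while num_vgprs = 64 gives
	 * 256 / 64 = 4, so max_simd_waves ends up as 4. */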
6625
6626 if (!check_debug_option ||
6627 r600_can_dump_shader(&sscreen->b, processor)) {
6628 if (processor == PIPE_SHADER_FRAGMENT) {
6629 fprintf(file, "*** SHADER CONFIG ***\n"
6630 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6631 "SPI_PS_INPUT_ENA = 0x%04x\n",
6632 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6633 }
6634
6635 fprintf(file, "*** SHADER STATS ***\n"
6636 "SGPRS: %d\n"
6637 "VGPRS: %d\n"
6638 "Spilled SGPRs: %d\n"
6639 "Spilled VGPRs: %d\n"
6640 "Private memory VGPRs: %d\n"
6641 "Code Size: %d bytes\n"
6642 "LDS: %d blocks\n"
6643 "Scratch: %d bytes per wave\n"
6644 "Max Waves: %d\n"
6645 "********************\n\n\n",
6646 conf->num_sgprs, conf->num_vgprs,
6647 conf->spilled_sgprs, conf->spilled_vgprs,
6648 conf->private_mem_vgprs, code_size,
6649 conf->lds_size, conf->scratch_bytes_per_wave,
6650 max_simd_waves);
6651 }
6652
6653 pipe_debug_message(debug, SHADER_INFO,
6654 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6655 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6656 "Spilled VGPRs: %d PrivMem VGPRs: %d",
6657 conf->num_sgprs, conf->num_vgprs, code_size,
6658 conf->lds_size, conf->scratch_bytes_per_wave,
6659 max_simd_waves, conf->spilled_sgprs,
6660 conf->spilled_vgprs, conf->private_mem_vgprs);
6661 }
6662
6663 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6664 {
6665 switch (processor) {
6666 case PIPE_SHADER_VERTEX:
6667 if (shader->key.as_es)
6668 return "Vertex Shader as ES";
6669 else if (shader->key.as_ls)
6670 return "Vertex Shader as LS";
6671 else
6672 return "Vertex Shader as VS";
6673 case PIPE_SHADER_TESS_CTRL:
6674 return "Tessellation Control Shader";
6675 case PIPE_SHADER_TESS_EVAL:
6676 if (shader->key.as_es)
6677 return "Tessellation Evaluation Shader as ES";
6678 else
6679 return "Tessellation Evaluation Shader as VS";
6680 case PIPE_SHADER_GEOMETRY:
6681 if (shader->is_gs_copy_shader)
6682 return "GS Copy Shader as VS";
6683 else
6684 return "Geometry Shader";
6685 case PIPE_SHADER_FRAGMENT:
6686 return "Pixel Shader";
6687 case PIPE_SHADER_COMPUTE:
6688 return "Compute Shader";
6689 default:
6690 return "Unknown Shader";
6691 }
6692 }
6693
6694 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6695 struct pipe_debug_callback *debug, unsigned processor,
6696 FILE *file, bool check_debug_option)
6697 {
6698 if (!check_debug_option ||
6699 r600_can_dump_shader(&sscreen->b, processor))
6700 si_dump_shader_key(processor, shader, file);
6701
6702 if (!check_debug_option && shader->binary.llvm_ir_string) {
6703 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6704 si_get_shader_name(shader, processor));
6705 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6706 }
6707
6708 if (!check_debug_option ||
6709 (r600_can_dump_shader(&sscreen->b, processor) &&
6710 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6711 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6712
6713 if (shader->prolog)
6714 si_shader_dump_disassembly(&shader->prolog->binary,
6715 debug, "prolog", file);
6716 if (shader->previous_stage)
6717 si_shader_dump_disassembly(&shader->previous_stage->binary,
6718 debug, "previous stage", file);
6719 if (shader->prolog2)
6720 si_shader_dump_disassembly(&shader->prolog2->binary,
6721 debug, "prolog2", file);
6722
6723 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6724
6725 if (shader->epilog)
6726 si_shader_dump_disassembly(&shader->epilog->binary,
6727 debug, "epilog", file);
6728 fprintf(file, "\n");
6729 }
6730
6731 si_shader_dump_stats(sscreen, shader, debug, processor, file,
6732 check_debug_option);
6733 }
6734
6735 int si_compile_llvm(struct si_screen *sscreen,
6736 struct ac_shader_binary *binary,
6737 struct si_shader_config *conf,
6738 LLVMTargetMachineRef tm,
6739 LLVMModuleRef mod,
6740 struct pipe_debug_callback *debug,
6741 unsigned processor,
6742 const char *name)
6743 {
6744 int r = 0;
6745 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6746
6747 if (r600_can_dump_shader(&sscreen->b, processor)) {
6748 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6749
6750 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6751 fprintf(stderr, "%s LLVM IR:\n\n", name);
6752 ac_dump_module(mod);
6753 fprintf(stderr, "\n");
6754 }
6755 }
6756
6757 if (sscreen->record_llvm_ir) {
6758 char *ir = LLVMPrintModuleToString(mod);
6759 binary->llvm_ir_string = strdup(ir);
6760 LLVMDisposeMessage(ir);
6761 }
6762
6763 if (!si_replace_shader(count, binary)) {
6764 r = si_llvm_compile(mod, binary, tm, debug);
6765 if (r)
6766 return r;
6767 }
6768
6769 si_shader_binary_read_config(binary, conf, 0);
6770
6771 /* Enable 64-bit and 16-bit denormals, because there is no performance
6772 * cost.
6773 *
6774 * If denormals are enabled, all floating-point output modifiers are
6775 * ignored.
6776 *
6777 * Don't enable denormals for 32-bit floats, because:
6778 * - Floating-point output modifiers would be ignored by the hw.
6779 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6780 * have to stop using those.
6781 * - SI & CI would be very slow.
6782 */
6783 conf->float_mode |= V_00B028_FP_64_DENORMS;
6784
6785 FREE(binary->config);
6786 FREE(binary->global_symbol_offsets);
6787 binary->config = NULL;
6788 binary->global_symbol_offsets = NULL;
6789
6790 /* Some shaders can't have rodata because their binaries can be
6791 * concatenated.
6792 */
6793 if (binary->rodata_size &&
6794 (processor == PIPE_SHADER_VERTEX ||
6795 processor == PIPE_SHADER_TESS_CTRL ||
6796 processor == PIPE_SHADER_TESS_EVAL ||
6797 processor == PIPE_SHADER_FRAGMENT)) {
6798		fprintf(stderr, "radeonsi: The shader can't have rodata.\n");
6799 return -EINVAL;
6800 }
6801
6802 return r;
6803 }
6804
6805 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6806 {
6807 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6808 LLVMBuildRetVoid(ctx->gallivm.builder);
6809 else
6810 LLVMBuildRet(ctx->gallivm.builder, ret);
6811 }
6812
6813 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6814 struct si_shader *
6815 si_generate_gs_copy_shader(struct si_screen *sscreen,
6816 LLVMTargetMachineRef tm,
6817 struct si_shader_selector *gs_selector,
6818 struct pipe_debug_callback *debug)
6819 {
6820 struct si_shader_context ctx;
6821 struct si_shader *shader;
6822 struct gallivm_state *gallivm = &ctx.gallivm;
6823 LLVMBuilderRef builder;
6824 struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
6825 struct lp_build_context *uint = &bld_base->uint_bld;
6826 struct si_shader_output_values *outputs;
6827 struct tgsi_shader_info *gsinfo = &gs_selector->info;
6828 int i, r;
6829
6830 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6831
6832 if (!outputs)
6833 return NULL;
6834
6835 shader = CALLOC_STRUCT(si_shader);
6836 if (!shader) {
6837 FREE(outputs);
6838 return NULL;
6839 }
6840
6842 shader->selector = gs_selector;
6843 shader->is_gs_copy_shader = true;
6844
6845 si_init_shader_ctx(&ctx, sscreen, tm);
6846 ctx.shader = shader;
6847 ctx.type = PIPE_SHADER_VERTEX;
6848
6849 builder = gallivm->builder;
6850
6851 create_function(&ctx);
6852 preload_ring_buffers(&ctx);
6853
6854 LLVMValueRef voffset =
6855 lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
6856 ctx.param_vertex_id), 4);
6857
6858	/* Fetch the vertex stream ID. */
6859 LLVMValueRef stream_id;
6860
6861 if (gs_selector->so.num_outputs)
6862 stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
6863 else
6864 stream_id = ctx.i32_0;
6865
6866 /* Fill in output information. */
6867 for (i = 0; i < gsinfo->num_outputs; ++i) {
6868 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
6869 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
6870
6871 for (int chan = 0; chan < 4; chan++) {
6872 outputs[i].vertex_stream[chan] =
6873 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
6874 }
6875 }
6876
6877 LLVMBasicBlockRef end_bb;
6878 LLVMValueRef switch_inst;
6879
6880 end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
6881 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
6882
6883 for (int stream = 0; stream < 4; stream++) {
6884 LLVMBasicBlockRef bb;
6885 unsigned offset;
6886
6887 if (!gsinfo->num_stream_output_components[stream])
6888 continue;
6889
6890 if (stream > 0 && !gs_selector->so.num_outputs)
6891 continue;
6892
6893 bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
6894 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
6895 LLVMPositionBuilderAtEnd(builder, bb);
6896
6897 /* Fetch vertex data from GSVS ring */
6898 offset = 0;
6899 for (i = 0; i < gsinfo->num_outputs; ++i) {
6900 for (unsigned chan = 0; chan < 4; chan++) {
6901 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
6902 outputs[i].vertex_stream[chan] != stream) {
6903 outputs[i].values[chan] = ctx.bld_base.base.undef;
6904 continue;
6905 }
6906
6907 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
6908 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
6909 offset++;
6910
6911 outputs[i].values[chan] =
6912 ac_build_buffer_load(&ctx.ac,
6913 ctx.gsvs_ring[0], 1,
6914 ctx.i32_0, voffset,
6915 soffset, 0, 1, 1, true);
6916 }
6917 }
6918
6919 /* Streamout and exports. */
6920 if (gs_selector->so.num_outputs) {
6921 si_llvm_emit_streamout(&ctx, outputs,
6922 gsinfo->num_outputs,
6923 stream);
6924 }
6925
6926 if (stream == 0)
6927 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6928
6929 LLVMBuildBr(builder, end_bb);
6930 }
6931
6932 LLVMPositionBuilderAtEnd(builder, end_bb);
6933
6934 LLVMBuildRetVoid(gallivm->builder);
6935
6936 /* Dump LLVM IR before any optimization passes */
6937 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6938 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6939 ac_dump_module(ctx.gallivm.module);
6940
6941 si_llvm_finalize_module(&ctx,
6942 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6943
6944 r = si_compile_llvm(sscreen, &ctx.shader->binary,
6945 &ctx.shader->config, ctx.tm,
6946 ctx.gallivm.module,
6947 debug, PIPE_SHADER_GEOMETRY,
6948 "GS Copy Shader");
6949 if (!r) {
6950 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6951 fprintf(stderr, "GS Copy Shader:\n");
6952 si_shader_dump(sscreen, ctx.shader, debug,
6953 PIPE_SHADER_GEOMETRY, stderr, true);
6954 r = si_shader_binary_upload(sscreen, ctx.shader);
6955 }
6956
6957 si_llvm_dispose(&ctx);
6958
6959 FREE(outputs);
6960
6961 if (r != 0) {
6962 FREE(shader);
6963 shader = NULL;
6964 }
6965 return shader;
6966 }
6967
6968 static void si_dump_shader_key_vs(struct si_shader_key *key,
6969 struct si_vs_prolog_bits *prolog,
6970 const char *prefix, FILE *f)
6971 {
6972 fprintf(f, " %s.instance_divisors = {", prefix);
6973 for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
6974 fprintf(f, !i ? "%u" : ", %u",
6975 prolog->instance_divisors[i]);
6976 }
6977 fprintf(f, "}\n");
6978
6979 fprintf(f, " mono.vs.fix_fetch = {");
6980 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
6981 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
6982 fprintf(f, "}\n");
6983 }
6984
6985 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
6986 FILE *f)
6987 {
6988 struct si_shader_key *key = &shader->key;
6989
6990 fprintf(f, "SHADER KEY\n");
6991
6992 switch (processor) {
6993 case PIPE_SHADER_VERTEX:
6994 si_dump_shader_key_vs(key, &key->part.vs.prolog,
6995 "part.vs.prolog", f);
6996 fprintf(f, " as_es = %u\n", key->as_es);
6997 fprintf(f, " as_ls = %u\n", key->as_ls);
6998 fprintf(f, " part.vs.epilog.export_prim_id = %u\n",
6999 key->part.vs.epilog.export_prim_id);
7000 break;
7001
7002 case PIPE_SHADER_TESS_CTRL:
7003 if (shader->selector->screen->b.chip_class >= GFX9) {
7004 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
7005 "part.tcs.ls_prolog", f);
7006 }
7007 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
7008 fprintf(f, " mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
7009 break;
7010
7011 case PIPE_SHADER_TESS_EVAL:
7012 fprintf(f, " part.tes.epilog.export_prim_id = %u\n", key->part.tes.epilog.export_prim_id);
7013 fprintf(f, " as_es = %u\n", key->as_es);
7014 break;
7015
7016 case PIPE_SHADER_GEOMETRY:
7017 if (shader->is_gs_copy_shader)
7018 break;
7019
7020 if (shader->selector->screen->b.chip_class >= GFX9 &&
7021 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
7022 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
7023 "part.gs.vs_prolog", f);
7024 }
7025 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
7026 break;
7027
7028 case PIPE_SHADER_COMPUTE:
7029 break;
7030
7031 case PIPE_SHADER_FRAGMENT:
7032 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
7033 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
7034 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
7035 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
7036 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
7037 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
7038 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
7039 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
7040 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
7041 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
7042 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
7043 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
7044 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
7045 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
7046 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
7047 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
7048 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
7049 break;
7050
7051 default:
7052 assert(0);
7053 }
7054
7055 if ((processor == PIPE_SHADER_GEOMETRY ||
7056 processor == PIPE_SHADER_TESS_EVAL ||
7057 processor == PIPE_SHADER_VERTEX) &&
7058 !key->as_es && !key->as_ls) {
7059 fprintf(f, " opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
7060 fprintf(f, " opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
7061 fprintf(f, " opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
7062 }
7063 }
7064
7065 static void si_init_shader_ctx(struct si_shader_context *ctx,
7066 struct si_screen *sscreen,
7067 LLVMTargetMachineRef tm)
7068 {
7069 struct lp_build_tgsi_context *bld_base;
7070 struct lp_build_tgsi_action tmpl = {};
7071
7072 si_llvm_context_init(ctx, sscreen, tm);
7073
7074 bld_base = &ctx->bld_base;
7075 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
7076
7077 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
7078 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
7079 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
7080
7081 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
7082 bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
7083 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
7084 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
7085 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
7086 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
7087 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
7088 bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
7089 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
7090 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
7091 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
7092 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
7093 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
7094 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
7095 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
7096 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
7097
7098 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
7099 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
7100 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
7101 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
7102 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
7103 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
7104
7105 tmpl.fetch_args = atomic_fetch_args;
7106 tmpl.emit = atomic_emit;
7107 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
7108 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
7109 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
7110 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
7111 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
7112 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
7113 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
7114 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
7115 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
7116 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
7117 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
7118 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
7119 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
7120 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
7121 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
7122 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
7123 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
7124 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
7125 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
7126 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
7127
7128 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
7129
7130 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
7131
7132 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
7133 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
7134 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
7135 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
7136
7137 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
7138 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
7139 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
7140 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
7141 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
7142 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
7143 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
7144 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
7145 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
7146
7147 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
7148 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
7149 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
7150 }
7151
7152 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
7153 {
7154 struct si_shader *shader = ctx->shader;
7155 struct tgsi_shader_info *info = &shader->selector->info;
7156
7157 if (ctx->type == PIPE_SHADER_FRAGMENT ||
7158 ctx->type == PIPE_SHADER_COMPUTE ||
7159 shader->key.as_es ||
7160 shader->key.as_ls)
7161 return;
7162
7163 ac_eliminate_const_vs_outputs(&ctx->ac,
7164 ctx->main_fn,
7165 shader->info.vs_output_param_offset,
7166 info->num_outputs,
7167 &shader->info.nr_param_exports);
7168 }
7169
7170 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
7171 {
7172 ctx->shader->config.private_mem_vgprs = 0;
7173
7174 /* Process all LLVM instructions. */
7175 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
7176 while (bb) {
7177 LLVMValueRef next = LLVMGetFirstInstruction(bb);
7178
7179 while (next) {
7180 LLVMValueRef inst = next;
7181 next = LLVMGetNextInstruction(next);
7182
7183 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
7184 continue;
7185
7186 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
7187 /* No idea why LLVM aligns allocas to 4 elements. */
7188 unsigned alignment = LLVMGetAlignment(inst);
7189 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
7190 ctx->shader->config.private_mem_vgprs += dw_size;
7191 }
7192 bb = LLVMGetNextBasicBlock(bb);
7193 }
7194 }
7195
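/* Set EXEC at the start of a shader part. Per LLVM's description of
 * llvm.amdgcn.init.exec.from.input (assumed here), the intrinsic reads
 * a 7-bit thread count at "bitoffset" in the given SGPR parameter and
 * enables that many lanes.
 */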
7196 static void si_init_exec_from_input(struct si_shader_context *ctx,
7197 unsigned param, unsigned bitoffset)
7198 {
7199 LLVMValueRef args[] = {
7200 LLVMGetParam(ctx->main_fn, param),
7201 LLVMConstInt(ctx->i32, bitoffset, 0),
7202 };
7203 lp_build_intrinsic(ctx->gallivm.builder,
7204 "llvm.amdgcn.init.exec.from.input",
7205 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
7206 }
7207
7208 static bool si_compile_tgsi_main(struct si_shader_context *ctx,
7209 bool is_monolithic)
7210 {
7211 struct si_shader *shader = ctx->shader;
7212 struct si_shader_selector *sel = shader->selector;
7213 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7214
7215 switch (ctx->type) {
7216 case PIPE_SHADER_VERTEX:
7217 ctx->load_input = declare_input_vs;
7218 if (shader->key.as_ls)
7219 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
7220 else if (shader->key.as_es)
7221 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7222 else
7223 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7224 break;
7225 case PIPE_SHADER_TESS_CTRL:
7226 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
7227 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
7228 bld_base->emit_store = store_output_tcs;
7229 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
7230 break;
7231 case PIPE_SHADER_TESS_EVAL:
7232 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
7233 if (shader->key.as_es)
7234 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
7235 else
7236 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
7237 break;
7238 case PIPE_SHADER_GEOMETRY:
7239 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
7240 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
7241 break;
7242 case PIPE_SHADER_FRAGMENT:
7243 ctx->load_input = declare_input_fs;
7244 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
7245 break;
7246 case PIPE_SHADER_COMPUTE:
7247 ctx->declare_memory_region = declare_compute_memory;
7248 break;
7249 default:
7250 assert(!"Unsupported shader type");
7251 return false;
7252 }
7253
7254 create_function(ctx);
7255 preload_ring_buffers(ctx);
7256
7257 /* For GFX9 merged shaders:
7258 * - Set EXEC. If the prolog is present, set EXEC there instead.
7259 * - Add a barrier before the second shader.
7260 *
7261 * The same thing for monolithic shaders is done in
7262 * si_build_wrapper_function.
7263 */
7264 if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
7265 if (sel->info.num_instructions > 1 && /* not empty shader */
7266 (shader->key.as_es || shader->key.as_ls) &&
7267 (ctx->type == PIPE_SHADER_TESS_EVAL ||
7268 (ctx->type == PIPE_SHADER_VERTEX &&
7269 !sel->vs_needs_prolog))) {
7270 si_init_exec_from_input(ctx,
7271 ctx->param_merged_wave_info, 0);
7272 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
7273 ctx->type == PIPE_SHADER_GEOMETRY) {
7274 si_init_exec_from_input(ctx,
7275 ctx->param_merged_wave_info, 8);
7276 si_llvm_emit_barrier(NULL, bld_base, NULL);
7277 }
7278 }
7279
7280 if (ctx->type == PIPE_SHADER_GEOMETRY) {
7281 int i;
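		/* One vertex counter per stream: the EMIT opcode increments
		 * it and uses it to compute GSVS ring store offsets (see
		 * si_llvm_emit_vertex). */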
7282 for (i = 0; i < 4; i++) {
7283 ctx->gs_next_vertex[i] =
7284 lp_build_alloca(&ctx->gallivm,
7285 ctx->i32, "");
7286 }
7287 }
7288
7289 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
7290 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
7291 return false;
7292 }
7293
7294 si_llvm_build_ret(ctx, ctx->return_value);
7295 return true;
7296 }
7297
7298 /**
7299 * Compute the VS prolog key, which contains all the information needed to
7300 * build the VS prolog function, and set shader->info bits where needed.
7301 *
7302 * \param info Shader info of the vertex shader.
7303 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
7304 * \param prolog_key Key of the VS prolog
7305 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
7306 * \param key Output shader part key.
7307 */
7308 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7309 unsigned num_input_sgprs,
7310 const struct si_vs_prolog_bits *prolog_key,
7311 struct si_shader *shader_out,
7312 union si_shader_part_key *key)
7313 {
7314 memset(key, 0, sizeof(*key));
7315 key->vs_prolog.states = *prolog_key;
7316 key->vs_prolog.num_input_sgprs = num_input_sgprs;
7317 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7318
7319 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL)
7320 key->vs_prolog.num_merged_next_stage_vgprs = 2;
7321
7322 /* Set the instanceID flag. */
7323 for (unsigned i = 0; i < info->num_inputs; i++)
7324 if (key->vs_prolog.states.instance_divisors[i])
7325 shader_out->info.uses_instanceid = true;
7326 }
7327
7328 /**
7329 * Compute the VS epilog key, which contains all the information needed to
7330 * build the VS epilog function, and set the PrimitiveID output offset.
7331 */
7332 static void si_get_vs_epilog_key(struct si_shader *shader,
7333 struct si_vs_epilog_bits *states,
7334 union si_shader_part_key *key)
7335 {
7336 memset(key, 0, sizeof(*key));
7337 key->vs_epilog.states = *states;
7338
7339 /* Set up the PrimitiveID output. */
7340 if (shader->key.part.vs.epilog.export_prim_id) {
7341 unsigned index = shader->selector->info.num_outputs;
7342 unsigned offset = shader->info.nr_param_exports++;
7343
7344 key->vs_epilog.prim_id_param_offset = offset;
7345 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7346 shader->info.vs_output_param_offset[index] = offset;
7347 }
7348 }
7349
7350 /**
7351 * Compute the PS prolog key, which contains all the information needed to
7352 * build the PS prolog function, and set related bits in shader->config.
7353 */
7354 static void si_get_ps_prolog_key(struct si_shader *shader,
7355 union si_shader_part_key *key,
7356 bool separate_prolog)
7357 {
7358 struct tgsi_shader_info *info = &shader->selector->info;
7359
7360 memset(key, 0, sizeof(*key));
7361 key->ps_prolog.states = shader->key.part.ps.prolog;
7362 key->ps_prolog.colors_read = info->colors_read;
7363 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7364 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7365 key->ps_prolog.wqm = info->uses_derivatives &&
7366 (key->ps_prolog.colors_read ||
7367 key->ps_prolog.states.force_persp_sample_interp ||
7368 key->ps_prolog.states.force_linear_sample_interp ||
7369 key->ps_prolog.states.force_persp_center_interp ||
7370 key->ps_prolog.states.force_linear_center_interp ||
7371 key->ps_prolog.states.bc_optimize_for_persp ||
7372 key->ps_prolog.states.bc_optimize_for_linear);
7373
7374 if (info->colors_read) {
7375 unsigned *color = shader->selector->color_attr_index;
7376
7377 if (shader->key.part.ps.prolog.color_two_side) {
7378 /* BCOLORs are stored after the last input. */
7379 key->ps_prolog.num_interp_inputs = info->num_inputs;
7380 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7381 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7382 }
7383
7384 for (unsigned i = 0; i < 2; i++) {
7385 unsigned interp = info->input_interpolate[color[i]];
7386 unsigned location = info->input_interpolate_loc[color[i]];
7387
7388 if (!(info->colors_read & (0xf << i*4)))
7389 continue;
7390
7391 key->ps_prolog.color_attr_index[i] = color[i];
7392
7393 if (shader->key.part.ps.prolog.flatshade_colors &&
7394 interp == TGSI_INTERPOLATE_COLOR)
7395 interp = TGSI_INTERPOLATE_CONSTANT;
7396
7397 switch (interp) {
7398 case TGSI_INTERPOLATE_CONSTANT:
7399 key->ps_prolog.color_interp_vgpr_index[i] = -1;
7400 break;
7401 case TGSI_INTERPOLATE_PERSPECTIVE:
7402 case TGSI_INTERPOLATE_COLOR:
7403 /* Force the interpolation location for colors here. */
7404 if (shader->key.part.ps.prolog.force_persp_sample_interp)
7405 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7406 if (shader->key.part.ps.prolog.force_persp_center_interp)
7407 location = TGSI_INTERPOLATE_LOC_CENTER;
7408
7409 switch (location) {
7410 case TGSI_INTERPOLATE_LOC_SAMPLE:
7411 key->ps_prolog.color_interp_vgpr_index[i] = 0;
7412 shader->config.spi_ps_input_ena |=
7413 S_0286CC_PERSP_SAMPLE_ENA(1);
7414 break;
7415 case TGSI_INTERPOLATE_LOC_CENTER:
7416 key->ps_prolog.color_interp_vgpr_index[i] = 2;
7417 shader->config.spi_ps_input_ena |=
7418 S_0286CC_PERSP_CENTER_ENA(1);
7419 break;
7420 case TGSI_INTERPOLATE_LOC_CENTROID:
7421 key->ps_prolog.color_interp_vgpr_index[i] = 4;
7422 shader->config.spi_ps_input_ena |=
7423 S_0286CC_PERSP_CENTROID_ENA(1);
7424 break;
7425 default:
7426 assert(0);
7427 }
7428 break;
7429 case TGSI_INTERPOLATE_LINEAR:
7430 /* Force the interpolation location for colors here. */
7431 if (shader->key.part.ps.prolog.force_linear_sample_interp)
7432 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7433 if (shader->key.part.ps.prolog.force_linear_center_interp)
7434 location = TGSI_INTERPOLATE_LOC_CENTER;
7435
7436 /* The VGPR assignment for non-monolithic shaders
7437 * works because InitialPSInputAddr is set on the
7438 * main shader and PERSP_PULL_MODEL is never used.
7439 */
7440 switch (location) {
7441 case TGSI_INTERPOLATE_LOC_SAMPLE:
7442 key->ps_prolog.color_interp_vgpr_index[i] =
7443 separate_prolog ? 6 : 9;
7444 shader->config.spi_ps_input_ena |=
7445 S_0286CC_LINEAR_SAMPLE_ENA(1);
7446 break;
7447 case TGSI_INTERPOLATE_LOC_CENTER:
7448 key->ps_prolog.color_interp_vgpr_index[i] =
7449 separate_prolog ? 8 : 11;
7450 shader->config.spi_ps_input_ena |=
7451 S_0286CC_LINEAR_CENTER_ENA(1);
7452 break;
7453 case TGSI_INTERPOLATE_LOC_CENTROID:
7454 key->ps_prolog.color_interp_vgpr_index[i] =
7455 separate_prolog ? 10 : 13;
7456 shader->config.spi_ps_input_ena |=
7457 S_0286CC_LINEAR_CENTROID_ENA(1);
7458 break;
7459 default:
7460 assert(0);
7461 }
7462 break;
7463 default:
7464 assert(0);
7465 }
7466 }
7467 }
7468 }
7469
7470 /**
7471 * Check whether a PS prolog is required based on the key.
7472 */
7473 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7474 {
7475 return key->ps_prolog.colors_read ||
7476 key->ps_prolog.states.force_persp_sample_interp ||
7477 key->ps_prolog.states.force_linear_sample_interp ||
7478 key->ps_prolog.states.force_persp_center_interp ||
7479 key->ps_prolog.states.force_linear_center_interp ||
7480 key->ps_prolog.states.bc_optimize_for_persp ||
7481 key->ps_prolog.states.bc_optimize_for_linear ||
7482 key->ps_prolog.states.poly_stipple;
7483 }
7484
7485 /**
7486 * Compute the PS epilog key, which contains all the information needed to
7487 * build the PS epilog function.
7488 */
7489 static void si_get_ps_epilog_key(struct si_shader *shader,
7490 union si_shader_part_key *key)
7491 {
7492 struct tgsi_shader_info *info = &shader->selector->info;
7493 memset(key, 0, sizeof(*key));
7494 key->ps_epilog.colors_written = info->colors_written;
7495 key->ps_epilog.writes_z = info->writes_z;
7496 key->ps_epilog.writes_stencil = info->writes_stencil;
7497 key->ps_epilog.writes_samplemask = info->writes_samplemask;
7498 key->ps_epilog.states = shader->key.part.ps.epilog;
7499 }
7500
7501 /**
7502 * Build the GS prolog function. Rotate the input vertices for triangle strips
7503 * with adjacency.
7504 */
7505 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
7506 union si_shader_part_key *key)
7507 {
7508 const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
7509 const unsigned num_vgprs = 8;
7510 struct gallivm_state *gallivm = &ctx->gallivm;
7511 LLVMBuilderRef builder = gallivm->builder;
7512 LLVMTypeRef params[32];
7513 LLVMTypeRef returns[32];
7514 LLVMValueRef func, ret;
7515
7516 for (unsigned i = 0; i < num_sgprs; ++i) {
7517 params[i] = ctx->i32;
7518 returns[i] = ctx->i32;
7519 }
7520
7521 for (unsigned i = 0; i < num_vgprs; ++i) {
7522 params[num_sgprs + i] = ctx->i32;
7523 returns[num_sgprs + i] = ctx->f32;
7524 }
7525
7526 /* Create the function. */
7527 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
7528 params, num_sgprs + num_vgprs, num_sgprs - 1);
7529 func = ctx->main_fn;
7530
7531	/* Copy inputs to outputs. This should be a no-op, as the registers match,
7532 * but it will prevent the compiler from overwriting them unintentionally.
7533 */
7534 ret = ctx->return_value;
7535 for (unsigned i = 0; i < num_sgprs; i++) {
7536 LLVMValueRef p = LLVMGetParam(func, i);
7537 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
7538 }
7539 for (unsigned i = 0; i < num_vgprs; i++) {
7540 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
7541 p = LLVMBuildBitCast(builder, p, ctx->f32, "");
7542 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
7543 }
7544
7545 if (key->gs_prolog.states.tri_strip_adj_fix) {
7546 /* Remap the input vertices for every other primitive. */
7547 const unsigned vtx_params[6] = {
7548 num_sgprs,
7549 num_sgprs + 1,
7550 num_sgprs + 3,
7551 num_sgprs + 4,
7552 num_sgprs + 5,
7553 num_sgprs + 6
7554 };
7555 LLVMValueRef prim_id, rotate;
7556
7557 prim_id = LLVMGetParam(func, num_sgprs + 2);
7558 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
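		/* Bit 0 of PrimitiveID selects odd primitives, which use the
		 * vertex order rotated by 4 (out of the 6 adjacency
		 * vertices). */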
7559
7560 for (unsigned i = 0; i < 6; ++i) {
7561 LLVMValueRef base, rotated, actual;
7562 base = LLVMGetParam(func, vtx_params[i]);
7563 rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
7564 actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
7565 actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
7566 ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
7567 }
7568 }
7569
7570 LLVMBuildRet(builder, ret);
7571 }
7572
7573 /**
7574 * Given a list of shader part functions, build a wrapper function that
7575 * runs them in sequence to form a monolithic shader.
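 *
 * For example, a monolithic pixel shader with a prolog uses
 * parts = {prolog, main, epilog}, main_part = 1 and
 * next_shader_first_part = 0 (there is no merged second shader);
 * see the call sites in si_compile_tgsi_shader.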
7576 */
7577 static void si_build_wrapper_function(struct si_shader_context *ctx,
7578 LLVMValueRef *parts,
7579 unsigned num_parts,
7580 unsigned main_part,
7581 unsigned next_shader_first_part)
7582 {
7583 struct gallivm_state *gallivm = &ctx->gallivm;
7584 LLVMBuilderRef builder = ctx->gallivm.builder;
7585 /* PS epilog has one arg per color component */
7586 LLVMTypeRef param_types[48];
7587 LLVMValueRef initial[48], out[48];
7588 LLVMTypeRef function_type;
7589 unsigned num_params;
7590 unsigned num_out, initial_num_out;
7591 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
7592 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
7593 unsigned num_sgprs, num_vgprs;
7594 unsigned last_sgpr_param;
7595 unsigned gprs;
7596 struct lp_build_if_state if_state;
7597
7598 for (unsigned i = 0; i < num_parts; ++i) {
7599 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
7600 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
7601 }
7602
7603 /* The parameters of the wrapper function correspond to those of the
7604 * first part in terms of SGPRs and VGPRs, but we use the types of the
7605 * main part to get the right types. This is relevant for the
7606 * dereferenceable attribute on descriptor table pointers.
7607 */
7608 num_sgprs = 0;
7609 num_vgprs = 0;
7610
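	/* LLVM function values have pointer type; strip the pointer to get
	 * the function type for parameter counting. */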
7611 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
7612 num_params = LLVMCountParamTypes(function_type);
7613
7614 for (unsigned i = 0; i < num_params; ++i) {
7615 LLVMValueRef param = LLVMGetParam(parts[0], i);
7616
7617 if (ac_is_sgpr_param(param)) {
7618 assert(num_vgprs == 0);
7619 num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7620 } else {
7621 num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
7622 }
7623 }
7624 assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));
7625
7626 num_params = 0;
7627 last_sgpr_param = 0;
7628 gprs = 0;
7629 while (gprs < num_sgprs + num_vgprs) {
7630 LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
7631 unsigned size;
7632
7633 param_types[num_params] = LLVMTypeOf(param);
7634 if (gprs < num_sgprs)
7635 last_sgpr_param = num_params;
7636 size = llvm_get_type_size(param_types[num_params]) / 4;
7637 num_params++;
7638
7639 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
7640 assert(gprs + size <= num_sgprs + num_vgprs &&
7641 (gprs >= num_sgprs || gprs + size <= num_sgprs));
7642
7643 gprs += size;
7644 }
7645
7646 si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);
7647
7648 if (is_merged_shader(ctx->shader)) {
7649 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
7650 lp_build_intrinsic(ctx->gallivm.builder,
7651 "llvm.amdgcn.init.exec", ctx->voidt,
7652 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
7653 }
7654
7655 /* Record the arguments of the function as if they were an output of
7656 * a previous part.
7657 */
7658 num_out = 0;
7659 num_out_sgpr = 0;
7660
7661 for (unsigned i = 0; i < num_params; ++i) {
7662 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
7663 LLVMTypeRef param_type = LLVMTypeOf(param);
7664 LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
7665 unsigned size = llvm_get_type_size(param_type) / 4;
7666
7667 if (size == 1) {
7668 if (param_type != out_type)
7669 param = LLVMBuildBitCast(builder, param, out_type, "");
7670 out[num_out++] = param;
7671 } else {
7672 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
7673
7674 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7675 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
7676 param_type = ctx->i64;
7677 }
7678
7679 if (param_type != vector_type)
7680 param = LLVMBuildBitCast(builder, param, vector_type, "");
7681
7682 for (unsigned j = 0; j < size; ++j)
7683 out[num_out++] = LLVMBuildExtractElement(
7684 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
7685 }
7686
7687 if (i <= last_sgpr_param)
7688 num_out_sgpr = num_out;
7689 }
7690
7691 memcpy(initial, out, sizeof(out));
7692 initial_num_out = num_out;
7693 initial_num_out_sgpr = num_out_sgpr;
7694
7695 /* Now chain the parts. */
7696 for (unsigned part = 0; part < num_parts; ++part) {
7697 LLVMValueRef in[48];
7698 LLVMValueRef ret;
7699 LLVMTypeRef ret_type;
7700 unsigned out_idx = 0;
7701
7702 num_params = LLVMCountParams(parts[part]);
7703 assert(num_params <= ARRAY_SIZE(param_types));
7704
7705 /* Merged shaders are executed conditionally depending
7706 * on the number of enabled threads passed in the input SGPRs. */
7707 if (is_merged_shader(ctx->shader) &&
7708 (part == 0 || part == next_shader_first_part)) {
7709 LLVMValueRef ena, count = initial[3];
7710
7711 /* The thread count for the 2nd shader is at bit-offset 8. */
7712 if (part == next_shader_first_part) {
7713 count = LLVMBuildLShr(builder, count,
7714 LLVMConstInt(ctx->i32, 8, 0), "");
7715 }
7716 count = LLVMBuildAnd(builder, count,
7717 LLVMConstInt(ctx->i32, 0x7f, 0), "");
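			/* Enable only the lanes whose thread ID is below the
			 * enabled-thread count for this part. */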
7718 ena = LLVMBuildICmp(builder, LLVMIntULT,
7719 ac_get_thread_id(&ctx->ac), count, "");
7720 lp_build_if(&if_state, &ctx->gallivm, ena);
7721 }
7722
7723 /* Derive arguments for the next part from outputs of the
7724 * previous one.
7725 */
7726 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
7727 LLVMValueRef param;
7728 LLVMTypeRef param_type;
7729 bool is_sgpr;
7730 unsigned param_size;
7731 LLVMValueRef arg = NULL;
7732
7733 param = LLVMGetParam(parts[part], param_idx);
7734 param_type = LLVMTypeOf(param);
7735 param_size = llvm_get_type_size(param_type) / 4;
7736 is_sgpr = ac_is_sgpr_param(param);
7737
7738 if (is_sgpr) {
7739 #if HAVE_LLVM < 0x0400
7740 LLVMRemoveAttribute(param, LLVMByValAttribute);
7741 #else
7742 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
7743 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
7744 #endif
7745 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
7746 }
7747
7748 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
7749 assert(is_sgpr || out_idx >= num_out_sgpr);
7750
7751 if (param_size == 1)
7752 arg = out[out_idx];
7753 else
7754 arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);
7755
7756 if (LLVMTypeOf(arg) != param_type) {
7757 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
7758 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
7759 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
7760 } else {
7761 arg = LLVMBuildBitCast(builder, arg, param_type, "");
7762 }
7763 }
7764
7765 in[param_idx] = arg;
7766 out_idx += param_size;
7767 }
7768
7769 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
7770
7771 if (is_merged_shader(ctx->shader) &&
7772 (part + 1 == next_shader_first_part ||
7773 part + 1 == num_parts)) {
7774 lp_build_endif(&if_state);
7775
7776 if (part + 1 == next_shader_first_part) {
7777 /* A barrier is required between 2 merged shaders. */
7778 si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);
7779
7780 /* The second half of the merged shader should use
7781 * the inputs from the toplevel (wrapper) function,
7782 * not the return value from the last call.
7783 *
7784			 * That's because the last call was executed
7785			 * conditionally, so we can't consume it in the main
7786			 * block.
7787 */
7788 memcpy(out, initial, sizeof(initial));
7789 num_out = initial_num_out;
7790 num_out_sgpr = initial_num_out_sgpr;
7791 }
7792 continue;
7793 }
7794
7795 /* Extract the returned GPRs. */
7796 ret_type = LLVMTypeOf(ret);
7797 num_out = 0;
7798 num_out_sgpr = 0;
7799
7800 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
7801 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
7802
7803 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
7804
7805 for (unsigned i = 0; i < ret_size; ++i) {
7806 LLVMValueRef val =
7807 LLVMBuildExtractValue(builder, ret, i, "");
7808
7809 out[num_out++] = val;
7810
7811 if (LLVMTypeOf(val) == ctx->i32) {
7812 assert(num_out_sgpr + 1 == num_out);
7813 num_out_sgpr = num_out;
7814 }
7815 }
7816 }
7817 }
7818
7819 LLVMBuildRetVoid(builder);
7820 }
7821
7822 int si_compile_tgsi_shader(struct si_screen *sscreen,
7823 LLVMTargetMachineRef tm,
7824 struct si_shader *shader,
7825 bool is_monolithic,
7826 struct pipe_debug_callback *debug)
7827 {
7828 struct si_shader_selector *sel = shader->selector;
7829 struct si_shader_context ctx;
7830 int r = -1;
7831
7832 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
7833 * conversion fails. */
7834 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
7835 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
7836 tgsi_dump(sel->tokens, 0);
7837 si_dump_streamout(&sel->so);
7838 }
7839
7840 si_init_shader_ctx(&ctx, sscreen, tm);
7841 si_llvm_context_set_tgsi(&ctx, shader);
7842 ctx.separate_prolog = !is_monolithic;
7843
7844 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
7845 sizeof(shader->info.vs_output_param_offset));
7846
7847 shader->info.uses_instanceid = sel->info.uses_instanceid;
7848
7849 ctx.load_system_value = declare_system_value;
7850
7851 if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
7852 si_llvm_dispose(&ctx);
7853 return -1;
7854 }
7855
7856 if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
7857 LLVMValueRef parts[3];
7858 bool need_prolog;
7859 bool need_epilog;
7860
7861 need_prolog = sel->vs_needs_prolog;
7862 need_epilog = !shader->key.as_es && !shader->key.as_ls;
7863
7864 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7865
7866 if (need_prolog) {
7867 union si_shader_part_key prolog_key;
7868 si_get_vs_prolog_key(&sel->info,
7869 shader->info.num_input_sgprs,
7870 &shader->key.part.vs.prolog,
7871 shader, &prolog_key);
7872 si_build_vs_prolog_function(&ctx, &prolog_key);
7873 parts[0] = ctx.main_fn;
7874 }
7875
7876 if (need_epilog) {
7877 union si_shader_part_key epilog_key;
7878 si_get_vs_epilog_key(shader, &shader->key.part.vs.epilog, &epilog_key);
7879 si_build_vs_epilog_function(&ctx, &epilog_key);
7880 parts[need_prolog ? 2 : 1] = ctx.main_fn;
7881 }
7882
7883 si_build_wrapper_function(&ctx, parts, 1 + need_prolog + need_epilog,
7884 need_prolog ? 1 : 0, 0);
7885 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
7886 if (sscreen->b.chip_class >= GFX9) {
7887 struct si_shader_selector *ls = shader->key.part.tcs.ls;
7888 LLVMValueRef parts[4];
7889
7890 /* TCS main part */
7891 parts[2] = ctx.main_fn;
7892
7893 /* TCS epilog */
7894 union si_shader_part_key tcs_epilog_key;
7895 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
7896 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7897 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
7898 parts[3] = ctx.main_fn;
7899
7900 /* VS prolog */
7901 if (ls->vs_needs_prolog) {
7902 union si_shader_part_key vs_prolog_key;
7903 si_get_vs_prolog_key(&ls->info,
7904 shader->info.num_input_sgprs,
7905 &shader->key.part.tcs.ls_prolog,
7906 shader, &vs_prolog_key);
7907 vs_prolog_key.vs_prolog.is_monolithic = true;
7908 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
7909 parts[0] = ctx.main_fn;
7910 }
7911
7912 /* VS as LS main part */
7913 struct si_shader shader_ls = {};
7914 shader_ls.selector = ls;
7915 shader_ls.key.as_ls = 1;
7916 shader_ls.key.mono = shader->key.mono;
7917 shader_ls.key.opt = shader->key.opt;
7918 si_llvm_context_set_tgsi(&ctx, &shader_ls);
7919
7920 if (!si_compile_tgsi_main(&ctx, true)) {
7921 si_llvm_dispose(&ctx);
7922 return -1;
7923 }
7924 shader->info.uses_instanceid |= ls->info.uses_instanceid;
7925 parts[1] = ctx.main_fn;
7926
7927 /* Reset the shader context. */
7928 ctx.shader = shader;
7929 ctx.type = PIPE_SHADER_TESS_CTRL;
7930
7931 si_build_wrapper_function(&ctx,
7932 parts + !ls->vs_needs_prolog,
7933 4 - !ls->vs_needs_prolog, 0,
7934 ls->vs_needs_prolog ? 2 : 1);
7935 } else {
7936 LLVMValueRef parts[2];
7937 union si_shader_part_key epilog_key;
7938
7939 parts[0] = ctx.main_fn;
7940
7941 memset(&epilog_key, 0, sizeof(epilog_key));
7942 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7943 si_build_tcs_epilog_function(&ctx, &epilog_key);
7944 parts[1] = ctx.main_fn;
7945
7946 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
7947 }
7948 } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
7949 !shader->key.as_es) {
7950 LLVMValueRef parts[2];
7951 union si_shader_part_key epilog_key;
7952
7953 parts[0] = ctx.main_fn;
7954
7955 si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, &epilog_key);
7956 si_build_vs_epilog_function(&ctx, &epilog_key);
7957 parts[1] = ctx.main_fn;
7958
7959 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
7960 } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
7961 LLVMValueRef parts[2];
7962 union si_shader_part_key prolog_key;
7963
7964 parts[1] = ctx.main_fn;
7965
7966 memset(&prolog_key, 0, sizeof(prolog_key));
7967 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7968 si_build_gs_prolog_function(&ctx, &prolog_key);
7969 parts[0] = ctx.main_fn;
7970
7971 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
7972 } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
7973 LLVMValueRef parts[3];
7974 union si_shader_part_key prolog_key;
7975 union si_shader_part_key epilog_key;
7976 bool need_prolog;
7977
7978 si_get_ps_prolog_key(shader, &prolog_key, false);
7979 need_prolog = si_need_ps_prolog(&prolog_key);
7980
7981 parts[need_prolog ? 1 : 0] = ctx.main_fn;
7982
7983 if (need_prolog) {
7984 si_build_ps_prolog_function(&ctx, &prolog_key);
7985 parts[0] = ctx.main_fn;
7986 }
7987
7988 si_get_ps_epilog_key(shader, &epilog_key);
7989 si_build_ps_epilog_function(&ctx, &epilog_key);
7990 parts[need_prolog ? 2 : 1] = ctx.main_fn;
7991
7992 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
7993 need_prolog ? 1 : 0, 0);
7994 }
7995
7996 /* Dump LLVM IR before any optimization passes */
7997 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
7998 r600_can_dump_shader(&sscreen->b, ctx.type))
7999 LLVMDumpModule(ctx.gallivm.module);
8000
8001 si_llvm_finalize_module(&ctx,
8002 r600_extra_shader_checks(&sscreen->b, ctx.type));
8003
8004 /* Post-optimization transformations and analysis. */
8005 si_eliminate_const_vs_outputs(&ctx);
8006
8007 if ((debug && debug->debug_message) ||
8008 r600_can_dump_shader(&sscreen->b, ctx.type))
8009 si_count_scratch_private_memory(&ctx);
8010
8011 /* Compile to bytecode. */
8012 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
8013 ctx.gallivm.module, debug, ctx.type, "TGSI shader");
8014 si_llvm_dispose(&ctx);
8015 if (r) {
8016 fprintf(stderr, "LLVM failed to compile shader\n");
8017 return r;
8018 }
8019
8020 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
8021	 * LLVM 3.9svn is known to produce shaders that exceed these limits.
8022 */
8023 if (sel->type == PIPE_SHADER_COMPUTE) {
8024 unsigned wave_size = 64;
8025 unsigned max_vgprs = 256;
8026 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
8027 unsigned max_sgprs_per_wave = 128;
8028 unsigned max_block_threads = si_get_max_workgroup_size(shader);
8029 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
8030 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
8031
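		/* Illustrative example: a 1024-thread block needs
		 * 1024 / 64 = 16 waves, i.e. 4 per SIMD, leaving
		 * 256 / 4 = 64 VGPRs and MIN2(800 / 4, 128) = 128 SGPRs
		 * on VI. */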
8032 max_vgprs = max_vgprs / min_waves_per_simd;
8033 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
8034
8035 if (shader->config.num_sgprs > max_sgprs ||
8036 shader->config.num_vgprs > max_vgprs) {
8037 fprintf(stderr, "LLVM failed to compile a shader correctly: "
8038 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
8039 shader->config.num_sgprs, shader->config.num_vgprs,
8040 max_sgprs, max_vgprs);
8041
8042 /* Just terminate the process, because dependent
8043 * shaders can hang due to bad input data, but use
8044 * the env var to allow shader-db to work.
8045 */
8046 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
8047 abort();
8048 }
8049 }
8050
8051 /* Add the scratch offset to input SGPRs. */
8052 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
8053 shader->info.num_input_sgprs += 1; /* scratch byte offset */
8054
8055 /* Calculate the number of fragment input VGPRs. */
8056 if (ctx.type == PIPE_SHADER_FRAGMENT) {
8057 shader->info.num_input_vgprs = 0;
8058 shader->info.face_vgpr_index = -1;
8059
8060 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8061 shader->info.num_input_vgprs += 2;
8062 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
8063 shader->info.num_input_vgprs += 2;
8064 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
8065 shader->info.num_input_vgprs += 2;
8066 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
8067 shader->info.num_input_vgprs += 3;
8068 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
8069 shader->info.num_input_vgprs += 2;
8070 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
8071 shader->info.num_input_vgprs += 2;
8072 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
8073 shader->info.num_input_vgprs += 2;
8074 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
8075 shader->info.num_input_vgprs += 1;
8076 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
8077 shader->info.num_input_vgprs += 1;
8078 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
8079 shader->info.num_input_vgprs += 1;
8080 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
8081 shader->info.num_input_vgprs += 1;
8082 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
8083 shader->info.num_input_vgprs += 1;
8084 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
8085 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
8086 shader->info.num_input_vgprs += 1;
8087 }
8088 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
8089 shader->info.num_input_vgprs += 1;
8090 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
8091 shader->info.num_input_vgprs += 1;
8092 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
8093 shader->info.num_input_vgprs += 1;
8094 }
8095
8096 return 0;
8097 }
8098
8099 /**
8100 * Create, compile and return a shader part (prolog or epilog).
8101 *
8102 * \param sscreen screen
8103 * \param list list of shader parts of the same category
8104 * \param type shader type
8105 * \param key shader part key
8106 * \param prolog whether the part being requested is a prolog
8107 * \param tm LLVM target machine
8108 * \param debug debug callback
8109 * \param build the callback responsible for building the main function
8110 * \return non-NULL on success
8111 */
8112 static struct si_shader_part *
8113 si_get_shader_part(struct si_screen *sscreen,
8114 struct si_shader_part **list,
8115 enum pipe_shader_type type,
8116 bool prolog,
8117 union si_shader_part_key *key,
8118 LLVMTargetMachineRef tm,
8119 struct pipe_debug_callback *debug,
8120 void (*build)(struct si_shader_context *,
8121 union si_shader_part_key *),
8122 const char *name)
8123 {
8124 struct si_shader_part *result;
8125
8126 mtx_lock(&sscreen->shader_parts_mutex);
8127
8128 /* Find existing. */
8129 for (result = *list; result; result = result->next) {
8130 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
8131 mtx_unlock(&sscreen->shader_parts_mutex);
8132 return result;
8133 }
8134 }
8135
8136 /* Compile a new one. */
8137 result = CALLOC_STRUCT(si_shader_part);
8138 result->key = *key;
8139
8140 struct si_shader shader = {};
8141 struct si_shader_context ctx;
8142 struct gallivm_state *gallivm = &ctx.gallivm;
8143
8144 si_init_shader_ctx(&ctx, sscreen, tm);
8145 ctx.shader = &shader;
8146 ctx.type = type;
8147
8148 switch (type) {
8149 case PIPE_SHADER_VERTEX:
8150 break;
8151 case PIPE_SHADER_TESS_CTRL:
8152 assert(!prolog);
8153 shader.key.part.tcs.epilog = key->tcs_epilog.states;
8154 break;
8155 case PIPE_SHADER_GEOMETRY:
8156 assert(prolog);
8157 break;
8158 case PIPE_SHADER_FRAGMENT:
8159 if (prolog)
8160 shader.key.part.ps.prolog = key->ps_prolog.states;
8161 else
8162 shader.key.part.ps.epilog = key->ps_epilog.states;
8163 break;
8164 default:
8165 unreachable("bad shader part");
8166 }
8167
8168 build(&ctx, key);
8169
8170 /* Compile. */
8171 si_llvm_finalize_module(&ctx,
8172 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
8173
8174 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
8175 gallivm->module, debug, ctx.type, name)) {
8176 FREE(result);
8177 result = NULL;
8178 goto out;
8179 }
8180
8181 result->next = *list;
8182 *list = result;
8183
8184 out:
8185 si_llvm_dispose(&ctx);
8186 mtx_unlock(&sscreen->shader_parts_mutex);
8187 return result;
8188 }
8189
8190 /**
8191 * Build the vertex shader prolog function.
8192 *
8193 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
8194 * All inputs are returned unmodified. The vertex load indices are
8195 * stored after them, which will be used by the API VS for fetching inputs.
8196 *
8197 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
8198 * input_v0,
8199 * input_v1,
8200 * input_v2,
8201 * input_v3,
8202 * (VertexID + BaseVertex),
8203 * (InstanceID + StartInstance),
8204 * (InstanceID / 2 + StartInstance)
8205 */
8206 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
8207 union si_shader_part_key *key)
8208 {
8209 struct gallivm_state *gallivm = &ctx->gallivm;
8210 LLVMTypeRef *params, *returns;
8211 LLVMValueRef ret, func;
8212 int last_sgpr, num_params, num_returns, i;
8213 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
8214 key->vs_prolog.num_merged_next_stage_vgprs;
8215 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
8216 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
8217 num_input_vgprs;
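	/* In merged shaders on GFX9, the first 8 SGPRs are reserved for
	 * system values, so user SGPR indices are offset by 8 (assumed from
	 * the merged-shader ABI used by create_function in this file). */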
8218 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
8219
8220 ctx->param_vertex_id = first_vs_vgpr;
8221 ctx->param_instance_id = first_vs_vgpr + 3;
8222
8223 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
8224 params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
8225 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
8226 sizeof(LLVMTypeRef));
8227 num_params = 0;
8228 num_returns = 0;
8229
8230 /* Declare input and output SGPRs. */
8232 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8233 params[num_params++] = ctx->i32;
8234 returns[num_returns++] = ctx->i32;
8235 }
8236 last_sgpr = num_params - 1;
8237
8238 /* Preloaded VGPRs (outputs must be floats) */
8239 for (i = 0; i < num_input_vgprs; i++) {
8240 params[num_params++] = ctx->i32;
8241 returns[num_returns++] = ctx->f32;
8242 }
8243
8244 /* Vertex load indices. */
8245 for (i = 0; i <= key->vs_prolog.last_input; i++)
8246 returns[num_returns++] = ctx->f32;
8247
8248 /* Create the function. */
8249 si_create_function(ctx, "vs_prolog", returns, num_returns, params,
8250 num_params, last_sgpr);
8251 func = ctx->main_fn;
8252
8253 if (key->vs_prolog.num_merged_next_stage_vgprs &&
8254 !key->vs_prolog.is_monolithic)
8255 si_init_exec_from_input(ctx, 3, 0);
8256
8257 	/* Copy inputs to outputs. This should be a no-op, as the registers match,
8258 * but it will prevent the compiler from overwriting them unintentionally.
8259 */
8260 ret = ctx->return_value;
8261 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8262 LLVMValueRef p = LLVMGetParam(func, i);
8263 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8264 }
8265 for (; i < num_params; i++) {
8266 LLVMValueRef p = LLVMGetParam(func, i);
8267 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
8268 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8269 }
8270
8271 /* Compute vertex load indices from instance divisors. */
8272 for (i = 0; i <= key->vs_prolog.last_input; i++) {
8273 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
8274 LLVMValueRef index;
8275
8276 if (divisor) {
8277 /* InstanceID / Divisor + StartInstance */
8278 index = get_instance_index_for_fetch(ctx,
8279 user_sgpr_base +
8280 SI_SGPR_START_INSTANCE,
8281 divisor);
8282 } else {
8283 /* VertexID + BaseVertex */
8284 index = LLVMBuildAdd(gallivm->builder,
8285 LLVMGetParam(func, ctx->param_vertex_id),
8286 LLVMGetParam(func, user_sgpr_base +
8287 SI_SGPR_BASE_VERTEX), "");
8288 }
8289
8290 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
8291 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
8292 num_params++, "");
8293 }
8294
8295 si_llvm_build_ret(ctx, ret);
8296 }
8297
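/* A scalar sketch (not driver code) of the per-input index math the prolog
 * emits above; divisor == 0 marks a per-vertex input. This matches the
 * instance_divisors example in the comment before si_build_vs_prolog_function.
 */
#if 0
static unsigned vs_fetch_index(unsigned vertex_id, unsigned base_vertex,
			       unsigned instance_id, unsigned start_instance,
			       unsigned divisor)
{
	if (divisor)
		return instance_id / divisor + start_instance;

	return vertex_id + base_vertex;
}
#endif
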
8298 /**
8299 * Build the vertex shader epilog function. This is also used by the tessellation
8300 * evaluation shader compiled as VS.
8301 *
8302 * The input is PrimitiveID.
8303 *
8304 * If PrimitiveID is required by the pixel shader, export it.
8305 * Otherwise, do nothing.
8306 */
8307 static void si_build_vs_epilog_function(struct si_shader_context *ctx,
8308 union si_shader_part_key *key)
8309 {
8310 struct gallivm_state *gallivm = &ctx->gallivm;
8311 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8312 LLVMTypeRef params[5];
8313 int num_params, i;
8314
8315 /* Declare input VGPRs. */
8316 num_params = key->vs_epilog.states.export_prim_id ?
8317 (VS_EPILOG_PRIMID_LOC + 1) : 0;
8318 assert(num_params <= ARRAY_SIZE(params));
8319
8320 for (i = 0; i < num_params; i++)
8321 params[i] = ctx->f32;
8322
8323 /* Create the function. */
8324 si_create_function(ctx, "vs_epilog", NULL, 0, params, num_params, -1);
8325
8326 /* Emit exports. */
8327 if (key->vs_epilog.states.export_prim_id) {
8328 struct lp_build_context *base = &bld_base->base;
8329 struct ac_export_args args;
8330
8331 args.enabled_channels = 0x1; /* enabled channels */
8332 args.valid_mask = 0; /* whether the EXEC mask is valid */
8333 args.done = 0; /* DONE bit */
8334 args.target = V_008DFC_SQ_EXP_PARAM +
8335 key->vs_epilog.prim_id_param_offset;
8336 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
8337 args.out[0] = LLVMGetParam(ctx->main_fn,
8338 VS_EPILOG_PRIMID_LOC); /* X */
8339 args.out[1] = base->undef; /* Y */
8340 args.out[2] = base->undef; /* Z */
8341 args.out[3] = base->undef; /* W */
8342
8343 ac_build_export(&ctx->ac, &args);
8344 }
8345
8346 LLVMBuildRetVoid(gallivm->builder);
8347 }
8348
8349 static bool si_get_vs_prolog(struct si_screen *sscreen,
8350 LLVMTargetMachineRef tm,
8351 struct si_shader *shader,
8352 struct pipe_debug_callback *debug,
8353 struct si_shader *main_part,
8354 const struct si_vs_prolog_bits *key)
8355 {
8356 struct si_shader_selector *vs = main_part->selector;
8357
8358 /* The prolog is a no-op if there are no inputs. */
8359 if (!vs->vs_needs_prolog)
8360 return true;
8361
8362 /* Get the prolog. */
8363 union si_shader_part_key prolog_key;
8364 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
8365 key, shader, &prolog_key);
8366
8367 shader->prolog =
8368 si_get_shader_part(sscreen, &sscreen->vs_prologs,
8369 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
8370 debug, si_build_vs_prolog_function,
8371 "Vertex Shader Prolog");
8372 return shader->prolog != NULL;
8373 }
8374
8375 /**
8376  * Create & compile a vertex shader epilog. This is a helper used by VS and TES.
8377 */
8378 static bool si_get_vs_epilog(struct si_screen *sscreen,
8379 LLVMTargetMachineRef tm,
8380 struct si_shader *shader,
8381 struct pipe_debug_callback *debug,
8382 struct si_vs_epilog_bits *states)
8383 {
8384 union si_shader_part_key epilog_key;
8385
8386 si_get_vs_epilog_key(shader, states, &epilog_key);
8387
8388 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
8389 					    PIPE_SHADER_VERTEX, false,
8390 &epilog_key, tm, debug,
8391 si_build_vs_epilog_function,
8392 "Vertex Shader Epilog");
8393 return shader->epilog != NULL;
8394 }
8395
8396 /**
8397 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
8398 */
8399 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
8400 LLVMTargetMachineRef tm,
8401 struct si_shader *shader,
8402 struct pipe_debug_callback *debug)
8403 {
8404 if (!si_get_vs_prolog(sscreen, tm, shader, debug, shader,
8405 &shader->key.part.vs.prolog))
8406 return false;
8407
8408 	/* Get the epilog. A VS compiled as ES or LS doesn't need one. */
8409 if (!shader->key.as_es && !shader->key.as_ls &&
8410 !si_get_vs_epilog(sscreen, tm, shader, debug,
8411 &shader->key.part.vs.epilog))
8412 return false;
8413
8414 return true;
8415 }
8416
8417 /**
8418 * Select and compile (or reuse) TES parts (epilog).
8419 */
8420 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
8421 LLVMTargetMachineRef tm,
8422 struct si_shader *shader,
8423 struct pipe_debug_callback *debug)
8424 {
8425 if (shader->key.as_es)
8426 return true;
8427
8428 /* TES compiled as VS. */
8429 return si_get_vs_epilog(sscreen, tm, shader, debug,
8430 &shader->key.part.tes.epilog);
8431 }
8432
8433 /**
8434  * Compile the TCS epilog function. This writes tessellation factors to memory
8435  * based on the output primitive type of the tessellator (determined by TES).
8436 */
8437 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
8438 union si_shader_part_key *key)
8439 {
8440 struct gallivm_state *gallivm = &ctx->gallivm;
8441 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8442 LLVMTypeRef params[32];
8443 LLVMValueRef func;
8444 int last_sgpr, num_params = 0;
8445
8446 /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
8447 params[ctx->param_rw_buffers = num_params++] =
8448 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
8449
8450 if (ctx->screen->b.chip_class >= GFX9) {
8451 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8452 params[num_params++] = ctx->i32; /* wave info */
8453 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
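		/* The following unnamed slots only keep the named parameters
		 * at the same positions they have in the merged GFX9 TCS
		 * main part; the epilog itself never reads them.
		 */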
8454 params[num_params++] = ctx->i32;
8455 params[num_params++] = ctx->i32;
8456 params[num_params++] = ctx->i32;
8457 params[num_params++] = ctx->i64;
8458 params[num_params++] = ctx->i64;
8459 params[num_params++] = ctx->i64;
8460 params[num_params++] = ctx->i64;
8461 params[num_params++] = ctx->i64;
8462 params[num_params++] = ctx->i64;
8463 params[num_params++] = ctx->i32;
8464 params[num_params++] = ctx->i32;
8465 params[num_params++] = ctx->i32;
8466 params[num_params++] = ctx->i32;
8467 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8468 } else {
8469 params[num_params++] = ctx->i64;
8470 params[num_params++] = ctx->i64;
8471 params[num_params++] = ctx->i64;
8472 params[num_params++] = ctx->i64;
8473 params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
8474 params[num_params++] = ctx->i32;
8475 params[num_params++] = ctx->i32;
8476 params[num_params++] = ctx->i32;
8477 params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
8478 params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
8479 }
8480 last_sgpr = num_params - 1;
8481
8482 params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
8483 params[num_params++] = ctx->i32; /* invocation ID within the patch */
8484 params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */
8485
8486 /* Create the function. */
8487 si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
8488 declare_lds_as_pointer(ctx);
8489 func = ctx->main_fn;
8490
8491 si_write_tess_factors(bld_base,
8492 LLVMGetParam(func, last_sgpr + 1),
8493 LLVMGetParam(func, last_sgpr + 2),
8494 LLVMGetParam(func, last_sgpr + 3));
8495
8496 LLVMBuildRetVoid(gallivm->builder);
8497 }
8498
8499 /**
8500 * Select and compile (or reuse) TCS parts (epilog).
8501 */
8502 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8503 LLVMTargetMachineRef tm,
8504 struct si_shader *shader,
8505 struct pipe_debug_callback *debug)
8506 {
8507 if (sscreen->b.chip_class >= GFX9) {
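		/* On GFX9, LS and HS run as one merged shader, so the LS
		 * (vertex) main part and its prolog are attached in front
		 * of the TCS here.
		 */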
8508 struct si_shader *ls_main_part =
8509 shader->key.part.tcs.ls->main_shader_part_ls;
8510
8511 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8512 &shader->key.part.tcs.ls_prolog))
8513 return false;
8514
8515 shader->previous_stage = ls_main_part;
8516 }
8517
8518 /* Get the epilog. */
8519 union si_shader_part_key epilog_key;
8520 memset(&epilog_key, 0, sizeof(epilog_key));
8521 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8522
8523 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8524 PIPE_SHADER_TESS_CTRL, false,
8525 &epilog_key, tm, debug,
8526 si_build_tcs_epilog_function,
8527 "Tessellation Control Shader Epilog");
8528 return shader->epilog != NULL;
8529 }
8530
8531 /**
8532 * Select and compile (or reuse) GS parts (prolog).
8533 */
8534 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8535 LLVMTargetMachineRef tm,
8536 struct si_shader *shader,
8537 struct pipe_debug_callback *debug)
8538 {
8539 union si_shader_part_key prolog_key;
8540
8541 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8542 return true;
8543
8544 memset(&prolog_key, 0, sizeof(prolog_key));
8545 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8546
8547 shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8548 PIPE_SHADER_GEOMETRY, true,
8549 &prolog_key, tm, debug,
8550 si_build_gs_prolog_function,
8551 "Geometry Shader Prolog");
8552 return shader->prolog != NULL;
8553 }
8554
8555 /**
8556 * Build the pixel shader prolog function. This handles:
8557 * - two-side color selection and interpolation
8558 * - overriding interpolation parameters for the API PS
8559 * - polygon stippling
8560 *
8561 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
8562  * overridden by other states (e.g. per-sample interpolation).
8563 * Interpolated colors are stored after the preloaded VGPRs.
8564 */
8565 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
8566 union si_shader_part_key *key)
8567 {
8568 struct gallivm_state *gallivm = &ctx->gallivm;
8569 LLVMTypeRef *params;
8570 LLVMValueRef ret, func;
8571 int last_sgpr, num_params, num_returns, i, num_color_channels;
8572
8573 assert(si_need_ps_prolog(key));
8574
8575 /* Number of inputs + 8 color elements. */
8576 params = alloca((key->ps_prolog.num_input_sgprs +
8577 key->ps_prolog.num_input_vgprs + 8) *
8578 sizeof(LLVMTypeRef));
8579
8580 /* Declare inputs. */
8581 num_params = 0;
8582 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
8583 params[num_params++] = ctx->i32;
8584 last_sgpr = num_params - 1;
8585
8586 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
8587 params[num_params++] = ctx->f32;
8588
8589 /* Declare outputs (same as inputs + add colors if needed) */
8590 num_returns = num_params;
8591 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
8592 for (i = 0; i < num_color_channels; i++)
8593 params[num_returns++] = ctx->f32;
8594
8595 	/* Create the function. Note that "params" doubles as the return-type
	 * array: the returns are the inputs plus the color channels appended
	 * above. */
8596 si_create_function(ctx, "ps_prolog", params, num_returns, params,
8597 num_params, last_sgpr);
8598 func = ctx->main_fn;
8599
8600 	/* Copy inputs to outputs. This should be a no-op, as the registers match,
8601 * but it will prevent the compiler from overwriting them unintentionally.
8602 */
8603 ret = ctx->return_value;
8604 for (i = 0; i < num_params; i++) {
8605 LLVMValueRef p = LLVMGetParam(func, i);
8606 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8607 }
8608
8609 /* Polygon stippling. */
8610 if (key->ps_prolog.states.poly_stipple) {
8611 /* POS_FIXED_PT is always last. */
8612 unsigned pos = key->ps_prolog.num_input_sgprs +
8613 key->ps_prolog.num_input_vgprs - 1;
8614 LLVMValueRef ptr[2], list;
8615
8616 /* Get the pointer to rw buffers. */
8617 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
8618 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
8619 list = lp_build_gather_values(gallivm, ptr, 2);
8620 list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
8621 list = LLVMBuildIntToPtr(gallivm->builder, list,
8622 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");
8623
8624 si_llvm_emit_polygon_stipple(ctx, list, pos);
8625 }
8626
8627 if (key->ps_prolog.states.bc_optimize_for_persp ||
8628 key->ps_prolog.states.bc_optimize_for_linear) {
8629 unsigned i, base = key->ps_prolog.num_input_sgprs;
8630 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
8631
8632 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
8633 * The hw doesn't compute CENTROID if the whole wave only
8634 * contains fully-covered quads.
8635 *
8636 * PRIM_MASK is after user SGPRs.
8637 */
8638 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8639 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
8640 LLVMConstInt(ctx->i32, 31, 0), "");
8641 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
8642 ctx->i1, "");
8643
8644 if (key->ps_prolog.states.bc_optimize_for_persp) {
8645 /* Read PERSP_CENTER. */
8646 for (i = 0; i < 2; i++)
8647 center[i] = LLVMGetParam(func, base + 2 + i);
8648 /* Read PERSP_CENTROID. */
8649 for (i = 0; i < 2; i++)
8650 centroid[i] = LLVMGetParam(func, base + 4 + i);
8651 /* Select PERSP_CENTROID. */
8652 for (i = 0; i < 2; i++) {
8653 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8654 center[i], centroid[i], "");
8655 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8656 tmp, base + 4 + i, "");
8657 }
8658 }
8659 if (key->ps_prolog.states.bc_optimize_for_linear) {
8660 /* Read LINEAR_CENTER. */
8661 for (i = 0; i < 2; i++)
8662 center[i] = LLVMGetParam(func, base + 8 + i);
8663 /* Read LINEAR_CENTROID. */
8664 for (i = 0; i < 2; i++)
8665 centroid[i] = LLVMGetParam(func, base + 10 + i);
8666 /* Select LINEAR_CENTROID. */
8667 for (i = 0; i < 2; i++) {
8668 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
8669 center[i], centroid[i], "");
8670 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8671 tmp, base + 10 + i, "");
8672 }
8673 }
8674 }
8675
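	/* As used by this prolog, the interpolation VGPRs follow the input
	 * SGPRs in a fixed order of (i, j) pairs: PERSP_SAMPLE at base+0,
	 * PERSP_CENTER at base+2, PERSP_CENTROID at base+4, LINEAR_SAMPLE
	 * at base+6, LINEAR_CENTER at base+8, LINEAR_CENTROID at base+10.
	 */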
8676 /* Force per-sample interpolation. */
8677 if (key->ps_prolog.states.force_persp_sample_interp) {
8678 unsigned i, base = key->ps_prolog.num_input_sgprs;
8679 LLVMValueRef persp_sample[2];
8680
8681 /* Read PERSP_SAMPLE. */
8682 for (i = 0; i < 2; i++)
8683 persp_sample[i] = LLVMGetParam(func, base + i);
8684 /* Overwrite PERSP_CENTER. */
8685 for (i = 0; i < 2; i++)
8686 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8687 persp_sample[i], base + 2 + i, "");
8688 /* Overwrite PERSP_CENTROID. */
8689 for (i = 0; i < 2; i++)
8690 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8691 persp_sample[i], base + 4 + i, "");
8692 }
8693 if (key->ps_prolog.states.force_linear_sample_interp) {
8694 unsigned i, base = key->ps_prolog.num_input_sgprs;
8695 LLVMValueRef linear_sample[2];
8696
8697 /* Read LINEAR_SAMPLE. */
8698 for (i = 0; i < 2; i++)
8699 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
8700 /* Overwrite LINEAR_CENTER. */
8701 for (i = 0; i < 2; i++)
8702 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8703 linear_sample[i], base + 8 + i, "");
8704 /* Overwrite LINEAR_CENTROID. */
8705 for (i = 0; i < 2; i++)
8706 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8707 linear_sample[i], base + 10 + i, "");
8708 }
8709
8710 /* Force center interpolation. */
8711 if (key->ps_prolog.states.force_persp_center_interp) {
8712 unsigned i, base = key->ps_prolog.num_input_sgprs;
8713 LLVMValueRef persp_center[2];
8714
8715 /* Read PERSP_CENTER. */
8716 for (i = 0; i < 2; i++)
8717 persp_center[i] = LLVMGetParam(func, base + 2 + i);
8718 /* Overwrite PERSP_SAMPLE. */
8719 for (i = 0; i < 2; i++)
8720 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8721 persp_center[i], base + i, "");
8722 /* Overwrite PERSP_CENTROID. */
8723 for (i = 0; i < 2; i++)
8724 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8725 persp_center[i], base + 4 + i, "");
8726 }
8727 if (key->ps_prolog.states.force_linear_center_interp) {
8728 unsigned i, base = key->ps_prolog.num_input_sgprs;
8729 LLVMValueRef linear_center[2];
8730
8731 /* Read LINEAR_CENTER. */
8732 for (i = 0; i < 2; i++)
8733 linear_center[i] = LLVMGetParam(func, base + 8 + i);
8734 /* Overwrite LINEAR_SAMPLE. */
8735 for (i = 0; i < 2; i++)
8736 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8737 linear_center[i], base + 6 + i, "");
8738 /* Overwrite LINEAR_CENTROID. */
8739 for (i = 0; i < 2; i++)
8740 ret = LLVMBuildInsertValue(gallivm->builder, ret,
8741 linear_center[i], base + 10 + i, "");
8742 }
8743
8744 /* Interpolate colors. */
8745 for (i = 0; i < 2; i++) {
8746 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
8747 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
8748 key->ps_prolog.face_vgpr_index;
8749 LLVMValueRef interp[2], color[4];
8750 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
8751
8752 if (!writemask)
8753 continue;
8754
8755 /* If the interpolation qualifier is not CONSTANT (-1). */
8756 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
8757 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
8758 key->ps_prolog.color_interp_vgpr_index[i];
8759
8760 /* Get the (i,j) updated by bc_optimize handling. */
8761 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
8762 interp_vgpr, "");
8763 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
8764 interp_vgpr + 1, "");
8765 interp_ij = lp_build_gather_values(gallivm, interp, 2);
8766 }
8767
8768 /* Use the absolute location of the input. */
8769 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
8770
8771 if (key->ps_prolog.states.color_two_side) {
8772 face = LLVMGetParam(func, face_vgpr);
8773 face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
8774 }
8775
8776 interp_fs_input(ctx,
8777 key->ps_prolog.color_attr_index[i],
8778 TGSI_SEMANTIC_COLOR, i,
8779 key->ps_prolog.num_interp_inputs,
8780 key->ps_prolog.colors_read, interp_ij,
8781 prim_mask, face, color);
8782
8783 while (writemask) {
8784 unsigned chan = u_bit_scan(&writemask);
8785 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
8786 num_params++, "");
8787 }
8788 }
8789
8790 /* Tell LLVM to insert WQM instruction sequence when needed. */
8791 if (key->ps_prolog.wqm) {
8792 LLVMAddTargetDependentFunctionAttr(func,
8793 "amdgpu-ps-wqm-outputs", "");
8794 }
8795
8796 si_llvm_build_ret(ctx, ret);
8797 }
8798
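/* A scalar model (not driver code) of the bc_optimize fixup emitted above:
 * when bit 31 of PRIM_MASK is set, the wave contains only fully-covered
 * quads, the hardware skipped the centroid computation, and the center
 * (i, j) must be substituted for the centroid (i, j).
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static void bc_optimize_fixup(uint32_t prim_mask,
			      const float center[2], float centroid[2])
{
	bool fully_covered = (prim_mask >> 31) & 1;

	if (fully_covered) {
		centroid[0] = center[0];
		centroid[1] = center[1];
	}
}
#endif
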
8799 /**
8800 * Build the pixel shader epilog function. This handles everything that must be
8801 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
8802 */
8803 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
8804 union si_shader_part_key *key)
8805 {
8806 struct gallivm_state *gallivm = &ctx->gallivm;
8807 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
8808 LLVMTypeRef params[16+8*4+3];
8809 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
8810 int last_sgpr, num_params = 0, i;
8811 struct si_ps_exports exp = {};
8812
8813 /* Declare input SGPRs. */
8814 params[ctx->param_rw_buffers = num_params++] = ctx->i64;
8815 params[ctx->param_const_buffers = num_params++] = ctx->i64;
8816 params[ctx->param_samplers = num_params++] = ctx->i64;
8817 params[ctx->param_images = num_params++] = ctx->i64;
8818 params[ctx->param_shader_buffers = num_params++] = ctx->i64;
8819 assert(num_params == SI_PARAM_ALPHA_REF);
8820 params[SI_PARAM_ALPHA_REF] = ctx->f32;
8821 last_sgpr = SI_PARAM_ALPHA_REF;
8822
8823 /* Declare input VGPRs. */
8824 num_params = (last_sgpr + 1) +
8825 util_bitcount(key->ps_epilog.colors_written) * 4 +
8826 key->ps_epilog.writes_z +
8827 key->ps_epilog.writes_stencil +
8828 key->ps_epilog.writes_samplemask;
8829
8830 num_params = MAX2(num_params,
8831 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
8832
8833 assert(num_params <= ARRAY_SIZE(params));
8834
8835 for (i = last_sgpr + 1; i < num_params; i++)
8836 params[i] = ctx->f32;
8837
8838 /* Create the function. */
8839 si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
8840 /* Disable elimination of unused inputs. */
8841 si_llvm_add_attribute(ctx->main_fn,
8842 "InitialPSInputAddr", 0xffffff);
8843
8844 /* Process colors. */
8845 unsigned vgpr = last_sgpr + 1;
8846 unsigned colors_written = key->ps_epilog.colors_written;
8847 int last_color_export = -1;
8848
8849 /* Find the last color export. */
8850 if (!key->ps_epilog.writes_z &&
8851 !key->ps_epilog.writes_stencil &&
8852 !key->ps_epilog.writes_samplemask) {
8853 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
8854
8855 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
8856 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
8857 /* Just set this if any of the colorbuffers are enabled. */
8858 if (spi_format &
8859 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
8860 last_color_export = 0;
8861 } else {
8862 for (i = 0; i < 8; i++)
8863 if (colors_written & (1 << i) &&
8864 (spi_format >> (i * 4)) & 0xf)
8865 last_color_export = i;
8866 }
8867 }
8868
8869 while (colors_written) {
8870 LLVMValueRef color[4];
8871 int mrt = u_bit_scan(&colors_written);
8872
8873 for (i = 0; i < 4; i++)
8874 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
8875
8876 si_export_mrt_color(bld_base, color, mrt,
8877 num_params - 1,
8878 mrt == last_color_export, &exp);
8879 }
8880
8881 /* Process depth, stencil, samplemask. */
8882 if (key->ps_epilog.writes_z)
8883 depth = LLVMGetParam(ctx->main_fn, vgpr++);
8884 if (key->ps_epilog.writes_stencil)
8885 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
8886 if (key->ps_epilog.writes_samplemask)
8887 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
8888
8889 if (depth || stencil || samplemask)
8890 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
8891 else if (last_color_export == -1)
8892 si_export_null(bld_base);
8893
8894 if (exp.num)
8895 si_emit_ps_exports(ctx, &exp);
8896
8897 /* Compile. */
8898 LLVMBuildRetVoid(gallivm->builder);
8899 }
8900
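/* A scalar sketch (not driver code) of the "last color export" search above:
 * each color buffer owns a 4-bit field in SPI_SHADER_COL_FORMAT, and a zero
 * field means the hardware discards that export.
 */
#if 0
static int find_last_color_export(unsigned colors_written,
				  unsigned spi_shader_col_format)
{
	int last = -1;
	int i;

	for (i = 0; i < 8; i++) {
		if ((colors_written & (1u << i)) &&
		    ((spi_shader_col_format >> (i * 4)) & 0xf))
			last = i;
	}
	return last;
}
#endif
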
8901 /**
8902 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
8903 */
8904 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
8905 LLVMTargetMachineRef tm,
8906 struct si_shader *shader,
8907 struct pipe_debug_callback *debug)
8908 {
8909 union si_shader_part_key prolog_key;
8910 union si_shader_part_key epilog_key;
8911
8912 /* Get the prolog. */
8913 	si_get_ps_prolog_key(shader, &prolog_key, true); /* separate_prolog */
8914
8915 /* The prolog is a no-op if these aren't set. */
8916 if (si_need_ps_prolog(&prolog_key)) {
8917 shader->prolog =
8918 si_get_shader_part(sscreen, &sscreen->ps_prologs,
8919 PIPE_SHADER_FRAGMENT, true,
8920 &prolog_key, tm, debug,
8921 si_build_ps_prolog_function,
8922 "Fragment Shader Prolog");
8923 if (!shader->prolog)
8924 return false;
8925 }
8926
8927 /* Get the epilog. */
8928 si_get_ps_epilog_key(shader, &epilog_key);
8929
8930 shader->epilog =
8931 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
8932 PIPE_SHADER_FRAGMENT, false,
8933 &epilog_key, tm, debug,
8934 si_build_ps_epilog_function,
8935 "Fragment Shader Epilog");
8936 if (!shader->epilog)
8937 return false;
8938
8939 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
8940 if (shader->key.part.ps.prolog.poly_stipple) {
8941 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
8942 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
8943 }
8944
8945 /* Set up the enable bits for per-sample shading if needed. */
8946 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
8947 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8948 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8949 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
8950 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8951 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
8952 }
8953 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
8954 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8955 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8956 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
8957 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8958 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
8959 }
8960 if (shader->key.part.ps.prolog.force_persp_center_interp &&
8961 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8962 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8963 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
8964 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8965 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8966 }
8967 if (shader->key.part.ps.prolog.force_linear_center_interp &&
8968 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8969 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8970 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
8971 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8972 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8973 }
8974
8975 	/* POS_W_FLOAT requires that one of the perspective weights is enabled.
	 * (0xf covers the four PERSP_* enable bits.) */
8976 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
8977 !(shader->config.spi_ps_input_ena & 0xf)) {
8978 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8979 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
8980 }
8981
8982 /* At least one pair of interpolation weights must be enabled. */
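	/* 0x7f covers the seven PERSP_* and LINEAR_* enable bits
	 * (SAMPLE, CENTER, CENTROID for each, plus PERSP_PULL_MODEL).
	 */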
8983 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
8984 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8985 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
8986 }
8987
8988 /* The sample mask input is always enabled, because the API shader always
8989 * passes it through to the epilog. Disable it here if it's unused.
8990 */
8991 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
8992 !shader->selector->info.reads_samplemask)
8993 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
8994
8995 return true;
8996 }
8997
8998 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
8999 unsigned *lds_size)
9000 {
9001 /* SPI barrier management bug:
9002 * Make sure we have at least 4k of LDS in use to avoid the bug.
9003 * It applies to workgroup sizes of more than one wavefront.
9004 */
9005 if (sscreen->b.family == CHIP_BONAIRE ||
9006 sscreen->b.family == CHIP_KABINI ||
9007 sscreen->b.family == CHIP_MULLINS)
9008 		*lds_size = MAX2(*lds_size, 8); /* 8 units of 512 bytes = 4K */
9009 }
9010
9011 static void si_fix_resource_usage(struct si_screen *sscreen,
9012 struct si_shader *shader)
9013 {
9014 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
9015
9016 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
9017
9018 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
9019 si_get_max_workgroup_size(shader) > 64) {
9020 si_multiwave_lds_size_workaround(sscreen,
9021 &shader->config.lds_size);
9022 }
9023 }
9024
9025 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
9026 struct si_shader *shader,
9027 struct pipe_debug_callback *debug)
9028 {
9029 struct si_shader_selector *sel = shader->selector;
9030 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
9031 int r;
9032
9033 /* LS, ES, VS are compiled on demand if the main part hasn't been
9034 * compiled for that stage.
9035 *
9036 * Vertex shaders are compiled on demand when a vertex fetch
9037 * workaround must be applied.
9038 */
9039 if (shader->is_monolithic) {
9040 /* Monolithic shader (compiled as a whole, has many variants,
9041 * may take a long time to compile).
9042 */
9043 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
9044 if (r)
9045 return r;
9046 } else {
9047 /* The shader consists of 2-3 parts:
9048 *
9049 * - the middle part is the user shader, it has 1 variant only
9050 * and it was compiled during the creation of the shader
9051 * selector
9052 * - the prolog part is inserted at the beginning
9053 * - the epilog part is inserted at the end
9054 *
9055 * The prolog and epilog have many (but simple) variants.
9056 */
9057
9058 /* Copy the compiled TGSI shader data over. */
9059 shader->is_binary_shared = true;
9060 shader->binary = mainp->binary;
9061 shader->config = mainp->config;
9062 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
9063 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
9064 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
9065 memcpy(shader->info.vs_output_param_offset,
9066 mainp->info.vs_output_param_offset,
9067 sizeof(mainp->info.vs_output_param_offset));
9068 shader->info.uses_instanceid = mainp->info.uses_instanceid;
9069 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
9070 shader->info.nr_param_exports = mainp->info.nr_param_exports;
9071
9072 /* Select prologs and/or epilogs. */
9073 switch (sel->type) {
9074 case PIPE_SHADER_VERTEX:
9075 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
9076 return -1;
9077 break;
9078 case PIPE_SHADER_TESS_CTRL:
9079 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
9080 return -1;
9081 break;
9082 case PIPE_SHADER_TESS_EVAL:
9083 if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
9084 return -1;
9085 break;
9086 case PIPE_SHADER_GEOMETRY:
9087 if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
9088 return -1;
9089 break;
9090 case PIPE_SHADER_FRAGMENT:
9091 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
9092 return -1;
9093
9094 /* Make sure we have at least as many VGPRs as there
9095 * are allocated inputs.
9096 */
9097 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9098 shader->info.num_input_vgprs);
9099 break;
9100 }
9101
9102 /* Update SGPR and VGPR counts. */
9103 if (shader->prolog) {
9104 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9105 shader->prolog->config.num_sgprs);
9106 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9107 shader->prolog->config.num_vgprs);
9108 }
9109 if (shader->previous_stage) {
9110 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9111 shader->previous_stage->config.num_sgprs);
9112 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9113 shader->previous_stage->config.num_vgprs);
9114 shader->config.spilled_sgprs =
9115 MAX2(shader->config.spilled_sgprs,
9116 shader->previous_stage->config.spilled_sgprs);
9117 shader->config.spilled_vgprs =
9118 MAX2(shader->config.spilled_vgprs,
9119 shader->previous_stage->config.spilled_vgprs);
9120 shader->config.private_mem_vgprs =
9121 MAX2(shader->config.private_mem_vgprs,
9122 shader->previous_stage->config.private_mem_vgprs);
9123 shader->config.scratch_bytes_per_wave =
9124 MAX2(shader->config.scratch_bytes_per_wave,
9125 shader->previous_stage->config.scratch_bytes_per_wave);
9126 shader->info.uses_instanceid |=
9127 shader->previous_stage->info.uses_instanceid;
9128 }
9129 if (shader->prolog2) {
9130 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9131 shader->prolog2->config.num_sgprs);
9132 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9133 shader->prolog2->config.num_vgprs);
9134 }
9135 if (shader->epilog) {
9136 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
9137 shader->epilog->config.num_sgprs);
9138 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
9139 shader->epilog->config.num_vgprs);
9140 }
9141 }
9142
9143 si_fix_resource_usage(sscreen, shader);
9144 si_shader_dump(sscreen, shader, debug, sel->info.processor,
9145 stderr, true);
9146
9147 /* Upload. */
9148 r = si_shader_binary_upload(sscreen, shader);
9149 if (r) {
9150 		fprintf(stderr, "radeonsi: failed to upload shader\n");
9151 return r;
9152 }
9153
9154 return 0;
9155 }
9156
9157 void si_shader_destroy(struct si_shader *shader)
9158 {
9159 if (shader->scratch_bo)
9160 r600_resource_reference(&shader->scratch_bo, NULL);
9161
9162 r600_resource_reference(&shader->bo, NULL);
9163
9164 if (!shader->is_binary_shared)
9165 radeon_shader_binary_clean(&shader->binary);
9166
9167 free(shader->shader_log);
9168 }