radeonsi: remove VS epilog code, compile VS with PrimID export on demand
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "util/u_memory.h"
37 #include "util/u_string.h"
38 #include "tgsi/tgsi_build.h"
39 #include "tgsi/tgsi_util.h"
40 #include "tgsi/tgsi_dump.h"
41
42 #include "ac_binary.h"
43 #include "ac_llvm_util.h"
44 #include "ac_exp_param.h"
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49
50 static const char *scratch_rsrc_dword0_symbol =
51 "SCRATCH_RSRC_DWORD0";
52
53 static const char *scratch_rsrc_dword1_symbol =
54 "SCRATCH_RSRC_DWORD1";
55
/* One shader output gathered before export: the four channel values plus
 * the TGSI semantic identifying the output.
 * NOTE(review): vertex_stream appears to hold a GS vertex stream per
 * component — confirm against the GS copy-shader code. */
struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};
63
64 static void si_init_shader_ctx(struct si_shader_context *ctx,
65 struct si_screen *sscreen,
66 LLVMTargetMachineRef tm);
67
68 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
69 struct lp_build_tgsi_context *bld_base,
70 struct lp_build_emit_data *emit_data);
71
72 static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
73 FILE *f);
74
75 static unsigned llvm_get_type_size(LLVMTypeRef type);
76
77 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
78 union si_shader_part_key *key);
79 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
80 union si_shader_part_key *key);
81 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
82 union si_shader_part_key *key);
83 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
84 union si_shader_part_key *key);
85
86 /* Ideally pass the sample mask input to the PS epilog as v13, which
87 * is its usual location, so that the shader doesn't have to add v_mov.
88 */
89 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
90
/* LLVM pointer address-space numbers used when building GPU pointers.
 * NOTE(review): presumably the AMDGPU target's constant (2) and LDS/local (3)
 * address spaces — confirm against the target definition. */
enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};
95
96 static bool is_merged_shader(struct si_shader *shader)
97 {
98 if (shader->selector->screen->b.chip_class <= VI)
99 return false;
100
101 return shader->key.as_ls ||
102 shader->key.as_es ||
103 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
104 shader->selector->type == PIPE_SHADER_GEOMETRY;
105 }
106
107 /**
108 * Returns a unique index for a semantic name and index. The index must be
109 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
110 * calculated.
111 */
112 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
113 {
114 switch (semantic_name) {
115 case TGSI_SEMANTIC_POSITION:
116 return 0;
117 case TGSI_SEMANTIC_PSIZE:
118 return 1;
119 case TGSI_SEMANTIC_CLIPDIST:
120 assert(index <= 1);
121 return 2 + index;
122 case TGSI_SEMANTIC_GENERIC:
123 if (index <= 63-4)
124 return 4 + index;
125
126 assert(!"invalid generic index");
127 return 0;
128
129 /* patch indices are completely separate and thus start from 0 */
130 case TGSI_SEMANTIC_TESSOUTER:
131 return 0;
132 case TGSI_SEMANTIC_TESSINNER:
133 return 1;
134 case TGSI_SEMANTIC_PATCH:
135 return 2 + index;
136
137 default:
138 assert(!"invalid semantic name");
139 return 0;
140 }
141 }
142
143 unsigned si_shader_io_get_unique_index2(unsigned name, unsigned index)
144 {
145 switch (name) {
146 case TGSI_SEMANTIC_FOG:
147 return 0;
148 case TGSI_SEMANTIC_LAYER:
149 return 1;
150 case TGSI_SEMANTIC_VIEWPORT_INDEX:
151 return 2;
152 case TGSI_SEMANTIC_PRIMID:
153 return 3;
154 case TGSI_SEMANTIC_COLOR: /* these alias */
155 case TGSI_SEMANTIC_BCOLOR:
156 return 4 + index;
157 case TGSI_SEMANTIC_TEXCOORD:
158 return 6 + index;
159 default:
160 assert(!"invalid semantic name");
161 return 0;
162 }
163 }
164
165 /**
166 * Get the value of a shader input parameter and extract a bitfield.
167 */
168 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
169 unsigned param, unsigned rshift,
170 unsigned bitwidth)
171 {
172 struct gallivm_state *gallivm = &ctx->gallivm;
173 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
174 param);
175
176 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
177 value = bitcast(&ctx->bld_base,
178 TGSI_TYPE_UNSIGNED, value);
179
180 if (rshift)
181 value = LLVMBuildLShr(gallivm->builder, value,
182 LLVMConstInt(ctx->i32, rshift, 0), "");
183
184 if (rshift + bitwidth < 32) {
185 unsigned mask = (1 << bitwidth) - 1;
186 value = LLVMBuildAnd(gallivm->builder, value,
187 LLVMConstInt(ctx->i32, mask, 0), "");
188 }
189
190 return value;
191 }
192
193 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
194 {
195 switch (ctx->type) {
196 case PIPE_SHADER_TESS_CTRL:
197 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
198
199 case PIPE_SHADER_TESS_EVAL:
200 return LLVMGetParam(ctx->main_fn,
201 ctx->param_tes_rel_patch_id);
202
203 default:
204 assert(0);
205 return NULL;
206 }
207 }
208
209 /* Tessellation shaders pass outputs to the next shader using LDS.
210 *
211 * LS outputs = TCS inputs
212 * TCS outputs = TES inputs
213 *
214 * The LDS layout is:
215 * - TCS inputs for patch 0
216 * - TCS inputs for patch 1
217 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
218 * - ...
219 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
220 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
221 * - TCS outputs for patch 1
222 * - Per-patch TCS outputs for patch 1
223 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
224 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
225 * - ...
226 *
227 * All three shaders VS(LS), TCS, TES share the same LDS space.
228 */
229
/* Stride in dwords of one patch's TCS inputs in LDS,
 * packed into bits [8..20] of the VS state bits. */
static LLVMValueRef
get_tcs_in_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
}
235
/* Stride in dwords of one patch's TCS outputs in LDS,
 * packed into bits [0..12] of the TCS output LDS layout. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
}
241
/* LDS dword offset of patch 0's TCS outputs.
 * The 16-bit field is scaled by 4 (presumably stored as offset/4 to fit
 * 16 bits — TODO confirm against the state setup code). */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     0, 16),
				4);
}
251
/* LDS dword offset of patch 0's per-patch TCS outputs.
 * Upper 16 bits of the same packed register as get_tcs_out_patch0_offset,
 * scaled by 4 the same way. */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
				unpack_param(ctx,
					     ctx->param_tcs_out_lds_offsets,
					     16, 16),
				4);
}
261
262 static LLVMValueRef
263 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
264 {
265 struct gallivm_state *gallivm = &ctx->gallivm;
266 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
267 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
268
269 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
270 }
271
272 static LLVMValueRef
273 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
274 {
275 struct gallivm_state *gallivm = &ctx->gallivm;
276 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
277 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
278 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
279
280 return LLVMBuildAdd(gallivm->builder, patch0_offset,
281 LLVMBuildMul(gallivm->builder, patch_stride,
282 rel_patch_id, ""),
283 "");
284 }
285
286 static LLVMValueRef
287 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
288 {
289 struct gallivm_state *gallivm = &ctx->gallivm;
290 LLVMValueRef patch0_patch_data_offset =
291 get_tcs_out_patch0_patch_data_offset(ctx);
292 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
293 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
294
295 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
296 LLVMBuildMul(gallivm->builder, patch_stride,
297 rel_patch_id, ""),
298 "");
299 }
300
301 static LLVMValueRef get_instance_index_for_fetch(
302 struct si_shader_context *ctx,
303 unsigned param_start_instance, unsigned divisor)
304 {
305 struct gallivm_state *gallivm = &ctx->gallivm;
306
307 LLVMValueRef result = LLVMGetParam(ctx->main_fn,
308 ctx->param_instance_id);
309
310 /* The division must be done before START_INSTANCE is added. */
311 if (divisor > 1)
312 result = LLVMBuildUDiv(gallivm->builder, result,
313 LLVMConstInt(ctx->i32, divisor, 0), "");
314
315 return LLVMBuildAdd(gallivm->builder, result,
316 LLVMGetParam(ctx->main_fn, param_start_instance), "");
317 }
318
319 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
320 * to float. */
321 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
322 LLVMValueRef vec4,
323 unsigned double_index)
324 {
325 LLVMBuilderRef builder = ctx->gallivm.builder;
326 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
327 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
328 LLVMVectorType(f64, 2), "");
329 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
330 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
331 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
332 }
333
/**
 * Fetch one vertex shader input and expand it into four float components.
 *
 * \param input_index	index of the VS input (vertex element)
 * \param decl		TGSI input declaration (unused by this body)
 * \param out		receives the x/y/z/w components of the attribute
 *
 * Formats the hardware fetcher cannot produce directly are flagged in
 * shader->key.mono.vs_fix_fetch; those are loaded in several pieces and/or
 * fixed up with extra ALU code in the second switch below.
 */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list (vertex buffer descriptor for this input). */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		/* Common case: a single format-converted fetch. */
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components.
	 * Channels come from the first fetch; multi-load formats overwrite
	 * the relevant channels in the switch below. */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input[0], llvm_chan, "");
	}

	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(gallivm->builder, tmp, ctx->i32, "");
		else
			tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->i32, "");

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(gallivm->builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(gallivm->builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
			/* Clamp -2 (the only out-of-range value) to -1. */
			clamp = LLVMBuildFCmp(gallivm->builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(gallivm->builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(gallivm->builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000; /* FIXED is 16.16 fixed point */
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(gallivm->builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildUIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
						     ctx->i32, "");
			out[chan] = LLVMBuildSIToFP(gallivm->builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* One double per 2-dword load. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Two doubles per 4-dword load. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* One component per fetch; take .x of each. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(gallivm->builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			/* Integer formats: alpha is the integer 1. */
			out[3] = LLVMBuildBitCast(gallivm->builder, ctx->i32_1,
						  ctx->f32, "");
		}
		break;
	}
}
539
540 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
541 unsigned swizzle)
542 {
543 struct si_shader_context *ctx = si_shader_context(bld_base);
544
545 if (swizzle > 0)
546 return ctx->i32_0;
547
548 switch (ctx->type) {
549 case PIPE_SHADER_VERTEX:
550 return LLVMGetParam(ctx->main_fn,
551 ctx->param_vs_prim_id);
552 case PIPE_SHADER_TESS_CTRL:
553 return LLVMGetParam(ctx->main_fn,
554 ctx->param_tcs_patch_id);
555 case PIPE_SHADER_TESS_EVAL:
556 return LLVMGetParam(ctx->main_fn,
557 ctx->param_tes_patch_id);
558 case PIPE_SHADER_GEOMETRY:
559 return LLVMGetParam(ctx->main_fn,
560 ctx->param_gs_prim_id);
561 default:
562 assert(0);
563 return ctx->i32_0;
564 }
565 }
566
567 /**
568 * Return the value of tgsi_ind_register for indexing.
569 * This is the indirect index with the constant offset added to it.
570 */
571 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
572 const struct tgsi_ind_register *ind,
573 int rel_index)
574 {
575 struct gallivm_state *gallivm = &ctx->gallivm;
576 LLVMValueRef result;
577
578 result = ctx->addrs[ind->Index][ind->Swizzle];
579 result = LLVMBuildLoad(gallivm->builder, result, "");
580 result = LLVMBuildAdd(gallivm->builder, result,
581 LLVMConstInt(ctx->i32, rel_index, 0), "");
582 return result;
583 }
584
/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	/* Note: the index IR is emitted even on the LLVM 3.8 path below;
	 * only the result is discarded there. */
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);

	/* LLVM 3.8: If indirect resource indexing is used:
	 * - SI & CIK hang
	 * - VI crashes
	 */
	if (HAVE_LLVM == 0x0308)
		return LLVMGetUndef(ctx->i32);

	return si_llvm_bound_index(ctx, result, num);
}
604
605
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst		destination register; used when \p src is NULL
 * \param src		source register, or NULL to address \p dst instead
 * \param vertex_dw_stride	dword stride between vertices of a 2D
 *				(per-vertex) register; may be NULL for 1D
 * \param base_addr	starting dword address the offsets are added to
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		/* base += vertex_index * vertex_dw_stride */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* Indirect addressing is relative to the start of the
		 * declared array, if any. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* 4 dwords per attribute. */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
690
/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
/**
 * Compute the byte address of an attribute in the off-chip TCS->TES buffer.
 *
 * \param rel_patch_id	patch index
 * \param vertex_index	vertex within the patch, or NULL for per-patch
 *			attributes
 * \param param_index	attribute index (unique index, see above layout)
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Patch geometry is packed into the offchip layout register. */
	vertices_per_patch = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 6);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		/* Per-vertex: address by global vertex number; one attribute
		 * slot per vertex across all patches. */
		base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: one attribute slot per patch. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* 16 bytes per attribute (4 x dword). */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch attributes start after all per-vertex ones. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
752
/**
 * Like get_tcs_tes_buffer_address, but derive patch/vertex/attribute
 * indices from a TGSI register (either \p dst or \p src, one must be set).
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
				struct si_shader_context *ctx,
				const struct tgsi_full_dst_register *dst,
				const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2D registers address a vertex within the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect addressing is relative to the declared array start. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = ctx->i32_0;
	}

	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   LLVMConstInt(ctx->i32, param_index_base, 0),
				   "");

	return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
					  vertex_index, param_index);
}
815
/**
 * Load a value of the given TGSI type from a buffer.
 *
 * \param type		TGSI result type
 * \param swizzle	component to load (0..3), or ~0 to load a full vec4
 * \param buffer	buffer resource descriptor
 * \param offset	forwarded to ac_build_buffer_load; NOTE(review):
 *			callers pass the ring base here — confirm which of
 *			offset/base maps to voffset vs. soffset in ac_llvm
 * \param base		forwarded to ac_build_buffer_load (see note above)
 * \param readonly_memory  true if the buffer is never written this draw
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool readonly_memory)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		/* Whole vec4 requested. */
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* 32-bit scalar: load the vec4 and pick one component. */
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, readonly_memory);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	/* 64-bit scalar: load the two consecutive dwords separately. */
	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, readonly_memory);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, readonly_memory);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
851
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* vec4: recurse per channel and gather. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, swizzle, 0));

	value = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit: combine with the next dword. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       ctx->i32_1);
		value2 = ac_build_indexed_load(&ctx->ac, ctx->lds, dw_addr, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
892
893 /**
894 * Store to LDS.
895 *
896 * \param swizzle offset (typically 0..3)
897 * \param dw_addr address in dwords
898 * \param value value to store
899 */
900 static void lds_store(struct lp_build_tgsi_context *bld_base,
901 unsigned dw_offset_imm, LLVMValueRef dw_addr,
902 LLVMValueRef value)
903 {
904 struct si_shader_context *ctx = si_shader_context(bld_base);
905 struct gallivm_state *gallivm = &ctx->gallivm;
906
907 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
908 LLVMConstInt(ctx->i32, dw_offset_imm, 0));
909
910 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
911 ac_build_indexed_store(&ctx->ac, ctx->lds,
912 dw_addr, value);
913 }
914
915 static LLVMValueRef fetch_input_tcs(
916 struct lp_build_tgsi_context *bld_base,
917 const struct tgsi_full_src_register *reg,
918 enum tgsi_opcode_type type, unsigned swizzle)
919 {
920 struct si_shader_context *ctx = si_shader_context(bld_base);
921 LLVMValueRef dw_addr, stride;
922
923 stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
924 dw_addr = get_tcs_in_current_patch_offset(ctx);
925 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
926
927 return lds_load(bld_base, type, swizzle, dw_addr);
928 }
929
930 static LLVMValueRef fetch_output_tcs(
931 struct lp_build_tgsi_context *bld_base,
932 const struct tgsi_full_src_register *reg,
933 enum tgsi_opcode_type type, unsigned swizzle)
934 {
935 struct si_shader_context *ctx = si_shader_context(bld_base);
936 LLVMValueRef dw_addr, stride;
937
938 if (reg->Register.Dimension) {
939 stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
940 dw_addr = get_tcs_out_current_patch_offset(ctx);
941 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
942 } else {
943 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
944 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
945 }
946
947 return lds_load(bld_base, type, swizzle, dw_addr);
948 }
949
/* Fetch a TES input (a TCS output) from the off-chip TCS->TES buffer. */
static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rw_buffers, buffer, base, addr;

	/* Descriptor of the off-chip tessellation ring. */
	rw_buffers = LLVMGetParam(ctx->main_fn,
				  ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);

	/* readonly_memory=true: TES never writes this ring. */
	return buffer_load(bld_base, type, swizzle, buffer, base, addr, true);
}
968
/* Store a TCS output, which can be a per-vertex or per-patch output.
 *
 * Outputs are written both to LDS (so later LDS reads by the TCS itself
 * and the TCS epilog can see them) and to the off-chip TESS_OFFCHIP
 * buffer (where the TES reads them). Either store may be skipped when
 * it is provably unnecessary.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		/* Per-vertex output: address by vertex stride within the patch. */
		stride = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				skip_lds_store = false;
				is_tess_factor = true;
			}
		}
	}

	rw_buffers = LLVMGetParam(ctx->main_fn,
				  ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);

	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled dword separately.
		 * Tess factors are not written here; the epilog handles them. */
		if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}
	}

	/* Full writemask: one vec4 store is more efficient. */
	if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}
1052
/* Fetch a GS input, i.e. an output of the previous (ES) stage.
 *
 * On GFX9 the ESGS ring lives in LDS; on GFX6-8 it is a memory ring
 * buffer addressed via per-vertex offsets passed in as parameters.
 * swizzle == ~0 requests all four channels at once.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* The primitive ID comes from an SGPR, not the ESGS ring. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->b.chip_class >= GFX9) {
		unsigned index = reg->Dimension.Index;

		/* Six 16-bit vertex offsets are packed into three SGPRs. */
		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(gallivm->builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		/* Load all four channels by recursing per channel. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	unsigned vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += ctx->param_gs_vtx0_offset;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += ctx->param_gs_vtx2_offset - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->main_fn,
						   vtx_offset_param),
				      4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values occupy two consecutive dword slots. */
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true);
		return si_llvm_emit_fetch_64bit(bld_base, type,
						value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1147
1148 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1149 {
1150 switch (interpolate) {
1151 case TGSI_INTERPOLATE_CONSTANT:
1152 return 0;
1153
1154 case TGSI_INTERPOLATE_LINEAR:
1155 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1156 return SI_PARAM_LINEAR_SAMPLE;
1157 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1158 return SI_PARAM_LINEAR_CENTROID;
1159 else
1160 return SI_PARAM_LINEAR_CENTER;
1161 break;
1162 case TGSI_INTERPOLATE_COLOR:
1163 case TGSI_INTERPOLATE_PERSPECTIVE:
1164 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1165 return SI_PARAM_PERSP_SAMPLE;
1166 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1167 return SI_PARAM_PERSP_CENTROID;
1168 else
1169 return SI_PARAM_PERSP_CENTER;
1170 break;
1171 default:
1172 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1173 return -1;
1174 }
1175 }
1176
1177 /**
1178 * Interpolate a fragment shader input.
1179 *
1180 * @param ctx context
1181 * @param input_index index of the input in hardware
1182 * @param semantic_name TGSI_SEMANTIC_*
1183 * @param semantic_index semantic index
1184 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1185 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1186 * @param interp_param interpolation weights (i,j)
1187 * @param prim_mask SI_PARAM_PRIM_MASK
1188 * @param face SI_PARAM_FRONT_FACE
1189 * @param result the return value (4 components)
1190 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef attr_number;
	LLVMValueRef i, j;

	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	attr_number = LLVMConstInt(ctx->i32, input_index, 0);

	if (interp) {
		/* Split the packed barycentric weights into scalar (i, j). */
		interp_param = LLVMBuildBitCast(gallivm->builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(gallivm->builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and back
		 * color attributes and select by the FRONT_FACE input. */
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = LLVMConstInt(ctx->i32, back_attr_offset, 0);

		/* FRONT_FACE != 0 means the front face is visible. */
		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
			LLVMValueRef front, back;

			if (interp) {
				front = ac_build_fs_interp(&ctx->ac, llvm_chan,
							attr_number, prim_mask,
							i, j);
				back = ac_build_fs_interp(&ctx->ac, llvm_chan,
							back_attr_number, prim_mask,
							i, j);
			} else {
				front = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
				back = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, back_attr_number, prim_mask);
			}

			result[chan] = LLVMBuildSelect(gallivm->builder,
						is_face_positive,
						front,
						back,
						"");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* FOG only has one meaningful channel; yzw are (0, 0, 1). */
		if (interp) {
			result[0] = ac_build_fs_interp(&ctx->ac, ctx->i32_0,
						       attr_number, prim_mask, i, j);
		} else {
			result[0] = ac_build_fs_interp_mov(&ctx->ac, ctx->i32_0,
				LLVMConstInt(ctx->i32, 2, 0), /* P0 */
				attr_number, prim_mask);
		}
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		/* Generic input: interpolate all four channels. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);

			if (interp) {
				result[chan] = ac_build_fs_interp(&ctx->ac,
					llvm_chan, attr_number, prim_mask, i, j);
			} else {
				result[chan] = ac_build_fs_interp_mov(&ctx->ac,
					LLVMConstInt(ctx->i32, 2, 0), /* P0 */
					llvm_chan, attr_number, prim_mask);
			}
		}
	}
}
1305
1306 static void declare_input_fs(
1307 struct si_shader_context *ctx,
1308 unsigned input_index,
1309 const struct tgsi_full_declaration *decl,
1310 LLVMValueRef out[4])
1311 {
1312 struct lp_build_context *base = &ctx->bld_base.base;
1313 struct si_shader *shader = ctx->shader;
1314 LLVMValueRef main_fn = ctx->main_fn;
1315 LLVMValueRef interp_param = NULL;
1316 int interp_param_idx;
1317
1318 /* Get colors from input VGPRs (set by the prolog). */
1319 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1320 unsigned i = decl->Semantic.Index;
1321 unsigned colors_read = shader->selector->info.colors_read;
1322 unsigned mask = colors_read >> (i * 4);
1323 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1324 (i ? util_bitcount(colors_read & 0xf) : 0);
1325
1326 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1327 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1328 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1329 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1330 return;
1331 }
1332
1333 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1334 decl->Interp.Location);
1335 if (interp_param_idx == -1)
1336 return;
1337 else if (interp_param_idx) {
1338 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1339 }
1340
1341 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1342 decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1343 ctx->shader->key.part.ps.prolog.flatshade_colors)
1344 interp_param = NULL; /* load the constant color */
1345
1346 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1347 decl->Semantic.Index, shader->selector->info.num_inputs,
1348 shader->selector->info.colors_read, interp_param,
1349 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1350 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1351 &out[0]);
1352 }
1353
/* Return the sample ID, extracted from bits [11:8] of the ANCILLARY VGPR. */
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}
1358
1359
1360 /**
1361 * Load a dword from a constant buffer.
1362 */
1363 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1364 LLVMValueRef resource,
1365 LLVMValueRef offset)
1366 {
1367 LLVMBuilderRef builder = ctx->gallivm.builder;
1368 LLVMValueRef args[2] = {resource, offset};
1369
1370 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1371 LP_FUNC_ATTR_READNONE |
1372 LP_FUNC_ATTR_LEGACY);
1373 }
1374
1375 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1376 {
1377 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1378 struct gallivm_state *gallivm = &ctx->gallivm;
1379 LLVMBuilderRef builder = gallivm->builder;
1380 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1381 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1382 LLVMValueRef resource = ac_build_indexed_load_const(&ctx->ac, desc, buf_index);
1383
1384 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1385 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1386 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1387
1388 LLVMValueRef pos[4] = {
1389 buffer_load_const(ctx, resource, offset0),
1390 buffer_load_const(ctx, resource, offset1),
1391 LLVMConstReal(ctx->f32, 0),
1392 LLVMConstReal(ctx->f32, 0)
1393 };
1394
1395 return lp_build_gather_values(gallivm, pos, 4);
1396 }
1397
/* Declare a TGSI system value and cache its LLVM value in
 * ctx->system_values[index] for later fetches.
 */
static void declare_system_value(struct si_shader_context *ctx,
				 unsigned index,
				 const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(ctx->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* The vertex_id input doesn't include the base vertex;
		 * add it to match TGSI semantics. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(ctx->main_fn,
						  ctx->param_base_vertex), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		/* Extract the "indexed draw" flag from bit 1 of vs_state. */
		indexed = LLVMBuildLShr(gallivm->builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(gallivm->builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(gallivm->builder, indexed,
					LLVMGetParam(ctx->main_fn, ctx->param_base_vertex),
					ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_start_instance);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_draw_id);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* The invocation ID comes from different inputs per stage. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(ctx->main_fn,
					     ctx->param_gs_instance_id);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord: (x, y, z, 1/w); w is received as W and
		 * must be inverted. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position within the pixel: the fractional part of
		 * the fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* The patch vertex count is packed into different SGPRs for
		 * the TCS and the TES. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, ctx->param_tcs_offchip_layout, 9, 7);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tess buffer. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->main_fn,
					  ctx->param_rw_buffers);
		buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
				LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, TGSI_TYPE_FLOAT,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Driver-set default tess levels live in a constant buffer:
		 * outer levels at dwords 0-3, inner levels at dwords 4-7. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_indexed_load_const(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&ctx->bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		/* A fixed block size declared in the shader becomes
		 * constants; otherwise read the size from an SGPR. */
		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_SIZE);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_THREAD_ID);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		if (HAVE_LLVM >= 0x0309) {
			/* A helper invocation is one where ps.live is false. */
			value = lp_build_intrinsic(gallivm->builder,
						   "llvm.amdgcn.ps.live",
						   ctx->i1, NULL, 0,
						   LP_FUNC_ATTR_READNONE);
			value = LLVMBuildNot(gallivm->builder, value, "");
			value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		} else {
			assert(!"TGSI_SEMANTIC_HELPER_INVOCATION unsupported");
			return;
		}
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		/* Wavefronts have 64 lanes on this hardware. */
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* 64-bit mask with only this lane's bit set, as v2i32. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		/* Shift the pattern up to this lane; LE/LT invert it. */
		id = LLVMBuildZExt(gallivm->builder, id, ctx->i64, "");
		value = LLVMBuildShl(gallivm->builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildBitCast(gallivm->builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1675
1676 static void declare_compute_memory(struct si_shader_context *ctx,
1677 const struct tgsi_full_declaration *decl)
1678 {
1679 struct si_shader_selector *sel = ctx->shader->selector;
1680 struct gallivm_state *gallivm = &ctx->gallivm;
1681
1682 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1683 LLVMValueRef var;
1684
1685 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1686 assert(decl->Range.First == decl->Range.Last);
1687 assert(!ctx->shared_memory);
1688
1689 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1690 LLVMArrayType(ctx->i8, sel->local_size),
1691 "compute_lds",
1692 LOCAL_ADDR_SPACE);
1693 LLVMSetAlignment(var, 4);
1694
1695 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1696 }
1697
1698 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1699 {
1700 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1701 ctx->param_const_buffers);
1702
1703 return ac_build_indexed_load_const(&ctx->ac, list_ptr,
1704 LLVMConstInt(ctx->i32, i, 0));
1705 }
1706
/* Fetch a value from a constant buffer, handling indirect buffer and
 * register indexing as well as 64-bit types.
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	/* All four channels requested: recurse per channel and gather. */
	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	if (reg->Register.Dimension && reg->Dimension.Indirect) {
		/* Indirectly indexed constant buffer: clamp the index and
		 * load its descriptor dynamically. */
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_buffers);
		LLVMValueRef index;
		index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
						   reg->Dimension.Index,
						   SI_NUM_CONST_BUFFERS);
		bufp = ac_build_indexed_load_const(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		/* Indirect register index: byte offset = addr * 16 + idx * 4. */
		addr = ctx->addrs[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
				    LLVMConstInt(ctx->i32, idx * 4, 0));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		/* 64-bit types need the adjacent dword as well. */
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = si_llvm_emit_fetch_64bit(bld_base, type,
						  result, result2);
	}
	return result;
}
1769
1770 /* Upper 16 bits must be zero. */
1771 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
1772 LLVMValueRef val[2])
1773 {
1774 return LLVMBuildOr(ctx->gallivm.builder, val[0],
1775 LLVMBuildShl(ctx->gallivm.builder, val[1],
1776 LLVMConstInt(ctx->i32, 16, 0),
1777 ""), "");
1778 }
1779
1780 /* Upper 16 bits are ignored and will be dropped. */
1781 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
1782 LLVMValueRef val[2])
1783 {
1784 LLVMValueRef v[2] = {
1785 LLVMBuildAnd(ctx->gallivm.builder, val[0],
1786 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
1787 val[1],
1788 };
1789 return si_llvm_pack_two_int16(ctx, v);
1790 }
1791
1792 /* Initialize arguments for the shader export intrinsic */
1793 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1794 LLVMValueRef *values,
1795 unsigned target,
1796 struct ac_export_args *args)
1797 {
1798 struct si_shader_context *ctx = si_shader_context(bld_base);
1799 struct lp_build_context *base = &bld_base->base;
1800 LLVMBuilderRef builder = ctx->gallivm.builder;
1801 LLVMValueRef val[4];
1802 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1803 unsigned chan;
1804 bool is_int8, is_int10;
1805
1806 /* Default is 0xf. Adjusted below depending on the format. */
1807 args->enabled_channels = 0xf; /* writemask */
1808
1809 /* Specify whether the EXEC mask represents the valid mask */
1810 args->valid_mask = 0;
1811
1812 /* Specify whether this is the last export */
1813 args->done = 0;
1814
1815 /* Specify the target we are exporting */
1816 args->target = target;
1817
1818 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1819 const struct si_shader_key *key = &ctx->shader->key;
1820 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
1821 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1822
1823 assert(cbuf >= 0 && cbuf < 8);
1824 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1825 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
1826 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
1827 }
1828
1829 args->compr = false;
1830 args->out[0] = base->undef;
1831 args->out[1] = base->undef;
1832 args->out[2] = base->undef;
1833 args->out[3] = base->undef;
1834
1835 switch (spi_shader_col_format) {
1836 case V_028714_SPI_SHADER_ZERO:
1837 args->enabled_channels = 0; /* writemask */
1838 args->target = V_008DFC_SQ_EXP_NULL;
1839 break;
1840
1841 case V_028714_SPI_SHADER_32_R:
1842 args->enabled_channels = 1; /* writemask */
1843 args->out[0] = values[0];
1844 break;
1845
1846 case V_028714_SPI_SHADER_32_GR:
1847 args->enabled_channels = 0x3; /* writemask */
1848 args->out[0] = values[0];
1849 args->out[1] = values[1];
1850 break;
1851
1852 case V_028714_SPI_SHADER_32_AR:
1853 args->enabled_channels = 0x9; /* writemask */
1854 args->out[0] = values[0];
1855 args->out[3] = values[3];
1856 break;
1857
1858 case V_028714_SPI_SHADER_FP16_ABGR:
1859 args->compr = 1; /* COMPR flag */
1860
1861 for (chan = 0; chan < 2; chan++) {
1862 LLVMValueRef pack_args[2] = {
1863 values[2 * chan],
1864 values[2 * chan + 1]
1865 };
1866 LLVMValueRef packed;
1867
1868 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
1869 args->out[chan] =
1870 LLVMBuildBitCast(ctx->gallivm.builder,
1871 packed, ctx->f32, "");
1872 }
1873 break;
1874
1875 case V_028714_SPI_SHADER_UNORM16_ABGR:
1876 for (chan = 0; chan < 4; chan++) {
1877 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
1878 val[chan] = LLVMBuildFMul(builder, val[chan],
1879 LLVMConstReal(ctx->f32, 65535), "");
1880 val[chan] = LLVMBuildFAdd(builder, val[chan],
1881 LLVMConstReal(ctx->f32, 0.5), "");
1882 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1883 ctx->i32, "");
1884 }
1885
1886 args->compr = 1; /* COMPR flag */
1887 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1888 si_llvm_pack_two_int16(ctx, val));
1889 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1890 si_llvm_pack_two_int16(ctx, val+2));
1891 break;
1892
1893 case V_028714_SPI_SHADER_SNORM16_ABGR:
1894 for (chan = 0; chan < 4; chan++) {
1895 /* Clamp between [-1, 1]. */
1896 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1897 values[chan],
1898 LLVMConstReal(ctx->f32, 1));
1899 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1900 val[chan],
1901 LLVMConstReal(ctx->f32, -1));
1902 /* Convert to a signed integer in [-32767, 32767]. */
1903 val[chan] = LLVMBuildFMul(builder, val[chan],
1904 LLVMConstReal(ctx->f32, 32767), "");
1905 /* If positive, add 0.5, else add -0.5. */
1906 val[chan] = LLVMBuildFAdd(builder, val[chan],
1907 LLVMBuildSelect(builder,
1908 LLVMBuildFCmp(builder, LLVMRealOGE,
1909 val[chan], base->zero, ""),
1910 LLVMConstReal(ctx->f32, 0.5),
1911 LLVMConstReal(ctx->f32, -0.5), ""), "");
1912 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1913 }
1914
1915 args->compr = 1; /* COMPR flag */
1916 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1917 si_llvm_pack_two_int32_as_int16(ctx, val));
1918 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1919 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1920 break;
1921
1922 case V_028714_SPI_SHADER_UINT16_ABGR: {
1923 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1924 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
1925 LLVMValueRef max_alpha =
1926 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
1927
1928 /* Clamp. */
1929 for (chan = 0; chan < 4; chan++) {
1930 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1931 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1932 val[chan],
1933 chan == 3 ? max_alpha : max_rgb);
1934 }
1935
1936 args->compr = 1; /* COMPR flag */
1937 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1938 si_llvm_pack_two_int16(ctx, val));
1939 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1940 si_llvm_pack_two_int16(ctx, val+2));
1941 break;
1942 }
1943
1944 case V_028714_SPI_SHADER_SINT16_ABGR: {
1945 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
1946 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
1947 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
1948 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
1949 LLVMValueRef max_alpha =
1950 !is_int10 ? max_rgb : ctx->i32_1;
1951 LLVMValueRef min_alpha =
1952 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
1953
1954 /* Clamp. */
1955 for (chan = 0; chan < 4; chan++) {
1956 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1957 val[chan] = lp_build_emit_llvm_binary(bld_base,
1958 TGSI_OPCODE_IMIN,
1959 val[chan], chan == 3 ? max_alpha : max_rgb);
1960 val[chan] = lp_build_emit_llvm_binary(bld_base,
1961 TGSI_OPCODE_IMAX,
1962 val[chan], chan == 3 ? min_alpha : min_rgb);
1963 }
1964
1965 args->compr = 1; /* COMPR flag */
1966 args->out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1967 si_llvm_pack_two_int32_as_int16(ctx, val));
1968 args->out[1] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1969 si_llvm_pack_two_int32_as_int16(ctx, val+2));
1970 break;
1971 }
1972
1973 case V_028714_SPI_SHADER_32_ABGR:
1974 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
1975 break;
1976 }
1977 }
1978
1979 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1980 LLVMValueRef alpha)
1981 {
1982 struct si_shader_context *ctx = si_shader_context(bld_base);
1983
1984 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1985 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
1986 SI_PARAM_ALPHA_REF);
1987
1988 LLVMValueRef alpha_pass =
1989 lp_build_cmp(&bld_base->base,
1990 ctx->shader->key.part.ps.epilog.alpha_func,
1991 alpha, alpha_ref);
1992 LLVMValueRef arg =
1993 lp_build_select(&bld_base->base,
1994 alpha_pass,
1995 LLVMConstReal(ctx->f32, 1.0f),
1996 LLVMConstReal(ctx->f32, -1.0f));
1997
1998 ac_build_kill(&ctx->ac, arg);
1999 } else {
2000 ac_build_kill(&ctx->ac, NULL);
2001 }
2002 }
2003
2004 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2005 LLVMValueRef alpha,
2006 unsigned samplemask_param)
2007 {
2008 struct si_shader_context *ctx = si_shader_context(bld_base);
2009 struct gallivm_state *gallivm = &ctx->gallivm;
2010 LLVMValueRef coverage;
2011
2012 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2013 coverage = LLVMGetParam(ctx->main_fn,
2014 samplemask_param);
2015 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2016
2017 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2018 ctx->i32,
2019 &coverage, 1, LP_FUNC_ATTR_READNONE);
2020
2021 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2022 ctx->f32, "");
2023
2024 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2025 LLVMConstReal(ctx->f32,
2026 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2027
2028 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2029 }
2030
/* Compute up to 8 user clip distances by dotting the clip vertex
 * (out_elts[0..3]) against each clip plane equation, and fill the two
 * position exports (POS+2, POS+3) that carry four distances each.
 *
 * The plane equations are read from the internal constant buffer slot
 * SI_VS_CONST_CLIP_PLANES.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	LLVMValueRef const_resource = ac_build_indexed_load_const(&ctx->ac, ptr, constbuf_index);

	/* Two exports, four clip distances (channels) per export. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of float
				 * plane[reg_index * 4 + chan][const_chan]
				 * in the constant buffer (4 bytes each). */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = buffer_load_const(ctx, const_resource,
							     addr);
				args->out[chan] =
					lp_build_add(base, args->out[chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
2075
2076 static void si_dump_streamout(struct pipe_stream_output_info *so)
2077 {
2078 unsigned i;
2079
2080 if (so->num_outputs)
2081 fprintf(stderr, "STREAMOUT\n");
2082
2083 for (i = 0; i < so->num_outputs; i++) {
2084 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2085 so->output[i].start_component;
2086 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2087 i, so->output[i].output_buffer,
2088 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2089 so->output[i].register_index,
2090 mask & 1 ? "x" : "",
2091 mask & 2 ? "y" : "",
2092 mask & 4 ? "z" : "",
2093 mask & 8 ? "w" : "");
2094 }
2095 }
2096
2097 static void emit_streamout_output(struct si_shader_context *ctx,
2098 LLVMValueRef const *so_buffers,
2099 LLVMValueRef const *so_write_offsets,
2100 struct pipe_stream_output *stream_out,
2101 struct si_shader_output_values *shader_out)
2102 {
2103 struct gallivm_state *gallivm = &ctx->gallivm;
2104 LLVMBuilderRef builder = gallivm->builder;
2105 unsigned buf_idx = stream_out->output_buffer;
2106 unsigned start = stream_out->start_component;
2107 unsigned num_comps = stream_out->num_components;
2108 LLVMValueRef out[4];
2109
2110 assert(num_comps && num_comps <= 4);
2111 if (!num_comps || num_comps > 4)
2112 return;
2113
2114 /* Load the output as int. */
2115 for (int j = 0; j < num_comps; j++) {
2116 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2117
2118 out[j] = LLVMBuildBitCast(builder,
2119 shader_out->values[start + j],
2120 ctx->i32, "");
2121 }
2122
2123 /* Pack the output. */
2124 LLVMValueRef vdata = NULL;
2125
2126 switch (num_comps) {
2127 case 1: /* as i32 */
2128 vdata = out[0];
2129 break;
2130 case 2: /* as v2i32 */
2131 case 3: /* as v4i32 (aligned to 4) */
2132 case 4: /* as v4i32 */
2133 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2134 for (int j = 0; j < num_comps; j++) {
2135 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2136 LLVMConstInt(ctx->i32, j, 0), "");
2137 }
2138 break;
2139 }
2140
2141 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2142 vdata, num_comps,
2143 so_write_offsets[buf_idx],
2144 ctx->i32_0,
2145 stream_out->dst_offset * 4, 1, 1, true, false);
2146 }
2147
2148 /**
2149 * Write streamout data to buffers for vertex stream @p stream (different
2150 * vertex streams can occur for GS copy shaders).
2151 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput, unsigned stream)
{
	struct si_shader_selector *sel = ctx->shader->selector;
	struct pipe_stream_output_info *so = &sel->so;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	/* Number of vertices in this wave that may emit streamout data. */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = ac_get_thread_id(&ctx->ac);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
                 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Load the descriptor and compute the write offset for each
		 * enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		LLVMValueRef so_buffers[4];
		LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
						    ctx->param_rw_buffers);

		for (i = 0; i < 4; i++) {
			/* stride == 0 means the buffer is unused. */
			if (!so->stride[i])
				continue;

			LLVMValueRef offset = LLVMConstInt(ctx->i32,
							   SI_VS_STREAMOUT_BUF0 + i, 0);

			so_buffers[i] = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

			/* The streamout offset SGPR is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			/* Byte offset of this vertex within the buffer. */
			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned reg = so->output[i].register_index;

			/* Skip outputs the shader doesn't actually write. */
			if (reg >= noutput)
				continue;

			/* Only write outputs belonging to the requested stream. */
			if (stream != so->output[i].stream)
				continue;

			emit_streamout_output(ctx, so_buffers, so_write_offset,
					      &so->output[i], &outputs[reg]);
		}
	}
	lp_build_endif(&if_ctx);
}
2232
2233
2234 /* Generate export instructions for hardware VS shader stage */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args, pos_args[4] = {};
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].semantic_name;
		semantic_index = outputs[i].semantic_index;
		bool export_param = true;

		/* Skip exporting parameters that the next stage doesn't read
		 * (dead-output elimination via the shader key). */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION: /* ignore these */
		case TGSI_SEMANTIC_PSIZE:
		case TGSI_SEMANTIC_CLIPVERTEX:
		case TGSI_SEMANTIC_EDGEFLAG:
			break;
		case TGSI_SEMANTIC_GENERIC:
		case TGSI_SEMANTIC_CLIPDIST:
			if (shader->key.opt.hw_vs.kill_outputs &
			    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
				export_param = false;
			break;
		default:
			if (shader->key.opt.hw_vs.kill_outputs2 &
			    (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index)))
				export_param = false;
			break;
		}

		/* An output only written for non-zero vertex streams never
		 * reaches the rasterizer, so don't export it as a param. */
		if (outputs[i].vertex_stream[0] != 0 &&
		    outputs[i].vertex_stream[1] != 0 &&
		    outputs[i].vertex_stream[2] != 0 &&
		    outputs[i].vertex_stream[3] != 0)
			export_param = false;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector (pos_args[1]) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the misc vector (pos_args[1]) below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Saved for the misc vector; also exported as a
			 * generic param for the pixel shader. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			/* Same treatment as LAYER. */
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (shader->key.opt.hw_vs.clip_disable) {
				/* Clipping disabled: export only as a param. */
				semantic_name = TGSI_SEMANTIC_GENERIC;
				goto handle_semantic;
			}
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (shader->key.opt.hw_vs.clip_disable)
				continue;
			/* Fills pos_args[2] and pos_args[3]. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			if (!export_param)
				continue;
			/* Allocate the next PARAM export slot and remember
			 * the mapping for the pixel shader input loads. */
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, &args);

		/* Position exports are buffered so they can be emitted
		 * together at the end; everything else is emitted now. */
		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
			       &args, sizeof(args));
		} else {
			ac_build_export(&ctx->ac, &args);
		}

		/* Clip distances are exported twice: as POS (for clipping)
		 * and as PARAM (so the PS can read gl_ClipDistance). */
		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = base->zero; /* X */
		pos_args[0].out[1] = base->zero; /* Y */
		pos_args[0].out[2] = base->zero; /* Z */
		pos_args[0].out[3] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (shader->selector->info.writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2) |
					       (shader->selector->info.writes_viewport_index << 3);
		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = base->zero; /* X */
		pos_args[1].out[1] = base->zero; /* Y */
		pos_args[1].out[2] = base->zero; /* Z */
		pos_args[1].out[3] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->gallivm.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = LLVMBuildBitCast(ctx->gallivm.builder,
							      edgeflag_value,
							      ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1].out[2] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1].out[3] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	/* Emit the buffered position exports with consecutive targets;
	 * the last one carries the "done" bit. */
	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}
}
2426
2427 /**
2428 * Forward all outputs from the vertex shader to the TES. This is only used
2429 * for the fixed function TCS.
2430 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);

	/* Get the offchip TESS ring buffer descriptor. */
	rw_buffers = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
	                LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));

	buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

	/* Compute the LDS address of this invocation's vertex:
	 * current patch base + invocation_id * per-vertex stride. */
	lds_vertex_stride = unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Copy each VS output the TES reads from LDS to the offchip buffer
	 * (bitmask of output slots to copy, one dword offset of 4*i each). */
	inputs = ctx->shader->key.mono.ff_tcs_inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                        LLVMConstInt(ctx->i32, 4 * i, 0),
		                         "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
					                      get_rel_patch_id(ctx),
		                                              invocation_id,
		                                              LLVMConstInt(ctx->i32, i, 0));

		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
		                            buffer_offset, 0, 1, 0, true, false);
	}
}
2473
/* Write the tessellation factors (outer/inner levels) from LDS to the
 * tess factor ring buffer, and optionally to the offchip buffer if the
 * TES reads them. Only TCS invocation 0 performs the stores.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have written the tess levels to LDS. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, ctx->i32_0, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_inner_index * 4, 0), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 LLVMConstInt(ctx->i32,
					      tess_outer_index * 4, 0), "");

	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->i32);
		outer[i] = LLVMGetUndef(ctx->i32);
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer);
		outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer);
	} else {
		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
		}
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->main_fn,
				  ctx->param_rw_buffers);
	buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
			LLVMConstInt(ctx->i32, SI_HS_RING_TESS_FACTOR, 0));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       ctx->param_tcs_factor_offset);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");

	/* Only the first patch of the threadgroup writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, ctx->i32_0, ""));

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->b.chip_class <= VI) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->i32, 0x80000000, 0),
					    1, ctx->i32_0, tf_base,
					    offset, 1, 0, true, false);
		offset += 4;
	}

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, 1, 0, true, false);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, 1, 0, true, false);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers,
				LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0));
		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_outer, 0));

		outer_vec = lp_build_gather_values(gallivm, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				    lp_build_gather_values(gallivm, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, 1, 0, true, false);
		}
	}

	lp_build_endif(&if_ctx);
}
2640
2641 static LLVMValueRef
2642 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2643 unsigned param, unsigned return_index)
2644 {
2645 return LLVMBuildInsertValue(ctx->gallivm.builder, ret,
2646 LLVMGetParam(ctx->main_fn, param),
2647 return_index, "");
2648 }
2649
2650 static LLVMValueRef
2651 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2652 unsigned param, unsigned return_index)
2653 {
2654 LLVMBuilderRef builder = ctx->gallivm.builder;
2655 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2656
2657 return LLVMBuildInsertValue(builder, ret,
2658 LLVMBuildBitCast(builder, p, ctx->f32, ""),
2659 return_index, "");
2660 }
2661
2662 static LLVMValueRef
2663 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2664 unsigned param, unsigned return_index)
2665 {
2666 LLVMBuilderRef builder = ctx->gallivm.builder;
2667 LLVMValueRef ptr, lo, hi;
2668
2669 ptr = LLVMGetParam(ctx->main_fn, param);
2670 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
2671 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
2672 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
2673 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
2674 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
2675 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
2676 }
2677
2678 /* This only writes the tessellation factor levels. */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
	LLVMValueRef offchip_soffset, offchip_layout;

	/* For the fixed-function TCS, forward VS outputs to the TES first. */
	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	/* Return epilog parameters from this function. */
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef ret = ctx->return_value;
	LLVMValueRef tf_soffset;
	unsigned vgpr;

	offchip_layout = LLVMGetParam(ctx->main_fn,
				      ctx->param_tcs_offchip_layout);
	offchip_soffset = LLVMGetParam(ctx->main_fn,
				       ctx->param_tcs_offchip_offset);
	tf_soffset = LLVMGetParam(ctx->main_fn,
				  ctx->param_tcs_factor_offset);

	/* SGPR slots 0-1: RW buffer descriptor pointer. */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret,
					   ctx->param_rw_buffers, 0);

	/* The SGPR slot layout differs between GFX6-8 and GFX9 (merged
	 * shaders); VGPRs always follow the SGPRs. */
	if (ctx->screen->b.chip_class >= GFX9) {
		ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
					   8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT, "");
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = LLVMBuildInsertValue(builder, ret, offchip_soffset, 2, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset, 4, "");
		vgpr = 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT + 1;
	} else {
		ret = LLVMBuildInsertValue(builder, ret, offchip_layout,
					   GFX6_SGPR_TCS_OFFCHIP_LAYOUT, "");
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = LLVMBuildInsertValue(builder, ret, offchip_soffset,
					   GFX6_TCS_NUM_USER_SGPR, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   GFX6_TCS_NUM_USER_SGPR + 1, "");
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs */
	/* VGPR return values must be floats, hence the bitcasts. */
	rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
	invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
	tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	ctx->return_value = ret;
}
2735
2736 /* Pass TCS inputs from LS to TCS on GFX9. */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* System SGPRs at fixed slots (the slot indices are the merged
	 * LS-HS calling convention on GFX9). */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* User SGPRs start at slot 8. */
	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);

	/* Descriptor pointers (const buffers, samplers, images, shader
	 * buffers), each split into two i32 SGPRs. */
	unsigned desc_param = ctx->param_tcs_out_lds_layout + 2;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_TCS_CONST_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_TCS_SAMPLERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
					   8 + GFX9_SGPR_TCS_IMAGES);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
					   8 + GFX9_SGPR_TCS_SHADER_BUFFERS);

	/* VGPRs follow the user SGPRs; they must be returned as floats. */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_patch_id, vgpr++);
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_rel_ids, vgpr++);
	ctx->return_value = ret;
}
2773
2774 /* Pass GS inputs from ES to GS on GFX9. */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	/* System SGPRs at fixed slots (merged ES-GS calling convention
	 * on GFX9). */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers, 0);
	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);

	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* Descriptor pointers, each split into two i32 SGPRs; user SGPRs
	 * start at slot 8. */
	unsigned desc_param = ctx->param_vs_state_bits + 1;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_GS_CONST_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_GS_SAMPLERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 2,
					   8 + GFX9_SGPR_GS_IMAGES);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 3,
					   8 + GFX9_SGPR_GS_SHADER_BUFFERS);

	/* Forward the five GS input VGPRs (vertex offsets etc.) as floats. */
	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
	for (unsigned i = 0; i < 5; i++) {
		unsigned param = ctx->param_gs_vtx01_offset + i;
		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
	}
	ctx->return_value = ret;
}
2802
/* Epilogue of the VS when run as LS (before TCS): write all outputs to
 * LDS at this vertex's slot so the TCS can read them as inputs.
 */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride in dwords, from the VS state bits. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		if (name == TGSI_SEMANTIC_LAYER ||
		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
			continue;

		/* Each output occupies 4 dwords at its unique slot index. */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					LLVMConstInt(ctx->i32, param * 4, 0), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}

	/* On GFX9 LS and HS are merged; pass the TCS inputs via the
	 * return value instead of a function call boundary. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}
2856
/* Epilogue for a shader compiled as the ES stage (a VS or TES feeding a
 * geometry shader): write all outputs to the ESGS ring, where the GS will
 * read them.
 *
 * On GFX9 the ESGS ring lives in LDS and ES/GS run as one merged shader,
 * so the values the GS part expects are also placed into the return value.
 * On older chips the ring is a buffer addressed via the es2gs_offset SGPR.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	LLVMValueRef lds_base = NULL;
	unsigned chan;
	int i;

	if (ctx->screen->b.chip_class >= GFX9 && info->num_outputs) {
		/* LDS dword address of this thread's ESGS item. */
		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
		lds_base = LLVMBuildMul(gallivm->builder, ac_get_thread_id(&ctx->ac),
					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
	}

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];
		int param;

		/* Writes to LAYER/VIEWPORT_INDEX in a non-last vertex stage
		 * are ignored (see the spec quote in si_llvm_emit_ls_epilogue). */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
						      info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* GFX9 has the ESGS ring in LDS. */
			if (ctx->screen->b.chip_class >= GFX9) {
				lds_store(bld_base, param * 4 + chan, lds_base, out_val);
				continue;
			}

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->esgs_ring,
						    out_val, 1, NULL, soffset,
						    (4 * param + chan) * 4,
						    1, 1, true, true);
		}
	}

	/* Merged ES-GS: forward the inputs the GS part needs. */
	if (ctx->screen->b.chip_class >= GFX9)
		si_set_es_return_value_for_gs(ctx);
}
2907
2908 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
2909 {
2910 if (ctx->screen->b.chip_class >= GFX9)
2911 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
2912 else
2913 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
2914 }
2915
2916 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2917 {
2918 struct si_shader_context *ctx = si_shader_context(bld_base);
2919
2920 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
2921 si_get_gs_wave_id(ctx));
2922 }
2923
2924 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2925 {
2926 struct si_shader_context *ctx = si_shader_context(bld_base);
2927 struct gallivm_state *gallivm = &ctx->gallivm;
2928 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2929 struct si_shader_output_values *outputs = NULL;
2930 int i,j;
2931
2932 assert(!ctx->shader->is_gs_copy_shader);
2933
2934 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2935
2936 /* Vertex color clamping.
2937 *
2938 * This uses a state constant loaded in a user data SGPR and
2939 * an IF statement is added that clamps all colors if the constant
2940 * is true.
2941 */
2942 if (ctx->type == PIPE_SHADER_VERTEX) {
2943 struct lp_build_if_state if_ctx;
2944 LLVMValueRef cond = NULL;
2945 LLVMValueRef addr, val;
2946
2947 for (i = 0; i < info->num_outputs; i++) {
2948 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2949 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2950 continue;
2951
2952 /* We've found a color. */
2953 if (!cond) {
2954 /* The state is in the first bit of the user SGPR. */
2955 cond = LLVMGetParam(ctx->main_fn,
2956 ctx->param_vs_state_bits);
2957 cond = LLVMBuildTrunc(gallivm->builder, cond,
2958 ctx->i1, "");
2959 lp_build_if(&if_ctx, gallivm, cond);
2960 }
2961
2962 for (j = 0; j < 4; j++) {
2963 addr = ctx->outputs[i][j];
2964 val = LLVMBuildLoad(gallivm->builder, addr, "");
2965 val = ac_build_clamp(&ctx->ac, val);
2966 LLVMBuildStore(gallivm->builder, val, addr);
2967 }
2968 }
2969
2970 if (cond)
2971 lp_build_endif(&if_ctx);
2972 }
2973
2974 for (i = 0; i < info->num_outputs; i++) {
2975 outputs[i].semantic_name = info->output_semantic_name[i];
2976 outputs[i].semantic_index = info->output_semantic_index[i];
2977
2978 for (j = 0; j < 4; j++) {
2979 outputs[i].values[j] =
2980 LLVMBuildLoad(gallivm->builder,
2981 ctx->outputs[i][j],
2982 "");
2983 outputs[i].vertex_stream[j] =
2984 (info->output_streams[i] >> (2 * j)) & 3;
2985 }
2986 }
2987
2988 if (ctx->shader->selector->so.num_outputs)
2989 si_llvm_emit_streamout(ctx, outputs, i, 0);
2990
2991 /* Export PrimitiveID. */
2992 if (ctx->shader->key.mono.vs_export_prim_id) {
2993 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
2994 outputs[i].semantic_index = 0;
2995 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2996 get_primitive_id(bld_base, 0));
2997 for (j = 1; j < 4; j++)
2998 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
2999
3000 memset(outputs[i].vertex_stream, 0,
3001 sizeof(outputs[i].vertex_stream));
3002 i++;
3003 }
3004
3005 si_llvm_export_vs(bld_base, outputs, i);
3006 FREE(outputs);
3007 }
3008
/* Pixel-shader export instructions collected during translation and
 * emitted together at the end (see si_emit_ps_exports). */
struct si_ps_exports {
	unsigned num;	/* number of valid entries in args[] */
	struct ac_export_args args[10];
};
3013
3014 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3015 bool writes_samplemask)
3016 {
3017 if (writes_z) {
3018 /* Z needs 32 bits. */
3019 if (writes_samplemask)
3020 return V_028710_SPI_SHADER_32_ABGR;
3021 else if (writes_stencil)
3022 return V_028710_SPI_SHADER_32_GR;
3023 else
3024 return V_028710_SPI_SHADER_32_R;
3025 } else if (writes_stencil || writes_samplemask) {
3026 /* Both stencil and sample mask need only 16 bits. */
3027 return V_028710_SPI_SHADER_UINT16_ABGR;
3028 } else {
3029 return V_028710_SPI_SHADER_ZERO;
3030 }
3031 }
3032
/* Queue the MRTZ export carrying depth, stencil, and/or sample mask.
 *
 * The channel layout depends on the format chosen by
 * si_get_spi_shader_z_format, which must match the SPI_SHADER_Z_FORMAT
 * register state.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args;
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args.valid_mask = 1; /* whether the EXEC mask is valid */
	args.done = 1; /* DONE bit */

	/* Specify the target we are exporting */
	args.target = V_008DFC_SQ_EXP_MRTZ;

	args.compr = 0; /* COMPR flag */
	args.out[0] = base->undef; /* R, depth */
	args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args.out[2] = base->undef; /* B, sample mask */
	args.out[3] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		/* 16-bit compressed export: only stencil and/or sample mask. */
		assert(!depth);
		args.compr = 1; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(ctx->gallivm.builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args.out[0] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args.out[1] = samplemask;
			mask |= 0xc;
		}
	} else {
		/* 32-bit export: one value per channel. */
		if (depth) {
			args.out[0] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args.out[1] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args.out[2] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND and HAINAN) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND &&
	    ctx->screen->b.family != CHIP_HAINAN)
		mask |= 0x1;

	/* Specify which components to enable */
	args.enabled_channels = mask;

	memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
3103
/* Queue the export of one color output.
 *
 * Applies the PS-epilog key states first (color clamp, alpha-to-one,
 * alpha test, line/polygon smoothing). If key.part.ps.epilog.last_cbuf > 0
 * (FS_COLOR0_WRITES_ALL_CBUFS), the color is broadcast to MRT0..last_cbuf.
 * \p is_last marks the final export of the shader, which gets the DONE bit.
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.part.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = ac_build_clamp(&ctx->ac, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only applied to color 0. */
	if (index == 0 &&
	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
		struct ac_export_args args[8];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
			if (args[c].enabled_channels)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
				args[c].done = 1; /* DONE bit */
			} else if (!args[c].enabled_channels)
				continue; /* unnecessary NULL export */

			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
		}
	} else {
		struct ac_export_args args;

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 &args);
		if (is_last) {
			args.valid_mask = 1; /* whether the EXEC mask is valid */
			args.done = 1; /* DONE bit */
		} else if (!args.enabled_channels)
			return; /* unnecessary NULL export */

		memcpy(&exp->args[exp->num++], &args, sizeof(args));
	}
}
3170
3171 static void si_emit_ps_exports(struct si_shader_context *ctx,
3172 struct si_ps_exports *exp)
3173 {
3174 for (unsigned i = 0; i < exp->num; i++)
3175 ac_build_export(&ctx->ac, &exp->args[i]);
3176 }
3177
3178 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3179 {
3180 struct si_shader_context *ctx = si_shader_context(bld_base);
3181 struct lp_build_context *base = &bld_base->base;
3182 struct ac_export_args args;
3183
3184 args.enabled_channels = 0x0; /* enabled channels */
3185 args.valid_mask = 1; /* whether the EXEC mask is valid */
3186 args.done = 1; /* DONE bit */
3187 args.target = V_008DFC_SQ_EXP_NULL;
3188 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3189 args.out[0] = base->undef; /* R */
3190 args.out[1] = base->undef; /* G */
3191 args.out[2] = base->undef; /* B */
3192 args.out[3] = base->undef; /* A */
3193
3194 ac_build_export(&ctx->ac, &args);
3195 }
3196
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 *
 * This is the return-value convention between the PS main part and the
 * separately-compiled PS epilog, which performs the actual exports.
 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z component is meaningful for gl_FragDepth. */
			depth = LLVMBuildLoad(builder,
					      ctx->outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs: written colors first, then depth/stencil/samplemask,
	 * each only if present. */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3289
3290 /**
3291 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3292 * buffer in number of elements and return it as an i32.
3293 */
3294 static LLVMValueRef get_buffer_size(
3295 struct lp_build_tgsi_context *bld_base,
3296 LLVMValueRef descriptor)
3297 {
3298 struct si_shader_context *ctx = si_shader_context(bld_base);
3299 struct gallivm_state *gallivm = &ctx->gallivm;
3300 LLVMBuilderRef builder = gallivm->builder;
3301 LLVMValueRef size =
3302 LLVMBuildExtractElement(builder, descriptor,
3303 LLVMConstInt(ctx->i32, 2, 0), "");
3304
3305 if (ctx->screen->b.chip_class == VI) {
3306 /* On VI, the descriptor contains the size in bytes,
3307 * but TXQ must return the size in elements.
3308 * The stride is always non-zero for resources using TXQ.
3309 */
3310 LLVMValueRef stride =
3311 LLVMBuildExtractElement(builder, descriptor,
3312 ctx->i32_1, "");
3313 stride = LLVMBuildLShr(builder, stride,
3314 LLVMConstInt(ctx->i32, 16, 0), "");
3315 stride = LLVMBuildAnd(builder, stride,
3316 LLVMConstInt(ctx->i32, 0x3FFF, 0), "");
3317
3318 size = LLVMBuildUDiv(builder, size, stride, "");
3319 }
3320
3321 return size;
3322 }
3323
3324 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3325 struct lp_build_tgsi_context *bld_base,
3326 struct lp_build_emit_data *emit_data);
3327
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
static void emit_optimization_barrier(struct si_shader_context *ctx,
				      LLVMValueRef *pvgpr)
{
	/* A per-barrier counter makes each asm string unique so that LLVM
	 * cannot merge or deduplicate separate barriers. */
	static int counter = 0;

	LLVMBuilderRef builder = ctx->gallivm.builder;
	char code[16];

	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

	if (!pvgpr) {
		/* Plain barrier: void inline asm with side effects. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
	} else {
		/* Pass the first dword of *pvgpr through the asm; the "=v,0"
		 * constraint ties the output to the input, so the value
		 * appears modified and cannot be hoisted. */
		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
		LLVMValueRef vgpr = *pvgpr;
		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
		unsigned vgpr_size = llvm_get_type_size(vgpr_type);
		LLVMValueRef vgpr0;

		/* The value must be a whole number of dwords. */
		assert(vgpr_size % 4 == 0);

		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

		*pvgpr = vgpr;
	}
}
3368
/* Combine these with & instead of |.
 *
 * Each value is an s_waitcnt SIMM16 operand with one counter field zeroed;
 * ANDing them together waits on every corresponding counter. NOOP_WAITCNT
 * leaves all fields at their maximum, i.e. waits on nothing.
 */
#define NOOP_WAITCNT 0xf7f
#define LGKM_CNT 0x07f
#define VM_CNT 0xf70
3373
3374 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3375 {
3376 struct gallivm_state *gallivm = &ctx->gallivm;
3377 LLVMBuilderRef builder = gallivm->builder;
3378 LLVMValueRef args[1] = {
3379 LLVMConstInt(ctx->i32, simm16, 0)
3380 };
3381 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3382 ctx->voidt, args, 1, 0);
3383 }
3384
3385 static void membar_emit(
3386 const struct lp_build_tgsi_action *action,
3387 struct lp_build_tgsi_context *bld_base,
3388 struct lp_build_emit_data *emit_data)
3389 {
3390 struct si_shader_context *ctx = si_shader_context(bld_base);
3391 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3392 unsigned flags = LLVMConstIntGetZExtValue(src0);
3393 unsigned waitcnt = NOOP_WAITCNT;
3394
3395 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3396 waitcnt &= VM_CNT & LGKM_CNT;
3397
3398 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3399 TGSI_MEMBAR_SHADER_BUFFER |
3400 TGSI_MEMBAR_SHADER_IMAGE))
3401 waitcnt &= VM_CNT;
3402
3403 if (flags & TGSI_MEMBAR_SHARED)
3404 waitcnt &= LGKM_CNT;
3405
3406 if (waitcnt != NOOP_WAITCNT)
3407 emit_waitcnt(ctx, waitcnt);
3408 }
3409
3410 static void clock_emit(
3411 const struct lp_build_tgsi_action *action,
3412 struct lp_build_tgsi_context *bld_base,
3413 struct lp_build_emit_data *emit_data)
3414 {
3415 struct si_shader_context *ctx = si_shader_context(bld_base);
3416 struct gallivm_state *gallivm = &ctx->gallivm;
3417 LLVMValueRef tmp;
3418
3419 tmp = lp_build_intrinsic(gallivm->builder, "llvm.readcyclecounter",
3420 ctx->i64, NULL, 0, 0);
3421 tmp = LLVMBuildBitCast(gallivm->builder, tmp, ctx->v2i32, "");
3422
3423 emit_data->output[0] =
3424 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_0, "");
3425 emit_data->output[1] =
3426 LLVMBuildExtractElement(gallivm->builder, tmp, ctx->i32_1, "");
3427 }
3428
3429 static LLVMValueRef
3430 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3431 const struct tgsi_full_src_register *reg)
3432 {
3433 LLVMValueRef index;
3434 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
3435 ctx->param_shader_buffers);
3436
3437 if (!reg->Register.Indirect)
3438 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3439 else
3440 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3441 reg->Register.Index,
3442 SI_NUM_SHADER_BUFFERS);
3443
3444 return ac_build_indexed_load_const(&ctx->ac, rsrc_ptr, index);
3445 }
3446
3447 static bool tgsi_is_array_sampler(unsigned target)
3448 {
3449 return target == TGSI_TEXTURE_1D_ARRAY ||
3450 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3451 target == TGSI_TEXTURE_2D_ARRAY ||
3452 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3453 target == TGSI_TEXTURE_CUBE_ARRAY ||
3454 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3455 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3456 }
3457
3458 static bool tgsi_is_array_image(unsigned target)
3459 {
3460 return target == TGSI_TEXTURE_3D ||
3461 target == TGSI_TEXTURE_CUBE ||
3462 target == TGSI_TEXTURE_1D_ARRAY ||
3463 target == TGSI_TEXTURE_2D_ARRAY ||
3464 target == TGSI_TEXTURE_CUBE_ARRAY ||
3465 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3466 }
3467
3468 /**
3469 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3470 *
3471 * At least on Tonga, executing image stores on images with DCC enabled and
3472 * non-trivial can eventually lead to lockups. This can occur when an
3473 * application binds an image as read-only but then uses a shader that writes
3474 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3475 * program termination) in this case, but it doesn't cost much to be a bit
3476 * nicer: disabling DCC in the shader still leads to undefined results but
3477 * avoids the lockup.
3478 */
3479 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3480 LLVMValueRef rsrc)
3481 {
3482 if (ctx->screen->b.chip_class <= CIK) {
3483 return rsrc;
3484 } else {
3485 LLVMBuilderRef builder = ctx->gallivm.builder;
3486 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3487 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3488 LLVMValueRef tmp;
3489
3490 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3491 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3492 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3493 }
3494 }
3495
/* Build the LLVM type "pointer (in the constant address space) to an array
 * of num_elements values of elem_type". num_elements == 0 makes the array
 * unsized. */
static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
{
	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
			       CONST_ADDR_SPACE);
}
3501
3502 static LLVMValueRef load_image_desc(struct si_shader_context *ctx,
3503 LLVMValueRef list, LLVMValueRef index,
3504 unsigned target)
3505 {
3506 LLVMBuilderRef builder = ctx->gallivm.builder;
3507
3508 if (target == TGSI_TEXTURE_BUFFER) {
3509 index = LLVMBuildMul(builder, index,
3510 LLVMConstInt(ctx->i32, 2, 0), "");
3511 index = LLVMBuildAdd(builder, index,
3512 ctx->i32_1, "");
3513 list = LLVMBuildPointerCast(builder, list,
3514 const_array(ctx->v4i32, 0), "");
3515 }
3516
3517 return ac_build_indexed_load_const(&ctx->ac, list, index);
3518 }
3519
/**
 * Load the resource descriptor for \p image.
 *
 * DCC is forced off (see force_dcc_off) for stores and for any directly
 * indexed image that the shader may write via store or atomic; indirectly
 * indexed images are only DCC-disabled when this access is a store.
 */
static void
image_fetch_rsrc(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *image,
	bool is_store, unsigned target,
	LLVMValueRef *rsrc)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
					     ctx->param_images);
	LLVMValueRef index;
	bool dcc_off = is_store;

	assert(image->Register.File == TGSI_FILE_IMAGE);

	if (!image->Register.Indirect) {
		const struct tgsi_shader_info *info = bld_base->info;
		/* Images this shader statically stores to or atomics on. */
		unsigned images_writemask = info->images_store |
					    info->images_atomic;

		index = LLVMConstInt(ctx->i32, image->Register.Index, 0);

		if (images_writemask & (1 << image->Register.Index))
			dcc_off = true;
	} else {
		/* From the GL_ARB_shader_image_load_store extension spec:
		 *
		 *    If a shader performs an image load, store, or atomic
		 *    operation using an image variable declared as an array,
		 *    and if the index used to select an individual element is
		 *    negative or greater than or equal to the size of the
		 *    array, the results of the operation are undefined but may
		 *    not lead to termination.
		 */
		index = get_bounded_indirect_index(ctx, &image->Indirect,
						   image->Register.Index,
						   SI_NUM_IMAGES);
	}

	*rsrc = load_image_desc(ctx, rsrc_ptr, index, target);
	/* Buffer descriptors have no DCC bit to clear. */
	if (dcc_off && target != TGSI_TEXTURE_BUFFER)
		*rsrc = force_dcc_off(ctx, *rsrc);
}
3566
/* Fetch the coordinates for an image op from source operand \p src and
 * pack them into a single value (scalar i32 or vector of i32).
 *
 * \p desc  the image descriptor; only read for the GFX9 2D workaround.
 */
static LLVMValueRef image_fetch_coords(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_instruction *inst,
		unsigned src, LLVMValueRef desc)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned target = inst->Memory.Texture;
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	LLVMValueRef coords[4];
	LLVMValueRef tmp;
	int chan;

	/* Fetch each coordinate channel as an i32. */
	for (chan = 0; chan < num_coords; ++chan) {
		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
		tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
		coords[chan] = tmp;
	}

	if (ctx->screen->b.chip_class >= GFX9) {
		/* 1D textures are allocated and used as 2D on GFX9. */
		if (target == TGSI_TEXTURE_1D) {
			coords[1] = ctx->i32_0;
			num_coords++;
		} else if (target == TGSI_TEXTURE_1D_ARRAY) {
			/* Insert y = 0; the layer moves to coordinate 2. */
			coords[2] = coords[1];
			coords[1] = ctx->i32_0;
			num_coords++;
		} else if (target == TGSI_TEXTURE_2D) {
			/* The hw can't bind a slice of a 3D image as a 2D
			 * image, because it ignores BASE_ARRAY if the target
			 * is 3D. The workaround is to read BASE_ARRAY and set
			 * it as the 3rd address operand for all 2D images.
			 */
			LLVMValueRef first_layer, const5, mask;

			const5 = LLVMConstInt(ctx->i32, 5, 0);
			mask = LLVMConstInt(ctx->i32, S_008F24_BASE_ARRAY(~0), 0);
			first_layer = LLVMBuildExtractElement(builder, desc, const5, "");
			first_layer = LLVMBuildAnd(builder, first_layer, mask, "");

			coords[2] = first_layer;
			num_coords++;
		}
	}

	if (num_coords == 1)
		return coords[0];

	if (num_coords == 3) {
		/* LLVM has difficulties lowering 3-element vectors. */
		coords[3] = bld_base->uint_bld.undef;
		num_coords = 4;
	}

	return lp_build_gather_values(gallivm, coords, num_coords);
}
3625
/**
 * Append the extra mode bits that are used by image load and store.
 *
 * The argument order differs between the old-style intrinsics used on
 * LLVM <= 3.9 — (r128, da[, glc], slc) — and the newer ones —
 * (glc, slc, lwe, da). GLC is forced on for coherent/volatile accesses
 * or when the caller requests it; atomics never take a GLC argument.
 */
static void image_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data * emit_data,
		unsigned target,
		bool atomic,
		bool force_glc)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
	LLVMValueRef r128 = i1false;
	/* "da" (declare array) — set for array-like targets. */
	LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
	LLVMValueRef glc =
		force_glc ||
		inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
		i1true : i1false;
	LLVMValueRef slc = i1false;
	LLVMValueRef lwe = i1false;

	if (atomic || (HAVE_LLVM <= 0x0309)) {
		/* Old-style argument order. */
		emit_data->args[emit_data->arg_count++] = r128;
		emit_data->args[emit_data->arg_count++] = da;
		if (!atomic) {
			emit_data->args[emit_data->arg_count++] = glc;
		}
		emit_data->args[emit_data->arg_count++] = slc;
		return;
	}

	/* HAVE_LLVM >= 0x0400 */
	emit_data->args[emit_data->arg_count++] = glc;
	emit_data->args[emit_data->arg_count++] = slc;
	emit_data->args[emit_data->arg_count++] = lwe;
	emit_data->args[emit_data->arg_count++] = da;
}
3664
/**
 * Append the resource and indexing arguments for buffer intrinsics.
 *
 * \param rsrc the v4i32 buffer resource
 * \param index index into the buffer (stride-based)
 * \param offset byte offset into the buffer
 * \param atomic atomics take no GLC argument
 * \param force_glc set GLC even without a coherent/volatile qualifier
 */
static void buffer_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		LLVMValueRef rsrc,
		LLVMValueRef index,
		LLVMValueRef offset,
		bool atomic,
		bool force_glc)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = rsrc;
	emit_data->args[emit_data->arg_count++] = index; /* vindex */
	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
	if (!atomic) {
		emit_data->args[emit_data->arg_count++] =
			force_glc ||
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3696
/* Gather the intrinsic arguments for a TGSI LOAD instruction.
 *
 * Source 0 is the resource (shader buffer or image); source 1 holds the
 * byte offset (buffers) or the texel coordinates (images).
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = ctx->v4f32;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");

		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
				   offset, false, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, target, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1, rsrc);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images index by coordinate, zero offset. */
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   ctx->i32_0, false, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false, false);
		}
	}
}
3740
3741 static unsigned get_load_intr_attribs(bool readonly_memory)
3742 {
3743 /* READNONE means writes can't affect it, while READONLY means that
3744 * writes can affect it. */
3745 return readonly_memory && HAVE_LLVM >= 0x0400 ?
3746 LP_FUNC_ATTR_READNONE :
3747 LP_FUNC_ATTR_READONLY;
3748 }
3749
3750 static unsigned get_store_intr_attribs(bool writeonly_memory)
3751 {
3752 return writeonly_memory && HAVE_LLVM >= 0x0400 ?
3753 LP_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
3754 LP_FUNC_ATTR_WRITEONLY;
3755 }
3756
/* Emit a buffer load, choosing the intrinsic by the number of components
 * the destination writemask needs. */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data,
			     bool readonly_memory)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: /* 3 & 4: no v3f32 variant exists, so round 3 up to 4 */
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			get_load_intr_attribs(readonly_memory));
}
3789
3790 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3791 const struct tgsi_full_instruction *inst,
3792 LLVMTypeRef type, int arg)
3793 {
3794 struct gallivm_state *gallivm = &ctx->gallivm;
3795 LLVMBuilderRef builder = gallivm->builder;
3796 LLVMValueRef offset, ptr;
3797 int addr_space;
3798
3799 offset = lp_build_emit_fetch(&ctx->bld_base, inst, arg, 0);
3800 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3801
3802 ptr = ctx->shared_memory;
3803 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3804 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3805 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3806
3807 return ptr;
3808 }
3809
3810 static void load_emit_memory(
3811 struct si_shader_context *ctx,
3812 struct lp_build_emit_data *emit_data)
3813 {
3814 const struct tgsi_full_instruction *inst = emit_data->inst;
3815 struct gallivm_state *gallivm = &ctx->gallivm;
3816 LLVMBuilderRef builder = gallivm->builder;
3817 unsigned writemask = inst->Dst[0].Register.WriteMask;
3818 LLVMValueRef channels[4], ptr, derived_ptr, index;
3819 int chan;
3820
3821 ptr = get_memory_ptr(ctx, inst, ctx->f32, 1);
3822
3823 for (chan = 0; chan < 4; ++chan) {
3824 if (!(writemask & (1 << chan))) {
3825 channels[chan] = LLVMGetUndef(ctx->f32);
3826 continue;
3827 }
3828
3829 index = LLVMConstInt(ctx->i32, chan, 0);
3830 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3831 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3832 }
3833 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3834 }
3835
3836 /**
3837 * Return true if the memory accessed by a LOAD or STORE instruction is
3838 * read-only or write-only, respectively.
3839 *
3840 * \param shader_buffers_reverse_access_mask
3841 * For LOAD, set this to (store | atomic) slot usage in the shader.
3842 * For STORE, set this to (load | atomic) slot usage in the shader.
3843 * \param images_reverse_access_mask Same as above, but for images.
3844 */
3845 static bool is_oneway_access_only(const struct tgsi_full_instruction *inst,
3846 const struct tgsi_shader_info *info,
3847 unsigned shader_buffers_reverse_access_mask,
3848 unsigned images_reverse_access_mask)
3849 {
3850 /* RESTRICT means NOALIAS.
3851 * If there are no writes, we can assume the accessed memory is read-only.
3852 * If there are no reads, we can assume the accessed memory is write-only.
3853 */
3854 if (inst->Memory.Qualifier & TGSI_MEMORY_RESTRICT) {
3855 unsigned reverse_access_mask;
3856
3857 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3858 reverse_access_mask = shader_buffers_reverse_access_mask;
3859 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3860 reverse_access_mask = info->images_buffers &
3861 images_reverse_access_mask;
3862 } else {
3863 reverse_access_mask = ~info->images_buffers &
3864 images_reverse_access_mask;
3865 }
3866
3867 if (inst->Src[0].Register.Indirect) {
3868 if (!reverse_access_mask)
3869 return true;
3870 } else {
3871 if (!(reverse_access_mask &
3872 (1u << inst->Src[0].Register.Index)))
3873 return true;
3874 }
3875 }
3876
3877 /* If there are no buffer writes (for both shader buffers & image
3878 * buffers), it implies that buffer memory is read-only.
3879 * If there are no buffer reads (for both shader buffers & image
3880 * buffers), it implies that buffer memory is write-only.
3881 *
3882 * Same for the case when there are no writes/reads for non-buffer
3883 * images.
3884 */
3885 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3886 (inst->Src[0].Register.File == TGSI_FILE_IMAGE &&
3887 inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
3888 if (!shader_buffers_reverse_access_mask &&
3889 !(info->images_buffers & images_reverse_access_mask))
3890 return true;
3891 } else {
3892 if (!(~info->images_buffers & images_reverse_access_mask))
3893 return true;
3894 }
3895 return false;
3896 }
3897
/**
 * Emit the LOAD opcode: dispatch to LDS, shader-buffer, buffer-image, or
 * image loads depending on the source register file and texture target.
 */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
	char intrinsic_name[64];
	bool readonly_memory = false;

	/* Shared (LDS) memory uses native LLVM loads, not intrinsics. */
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile loads must not be reordered past preceding VMEM stores. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx, VM_CNT);

	/* The memory is read-only if nothing in the shader writes or
	 * atomically modifies the accessed slots (and it's not volatile);
	 * that allows stronger LLVM call attributes. */
	readonly_memory = !(inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) &&
			  is_oneway_access_only(inst, info,
						info->shader_buffers_store |
						info->shader_buffers_atomic,
						info->images_store |
						info->images_atomic);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data, readonly_memory);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* Buffer images use the typed buffer load intrinsic. */
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				get_load_intr_attribs(readonly_memory));
	} else {
		/* Build the overloaded image.load intrinsic name from the
		 * vdata/coords/rsrc types. */
		ac_get_image_intr_name("llvm.amdgcn.image.load",
				       emit_data->dst_type, /* vdata */
				       LLVMTypeOf(emit_data->args[0]), /* coords */
				       LLVMTypeOf(emit_data->args[1]), /* rsrc */
				       intrinsic_name, sizeof(intrinsic_name));

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				get_load_intr_attribs(readonly_memory));
	}
}
3951
/**
 * Gather intrinsic arguments for the STORE opcode.
 *
 * args[0] receives the 4-component store data; the remaining slots are
 * filled with the resource descriptor, coordinates/offsets, and flags by
 * buffer_append_args/image_append_args. The exact slot layout is relied
 * upon by store_emit/store_emit_buffer.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	/* Stores produce no SSA result. */
	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* Source 1 holds the value to store; gather all 4 channels. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register names the buffer/image resource. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Source 0 is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");

		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
				   offset, false, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* 8bit/16bit TC L1 write corruption bug on SI.
		 * All store opcodes not aligned to a dword are affected.
		 *
		 * The only way to get unaligned stores in radeonsi is through
		 * shader images.
		 */
		bool force_glc = ctx->screen->b.chip_class == SI;

		image_fetch_rsrc(bld_base, &memory, true, target, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 0, rsrc);

		if (target == TGSI_TEXTURE_BUFFER) {
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   ctx->i32_0, false, force_glc);
		} else {
			/* Image stores: vdata, coords, rsrc, dmask, then the
			 * common image flags. */
			emit_data->args[1] = coords;
			emit_data->args[2] = rsrc;
			emit_data->args[3] = LLVMConstInt(ctx->i32, 15, 0); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false, force_glc);
		}
	}
}
4016
/**
 * Emit buffer stores for the STORE opcode (TGSI_FILE_BUFFER destination).
 *
 * The writemask may be sparse, so it is split into consecutive runs and
 * one buffer.store intrinsic is emitted per run, with args[0] (data) and
 * args[3] (byte offset) rewritten for each run. The remaining args were
 * prepared by store_fetch_args.
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		bool writeonly_memory)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Re-queue the third component for the next pass. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			/* Full vector: store the gathered data as-is. */
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from components [start, start+1]. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				LLVMConstInt(ctx->i32, start, 0), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				ctx->i32_0, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				LLVMConstInt(ctx->i32, start + 1, 0), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, ctx->i32_1, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				LLVMConstInt(ctx->i32, start, 0), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first component. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				LLVMConstInt(ctx->i32, start * 4, 0), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			get_store_intr_attribs(writeonly_memory));
	}
}
4089
4090 static void store_emit_memory(
4091 struct si_shader_context *ctx,
4092 struct lp_build_emit_data *emit_data)
4093 {
4094 const struct tgsi_full_instruction *inst = emit_data->inst;
4095 struct gallivm_state *gallivm = &ctx->gallivm;
4096 LLVMBuilderRef builder = gallivm->builder;
4097 unsigned writemask = inst->Dst[0].Register.WriteMask;
4098 LLVMValueRef ptr, derived_ptr, data, index;
4099 int chan;
4100
4101 ptr = get_memory_ptr(ctx, inst, ctx->f32, 0);
4102
4103 for (chan = 0; chan < 4; ++chan) {
4104 if (!(writemask & (1 << chan))) {
4105 continue;
4106 }
4107 data = lp_build_emit_fetch(&ctx->bld_base, inst, 1, chan);
4108 index = LLVMConstInt(ctx->i32, chan, 0);
4109 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
4110 LLVMBuildStore(builder, data, derived_ptr);
4111 }
4112 }
4113
/**
 * Emit the STORE opcode: dispatch to LDS, shader-buffer, buffer-image, or
 * image stores depending on the destination register file and target.
 */
static void store_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
	unsigned target = inst->Memory.Texture;
	char intrinsic_name[64];
	bool writeonly_memory = false;

	/* Shared (LDS) memory uses native LLVM stores, not intrinsics. */
	if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
		store_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile stores must not be reordered past preceding VMEM ops. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx, VM_CNT);

	/* Write-only if nothing in the shader loads or atomically modifies
	 * the accessed slots; allows stronger LLVM call attributes. */
	writeonly_memory = is_oneway_access_only(inst, info,
						 info->shader_buffers_load |
						 info->shader_buffers_atomic,
						 info->images_load |
						 info->images_atomic);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		store_emit_buffer(ctx, emit_data, writeonly_memory);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Buffer images use the typed buffer store intrinsic. */
		emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, "llvm.amdgcn.buffer.store.format.v4f32",
			emit_data->dst_type, emit_data->args,
			emit_data->arg_count,
			get_store_intr_attribs(writeonly_memory));
	} else {
		/* Build the overloaded image.store intrinsic name from the
		 * vdata/coords/rsrc types. */
		ac_get_image_intr_name("llvm.amdgcn.image.store",
				       LLVMTypeOf(emit_data->args[0]), /* vdata */
				       LLVMTypeOf(emit_data->args[1]), /* coords */
				       LLVMTypeOf(emit_data->args[2]), /* rsrc */
				       intrinsic_name, sizeof(intrinsic_name));

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				get_store_intr_attribs(writeonly_memory));
	}
}
4167
/**
 * Gather intrinsic arguments for the ATOM* opcodes.
 *
 * Layout: [cmp-value for ATOMCAS,] value, then resource/coords/flags
 * appended by buffer_append_args or image_append_args. atomic_emit
 * depends on this slot order.
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	/* The result (the old memory value) is returned bitcast to f32. */
	emit_data->dst_type = ctx->f32;

	/* Source 2 is the value operand, reinterpreted as i32. */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");

	/* ATOMCAS additionally takes the new value in source 3. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Source 1 is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, ctx->i32, "");

		buffer_append_args(ctx, emit_data, rsrc, ctx->i32_0,
				   offset, true, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], true, target, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1, rsrc);

		if (target == TGSI_TEXTURE_BUFFER) {
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   ctx->i32_0, true, false);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true, false);
		}
	}
}
4225
4226 static void atomic_emit_memory(struct si_shader_context *ctx,
4227 struct lp_build_emit_data *emit_data) {
4228 struct gallivm_state *gallivm = &ctx->gallivm;
4229 LLVMBuilderRef builder = gallivm->builder;
4230 const struct tgsi_full_instruction * inst = emit_data->inst;
4231 LLVMValueRef ptr, result, arg;
4232
4233 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4234
4235 arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
4236 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4237
4238 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4239 LLVMValueRef new_data;
4240 new_data = lp_build_emit_fetch(&ctx->bld_base,
4241 inst, 3, 0);
4242
4243 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4244
4245 #if HAVE_LLVM >= 0x309
4246 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4247 LLVMAtomicOrderingSequentiallyConsistent,
4248 LLVMAtomicOrderingSequentiallyConsistent,
4249 false);
4250 #endif
4251
4252 result = LLVMBuildExtractValue(builder, result, 0, "");
4253 } else {
4254 LLVMAtomicRMWBinOp op;
4255
4256 switch(inst->Instruction.Opcode) {
4257 case TGSI_OPCODE_ATOMUADD:
4258 op = LLVMAtomicRMWBinOpAdd;
4259 break;
4260 case TGSI_OPCODE_ATOMXCHG:
4261 op = LLVMAtomicRMWBinOpXchg;
4262 break;
4263 case TGSI_OPCODE_ATOMAND:
4264 op = LLVMAtomicRMWBinOpAnd;
4265 break;
4266 case TGSI_OPCODE_ATOMOR:
4267 op = LLVMAtomicRMWBinOpOr;
4268 break;
4269 case TGSI_OPCODE_ATOMXOR:
4270 op = LLVMAtomicRMWBinOpXor;
4271 break;
4272 case TGSI_OPCODE_ATOMUMIN:
4273 op = LLVMAtomicRMWBinOpUMin;
4274 break;
4275 case TGSI_OPCODE_ATOMUMAX:
4276 op = LLVMAtomicRMWBinOpUMax;
4277 break;
4278 case TGSI_OPCODE_ATOMIMIN:
4279 op = LLVMAtomicRMWBinOpMin;
4280 break;
4281 case TGSI_OPCODE_ATOMIMAX:
4282 op = LLVMAtomicRMWBinOpMax;
4283 break;
4284 default:
4285 unreachable("unknown atomic opcode");
4286 }
4287
4288 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4289 LLVMAtomicOrderingSequentiallyConsistent,
4290 false);
4291 }
4292 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4293 }
4294
/**
 * Emit the ATOM* opcodes for buffers and images by calling the matching
 * llvm.amdgcn.{buffer,image}.atomic.* intrinsic; LDS atomics are handled
 * by atomic_emit_memory.
 */
static void atomic_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[40];
	LLVMValueRef tmp;

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		atomic_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* action->intr_name selects the operation, e.g. "add". */
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
	} else {
		LLVMValueRef coords;
		char coords_type[8];

		/* For ATOMCAS args[0..1] are {cmp, value}, so the coords
		 * slot is shifted by one. */
		if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
			coords = emit_data->args[2];
		else
			coords = emit_data->args[1];

		/* The image intrinsic is overloaded on the coord type. */
		ac_build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.atomic.%s.%s",
			 action->intr_name, coords_type);
	}

	/* The intrinsic returns the old i32 value; bitcast to f32 for TGSI. */
	tmp = lp_build_intrinsic(
		builder, intrinsic_name, ctx->i32,
		emit_data->args, emit_data->arg_count, 0);
	emit_data->output[emit_data->chan] =
		LLVMBuildBitCast(builder, tmp, ctx->f32, "");
}
4337
/**
 * Pack texture fetch parameters into an ac_image_args structure and stash
 * it in emit_data->args, to be unpacked by the *_emit callback (memcpy'd
 * back out there).
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct ac_image_args args = {};

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* One element stays scalar; otherwise gather into a vector. */
	if (count > 1)
		args.addr = lp_build_gather_values(gallivm, param, count);
	else
		args.addr = param[0];

	args.resource = res_ptr;
	args.sampler = samp_ptr;
	args.dmask = dmask;
	/* RECT targets use unnormalized coordinates. */
	args.unorm = target == TGSI_TEXTURE_RECT ||
		     target == TGSI_TEXTURE_SHADOWRECT;
	args.da = tgsi_is_array_sampler(target);

	/* Ugly, but we seem to have no other choice right now. */
	STATIC_ASSERT(sizeof(args) <= sizeof(emit_data->args));
	memcpy(emit_data->args, &args, sizeof(args));
}
4368
4369 static LLVMValueRef fix_resinfo(struct si_shader_context *ctx,
4370 unsigned target, LLVMValueRef out)
4371 {
4372 LLVMBuilderRef builder = ctx->gallivm.builder;
4373
4374 /* 1D textures are allocated and used as 2D on GFX9. */
4375 if (ctx->screen->b.chip_class >= GFX9 &&
4376 (target == TGSI_TEXTURE_1D_ARRAY ||
4377 target == TGSI_TEXTURE_SHADOW1D_ARRAY)) {
4378 LLVMValueRef layers =
4379 LLVMBuildExtractElement(builder, out,
4380 LLVMConstInt(ctx->i32, 2, 0), "");
4381 out = LLVMBuildInsertElement(builder, out, layers,
4382 ctx->i32_1, "");
4383 }
4384
4385 /* Divide the number of layers by 6 to get the number of cubes. */
4386 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4387 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4388 LLVMValueRef imm2 = LLVMConstInt(ctx->i32, 2, 0);
4389
4390 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4391 z = LLVMBuildSDiv(builder, z, LLVMConstInt(ctx->i32, 6, 0), "");
4392
4393 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4394 }
4395 return out;
4396 }
4397
/**
 * Gather arguments for RESQ (image/buffer size query): either just the
 * resource descriptor for buffers, or a full resinfo argument set for
 * images.
 */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = ctx->v4i32;

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		/* resq_emit reads the size out of the descriptor itself. */
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
				 &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		LLVMValueRef res_ptr;
		unsigned image_target;

		/* 3D images are queried as 2D arrays; see also fix_resinfo. */
		if (inst->Memory.Texture == TGSI_TEXTURE_3D)
			image_target = TGSI_TEXTURE_2D_ARRAY;
		else
			image_target = inst->Memory.Texture;

		image_fetch_rsrc(bld_base, reg, false, inst->Memory.Texture,
				 &res_ptr);
		/* Query mip level 0 with a full dmask. */
		set_tex_fetch_args(ctx, emit_data, image_target,
				   res_ptr, NULL, &ctx->i32_0, 1,
				   0xf);
	}
}
4431
/**
 * Emit RESQ: return the buffer size from the descriptor, or run a resinfo
 * image opcode and fix up the result for the TGSI target.
 */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Dword 2 of a buffer descriptor holds NUM_RECORDS. */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      LLVMConstInt(ctx->i32, 2, 0), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		struct ac_image_args args;

		memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
		args.opcode = ac_image_get_resinfo;
		out = ac_build_image_opcode(&ctx->ac, &args);

		out = fix_resinfo(ctx, inst->Memory.Texture, out);
	}

	emit_data->output[emit_data->chan] = out;
}
4460
4461 static const struct lp_build_tgsi_action tex_action;
4462
/* Kinds of descriptors loadable from the combined sampler/image list;
 * see load_sampler_desc for the slot layout of each kind. */
enum desc_type {
	DESC_IMAGE,	/* image view, dwords [0:7] of the slot */
	DESC_BUFFER,	/* buffer view, dwords [4:7] of the slot */
	DESC_FMASK,	/* FMASK view, dwords [8:15] of the slot */
	DESC_SAMPLER,	/* sampler state, dwords [12:15] of the slot */
};
4469
4470 /**
4471 * Load an image view, fmask view. or sampler state descriptor.
4472 */
4473 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4474 LLVMValueRef list, LLVMValueRef index,
4475 enum desc_type type)
4476 {
4477 struct gallivm_state *gallivm = &ctx->gallivm;
4478 LLVMBuilderRef builder = gallivm->builder;
4479
4480 switch (type) {
4481 case DESC_IMAGE:
4482 /* The image is at [0:7]. */
4483 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4484 break;
4485 case DESC_BUFFER:
4486 /* The buffer is in [4:7]. */
4487 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4488 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4489 list = LLVMBuildPointerCast(builder, list,
4490 const_array(ctx->v4i32, 0), "");
4491 break;
4492 case DESC_FMASK:
4493 /* The FMASK is at [8:15]. */
4494 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4495 index = LLVMBuildAdd(builder, index, ctx->i32_1, "");
4496 break;
4497 case DESC_SAMPLER:
4498 /* The sampler state is at [12:15]. */
4499 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4500 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4501 list = LLVMBuildPointerCast(builder, list,
4502 const_array(ctx->v4i32, 0), "");
4503 break;
4504 }
4505
4506 return ac_build_indexed_load_const(&ctx->ac, list, index);
4507 }
4508
/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
 *
 * SI-CI:
 *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
 *   filtering manually. The driver sets img7 to a mask clearing
 *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
 *     s_and_b32 samp0, samp0, img7
 *
 * VI:
 *   The ANISO_OVERRIDE sampler field enables this fix in TA.
 */
static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
					   LLVMValueRef res, LLVMValueRef samp)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef img7, samp0;

	/* VI and later handle this in hardware; return samp unmodified. */
	if (ctx->screen->b.chip_class >= VI)
		return samp;

	/* AND sampler dword 0 with the mask the driver placed in image
	 * dword 7, then write it back into the sampler words. */
	img7 = LLVMBuildExtractElement(builder, res,
				       LLVMConstInt(ctx->i32, 7, 0), "");
	samp0 = LLVMBuildExtractElement(builder, samp,
					ctx->i32_0, "");
	samp0 = LLVMBuildAnd(builder, samp0, img7, "");
	return LLVMBuildInsertElement(builder, samp, samp0,
				      ctx->i32_0, "");
}
4537
/**
 * Load the resource, sampler, and (for MSAA targets) FMASK descriptors
 * for a texture instruction's sampler operand.
 *
 * \param res_ptr    required; receives the image/buffer view
 * \param samp_ptr   optional; receives the sampler state or NULL
 * \param fmask_ptr  optional; receives the FMASK view or NULL
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef list = LLVMGetParam(ctx->main_fn, ctx->param_samplers);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	LLVMValueRef index;

	/* The sampler is always the last source operand. */
	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	reg = &emit_data->inst->Src[sampler_src];

	if (reg->Register.Indirect) {
		/* Clamp indirect indices to the valid sampler range. */
		index = get_bounded_indirect_index(ctx,
						   &reg->Indirect,
						   reg->Register.Index,
						   SI_NUM_SAMPLERS);
	} else {
		index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
	}

	if (target == TGSI_TEXTURE_BUFFER)
		*res_ptr = load_sampler_desc(ctx, list, index, DESC_BUFFER);
	else
		*res_ptr = load_sampler_desc(ctx, list, index, DESC_IMAGE);

	if (samp_ptr)
		*samp_ptr = NULL;
	if (fmask_ptr)
		*fmask_ptr = NULL;

	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		/* MSAA targets use an FMASK instead of a sampler. */
		if (fmask_ptr)
			*fmask_ptr = load_sampler_desc(ctx, list, index,
						       DESC_FMASK);
	} else if (target != TGSI_TEXTURE_BUFFER) {
		if (samp_ptr) {
			*samp_ptr = load_sampler_desc(ctx, list, index,
						      DESC_SAMPLER);
			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
		}
	}
}
4586
/**
 * Gather arguments for TXQ (texture size query): the buffer size for
 * buffer targets, or a resinfo argument set keyed on the requested mip
 * level for everything else.
 */
static void txq_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef res_ptr;
	LLVMValueRef address;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Read the size from the buffer descriptor directly. */
		emit_data->args[0] = get_buffer_size(bld_base, res_ptr);
		return;
	}

	/* Textures - set the mip level. */
	address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

	set_tex_fetch_args(ctx, emit_data, target, res_ptr,
			   NULL, &address, 1, 0xf);
}
4611
4612 static void txq_emit(const struct lp_build_tgsi_action *action,
4613 struct lp_build_tgsi_context *bld_base,
4614 struct lp_build_emit_data *emit_data)
4615 {
4616 struct si_shader_context *ctx = si_shader_context(bld_base);
4617 struct ac_image_args args;
4618 unsigned target = emit_data->inst->Texture.Texture;
4619
4620 if (target == TGSI_TEXTURE_BUFFER) {
4621 /* Just return the buffer size. */
4622 emit_data->output[emit_data->chan] = emit_data->args[0];
4623 return;
4624 }
4625
4626 memcpy(&args, emit_data->args, sizeof(args)); /* ugly */
4627
4628 args.opcode = ac_image_get_resinfo;
4629 LLVMValueRef result = ac_build_image_opcode(&ctx->ac, &args);
4630
4631 emit_data->output[emit_data->chan] = fix_resinfo(ctx, target, result);
4632 }
4633
4634 static void tex_fetch_args(
4635 struct lp_build_tgsi_context *bld_base,
4636 struct lp_build_emit_data *emit_data)
4637 {
4638 struct si_shader_context *ctx = si_shader_context(bld_base);
4639 struct gallivm_state *gallivm = &ctx->gallivm;
4640 const struct tgsi_full_instruction *inst = emit_data->inst;
4641 unsigned opcode = inst->Instruction.Opcode;
4642 unsigned target = inst->Texture.Texture;
4643 LLVMValueRef coords[5], derivs[6];
4644 LLVMValueRef address[16];
4645 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4646 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4647 unsigned count = 0;
4648 unsigned chan;
4649 unsigned num_deriv_channels = 0;
4650 bool has_offset = inst->Texture.NumOffsets > 0;
4651 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4652 unsigned dmask = 0xf;
4653
4654 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4655
4656 if (target == TGSI_TEXTURE_BUFFER) {
4657 emit_data->dst_type = ctx->v4f32;
4658 emit_data->args[0] = LLVMBuildBitCast(gallivm->builder, res_ptr,
4659 ctx->v16i8, "");
4660 emit_data->args[1] = ctx->i32_0;
4661 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4662 emit_data->arg_count = 3;
4663 return;
4664 }
4665
4666 /* Fetch and project texture coordinates */
4667 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4668 for (chan = 0; chan < 3; chan++ ) {
4669 coords[chan] = lp_build_emit_fetch(bld_base,
4670 emit_data->inst, 0,
4671 chan);
4672 if (opcode == TGSI_OPCODE_TXP)
4673 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4674 TGSI_OPCODE_DIV,
4675 coords[chan],
4676 coords[3]);
4677 }
4678
4679 if (opcode == TGSI_OPCODE_TXP)
4680 coords[3] = bld_base->base.one;
4681
4682 /* Pack offsets. */
4683 if (has_offset &&
4684 opcode != TGSI_OPCODE_TXF &&
4685 opcode != TGSI_OPCODE_TXF_LZ) {
4686 /* The offsets are six-bit signed integers packed like this:
4687 * X=[5:0], Y=[13:8], and Z=[21:16].
4688 */
4689 LLVMValueRef offset[3], pack;
4690
4691 assert(inst->Texture.NumOffsets == 1);
4692
4693 for (chan = 0; chan < 3; chan++) {
4694 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4695 emit_data->inst, 0, chan);
4696 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4697 LLVMConstInt(ctx->i32, 0x3f, 0), "");
4698 if (chan)
4699 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4700 LLVMConstInt(ctx->i32, chan*8, 0), "");
4701 }
4702
4703 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4704 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4705 address[count++] = pack;
4706 }
4707
4708 /* Pack LOD bias value */
4709 if (opcode == TGSI_OPCODE_TXB)
4710 address[count++] = coords[3];
4711 if (opcode == TGSI_OPCODE_TXB2)
4712 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4713
4714 /* Pack depth comparison value */
4715 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4716 LLVMValueRef z;
4717
4718 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4719 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4720 } else {
4721 assert(ref_pos >= 0);
4722 z = coords[ref_pos];
4723 }
4724
4725 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4726 * so the depth comparison value isn't clamped for Z16 and
4727 * Z24 anymore. Do it manually here.
4728 *
4729 * It's unnecessary if the original texture format was
4730 * Z32_FLOAT, but we don't know that here.
4731 */
4732 if (ctx->screen->b.chip_class == VI)
4733 z = ac_build_clamp(&ctx->ac, z);
4734
4735 address[count++] = z;
4736 }
4737
4738 /* Pack user derivatives */
4739 if (opcode == TGSI_OPCODE_TXD) {
4740 int param, num_src_deriv_channels, num_dst_deriv_channels;
4741
4742 switch (target) {
4743 case TGSI_TEXTURE_3D:
4744 num_src_deriv_channels = 3;
4745 num_dst_deriv_channels = 3;
4746 num_deriv_channels = 3;
4747 break;
4748 case TGSI_TEXTURE_2D:
4749 case TGSI_TEXTURE_SHADOW2D:
4750 case TGSI_TEXTURE_RECT:
4751 case TGSI_TEXTURE_SHADOWRECT:
4752 case TGSI_TEXTURE_2D_ARRAY:
4753 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4754 num_src_deriv_channels = 2;
4755 num_dst_deriv_channels = 2;
4756 num_deriv_channels = 2;
4757 break;
4758 case TGSI_TEXTURE_CUBE:
4759 case TGSI_TEXTURE_SHADOWCUBE:
4760 case TGSI_TEXTURE_CUBE_ARRAY:
4761 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4762 /* Cube derivatives will be converted to 2D. */
4763 num_src_deriv_channels = 3;
4764 num_dst_deriv_channels = 3;
4765 num_deriv_channels = 2;
4766 break;
4767 case TGSI_TEXTURE_1D:
4768 case TGSI_TEXTURE_SHADOW1D:
4769 case TGSI_TEXTURE_1D_ARRAY:
4770 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4771 num_src_deriv_channels = 1;
4772
4773 /* 1D textures are allocated and used as 2D on GFX9. */
4774 if (ctx->screen->b.chip_class >= GFX9) {
4775 num_dst_deriv_channels = 2;
4776 num_deriv_channels = 2;
4777 } else {
4778 num_dst_deriv_channels = 1;
4779 num_deriv_channels = 1;
4780 }
4781 break;
4782 default:
4783 unreachable("invalid target");
4784 }
4785
4786 for (param = 0; param < 2; param++) {
4787 for (chan = 0; chan < num_src_deriv_channels; chan++)
4788 derivs[param * num_dst_deriv_channels + chan] =
4789 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4790
4791 /* Fill in the rest with zeros. */
4792 for (chan = num_src_deriv_channels;
4793 chan < num_dst_deriv_channels; chan++)
4794 derivs[param * num_dst_deriv_channels + chan] =
4795 bld_base->base.zero;
4796 }
4797 }
4798
4799 if (target == TGSI_TEXTURE_CUBE ||
4800 target == TGSI_TEXTURE_CUBE_ARRAY ||
4801 target == TGSI_TEXTURE_SHADOWCUBE ||
4802 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4803 ac_prepare_cube_coords(&ctx->ac,
4804 opcode == TGSI_OPCODE_TXD,
4805 target == TGSI_TEXTURE_CUBE_ARRAY ||
4806 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY,
4807 coords, derivs);
4808
4809 if (opcode == TGSI_OPCODE_TXD)
4810 for (int i = 0; i < num_deriv_channels * 2; i++)
4811 address[count++] = derivs[i];
4812
4813 /* Pack texture coordinates */
4814 address[count++] = coords[0];
4815 if (num_coords > 1)
4816 address[count++] = coords[1];
4817 if (num_coords > 2)
4818 address[count++] = coords[2];
4819
4820 /* 1D textures are allocated and used as 2D on GFX9. */
4821 if (ctx->screen->b.chip_class >= GFX9) {
4822 LLVMValueRef filler;
4823
4824 /* Use 0.5, so that we don't sample the border color. */
4825 if (opcode == TGSI_OPCODE_TXF)
4826 filler = ctx->i32_0;
4827 else
4828 filler = LLVMConstReal(ctx->f32, 0.5);
4829
4830 if (target == TGSI_TEXTURE_1D ||
4831 target == TGSI_TEXTURE_SHADOW1D) {
4832 address[count++] = filler;
4833 } else if (target == TGSI_TEXTURE_1D_ARRAY ||
4834 target == TGSI_TEXTURE_SHADOW1D_ARRAY) {
4835 address[count] = address[count - 1];
4836 address[count - 1] = filler;
4837 count++;
4838 }
4839 }
4840
4841 /* Pack LOD or sample index */
4842 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4843 address[count++] = coords[3];
4844 else if (opcode == TGSI_OPCODE_TXL2)
4845 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4846
4847 if (count > 16) {
4848 assert(!"Cannot handle more than 16 texture address parameters");
4849 count = 16;
4850 }
4851
4852 for (chan = 0; chan < count; chan++ ) {
4853 address[chan] = LLVMBuildBitCast(gallivm->builder,
4854 address[chan], ctx->i32, "");
4855 }
4856
4857 /* Adjust the sample index according to FMASK.
4858 *
4859 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4860 * which is the identity mapping. Each nibble says which physical sample
4861 * should be fetched to get that sample.
4862 *
4863 * For example, 0x11111100 means there are only 2 samples stored and
4864 * the second sample covers 3/4 of the pixel. When reading samples 0
4865 * and 1, return physical sample 0 (determined by the first two 0s
4866 * in FMASK), otherwise return physical sample 1.
4867 *
4868 * The sample index should be adjusted as follows:
4869 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4870 */
4871 if (target == TGSI_TEXTURE_2D_MSAA ||
4872 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4873 struct lp_build_emit_data txf_emit_data = *emit_data;
4874 LLVMValueRef txf_address[4];
4875 /* We only need .xy for non-arrays, and .xyz for arrays. */
4876 unsigned txf_count = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4877 struct tgsi_full_instruction inst = {};
4878
4879 memcpy(txf_address, address, sizeof(txf_address));
4880
4881 /* Read FMASK using TXF_LZ. */
4882 inst.Instruction.Opcode = TGSI_OPCODE_TXF_LZ;
4883 inst.Texture.Texture = target;
4884 txf_emit_data.inst = &inst;
4885 txf_emit_data.chan = 0;
4886 set_tex_fetch_args(ctx, &txf_emit_data,
4887 target, fmask_ptr, NULL,
4888 txf_address, txf_count, 0xf);
4889 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4890
4891 /* Initialize some constants. */
4892 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4893 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4894
4895 /* Apply the formula. */
4896 LLVMValueRef fmask =
4897 LLVMBuildExtractElement(gallivm->builder,
4898 txf_emit_data.output[0],
4899 ctx->i32_0, "");
4900
4901 unsigned sample_chan = txf_count; /* the sample index is last */
4902
4903 LLVMValueRef sample_index4 =
4904 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4905
4906 LLVMValueRef shifted_fmask =
4907 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4908
4909 LLVMValueRef final_sample =
4910 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4911
4912 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4913 * resource descriptor is 0 (invalid),
4914 */
4915 LLVMValueRef fmask_desc =
4916 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4917 ctx->v8i32, "");
4918
4919 LLVMValueRef fmask_word1 =
4920 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4921 ctx->i32_1, "");
4922
4923 LLVMValueRef word1_is_nonzero =
4924 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4925 fmask_word1, ctx->i32_0, "");
4926
4927 /* Replace the MSAA sample index. */
4928 address[sample_chan] =
4929 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4930 final_sample, address[sample_chan], "");
4931 }
4932
4933 if (opcode == TGSI_OPCODE_TXF ||
4934 opcode == TGSI_OPCODE_TXF_LZ) {
4935 /* add tex offsets */
4936 if (inst->Texture.NumOffsets) {
4937 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4938 const struct tgsi_texture_offset *off = inst->TexOffsets;
4939
4940 assert(inst->Texture.NumOffsets == 1);
4941
4942 switch (target) {
4943 case TGSI_TEXTURE_3D:
4944 address[2] = lp_build_add(uint_bld, address[2],
4945 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
4946 /* fall through */
4947 case TGSI_TEXTURE_2D:
4948 case TGSI_TEXTURE_SHADOW2D:
4949 case TGSI_TEXTURE_RECT:
4950 case TGSI_TEXTURE_SHADOWRECT:
4951 case TGSI_TEXTURE_2D_ARRAY:
4952 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4953 address[1] =
4954 lp_build_add(uint_bld, address[1],
4955 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
4956 /* fall through */
4957 case TGSI_TEXTURE_1D:
4958 case TGSI_TEXTURE_SHADOW1D:
4959 case TGSI_TEXTURE_1D_ARRAY:
4960 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4961 address[0] =
4962 lp_build_add(uint_bld, address[0],
4963 ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
4964 break;
4965 /* texture offsets do not apply to other texture targets */
4966 }
4967 }
4968 }
4969
4970 if (opcode == TGSI_OPCODE_TG4) {
4971 unsigned gather_comp = 0;
4972
4973 /* DMASK was repurposed for GATHER4. 4 components are always
4974 * returned and DMASK works like a swizzle - it selects
4975 * the component to fetch. The only valid DMASK values are
4976 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4977 * (red,red,red,red) etc.) The ISA document doesn't mention
4978 * this.
4979 */
4980
4981 /* Get the component index from src1.x for Gather4. */
4982 if (!tgsi_is_shadow_target(target)) {
4983 LLVMValueRef comp_imm;
4984 struct tgsi_src_register src1 = inst->Src[1].Register;
4985
4986 assert(src1.File == TGSI_FILE_IMMEDIATE);
4987
4988 comp_imm = ctx->imms[src1.Index * TGSI_NUM_CHANNELS + src1.SwizzleX];
4989 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4990 gather_comp = CLAMP(gather_comp, 0, 3);
4991 }
4992
4993 dmask = 1 << gather_comp;
4994 }
4995
4996 set_tex_fetch_args(ctx, emit_data, target, res_ptr,
4997 samp_ptr, address, count, dmask);
4998 }
4999
/* Gather4 should follow the same rules as bilinear filtering, but the hardware
 * incorrectly forces nearest filtering if the texture format is integer.
 * The only effect it has on Gather4, which always returns 4 texels for
 * bilinear filtering, is that the final coordinates are off by 0.5 of
 * the texel size.
 *
 * The workaround is to subtract 0.5 from the unnormalized coordinates,
 * or (0.5 / size) from the normalized coordinates.
 *
 * This rewrites args->addr in place; the X/Y texture coordinates inside
 * the packed address vector get the offset added to them.
 */
static void si_lower_gather4_integer(struct si_shader_context *ctx,
				     struct ac_image_args *args,
				     unsigned target)
{
	LLVMBuilderRef builder = ctx->gallivm.builder;
	LLVMValueRef coord = args->addr;
	LLVMValueRef half_texel[2];
	/* Texture coordinates start after:
	 *   {offset, bias, z-compare, derivatives}
	 * Only the offset and z-compare can occur here.
	 */
	unsigned coord_vgpr_index = (int)args->offset + (int)args->compare;
	int c;

	if (target == TGSI_TEXTURE_RECT ||
	    target == TGSI_TEXTURE_SHADOWRECT) {
		/* RECT targets use unnormalized coordinates, so the
		 * correction is a constant -0.5 texel. */
		half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
	} else {
		struct tgsi_full_instruction txq_inst = {};
		struct lp_build_emit_data txq_emit_data = {};

		/* Query the texture size. */
		txq_inst.Texture.Texture = target;
		txq_emit_data.inst = &txq_inst;
		txq_emit_data.dst_type = ctx->v4i32;
		set_tex_fetch_args(ctx, &txq_emit_data, target,
				   args->resource, NULL, &ctx->i32_0,
				   1, 0xf);
		txq_emit(NULL, &ctx->bld_base, &txq_emit_data);

		/* Compute -0.5 / size. */
		for (c = 0; c < 2; c++) {
			half_texel[c] =
				LLVMBuildExtractElement(builder, txq_emit_data.output[0],
							LLVMConstInt(ctx->i32, c, 0), "");
			half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
			half_texel[c] =
				lp_build_emit_llvm_unary(&ctx->bld_base,
							 TGSI_OPCODE_RCP, half_texel[c]);
			half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
						      LLVMConstReal(ctx->f32, -0.5), "");
		}
	}

	/* Apply the correction to the X and Y coordinate elements of the
	 * packed address. The elements are i32-typed, so bitcast to float,
	 * add, and bitcast back before re-inserting. */
	for (c = 0; c < 2; c++) {
		LLVMValueRef tmp;
		LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);

		tmp = LLVMBuildExtractElement(builder, coord, index, "");
		tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
		tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
		tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
		coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
	}

	args->addr = coord;
}
5066
/* Emit the final image/sample operation for a TGSI texturing opcode.
 *
 * The address and resource arguments were already packed into emit_data->args
 * by tex_fetch_args; this function selects the hardware opcode and flags
 * (compare/offset/bias/lod/deriv/level_zero) and builds the intrinsic.
 */
static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct ac_image_args args;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;

	/* Buffer textures bypass the image path entirely and are fetched
	 * with a typed buffer load. */
	if (target == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			ac_build_buffer_load_format(&ctx->ac,
						    emit_data->args[0],
						    emit_data->args[2],
						    emit_data->args[1],
						    true);
		return;
	}

	memcpy(&args, emit_data->args, sizeof(args)); /* ugly */

	args.opcode = ac_image_sample;
	args.compare = tgsi_is_shadow_target(target);
	args.offset = inst->Texture.NumOffsets > 0;

	switch (opcode) {
	case TGSI_OPCODE_TXF:
	case TGSI_OPCODE_TXF_LZ:
		/* TXF_LZ and MSAA fetches carry no LOD in the address,
		 * so they use the non-mip load. */
		args.opcode = opcode == TGSI_OPCODE_TXF_LZ ||
			      target == TGSI_TEXTURE_2D_MSAA ||
			      target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
				      ac_image_load : ac_image_load_mip;
		args.compare = false;
		args.offset = false;
		break;
	case TGSI_OPCODE_LODQ:
		args.opcode = ac_image_get_lod;
		args.compare = false;
		args.offset = false;
		break;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TEX2:
	case TGSI_OPCODE_TXP:
		/* Implicit derivatives only exist in fragment shaders;
		 * elsewhere sample level 0 explicitly. */
		if (ctx->type != PIPE_SHADER_FRAGMENT)
			args.level_zero = true;
		break;
	case TGSI_OPCODE_TEX_LZ:
		args.level_zero = true;
		break;
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
		assert(ctx->type == PIPE_SHADER_FRAGMENT);
		args.bias = true;
		break;
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXL2:
		args.lod = true;
		break;
	case TGSI_OPCODE_TXD:
		args.deriv = true;
		break;
	case TGSI_OPCODE_TG4:
		args.opcode = ac_image_gather4;
		args.level_zero = true;
		break;
	default:
		assert(0);
		return;
	}

	/* The hardware needs special lowering for Gather4 with integer formats. */
	if (ctx->screen->b.chip_class <= VI &&
	    opcode == TGSI_OPCODE_TG4) {
		struct tgsi_shader_info *info = &ctx->shader->selector->info;
		/* This will also work with non-constant indexing because of how
		 * glsl_to_tgsi works and we intent to preserve that behavior.
		 */
		const unsigned src_idx = 2;
		unsigned sampler = inst->Src[src_idx].Register.Index;

		assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);

		if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
		    info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT)
			si_lower_gather4_integer(ctx, &args, target);
	}

	emit_data->output[emit_data->chan] =
		ac_build_image_opcode(&ctx->ac, &args);
}
5158
5159 static void si_llvm_emit_txqs(
5160 const struct lp_build_tgsi_action *action,
5161 struct lp_build_tgsi_context *bld_base,
5162 struct lp_build_emit_data *emit_data)
5163 {
5164 struct si_shader_context *ctx = si_shader_context(bld_base);
5165 struct gallivm_state *gallivm = &ctx->gallivm;
5166 LLVMBuilderRef builder = gallivm->builder;
5167 LLVMValueRef res, samples;
5168 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5169
5170 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5171
5172
5173 /* Read the samples from the descriptor directly. */
5174 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5175 samples = LLVMBuildExtractElement(
5176 builder, res,
5177 LLVMConstInt(ctx->i32, 3, 0), "");
5178 samples = LLVMBuildLShr(builder, samples,
5179 LLVMConstInt(ctx->i32, 16, 0), "");
5180 samples = LLVMBuildAnd(builder, samples,
5181 LLVMConstInt(ctx->i32, 0xf, 0), "");
5182 samples = LLVMBuildShl(builder, ctx->i32_1,
5183 samples, "");
5184
5185 emit_data->output[emit_data->chan] = samples;
5186 }
5187
5188 static void si_llvm_emit_ddxy(
5189 const struct lp_build_tgsi_action *action,
5190 struct lp_build_tgsi_context *bld_base,
5191 struct lp_build_emit_data *emit_data)
5192 {
5193 struct si_shader_context *ctx = si_shader_context(bld_base);
5194 struct gallivm_state *gallivm = &ctx->gallivm;
5195 unsigned opcode = emit_data->info->opcode;
5196 LLVMValueRef val;
5197 int idx;
5198 unsigned mask;
5199
5200 if (opcode == TGSI_OPCODE_DDX_FINE)
5201 mask = AC_TID_MASK_LEFT;
5202 else if (opcode == TGSI_OPCODE_DDY_FINE)
5203 mask = AC_TID_MASK_TOP;
5204 else
5205 mask = AC_TID_MASK_TOP_LEFT;
5206
5207 /* for DDX we want to next X pixel, DDY next Y pixel. */
5208 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5209
5210 val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");
5211 val = ac_build_ddxy(&ctx->ac, ctx->screen->has_ds_bpermute,
5212 mask, idx, ctx->lds, val);
5213 emit_data->output[emit_data->chan] = val;
5214 }
5215
5216 /*
5217 * this takes an I,J coordinate pair,
5218 * and works out the X and Y derivatives.
5219 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5220 */
5221 static LLVMValueRef si_llvm_emit_ddxy_interp(
5222 struct lp_build_tgsi_context *bld_base,
5223 LLVMValueRef interp_ij)
5224 {
5225 struct si_shader_context *ctx = si_shader_context(bld_base);
5226 struct gallivm_state *gallivm = &ctx->gallivm;
5227 LLVMValueRef result[4], a;
5228 unsigned i;
5229
5230 for (i = 0; i < 2; i++) {
5231 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5232 LLVMConstInt(ctx->i32, i, 0), "");
5233 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5234 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5235 }
5236
5237 return lp_build_gather_values(gallivm, result, 4);
5238 }
5239
5240 static void interp_fetch_args(
5241 struct lp_build_tgsi_context *bld_base,
5242 struct lp_build_emit_data *emit_data)
5243 {
5244 struct si_shader_context *ctx = si_shader_context(bld_base);
5245 struct gallivm_state *gallivm = &ctx->gallivm;
5246 const struct tgsi_full_instruction *inst = emit_data->inst;
5247
5248 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5249 /* offset is in second src, first two channels */
5250 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5251 emit_data->inst, 1,
5252 TGSI_CHAN_X);
5253 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5254 emit_data->inst, 1,
5255 TGSI_CHAN_Y);
5256 emit_data->arg_count = 2;
5257 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5258 LLVMValueRef sample_position;
5259 LLVMValueRef sample_id;
5260 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
5261
5262 /* fetch sample ID, then fetch its sample position,
5263 * and place into first two channels.
5264 */
5265 sample_id = lp_build_emit_fetch(bld_base,
5266 emit_data->inst, 1, TGSI_CHAN_X);
5267 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5268 ctx->i32, "");
5269 sample_position = load_sample_position(ctx, sample_id);
5270
5271 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5272 sample_position,
5273 ctx->i32_0, "");
5274
5275 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5276 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5277 sample_position,
5278 ctx->i32_1, "");
5279 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5280 emit_data->arg_count = 2;
5281 }
5282 }
5283
/* Emit INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET.
 *
 * Picks the barycentric (I,J) pair for the requested location, optionally
 * adjusts it by the offset prepared in interp_fetch_args (using screen-space
 * derivatives of I and J), and interpolates the input attribute per channel
 * with v_interp instructions.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				   struct lp_build_tgsi_context *bld_base,
				   struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Offset/sample interpolation starts from the center (I,J);
	 * plain INTERP_CENTROID uses the centroid (I,J). */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* Index 0 means "flat/constant": no (I,J) pair, interpolate with P0. */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = LLVMConstInt(ctx->i32, input_index, 0);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			ij_out[i] = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
		}
		interp_param = lp_build_gather_values(gallivm, ij_out, 2);
	}

	/* Interpolate each destination channel, honoring the source swizzle. */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = LLVMConstInt(ctx->i32, schan, 0);

		if (interp_param) {
			interp_param = LLVMBuildBitCast(gallivm->builder,
				interp_param, LLVMVectorType(ctx->f32, 2), "");
			LLVMValueRef i = LLVMBuildExtractElement(
				gallivm->builder, interp_param, ctx->i32_0, "");
			LLVMValueRef j = LLVMBuildExtractElement(
				gallivm->builder, interp_param, ctx->i32_1, "");
			emit_data->output[chan] = ac_build_fs_interp(&ctx->ac,
				llvm_chan, attr_number, params,
				i, j);
		} else {
			/* Flat input: no interpolation, read vertex P0. */
			emit_data->output[chan] = ac_build_fs_interp_mov(&ctx->ac,
				LLVMConstInt(ctx->i32, 2, 0), /* P0 */
				llvm_chan, attr_number, params);
		}
	}
}
5382
5383 static LLVMValueRef si_emit_ballot(struct si_shader_context *ctx,
5384 LLVMValueRef value)
5385 {
5386 struct gallivm_state *gallivm = &ctx->gallivm;
5387 LLVMValueRef args[3] = {
5388 value,
5389 ctx->i32_0,
5390 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
5391 };
5392
5393 /* We currently have no other way to prevent LLVM from lifting the icmp
5394 * calls to a dominating basic block.
5395 */
5396 emit_optimization_barrier(ctx, &args[0]);
5397
5398 if (LLVMTypeOf(args[0]) != ctx->i32)
5399 args[0] = LLVMBuildBitCast(gallivm->builder, args[0], ctx->i32, "");
5400
5401 return lp_build_intrinsic(gallivm->builder,
5402 "llvm.amdgcn.icmp.i32",
5403 ctx->i64, args, 3,
5404 LP_FUNC_ATTR_NOUNWIND |
5405 LP_FUNC_ATTR_READNONE |
5406 LP_FUNC_ATTR_CONVERGENT);
5407 }
5408
5409 static void vote_all_emit(
5410 const struct lp_build_tgsi_action *action,
5411 struct lp_build_tgsi_context *bld_base,
5412 struct lp_build_emit_data *emit_data)
5413 {
5414 struct si_shader_context *ctx = si_shader_context(bld_base);
5415 struct gallivm_state *gallivm = &ctx->gallivm;
5416 LLVMValueRef active_set, vote_set;
5417 LLVMValueRef tmp;
5418
5419 active_set = si_emit_ballot(ctx, ctx->i32_1);
5420 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5421
5422 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5423 emit_data->output[emit_data->chan] =
5424 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5425 }
5426
5427 static void vote_any_emit(
5428 const struct lp_build_tgsi_action *action,
5429 struct lp_build_tgsi_context *bld_base,
5430 struct lp_build_emit_data *emit_data)
5431 {
5432 struct si_shader_context *ctx = si_shader_context(bld_base);
5433 struct gallivm_state *gallivm = &ctx->gallivm;
5434 LLVMValueRef vote_set;
5435 LLVMValueRef tmp;
5436
5437 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5438
5439 tmp = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
5440 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5441 emit_data->output[emit_data->chan] =
5442 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5443 }
5444
5445 static void vote_eq_emit(
5446 const struct lp_build_tgsi_action *action,
5447 struct lp_build_tgsi_context *bld_base,
5448 struct lp_build_emit_data *emit_data)
5449 {
5450 struct si_shader_context *ctx = si_shader_context(bld_base);
5451 struct gallivm_state *gallivm = &ctx->gallivm;
5452 LLVMValueRef active_set, vote_set;
5453 LLVMValueRef all, none, tmp;
5454
5455 active_set = si_emit_ballot(ctx, ctx->i32_1);
5456 vote_set = si_emit_ballot(ctx, emit_data->args[0]);
5457
5458 all = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, vote_set, active_set, "");
5459 none = LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
5460 vote_set, LLVMConstInt(ctx->i64, 0, 0), "");
5461 tmp = LLVMBuildOr(gallivm->builder, all, none, "");
5462 emit_data->output[emit_data->chan] =
5463 LLVMBuildSExt(gallivm->builder, tmp, ctx->i32, "");
5464 }
5465
5466 static void ballot_emit(
5467 const struct lp_build_tgsi_action *action,
5468 struct lp_build_tgsi_context *bld_base,
5469 struct lp_build_emit_data *emit_data)
5470 {
5471 struct si_shader_context *ctx = si_shader_context(bld_base);
5472 LLVMBuilderRef builder = ctx->gallivm.builder;
5473 LLVMValueRef tmp;
5474
5475 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
5476 tmp = si_emit_ballot(ctx, tmp);
5477 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
5478
5479 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
5480 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
5481 }
5482
5483 static void read_invoc_fetch_args(
5484 struct lp_build_tgsi_context *bld_base,
5485 struct lp_build_emit_data *emit_data)
5486 {
5487 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
5488 0, emit_data->src_chan);
5489
5490 /* Always read the source invocation (= lane) from the X channel. */
5491 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
5492 1, TGSI_CHAN_X);
5493 emit_data->arg_count = 2;
5494 }
5495
5496 static void read_lane_emit(
5497 const struct lp_build_tgsi_action *action,
5498 struct lp_build_tgsi_context *bld_base,
5499 struct lp_build_emit_data *emit_data)
5500 {
5501 struct si_shader_context *ctx = si_shader_context(bld_base);
5502 LLVMBuilderRef builder = ctx->gallivm.builder;
5503
5504 /* We currently have no other way to prevent LLVM from lifting the icmp
5505 * calls to a dominating basic block.
5506 */
5507 emit_optimization_barrier(ctx, &emit_data->args[0]);
5508
5509 for (unsigned i = 0; i < emit_data->arg_count; ++i) {
5510 emit_data->args[i] = LLVMBuildBitCast(builder, emit_data->args[i],
5511 ctx->i32, "");
5512 }
5513
5514 emit_data->output[emit_data->chan] =
5515 ac_build_intrinsic(&ctx->ac, action->intr_name,
5516 ctx->i32, emit_data->args, emit_data->arg_count,
5517 AC_FUNC_ATTR_READNONE |
5518 AC_FUNC_ATTR_CONVERGENT);
5519 }
5520
5521 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5522 struct lp_build_emit_data *emit_data)
5523 {
5524 struct si_shader_context *ctx = si_shader_context(bld_base);
5525 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5526 LLVMValueRef imm;
5527 unsigned stream;
5528
5529 assert(src0.File == TGSI_FILE_IMMEDIATE);
5530
5531 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
5532 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
5533 return stream;
5534 }
5535
/* Emit one vertex from the geometry shader (TGSI EMIT).
 *
 * Stores the current output values to the GSVS ring buffer for the
 * instruction's stream, increments the per-stream vertex counter, and sends
 * the EMIT message to the GS hardware.
 */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_if_state if_state;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_gs2vs_offset);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	unsigned chan, offset;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 *
	 * If the shader has no writes to memory, kill it instead. This skips
	 * further memory loads and may allow LLVM to skip to the end
	 * altogether.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->i32,
					      shader->selector->gs_max_out_vertices, 0), "");

	bool use_kill = !info->writes_memory;
	if (use_kill) {
		/* kill < 0 discards the thread; can_emit selects +1 / -1. */
		kill = lp_build_select(&bld_base->base, can_emit,
				       LLVMConstReal(ctx->f32, 1.0f),
				       LLVMConstReal(ctx->f32, -1.0f));

		ac_build_kill(&ctx->ac, kill);
	} else {
		lp_build_if(&if_state, gallivm, can_emit);
	}

	/* Store each enabled output channel belonging to this stream.
	 * "offset" counts the emitted dwords; each output slot in the ring
	 * is gs_max_out_vertices dwords apart. */
	offset = 0;
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->outputs[i];

		for (chan = 0; chan < 4; chan++) {
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			LLVMValueRef voffset =
				LLVMConstInt(ctx->i32, offset *
					     shader->selector->gs_max_out_vertices, 0);
			offset++;

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, soffset, 0,
						    1, 1, true, true);
		}
	}

	/* Advance the per-stream vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      ctx->i32_1);

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 si_get_gs_wave_id(ctx));
	if (!use_kill)
		lp_build_endif(&if_state);
}
5625
5626 /* Cut one primitive from the geometry shader */
5627 static void si_llvm_emit_primitive(
5628 const struct lp_build_tgsi_action *action,
5629 struct lp_build_tgsi_context *bld_base,
5630 struct lp_build_emit_data *emit_data)
5631 {
5632 struct si_shader_context *ctx = si_shader_context(bld_base);
5633 unsigned stream;
5634
5635 /* Signal primitive cut */
5636 stream = si_llvm_get_stream(bld_base, emit_data);
5637 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
5638 si_get_gs_wave_id(ctx));
5639 }
5640
/* TGSI BARRIER: make all invocations of the threadgroup wait here. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = &ctx->gallivm;

	/* SI only (thanks to a hw bug workaround):
	 * The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 * NOTE(review): the masks are combined with '&' — presumably the
	 * waitcnt encoding uses cleared bits to mean "wait", so this waits
	 * on both LGKM and VM counters; confirm against emit_waitcnt.
	 */
	if (HAVE_LLVM >= 0x0309 &&
	    ctx->screen->b.chip_class == SI &&
	    ctx->type == PIPE_SHADER_TESS_CTRL) {
		emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
		return;
	}

	/* Older LLVM versions expose the barrier under a different name. */
	lp_build_intrinsic(gallivm->builder,
			   HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
					       : "llvm.AMDGPU.barrier.local",
			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
}
5664
/* Dispatch entry used for all TGSI texture-sampling opcodes. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5669
/* Dispatch entry used for the TGSI INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5674
/* Create the LLVM function for a shader (or shader part) and apply the
 * calling-convention attributes SI code generation relies on.
 *
 * Parameters with index <= last_sgpr are passed in SGPRs; the rest are
 * VGPRs. Pointer SGPR arguments get ByVal/NoAlias/dereferenceable so the
 * optimizer can freely move the loads, which reduces SGPR spilling.
 */
static void si_create_function(struct si_shader_context *ctx,
			       const char *name,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_sgpr)
{
	int i;

	si_llvm_create_func(ctx, name, returns, num_returns,
			    params, num_params);
	si_llvm_shader_type(ctx->main_fn, ctx->type);
	ctx->return_value = LLVMGetUndef(ctx->return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		/* Note: LLVM attribute indices are 1-based (0 = return value),
		 * hence "i + 1" below.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
	}

	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
					   "no-signed-zeros-fp-math",
					   "true");

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5726
5727 static void declare_streamout_params(struct si_shader_context *ctx,
5728 struct pipe_stream_output_info *so,
5729 LLVMTypeRef *params, LLVMTypeRef i32,
5730 unsigned *num_params)
5731 {
5732 int i;
5733
5734 /* Streamout SGPRs. */
5735 if (so->num_outputs) {
5736 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5737 params[ctx->param_streamout_config = (*num_params)++] = i32;
5738 else
5739 ctx->param_streamout_config = *num_params - 1;
5740
5741 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5742 }
5743 /* A streamout buffer offset is loaded if the stride is non-zero. */
5744 for (i = 0; i < 4; i++) {
5745 if (!so->stride[i])
5746 continue;
5747
5748 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5749 }
5750 }
5751
5752 static unsigned llvm_get_type_size(LLVMTypeRef type)
5753 {
5754 LLVMTypeKind kind = LLVMGetTypeKind(type);
5755
5756 switch (kind) {
5757 case LLVMIntegerTypeKind:
5758 return LLVMGetIntTypeWidth(type) / 8;
5759 case LLVMFloatTypeKind:
5760 return 4;
5761 case LLVMPointerTypeKind:
5762 return 8;
5763 case LLVMVectorTypeKind:
5764 return LLVMGetVectorSize(type) *
5765 llvm_get_type_size(LLVMGetElementType(type));
5766 case LLVMArrayTypeKind:
5767 return LLVMGetArrayLength(type) *
5768 llvm_get_type_size(LLVMGetElementType(type));
5769 default:
5770 assert(0);
5771 return 0;
5772 }
5773 }
5774
5775 static void declare_lds_as_pointer(struct si_shader_context *ctx)
5776 {
5777 struct gallivm_state *gallivm = &ctx->gallivm;
5778
5779 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5780 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
5781 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5782 "lds");
5783 }
5784
5785 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
5786 {
5787 const unsigned *properties = shader->selector->info.properties;
5788 unsigned max_work_group_size =
5789 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5790 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5791 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5792
5793 if (!max_work_group_size) {
5794 /* This is a variable group size compute shader,
5795 * compile it for the maximum possible group size.
5796 */
5797 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
5798 }
5799 return max_work_group_size;
5800 }
5801
5802 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
5803 LLVMTypeRef *params,
5804 unsigned *num_params,
5805 bool assign_params)
5806 {
5807 params[(*num_params)++] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5808 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5809 params[(*num_params)++] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5810 params[(*num_params)++] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5811
5812 if (assign_params) {
5813 ctx->param_const_buffers = *num_params - 4;
5814 ctx->param_samplers = *num_params - 3;
5815 ctx->param_images = *num_params - 2;
5816 ctx->param_shader_buffers = *num_params - 1;
5817 }
5818 }
5819
5820 static void declare_default_desc_pointers(struct si_shader_context *ctx,
5821 LLVMTypeRef *params,
5822 unsigned *num_params)
5823 {
5824 params[ctx->param_rw_buffers = (*num_params)++] =
5825 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5826 declare_per_stage_desc_pointers(ctx, params, num_params, true);
5827 }
5828
/* Declare the SGPR inputs specific to the vertex-shader stage: vertex
 * buffer descriptors followed by the draw parameters. The declaration
 * order here is the user-SGPR ABI — do not reorder.
 */
static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
					    LLVMTypeRef *params,
					    unsigned *num_params)
{
	params[ctx->param_vertex_buffers = (*num_params)++] =
		const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
	params[ctx->param_base_vertex = (*num_params)++] = ctx->i32;
	params[ctx->param_start_instance = (*num_params)++] = ctx->i32;
	params[ctx->param_draw_id = (*num_params)++] = ctx->i32;
	params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32;
}
5840
/* Declare the VGPR inputs of a vertex shader. The VGPR order differs
 * between LS mode (rel_auto_id before instance_id) and the other modes
 * (instance_id before the primitive ID). The trailing vertex load
 * indices are synthesized by the VS prolog rather than loaded by the
 * hardware, so they are tallied separately in num_prolog_vgprs.
 */
static void declare_vs_input_vgprs(struct si_shader_context *ctx,
				   LLVMTypeRef *params, unsigned *num_params,
				   unsigned *num_prolog_vgprs)
{
	struct si_shader *shader = ctx->shader;

	params[ctx->param_vertex_id = (*num_params)++] = ctx->i32;
	if (shader->key.as_ls) {
		params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32;
		params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
	} else {
		params[ctx->param_instance_id = (*num_params)++] = ctx->i32;
		params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32;
	}
	params[(*num_params)++] = ctx->i32; /* unused */

	if (!shader->is_gs_copy_shader) {
		/* Vertex load indices. */
		ctx->param_vertex_index0 = (*num_params);
		for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
			params[(*num_params)++] = ctx->i32;
		*num_prolog_vgprs += shader->selector->info.num_inputs;
	}
}
5865
/* Declare the fixed VGPR inputs of a tessellation evaluation shader:
 * the tess coordinates (u, v), the relative patch ID, and the patch ID.
 */
static void declare_tes_input_vgprs(struct si_shader_context *ctx,
				    LLVMTypeRef *params, unsigned *num_params)
{
	params[ctx->param_tes_u = (*num_params)++] = ctx->f32;
	params[ctx->param_tes_v = (*num_params)++] = ctx->f32;
	params[ctx->param_tes_rel_patch_id = (*num_params)++] = ctx->i32;
	params[ctx->param_tes_patch_id = (*num_params)++] = ctx->i32;
}
5874
/* Pseudo shader types used by create_function() for the GFX9 merged
 * hardware stages; values start right after the real PIPE_SHADER types.
 */
enum {
	/* Convenient merged shader definitions. */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
5880
/* Build the LLVM function signature ("main") for the current shader
 * stage: all SGPR inputs in ABI order, then all VGPR inputs, plus the
 * return types used by merged shaders and shader parts.
 *
 * The per-stage parameter order below is the hardware/driver ABI and
 * must not be reordered.
 */
static void create_function(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef params[100]; /* just make it large enough */
	LLVMTypeRef returns[16+32*4];
	unsigned i, last_sgpr, num_params = 0, num_return_sgprs;
	unsigned num_returns = 0;
	unsigned num_prolog_vgprs = 0;
	unsigned type = ctx->type;

	/* Set MERGED shaders. On GFX9, LS+HS and ES+GS run as single
	 * hardware stages, so they share a merged signature layout.
	 */
	if (ctx->screen->b.chip_class >= GFX9) {
		if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
			type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
		else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
			type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
	}

	LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);

	switch (type) {
	case PIPE_SHADER_VERTEX:
		declare_default_desc_pointers(ctx, params, &num_params);
		declare_vs_specific_input_sgprs(ctx, params, &num_params);

		if (shader->key.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else if (shader->key.as_ls) {
			/* no extra parameters */
		} else {
			if (shader->is_gs_copy_shader)
				num_params = ctx->param_rw_buffers + 1;

			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}

		last_sgpr = num_params-1;

		/* VGPRs */
		declare_vs_input_vgprs(ctx, params, &num_params,
				       &num_prolog_vgprs);
		break;

	case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
		declare_default_desc_pointers(ctx, params, &num_params);
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
		params[ctx->param_vs_state_bits = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;

		/* param_tcs_offchip_offset and param_tcs_factor_offset are
		 * placed after the user SGPRs.
		 */
		for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
			returns[num_returns++] = ctx->i32; /* SGPRs */
		for (i = 0; i < 3; i++)
			returns[num_returns++] = ctx->f32; /* VGPRs */
		break;

	case SI_SHADER_MERGED_VERTEX_TESSCTRL:
		/* Merged stages have 8 system SGPRs at the beginning. */
		params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
			const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* unused */
		params[num_params++] = ctx->i32; /* unused */

		params[num_params++] = ctx->i32; /* unused */
		params[num_params++] = ctx->i32; /* unused */
		/* Declare the VS descriptors; assigned only when compiling
		 * the VS half of the merged shader.
		 */
		declare_per_stage_desc_pointers(ctx, params, &num_params,
						ctx->type == PIPE_SHADER_VERTEX);
		declare_vs_specific_input_sgprs(ctx, params, &num_params);

		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_offsets = num_params++] = ctx->i32;
		params[ctx->param_tcs_out_lds_layout = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* unused */

		declare_per_stage_desc_pointers(ctx, params, &num_params,
						ctx->type == PIPE_SHADER_TESS_CTRL);
		last_sgpr = num_params - 1;

		/* VGPRs (first TCS, then VS) */
		params[ctx->param_tcs_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tcs_rel_ids = num_params++] = ctx->i32;

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, params, &num_params,
					       &num_prolog_vgprs);

			/* LS return values are inputs to the TCS main shader part. */
			for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 2; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		} else {
			/* TCS return values are inputs to the TCS epilog.
			 *
			 * param_tcs_offchip_offset, param_tcs_factor_offset,
			 * param_tcs_offchip_layout, and param_rw_buffers
			 * should be passed to the epilog.
			 */
			for (i = 0; i <= 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 3; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
		/* Merged stages have 8 system SGPRs at the beginning. */
		params[ctx->param_rw_buffers = num_params++] = /* SPI_SHADER_USER_DATA_ADDR_LO_GS */
			const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
		params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_wave_info = num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_merged_scratch_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
		params[num_params++] = ctx->i32; /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */

		params[num_params++] = ctx->i32; /* unused */
		params[num_params++] = ctx->i32; /* unused */
		declare_per_stage_desc_pointers(ctx, params, &num_params,
						(ctx->type == PIPE_SHADER_VERTEX ||
						 ctx->type == PIPE_SHADER_TESS_EVAL));
		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_specific_input_sgprs(ctx, params, &num_params);
		} else {
			/* TESS_EVAL (and also GEOMETRY):
			 * Declare as many input SGPRs as the VS has. */
			params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
			params[num_params++] = ctx->i32; /* unused */
			params[num_params++] = ctx->i32; /* unused */
			params[num_params++] = ctx->i32; /* unused */
			params[num_params++] = ctx->i32; /* unused */
			params[ctx->param_vs_state_bits = num_params++] = ctx->i32; /* unused */
		}

		declare_per_stage_desc_pointers(ctx, params, &num_params,
						ctx->type == PIPE_SHADER_GEOMETRY);
		last_sgpr = num_params - 1;

		/* VGPRs (first GS, then VS/TES) */
		params[ctx->param_gs_vtx01_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx23_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx45_offset = num_params++] = ctx->i32;

		if (ctx->type == PIPE_SHADER_VERTEX) {
			declare_vs_input_vgprs(ctx, params, &num_params,
					       &num_prolog_vgprs);
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
			declare_tes_input_vgprs(ctx, params, &num_params);
		}

		if (ctx->type == PIPE_SHADER_VERTEX ||
		    ctx->type == PIPE_SHADER_TESS_EVAL) {
			/* ES return values are inputs to GS. */
			for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */
			for (i = 0; i < 5; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		declare_default_desc_pointers(ctx, params, &num_params);
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;

		if (shader->key.as_es) {
			params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
			params[num_params++] = ctx->i32;
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else {
			params[num_params++] = ctx->i32;
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
			params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		}
		last_sgpr = num_params - 1;

		/* VGPRs */
		declare_tes_input_vgprs(ctx, params, &num_params);
		break;

	case PIPE_SHADER_GEOMETRY:
		declare_default_desc_pointers(ctx, params, &num_params);
		params[ctx->param_gs2vs_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_wave_id = num_params++] = ctx->i32;
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_gs_vtx0_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx1_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx2_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx3_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx4_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_vtx5_offset = num_params++] = ctx->i32;
		params[ctx->param_gs_instance_id = num_params++] = ctx->i32;
		break;

	case PIPE_SHADER_FRAGMENT:
		declare_default_desc_pointers(ctx, params, &num_params);
		/* PS uses fixed SI_PARAM_* slots instead of dynamic counting. */
		params[SI_PARAM_ALPHA_REF] = ctx->f32;
		params[SI_PARAM_PRIM_MASK] = ctx->i32;
		last_sgpr = SI_PARAM_PRIM_MASK;
		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
		params[SI_PARAM_FRONT_FACE] = ctx->i32;
		shader->info.face_vgpr_index = 20;
		params[SI_PARAM_ANCILLARY] = ctx->i32;
		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
		num_params = SI_PARAM_POS_FIXED_PT+1;

		/* Color inputs from the prolog. */
		if (shader->selector->info.colors_read) {
			unsigned num_color_elements =
				util_bitcount(shader->selector->info.colors_read);

			assert(num_params + num_color_elements <= ARRAY_SIZE(params));
			for (i = 0; i < num_color_elements; i++)
				params[num_params++] = ctx->f32;

			num_prolog_vgprs += num_color_elements;
		}

		/* Outputs for the epilog. */
		num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
		num_returns =
			num_return_sgprs +
			util_bitcount(shader->selector->info.colors_written) * 4 +
			shader->selector->info.writes_z +
			shader->selector->info.writes_stencil +
			shader->selector->info.writes_samplemask +
			1 /* SampleMaskIn */;

		num_returns = MAX2(num_returns,
				   num_return_sgprs +
				   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

		for (i = 0; i < num_return_sgprs; i++)
			returns[i] = ctx->i32;
		for (; i < num_returns; i++)
			returns[i] = ctx->f32;
		break;

	case PIPE_SHADER_COMPUTE:
		declare_default_desc_pointers(ctx, params, &num_params);
		params[SI_PARAM_GRID_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_ID] = v3i32;
		last_sgpr = SI_PARAM_BLOCK_ID;

		params[SI_PARAM_THREAD_ID] = v3i32;
		num_params = SI_PARAM_THREAD_ID + 1;
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	assert(num_params <= ARRAY_SIZE(params));

	si_create_function(ctx, "main", returns, num_returns, params,
			   num_params, last_sgpr);

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT &&
	    ctx->separate_prolog) {
		si_llvm_add_attribute(ctx->main_fn,
				      "InitialPSInputAddr",
				      S_0286D0_PERSP_SAMPLE_ENA(1) |
				      S_0286D0_PERSP_CENTER_ENA(1) |
				      S_0286D0_PERSP_CENTROID_ENA(1) |
				      S_0286D0_LINEAR_SAMPLE_ENA(1) |
				      S_0286D0_LINEAR_CENTER_ENA(1) |
				      S_0286D0_LINEAR_CENTROID_ENA(1) |
				      S_0286D0_FRONT_FACE_ENA(1) |
				      S_0286D0_POS_FIXED_PT_ENA(1));
	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
		si_llvm_add_attribute(ctx->main_fn,
				      "amdgpu-max-work-group-size",
				      si_get_max_workgroup_size(shader));
	}

	/* Count input registers (in units of dwords) for the shader info. */
	shader->info.num_input_sgprs = 0;
	shader->info.num_input_vgprs = 0;

	for (i = 0; i <= last_sgpr; ++i)
		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;

	for (; i < num_params; ++i)
		shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;

	assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
	shader->info.num_input_vgprs -= num_prolog_vgprs;

	/* Without ds_bpermute, derivatives go through a small LDS scratch
	 * area instead.
	 */
	if (!ctx->screen->has_ds_bpermute &&
	    bld_base->info &&
	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
		ctx->lds =
			LLVMAddGlobalInAddressSpace(gallivm->module,
						    LLVMArrayType(ctx->i32, 64),
						    "ddxy_lds",
						    LOCAL_ADDR_SPACE);

	if (shader->key.as_ls ||
	    ctx->type == PIPE_SHADER_TESS_CTRL ||
	    /* GFX9 has the ESGS ring buffer in LDS. */
	    (ctx->screen->b.chip_class >= GFX9 &&
	     (shader->key.as_es ||
	      ctx->type == PIPE_SHADER_GEOMETRY)))
		declare_lds_as_pointer(ctx);
}
6227
/**
 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
 * for later use.
 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
					    ctx->param_rw_buffers);

	/* <= VI only: on GFX9 the ESGS ring lives in LDS instead. */
	if (ctx->screen->b.chip_class <= VI &&
	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							  : SI_ES_RING_ESGS;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);

		ctx->esgs_ring =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	}

	if (ctx->shader->is_gs_copy_shader) {
		/* The GS copy shader only reads stream 0. */
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);

		ctx->gsvs_ring[0] =
			ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);
	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
		const struct si_shader_selector *sel = ctx->shader->selector;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
		LLVMValueRef base_ring;

		base_ring = ac_build_indexed_load_const(&ctx->ac, buf_ptr, offset);

		/* The conceptual layout of the GSVS ring is
		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
		 * but the real memory layout is swizzled across
		 * threads:
		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
		 *   t16v0c0 ..
		 * Override the buffer descriptor accordingly.
		 */
		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
		uint64_t stream_offset = 0;

		/* Build one patched descriptor per active vertex stream. */
		for (unsigned stream = 0; stream < 4; ++stream) {
			unsigned num_components;
			unsigned stride;
			unsigned num_records;
			LLVMValueRef ring, tmp;

			num_components = sel->info.num_stream_output_components[stream];
			if (!num_components)
				continue;

			stride = 4 * num_components * sel->gs_max_out_vertices;

			/* Limit on the stride field for <= CIK. */
			assert(stride < (1 << 14));

			num_records = 64;

			/* Add this stream's byte offset to the 64-bit base
			 * address in dwords 0-1 of the descriptor.
			 */
			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
			tmp = LLVMBuildAdd(builder, tmp,
					   LLVMConstInt(ctx->i64,
							stream_offset, 0), "");
			stream_offset += stride * 64;

			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
			/* Set STRIDE and SWIZZLE_ENABLE in dword 1. */
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
			tmp = LLVMBuildOr(builder, tmp,
					  LLVMConstInt(ctx->i32,
						       S_008F04_STRIDE(stride) |
						       S_008F04_SWIZZLE_ENABLE(1), 0), "");
			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
			ring = LLVMBuildInsertElement(builder, ring,
						      LLVMConstInt(ctx->i32, num_records, 0),
						      LLVMConstInt(ctx->i32, 2, 0), "");
			ring = LLVMBuildInsertElement(builder, ring,
						      LLVMConstInt(ctx->i32,
								   S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
								   S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
								   S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
								   S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
								   S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
								   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
								   S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
								   S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
								   S_008F0C_ADD_TID_ENABLE(1),
								   0),
						      LLVMConstInt(ctx->i32, 3, 0), "");
			ring = LLVMBuildBitCast(builder, ring, ctx->v16i8, "");

			ctx->gsvs_ring[stream] = ring;
		}
	}
}
6328
/* Emit code that kills the fragment if the corresponding bit of the
 * 32x32 polygon stipple pattern (stored in a constant buffer) is zero.
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
	desc = ac_build_indexed_load_const(&ctx->ac, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(ctx, desc, offset);
	row = LLVMBuildBitCast(builder, row, ctx->i32, "");
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	ac_build_kill(&ctx->ac, bit);
}
6361
/* Parse the register/value pairs in the config section that the LLVM
 * backend emitted for a shader symbol, and fill in *conf (SGPR/VGPR
 * counts, LDS size, scratch size, SPI PS input masks, spill stats).
 */
void si_shader_binary_read_config(struct ac_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct ac_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* The config section is a sequence of (32-bit reg, 32-bit value)
	 * little-endian pairs.
	 */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* SGPRS/VGPRS fields are in units of 8/4 registers. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode =  G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
			{
				static bool printed;

				/* Warn only once per process to avoid spam. */
				if (!printed) {
					fprintf(stderr, "Warning: LLVM emitted unknown "
						"config register: 0x%x\n", reg);
					printed = true;
				}
			}
			break;
		}
	}

	/* Fall back to INPUT_ENA if the backend did not emit INPUT_ADDR. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
6444
/* Patch the scratch buffer resource descriptor (two dwords) into the
 * shader machine code at the locations recorded by the relocations.
 */
void si_shader_apply_scratch_relocs(struct si_context *sctx,
			struct si_shader *shader,
			struct si_shader_config *config,
			uint64_t scratch_va)
{
	unsigned i;
	uint32_t scratch_rsrc_dword0 = scratch_va;
	uint32_t scratch_rsrc_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);

	/* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
	 * correctly.
	 */
	if (HAVE_LLVM >= 0x0309)
		scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
	else
		scratch_rsrc_dword1 |=
			S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);

	/* Write each descriptor dword wherever its symbol is referenced. */
	for (i = 0 ; i < shader->binary.reloc_count; i++) {
		const struct ac_shader_reloc *reloc =
					&shader->binary.relocs[i];
		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword0, 4);
		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword1, 4);
		}
	}
}
6476
6477 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6478 {
6479 unsigned size = shader->binary.code_size;
6480
6481 if (shader->prolog)
6482 size += shader->prolog->binary.code_size;
6483 if (shader->previous_stage)
6484 size += shader->previous_stage->binary.code_size;
6485 if (shader->prolog2)
6486 size += shader->prolog2->binary.code_size;
6487 if (shader->epilog)
6488 size += shader->epilog->binary.code_size;
6489 return size;
6490 }
6491
/* Upload all parts of a shader (prologs, previous merged stage, main part,
 * epilog) back to back into one buffer object, followed by the main part's
 * rodata when no epilog is appended after it.
 *
 * \return 0 on success, -ENOMEM if the buffer allocation fails.
 */
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
{
	const struct ac_shader_binary *prolog =
		shader->prolog ? &shader->prolog->binary : NULL;
	const struct ac_shader_binary *previous_stage =
		shader->previous_stage ? &shader->previous_stage->binary : NULL;
	const struct ac_shader_binary *prolog2 =
		shader->prolog2 ? &shader->prolog2->binary : NULL;
	const struct ac_shader_binary *epilog =
		shader->epilog ? &shader->epilog->binary : NULL;
	const struct ac_shader_binary *mainb = &shader->binary;
	/* rodata goes right after the main part, so it can only be kept
	 * when no epilog is concatenated after the main part. */
	unsigned bo_size = si_get_shader_binary_size(shader) +
			   (!epilog ? mainb->rodata_size : 0);
	unsigned char *ptr;

	/* Parts are concatenated back to back, so only the last part of
	 * the combined binary may carry rodata. */
	assert(!prolog || !prolog->rodata_size);
	assert(!previous_stage || !previous_stage->rodata_size);
	assert(!prolog2 || !prolog2->rodata_size);
	assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
	       !mainb->rodata_size);
	assert(!epilog || !epilog->rodata_size);

	/* GFX9 can fetch at most 128 bytes past the end of the shader.
	 * Prevent VM faults.
	 */
	if (sscreen->b.chip_class >= GFX9)
		bo_size += 128;

	r600_resource_reference(&shader->bo, NULL);
	shader->bo = (struct r600_resource*)
		     pipe_buffer_create(&sscreen->b.b, 0,
					PIPE_USAGE_IMMUTABLE,
					align(bo_size, SI_CPDMA_ALIGNMENT));
	if (!shader->bo)
		return -ENOMEM;

	/* Upload. */
	ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
					PIPE_TRANSFER_READ_WRITE |
					PIPE_TRANSFER_UNSYNCHRONIZED);

	/* Copy in part order; util_memcpy_cpu_to_le32 handles byte
	 * swapping on big-endian hosts. */
	if (prolog) {
		util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
		ptr += prolog->code_size;
	}
	if (previous_stage) {
		util_memcpy_cpu_to_le32(ptr, previous_stage->code,
					previous_stage->code_size);
		ptr += previous_stage->code_size;
	}
	if (prolog2) {
		util_memcpy_cpu_to_le32(ptr, prolog2->code, prolog2->code_size);
		ptr += prolog2->code_size;
	}

	util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
	ptr += mainb->code_size;

	if (epilog)
		util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
	else if (mainb->rodata_size > 0)
		util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);

	sscreen->b.ws->buffer_unmap(shader->bo->buf);
	return 0;
}
6558
6559 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
6560 struct pipe_debug_callback *debug,
6561 const char *name, FILE *file)
6562 {
6563 char *line, *p;
6564 unsigned i, count;
6565
6566 if (binary->disasm_string) {
6567 fprintf(file, "Shader %s disassembly:\n", name);
6568 fprintf(file, "%s", binary->disasm_string);
6569
6570 if (debug && debug->debug_message) {
6571 /* Very long debug messages are cut off, so send the
6572 * disassembly one line at a time. This causes more
6573 * overhead, but on the plus side it simplifies
6574 * parsing of resulting logs.
6575 */
6576 pipe_debug_message(debug, SHADER_INFO,
6577 "Shader Disassembly Begin");
6578
6579 line = binary->disasm_string;
6580 while (*line) {
6581 p = util_strchrnul(line, '\n');
6582 count = p - line;
6583
6584 if (count) {
6585 pipe_debug_message(debug, SHADER_INFO,
6586 "%.*s", count, line);
6587 }
6588
6589 if (!*p)
6590 break;
6591 line = p + 1;
6592 }
6593
6594 pipe_debug_message(debug, SHADER_INFO,
6595 "Shader Disassembly End");
6596 }
6597 } else {
6598 fprintf(file, "Shader %s binary:\n", name);
6599 for (i = 0; i < binary->code_size; i += 4) {
6600 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6601 binary->code[i + 3], binary->code[i + 2],
6602 binary->code[i + 1], binary->code[i]);
6603 }
6604 }
6605 }
6606
/* Print shader statistics (register usage, code size, LDS/scratch usage and
 * an estimate of the maximum number of waves per SIMD) to \p file, and send
 * a one-line summary to the debug callback.
 *
 * \param check_debug_option  if true, print to \p file only when shader
 *                            dumping is enabled for this stage
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 struct si_shader *shader,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file,
				 bool check_debug_option)
{
	struct si_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
	unsigned code_size = si_get_shader_binary_size(shader);
	/* LDS allocation granularity in bytes differs between SI and CIK+. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	/* 10 is the maximum number of waves per SIMD; the limits below can
	 * only lower it. */
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	switch (processor) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			/* LDS is shared by the whole thread group; divide it
			 * among the waves of the group. */
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size, 64);
		}
		break;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* The SGPR file limits occupancy; its usable size differs
		 * between VI+ and older chips. */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied). */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Private memory VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs,
			conf->private_mem_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* The summary always goes to the debug callback, regardless of the
	 * debug option. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs, conf->private_mem_vgprs);
}
6699
6700 const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
6701 {
6702 switch (processor) {
6703 case PIPE_SHADER_VERTEX:
6704 if (shader->key.as_es)
6705 return "Vertex Shader as ES";
6706 else if (shader->key.as_ls)
6707 return "Vertex Shader as LS";
6708 else
6709 return "Vertex Shader as VS";
6710 case PIPE_SHADER_TESS_CTRL:
6711 return "Tessellation Control Shader";
6712 case PIPE_SHADER_TESS_EVAL:
6713 if (shader->key.as_es)
6714 return "Tessellation Evaluation Shader as ES";
6715 else
6716 return "Tessellation Evaluation Shader as VS";
6717 case PIPE_SHADER_GEOMETRY:
6718 if (shader->is_gs_copy_shader)
6719 return "GS Copy Shader as VS";
6720 else
6721 return "Geometry Shader";
6722 case PIPE_SHADER_FRAGMENT:
6723 return "Pixel Shader";
6724 case PIPE_SHADER_COMPUTE:
6725 return "Compute Shader";
6726 default:
6727 return "Unknown Shader";
6728 }
6729 }
6730
/* Dump everything about a shader: its key, the recorded LLVM IR, the
 * disassembly of every part (prologs, previous stage, main, epilog) and
 * finally its statistics.
 *
 * \param check_debug_option  if true, honor the per-stage dump flag and
 *                            DBG_NO_ASM instead of dumping unconditionally
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file, bool check_debug_option)
{
	if (!check_debug_option ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, shader, file);

	/* llvm_ir_string is only recorded when sscreen->record_llvm_ir is
	 * set (see si_compile_llvm). */
	if (!check_debug_option && shader->binary.llvm_ir_string) {
		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (!check_debug_option ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		/* Dump the parts in upload order. */
		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);
		if (shader->previous_stage)
			si_shader_dump_disassembly(&shader->previous_stage->binary,
						   debug, "previous stage", file);
		if (shader->prolog2)
			si_shader_dump_disassembly(&shader->prolog2->binary,
						   debug, "prolog2", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, shader, debug, processor, file,
			     check_debug_option);
}
6771
6772 int si_compile_llvm(struct si_screen *sscreen,
6773 struct ac_shader_binary *binary,
6774 struct si_shader_config *conf,
6775 LLVMTargetMachineRef tm,
6776 LLVMModuleRef mod,
6777 struct pipe_debug_callback *debug,
6778 unsigned processor,
6779 const char *name)
6780 {
6781 int r = 0;
6782 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6783
6784 if (r600_can_dump_shader(&sscreen->b, processor)) {
6785 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6786
6787 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6788 fprintf(stderr, "%s LLVM IR:\n\n", name);
6789 ac_dump_module(mod);
6790 fprintf(stderr, "\n");
6791 }
6792 }
6793
6794 if (sscreen->record_llvm_ir) {
6795 char *ir = LLVMPrintModuleToString(mod);
6796 binary->llvm_ir_string = strdup(ir);
6797 LLVMDisposeMessage(ir);
6798 }
6799
6800 if (!si_replace_shader(count, binary)) {
6801 r = si_llvm_compile(mod, binary, tm, debug);
6802 if (r)
6803 return r;
6804 }
6805
6806 si_shader_binary_read_config(binary, conf, 0);
6807
6808 /* Enable 64-bit and 16-bit denormals, because there is no performance
6809 * cost.
6810 *
6811 * If denormals are enabled, all floating-point output modifiers are
6812 * ignored.
6813 *
6814 * Don't enable denormals for 32-bit floats, because:
6815 * - Floating-point output modifiers would be ignored by the hw.
6816 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6817 * have to stop using those.
6818 * - SI & CI would be very slow.
6819 */
6820 conf->float_mode |= V_00B028_FP_64_DENORMS;
6821
6822 FREE(binary->config);
6823 FREE(binary->global_symbol_offsets);
6824 binary->config = NULL;
6825 binary->global_symbol_offsets = NULL;
6826
6827 /* Some shaders can't have rodata because their binaries can be
6828 * concatenated.
6829 */
6830 if (binary->rodata_size &&
6831 (processor == PIPE_SHADER_VERTEX ||
6832 processor == PIPE_SHADER_TESS_CTRL ||
6833 processor == PIPE_SHADER_TESS_EVAL ||
6834 processor == PIPE_SHADER_FRAGMENT)) {
6835 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6836 return -EINVAL;
6837 }
6838
6839 return r;
6840 }
6841
6842 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6843 {
6844 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6845 LLVMBuildRetVoid(ctx->gallivm.builder);
6846 else
6847 LLVMBuildRet(ctx->gallivm.builder, ret);
6848 }
6849
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The GS copy shader reads the GS outputs back from the GSVS ring buffer
 * (one switch case per vertex stream), runs streamout when it is enabled,
 * and exports the stream-0 outputs as the hardware VS outputs.
 *
 * \return the compiled and uploaded shader, or NULL on failure.
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	struct gallivm_state *gallivm = &ctx.gallivm;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}


	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The copy shader is compiled as a hardware vertex shader. */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = gallivm->builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Per-vertex byte offset into the GSVS ring, derived from the
	 * vertex ID input. */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, LLVMGetParam(ctx.main_fn,
						    ctx.param_vertex_id), 4);

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* 2 bits per channel encode which stream each channel
		 * belongs to. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	/* Build a switch over the stream ID; each case loads and processes
	 * the outputs belonging to one stream. */
	end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Non-zero streams are only processed when streamout is
		 * enabled. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Channels not written by the GS, or
				 * belonging to another stream, are undef. */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1, true);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is exported as the hardware VS outputs. */
		if (stream == 0)
			si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(gallivm->builder);

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
		ac_dump_module(ctx.gallivm.module);

	si_llvm_finalize_module(&ctx,
		r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
7004
7005 static void si_dump_shader_key_vs(struct si_shader_key *key,
7006 struct si_vs_prolog_bits *prolog,
7007 const char *prefix, FILE *f)
7008 {
7009 fprintf(f, " %s.instance_divisors = {", prefix);
7010 for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
7011 fprintf(f, !i ? "%u" : ", %u",
7012 prolog->instance_divisors[i]);
7013 }
7014 fprintf(f, "}\n");
7015
7016 fprintf(f, " mono.vs.fix_fetch = {");
7017 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
7018 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
7019 fprintf(f, "}\n");
7020 }
7021
/* Print every shader key field that is relevant for the given shader
 * stage, for debugging. */
static void si_dump_shader_key(unsigned processor, struct si_shader *shader,
			       FILE *f)
{
	struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  as_ls = %u\n", key->as_ls);
		fprintf(f, "  mono.vs_export_prim_id = %u\n",
			key->mono.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the VS is merged into the TCS, so the TCS key
		 * also carries a VS (LS) prolog. */
		if (shader->selector->screen->b.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, "  mono.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->as_es);
		fprintf(f, "  mono.vs_export_prim_id = %u\n",
			key->mono.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no interesting key fields. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the ES is merged into the GS; print the VS prolog
		 * when the ES is a vertex shader. */
		if (shader->selector->screen->b.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* hw_vs optimizations apply to whichever stage runs as the hardware
	 * vertex shader (VS, TES or the GS copy shader). */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, "  opt.hw_vs.kill_outputs = 0x%"PRIx64"\n", key->opt.hw_vs.kill_outputs);
		fprintf(f, "  opt.hw_vs.kill_outputs2 = 0x%x\n", key->opt.hw_vs.kill_outputs2);
		fprintf(f, "  opt.hw_vs.clip_disable = %u\n", key->opt.hw_vs.clip_disable);
	}
}
7102
/* Initialize the shader context: set up the LLVM/gallivm state and register
 * all TGSI opcode handlers and fetch callbacks used by the TGSI -> LLVM
 * translation. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;
	struct lp_build_tgsi_action tmpl = {};

	si_llvm_context_init(ctx, sscreen, tm);

	bld_base = &ctx->bld_base;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes. */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	/* Texture sampling opcodes. */
	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX_LZ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;

	/* Shader image / buffer memory opcodes. */
	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;

	/* Atomics share fetch/emit handlers; only the intrinsic name
	 * differs per opcode. */
	tmpl.fetch_args = atomic_fetch_args;
	tmpl.emit = atomic_emit;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;

	/* Derivative opcodes. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Cross-lane (subgroup) opcodes. */
	bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
	bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
	bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
	bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;

	/* Geometry shader opcodes. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
}
7189
7190 static void si_eliminate_const_vs_outputs(struct si_shader_context *ctx)
7191 {
7192 struct si_shader *shader = ctx->shader;
7193 struct tgsi_shader_info *info = &shader->selector->info;
7194
7195 if (ctx->type == PIPE_SHADER_FRAGMENT ||
7196 ctx->type == PIPE_SHADER_COMPUTE ||
7197 shader->key.as_es ||
7198 shader->key.as_ls)
7199 return;
7200
7201 ac_eliminate_const_vs_outputs(&ctx->ac,
7202 ctx->main_fn,
7203 shader->info.vs_output_param_offset,
7204 info->num_outputs,
7205 &shader->info.nr_param_exports);
7206 }
7207
7208 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
7209 {
7210 ctx->shader->config.private_mem_vgprs = 0;
7211
7212 /* Process all LLVM instructions. */
7213 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
7214 while (bb) {
7215 LLVMValueRef next = LLVMGetFirstInstruction(bb);
7216
7217 while (next) {
7218 LLVMValueRef inst = next;
7219 next = LLVMGetNextInstruction(next);
7220
7221 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
7222 continue;
7223
7224 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
7225 /* No idea why LLVM aligns allocas to 4 elements. */
7226 unsigned alignment = LLVMGetAlignment(inst);
7227 unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment);
7228 ctx->shader->config.private_mem_vgprs += dw_size;
7229 }
7230 bb = LLVMGetNextBasicBlock(bb);
7231 }
7232 }
7233
7234 static void si_init_exec_full_mask(struct si_shader_context *ctx)
7235 {
7236 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
7237 lp_build_intrinsic(ctx->gallivm.builder,
7238 "llvm.amdgcn.init.exec", ctx->voidt,
7239 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
7240 }
7241
7242 static void si_init_exec_from_input(struct si_shader_context *ctx,
7243 unsigned param, unsigned bitoffset)
7244 {
7245 LLVMValueRef args[] = {
7246 LLVMGetParam(ctx->main_fn, param),
7247 LLVMConstInt(ctx->i32, bitoffset, 0),
7248 };
7249 lp_build_intrinsic(ctx->gallivm.builder,
7250 "llvm.amdgcn.init.exec.from.input",
7251 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
7252 }
7253
/* Translate the selector's TGSI tokens into LLVM IR in ctx->main_fn.
 *
 * Sets up the per-stage input/output callbacks, creates the function
 * signature, and (on GFX9) emits the EXEC initialization and merge barrier
 * for merged shaders.
 *
 * \param is_monolithic  true when prologs/epilogs are compiled into the
 *                       same binary instead of as separate parts
 * \return false if the TGSI -> LLVM translation failed
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 bool is_monolithic)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	/* Select the input-fetch and epilogue callbacks for this stage and
	 * its HW-stage variant (ES/LS vs. HW VS). */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		if (shader->key.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		bld_base->emit_epilogue = si_llvm_return_fs_outputs;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx->declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* For GFX9 merged shaders:
	 * - Set EXEC. If the prolog is present, set EXEC there instead.
	 * - Add a barrier before the second shader.
	 *
	 * The same thing for monolithic shaders is done in
	 * si_build_wrapper_function.
	 */
	if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
		if (sel->info.num_instructions > 1 && /* not empty shader */
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !sel->vs_needs_prolog))) {
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY) {
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 8);
			si_llvm_emit_barrier(NULL, bld_base, NULL);
		}
	}

	/* Per-stream emitted-vertex counters used by the EMIT handler. */
	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		int i;
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(&ctx->gallivm,
						ctx->i32, "");
		}
	}

	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
		return false;
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
7343
7344 /**
7345 * Compute the VS prolog key, which contains all the information needed to
7346 * build the VS prolog function, and set shader->info bits where needed.
7347 *
7348 * \param info Shader info of the vertex shader.
7349 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
7350 * \param prolog_key Key of the VS prolog
7351 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
7352 * \param key Output shader part key.
7353 */
7354 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
7355 unsigned num_input_sgprs,
7356 const struct si_vs_prolog_bits *prolog_key,
7357 struct si_shader *shader_out,
7358 union si_shader_part_key *key)
7359 {
7360 memset(key, 0, sizeof(*key));
7361 key->vs_prolog.states = *prolog_key;
7362 key->vs_prolog.num_input_sgprs = num_input_sgprs;
7363 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7364 key->vs_prolog.as_ls = shader_out->key.as_ls;
7365
7366 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
7367 key->vs_prolog.as_ls = 1;
7368 key->vs_prolog.num_merged_next_stage_vgprs = 2;
7369 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
7370 key->vs_prolog.num_merged_next_stage_vgprs = 5;
7371 }
7372
7373 /* Set the instanceID flag. */
7374 for (unsigned i = 0; i < info->num_inputs; i++)
7375 if (key->vs_prolog.states.instance_divisors[i])
7376 shader_out->info.uses_instanceid = true;
7377 }
7378
/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 *
 * \param shader          the pixel shader
 * \param key             output shader part key
 * \param separate_prolog whether the prolog is compiled as a separate shader
 *                        part (non-monolithic); this changes the VGPR indices
 *                        used for linear barycentrics below
 */
static void si_get_ps_prolog_key(struct si_shader *shader,
				 union si_shader_part_key *key,
				 bool separate_prolog)
{
	struct tgsi_shader_info *info = &shader->selector->info;

	memset(key, 0, sizeof(*key));
	key->ps_prolog.states = shader->key.part.ps.prolog;
	key->ps_prolog.colors_read = info->colors_read;
	key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog must run in WQM (whole quad mode) when the main part
	 * uses derivatives and the prolog produces interpolated values. */
	key->ps_prolog.wqm = info->uses_derivatives &&
		(key->ps_prolog.colors_read ||
		 key->ps_prolog.states.force_persp_sample_interp ||
		 key->ps_prolog.states.force_linear_sample_interp ||
		 key->ps_prolog.states.force_persp_center_interp ||
		 key->ps_prolog.states.force_linear_center_interp ||
		 key->ps_prolog.states.bc_optimize_for_persp ||
		 key->ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.part.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			key->ps_prolog.num_interp_inputs = info->num_inputs;
			key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Set up interpolation for COLOR0 and COLOR1. */
		for (unsigned i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			/* Skip colors whose components are never read. */
			if (!(info->colors_read & (0xf << i*4)))
				continue;

			key->ps_prolog.color_attr_index[i] = color[i];

			/* Flat shading downgrades COLOR interpolation to constant. */
			if (shader->key.part.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* -1: no barycentric VGPR needed for this color. */
				key->ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* Pick the VGPR pair holding the chosen perspective
				 * barycentrics and enable the matching SPI input. */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.part.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.part.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				/* The VGPR assignment for non-monolithic shaders
				 * works because InitialPSInputAddr is set on the
				 * main shader and PERSP_PULL_MODEL is never used.
				 */
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 6 : 9;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 8 : 11;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					key->ps_prolog.color_interp_vgpr_index[i] =
						separate_prolog ? 10 : 13;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}
}
7498
7499 /**
7500 * Check whether a PS prolog is required based on the key.
7501 */
7502 static bool si_need_ps_prolog(const union si_shader_part_key *key)
7503 {
7504 return key->ps_prolog.colors_read ||
7505 key->ps_prolog.states.force_persp_sample_interp ||
7506 key->ps_prolog.states.force_linear_sample_interp ||
7507 key->ps_prolog.states.force_persp_center_interp ||
7508 key->ps_prolog.states.force_linear_center_interp ||
7509 key->ps_prolog.states.bc_optimize_for_persp ||
7510 key->ps_prolog.states.bc_optimize_for_linear ||
7511 key->ps_prolog.states.poly_stipple;
7512 }
7513
7514 /**
7515 * Compute the PS epilog key, which contains all the information needed to
7516 * build the PS epilog function.
7517 */
7518 static void si_get_ps_epilog_key(struct si_shader *shader,
7519 union si_shader_part_key *key)
7520 {
7521 struct tgsi_shader_info *info = &shader->selector->info;
7522 memset(key, 0, sizeof(*key));
7523 key->ps_epilog.colors_written = info->colors_written;
7524 key->ps_epilog.writes_z = info->writes_z;
7525 key->ps_epilog.writes_stencil = info->writes_stencil;
7526 key->ps_epilog.writes_samplemask = info->writes_samplemask;
7527 key->ps_epilog.states = shader->key.part.ps.epilog;
7528 }
7529
/**
 * Build the GS prolog function. Rotate the input vertices for triangle strips
 * with adjacency.
 *
 * All input registers are passed through to the return value; when the
 * tri_strip_adj_fix state is set, the vertex-index VGPRs of every other
 * primitive are additionally rotated by 4 positions.
 */
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	unsigned num_sgprs, num_vgprs;
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
	LLVMTypeRef returns[48];
	LLVMValueRef func, ret;

	if (ctx->screen->b.chip_class >= GFX9) {
		num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
		num_vgprs = 5; /* ES inputs are not needed by GS */
	} else {
		num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
		num_vgprs = 8;
	}

	/* SGPRs are passed through as i32; VGPRs are returned as f32. */
	for (unsigned i = 0; i < num_sgprs; ++i) {
		params[i] = ctx->i32;
		returns[i] = ctx->i32;
	}

	for (unsigned i = 0; i < num_vgprs; ++i) {
		params[num_sgprs + i] = ctx->i32;
		returns[num_sgprs + i] = ctx->f32;
	}

	/* Create the function. */
	si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
			   params, num_sgprs + num_vgprs, num_sgprs - 1);
	func = ctx->main_fn;

	/* Set the full EXEC mask for the prolog, because we are only fiddling
	 * with registers here. The main shader part will set the correct EXEC
	 * mask.
	 */
	if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
		si_init_exec_full_mask(ctx);

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (unsigned i = 0; i < num_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(builder, ret, p, i, "");
	}
	for (unsigned i = 0; i < num_vgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
		p = LLVMBuildBitCast(builder, p, ctx->f32, "");
		ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
	}

	if (key->gs_prolog.states.tri_strip_adj_fix) {
		/* Remap the input vertices for every other primitive. */
		const unsigned gfx6_vtx_params[6] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 3,
			num_sgprs + 4,
			num_sgprs + 5,
			num_sgprs + 6
		};
		/* GFX9 packs two 16-bit vertex indices into each VGPR. */
		const unsigned gfx9_vtx_params[3] = {
			num_sgprs,
			num_sgprs + 1,
			num_sgprs + 4,
		};
		LLVMValueRef vtx_in[6], vtx_out[6];
		LLVMValueRef prim_id, rotate;

		if (ctx->screen->b.chip_class >= GFX9) {
			/* Unpack the low/high 16 bits of each packed VGPR. */
			for (unsigned i = 0; i < 3; i++) {
				vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
				vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
			}
		} else {
			for (unsigned i = 0; i < 6; i++)
				vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
		}

		/* The low bit of PrimID selects whether to rotate this prim. */
		prim_id = LLVMGetParam(func, num_sgprs + 2);
		rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");

		/* Rotate the six vertex indices by 4 positions when selected. */
		for (unsigned i = 0; i < 6; ++i) {
			LLVMValueRef base, rotated;
			base = vtx_in[i];
			rotated = vtx_in[(i + 4) % 6];
			vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
		}

		if (ctx->screen->b.chip_class >= GFX9) {
			/* Re-pack the 16-bit pairs and overwrite the passthrough
			 * return values for the vertex-index VGPRs. */
			for (unsigned i = 0; i < 3; i++) {
				LLVMValueRef hi, out;

				hi = LLVMBuildShl(builder, vtx_out[i*2+1],
						  LLVMConstInt(ctx->i32, 16, 0), "");
				out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
				out = LLVMBuildBitCast(builder, out, ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx9_vtx_params[i], "");
			}
		} else {
			for (unsigned i = 0; i < 6; i++) {
				LLVMValueRef out;

				out = LLVMBuildBitCast(builder, vtx_out[i], ctx->f32, "");
				ret = LLVMBuildInsertValue(builder, ret, out,
							   gfx6_vtx_params[i], "");
			}
		}
	}

	LLVMBuildRet(builder, ret);
}
7650
/**
 * Given a list of shader part functions, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 *
 * \param parts                  array of part functions, in execution order
 * \param num_parts              number of entries in \p parts
 * \param main_part              index of the main shader part; its parameter
 *                               types define the wrapper's signature
 * \param next_shader_first_part index of the first part of the second shader
 *                               when two shaders are merged (LS+HS, ES+GS);
 *                               0 when there is only one shader
 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
				      LLVMValueRef *parts,
				      unsigned num_parts,
				      unsigned main_part,
				      unsigned next_shader_first_part)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMBuilderRef builder = ctx->gallivm.builder;
	/* PS epilog has one arg per color component */
	LLVMTypeRef param_types[48];
	LLVMValueRef initial[48], out[48];
	LLVMTypeRef function_type;
	unsigned num_params;
	unsigned num_out, initial_num_out;
	MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
	MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
	unsigned num_sgprs, num_vgprs;
	unsigned last_sgpr_param;
	unsigned gprs;
	struct lp_build_if_state if_state;

	/* Force all parts to be inlined into the wrapper. */
	for (unsigned i = 0; i < num_parts; ++i) {
		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
	}

	/* The parameters of the wrapper function correspond to those of the
	 * first part in terms of SGPRs and VGPRs, but we use the types of the
	 * main part to get the right types. This is relevant for the
	 * dereferenceable attribute on descriptor table pointers.
	 */
	num_sgprs = 0;
	num_vgprs = 0;

	function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
	num_params = LLVMCountParamTypes(function_type);

	/* Count the SGPRs and VGPRs of the first part (in 32-bit units). */
	for (unsigned i = 0; i < num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(parts[0], i);

		if (ac_is_sgpr_param(param)) {
			assert(num_vgprs == 0);
			num_sgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		} else {
			num_vgprs += llvm_get_type_size(LLVMTypeOf(param)) / 4;
		}
	}
	assert(num_vgprs + num_sgprs <= ARRAY_SIZE(param_types));

	/* Take parameter types from the main part until the first part's
	 * SGPR+VGPR budget is covered. */
	num_params = 0;
	last_sgpr_param = 0;
	gprs = 0;
	while (gprs < num_sgprs + num_vgprs) {
		LLVMValueRef param = LLVMGetParam(parts[main_part], num_params);
		unsigned size;

		param_types[num_params] = LLVMTypeOf(param);
		if (gprs < num_sgprs)
			last_sgpr_param = num_params;
		size = llvm_get_type_size(param_types[num_params]) / 4;
		num_params++;

		assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
		/* A parameter must not straddle the SGPR/VGPR boundary. */
		assert(gprs + size <= num_sgprs + num_vgprs &&
		       (gprs >= num_sgprs || gprs + size <= num_sgprs));

		gprs += size;
	}

	si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param);

	if (is_merged_shader(ctx->shader))
		si_init_exec_full_mask(ctx);

	/* Record the arguments of the function as if they were an output of
	 * a previous part.
	 */
	num_out = 0;
	num_out_sgpr = 0;

	for (unsigned i = 0; i < num_params; ++i) {
		LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
		LLVMTypeRef param_type = LLVMTypeOf(param);
		LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : ctx->f32;
		unsigned size = llvm_get_type_size(param_type) / 4;

		if (size == 1) {
			if (param_type != out_type)
				param = LLVMBuildBitCast(builder, param, out_type, "");
			out[num_out++] = param;
		} else {
			/* Split multi-dword parameters into 32-bit pieces. */
			LLVMTypeRef vector_type = LLVMVectorType(out_type, size);

			if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
				param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
				param_type = ctx->i64;
			}

			if (param_type != vector_type)
				param = LLVMBuildBitCast(builder, param, vector_type, "");

			for (unsigned j = 0; j < size; ++j)
				out[num_out++] = LLVMBuildExtractElement(
					builder, param, LLVMConstInt(ctx->i32, j, 0), "");
		}

		if (i <= last_sgpr_param)
			num_out_sgpr = num_out;
	}

	/* Remember the top-level inputs; the second merged shader consumes
	 * these instead of the previous call's return value (see below). */
	memcpy(initial, out, sizeof(out));
	initial_num_out = num_out;
	initial_num_out_sgpr = num_out_sgpr;

	/* Now chain the parts. */
	for (unsigned part = 0; part < num_parts; ++part) {
		LLVMValueRef in[48];
		LLVMValueRef ret;
		LLVMTypeRef ret_type;
		unsigned out_idx = 0;

		num_params = LLVMCountParams(parts[part]);
		assert(num_params <= ARRAY_SIZE(param_types));

		/* Merged shaders are executed conditionally depending
		 * on the number of enabled threads passed in the input SGPRs. */
		if (is_merged_shader(ctx->shader) &&
		    (part == 0 || part == next_shader_first_part)) {
			LLVMValueRef ena, count = initial[3];

			/* The thread count for the 2nd shader is at bit-offset 8. */
			if (part == next_shader_first_part) {
				count = LLVMBuildLShr(builder, count,
						      LLVMConstInt(ctx->i32, 8, 0), "");
			}
			count = LLVMBuildAnd(builder, count,
					     LLVMConstInt(ctx->i32, 0x7f, 0), "");
			ena = LLVMBuildICmp(builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), count, "");
			lp_build_if(&if_state, &ctx->gallivm, ena);
		}

		/* Derive arguments for the next part from outputs of the
		 * previous one.
		 */
		for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
			LLVMValueRef param;
			LLVMTypeRef param_type;
			bool is_sgpr;
			unsigned param_size;
			LLVMValueRef arg = NULL;

			param = LLVMGetParam(parts[part], param_idx);
			param_type = LLVMTypeOf(param);
			param_size = llvm_get_type_size(param_type) / 4;
			is_sgpr = ac_is_sgpr_param(param);

			if (is_sgpr) {
				/* Drop byval: the part is inlined, the pointer
				 * is passed directly. */
#if HAVE_LLVM < 0x0400
				LLVMRemoveAttribute(param, LLVMByValAttribute);
#else
				unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
				LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
#endif
				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
			}

			assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
			assert(is_sgpr || out_idx >= num_out_sgpr);

			/* Gather the 32-bit pieces back into the parameter type. */
			if (param_size == 1)
				arg = out[out_idx];
			else
				arg = lp_build_gather_values(gallivm, &out[out_idx], param_size);

			if (LLVMTypeOf(arg) != param_type) {
				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
					arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
					arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
				} else {
					arg = LLVMBuildBitCast(builder, arg, param_type, "");
				}
			}

			in[param_idx] = arg;
			out_idx += param_size;
		}

		ret = LLVMBuildCall(builder, parts[part], in, num_params, "");

		if (is_merged_shader(ctx->shader) &&
		    (part + 1 == next_shader_first_part ||
		     part + 1 == num_parts)) {
			lp_build_endif(&if_state);

			if (part + 1 == next_shader_first_part) {
				/* A barrier is required between 2 merged shaders. */
				si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL);

				/* The second half of the merged shader should use
				 * the inputs from the toplevel (wrapper) function,
				 * not the return value from the last call.
				 *
				 * That's because the last call was executed condi-
				 * tionally, so we can't consume it in the main
				 * block.
				 */
				memcpy(out, initial, sizeof(initial));
				num_out = initial_num_out;
				num_out_sgpr = initial_num_out_sgpr;
			}
			continue;
		}

		/* Extract the returned GPRs. */
		ret_type = LLVMTypeOf(ret);
		num_out = 0;
		num_out_sgpr = 0;

		if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
			assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);

			unsigned ret_size = LLVMCountStructElementTypes(ret_type);

			for (unsigned i = 0; i < ret_size; ++i) {
				LLVMValueRef val =
					LLVMBuildExtractValue(builder, ret, i, "");

				out[num_out++] = val;

				/* i32 return elements are SGPRs; they must all
				 * precede the f32 (VGPR) elements. */
				if (LLVMTypeOf(val) == ctx->i32) {
					assert(num_out_sgpr + 1 == num_out);
					num_out_sgpr = num_out;
				}
			}
		}
	}

	LLVMBuildRetVoid(builder);
}
7895
/**
 * Compile a TGSI shader to machine code.
 *
 * For monolithic shaders, the prolog/epilog parts (and, on GFX9, the
 * previous merged stage) are built and glued together with a wrapper
 * function before compilation.
 *
 * \param sscreen        screen
 * \param tm             LLVM target machine
 * \param shader         the shader to compile (binary/config are filled in)
 * \param is_monolithic  build a single function including all parts
 * \param debug          debug callback
 * \return 0 on success, negative on failure
 */
int si_compile_tgsi_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader *shader,
			   bool is_monolithic,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	int r = -1;

	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
	 * conversion fails. */
	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
		tgsi_dump(sel->tokens, 0);
		si_dump_streamout(&sel->so);
	}

	si_init_shader_ctx(&ctx, sscreen, tm);
	si_llvm_context_set_tgsi(&ctx, shader);
	ctx.separate_prolog = !is_monolithic;

	/* Mark all output parameters as undefined until they are assigned. */
	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(shader->info.vs_output_param_offset));

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	ctx.load_system_value = declare_system_value;

	if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
		si_llvm_dispose(&ctx);
		return -1;
	}

	if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
		/* Monolithic VS: optional prolog + main part. */
		LLVMValueRef parts[2];
		bool need_prolog = sel->vs_needs_prolog;

		parts[1] = ctx.main_fn;

		if (need_prolog) {
			union si_shader_part_key prolog_key;
			si_get_vs_prolog_key(&sel->info,
					     shader->info.num_input_sgprs,
					     &shader->key.part.vs.prolog,
					     shader, &prolog_key);
			si_build_vs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_build_wrapper_function(&ctx, parts + !need_prolog,
					  1 + need_prolog, need_prolog, 0);
	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
		if (sscreen->b.chip_class >= GFX9) {
			/* GFX9 merged LS+HS:
			 * [VS prolog] + VS-as-LS main + TCS main + TCS epilog */
			struct si_shader_selector *ls = shader->key.part.tcs.ls;
			LLVMValueRef parts[4];

			/* TCS main part */
			parts[2] = ctx.main_fn;

			/* TCS epilog */
			union si_shader_part_key tcs_epilog_key;
			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
			parts[3] = ctx.main_fn;

			/* VS prolog */
			if (ls->vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&ls->info,
						     shader->info.num_input_sgprs,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				parts[0] = ctx.main_fn;
			}

			/* VS as LS main part */
			struct si_shader shader_ls = {};
			shader_ls.selector = ls;
			shader_ls.key.as_ls = 1;
			shader_ls.key.mono = shader->key.mono;
			shader_ls.key.opt = shader->key.opt;
			si_llvm_context_set_tgsi(&ctx, &shader_ls);

			if (!si_compile_tgsi_main(&ctx, true)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= ls->info.uses_instanceid;
			parts[1] = ctx.main_fn;

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_TESS_CTRL;

			si_build_wrapper_function(&ctx,
						  parts + !ls->vs_needs_prolog,
						  4 - !ls->vs_needs_prolog, 0,
						  ls->vs_needs_prolog ? 2 : 1);
		} else {
			/* Pre-GFX9 TCS: main part + epilog. */
			LLVMValueRef parts[2];
			union si_shader_part_key epilog_key;

			parts[0] = ctx.main_fn;

			memset(&epilog_key, 0, sizeof(epilog_key));
			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_build_tcs_epilog_function(&ctx, &epilog_key);
			parts[1] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
		}
	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
		if (ctx.screen->b.chip_class >= GFX9) {
			/* GFX9 merged ES+GS:
			 * [ES prolog] + VS-as-ES main + GS prolog + GS main */
			struct si_shader_selector *es = shader->key.part.gs.es;
			LLVMValueRef es_prolog = NULL;
			LLVMValueRef es_main = NULL;
			LLVMValueRef gs_prolog = NULL;
			LLVMValueRef gs_main = ctx.main_fn;

			/* GS prolog */
			union si_shader_part_key gs_prolog_key;
			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			gs_prolog_key.gs_prolog.is_monolithic = true;
			si_build_gs_prolog_function(&ctx, &gs_prolog_key);
			gs_prolog = ctx.main_fn;

			/* ES prolog */
			if (es->vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				/* NOTE(review): this reads key.part.tcs.ls_prolog
				 * in the GS path. Since "part" is a union this may
				 * alias the intended GS member, but confirm this is
				 * the field the ES prolog is supposed to use. */
				si_get_vs_prolog_key(&es->info,
						     shader->info.num_input_sgprs,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				es_prolog = ctx.main_fn;
			}

			/* ES main part */
			struct si_shader shader_es = {};
			shader_es.selector = es;
			shader_es.key.as_es = 1;
			shader_es.key.mono = shader->key.mono;
			shader_es.key.opt = shader->key.opt;
			si_llvm_context_set_tgsi(&ctx, &shader_es);

			if (!si_compile_tgsi_main(&ctx, true)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= es->info.uses_instanceid;
			es_main = ctx.main_fn;

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_GEOMETRY;

			/* Prepare the array of shader parts. */
			LLVMValueRef parts[4];
			unsigned num_parts = 0, main_part, next_first_part;

			if (es_prolog)
				parts[num_parts++] = es_prolog;

			parts[main_part = num_parts++] = es_main;
			parts[next_first_part = num_parts++] = gs_prolog;
			parts[num_parts++] = gs_main;

			si_build_wrapper_function(&ctx, parts, num_parts,
						  main_part, next_first_part);
		} else {
			/* Pre-GFX9 GS: prolog + main part. */
			LLVMValueRef parts[2];
			union si_shader_part_key prolog_key;

			parts[1] = ctx.main_fn;

			memset(&prolog_key, 0, sizeof(prolog_key));
			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			si_build_gs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
		}
	} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
		/* Monolithic PS: [prolog] + main part + epilog. */
		LLVMValueRef parts[3];
		union si_shader_part_key prolog_key;
		union si_shader_part_key epilog_key;
		bool need_prolog;

		si_get_ps_prolog_key(shader, &prolog_key, false);
		need_prolog = si_need_ps_prolog(&prolog_key);

		parts[need_prolog ? 1 : 0] = ctx.main_fn;

		if (need_prolog) {
			si_build_ps_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_get_ps_epilog_key(shader, &epilog_key);
		si_build_ps_epilog_function(&ctx, &epilog_key);
		parts[need_prolog ? 2 : 1] = ctx.main_fn;

		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
					  need_prolog ? 1 : 0, 0);
	}

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, ctx.type))
		LLVMDumpModule(ctx.gallivm.module);

	si_llvm_finalize_module(&ctx,
		    r600_extra_shader_checks(&sscreen->b, ctx.type));

	/* Post-optimization transformations and analysis. */
	si_eliminate_const_vs_outputs(&ctx);

	if ((debug && debug->debug_message) ||
	    r600_can_dump_shader(&sscreen->b, ctx.type))
		si_count_scratch_private_memory(&ctx);

	/* Compile to bytecode. */
	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
			    ctx.gallivm.module, debug, ctx.type, "TGSI shader");
	si_llvm_dispose(&ctx);
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		return r;
	}

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned wave_size = 64;
		unsigned max_vgprs = 256;
		unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
		unsigned max_sgprs_per_wave = 128;
		unsigned max_block_threads = si_get_max_workgroup_size(shader);
		unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);

		/* Per-wave register budget shrinks as occupancy requirements grow. */
		max_vgprs = max_vgprs / min_waves_per_simd;
		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs from the enabled
	 * SPI_PS_INPUT_ADDR bits; also locate the FRONT_FACE VGPR. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = 0;
		shader->info.face_vgpr_index = -1;

		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
	}

	return 0;
}
8210
8211 /**
8212 * Create, compile and return a shader part (prolog or epilog).
8213 *
8214 * \param sscreen screen
8215 * \param list list of shader parts of the same category
8216 * \param type shader type
8217 * \param key shader part key
8218 * \param prolog whether the part being requested is a prolog
8219 * \param tm LLVM target machine
8220 * \param debug debug callback
8221 * \param build the callback responsible for building the main function
8222 * \return non-NULL on success
8223 */
8224 static struct si_shader_part *
8225 si_get_shader_part(struct si_screen *sscreen,
8226 struct si_shader_part **list,
8227 enum pipe_shader_type type,
8228 bool prolog,
8229 union si_shader_part_key *key,
8230 LLVMTargetMachineRef tm,
8231 struct pipe_debug_callback *debug,
8232 void (*build)(struct si_shader_context *,
8233 union si_shader_part_key *),
8234 const char *name)
8235 {
8236 struct si_shader_part *result;
8237
8238 mtx_lock(&sscreen->shader_parts_mutex);
8239
8240 /* Find existing. */
8241 for (result = *list; result; result = result->next) {
8242 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
8243 mtx_unlock(&sscreen->shader_parts_mutex);
8244 return result;
8245 }
8246 }
8247
8248 /* Compile a new one. */
8249 result = CALLOC_STRUCT(si_shader_part);
8250 result->key = *key;
8251
8252 struct si_shader shader = {};
8253 struct si_shader_context ctx;
8254 struct gallivm_state *gallivm = &ctx.gallivm;
8255
8256 si_init_shader_ctx(&ctx, sscreen, tm);
8257 ctx.shader = &shader;
8258 ctx.type = type;
8259
8260 switch (type) {
8261 case PIPE_SHADER_VERTEX:
8262 break;
8263 case PIPE_SHADER_TESS_CTRL:
8264 assert(!prolog);
8265 shader.key.part.tcs.epilog = key->tcs_epilog.states;
8266 break;
8267 case PIPE_SHADER_GEOMETRY:
8268 assert(prolog);
8269 break;
8270 case PIPE_SHADER_FRAGMENT:
8271 if (prolog)
8272 shader.key.part.ps.prolog = key->ps_prolog.states;
8273 else
8274 shader.key.part.ps.epilog = key->ps_epilog.states;
8275 break;
8276 default:
8277 unreachable("bad shader part");
8278 }
8279
8280 build(&ctx, key);
8281
8282 /* Compile. */
8283 si_llvm_finalize_module(&ctx,
8284 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
8285
8286 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
8287 gallivm->module, debug, ctx.type, name)) {
8288 FREE(result);
8289 result = NULL;
8290 goto out;
8291 }
8292
8293 result->next = *list;
8294 *list = result;
8295
8296 out:
8297 si_llvm_dispose(&ctx);
8298 mtx_unlock(&sscreen->shader_parts_mutex);
8299 return result;
8300 }
8301
8302 /**
8303 * Build the vertex shader prolog function.
8304 *
8305 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
8306 * All inputs are returned unmodified. The vertex load indices are
8307 * stored after them, which will be used by the API VS for fetching inputs.
8308 *
8309 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
8310 * input_v0,
8311 * input_v1,
8312 * input_v2,
8313 * input_v3,
8314 * (VertexID + BaseVertex),
8315 * (InstanceID + StartInstance),
8316 * (InstanceID / 2 + StartInstance)
8317 */
8318 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
8319 union si_shader_part_key *key)
8320 {
8321 struct gallivm_state *gallivm = &ctx->gallivm;
8322 LLVMTypeRef *params, *returns;
8323 LLVMValueRef ret, func;
8324 int last_sgpr, num_params, num_returns, i;
8325 unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs +
8326 key->vs_prolog.num_merged_next_stage_vgprs;
8327 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
8328 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
8329 num_input_vgprs;
8330 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
8331
8332 ctx->param_vertex_id = first_vs_vgpr;
8333 ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1);
8334
8335 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
8336 params = alloca(num_all_input_regs * sizeof(LLVMTypeRef));
8337 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
8338 sizeof(LLVMTypeRef));
8339 num_params = 0;
8340 num_returns = 0;
8341
8342 /* Declare input and output SGPRs. */
8343 num_params = 0;
8344 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8345 params[num_params++] = ctx->i32;
8346 returns[num_returns++] = ctx->i32;
8347 }
8348 last_sgpr = num_params - 1;
8349
8350 /* Preloaded VGPRs (outputs must be floats) */
8351 for (i = 0; i < num_input_vgprs; i++) {
8352 params[num_params++] = ctx->i32;
8353 returns[num_returns++] = ctx->f32;
8354 }
8355
8356 /* Vertex load indices. */
8357 for (i = 0; i <= key->vs_prolog.last_input; i++)
8358 returns[num_returns++] = ctx->f32;
8359
8360 /* Create the function. */
8361 si_create_function(ctx, "vs_prolog", returns, num_returns, params,
8362 num_params, last_sgpr);
8363 func = ctx->main_fn;
8364
8365 if (key->vs_prolog.num_merged_next_stage_vgprs &&
8366 !key->vs_prolog.is_monolithic)
8367 si_init_exec_from_input(ctx, 3, 0);
8368
8369 /* Copy inputs to outputs. This should be no-op, as the registers match,
8370 * but it will prevent the compiler from overwriting them unintentionally.
8371 */
8372 ret = ctx->return_value;
8373 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
8374 LLVMValueRef p = LLVMGetParam(func, i);
8375 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8376 }
8377 for (; i < num_params; i++) {
8378 LLVMValueRef p = LLVMGetParam(func, i);
8379 p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
8380 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
8381 }
8382
8383 /* Compute vertex load indices from instance divisors. */
8384 for (i = 0; i <= key->vs_prolog.last_input; i++) {
8385 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
8386 LLVMValueRef index;
8387
8388 if (divisor) {
8389 /* InstanceID / Divisor + StartInstance */
8390 index = get_instance_index_for_fetch(ctx,
8391 user_sgpr_base +
8392 SI_SGPR_START_INSTANCE,
8393 divisor);
8394 } else {
8395 /* VertexID + BaseVertex */
8396 index = LLVMBuildAdd(gallivm->builder,
8397 LLVMGetParam(func, ctx->param_vertex_id),
8398 LLVMGetParam(func, user_sgpr_base +
8399 SI_SGPR_BASE_VERTEX), "");
8400 }
8401
8402 index = LLVMBuildBitCast(gallivm->builder, index, ctx->f32, "");
8403 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
8404 num_params++, "");
8405 }
8406
8407 si_llvm_build_ret(ctx, ret);
8408 }
8409
8410 static bool si_get_vs_prolog(struct si_screen *sscreen,
8411 LLVMTargetMachineRef tm,
8412 struct si_shader *shader,
8413 struct pipe_debug_callback *debug,
8414 struct si_shader *main_part,
8415 const struct si_vs_prolog_bits *key)
8416 {
8417 struct si_shader_selector *vs = main_part->selector;
8418
8419 /* The prolog is a no-op if there are no inputs. */
8420 if (!vs->vs_needs_prolog)
8421 return true;
8422
8423 /* Get the prolog. */
8424 union si_shader_part_key prolog_key;
8425 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
8426 key, shader, &prolog_key);
8427
8428 shader->prolog =
8429 si_get_shader_part(sscreen, &sscreen->vs_prologs,
8430 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
8431 debug, si_build_vs_prolog_function,
8432 "Vertex Shader Prolog");
8433 return shader->prolog != NULL;
8434 }
8435
8436 /**
8437 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
8438 */
8439 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
8440 LLVMTargetMachineRef tm,
8441 struct si_shader *shader,
8442 struct pipe_debug_callback *debug)
8443 {
8444 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
8445 &shader->key.part.vs.prolog);
8446 }
8447
/**
 * Compile the TCS epilog function. This writes tesselation factors to memory
 * based on the output primitive type of the tesselator (determined by TES).
 */
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	LLVMTypeRef params[32];
	LLVMValueRef func;
	int last_sgpr, num_params = 0;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
	params[ctx->param_rw_buffers = num_params++] =
		const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);

	if (ctx->screen->b.chip_class >= GFX9) {
		/* GFX9 merged-shader SGPR layout; the unnamed slots below are
		 * placeholders that keep the positions of the named ones
		 * correct — presumably they mirror the merged LS/HS signature
		 * (TODO: confirm against the main-shader declaration). */
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32; /* wave info */
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
	} else {
		/* Pre-GFX9 SGPR layout (non-merged TCS). */
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[num_params++] = ctx->i64;
		params[ctx->param_tcs_offchip_layout = num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[num_params++] = ctx->i32;
		params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
		params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
	}
	last_sgpr = num_params - 1;

	/* VGPR inputs consumed by si_write_tess_factors below. */
	params[num_params++] = ctx->i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx->i32; /* invocation ID within the patch */
	params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr);
	declare_lds_as_pointer(ctx);
	func = ctx->main_fn;

	/* Emit the tess-factor stores using the three VGPRs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	LLVMBuildRetVoid(gallivm->builder);
}
8513
8514 /**
8515 * Select and compile (or reuse) TCS parts (epilog).
8516 */
8517 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
8518 LLVMTargetMachineRef tm,
8519 struct si_shader *shader,
8520 struct pipe_debug_callback *debug)
8521 {
8522 if (sscreen->b.chip_class >= GFX9) {
8523 struct si_shader *ls_main_part =
8524 shader->key.part.tcs.ls->main_shader_part_ls;
8525
8526 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
8527 &shader->key.part.tcs.ls_prolog))
8528 return false;
8529
8530 shader->previous_stage = ls_main_part;
8531 }
8532
8533 /* Get the epilog. */
8534 union si_shader_part_key epilog_key;
8535 memset(&epilog_key, 0, sizeof(epilog_key));
8536 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
8537
8538 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
8539 PIPE_SHADER_TESS_CTRL, false,
8540 &epilog_key, tm, debug,
8541 si_build_tcs_epilog_function,
8542 "Tessellation Control Shader Epilog");
8543 return shader->epilog != NULL;
8544 }
8545
8546 /**
8547 * Select and compile (or reuse) GS parts (prolog).
8548 */
8549 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
8550 LLVMTargetMachineRef tm,
8551 struct si_shader *shader,
8552 struct pipe_debug_callback *debug)
8553 {
8554 if (sscreen->b.chip_class >= GFX9) {
8555 struct si_shader *es_main_part =
8556 shader->key.part.gs.es->main_shader_part_es;
8557
8558 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
8559 !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
8560 &shader->key.part.gs.vs_prolog))
8561 return false;
8562
8563 shader->previous_stage = es_main_part;
8564 }
8565
8566 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
8567 return true;
8568
8569 union si_shader_part_key prolog_key;
8570 memset(&prolog_key, 0, sizeof(prolog_key));
8571 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
8572
8573 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
8574 PIPE_SHADER_GEOMETRY, true,
8575 &prolog_key, tm, debug,
8576 si_build_gs_prolog_function,
8577 "Geometry Shader Prolog");
8578 return shader->prolog2 != NULL;
8579 }
8580
/**
 * Build the pixel shader prolog function. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overriden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 */
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;

	assert(si_need_ps_prolog(key));

	/* Number of inputs + 8 color elements. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx->i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx->f32;

	/* Declare outputs (same as inputs + add colors if needed) */
	/* Note: "params" doubles as the return-type list — its first
	 * num_params entries are the pass-through inputs, followed by up to
	 * 8 interpolated color channels. */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_prolog", params, num_returns, params,
			   num_params, last_sgpr);
	func = ctx->main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], list;

		/* Get the pointer to rw buffers. */
		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
		list = lp_build_gather_values(gallivm, ptr, 2);
		/* Combine the two 32-bit halves into a 64-bit address and
		 * reinterpret it as a descriptor-array pointer. */
		list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
		list = LLVMBuildIntToPtr(gallivm->builder, list,
					 const_array(ctx->v16i8, SI_NUM_RW_BUFFERS), "");

		si_llvm_emit_polygon_stipple(ctx, list, pos);
	}

	if (key->ps_prolog.states.bc_optimize_for_persp ||
	    key->ps_prolog.states.bc_optimize_for_linear) {
		/* VGPR slot layout relative to "base" (first input VGPR), as
		 * used throughout this function: persp sample = +0..1,
		 * persp center = +2..3, persp centroid = +4..5,
		 * linear sample = +6..7, linear center = +8..9,
		 * linear centroid = +10..11. */
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;

		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 *
		 * PRIM_MASK is after user SGPRs.
		 */
		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
					     ctx->i1, "");

		if (key->ps_prolog.states.bc_optimize_for_persp) {
			/* Read PERSP_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 2 + i);
			/* Read PERSP_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 4 + i);
			/* Select PERSP_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 4 + i, "");
			}
		}
		if (key->ps_prolog.states.bc_optimize_for_linear) {
			/* Read LINEAR_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 8 + i);
			/* Read LINEAR_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 10 + i);
			/* Select LINEAR_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 10 + i, "");
			}
		}
	}

	/* Force per-sample interpolation. */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Force center interpolation. */
	if (key->ps_prolog.states.force_persp_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_center[2];

		/* Read PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			persp_center[i] = LLVMGetParam(func, base + 2 + i);
		/* Overwrite PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_center[2];

		/* Read LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			linear_center[i] = LLVMGetParam(func, base + 8 + i);
		/* Overwrite LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 6 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 10 + i, "");
	}

	/* Interpolate colors. */
	/* Up to two COLOR inputs; colors_read holds a 4-bit channel mask
	 * per color. */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Get the (i,j) updated by bc_optimize handling. */
			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr, "");
			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr + 1, "");
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx->i32, "");
		}

		interp_fs_input(ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append the interpolated channels after the pass-through
		 * outputs (these are the extra color returns declared above). */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	si_llvm_build_ret(ctx, ret);
}
8824
/**
 * Build the pixel shader epilog function. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 */
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct gallivm_state *gallivm = &ctx->gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	/* Worst case: 16 SGPR slots + 8 MRTs * 4 channels + Z/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params = 0, i;
	struct si_ps_exports exp = {};

	/* Declare input SGPRs. */
	params[ctx->param_rw_buffers = num_params++] = ctx->i64;
	params[ctx->param_const_buffers = num_params++] = ctx->i64;
	params[ctx->param_samplers = num_params++] = ctx->i64;
	params[ctx->param_images = num_params++] = ctx->i64;
	params[ctx->param_shader_buffers = num_params++] = ctx->i64;
	assert(num_params == SI_PARAM_ALPHA_REF);
	params[SI_PARAM_ALPHA_REF] = ctx->f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs. */
	/* One VGPR per written color channel plus one each for Z, stencil
	 * and sample mask when written. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Guarantee that the sample-mask slot exists even if nothing else
	 * fills the VGPR list up to it. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr);
	/* Disable elimination of unused inputs. */
	si_llvm_add_attribute(ctx->main_fn,
			      "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* The last MRT with both a written color and a
			 * non-zero export format. */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);

		/* num_params - 1 is the last declared VGPR — presumably the
		 * sample-mask slot guaranteed by the MAX2 above; confirm
		 * against si_export_mrt_color's parameter list. */
		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
}
8926
8927 /**
8928 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
8929 */
8930 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
8931 LLVMTargetMachineRef tm,
8932 struct si_shader *shader,
8933 struct pipe_debug_callback *debug)
8934 {
8935 union si_shader_part_key prolog_key;
8936 union si_shader_part_key epilog_key;
8937
8938 /* Get the prolog. */
8939 si_get_ps_prolog_key(shader, &prolog_key, true);
8940
8941 /* The prolog is a no-op if these aren't set. */
8942 if (si_need_ps_prolog(&prolog_key)) {
8943 shader->prolog =
8944 si_get_shader_part(sscreen, &sscreen->ps_prologs,
8945 PIPE_SHADER_FRAGMENT, true,
8946 &prolog_key, tm, debug,
8947 si_build_ps_prolog_function,
8948 "Fragment Shader Prolog");
8949 if (!shader->prolog)
8950 return false;
8951 }
8952
8953 /* Get the epilog. */
8954 si_get_ps_epilog_key(shader, &epilog_key);
8955
8956 shader->epilog =
8957 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
8958 PIPE_SHADER_FRAGMENT, false,
8959 &epilog_key, tm, debug,
8960 si_build_ps_epilog_function,
8961 "Fragment Shader Epilog");
8962 if (!shader->epilog)
8963 return false;
8964
8965 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
8966 if (shader->key.part.ps.prolog.poly_stipple) {
8967 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
8968 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
8969 }
8970
8971 /* Set up the enable bits for per-sample shading if needed. */
8972 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
8973 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8974 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8975 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
8976 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8977 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
8978 }
8979 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
8980 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
8981 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8982 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
8983 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8984 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
8985 }
8986 if (shader->key.part.ps.prolog.force_persp_center_interp &&
8987 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8988 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8989 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
8990 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
8991 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
8992 }
8993 if (shader->key.part.ps.prolog.force_linear_center_interp &&
8994 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
8995 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
8996 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
8997 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
8998 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
8999 }
9000
9001 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
9002 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
9003 !(shader->config.spi_ps_input_ena & 0xf)) {
9004 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
9005 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
9006 }
9007
9008 /* At least one pair of interpolation weights must be enabled. */
9009 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
9010 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
9011 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
9012 }
9013
9014 /* The sample mask input is always enabled, because the API shader always
9015 * passes it through to the epilog. Disable it here if it's unused.
9016 */
9017 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
9018 !shader->selector->info.reads_samplemask)
9019 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
9020
9021 return true;
9022 }
9023
9024 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
9025 unsigned *lds_size)
9026 {
9027 /* SPI barrier management bug:
9028 * Make sure we have at least 4k of LDS in use to avoid the bug.
9029 * It applies to workgroup sizes of more than one wavefront.
9030 */
9031 if (sscreen->b.family == CHIP_BONAIRE ||
9032 sscreen->b.family == CHIP_KABINI ||
9033 sscreen->b.family == CHIP_MULLINS)
9034 *lds_size = MAX2(*lds_size, 8);
9035 }
9036
9037 static void si_fix_resource_usage(struct si_screen *sscreen,
9038 struct si_shader *shader)
9039 {
9040 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
9041
9042 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
9043
9044 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
9045 si_get_max_workgroup_size(shader) > 64) {
9046 si_multiwave_lds_size_workaround(sscreen,
9047 &shader->config.lds_size);
9048 }
9049 }
9050
/**
 * Create a complete shader variant: either compile it monolithically or
 * combine the precompiled main part with on-demand prologs/epilogs, then
 * dump and upload the binary.
 *
 * Returns 0 on success, a negative value on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	/* Only used on the non-monolithic path below. */
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. */
		/* The binary is shared with the main part, so
		 * si_shader_destroy must not free it twice. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			/* TES has no prolog/epilog parts. */
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts. */
		/* The combined shader must satisfy the register demands of
		 * every part, so take the maximum across all of them. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
9180
void si_shader_destroy(struct si_shader *shader)
{
	/* Release the scratch buffer, if one was allocated for spilling. */
	if (shader->scratch_bo)
		r600_resource_reference(&shader->scratch_bo, NULL);

	/* Release the buffer holding the uploaded shader code. */
	r600_resource_reference(&shader->bo, NULL);

	/* The binary is borrowed from the main shader part when
	 * is_binary_shared is set, so only clean it up when this
	 * shader owns it. */
	if (!shader->is_binary_shared)
		radeon_shader_binary_clean(&shader->binary);

	free(shader->shader_log);
}