0077054749914fa9c492aeb1f7272882f0525185
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "gallivm/lp_bld_const.h"
25 #include "gallivm/lp_bld_gather.h"
26 #include "gallivm/lp_bld_intr.h"
27 #include "gallivm/lp_bld_logic.h"
28 #include "gallivm/lp_bld_arit.h"
29 #include "gallivm/lp_bld_flow.h"
30 #include "gallivm/lp_bld_misc.h"
31 #include "util/u_memory.h"
32 #include "util/u_string.h"
33 #include "tgsi/tgsi_build.h"
34 #include "tgsi/tgsi_util.h"
35 #include "tgsi/tgsi_dump.h"
36
37 #include "ac_binary.h"
38 #include "ac_llvm_util.h"
39 #include "ac_exp_param.h"
40 #include "si_shader_internal.h"
41 #include "si_pipe.h"
42 #include "sid.h"
43
44 #include "compiler/nir/nir.h"
45
46 static const char *scratch_rsrc_dword0_symbol =
47 "SCRATCH_RSRC_DWORD0";
48
49 static const char *scratch_rsrc_dword1_symbol =
50 "SCRATCH_RSRC_DWORD1";
51
/* Collected data for one shader output: the LLVM values of all four
 * channels, the TGSI semantic that identifies the output, and the GS
 * vertex stream each channel is written to. */
struct si_shader_output_values
{
	LLVMValueRef values[4];
	unsigned semantic_name;
	unsigned semantic_index;
	ubyte vertex_stream[4];
};
59
/**
 * Used to collect types and other info about arguments of the LLVM function
 * before the function is created.
 */
struct si_function_info {
	LLVMTypeRef types[100];		/* argument types, in declaration order */
	LLVMValueRef *assign[100];	/* optional destination for each arg value (may be NULL) */
	unsigned num_sgpr_params;	/* number of leading SGPR arguments */
	unsigned num_params;		/* total number of arguments added so far */
};
70
/* Register file a function argument is passed in: scalar (uniform across
 * the wave) or vector (per-lane) registers. */
enum si_arg_regfile {
	ARG_SGPR,
	ARG_VGPR
};
75
76 static void si_init_shader_ctx(struct si_shader_context *ctx,
77 struct si_screen *sscreen,
78 LLVMTargetMachineRef tm);
79
80 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
81 struct lp_build_tgsi_context *bld_base,
82 struct lp_build_emit_data *emit_data);
83
84 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
85 FILE *f);
86
87 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
88 union si_shader_part_key *key);
89 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
90 union si_shader_part_key *key);
91 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
92 union si_shader_part_key *key);
93 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
94 union si_shader_part_key *key);
95
96 /* Ideally pass the sample mask input to the PS epilog as v14, which
97 * is its usual location, so that the shader doesn't have to add v_mov.
98 */
99 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
100
/* LLVM address-space numbers used by the AMDGPU backend. */
enum {
	CONST_ADDR_SPACE = 2,
	LOCAL_ADDR_SPACE = 3,
};
105
106 static bool llvm_type_is_64bit(struct si_shader_context *ctx,
107 LLVMTypeRef type)
108 {
109 if (type == ctx->ac.i64 || type == ctx->ac.f64)
110 return true;
111
112 return false;
113 }
114
115 static bool is_merged_shader(struct si_shader *shader)
116 {
117 if (shader->selector->screen->info.chip_class <= VI)
118 return false;
119
120 return shader->key.as_ls ||
121 shader->key.as_es ||
122 shader->selector->type == PIPE_SHADER_TESS_CTRL ||
123 shader->selector->type == PIPE_SHADER_GEOMETRY;
124 }
125
126 static void si_init_function_info(struct si_function_info *fninfo)
127 {
128 fninfo->num_params = 0;
129 fninfo->num_sgpr_params = 0;
130 }
131
132 static unsigned add_arg_assign(struct si_function_info *fninfo,
133 enum si_arg_regfile regfile, LLVMTypeRef type,
134 LLVMValueRef *assign)
135 {
136 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
137
138 unsigned idx = fninfo->num_params++;
139 assert(idx < ARRAY_SIZE(fninfo->types));
140
141 if (regfile == ARG_SGPR)
142 fninfo->num_sgpr_params = fninfo->num_params;
143
144 fninfo->types[idx] = type;
145 fninfo->assign[idx] = assign;
146 return idx;
147 }
148
149 static unsigned add_arg(struct si_function_info *fninfo,
150 enum si_arg_regfile regfile, LLVMTypeRef type)
151 {
152 return add_arg_assign(fninfo, regfile, type, NULL);
153 }
154
155 static void add_arg_assign_checked(struct si_function_info *fninfo,
156 enum si_arg_regfile regfile, LLVMTypeRef type,
157 LLVMValueRef *assign, unsigned idx)
158 {
159 MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
160 assert(actual == idx);
161 }
162
163 static void add_arg_checked(struct si_function_info *fninfo,
164 enum si_arg_regfile regfile, LLVMTypeRef type,
165 unsigned idx)
166 {
167 add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
168 }
169
170 /**
171 * Returns a unique index for a per-patch semantic name and index. The index
172 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
173 * can be calculated.
174 */
175 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
176 {
177 switch (semantic_name) {
178 case TGSI_SEMANTIC_TESSOUTER:
179 return 0;
180 case TGSI_SEMANTIC_TESSINNER:
181 return 1;
182 case TGSI_SEMANTIC_PATCH:
183 assert(index < 30);
184 return 2 + index;
185
186 default:
187 assert(!"invalid semantic name");
188 return 0;
189 }
190 }
191
192 /**
193 * Returns a unique index for a semantic name and index. The index must be
194 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
195 * calculated.
196 */
197 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
198 {
199 switch (semantic_name) {
200 case TGSI_SEMANTIC_POSITION:
201 return 0;
202 case TGSI_SEMANTIC_GENERIC:
203 /* Since some shader stages use the the highest used IO index
204 * to determine the size to allocate for inputs/outputs
205 * (in LDS, tess and GS rings). GENERIC should be placed right
206 * after POSITION to make that size as small as possible.
207 */
208 if (index < SI_MAX_IO_GENERIC)
209 return 1 + index;
210
211 assert(!"invalid generic index");
212 return 0;
213 case TGSI_SEMANTIC_PSIZE:
214 return SI_MAX_IO_GENERIC + 1;
215 case TGSI_SEMANTIC_CLIPDIST:
216 assert(index <= 1);
217 return SI_MAX_IO_GENERIC + 2 + index;
218 case TGSI_SEMANTIC_FOG:
219 return SI_MAX_IO_GENERIC + 4;
220 case TGSI_SEMANTIC_LAYER:
221 return SI_MAX_IO_GENERIC + 5;
222 case TGSI_SEMANTIC_VIEWPORT_INDEX:
223 return SI_MAX_IO_GENERIC + 6;
224 case TGSI_SEMANTIC_PRIMID:
225 return SI_MAX_IO_GENERIC + 7;
226 case TGSI_SEMANTIC_COLOR: /* these alias */
227 case TGSI_SEMANTIC_BCOLOR:
228 assert(index < 2);
229 return SI_MAX_IO_GENERIC + 8 + index;
230 case TGSI_SEMANTIC_TEXCOORD:
231 assert(index < 8);
232 assert(SI_MAX_IO_GENERIC + 10 + index < 64);
233 return SI_MAX_IO_GENERIC + 10 + index;
234 default:
235 assert(!"invalid semantic name");
236 return 0;
237 }
238 }
239
240 /**
241 * Get the value of a shader input parameter and extract a bitfield.
242 */
243 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
244 unsigned param, unsigned rshift,
245 unsigned bitwidth)
246 {
247 LLVMValueRef value = LLVMGetParam(ctx->main_fn,
248 param);
249
250 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
251 value = ac_to_integer(&ctx->ac, value);
252
253 if (rshift)
254 value = LLVMBuildLShr(ctx->ac.builder, value,
255 LLVMConstInt(ctx->i32, rshift, 0), "");
256
257 if (rshift + bitwidth < 32) {
258 unsigned mask = (1 << bitwidth) - 1;
259 value = LLVMBuildAnd(ctx->ac.builder, value,
260 LLVMConstInt(ctx->i32, mask, 0), "");
261 }
262
263 return value;
264 }
265
266 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
267 {
268 switch (ctx->type) {
269 case PIPE_SHADER_TESS_CTRL:
270 return unpack_param(ctx, ctx->param_tcs_rel_ids, 0, 8);
271
272 case PIPE_SHADER_TESS_EVAL:
273 return LLVMGetParam(ctx->main_fn,
274 ctx->param_tes_rel_patch_id);
275
276 default:
277 assert(0);
278 return NULL;
279 }
280 }
281
282 /* Tessellation shaders pass outputs to the next shader using LDS.
283 *
284 * LS outputs = TCS inputs
285 * TCS outputs = TES inputs
286 *
287 * The LDS layout is:
288 * - TCS inputs for patch 0
289 * - TCS inputs for patch 1
290 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
291 * - ...
292 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
293 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
294 * - TCS outputs for patch 1
295 * - Per-patch TCS outputs for patch 1
296 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
297 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
298 * - ...
299 *
300 * All three shaders VS(LS), TCS, TES share the same LDS space.
301 */
302
303 static LLVMValueRef
304 get_tcs_in_patch_stride(struct si_shader_context *ctx)
305 {
306 return unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
307 }
308
309 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
310 {
311 assert(ctx->type == PIPE_SHADER_TESS_CTRL);
312
313 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
314 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
315
316 return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
317 }
318
319 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
320 {
321 unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
322
323 return LLVMConstInt(ctx->i32, stride, 0);
324 }
325
326 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
327 {
328 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
329 return unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
330
331 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
332 unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
333 unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
334 unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
335 unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
336 num_patch_outputs * 4;
337 return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
338 }
339
340 static LLVMValueRef
341 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
342 {
343 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
344 unpack_param(ctx,
345 ctx->param_tcs_out_lds_offsets,
346 0, 16),
347 4);
348 }
349
350 static LLVMValueRef
351 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
352 {
353 return lp_build_mul_imm(&ctx->bld_base.uint_bld,
354 unpack_param(ctx,
355 ctx->param_tcs_out_lds_offsets,
356 16, 16),
357 4);
358 }
359
360 static LLVMValueRef
361 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
362 {
363 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
364 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
365
366 return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
367 }
368
369 static LLVMValueRef
370 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
371 {
372 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
373 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
374 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
375
376 return LLVMBuildAdd(ctx->ac.builder, patch0_offset,
377 LLVMBuildMul(ctx->ac.builder, patch_stride,
378 rel_patch_id, ""),
379 "");
380 }
381
382 static LLVMValueRef
383 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
384 {
385 LLVMValueRef patch0_patch_data_offset =
386 get_tcs_out_patch0_patch_data_offset(ctx);
387 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
388 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
389
390 return LLVMBuildAdd(ctx->ac.builder, patch0_patch_data_offset,
391 LLVMBuildMul(ctx->ac.builder, patch_stride,
392 rel_patch_id, ""),
393 "");
394 }
395
396 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
397 {
398 unsigned tcs_out_vertices =
399 ctx->shader->selector ?
400 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
401
402 /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
403 if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
404 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
405
406 return unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
407 }
408
409 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
410 {
411 unsigned stride;
412
413 switch (ctx->type) {
414 case PIPE_SHADER_VERTEX:
415 stride = util_last_bit64(ctx->shader->selector->outputs_written);
416 return LLVMConstInt(ctx->i32, stride * 4, 0);
417
418 case PIPE_SHADER_TESS_CTRL:
419 if (ctx->screen->info.chip_class >= GFX9 &&
420 ctx->shader->is_monolithic) {
421 stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
422 return LLVMConstInt(ctx->i32, stride * 4, 0);
423 }
424 return unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
425
426 default:
427 assert(0);
428 return NULL;
429 }
430 }
431
432 static LLVMValueRef get_instance_index_for_fetch(
433 struct si_shader_context *ctx,
434 unsigned param_start_instance, LLVMValueRef divisor)
435 {
436 LLVMValueRef result = ctx->abi.instance_id;
437
438 /* The division must be done before START_INSTANCE is added. */
439 if (divisor != ctx->i32_1)
440 result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
441
442 return LLVMBuildAdd(ctx->ac.builder, result,
443 LLVMGetParam(ctx->main_fn, param_start_instance), "");
444 }
445
446 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
447 * to float. */
448 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
449 LLVMValueRef vec4,
450 unsigned double_index)
451 {
452 LLVMBuilderRef builder = ctx->ac.builder;
453 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
454 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
455 LLVMVectorType(f64, 2), "");
456 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
457 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
458 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
459 }
460
461 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
462 LLVMValueRef i32, unsigned index)
463 {
464 assert(index <= 1);
465
466 if (index == 1)
467 return LLVMBuildAShr(ctx->ac.builder, i32,
468 LLVMConstInt(ctx->i32, 16, 0), "");
469
470 return LLVMBuildSExt(ctx->ac.builder,
471 LLVMBuildTrunc(ctx->ac.builder, i32,
472 ctx->ac.i16, ""),
473 ctx->i32, "");
474 }
475
/**
 * Load one vertex shader input into out[0..3].
 *
 * Two paths:
 * - Blit shaders (TGSI_PROPERTY_VS_BLIT_SGPRS non-zero): inputs come from
 *   user SGPRs and are selected per blit-rectangle vertex.
 * - Regular vertex fetch: loads from the vertex buffer descriptor list and
 *   applies the format fixups recorded in key.mono.vs_fix_fetch.
 */
void si_llvm_load_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	LLVMValueRef out[4])
{
	unsigned vs_blit_property =
		ctx->shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];

	if (vs_blit_property) {
		LLVMValueRef vertex_id = ctx->abi.vertex_id;
		/* Vertices 0 and 1 of the rectangle use x1; vertex 2 uses x2. */
		LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntULE, vertex_id,
						    ctx->i32_1, "");
		/* Use LLVMIntNE, because we have 3 vertices and only
		 * the middle one should use y2.
		 */
		LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
						    LLVMIntNE, vertex_id,
						    ctx->i32_1, "");

		if (input_index == 0) {
			/* Position: x1y1/x2y2 are two pairs of packed signed
			 * 16-bit coordinates in SGPRs. */
			LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
							 ctx->param_vs_blit_inputs);
			LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
							 ctx->param_vs_blit_inputs + 1);

			LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
			LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
			LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
			LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);

			LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
							 x1, x2, "");
			LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
							 y1, y2, "");

			out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
			out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
			/* Depth is passed directly as a float SGPR. */
			out[2] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 2);
			out[3] = ctx->ac.f32_1;
			return;
		}

		/* Color or texture coordinates: */
		assert(input_index == 1);

		if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
			/* One SGPR per color channel. */
			for (int i = 0; i < 4; i++) {
				out[i] = LLVMGetParam(ctx->main_fn,
						      ctx->param_vs_blit_inputs + 3 + i);
			}
		} else {
			assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
			/* Texcoords: per-corner x/y plus z and w SGPRs. */
			LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 3);
			LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 4);
			LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 5);
			LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
						       ctx->param_vs_blit_inputs + 6);

			out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
						 x1, x2, "");
			out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
						 y1, y2, "");
			out[2] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 7);
			out[3] = LLVMGetParam(ctx->main_fn,
					      ctx->param_vs_blit_inputs + 8);
		}
		return;
	}

	unsigned chan;
	unsigned fix_fetch;
	unsigned num_fetches;
	unsigned fetch_stride;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef vertex_index;
	LLVMValueRef input[3];

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);

	t_offset = LLVMConstInt(ctx->i32, input_index, 0);

	t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);

	vertex_index = LLVMGetParam(ctx->main_fn,
				    ctx->param_vertex_index0 +
				    input_index);

	fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];

	/* Do multiple loads for special formats. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_RGB_64_FLOAT:
		num_fetches = 3; /* 3 2-dword loads */
		fetch_stride = 8;
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		num_fetches = 2; /* 2 4-dword loads */
		fetch_stride = 16;
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
		num_fetches = 3;
		fetch_stride = 1;
		break;
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		num_fetches = 3;
		fetch_stride = 2;
		break;
	default:
		num_fetches = 1;
		fetch_stride = 0;
	}

	for (unsigned i = 0; i < num_fetches; i++) {
		LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);

		input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
						       vertex_index, voffset,
						       true);
	}

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
		out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
						    input[0], llvm_chan, "");
	}

	/* Fix up formats the hardware fetch can't produce directly. */
	switch (fix_fetch) {
	case SI_FIX_FETCH_A2_SNORM:
	case SI_FIX_FETCH_A2_SSCALED:
	case SI_FIX_FETCH_A2_SINT: {
		/* The hardware returns an unsigned value; convert it to a
		 * signed one.
		 */
		LLVMValueRef tmp = out[3];
		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);

		/* First, recover the sign-extended signed integer value. */
		if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
			tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
		else
			tmp = ac_to_integer(&ctx->ac, tmp);

		/* For the integer-like cases, do a natural sign extension.
		 *
		 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
		 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
		 * exponent.
		 */
		tmp = LLVMBuildShl(ctx->ac.builder, tmp,
				   fix_fetch == SI_FIX_FETCH_A2_SNORM ?
				   LLVMConstInt(ctx->i32, 7, 0) : c30, "");
		tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");

		/* Convert back to the right type. */
		if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
			LLVMValueRef clamp;
			LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
			/* Values below -1.0 are clamped to -1.0. */
			clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
			tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
		} else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
			tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
		}

		out[3] = tmp;
		break;
	}
	case SI_FIX_FETCH_RGBA_32_UNORM:
	case SI_FIX_FETCH_RGBX_32_UNORM:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
						  LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
		}
		/* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_32_SNORM:
	case SI_FIX_FETCH_RGBX_32_SNORM:
	case SI_FIX_FETCH_RGBA_32_FIXED:
	case SI_FIX_FETCH_RGBX_32_FIXED: {
		double scale;
		if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
			scale = 1.0 / 0x10000; /* FIXED is 16.16 fixed point */
		else
			scale = 1.0 / INT_MAX;

		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
			out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
						  LLVMConstReal(ctx->f32, scale), "");
		}
		/* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
		if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
		    fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
			out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	}
	case SI_FIX_FETCH_RGBA_32_USCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RGBA_32_SSCALED:
		for (chan = 0; chan < 4; chan++) {
			out[chan] = ac_to_integer(&ctx->ac, out[chan]);
			out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
						    out[chan], ctx->f32, "");
		}
		break;
	case SI_FIX_FETCH_RG_64_FLOAT:
		for (chan = 0; chan < 2; chan++)
			out[chan] = extract_double_to_float(ctx, input[0], chan);

		out[2] = LLVMConstReal(ctx->f32, 0);
		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGB_64_FLOAT:
		/* Each of the 3 fetches holds one double. */
		for (chan = 0; chan < 3; chan++)
			out[chan] = extract_double_to_float(ctx, input[chan], 0);

		out[3] = LLVMConstReal(ctx->f32, 1);
		break;
	case SI_FIX_FETCH_RGBA_64_FLOAT:
		/* Each of the 2 fetches holds two doubles. */
		for (chan = 0; chan < 4; chan++) {
			out[chan] = extract_double_to_float(ctx, input[chan / 2],
							    chan % 2);
		}
		break;
	case SI_FIX_FETCH_RGB_8:
	case SI_FIX_FETCH_RGB_8_INT:
	case SI_FIX_FETCH_RGB_16:
	case SI_FIX_FETCH_RGB_16_INT:
		/* One component fetched per load; take channel 0 of each. */
		for (chan = 0; chan < 3; chan++) {
			out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
							    input[chan],
							    ctx->i32_0, "");
		}
		if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
		    fix_fetch == SI_FIX_FETCH_RGB_16) {
			out[3] = LLVMConstReal(ctx->f32, 1);
		} else {
			/* Integer formats: alpha is integer 1, bitcast to float. */
			out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
		}
		break;
	}
}
744
/* TGSI input-declaration hook for VS: just loads the input values.
 * \p decl is unused; the load only needs the input index. */
static void declare_input_vs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_vs(ctx, input_index, out);
}
753
754 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
755 unsigned swizzle)
756 {
757 if (swizzle > 0)
758 return ctx->i32_0;
759
760 switch (ctx->type) {
761 case PIPE_SHADER_VERTEX:
762 return LLVMGetParam(ctx->main_fn,
763 ctx->param_vs_prim_id);
764 case PIPE_SHADER_TESS_CTRL:
765 return LLVMGetParam(ctx->main_fn,
766 ctx->param_tcs_patch_id);
767 case PIPE_SHADER_TESS_EVAL:
768 return LLVMGetParam(ctx->main_fn,
769 ctx->param_tes_patch_id);
770 case PIPE_SHADER_GEOMETRY:
771 return ctx->abi.gs_prim_id;
772 default:
773 assert(0);
774 return ctx->i32_0;
775 }
776 }
777
778 /**
779 * Return the value of tgsi_ind_register for indexing.
780 * This is the indirect index with the constant offset added to it.
781 */
782 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
783 const struct tgsi_ind_register *ind,
784 unsigned addr_mul,
785 int rel_index)
786 {
787 LLVMValueRef result;
788
789 if (ind->File == TGSI_FILE_ADDRESS) {
790 result = ctx->addrs[ind->Index][ind->Swizzle];
791 result = LLVMBuildLoad(ctx->ac.builder, result, "");
792 } else {
793 struct tgsi_full_src_register src = {};
794
795 src.Register.File = ind->File;
796 src.Register.Index = ind->Index;
797
798 /* Set the second index to 0 for constants. */
799 if (ind->File == TGSI_FILE_CONSTANT)
800 src.Register.Dimension = 1;
801
802 result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src,
803 TGSI_TYPE_SIGNED,
804 ind->Swizzle);
805 result = ac_to_integer(&ctx->ac, result);
806 }
807
808 if (addr_mul != 1)
809 result = LLVMBuildMul(ctx->ac.builder, result,
810 LLVMConstInt(ctx->i32, addr_mul, 0), "");
811 result = LLVMBuildAdd(ctx->ac.builder, result,
812 LLVMConstInt(ctx->i32, rel_index, 0), "");
813 return result;
814 }
815
816 /**
817 * Like si_get_indirect_index, but restricts the return value to a (possibly
818 * undefined) value inside [0..num).
819 */
820 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
821 const struct tgsi_ind_register *ind,
822 int rel_index, unsigned num)
823 {
824 LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index);
825
826 return si_llvm_bound_index(ctx, result, num);
827 }
828
829
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst               destination register (used when \p src is NULL)
 * \param src               source register; takes precedence over \p dst
 * \param vertex_dw_stride  dword stride between vertices of a 2-D register
 * \param base_addr         starting dword address to offset from
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = si_get_indirect_index(ctx, &reg.DimIndirect,
						      1, reg.Dimension.Index);
		else
			index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);

		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
					 LLVMBuildMul(ctx->ac.builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* Indirect accesses are relative to the start of their
		 * declared array, if any. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = si_get_indirect_index(ctx, &reg.Indirect,
						  1, reg.Register.Index - first);

		/* Each IO slot occupies 4 dwords. */
		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
					 LLVMBuildMul(ctx->ac.builder, ind_index,
						      LLVMConstInt(ctx->i32, 4, 0), ""), "");

		/* 2-D registers are per-vertex; 1-D registers here are
		 * per-patch and use the smaller per-patch index space. */
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[first], index[first]) :
			si_shader_io_get_unique_index_patch(name[first], index[first]);
	} else {
		param = reg.Register.Dimension ?
			si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]) :
			si_shader_io_get_unique_index_patch(name[reg.Register.Index],
							    index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(ctx->ac.builder, base_addr,
			    LLVMConstInt(ctx->i32, param * 4, 0), "");
}
918
919 /* The offchip buffer layout for TCS->TES is
920 *
921 * - attribute 0 of patch 0 vertex 0
922 * - attribute 0 of patch 0 vertex 1
923 * - attribute 0 of patch 0 vertex 2
924 * ...
925 * - attribute 0 of patch 1 vertex 0
926 * - attribute 0 of patch 1 vertex 1
927 * ...
928 * - attribute 1 of patch 0 vertex 0
929 * - attribute 1 of patch 0 vertex 1
930 * ...
931 * - per patch attribute 0 of patch 0
932 * - per patch attribute 0 of patch 1
933 * ...
934 *
935 * Note that every attribute has 4 components.
936 */
/* Compute the address of a TCS/TES attribute in the off-chip buffer laid
 * out as described above. Passing vertex_index == NULL selects per-patch
 * addressing. */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef rel_patch_id,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = get_num_tcs_out_vertices(ctx);
	num_patches = unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
	total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
				      num_patches, "");

	constant16 = LLVMConstInt(ctx->i32, 16, 0);
	if (vertex_index) {
		/* Per-vertex attribute: linear vertex index within the draw;
		 * consecutive slots of one attribute span all vertices. */
		base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attribute: addressed by patch ID only. */
		base_addr = rel_patch_id;
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
				 LLVMBuildMul(ctx->ac.builder, param_index,
					      param_stride, ""), "");

	/* Every attribute is 16 bytes (4 x dword components). */
	base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data lives after all per-vertex attributes; the
		 * offset of that section comes from tcs_offchip_layout. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);

		base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
979
980 /* This is a generic helper that can be shared by the NIR and TGSI backends */
981 static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
982 struct si_shader_context *ctx,
983 LLVMValueRef vertex_index,
984 LLVMValueRef param_index,
985 unsigned param_base,
986 ubyte *name,
987 ubyte *index,
988 bool is_patch)
989 {
990 unsigned param_index_base;
991
992 param_index_base = is_patch ?
993 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) :
994 si_shader_io_get_unique_index(name[param_base], index[param_base]);
995
996 if (param_index) {
997 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
998 LLVMConstInt(ctx->i32, param_index_base, 0),
999 "");
1000 } else {
1001 param_index = LLVMConstInt(ctx->i32, param_index_base, 0);
1002 }
1003
1004 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
1005 vertex_index, param_index);
1006 }
1007
/* Compute the off-chip buffer address of a TCS/TES input or output
 * described by a TGSI register. Exactly one of \p dst / \p src is non-NULL;
 * the other register form is derived from it.
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
					struct si_shader_context *ctx,
					const struct tgsi_full_dst_register *dst,
					const struct tgsi_full_src_register *src)
{
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_base;

	/* Normalize to a source register so both forms share one code path. */
	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* A second dimension means a per-vertex register; extract the
	 * (possibly indirect) vertex index. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
							     1, reg.Dimension.Index);
		else
			vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* With a declared array, index relative to the array start;
		 * otherwise relative to the register itself. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = si_get_indirect_index(ctx, &reg.Indirect,
						    1, reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
	}

	/* No Dimension => per-patch register. */
	return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
							       param_index, param_base,
							       name, index, !reg.Register.Dimension);
}
1062
/**
 * Load a scalar component or a full vec4 from a buffer.
 *
 * \param type          scalar element type of the result
 * \param swizzle       component to load (0..3), or ~0 to load a whole vec4
 * \param buffer        v4i32 buffer resource descriptor
 * \param offset        forwarded to ac_build_buffer_load as its 6th
 *                      (scalar offset) argument
 * \param base          forwarded to ac_build_buffer_load as its 5th
 *                      (per-element offset) argument
 * \param can_speculate whether the load may be speculated
 *
 * NOTE(review): the parameter names are confusing — call sites in this file
 * pass (buffer, <ring offset>, <element address>), i.e. the ring offset binds
 * to "offset" and the computed element address binds to "base".
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				LLVMTypeRef type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base, bool can_speculate)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef value, value2;
	LLVMTypeRef vec_type = LLVMVectorType(type, 4);

	/* ~0 requests all four components at once. */
	if (swizzle == ~0) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
	}

	/* 32-bit scalar: load the vec4 and extract the requested channel. */
	if (!llvm_type_is_64bit(ctx, type)) {
		value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
					     0, 1, 0, can_speculate, false);

		value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
		return LLVMBuildExtractElement(ctx->ac.builder, value,
					LLVMConstInt(ctx->i32, swizzle, 0), "");
	}

	/* 64-bit scalar: load the two 32-bit halves separately and merge. */
	value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				     swizzle * 4, 1, 0, can_speculate, false);

	value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
				      swizzle * 4 + 4, 1, 0, can_speculate, false);

	return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
1096
1097 /**
1098 * Load from LDS.
1099 *
1100 * \param type output value type
1101 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
1102 * \param dw_addr address in dwords
1103 */
1104 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1105 LLVMTypeRef type, unsigned swizzle,
1106 LLVMValueRef dw_addr)
1107 {
1108 struct si_shader_context *ctx = si_shader_context(bld_base);
1109 LLVMValueRef value;
1110
1111 if (swizzle == ~0) {
1112 LLVMValueRef values[TGSI_NUM_CHANNELS];
1113
1114 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1115 values[chan] = lds_load(bld_base, type, chan, dw_addr);
1116
1117 return lp_build_gather_values(&ctx->gallivm, values,
1118 TGSI_NUM_CHANNELS);
1119 }
1120
1121 /* Split 64-bit loads. */
1122 if (llvm_type_is_64bit(ctx, type)) {
1123 LLVMValueRef lo, hi;
1124
1125 lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr);
1126 hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr);
1127 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
1128 }
1129
1130 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1131 LLVMConstInt(ctx->i32, swizzle, 0));
1132
1133 value = ac_lds_load(&ctx->ac, dw_addr);
1134
1135 return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1136 }
1137
/**
 * Store to LDS.
 *
 * \param dw_offset_imm  constant dword offset added to \p dw_addr
 * \param dw_addr        address in dwords
 * \param value          value to store
 */
static void lds_store(struct si_shader_context *ctx,
		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	dw_addr = lp_build_add(&ctx->bld_base.uint_bld, dw_addr,
			       LLVMConstInt(ctx->i32, dw_offset_imm, 0));

	ac_lds_store(&ctx->ac, dw_addr, value);
}
1154
/* Build a v4i32 buffer resource descriptor from an address given in units
 * of 64K: the shader parameter \p param holds (address >> 16).
 */
static LLVMValueRef desc_from_addr_base64k(struct si_shader_context *ctx,
					   unsigned param)
{
	LLVMBuilderRef builder = ctx->ac.builder;

	/* Reconstruct the 64-bit base address: param << 16. */
	LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
	addr = LLVMBuildZExt(builder, addr, ctx->i64, "");
	addr = LLVMBuildShl(builder, addr, LLVMConstInt(ctx->i64, 16, 0), "");

	/* Descriptor dwords 2-3: dword2 all-ones, dword3 selects a plain
	 * 32-bit float format with identity channel swizzle
	 * (see the S_008F0C_* field macros). */
	uint64_t desc2 = 0xffffffff;
	uint64_t desc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
	LLVMValueRef hi = LLVMConstInt(ctx->i64, desc2 | (desc3 << 32), 0);

	/* Assemble as <2 x i64> and reinterpret as the v4i32 descriptor. */
	LLVMValueRef desc = LLVMGetUndef(LLVMVectorType(ctx->i64, 2));
	desc = LLVMBuildInsertElement(builder, desc, addr, ctx->i32_0, "");
	desc = LLVMBuildInsertElement(builder, desc, hi, ctx->i32_1, "");
	return LLVMBuildBitCast(builder, desc, ctx->v4i32, "");
}
1178
1179 static LLVMValueRef fetch_input_tcs(
1180 struct lp_build_tgsi_context *bld_base,
1181 const struct tgsi_full_src_register *reg,
1182 enum tgsi_opcode_type type, unsigned swizzle)
1183 {
1184 struct si_shader_context *ctx = si_shader_context(bld_base);
1185 LLVMValueRef dw_addr, stride;
1186
1187 stride = get_tcs_in_vertex_dw_stride(ctx);
1188 dw_addr = get_tcs_in_current_patch_offset(ctx);
1189 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1190
1191 return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1192 }
1193
1194 static LLVMValueRef fetch_output_tcs(
1195 struct lp_build_tgsi_context *bld_base,
1196 const struct tgsi_full_src_register *reg,
1197 enum tgsi_opcode_type type, unsigned swizzle)
1198 {
1199 struct si_shader_context *ctx = si_shader_context(bld_base);
1200 LLVMValueRef dw_addr, stride;
1201
1202 if (reg->Register.Dimension) {
1203 stride = get_tcs_out_vertex_dw_stride(ctx);
1204 dw_addr = get_tcs_out_current_patch_offset(ctx);
1205 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1206 } else {
1207 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1208 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1209 }
1210
1211 return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1212 }
1213
1214 static LLVMValueRef fetch_input_tes(
1215 struct lp_build_tgsi_context *bld_base,
1216 const struct tgsi_full_src_register *reg,
1217 enum tgsi_opcode_type type, unsigned swizzle)
1218 {
1219 struct si_shader_context *ctx = si_shader_context(bld_base);
1220 LLVMValueRef buffer, base, addr;
1221
1222 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
1223
1224 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1225 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1226
1227 return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle,
1228 buffer, base, addr, true);
1229 }
1230
/* Store a TCS output.
 *
 * Outputs are written to LDS (for later reads by the TCS itself or the
 * epilog) and to the off-chip buffer (read by the TES), except that tess
 * factors are not written off-chip here. Tess factors may additionally be
 * stashed in VGPR allocas for the epilog.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     unsigned index,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_dst_register *reg = &inst->Dst[index];
	const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef buffer, base, buf_addr;
	LLVMValueRef values[4];
	bool skip_lds_store;
	bool is_tess_factor = false, is_tess_inner = false;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		si_llvm_emit_store(bld_base, inst, info, index, dst);
		return;
	}

	/* Compute the LDS address and decide whether the LDS store can be
	 * skipped (no later read of this output class). */
	if (reg->Register.Dimension) {
		stride = get_tcs_out_vertex_dw_stride(ctx);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
		skip_lds_store = !sh_info->reads_pervertex_outputs;
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
		skip_lds_store = !sh_info->reads_perpatch_outputs;

		if (!reg->Register.Indirect) {
			int name = sh_info->output_semantic_name[reg->Register.Index];

			/* Always write tess factors into LDS for the TCS epilog. */
			if (name == TGSI_SEMANTIC_TESSINNER ||
			    name == TGSI_SEMANTIC_TESSOUTER) {
				/* The epilog doesn't read LDS if invocation 0 defines tess factors. */
				skip_lds_store = !sh_info->reads_tessfactor_outputs &&
						 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
				is_tess_factor = true;
				is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
			}
		}
	}

	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

	base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);

	uint32_t writemask = reg->Register.WriteMask;
	while (writemask) {
		chan_index = u_bit_scan(&writemask);
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = ac_build_clamp(&ctx->ac, value);

		/* Skip LDS stores if there is no LDS read of this output. */
		if (!skip_lds_store)
			lds_store(ctx, chan_index, dw_addr, value);

		value = ac_to_integer(&ctx->ac, value);
		values[chan_index] = value;

		/* Partial writemask: store each written dword individually.
		 * (A full 0xF mask is stored as one vec4 after the loop.) */
		if (reg->Register.WriteMask != 0xF && !is_tess_factor) {
			ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
						    buf_addr, base,
						    4 * chan_index, 1, 0, true, false);
		}

		/* Write tess factors into VGPRs for the epilog. */
		if (is_tess_factor &&
		    ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
			if (!is_tess_inner) {
				LLVMBuildStore(ctx->ac.builder, value, /* outer */
					       ctx->invoc0_tess_factors[chan_index]);
			} else if (chan_index < 2) {
				LLVMBuildStore(ctx->ac.builder, value, /* inner */
					       ctx->invoc0_tess_factors[4 + chan_index]);
			}
		}
	}

	/* Full writemask: one combined vec4 store off-chip. */
	if (reg->Register.WriteMask == 0xF && !is_tess_factor) {
		LLVMValueRef value = lp_build_gather_values(&ctx->gallivm,
							    values, 4);
		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
					    base, 0, 1, 0, true, false);
	}
}
1327
/* Load a GS input for one vertex.
 *
 * On GFX9 the ESGS ring lives in LDS and the per-vertex offsets arrive as
 * three SGPRs, each packing two 16-bit offsets. On older chips the input
 * is loaded from the ESGS ring buffer in memory.
 *
 * \param input_index       index into the shader's input semantics
 * \param vtx_offset_param  which vertex of the primitive (0..5)
 * \param type              result element type
 * \param swizzle           component (0..3), or ~0 for a full vec4
 */
LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
				   unsigned input_index,
				   unsigned vtx_offset_param,
				   LLVMTypeRef type,
				   unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	LLVMValueRef vtx_offset, soffset;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[input_index];
	unsigned semantic_index = info->input_semantic_index[input_index];
	unsigned param;
	LLVMValueRef value;

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);

	/* GFX9 has the ESGS ring in LDS. */
	if (ctx->screen->info.chip_class >= GFX9) {
		unsigned index = vtx_offset_param;

		/* Each SGPR packs two 16-bit vertex offsets. */
		switch (index / 2) {
		case 0:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx01_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 1:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx23_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		case 2:
			vtx_offset = unpack_param(ctx, ctx->param_gs_vtx45_offset,
						  index % 2 ? 16 : 0, 16);
			break;
		default:
			assert(0);
			return NULL;
		}

		vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
					  LLVMConstInt(ctx->i32, param * 4, 0), "");
		return lds_load(bld_base, type, swizzle, vtx_offset);
	}

	/* GFX6: input load from the ESGS ring in memory. */
	if (swizzle == ~0) {
		/* vec4 request: recurse per channel and gather. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
							     type, chan);
		}
		return lp_build_gather_values(&ctx->gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter on GFX6. */
	LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param];

	vtx_offset = lp_build_mul_imm(uint, gs_vtx_offset, 4);

	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);

	value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
				     vtx_offset, soffset, 0, 1, 0, true, false);
	if (llvm_type_is_64bit(ctx, type)) {
		/* Load the second dword and merge the two halves. */
		LLVMValueRef value2;
		soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);

		value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
					      ctx->i32_0, vtx_offset, soffset,
					      0, 1, 0, true, false);
		return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}
	return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
}
1406
1407 static LLVMValueRef fetch_input_gs(
1408 struct lp_build_tgsi_context *bld_base,
1409 const struct tgsi_full_src_register *reg,
1410 enum tgsi_opcode_type type,
1411 unsigned swizzle)
1412 {
1413 struct si_shader_context *ctx = si_shader_context(bld_base);
1414 struct tgsi_shader_info *info = &ctx->shader->selector->info;
1415
1416 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1417 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1418 return get_primitive_id(ctx, swizzle);
1419
1420 if (!reg->Register.Dimension)
1421 return NULL;
1422
1423 return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index,
1424 reg->Dimension.Index,
1425 tgsi2llvmtype(bld_base, type),
1426 swizzle);
1427 }
1428
1429 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1430 {
1431 switch (interpolate) {
1432 case TGSI_INTERPOLATE_CONSTANT:
1433 return 0;
1434
1435 case TGSI_INTERPOLATE_LINEAR:
1436 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1437 return SI_PARAM_LINEAR_SAMPLE;
1438 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1439 return SI_PARAM_LINEAR_CENTROID;
1440 else
1441 return SI_PARAM_LINEAR_CENTER;
1442 break;
1443 case TGSI_INTERPOLATE_COLOR:
1444 case TGSI_INTERPOLATE_PERSPECTIVE:
1445 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1446 return SI_PARAM_PERSP_SAMPLE;
1447 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1448 return SI_PARAM_PERSP_CENTROID;
1449 else
1450 return SI_PARAM_PERSP_CENTER;
1451 break;
1452 default:
1453 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1454 return -1;
1455 }
1456 }
1457
1458 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1459 unsigned attr_index, unsigned chan,
1460 LLVMValueRef prim_mask,
1461 LLVMValueRef i, LLVMValueRef j)
1462 {
1463 if (i || j) {
1464 return ac_build_fs_interp(&ctx->ac,
1465 LLVMConstInt(ctx->i32, chan, 0),
1466 LLVMConstInt(ctx->i32, attr_index, 0),
1467 prim_mask, i, j);
1468 }
1469 return ac_build_fs_interp_mov(&ctx->ac,
1470 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1471 LLVMConstInt(ctx->i32, chan, 0),
1472 LLVMConstInt(ctx->i32, attr_index, 0),
1473 prim_mask);
1474 }
1475
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j); NULL for flat shading
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	LLVMValueRef i = NULL, j = NULL;
	unsigned chan;

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 *
	 * When interp is false we will use fs.constant or for newer llvm,
	 * amdgcn.interp.mov.
	 */
	bool interp = interp_param != NULL;

	if (interp) {
		/* Split the (i, j) weights out of the 2-element vector. */
		interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
						LLVMVectorType(ctx->f32, 2), "");

		i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
					    ctx->i32_0, "");
		j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
					    ctx->i32_1, "");
	}

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.part.ps.prolog.color_two_side) {
		LLVMValueRef is_face_positive;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
						 face, ctx->i32_0, "");

		/* Interpolate both colors and select by facedness. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef front, back;

			front = si_build_fs_interp(ctx,
						   input_index, chan,
						   prim_mask, i, j);
			back = si_build_fs_interp(ctx,
						  back_attr_offset, chan,
						  prim_mask, i, j);

			result[chan] = LLVMBuildSelect(ctx->ac.builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Only .x is interpolated; .yz are 0 and .w is 1. */
		result[0] = si_build_fs_interp(ctx, input_index,
					       0, prim_mask, i, j);
		result[1] =
		result[2] = LLVMConstReal(ctx->f32, 0.0f);
		result[3] = LLVMConstReal(ctx->f32, 1.0f);
	} else {
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			result[chan] = si_build_fs_interp(ctx,
							  input_index, chan,
							  prim_mask, i, j);
		}
	}
}
1573
1574 void si_llvm_load_input_fs(
1575 struct si_shader_context *ctx,
1576 unsigned input_index,
1577 LLVMValueRef out[4])
1578 {
1579 struct lp_build_context *base = &ctx->bld_base.base;
1580 struct si_shader *shader = ctx->shader;
1581 struct tgsi_shader_info *info = &shader->selector->info;
1582 LLVMValueRef main_fn = ctx->main_fn;
1583 LLVMValueRef interp_param = NULL;
1584 int interp_param_idx;
1585 enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
1586 unsigned semantic_index = info->input_semantic_index[input_index];
1587 enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
1588 enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
1589
1590 /* Get colors from input VGPRs (set by the prolog). */
1591 if (semantic_name == TGSI_SEMANTIC_COLOR) {
1592 unsigned colors_read = shader->selector->info.colors_read;
1593 unsigned mask = colors_read >> (semantic_index * 4);
1594 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1595 (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
1596
1597 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1598 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1599 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1600 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1601 return;
1602 }
1603
1604 interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
1605 if (interp_param_idx == -1)
1606 return;
1607 else if (interp_param_idx) {
1608 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1609 }
1610
1611 interp_fs_input(ctx, input_index, semantic_name,
1612 semantic_index, 0, /* this param is unused */
1613 shader->selector->info.colors_read, interp_param,
1614 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1615 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1616 &out[0]);
1617 }
1618
/* TGSI declaration hook for PS inputs; all the work happens in
 * si_llvm_load_input_fs (the declaration itself is unused here). */
static void declare_input_fs(
	struct si_shader_context *ctx,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	si_llvm_load_input_fs(ctx, input_index, out);
}
1627
/* Return the sample ID: bits [11:8] of the ancillary PS input. */
static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
}
1632
1633
/**
 * Load a dword from a constant buffer.
 *
 * \param resource  v4i32 buffer resource descriptor
 * \param offset    byte offset into the buffer
 */
static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
				      LLVMValueRef resource,
				      LLVMValueRef offset)
{
	return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
				    0, 0, 0, true, true);
}
1644
1645 static LLVMValueRef load_sample_position(struct si_shader_context *ctx, LLVMValueRef sample_id)
1646 {
1647 struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
1648 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1649 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1650 LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
1651
1652 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1653 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1654 LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1655
1656 LLVMValueRef pos[4] = {
1657 buffer_load_const(ctx, resource, offset0),
1658 buffer_load_const(ctx, resource, offset1),
1659 LLVMConstReal(ctx->f32, 0),
1660 LLVMConstReal(ctx->f32, 0)
1661 };
1662
1663 return lp_build_gather_values(&ctx->gallivm, pos, 4);
1664 }
1665
/* Translate a TGSI system-value declaration into an LLVM value and store
 * it in ctx->system_values[index] for later fetches. */
void si_load_system_value(struct si_shader_context *ctx,
			  unsigned index,
			  const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *bld = &ctx->bld_base.base;
	LLVMValueRef value = 0;

	assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = ctx->abi.instance_id;
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* The hardware vertex ID is relative; add the base vertex. */
		value = LLVMBuildAdd(ctx->ac.builder,
				     ctx->abi.vertex_id,
				     ctx->abi.base_vertex, "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		/* Unused. Clarify the meaning in indexed vs. non-indexed
		 * draws if this is ever used again. */
		assert(false);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
	{
		/* For non-indexed draws, the base vertex set by the driver
		 * (for direct draws) or the CP (for indirect draws) is the
		 * first vertex ID, but GLSL expects 0 to be returned.
		 */
		LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn, ctx->param_vs_state_bits);
		LLVMValueRef indexed;

		/* Bit 1 of the VS state bits flags an indexed draw. */
		indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
		indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");

		value = LLVMBuildSelect(ctx->ac.builder, indexed,
					ctx->abi.base_vertex, ctx->i32_0, "");
		break;
	}

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = ctx->abi.start_instance;
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = ctx->abi.draw_id;
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			/* TCS: invocation ID is in bits [12:8] of rel_ids. */
			value = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = ctx->abi.gs_invocation_id;
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord: .w is delivered as 1/w, so take its
		 * reciprocal. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(&ctx->gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = ctx->abi.front_face;
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(ctx);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position = fractional part of the pixel-center
		 * fragment coordinate. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMConstReal(ctx->f32, 0),
			LLVMConstReal(ctx->f32, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(&ctx->gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
			LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
			ctx->ac.f32_0,
			ctx->ac.f32_0
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, ctx->ac.f32_1,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(&ctx->gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			/* TCS: bits [31:26] of the output LDS layout. */
			value = unpack_param(ctx, ctx->param_tcs_out_lds_layout, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = get_num_tcs_out_vertices(ctx);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess factors are read back from the off-chip TCS output
		 * buffer at their per-patch slot. */
		LLVMValueRef buffer, base, addr;
		int param = si_shader_io_get_unique_index_patch(decl->Semantic.Name, 0);

		buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);

		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
		addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
						  LLVMConstInt(ctx->i32, param, 0));

		value = buffer_load(&ctx->bld_base, ctx->f32,
				    ~0, buffer, base, addr, true);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dwords 0..3, inner at 4..7. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
		buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
		buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
		value = lp_build_gather_values(&ctx->gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(ctx, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(ctx->main_fn, ctx->param_grid_size);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		/* A fixed block size is known at compile time; otherwise it
		 * is passed in as a parameter. */
		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);

			value = lp_build_gather_values(&ctx->gallivm, values, 3);
		} else {
			value = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
	{
		LLVMValueRef values[3];

		/* Unused block-ID components have no parameter; default to 0. */
		for (int i = 0; i < 3; i++) {
			values[i] = ctx->i32_0;
			if (ctx->param_block_id[i] >= 0) {
				values[i] = LLVMGetParam(ctx->main_fn,
							 ctx->param_block_id[i]);
			}
		}
		value = lp_build_gather_values(&ctx->gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(ctx->main_fn, ctx->param_thread_id);
		break;

	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* Helper invocation = NOT ps.live, sign-extended to i32. */
		value = lp_build_intrinsic(ctx->ac.builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LP_FUNC_ATTR_READNONE);
		value = LLVMBuildNot(ctx->ac.builder, value, "");
		value = LLVMBuildSExt(ctx->ac.builder, value, ctx->i32, "");
		break;

	case TGSI_SEMANTIC_SUBGROUP_SIZE:
		value = LLVMConstInt(ctx->i32, 64, 0);
		break;

	case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
		value = ac_get_thread_id(&ctx->ac);
		break;

	case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
	{
		/* 1 << thread_id, returned as a v2i32 64-bit mask. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
		value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
		value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
		break;
	}

	case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
	case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
	{
		/* GE = ~0 << id; GT = ~1 << id; LE/LT are the complements. */
		LLVMValueRef id = ac_get_thread_id(&ctx->ac);
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
			/* All bits set except LSB */
			value = LLVMConstInt(ctx->i64, -2, 0);
		} else {
			/* All bits set */
			value = LLVMConstInt(ctx->i64, -1, 0);
		}
		id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
		value = LLVMBuildShl(ctx->ac.builder, value, id, "");
		if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
		    decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
			value = LLVMBuildNot(ctx->ac.builder, value, "");
		value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
		break;
	}

	default:
		assert(!"unknown system value");
		return;
	}

	ctx->system_values[index] = value;
}
1940
1941 void si_declare_compute_memory(struct si_shader_context *ctx,
1942 const struct tgsi_full_declaration *decl)
1943 {
1944 struct si_shader_selector *sel = ctx->shader->selector;
1945
1946 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1947 LLVMValueRef var;
1948
1949 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1950 assert(decl->Range.First == decl->Range.Last);
1951 assert(!ctx->ac.lds);
1952
1953 var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
1954 LLVMArrayType(ctx->i8, sel->local_size),
1955 "compute_lds",
1956 LOCAL_ADDR_SPACE);
1957 LLVMSetAlignment(var, 4);
1958
1959 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
1960 }
1961
1962 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1963 {
1964 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
1965 ctx->param_const_and_shader_buffers);
1966
1967 return ac_build_load_to_sgpr(&ctx->ac, list_ptr,
1968 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
1969 }
1970
1971 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
1972 {
1973 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1974 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
1975
1976 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
1977 index = LLVMBuildAdd(ctx->ac.builder, index,
1978 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
1979
1980 return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
1981 }
1982
1983 static LLVMValueRef
1984 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
1985 {
1986 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1987 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
1988 ctx->param_const_and_shader_buffers);
1989
1990 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
1991 index = LLVMBuildSub(ctx->ac.builder,
1992 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
1993 index, "");
1994
1995 return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
1996 }
1997
/* Fetch one channel of a TGSI CONST-file source register.
 *
 * Recurses to handle LP_CHAN_ALL (gathers all 4 channels) and 64-bit
 * types (two 32-bit loads combined). When only constant buffer 0 is
 * declared and no shader buffers exist, takes a fast path that reads
 * through the user-data SGPR pointer; otherwise loads the buffer
 * descriptor (directly or with a bounded indirect index) and performs
 * a buffer load.
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader_selector *sel = ctx->shader->selector;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;

	/* Whole-register fetch: recurse per channel and gather. */
	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(&ctx->gallivm, values, 4);
	}

	/* Split 64-bit loads. */
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef lo, hi;

		lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
		hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle + 1);
		return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
						lo, hi);
	}

	/* Byte offset of the selected dword within the constant buffer. */
	idx = reg->Register.Index * 4 + swizzle;
	if (reg->Register.Indirect) {
		addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	/* Fast path when user data SGPRs point to constant buffer 0 directly. */
	if (sel->info.const_buffers_declared == 1 &&
	    sel->info.shader_buffers_declared == 0) {
		LLVMValueRef ptr =
			LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);

		/* This enables use of s_load_dword and flat_load_dword for const buffer 0
		 * loads, and up to x4 load opcode merging. However, it leads to horrible
		 * code reducing SIMD wave occupancy from 8 to 2 in many cases.
		 *
		 * Using s_buffer_load_dword (x1) seems to be the best option right now.
		 *
		 * LLVM 5.0 on SI doesn't insert a required s_nop between SALU setting
		 * a descriptor and s_buffer_load_dword using it, so we can't expand
		 * the pointer into a full descriptor like below. We have to use
		 * s_load_dword instead. The only case when LLVM 5.0 would select
		 * s_buffer_load_dword (that we have to prevent) is when we use use
		 * a literal offset where we don't need bounds checking.
		 */
		if (ctx->screen->info.chip_class == SI &&
		    HAVE_LLVM < 0x0600 &&
		    !reg->Register.Indirect) {
			/* ac_build_load_invariant indexes in dwords, so
			 * convert the byte offset. */
			addr = LLVMBuildLShr(ctx->ac.builder, addr, LLVMConstInt(ctx->i32, 2, 0), "");
			LLVMValueRef result = ac_build_load_invariant(&ctx->ac, ptr, addr);
			return bitcast(bld_base, type, result);
		}

		/* Do the bounds checking with a descriptor, because
		 * doing computation and manual bounds checking of 64-bit
		 * addresses generates horrible VALU code with very high
		 * VGPR usage and very low SIMD occupancy.
		 */
		ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->i64, "");
		ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->v2i32, "");

		/* Synthesize a full buffer descriptor from the 64-bit
		 * pointer: base address (2 dwords), size in bytes, and
		 * a 32-bit float format word. */
		LLVMValueRef desc_elems[] = {
			LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_0, ""),
			LLVMBuildExtractElement(ctx->ac.builder, ptr, ctx->i32_1, ""),
			LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
			LLVMConstInt(ctx->i32,
				S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
				S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
				S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
		};
		LLVMValueRef desc = ac_build_gather_values(&ctx->ac, desc_elems, 4);
		LLVMValueRef result = buffer_load_const(ctx, desc, addr);
		return bitcast(bld_base, type, result);
	}

	assert(reg->Register.Dimension);
	buf = reg->Dimension.Index;

	if (reg->Dimension.Indirect) {
		LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
		LLVMValueRef index;
		/* Constant buffers come after the shader buffers in the
		 * descriptor list, hence the SI_NUM_SHADER_BUFFERS offset. */
		index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
						      reg->Dimension.Index,
						      ctx->num_const_buffers);
		index = LLVMBuildAdd(ctx->ac.builder, index,
				     LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
		bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
}
2106
2107 /* Upper 16 bits must be zero. */
2108 static LLVMValueRef si_llvm_pack_two_int16(struct si_shader_context *ctx,
2109 LLVMValueRef val[2])
2110 {
2111 return LLVMBuildOr(ctx->ac.builder, val[0],
2112 LLVMBuildShl(ctx->ac.builder, val[1],
2113 LLVMConstInt(ctx->i32, 16, 0),
2114 ""), "");
2115 }
2116
2117 /* Upper 16 bits are ignored and will be dropped. */
2118 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct si_shader_context *ctx,
2119 LLVMValueRef val[2])
2120 {
2121 LLVMValueRef v[2] = {
2122 LLVMBuildAnd(ctx->ac.builder, val[0],
2123 LLVMConstInt(ctx->i32, 0xffff, 0), ""),
2124 val[1],
2125 };
2126 return si_llvm_pack_two_int16(ctx, v);
2127 }
2128
2129 /* Initialize arguments for the shader export intrinsic */
2130 static void si_llvm_init_export_args(struct si_shader_context *ctx,
2131 LLVMValueRef *values,
2132 unsigned target,
2133 struct ac_export_args *args)
2134 {
2135 LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
2136 LLVMBuilderRef builder = ctx->ac.builder;
2137 LLVMValueRef val[4];
2138 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
2139 unsigned chan;
2140 bool is_int8, is_int10;
2141
2142 /* Default is 0xf. Adjusted below depending on the format. */
2143 args->enabled_channels = 0xf; /* writemask */
2144
2145 /* Specify whether the EXEC mask represents the valid mask */
2146 args->valid_mask = 0;
2147
2148 /* Specify whether this is the last export */
2149 args->done = 0;
2150
2151 /* Specify the target we are exporting */
2152 args->target = target;
2153
2154 if (ctx->type == PIPE_SHADER_FRAGMENT) {
2155 const struct si_shader_key *key = &ctx->shader->key;
2156 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2157 int cbuf = target - V_008DFC_SQ_EXP_MRT;
2158
2159 assert(cbuf >= 0 && cbuf < 8);
2160 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2161 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2162 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2163 }
2164
2165 args->compr = false;
2166 args->out[0] = f32undef;
2167 args->out[1] = f32undef;
2168 args->out[2] = f32undef;
2169 args->out[3] = f32undef;
2170
2171 switch (spi_shader_col_format) {
2172 case V_028714_SPI_SHADER_ZERO:
2173 args->enabled_channels = 0; /* writemask */
2174 args->target = V_008DFC_SQ_EXP_NULL;
2175 break;
2176
2177 case V_028714_SPI_SHADER_32_R:
2178 args->enabled_channels = 1; /* writemask */
2179 args->out[0] = values[0];
2180 break;
2181
2182 case V_028714_SPI_SHADER_32_GR:
2183 args->enabled_channels = 0x3; /* writemask */
2184 args->out[0] = values[0];
2185 args->out[1] = values[1];
2186 break;
2187
2188 case V_028714_SPI_SHADER_32_AR:
2189 args->enabled_channels = 0x9; /* writemask */
2190 args->out[0] = values[0];
2191 args->out[3] = values[3];
2192 break;
2193
2194 case V_028714_SPI_SHADER_FP16_ABGR:
2195 args->compr = 1; /* COMPR flag */
2196
2197 for (chan = 0; chan < 2; chan++) {
2198 LLVMValueRef pack_args[2] = {
2199 values[2 * chan],
2200 values[2 * chan + 1]
2201 };
2202 LLVMValueRef packed;
2203
2204 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
2205 args->out[chan] = ac_to_float(&ctx->ac, packed);
2206 }
2207 break;
2208
2209 case V_028714_SPI_SHADER_UNORM16_ABGR:
2210 for (chan = 0; chan < 4; chan++) {
2211 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
2212 val[chan] = LLVMBuildFMul(builder, val[chan],
2213 LLVMConstReal(ctx->f32, 65535), "");
2214 val[chan] = LLVMBuildFAdd(builder, val[chan],
2215 LLVMConstReal(ctx->f32, 0.5), "");
2216 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2217 ctx->i32, "");
2218 }
2219
2220 args->compr = 1; /* COMPR flag */
2221 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val));
2222 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2));
2223 break;
2224
2225 case V_028714_SPI_SHADER_SNORM16_ABGR:
2226 for (chan = 0; chan < 4; chan++) {
2227 /* Clamp between [-1, 1]. */
2228 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MIN,
2229 values[chan],
2230 LLVMConstReal(ctx->f32, 1));
2231 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_MAX,
2232 val[chan],
2233 LLVMConstReal(ctx->f32, -1));
2234 /* Convert to a signed integer in [-32767, 32767]. */
2235 val[chan] = LLVMBuildFMul(builder, val[chan],
2236 LLVMConstReal(ctx->f32, 32767), "");
2237 /* If positive, add 0.5, else add -0.5. */
2238 val[chan] = LLVMBuildFAdd(builder, val[chan],
2239 LLVMBuildSelect(builder,
2240 LLVMBuildFCmp(builder, LLVMRealOGE,
2241 val[chan], ctx->ac.f32_0, ""),
2242 LLVMConstReal(ctx->f32, 0.5),
2243 LLVMConstReal(ctx->f32, -0.5), ""), "");
2244 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2245 }
2246
2247 args->compr = 1; /* COMPR flag */
2248 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val));
2249 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2));
2250 break;
2251
2252 case V_028714_SPI_SHADER_UINT16_ABGR: {
2253 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2254 is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
2255 LLVMValueRef max_alpha =
2256 !is_int10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2257
2258 /* Clamp. */
2259 for (chan = 0; chan < 4; chan++) {
2260 val[chan] = ac_to_integer(&ctx->ac, values[chan]);
2261 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base, TGSI_OPCODE_UMIN,
2262 val[chan],
2263 chan == 3 ? max_alpha : max_rgb);
2264 }
2265
2266 args->compr = 1; /* COMPR flag */
2267 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val));
2268 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int16(ctx, val+2));
2269 break;
2270 }
2271
2272 case V_028714_SPI_SHADER_SINT16_ABGR: {
2273 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2274 is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
2275 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2276 is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
2277 LLVMValueRef max_alpha =
2278 !is_int10 ? max_rgb : ctx->i32_1;
2279 LLVMValueRef min_alpha =
2280 !is_int10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2281
2282 /* Clamp. */
2283 for (chan = 0; chan < 4; chan++) {
2284 val[chan] = ac_to_integer(&ctx->ac, values[chan]);
2285 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base,
2286 TGSI_OPCODE_IMIN,
2287 val[chan], chan == 3 ? max_alpha : max_rgb);
2288 val[chan] = lp_build_emit_llvm_binary(&ctx->bld_base,
2289 TGSI_OPCODE_IMAX,
2290 val[chan], chan == 3 ? min_alpha : min_rgb);
2291 }
2292
2293 args->compr = 1; /* COMPR flag */
2294 args->out[0] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val));
2295 args->out[1] = ac_to_float(&ctx->ac, si_llvm_pack_two_int32_as_int16(ctx, val+2));
2296 break;
2297 }
2298
2299 case V_028714_SPI_SHADER_32_ABGR:
2300 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2301 break;
2302 }
2303 }
2304
2305 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2306 LLVMValueRef alpha)
2307 {
2308 struct si_shader_context *ctx = si_shader_context(bld_base);
2309
2310 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2311 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
2312 [PIPE_FUNC_LESS] = LLVMRealOLT,
2313 [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
2314 [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
2315 [PIPE_FUNC_GREATER] = LLVMRealOGT,
2316 [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
2317 [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
2318 };
2319 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
2320 assert(cond);
2321
2322 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2323 SI_PARAM_ALPHA_REF);
2324 LLVMValueRef alpha_pass =
2325 LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
2326 ac_build_kill_if_false(&ctx->ac, alpha_pass);
2327 } else {
2328 ac_build_kill_if_false(&ctx->ac, LLVMConstInt(ctx->i1, 0, 0));
2329 }
2330 }
2331
2332 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2333 LLVMValueRef alpha,
2334 unsigned samplemask_param)
2335 {
2336 struct si_shader_context *ctx = si_shader_context(bld_base);
2337 LLVMValueRef coverage;
2338
2339 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2340 coverage = LLVMGetParam(ctx->main_fn,
2341 samplemask_param);
2342 coverage = ac_to_integer(&ctx->ac, coverage);
2343
2344 coverage = lp_build_intrinsic(ctx->ac.builder, "llvm.ctpop.i32",
2345 ctx->i32,
2346 &coverage, 1, LP_FUNC_ATTR_READNONE);
2347
2348 coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
2349 ctx->f32, "");
2350
2351 coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
2352 LLVMConstReal(ctx->f32,
2353 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2354
2355 return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
2356 }
2357
/* Compute user clip distances from a written CLIPVERTEX and fill in the
 * two clip-distance position exports (POS+2 and POS+3).
 *
 * Each of the 8 clip distances is the dot product of the clip vertex
 * (out_elts) with one vec4 clip plane read from the
 * SI_VS_CONST_CLIP_PLANES constant buffer.
 *
 * \param pos       export args array; entries [2] and [3] are filled in
 * \param out_elts  the 4 f32 components of the clip vertex
 */
static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
				    struct ac_export_args *pos, LLVMValueRef *out_elts)
{
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
	LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
						   SI_VS_CONST_CLIP_PLANES, 0);
	LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);

	/* Two exports of 4 clip distances each. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		struct ac_export_args *args = &pos[2 + reg_index];

		args->out[0] =
		args->out[1] =
		args->out[2] =
		args->out[3] = LLVMConstReal(ctx->f32, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte address of component const_chan of
				 * clip plane (reg_index * 4 + chan). */
				LLVMValueRef addr =
					LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
								const_chan) * 4, 0);
				base_elt = buffer_load_const(ctx, const_resource,
							     addr);
				args->out[chan] =
					lp_build_add(&ctx->bld_base.base, args->out[chan],
						     lp_build_mul(&ctx->bld_base.base, base_elt,
								  out_elts[const_chan]));
			}
		}

		args->enabled_channels = 0xf;
		args->valid_mask = 0;
		args->done = 0;
		args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
		args->compr = 0;
	}
}
2400
2401 static void si_dump_streamout(struct pipe_stream_output_info *so)
2402 {
2403 unsigned i;
2404
2405 if (so->num_outputs)
2406 fprintf(stderr, "STREAMOUT\n");
2407
2408 for (i = 0; i < so->num_outputs; i++) {
2409 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2410 so->output[i].start_component;
2411 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2412 i, so->output[i].output_buffer,
2413 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2414 so->output[i].register_index,
2415 mask & 1 ? "x" : "",
2416 mask & 2 ? "y" : "",
2417 mask & 4 ? "z" : "",
2418 mask & 8 ? "w" : "");
2419 }
2420 }
2421
2422 static void emit_streamout_output(struct si_shader_context *ctx,
2423 LLVMValueRef const *so_buffers,
2424 LLVMValueRef const *so_write_offsets,
2425 struct pipe_stream_output *stream_out,
2426 struct si_shader_output_values *shader_out)
2427 {
2428 unsigned buf_idx = stream_out->output_buffer;
2429 unsigned start = stream_out->start_component;
2430 unsigned num_comps = stream_out->num_components;
2431 LLVMValueRef out[4];
2432
2433 assert(num_comps && num_comps <= 4);
2434 if (!num_comps || num_comps > 4)
2435 return;
2436
2437 /* Load the output as int. */
2438 for (int j = 0; j < num_comps; j++) {
2439 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2440
2441 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
2442 }
2443
2444 /* Pack the output. */
2445 LLVMValueRef vdata = NULL;
2446
2447 switch (num_comps) {
2448 case 1: /* as i32 */
2449 vdata = out[0];
2450 break;
2451 case 2: /* as v2i32 */
2452 case 3: /* as v4i32 (aligned to 4) */
2453 case 4: /* as v4i32 */
2454 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2455 for (int j = 0; j < num_comps; j++) {
2456 vdata = LLVMBuildInsertElement(ctx->ac.builder, vdata, out[j],
2457 LLVMConstInt(ctx->i32, j, 0), "");
2458 }
2459 break;
2460 }
2461
2462 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2463 vdata, num_comps,
2464 so_write_offsets[buf_idx],
2465 ctx->i32_0,
2466 stream_out->dst_offset * 4, 1, 1, true, false);
2467 }
2468
2469 /**
2470 * Write streamout data to buffers for vertex stream @p stream (different
2471 * vertex streams can occur for GS copy shaders).
2472 */
2473 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2474 struct si_shader_output_values *outputs,
2475 unsigned noutput, unsigned stream)
2476 {
2477 struct si_shader_selector *sel = ctx->shader->selector;
2478 struct pipe_stream_output_info *so = &sel->so;
2479 LLVMBuilderRef builder = ctx->ac.builder;
2480 int i;
2481 struct lp_build_if_state if_ctx;
2482
2483 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2484 LLVMValueRef so_vtx_count =
2485 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2486
2487 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2488
2489 /* can_emit = tid < so_vtx_count; */
2490 LLVMValueRef can_emit =
2491 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2492
2493 /* Emit the streamout code conditionally. This actually avoids
2494 * out-of-bounds buffer access. The hw tells us via the SGPR
2495 * (so_vtx_count) which threads are allowed to emit streamout data. */
2496 lp_build_if(&if_ctx, &ctx->gallivm, can_emit);
2497 {
2498 /* The buffer offset is computed as follows:
2499 * ByteOffset = streamout_offset[buffer_id]*4 +
2500 * (streamout_write_index + thread_id)*stride[buffer_id] +
2501 * attrib_offset
2502 */
2503
2504 LLVMValueRef so_write_index =
2505 LLVMGetParam(ctx->main_fn,
2506 ctx->param_streamout_write_index);
2507
2508 /* Compute (streamout_write_index + thread_id). */
2509 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2510
2511 /* Load the descriptor and compute the write offset for each
2512 * enabled buffer. */
2513 LLVMValueRef so_write_offset[4] = {};
2514 LLVMValueRef so_buffers[4];
2515 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2516 ctx->param_rw_buffers);
2517
2518 for (i = 0; i < 4; i++) {
2519 if (!so->stride[i])
2520 continue;
2521
2522 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2523 SI_VS_STREAMOUT_BUF0 + i, 0);
2524
2525 so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
2526
2527 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2528 ctx->param_streamout_offset[i]);
2529 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2530
2531 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2532 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2533 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2534 }
2535
2536 /* Write streamout data. */
2537 for (i = 0; i < so->num_outputs; i++) {
2538 unsigned reg = so->output[i].register_index;
2539
2540 if (reg >= noutput)
2541 continue;
2542
2543 if (stream != so->output[i].stream)
2544 continue;
2545
2546 emit_streamout_output(ctx, so_buffers, so_write_offset,
2547 &so->output[i], &outputs[reg]);
2548 }
2549 }
2550 lp_build_endif(&if_ctx);
2551 }
2552
2553 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2554 LLVMValueRef *values)
2555 {
2556 struct ac_export_args args;
2557
2558 si_llvm_init_export_args(ctx, values,
2559 V_008DFC_SQ_EXP_PARAM + index, &args);
2560 ac_build_export(&ctx->ac, &args);
2561 }
2562
/* Export all PS-consumed VS outputs as PARAM exports, assigning
 * consecutive PARAM slots and recording each output's slot in
 * shader->info.vs_output_param_offset. Sets nr_param_exports.
 *
 * Outputs are skipped when: no component goes to vertex stream 0, the
 * semantic isn't one exported as a parameter, or the shader key marks
 * the output as killed (not read by the pixel shader).
 */
static void si_build_param_exports(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
			           unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	unsigned param_count = 0;

	for (unsigned i = 0; i < noutput; i++) {
		unsigned semantic_name = outputs[i].semantic_name;
		unsigned semantic_index = outputs[i].semantic_index;

		/* Skip outputs whose components all go to non-zero streams
		 * (only stream 0 reaches the pixel shader). */
		if (outputs[i].vertex_stream[0] != 0 &&
		    outputs[i].vertex_stream[1] != 0 &&
		    outputs[i].vertex_stream[2] != 0 &&
		    outputs[i].vertex_stream[3] != 0)
			continue;

		switch (semantic_name) {
		case TGSI_SEMANTIC_LAYER:
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
		case TGSI_SEMANTIC_CLIPDIST:
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			break;
		default:
			/* Not a parameter export. */
			continue;
		}

		/* Skip outputs killed by the shader key (unused by the PS).
		 * GENERIC indices >= SI_MAX_IO_GENERIC have no unique index
		 * and are never killed. */
		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
		     semantic_index < SI_MAX_IO_GENERIC) &&
		    shader->key.opt.kill_outputs &
		    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
			continue;

		si_export_param(ctx, param_count, outputs[i].values);

		assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[i] = param_count++;
	}

	shader->info.nr_param_exports = param_count;
}
2609
/* Generate export instructions for hardware VS shader stage.
 *
 * Builds the position exports (POS0..POS3): the main position, the misc
 * vector (point size / edge flag / layer / viewport index) when any of
 * those are written, and the clip-distance exports. Marks the last
 * position export with "done", counts nr_pos_exports, then emits the
 * parameter exports via si_build_param_exports().
 */
static void si_llvm_export_vs(struct si_shader_context *ctx,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader *shader = ctx->shader;
	struct ac_export_args pos_args[4] = {};
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned pos_idx;
	int i;

	/* Build position exports. */
	for (i = 0; i < noutput; i++) {
		switch (outputs[i].semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			si_llvm_init_export_args(ctx, outputs[i].values,
						 V_008DFC_SQ_EXP_POS, &pos_args[0]);
			break;
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			if (!shader->key.opt.clip_disable) {
				/* Clip distances occupy POS+2 and POS+3. */
				unsigned index = 2 + outputs[i].semantic_index;
				si_llvm_init_export_args(ctx, outputs[i].values,
							 V_008DFC_SQ_EXP_POS + index,
							 &pos_args[index]);
			}
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			if (!shader->key.opt.clip_disable) {
				si_llvm_emit_clipvertex(ctx, pos_args,
							outputs[i].values);
			}
			break;
		}
	}

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0].out[0]) {
		pos_args[0].enabled_channels = 0xf; /* writemask */
		pos_args[0].valid_mask = 0; /* EXEC mask */
		pos_args[0].done = 0; /* last export? */
		pos_args[0].target = V_008DFC_SQ_EXP_POS;
		pos_args[0].compr = 0; /* COMPR flag */
		pos_args[0].out[0] = ctx->ac.f32_0; /* X */
		pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
		pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
		pos_args[0].out[3] = ctx->ac.f32_1; /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1].enabled_channels = shader->selector->info.writes_psize |
					       (shader->selector->info.writes_edgeflag << 1) |
					       (shader->selector->info.writes_layer << 2);

		pos_args[1].valid_mask = 0; /* EXEC mask */
		pos_args[1].done = 0; /* last export? */
		pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
		pos_args[1].compr = 0; /* COMPR flag */
		pos_args[1].out[0] = ctx->ac.f32_0; /* X */
		pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
		pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
		pos_args[1].out[3] = ctx->ac.f32_0; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1].out[0] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = ac_build_umin(&ctx->ac,
						       edgeflag_value,
						       ctx->i32_1);

			/* The LLVM intrinsic expects a float. */
			pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
		}

		if (ctx->screen->info.chip_class >= GFX9) {
			/* GFX9 has the layer in out.z[10:0] and the viewport
			 * index in out.z[19:16].
			 */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				LLVMValueRef v = viewport_index_value;

				/* Merge the viewport index into bits [19:16]
				 * of the Z channel alongside the layer. */
				v = ac_to_integer(&ctx->ac, v);
				v = LLVMBuildShl(ctx->ac.builder, v,
						 LLVMConstInt(ctx->i32, 16, 0), "");
				v = LLVMBuildOr(ctx->ac.builder, v,
						ac_to_integer(&ctx->ac,  pos_args[1].out[2]), "");
				pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
				pos_args[1].enabled_channels |= 1 << 2;
			}
		} else {
			/* Pre-GFX9: viewport index goes in the W channel. */
			if (shader->selector->info.writes_layer)
				pos_args[1].out[2] = layer_value;

			if (shader->selector->info.writes_viewport_index) {
				pos_args[1].out[3] = viewport_index_value;
				pos_args[1].enabled_channels |= 1 << 3;
			}
		}
	}

	/* Count the enabled position exports. */
	for (i = 0; i < 4; i++)
		if (pos_args[i].out[0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i].out[0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i].done = 1;

		ac_build_export(&ctx->ac, &pos_args[i]);
	}

	/* Build parameter exports. */
	si_build_param_exports(ctx, outputs, noutput);
}
2756
2757 /**
2758 * Forward all outputs from the vertex shader to the TES. This is only used
2759 * for the fixed function TCS.
2760 */
2761 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2762 {
2763 struct si_shader_context *ctx = si_shader_context(bld_base);
2764 LLVMValueRef invocation_id, buffer, buffer_offset;
2765 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2766 uint64_t inputs;
2767
2768 invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
2769 buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
2770 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2771
2772 lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
2773 lds_vertex_offset = LLVMBuildMul(ctx->ac.builder, invocation_id,
2774 lds_vertex_stride, "");
2775 lds_base = get_tcs_in_current_patch_offset(ctx);
2776 lds_base = LLVMBuildAdd(ctx->ac.builder, lds_base, lds_vertex_offset, "");
2777
2778 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
2779 while (inputs) {
2780 unsigned i = u_bit_scan64(&inputs);
2781
2782 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
2783 LLVMConstInt(ctx->i32, 4 * i, 0),
2784 "");
2785
2786 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2787 get_rel_patch_id(ctx),
2788 invocation_id,
2789 LLVMConstInt(ctx->i32, i, 0));
2790
2791 LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0,
2792 lds_ptr);
2793
2794 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
2795 buffer_offset, 0, 1, 0, true, false);
2796 }
2797 }
2798
/* Write the tessellation factors of the current patch to the tess factor
 * ring buffer, and optionally to the offchip buffer if TES reads them.
 * Only invocation 0 performs the stores (the factors are per-patch).
 *
 * \param rel_patch_id     patch index relative to the threadgroup
 * \param invocation_id    TCS invocation index within the patch
 * \param tcs_out_current_patch_data_offset  LDS dword offset of this
 *                         patch's per-patch output area
 * \param invoc0_tf_outer  outer factors as SSA values; only valid when
 *                         key...invoc0_tess_factors_are_def is set
 * \param invoc0_tf_inner  inner factors, same validity condition
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset,
				  LLVMValueRef invoc0_tf_outer[4],
				  LLVMValueRef invoc0_tf_inner[2])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
	unsigned stride, outer_comps, inner_comps, i, offset;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Add a barrier before loading tess factors from LDS. */
	if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
		si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, &ctx->gallivm,
		    LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
				  invocation_id, ctx->i32_0, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.part.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Unused components stay undef; only the first outer_comps /
	 * inner_comps entries are overwritten below. */
	for (i = 0; i < 4; i++) {
		inner[i] = LLVMGetUndef(ctx->i32);
		outer[i] = LLVMGetUndef(ctx->i32);
	}

	if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
		/* Tess factors are in VGPRs. */
		for (i = 0; i < outer_comps; i++)
			outer[i] = out[i] = invoc0_tf_outer[i];
		for (i = 0; i < inner_comps; i++)
			inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
	} else {
		/* Load tess_inner and tess_outer from LDS.
		 * Any invocation can write them, so we can't get them from a temporary.
		 */
		tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
		tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);

		lds_base = tcs_out_current_patch_data_offset;
		lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
					 LLVMConstInt(ctx->i32,
						      tess_inner_index * 4, 0), "");
		lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
					 LLVMConstInt(ctx->i32,
						      tess_outer_index * 4, 0), "");

		for (i = 0; i < outer_comps; i++) {
			outer[i] = out[i] =
				lds_load(bld_base, ctx->ac.i32, i, lds_outer);
		}
		for (i = 0; i < inner_comps; i++) {
			inner[i] = out[outer_comps+i] =
				lds_load(bld_base, ctx->ac.i32, i, lds_inner);
		}
	}

	if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
		/* For isolines, the hardware expects tess factors in the
		 * reverse order from what GLSL / TGSI specify.
		 */
		LLVMValueRef tmp = out[0];
		out[0] = out[1];
		out[1] = tmp;
	}

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(&ctx->gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(&ctx->gallivm, out+4, stride - 4);

	/* Get the buffer. */
	buffer = desc_from_addr_base64k(ctx, ctx->param_tcs_factor_addr_base64k);

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->main_fn,
			       ctx->param_tcs_factor_offset);
	byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
				  LLVMConstInt(ctx->i32, 4 * stride, 0), "");

	/* Only the first patch of the threadgroup writes the control word. */
	lp_build_if(&inner_if_ctx, &ctx->gallivm,
		    LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
				  rel_patch_id, ctx->i32_0, ""));

	/* Store the dynamic HS control word. */
	offset = 0;
	if (ctx->screen->info.chip_class <= VI) {
		ac_build_buffer_store_dword(&ctx->ac, buffer,
					    LLVMConstInt(ctx->i32, 0x80000000, 0),
					    1, ctx->i32_0, tf_base,
					    offset, 1, 0, true, false);
		offset += 4;
	}

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
				    MIN2(stride, 4), byteoffset, tf_base,
				    offset, 1, 0, true, false);
	offset += 16;
	if (vec1)
		ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
					    stride - 4, byteoffset, tf_base,
					    offset, 1, 0, true, false);

	/* Store the tess factors into the offchip buffer if TES reads them. */
	if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
		LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
		LLVMValueRef tf_inner_offset;
		unsigned param_outer, param_inner;

		buf = desc_from_addr_base64k(ctx, ctx->param_tcs_offchip_addr_base64k);
		base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);

		param_outer = si_shader_io_get_unique_index_patch(
				      TGSI_SEMANTIC_TESSOUTER, 0);
		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_outer, 0));

		/* The store size must be a power of two for the hardware,
		 * hence the rounding of outer_comps. */
		outer_vec = lp_build_gather_values(&ctx->gallivm, outer,
						   util_next_power_of_two(outer_comps));

		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
					    outer_comps, tf_outer_offset,
					    base, 0, 1, 0, true, false);
		if (inner_comps) {
			param_inner = si_shader_io_get_unique_index_patch(
					      TGSI_SEMANTIC_TESSINNER, 0);
			tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
					LLVMConstInt(ctx->i32, param_inner, 0));

			inner_vec = inner_comps == 1 ? inner[0] :
				    lp_build_gather_values(&ctx->gallivm, inner, inner_comps);
			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
						    inner_comps, tf_inner_offset,
						    base, 0, 1, 0, true, false);
		}
	}

	lp_build_endif(&if_ctx);
}
2973
2974 static LLVMValueRef
2975 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
2976 unsigned param, unsigned return_index)
2977 {
2978 return LLVMBuildInsertValue(ctx->ac.builder, ret,
2979 LLVMGetParam(ctx->main_fn, param),
2980 return_index, "");
2981 }
2982
2983 static LLVMValueRef
2984 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
2985 unsigned param, unsigned return_index)
2986 {
2987 LLVMBuilderRef builder = ctx->ac.builder;
2988 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
2989
2990 return LLVMBuildInsertValue(builder, ret,
2991 ac_to_float(&ctx->ac, p),
2992 return_index, "");
2993 }
2994
2995 static LLVMValueRef
2996 si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret,
2997 unsigned param, unsigned return_index)
2998 {
2999 LLVMBuilderRef builder = ctx->ac.builder;
3000 LLVMValueRef ptr, lo, hi;
3001
3002 ptr = LLVMGetParam(ctx->main_fn, param);
3003 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i64, "");
3004 ptr = LLVMBuildBitCast(builder, ptr, ctx->v2i32, "");
3005 lo = LLVMBuildExtractElement(builder, ptr, ctx->i32_0, "");
3006 hi = LLVMBuildExtractElement(builder, ptr, ctx->i32_1, "");
3007 ret = LLVMBuildInsertValue(builder, ret, lo, return_index, "");
3008 return LLVMBuildInsertValue(builder, ret, hi, return_index + 1, "");
3009 }
3010
/* This only writes the tessellation factor levels.
 *
 * TCS epilogue: copy TCS inputs that pass through to the offchip buffer,
 * then return the SGPRs/VGPRs the separately-compiled epilog part expects
 * (which performs the actual tess factor stores).
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMBuilderRef builder = ctx->ac.builder;
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	si_copy_tcs_inputs(bld_base);

	rel_patch_id = get_rel_patch_id(ctx);
	/* invocation id is in bits [12:8] of tcs_rel_ids. */
	invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (ctx->screen->info.chip_class >= GFX9) {
		/* On GFX9, LS+HS run merged inside a wrapping "if"; close it
		 * here and merge the values computed inside it with undef /
		 * dummy values from the entry block via phis, so threads
		 * that skipped the body still have defined epilog inputs. */
		LLVMBasicBlockRef blocks[2] = {
			LLVMGetInsertBlock(builder),
			ctx->merged_wrap_if_state.entry_block
		};
		LLVMValueRef values[2];

		lp_build_endif(&ctx->merged_wrap_if_state);

		values[0] = rel_patch_id;
		values[1] = LLVMGetUndef(ctx->i32);
		rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = tf_lds_offset;
		values[1] = LLVMGetUndef(ctx->i32);
		tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);

		values[0] = invocation_id;
		values[1] = ctx->i32_1; /* cause the epilog to skip threads */
		invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
	}

	/* Return epilog parameters from this function. */
	LLVMValueRef ret = ctx->return_value;
	unsigned vgpr;

	if (ctx->screen->info.chip_class >= GFX9) {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are at the beginning. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
		vgpr = 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K + 1;
	} else {
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
					  GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
					  GFX6_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
					  GFX6_SGPR_TCS_FACTOR_ADDR_BASE64K);
		/* Tess offchip and tess factor offsets are after user SGPRs. */
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
					  GFX6_TCS_NUM_USER_SGPR);
		ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
					  GFX6_TCS_NUM_USER_SGPR + 1);
		vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
	}

	/* VGPRs */
	rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
	invocation_id = ac_to_float(&ctx->ac, invocation_id);
	tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);

	/* Leave a hole corresponding to the two input VGPRs. This ensures that
	 * the invocation_id output does not alias the param_tcs_rel_ids input,
	 * which saves a V_MOV on gfx9.
	 */
	vgpr += 2;

	ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
	ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");

	if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
		vgpr++; /* skip the tess factor LDS offset */
		/* Pass the 6 tess factors (4 outer + 2 inner) in VGPRs so the
		 * epilog doesn't have to reload them from LDS. */
		for (unsigned i = 0; i < 6; i++) {
			LLVMValueRef value =
				LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
			value = ac_to_float(&ctx->ac, value);
			ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
		}
	} else {
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
	}
	ctx->return_value = ret;
}
3103
/* Pass TCS inputs from LS to TCS on GFX9.
 *
 * On GFX9 the LS and HS stages are merged into one hardware shader, so the
 * LS part must forward all SGPR/VGPR arguments the TCS part expects via
 * the function return value. The slot indices (the "8 +" offsets are the
 * SGPRs reserved before user SGPRs) must match the TCS function signature.
 */
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* 64-bit descriptor pointers are returned as two i32 slots each. */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers,
					   8 + SI_SGPR_RW_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret,
		ctx->param_bindless_samplers_and_images,
		8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

	ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
				  8 + SI_SGPR_VS_STATE_BITS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
				  8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
				  8 + GFX9_SGPR_TCS_OUT_OFFSETS);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
				  8 + GFX9_SGPR_TCS_OUT_LAYOUT);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_addr_base64k,
				  8 + GFX9_SGPR_TCS_OFFCHIP_ADDR_BASE64K);
	ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_addr_base64k,
				  8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K);

	/* The TCS descriptor pointers follow two params after the factor
	 * address in the LS argument list. */
	unsigned desc_param = ctx->param_tcs_factor_addr_base64k + 2;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_TCS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_TCS_SAMPLERS_AND_IMAGES);

	/* VGPRs (patch id, rel ids) come right after all user SGPRs. */
	unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_patch_id, vgpr++);
	ret = si_insert_input_ret_float(ctx, ret,
					ctx->param_tcs_rel_ids, vgpr++);
	ctx->return_value = ret;
}
3146
/* Pass GS inputs from ES to GS on GFX9.
 *
 * Same idea as si_set_ls_return_value_for_tcs: ES and GS are merged on
 * GFX9, so the ES part forwards the GS arguments via the return value.
 */
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
{
	LLVMValueRef ret = ctx->return_value;

	ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
	ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);

	/* 64-bit descriptor pointers take two i32 slots each. */
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, ctx->param_rw_buffers,
					   8 + SI_SGPR_RW_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret,
		ctx->param_bindless_samplers_and_images,
		8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);

	/* The GS descriptor pointers follow vs_state_bits in the ES
	 * argument list. */
	unsigned desc_param = ctx->param_vs_state_bits + 1;
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param,
					   8 + GFX9_SGPR_GS_CONST_AND_SHADER_BUFFERS);
	ret = si_insert_input_ptr_as_2xi32(ctx, ret, desc_param + 1,
					   8 + GFX9_SGPR_GS_SAMPLERS_AND_IMAGES);

	/* Forward the 5 GS VGPRs (vertex offsets, prim id, etc.) that start
	 * at param_gs_vtx01_offset. */
	unsigned vgpr = 8 + GFX9_GS_NUM_USER_SGPR;
	for (unsigned i = 0; i < 5; i++) {
		unsigned param = ctx->param_gs_vtx01_offset + i;
		ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
	}
	ctx->return_value = ret;
}
3175
/* VS-as-LS epilogue: store all vertex outputs to LDS so the following
 * TCS (HS) stage can read them as inputs.
 *
 * \param abi          shader ABI context (wraps si_shader_context)
 * \param max_outputs  capacity of \p addrs in vec4 slots (unused here)
 * \param addrs        alloca addresses of the outputs, 4 channels each
 */
static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
					      ctx->param_rel_auto_id);
	LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
	/* LDS dword address where this vertex's outputs begin. */
	LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];

		/* The ARB_shader_viewport_layer_array spec contains the
		 * following issue:
		 *
		 *    2) What happens if gl_ViewportIndex or gl_Layer is
		 *    written in the vertex shader and a geometry shader is
		 *    present?
		 *
		 *    RESOLVED: The value written by the last vertex processing
		 *    stage is used. If the last vertex processing stage
		 *    (vertex, tessellation evaluation or geometry) does not
		 *    statically assign to gl_ViewportIndex or gl_Layer, index
		 *    or layer zero is assumed.
		 *
		 * So writes to those outputs in VS-as-LS are simply ignored.
		 */
		if (name == TGSI_SEMANTIC_LAYER ||
		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
			continue;

		/* Each output occupies 4 dwords at a slot determined by its
		 * semantic, shared between LS stores and HS loads. */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
					LLVMConstInt(ctx->i32, param * 4, 0), "");

		for (chan = 0; chan < 4; chan++) {
			/* Skip channels the shader never writes. */
			if (!(info->output_usagemask[i] & (1 << chan)))
				continue;

			lds_store(ctx, chan, dw_addr,
				  LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
		}
	}

	/* On GFX9 (merged LS+HS), forward the TCS arguments via the return
	 * value instead of ending the shader. */
	if (ctx->screen->info.chip_class >= GFX9)
		si_set_ls_return_value_for_tcs(ctx);
}
3231
/* VS/TES-as-ES epilogue: write all outputs to the ESGS ring (memory ring
 * pre-GFX9, LDS on GFX9) so the GS stage can read them.
 *
 * \param abi          shader ABI context (wraps si_shader_context)
 * \param max_outputs  capacity of \p addrs in vec4 slots (unused here)
 * \param addrs        alloca addresses of the outputs, 4 channels each
 */
static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_es2gs_offset);
	LLVMValueRef lds_base = NULL;
	unsigned chan;
	int i;

	if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
		/* Compute this vertex's base LDS slot: wave-local thread id
		 * plus wave index * 64 (wave size), times the ESGS item size
		 * in dwords. */
		unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
		LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
		LLVMValueRef wave_idx = unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
		vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
					 LLVMBuildMul(ctx->ac.builder, wave_idx,
						      LLVMConstInt(ctx->i32, 64, false), ""), "");
		lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
					LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
	}

	for (i = 0; i < info->num_outputs; i++) {
		int param;

		/* Writes to layer/viewport in a non-last vertex stage are
		 * ignored; see the spec quote in si_llvm_emit_ls_epilogue. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
						      info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
			out_val = ac_to_integer(&ctx->ac, out_val);

			/* GFX9 has the ESGS ring in LDS. */
			if (ctx->screen->info.chip_class >= GFX9) {
				lds_store(ctx, param * 4 + chan, lds_base, out_val);
				continue;
			}

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->esgs_ring,
						    out_val, 1, NULL, soffset,
						    (4 * param + chan) * 4,
						    1, 1, true, true);
		}
	}

	/* On GFX9 (merged ES+GS), forward the GS arguments via the return
	 * value instead of ending the shader. */
	if (ctx->screen->info.chip_class >= GFX9)
		si_set_es_return_value_for_gs(ctx);
}
3287
3288 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3289 {
3290 if (ctx->screen->info.chip_class >= GFX9)
3291 return unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3292 else
3293 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3294 }
3295
3296 static void emit_gs_epilogue(struct si_shader_context *ctx)
3297 {
3298 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3299 si_get_gs_wave_id(ctx));
3300
3301 if (ctx->screen->info.chip_class >= GFX9)
3302 lp_build_endif(&ctx->merged_wrap_if_state);
3303 }
3304
3305 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
3306 unsigned max_outputs,
3307 LLVMValueRef *addrs)
3308 {
3309 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3310 struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info;
3311
3312 assert(info->num_outputs <= max_outputs);
3313
3314 emit_gs_epilogue(ctx);
3315 }
3316
/* TGSI entry point for the GS epilogue: delegate to the shared helper. */
static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	emit_gs_epilogue(si_shader_context(bld_base));
}
3322
/* Hardware-VS epilogue: load all outputs from their allocas, optionally
 * clamp vertex colors, emit streamout stores, optionally append an exported
 * PrimitiveID, and finally emit the position/param exports.
 *
 * \param abi          shader ABI context (wraps si_shader_context)
 * \param max_outputs  capacity of \p addrs in vec4 slots
 * \param addrs        alloca addresses of the outputs, 4 channels each
 */
static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
				     unsigned max_outputs,
				     LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->shader->is_gs_copy_shader);
	assert(info->num_outputs <= max_outputs);

	/* +1 leaves room for the optional PrimitiveID output appended below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->main_fn,
						    ctx->param_vs_state_bits);
				cond = LLVMBuildTrunc(ctx->ac.builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, &ctx->gallivm, cond);
			}

			/* Clamp all 4 channels to [0, 1] in place. */
			for (j = 0; j < 4; j++) {
				addr = addrs[4 * i + j];
				val = LLVMBuildLoad(ctx->ac.builder, addr, "");
				val = ac_build_clamp(&ctx->ac, val);
				LLVMBuildStore(ctx->ac.builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Gather all outputs into the array consumed by streamout and the
	 * export helper. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].semantic_name = info->output_semantic_name[i];
		outputs[i].semantic_index = info->output_semantic_index[i];

		for (j = 0; j < 4; j++) {
			outputs[i].values[j] =
				LLVMBuildLoad(ctx->ac.builder,
					      addrs[4 * i + j],
					      "");
			/* 2 bits of stream index per channel. */
			outputs[i].vertex_stream[j] =
				(info->output_streams[i] >> (2 * j)) & 3;
		}
	}

	if (ctx->shader->selector->so.num_outputs)
		si_llvm_emit_streamout(ctx, outputs, i, 0);

	/* Export PrimitiveID. */
	if (ctx->shader->key.mono.u.vs_export_prim_id) {
		outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
		outputs[i].semantic_index = 0;
		outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0));
		for (j = 1; j < 4; j++)
			outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);

		memset(outputs[i].vertex_stream, 0,
		       sizeof(outputs[i].vertex_stream));
		i++;
	}

	si_llvm_export_vs(ctx, outputs, i);
	FREE(outputs);
}
3408
3409 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3410 {
3411 struct si_shader_context *ctx = si_shader_context(bld_base);
3412
3413 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3414 &ctx->outputs[0][0]);
3415 }
3416
/* Pixel shader exports accumulated during codegen and flushed at the end
 * by si_emit_ps_exports(). */
struct si_ps_exports {
	unsigned num;			/* number of valid entries in args[] */
	struct ac_export_args args[10];	/* color MRTs plus the MRTZ export */
};
3421
3422 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
3423 bool writes_samplemask)
3424 {
3425 if (writes_z) {
3426 /* Z needs 32 bits. */
3427 if (writes_samplemask)
3428 return V_028710_SPI_SHADER_32_ABGR;
3429 else if (writes_stencil)
3430 return V_028710_SPI_SHADER_32_GR;
3431 else
3432 return V_028710_SPI_SHADER_32_R;
3433 } else if (writes_stencil || writes_samplemask) {
3434 /* Both stencil and sample mask need only 16 bits. */
3435 return V_028710_SPI_SHADER_UINT16_ABGR;
3436 } else {
3437 return V_028710_SPI_SHADER_ZERO;
3438 }
3439 }
3440
/* Queue the MRTZ export carrying depth, stencil and/or sample mask.
 * Channel layout depends on the SPI_SHADER_Z_FORMAT chosen from which
 * values are present; the export is appended to \p exp, not emitted here.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct ac_export_args args;
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args.valid_mask = 1; /* whether the EXEC mask is valid */
	args.done = 1; /* DONE bit */

	/* Specify the target we are exporting */
	args.target = V_008DFC_SQ_EXP_MRTZ;

	args.compr = 0; /* COMP flag */
	args.out[0] = base->undef; /* R, depth */
	args.out[1] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args.out[2] = base->undef; /* B, sample mask */
	args.out[3] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		/* 16-bit compressed export: stencil in X, sample mask in Y. */
		assert(!depth);
		args.compr = 1; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = ac_to_integer(&ctx->ac, stencil);
			stencil = LLVMBuildShl(ctx->ac.builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args.out[0] = ac_to_float(&ctx->ac, stencil);
			/* Compressed exports enable channels in pairs. */
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args.out[1] = samplemask;
			mask |= 0xc;
		}
	} else {
		/* 32-bit export: one value per channel. */
		if (depth) {
			args.out[0] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args.out[1] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args.out[2] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND and HAINAN) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->info.chip_class == SI &&
	    ctx->screen->info.family != CHIP_OLAND &&
	    ctx->screen->info.family != CHIP_HAINAN)
		mask |= 0x1;

	/* Specify which components to enable */
	args.enabled_channels = mask;

	memcpy(&exp->args[exp->num++], &args, sizeof(args));
}
3511
/* Queue the export(s) for one color output after applying the epilog key's
 * color fixups (clamping, alpha-to-one, alpha test, smoothing). Handles the
 * FS_COLOR0_WRITES_ALL_CBUFS broadcast case. Exports are appended to
 * \p exp; \p is_last marks the final color export so DONE can be set.
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	int i;

	/* Clamp color */
	if (ctx->shader->key.part.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = ac_build_clamp(&ctx->ac, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.part.ps.epilog.alpha_to_one)
		color[3] = ctx->ac.f32_1;

	/* Alpha test */
	if (index == 0 &&
	    ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
		struct ac_export_args args[8];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(ctx, color,
						 V_008DFC_SQ_EXP_MRT + c, &args[c]);
			if (args[c].enabled_channels)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c].valid_mask = 1; /* whether the EXEC mask is valid */
				args[c].done = 1; /* DONE bit */
			} else if (!args[c].enabled_channels)
				continue; /* unnecessary NULL export */

			memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
		}
	} else {
		struct ac_export_args args;

		/* Export */
		si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
					 &args);
		if (is_last) {
			args.valid_mask = 1; /* whether the EXEC mask is valid */
			args.done = 1; /* DONE bit */
		} else if (!args.enabled_channels)
			return; /* unnecessary NULL export */

		memcpy(&exp->args[exp->num++], &args, sizeof(args));
	}
}
3577
3578 static void si_emit_ps_exports(struct si_shader_context *ctx,
3579 struct si_ps_exports *exp)
3580 {
3581 for (unsigned i = 0; i < exp->num; i++)
3582 ac_build_export(&ctx->ac, &exp->args[i]);
3583 }
3584
3585 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3586 {
3587 struct si_shader_context *ctx = si_shader_context(bld_base);
3588 struct lp_build_context *base = &bld_base->base;
3589 struct ac_export_args args;
3590
3591 args.enabled_channels = 0x0; /* enabled channels */
3592 args.valid_mask = 1; /* whether the EXEC mask is valid */
3593 args.done = 1; /* DONE bit */
3594 args.target = V_008DFC_SQ_EXP_NULL;
3595 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
3596 args.out[0] = base->undef; /* R */
3597 args.out[1] = base->undef; /* G */
3598 args.out[2] = base->undef; /* B */
3599 args.out[3] = base->undef; /* A */
3600
3601 ac_build_export(&ctx->ac, &args);
3602 }
3603
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 */
static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
				      unsigned max_outputs,
				      LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = ctx->ac.builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Execute any discard that was deferred until the end of the shader. */
	if (ctx->postponed_kill)
		ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = addrs[4 * i + j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of the position output is used. */
			depth = LLVMBuildLoad(builder,
					      addrs[4 * i + 2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						addrs[4 * i + 1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   addrs[4 * i + 0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   ac_to_integer(&ctx->ac,
						 LLVMGetParam(ctx->main_fn,
							      SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs: written colors packed densely, then depth/stencil/
	 * samplemask, matching the order documented above. */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3701
3702 void si_emit_waitcnt(struct si_shader_context *ctx, unsigned simm16)
3703 {
3704 LLVMValueRef args[1] = {
3705 LLVMConstInt(ctx->i32, simm16, 0)
3706 };
3707 lp_build_intrinsic(ctx->ac.builder, "llvm.amdgcn.s.waitcnt",
3708 ctx->voidt, args, 1, 0);
3709 }
3710
3711 static void membar_emit(
3712 const struct lp_build_tgsi_action *action,
3713 struct lp_build_tgsi_context *bld_base,
3714 struct lp_build_emit_data *emit_data)
3715 {
3716 struct si_shader_context *ctx = si_shader_context(bld_base);
3717 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3718 unsigned flags = LLVMConstIntGetZExtValue(src0);
3719 unsigned waitcnt = NOOP_WAITCNT;
3720
3721 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3722 waitcnt &= VM_CNT & LGKM_CNT;
3723
3724 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3725 TGSI_MEMBAR_SHADER_BUFFER |
3726 TGSI_MEMBAR_SHADER_IMAGE))
3727 waitcnt &= VM_CNT;
3728
3729 if (flags & TGSI_MEMBAR_SHARED)
3730 waitcnt &= LGKM_CNT;
3731
3732 if (waitcnt != NOOP_WAITCNT)
3733 si_emit_waitcnt(ctx, waitcnt);
3734 }
3735
3736 static void clock_emit(
3737 const struct lp_build_tgsi_action *action,
3738 struct lp_build_tgsi_context *bld_base,
3739 struct lp_build_emit_data *emit_data)
3740 {
3741 struct si_shader_context *ctx = si_shader_context(bld_base);
3742 LLVMValueRef tmp;
3743
3744 tmp = lp_build_intrinsic(ctx->ac.builder, "llvm.readcyclecounter",
3745 ctx->i64, NULL, 0, 0);
3746 tmp = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->v2i32, "");
3747
3748 emit_data->output[0] =
3749 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, "");
3750 emit_data->output[1] =
3751 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, "");
3752 }
3753
3754 LLVMTypeRef si_const_array(LLVMTypeRef elem_type, int num_elements)
3755 {
3756 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3757 CONST_ADDR_SPACE);
3758 }
3759
3760 static void si_llvm_emit_ddxy(
3761 const struct lp_build_tgsi_action *action,
3762 struct lp_build_tgsi_context *bld_base,
3763 struct lp_build_emit_data *emit_data)
3764 {
3765 struct si_shader_context *ctx = si_shader_context(bld_base);
3766 unsigned opcode = emit_data->info->opcode;
3767 LLVMValueRef val;
3768 int idx;
3769 unsigned mask;
3770
3771 if (opcode == TGSI_OPCODE_DDX_FINE)
3772 mask = AC_TID_MASK_LEFT;
3773 else if (opcode == TGSI_OPCODE_DDY_FINE)
3774 mask = AC_TID_MASK_TOP;
3775 else
3776 mask = AC_TID_MASK_TOP_LEFT;
3777
3778 /* for DDX we want to next X pixel, DDY next Y pixel. */
3779 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3780
3781 val = ac_to_integer(&ctx->ac, emit_data->args[0]);
3782 val = ac_build_ddxy(&ctx->ac, mask, idx, val);
3783 emit_data->output[emit_data->chan] = val;
3784 }
3785
3786 /*
3787 * this takes an I,J coordinate pair,
3788 * and works out the X and Y derivatives.
3789 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3790 */
3791 static LLVMValueRef si_llvm_emit_ddxy_interp(
3792 struct lp_build_tgsi_context *bld_base,
3793 LLVMValueRef interp_ij)
3794 {
3795 struct si_shader_context *ctx = si_shader_context(bld_base);
3796 LLVMValueRef result[4], a;
3797 unsigned i;
3798
3799 for (i = 0; i < 2; i++) {
3800 a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij,
3801 LLVMConstInt(ctx->i32, i, 0), "");
3802 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
3803 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
3804 }
3805
3806 return lp_build_gather_values(&ctx->gallivm, result, 4);
3807 }
3808
3809 static void interp_fetch_args(
3810 struct lp_build_tgsi_context *bld_base,
3811 struct lp_build_emit_data *emit_data)
3812 {
3813 struct si_shader_context *ctx = si_shader_context(bld_base);
3814 const struct tgsi_full_instruction *inst = emit_data->inst;
3815
3816 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3817 /* offset is in second src, first two channels */
3818 emit_data->args[0] = lp_build_emit_fetch(bld_base,
3819 emit_data->inst, 1,
3820 TGSI_CHAN_X);
3821 emit_data->args[1] = lp_build_emit_fetch(bld_base,
3822 emit_data->inst, 1,
3823 TGSI_CHAN_Y);
3824 emit_data->arg_count = 2;
3825 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3826 LLVMValueRef sample_position;
3827 LLVMValueRef sample_id;
3828 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3829
3830 /* fetch sample ID, then fetch its sample position,
3831 * and place into first two channels.
3832 */
3833 sample_id = lp_build_emit_fetch(bld_base,
3834 emit_data->inst, 1, TGSI_CHAN_X);
3835 sample_id = ac_to_integer(&ctx->ac, sample_id);
3836
3837 /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading
3838 * Language 4.50 spec says about interpolateAtSample:
3839 *
3840 * "Returns the value of the input interpolant variable at
3841 * the location of sample number sample. If multisample
3842 * buffers are not available, the input variable will be
3843 * evaluated at the center of the pixel. If sample sample
3844 * does not exist, the position used to interpolate the
3845 * input variable is undefined."
3846 *
3847 * This means that sample_id values outside of the valid are
3848 * in fact valid input, and the usual mechanism for loading the
3849 * sample position doesn't work.
3850 */
3851 if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) {
3852 LLVMValueRef center[4] = {
3853 LLVMConstReal(ctx->f32, 0.5),
3854 LLVMConstReal(ctx->f32, 0.5),
3855 ctx->ac.f32_0,
3856 ctx->ac.f32_0,
3857 };
3858
3859 sample_position = lp_build_gather_values(&ctx->gallivm, center, 4);
3860 } else {
3861 sample_position = load_sample_position(ctx, sample_id);
3862 }
3863
3864 emit_data->args[0] = LLVMBuildExtractElement(ctx->ac.builder,
3865 sample_position,
3866 ctx->i32_0, "");
3867
3868 emit_data->args[0] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[0], halfval, "");
3869 emit_data->args[1] = LLVMBuildExtractElement(ctx->ac.builder,
3870 sample_position,
3871 ctx->i32_1, "");
3872 emit_data->args[1] = LLVMBuildFSub(ctx->ac.builder, emit_data->args[1], halfval, "");
3873 emit_data->arg_count = 2;
3874 }
3875 }
3876
/* Emit INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET.
 *
 * Determines the barycentric (i, j) coordinates for the requested location
 * (adjusting them by the offset/sample deltas produced by interp_fetch_args
 * when needed) and interpolates the selected fragment-shader input for all
 * four destination channels via si_build_fs_interp.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				   struct lp_build_tgsi_context *bld_base,
				   struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	const struct tgsi_shader_info *info = &shader->selector->info;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *input = &inst->Src[0];
	int input_base, input_array_size;
	int chan;
	int i;
	LLVMValueRef prim_mask = LLVMGetParam(ctx->main_fn, SI_PARAM_PRIM_MASK);
	LLVMValueRef array_idx;
	int interp_param_idx;
	unsigned interp;
	unsigned location;

	assert(input->Register.File == TGSI_FILE_INPUT);

	/* Determine the range of inputs that may be addressed, and the
	 * runtime index within that range. A direct register access is a
	 * one-element "array" with index 0.
	 */
	if (input->Register.Indirect) {
		unsigned array_id = input->Indirect.ArrayID;

		if (array_id) {
			input_base = info->input_array_first[array_id];
			input_array_size = info->input_array_last[array_id] - input_base + 1;
		} else {
			/* No declared array: anything from this register up
			 * to the last input may be addressed. */
			input_base = inst->Src[0].Register.Index;
			input_array_size = info->num_inputs - input_base;
		}

		array_idx = si_get_indirect_index(ctx, &input->Indirect,
						  1, input->Register.Index - input_base);
	} else {
		input_base = inst->Src[0].Register.Index;
		input_array_size = 1;
		array_idx = ctx->i32_0;
	}

	interp = shader->selector->info.input_interpolate[input_base];

	/* OFFSET and SAMPLE start from the center barycentrics and adjust
	 * them below; CENTROID uses the centroid barycentrics directly.
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* -1 = unsupported combination, 0 = no (i, j) needed (constant
	 * interpolation), >0 = index of the main function parameter that
	 * holds the (i, j) pair.
	 */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
	else
		interp_param = NULL;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * Take the I then J parameters, and the DDX/Y for each, and
		 * calculate the IJ inputs for the interpolator:
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
			LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = ac_to_float(&ctx->ac, interp_el);

			temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, emit_data->args[1], "");

			ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, "");
		}
		interp_param = lp_build_gather_values(&ctx->gallivm, ij_out, 2);
	}

	if (interp_param)
		interp_param = ac_to_float(&ctx->ac, interp_param);

	/* Interpolate every element of the addressable input range for each
	 * channel, then select the element at the runtime index.
	 */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
		unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);

		for (unsigned idx = 0; idx < input_array_size; ++idx) {
			LLVMValueRef v, i = NULL, j = NULL;

			if (interp_param) {
				i = LLVMBuildExtractElement(
					ctx->ac.builder, interp_param, ctx->i32_0, "");
				j = LLVMBuildExtractElement(
					ctx->ac.builder, interp_param, ctx->i32_1, "");
			}
			/* NULL i/j means constant (flat) interpolation. */
			v = si_build_fs_interp(ctx, input_base + idx, schan,
					       prim_mask, i, j);

			gather = LLVMBuildInsertElement(ctx->ac.builder,
				gather, v, LLVMConstInt(ctx->i32, idx, false), "");
		}

		emit_data->output[chan] = LLVMBuildExtractElement(
			ctx->ac.builder, gather, array_idx, "");
	}
}
3997
3998 static void vote_all_emit(
3999 const struct lp_build_tgsi_action *action,
4000 struct lp_build_tgsi_context *bld_base,
4001 struct lp_build_emit_data *emit_data)
4002 {
4003 struct si_shader_context *ctx = si_shader_context(bld_base);
4004
4005 LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]);
4006 emit_data->output[emit_data->chan] =
4007 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4008 }
4009
4010 static void vote_any_emit(
4011 const struct lp_build_tgsi_action *action,
4012 struct lp_build_tgsi_context *bld_base,
4013 struct lp_build_emit_data *emit_data)
4014 {
4015 struct si_shader_context *ctx = si_shader_context(bld_base);
4016
4017 LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]);
4018 emit_data->output[emit_data->chan] =
4019 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4020 }
4021
4022 static void vote_eq_emit(
4023 const struct lp_build_tgsi_action *action,
4024 struct lp_build_tgsi_context *bld_base,
4025 struct lp_build_emit_data *emit_data)
4026 {
4027 struct si_shader_context *ctx = si_shader_context(bld_base);
4028
4029 LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]);
4030 emit_data->output[emit_data->chan] =
4031 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4032 }
4033
4034 static void ballot_emit(
4035 const struct lp_build_tgsi_action *action,
4036 struct lp_build_tgsi_context *bld_base,
4037 struct lp_build_emit_data *emit_data)
4038 {
4039 struct si_shader_context *ctx = si_shader_context(bld_base);
4040 LLVMBuilderRef builder = ctx->ac.builder;
4041 LLVMValueRef tmp;
4042
4043 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4044 tmp = ac_build_ballot(&ctx->ac, tmp);
4045 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
4046
4047 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
4048 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
4049 }
4050
4051 static void read_invoc_fetch_args(
4052 struct lp_build_tgsi_context *bld_base,
4053 struct lp_build_emit_data *emit_data)
4054 {
4055 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
4056 0, emit_data->src_chan);
4057
4058 /* Always read the source invocation (= lane) from the X channel. */
4059 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
4060 1, TGSI_CHAN_X);
4061 emit_data->arg_count = 2;
4062 }
4063
4064 static void read_lane_emit(
4065 const struct lp_build_tgsi_action *action,
4066 struct lp_build_tgsi_context *bld_base,
4067 struct lp_build_emit_data *emit_data)
4068 {
4069 struct si_shader_context *ctx = si_shader_context(bld_base);
4070
4071 /* We currently have no other way to prevent LLVM from lifting the icmp
4072 * calls to a dominating basic block.
4073 */
4074 ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]);
4075
4076 for (unsigned i = 0; i < emit_data->arg_count; ++i)
4077 emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]);
4078
4079 emit_data->output[emit_data->chan] =
4080 ac_build_intrinsic(&ctx->ac, action->intr_name,
4081 ctx->i32, emit_data->args, emit_data->arg_count,
4082 AC_FUNC_ATTR_READNONE |
4083 AC_FUNC_ATTR_CONVERGENT);
4084 }
4085
4086 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4087 struct lp_build_emit_data *emit_data)
4088 {
4089 struct si_shader_context *ctx = si_shader_context(bld_base);
4090 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4091 LLVMValueRef imm;
4092 unsigned stream;
4093
4094 assert(src0.File == TGSI_FILE_IMMEDIATE);
4095
4096 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4097 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4098 return stream;
4099 }
4100
/* Emit one vertex from the geometry shader.
 *
 * Writes all of this vertex's output attributes for the given stream to the
 * GSVS ring, bumps the per-stream vertex counter, and sends the EMIT message
 * to the GS hardware. \p addrs holds the allocas of the output values
 * (4 channels per output).
 */
static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
				unsigned stream,
				LLVMValueRef *addrs)
{
	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
	struct si_shader *shader = ctx->shader;
	struct lp_build_if_state if_state;
	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
					    ctx->param_gs2vs_offset);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit;
	unsigned chan, offset;
	int i;

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, skip the write: excessive vertex emissions are not
	 * supposed to have any effect.
	 *
	 * If the shader has no writes to memory, kill it instead. This skips
	 * further memory loads and may allow LLVM to skip to the end
	 * altogether.
	 */
	can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
				 LLVMConstInt(ctx->i32,
					      shader->selector->gs_max_out_vertices, 0), "");

	bool use_kill = !info->writes_memory;
	if (use_kill) {
		ac_build_kill_if_false(&ctx->ac, can_emit);
	} else {
		lp_build_if(&if_state, &ctx->gallivm, can_emit);
	}

	/* Store each enabled channel of each output that belongs to this
	 * stream. "offset" counts stored dwords; the ring layout strides by
	 * gs_max_out_vertices per attribute channel, indexed by the current
	 * vertex.
	 */
	offset = 0;
	for (i = 0; i < info->num_outputs; i++) {
		for (chan = 0; chan < 4; chan++) {
			if (!(info->output_usagemask[i] & (1 << chan)) ||
			    ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
				continue;

			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
			LLVMValueRef voffset =
				LLVMConstInt(ctx->i32, offset *
					     shader->selector->gs_max_out_vertices, 0);
			offset++;

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			/* Convert the dword index into a byte offset. */
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = ac_to_integer(&ctx->ac, out_val);

			ac_build_buffer_store_dword(&ctx->ac,
						    ctx->gsvs_ring[stream],
						    out_val, 1,
						    voffset, soffset, 0,
						    1, 1, true, true);
		}
	}

	/* Advance and persist the per-stream vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      ctx->i32_1);

	LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
			 si_get_gs_wave_id(ctx));
	if (!use_kill)
		lp_build_endif(&if_state);
}
4179
4180 /* Emit one vertex from the geometry shader */
4181 static void si_tgsi_emit_vertex(
4182 const struct lp_build_tgsi_action *action,
4183 struct lp_build_tgsi_context *bld_base,
4184 struct lp_build_emit_data *emit_data)
4185 {
4186 struct si_shader_context *ctx = si_shader_context(bld_base);
4187 unsigned stream = si_llvm_get_stream(bld_base, emit_data);
4188
4189 si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]);
4190 }
4191
4192 /* Cut one primitive from the geometry shader */
4193 static void si_llvm_emit_primitive(
4194 const struct lp_build_tgsi_action *action,
4195 struct lp_build_tgsi_context *bld_base,
4196 struct lp_build_emit_data *emit_data)
4197 {
4198 struct si_shader_context *ctx = si_shader_context(bld_base);
4199 unsigned stream;
4200
4201 /* Signal primitive cut */
4202 stream = si_llvm_get_stream(bld_base, emit_data);
4203 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4204 si_get_gs_wave_id(ctx));
4205 }
4206
4207 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4208 struct lp_build_tgsi_context *bld_base,
4209 struct lp_build_emit_data *emit_data)
4210 {
4211 struct si_shader_context *ctx = si_shader_context(bld_base);
4212
4213 /* SI only (thanks to a hw bug workaround):
4214 * The real barrier instruction isn’t needed, because an entire patch
4215 * always fits into a single wave.
4216 */
4217 if (ctx->screen->info.chip_class == SI &&
4218 ctx->type == PIPE_SHADER_TESS_CTRL) {
4219 si_emit_waitcnt(ctx, LGKM_CNT & VM_CNT);
4220 return;
4221 }
4222
4223 lp_build_intrinsic(ctx->ac.builder,
4224 "llvm.amdgcn.s.barrier",
4225 ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
4226 }
4227
/* Shared TGSI action for the INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4232
/* Create the LLVM function for a shader part.
 *
 * Declares the function with the given return types and parameters from
 * \p fninfo, annotates SGPR parameters for better code generation, stores
 * parameter values into the locations registered via add_arg_assign, and
 * sets target-dependent function attributes (workgroup size, FP math
 * flags).
 */
static void si_create_function(struct si_shader_context *ctx,
			       const char *name,
			       LLVMTypeRef *returns, unsigned num_returns,
			       struct si_function_info *fninfo,
			       unsigned max_workgroup_size)
{
	int i;

	si_llvm_create_func(ctx, name, returns, num_returns,
			    fninfo->types, fninfo->num_params);
	ctx->return_value = LLVMGetUndef(ctx->return_type);

	for (i = 0; i < fninfo->num_sgpr_params; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		/* Note: LLVM attribute indices are 1-based here (index 0
		 * refers to the return value), hence i + 1.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_BYVAL);
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
			ac_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			/* Non-pointer SGPR arguments are passed in registers. */
			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
	}

	/* Hand parameter values to the locations requested via
	 * add_arg_assign. */
	for (i = 0; i < fninfo->num_params; ++i) {
		if (fninfo->assign[i])
			*fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
	}

	if (max_workgroup_size) {
		si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size",
				      max_workgroup_size);
	}
	LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
					   "no-signed-zeros-fp-math",
					   "true");

	if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
4292
4293 static void declare_streamout_params(struct si_shader_context *ctx,
4294 struct pipe_stream_output_info *so,
4295 struct si_function_info *fninfo)
4296 {
4297 int i;
4298
4299 /* Streamout SGPRs. */
4300 if (so->num_outputs) {
4301 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4302 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4303 else
4304 ctx->param_streamout_config = fninfo->num_params - 1;
4305
4306 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4307 }
4308 /* A streamout buffer offset is loaded if the stride is non-zero. */
4309 for (i = 0; i < 4; i++) {
4310 if (!so->stride[i])
4311 continue;
4312
4313 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4314 }
4315 }
4316
4317 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4318 {
4319 switch (shader->selector->type) {
4320 case PIPE_SHADER_TESS_CTRL:
4321 /* Return this so that LLVM doesn't remove s_barrier
4322 * instructions on chips where we use s_barrier. */
4323 return shader->selector->screen->info.chip_class >= CIK ? 128 : 64;
4324
4325 case PIPE_SHADER_GEOMETRY:
4326 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64;
4327
4328 case PIPE_SHADER_COMPUTE:
4329 break; /* see below */
4330
4331 default:
4332 return 0;
4333 }
4334
4335 const unsigned *properties = shader->selector->info.properties;
4336 unsigned max_work_group_size =
4337 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4338 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4339 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4340
4341 if (!max_work_group_size) {
4342 /* This is a variable group size compute shader,
4343 * compile it for the maximum possible group size.
4344 */
4345 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4346 }
4347 return max_work_group_size;
4348 }
4349
4350 static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
4351 struct si_function_info *fninfo,
4352 bool assign_params)
4353 {
4354 LLVMTypeRef const_shader_buf_type;
4355
4356 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
4357 ctx->shader->selector->info.shader_buffers_declared == 0)
4358 const_shader_buf_type = ctx->f32;
4359 else
4360 const_shader_buf_type = ctx->v4i32;
4361
4362 unsigned const_and_shader_buffers =
4363 add_arg(fninfo, ARG_SGPR,
4364 si_const_array(const_shader_buf_type, 0));
4365
4366 unsigned samplers_and_images =
4367 add_arg(fninfo, ARG_SGPR,
4368 si_const_array(ctx->v8i32,
4369 SI_NUM_IMAGES + SI_NUM_SAMPLERS * 2));
4370
4371 if (assign_params) {
4372 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4373 ctx->param_samplers_and_images = samplers_and_images;
4374 }
4375 }
4376
4377 static void declare_global_desc_pointers(struct si_shader_context *ctx,
4378 struct si_function_info *fninfo)
4379 {
4380 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4381 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS));
4382 ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4383 si_const_array(ctx->v8i32, 0));
4384 }
4385
4386 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4387 struct si_function_info *fninfo)
4388 {
4389 ctx->param_vertex_buffers = add_arg(fninfo, ARG_SGPR,
4390 si_const_array(ctx->v4i32, SI_NUM_VERTEX_BUFFERS));
4391 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4392 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4393 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4394 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4395 }
4396
4397 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4398 struct si_function_info *fninfo,
4399 unsigned *num_prolog_vgprs)
4400 {
4401 struct si_shader *shader = ctx->shader;
4402
4403 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4404 if (shader->key.as_ls) {
4405 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4406 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4407 } else {
4408 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4409 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4410 }
4411 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4412
4413 if (!shader->is_gs_copy_shader) {
4414 /* Vertex load indices. */
4415 ctx->param_vertex_index0 = fninfo->num_params;
4416 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4417 add_arg(fninfo, ARG_VGPR, ctx->i32);
4418 *num_prolog_vgprs += shader->selector->info.num_inputs;
4419 }
4420 }
4421
4422 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4423 struct si_function_info *fninfo)
4424 {
4425 ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4426 ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4427 ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4428 ctx->param_tes_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4429 }
4430
/* Internal shader-type values for GFX9 merged stages. They start at
 * PIPE_SHADER_TYPES so they never collide with the real pipe shader types.
 */
enum {
	/* Convenient merged shader definitions. */
	SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
	SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
4436
4437 static void create_function(struct si_shader_context *ctx)
4438 {
4439 struct si_shader *shader = ctx->shader;
4440 struct si_function_info fninfo;
4441 LLVMTypeRef returns[16+32*4];
4442 unsigned i, num_return_sgprs;
4443 unsigned num_returns = 0;
4444 unsigned num_prolog_vgprs = 0;
4445 unsigned type = ctx->type;
4446 unsigned vs_blit_property =
4447 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
4448
4449 si_init_function_info(&fninfo);
4450
4451 /* Set MERGED shaders. */
4452 if (ctx->screen->info.chip_class >= GFX9) {
4453 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4454 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4455 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4456 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4457 }
4458
4459 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4460
4461 switch (type) {
4462 case PIPE_SHADER_VERTEX:
4463 declare_global_desc_pointers(ctx, &fninfo);
4464
4465 if (vs_blit_property) {
4466 ctx->param_vs_blit_inputs = fninfo.num_params;
4467 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
4468 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
4469 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* depth */
4470
4471 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
4472 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color0 */
4473 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color1 */
4474 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color2 */
4475 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* color3 */
4476 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
4477 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
4478 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
4479 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
4480 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
4481 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
4482 add_arg(&fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
4483 }
4484
4485 /* VGPRs */
4486 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4487 break;
4488 }
4489
4490 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4491 declare_vs_specific_input_sgprs(ctx, &fninfo);
4492
4493 if (shader->key.as_es) {
4494 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4495 } else if (shader->key.as_ls) {
4496 /* no extra parameters */
4497 } else {
4498 if (shader->is_gs_copy_shader) {
4499 fninfo.num_params = ctx->param_rw_buffers + 1;
4500 fninfo.num_sgpr_params = fninfo.num_params;
4501 }
4502
4503 /* The locations of the other parameters are assigned dynamically. */
4504 declare_streamout_params(ctx, &shader->selector->so,
4505 &fninfo);
4506 }
4507
4508 /* VGPRs */
4509 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4510 break;
4511
4512 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4513 declare_global_desc_pointers(ctx, &fninfo);
4514 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4515 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4516 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4517 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4518 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4519 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4520 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4521 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4522 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4523
4524 /* VGPRs */
4525 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4526 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4527
4528 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4529 * placed after the user SGPRs.
4530 */
4531 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4532 returns[num_returns++] = ctx->i32; /* SGPRs */
4533 for (i = 0; i < 11; i++)
4534 returns[num_returns++] = ctx->f32; /* VGPRs */
4535 break;
4536
4537 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4538 /* Merged stages have 8 system SGPRs at the beginning. */
4539 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_LO_HS */
4540 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* SPI_SHADER_USER_DATA_ADDR_HI_HS */
4541 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4542 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4543 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4544 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4545 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4546 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4547
4548 declare_global_desc_pointers(ctx, &fninfo);
4549 declare_per_stage_desc_pointers(ctx, &fninfo,
4550 ctx->type == PIPE_SHADER_VERTEX);
4551 declare_vs_specific_input_sgprs(ctx, &fninfo);
4552
4553 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4554 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4555 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4556 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4557 ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4558 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4559
4560 declare_per_stage_desc_pointers(ctx, &fninfo,
4561 ctx->type == PIPE_SHADER_TESS_CTRL);
4562
4563 /* VGPRs (first TCS, then VS) */
4564 ctx->param_tcs_patch_id = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4565 ctx->param_tcs_rel_ids = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4566
4567 if (ctx->type == PIPE_SHADER_VERTEX) {
4568 declare_vs_input_vgprs(ctx, &fninfo,
4569 &num_prolog_vgprs);
4570
4571 /* LS return values are inputs to the TCS main shader part. */
4572 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4573 returns[num_returns++] = ctx->i32; /* SGPRs */
4574 for (i = 0; i < 2; i++)
4575 returns[num_returns++] = ctx->f32; /* VGPRs */
4576 } else {
4577 /* TCS return values are inputs to the TCS epilog.
4578 *
4579 * param_tcs_offchip_offset, param_tcs_factor_offset,
4580 * param_tcs_offchip_layout, and param_rw_buffers
4581 * should be passed to the epilog.
4582 */
4583 for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++)
4584 returns[num_returns++] = ctx->i32; /* SGPRs */
4585 for (i = 0; i < 11; i++)
4586 returns[num_returns++] = ctx->f32; /* VGPRs */
4587 }
4588 break;
4589
4590 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4591 /* Merged stages have 8 system SGPRs at the beginning. */
4592 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_LO_GS) */
4593 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_USER_DATA_ADDR_HI_GS) */
4594 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4595 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4596 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4597 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4598 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4599 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4600
4601 declare_global_desc_pointers(ctx, &fninfo);
4602 declare_per_stage_desc_pointers(ctx, &fninfo,
4603 (ctx->type == PIPE_SHADER_VERTEX ||
4604 ctx->type == PIPE_SHADER_TESS_EVAL));
4605 if (ctx->type == PIPE_SHADER_VERTEX) {
4606 declare_vs_specific_input_sgprs(ctx, &fninfo);
4607 } else {
4608 /* TESS_EVAL (and also GEOMETRY):
4609 * Declare as many input SGPRs as the VS has. */
4610 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4611 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4612 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4613 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4614 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4615 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4616 }
4617
4618 declare_per_stage_desc_pointers(ctx, &fninfo,
4619 ctx->type == PIPE_SHADER_GEOMETRY);
4620
4621 /* VGPRs (first GS, then VS/TES) */
4622 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4623 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4624 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4625 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4626 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4627
4628 if (ctx->type == PIPE_SHADER_VERTEX) {
4629 declare_vs_input_vgprs(ctx, &fninfo,
4630 &num_prolog_vgprs);
4631 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4632 declare_tes_input_vgprs(ctx, &fninfo);
4633 }
4634
4635 if (ctx->type == PIPE_SHADER_VERTEX ||
4636 ctx->type == PIPE_SHADER_TESS_EVAL) {
4637 /* ES return values are inputs to GS. */
4638 for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
4639 returns[num_returns++] = ctx->i32; /* SGPRs */
4640 for (i = 0; i < 5; i++)
4641 returns[num_returns++] = ctx->f32; /* VGPRs */
4642 }
4643 break;
4644
4645 case PIPE_SHADER_TESS_EVAL:
4646 declare_global_desc_pointers(ctx, &fninfo);
4647 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4648 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4649 ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4650
4651 if (shader->key.as_es) {
4652 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4653 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4654 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4655 } else {
4656 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4657 declare_streamout_params(ctx, &shader->selector->so,
4658 &fninfo);
4659 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4660 }
4661
4662 /* VGPRs */
4663 declare_tes_input_vgprs(ctx, &fninfo);
4664 break;
4665
4666 case PIPE_SHADER_GEOMETRY:
4667 declare_global_desc_pointers(ctx, &fninfo);
4668 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4669 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4670 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4671
4672 /* VGPRs */
4673 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]);
4674 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]);
4675 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4676 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]);
4677 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]);
4678 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]);
4679 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]);
4680 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4681 break;
4682
4683 case PIPE_SHADER_FRAGMENT:
4684 declare_global_desc_pointers(ctx, &fninfo);
4685 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4686 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4687 add_arg_checked(&fninfo, ARG_SGPR, ctx->i32, SI_PARAM_PRIM_MASK);
4688
4689 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4690 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4691 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4692 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4693 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4694 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4695 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4696 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4697 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4698 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4699 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4700 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4701 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4702 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4703 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4704 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4705 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4706 &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4707 shader->info.face_vgpr_index = 20;
4708 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4709 &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4710 shader->info.ancillary_vgpr_index = 21;
4711 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4712 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4713 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4714
4715 /* Color inputs from the prolog. */
4716 if (shader->selector->info.colors_read) {
4717 unsigned num_color_elements =
4718 util_bitcount(shader->selector->info.colors_read);
4719
4720 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4721 for (i = 0; i < num_color_elements; i++)
4722 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4723
4724 num_prolog_vgprs += num_color_elements;
4725 }
4726
4727 /* Outputs for the epilog. */
4728 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4729 num_returns =
4730 num_return_sgprs +
4731 util_bitcount(shader->selector->info.colors_written) * 4 +
4732 shader->selector->info.writes_z +
4733 shader->selector->info.writes_stencil +
4734 shader->selector->info.writes_samplemask +
4735 1 /* SampleMaskIn */;
4736
4737 num_returns = MAX2(num_returns,
4738 num_return_sgprs +
4739 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4740
4741 for (i = 0; i < num_return_sgprs; i++)
4742 returns[i] = ctx->i32;
4743 for (; i < num_returns; i++)
4744 returns[i] = ctx->f32;
4745 break;
4746
4747 case PIPE_SHADER_COMPUTE:
4748 declare_global_desc_pointers(ctx, &fninfo);
4749 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4750 if (shader->selector->info.uses_grid_size)
4751 ctx->param_grid_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4752 if (shader->selector->info.uses_block_size)
4753 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4754
4755 for (i = 0; i < 3; i++) {
4756 ctx->param_block_id[i] = -1;
4757 if (shader->selector->info.uses_block_id[i])
4758 ctx->param_block_id[i] = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4759 }
4760
4761 ctx->param_thread_id = add_arg(&fninfo, ARG_VGPR, v3i32);
4762 break;
4763 default:
4764 assert(0 && "unimplemented shader");
4765 return;
4766 }
4767
4768 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4769 si_get_max_workgroup_size(shader));
4770
4771 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4772 if (ctx->type == PIPE_SHADER_FRAGMENT &&
4773 ctx->separate_prolog) {
4774 si_llvm_add_attribute(ctx->main_fn,
4775 "InitialPSInputAddr",
4776 S_0286D0_PERSP_SAMPLE_ENA(1) |
4777 S_0286D0_PERSP_CENTER_ENA(1) |
4778 S_0286D0_PERSP_CENTROID_ENA(1) |
4779 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4780 S_0286D0_LINEAR_CENTER_ENA(1) |
4781 S_0286D0_LINEAR_CENTROID_ENA(1) |
4782 S_0286D0_FRONT_FACE_ENA(1) |
4783 S_0286D0_ANCILLARY_ENA(1) |
4784 S_0286D0_POS_FIXED_PT_ENA(1));
4785 }
4786
4787 shader->info.num_input_sgprs = 0;
4788 shader->info.num_input_vgprs = 0;
4789
4790 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4791 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4;
4792
4793 for (; i < fninfo.num_params; ++i)
4794 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4;
4795
4796 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4797 shader->info.num_input_vgprs -= num_prolog_vgprs;
4798
4799 if (shader->key.as_ls ||
4800 ctx->type == PIPE_SHADER_TESS_CTRL ||
4801 /* GFX9 has the ESGS ring buffer in LDS. */
4802 type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY)
4803 ac_declare_lds_as_pointer(&ctx->ac);
4804 }
4805
4806 /**
4807 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4808 * for later use.
4809 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	LLVMBuilderRef builder = ctx->ac.builder;

	/* Pointer to the driver-internal array of RW buffer descriptors. */
	LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
					    ctx->param_rw_buffers);

	/* On <= VI, the ESGS ring is a memory buffer, so ES stages and GS
	 * need its descriptor.  (On GFX9 the ESGS ring is in LDS — see the
	 * ac_declare_lds_as_pointer call in the function-creation code — so
	 * this path is skipped there.)
	 */
	if (ctx->screen->info.chip_class <= VI &&
	    (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							  : SI_ES_RING_ESGS;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);

		ctx->esgs_ring =
			ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
	}

	if (ctx->shader->is_gs_copy_shader) {
		/* The GS copy shader reads vertex data back from the GSVS
		 * ring; it only uses gsvs_ring[0]. */
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);

		ctx->gsvs_ring[0] =
			ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
		const struct si_shader_selector *sel = ctx->shader->selector;
		LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
		LLVMValueRef base_ring;

		base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);

		/* The conceptual layout of the GSVS ring is
		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
		 * but the real memory layout is swizzled across
		 * threads:
		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
		 *   t16v0c0 ..
		 * Override the buffer descriptor accordingly.
		 */
		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
		uint64_t stream_offset = 0;

		/* Build one patched descriptor per vertex stream that has
		 * any outputs; streams without outputs keep no descriptor. */
		for (unsigned stream = 0; stream < 4; ++stream) {
			unsigned num_components;
			unsigned stride;
			unsigned num_records;
			LLVMValueRef ring, tmp;

			num_components = sel->info.num_stream_output_components[stream];
			if (!num_components)
				continue;

			stride = 4 * num_components * sel->gs_max_out_vertices;

			/* Limit on the stride field for <= CIK. */
			assert(stride < (1 << 14));

			num_records = 64;

			/* View dwords 0-1 as one 64-bit value and add this
			 * stream's byte offset to the base address. */
			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
			tmp = LLVMBuildAdd(builder, tmp,
					   LLVMConstInt(ctx->i64,
							stream_offset, 0), "");
			stream_offset += stride * 64;

			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
			/* dword1: OR in the stride and enable swizzled
			 * addressing for the per-thread interleaving. */
			tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
			tmp = LLVMBuildOr(builder, tmp,
					  LLVMConstInt(ctx->i32,
						       S_008F04_STRIDE(stride) |
						       S_008F04_SWIZZLE_ENABLE(1), 0), "");
			ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
			/* dword2: number of records. */
			ring = LLVMBuildInsertElement(builder, ring,
					LLVMConstInt(ctx->i32, num_records, 0),
					LLVMConstInt(ctx->i32, 2, 0), "");
			/* dword3: data format and swizzle parameters. */
			ring = LLVMBuildInsertElement(builder, ring,
				LLVMConstInt(ctx->i32,
					     S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
					     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
					     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
					     S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
					     S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
					     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
					     S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
					     S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
					     S_008F0C_ADD_TID_ENABLE(1),
					     0),
				LLVMConstInt(ctx->i32, 3, 0), "");

			ctx->gsvs_ring[stream] = ring;
		}
	}
}
4904
4905 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
4906 LLVMValueRef param_rw_buffers,
4907 unsigned param_pos_fixed_pt)
4908 {
4909 LLVMBuilderRef builder = ctx->ac.builder;
4910 LLVMValueRef slot, desc, offset, row, bit, address[2];
4911
4912 /* Use the fixed-point gl_FragCoord input.
4913 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
4914 * per coordinate to get the repeating effect.
4915 */
4916 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
4917 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
4918
4919 /* Load the buffer descriptor. */
4920 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
4921 desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
4922
4923 /* The stipple pattern is 32x32, each row has 32 bits. */
4924 offset = LLVMBuildMul(builder, address[1],
4925 LLVMConstInt(ctx->i32, 4, 0), "");
4926 row = buffer_load_const(ctx, desc, offset);
4927 row = ac_to_integer(&ctx->ac, row);
4928 bit = LLVMBuildLShr(builder, row, address[0], "");
4929 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
4930 ac_build_kill_if_false(&ctx->ac, bit);
4931 }
4932
/* Parse the register/value config dump that LLVM appends to the shader
 * binary and fill in the corresponding si_shader_config fields.
 *
 * \param binary         compiled shader binary containing the config blob
 * \param conf           output; fields are MAX2-merged so this can be
 *                       called for multiple binaries (e.g. merged stages)
 * \param symbol_offset  offset selecting which symbol's config to read
 */
void si_shader_binary_read_config(struct ac_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		ac_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct ac_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* The config blob is a list of (register, value) pairs, 8 bytes per
	 * entry, stored little-endian. */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* The SGPRS/VGPRS fields count in granules of 8 and
			 * 4 registers respectively; convert to registers. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode = G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn only once per process about registers this
			 * parser doesn't know, to avoid log spam. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* Fall back to INPUT_ENA if LLVM didn't emit INPUT_ADDR. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
5016
5017 void si_shader_apply_scratch_relocs(struct si_shader *shader,
5018 uint64_t scratch_va)
5019 {
5020 unsigned i;
5021 uint32_t scratch_rsrc_dword0 = scratch_va;
5022 uint32_t scratch_rsrc_dword1 =
5023 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5024
5025 /* Enable scratch coalescing. */
5026 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5027
5028 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5029 const struct ac_shader_reloc *reloc =
5030 &shader->binary.relocs[i];
5031 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5032 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5033 &scratch_rsrc_dword0, 4);
5034 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5035 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5036 &scratch_rsrc_dword1, 4);
5037 }
5038 }
5039 }
5040
5041 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
5042 {
5043 unsigned size = shader->binary.code_size;
5044
5045 if (shader->prolog)
5046 size += shader->prolog->binary.code_size;
5047 if (shader->previous_stage)
5048 size += shader->previous_stage->binary.code_size;
5049 if (shader->prolog2)
5050 size += shader->prolog2->binary.code_size;
5051 if (shader->epilog)
5052 size += shader->epilog->binary.code_size;
5053 return size;
5054 }
5055
5056 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5057 {
5058 const struct ac_shader_binary *prolog =
5059 shader->prolog ? &shader->prolog->binary : NULL;
5060 const struct ac_shader_binary *previous_stage =
5061 shader->previous_stage ? &shader->previous_stage->binary : NULL;
5062 const struct ac_shader_binary *prolog2 =
5063 shader->prolog2 ? &shader->prolog2->binary : NULL;
5064 const struct ac_shader_binary *epilog =
5065 shader->epilog ? &shader->epilog->binary : NULL;
5066 const struct ac_shader_binary *mainb = &shader->binary;
5067 unsigned bo_size = si_get_shader_binary_size(shader) +
5068 (!epilog ? mainb->rodata_size : 0);
5069 unsigned char *ptr;
5070
5071 assert(!prolog || !prolog->rodata_size);
5072 assert(!previous_stage || !previous_stage->rodata_size);
5073 assert(!prolog2 || !prolog2->rodata_size);
5074 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
5075 !mainb->rodata_size);
5076 assert(!epilog || !epilog->rodata_size);
5077
5078 r600_resource_reference(&shader->bo, NULL);
5079 shader->bo = (struct r600_resource*)
5080 si_aligned_buffer_create(&sscreen->b,
5081 sscreen->cpdma_prefetch_writes_memory ?
5082 0 : R600_RESOURCE_FLAG_READ_ONLY,
5083 PIPE_USAGE_IMMUTABLE,
5084 align(bo_size, SI_CPDMA_ALIGNMENT),
5085 256);
5086 if (!shader->bo)
5087 return -ENOMEM;
5088
5089 /* Upload. */
5090 ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
5091 PIPE_TRANSFER_READ_WRITE |
5092 PIPE_TRANSFER_UNSYNCHRONIZED);
5093
5094 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
5095 * endian-independent. */
5096 if (prolog) {
5097 memcpy(ptr, prolog->code, prolog->code_size);
5098 ptr += prolog->code_size;
5099 }
5100 if (previous_stage) {
5101 memcpy(ptr, previous_stage->code, previous_stage->code_size);
5102 ptr += previous_stage->code_size;
5103 }
5104 if (prolog2) {
5105 memcpy(ptr, prolog2->code, prolog2->code_size);
5106 ptr += prolog2->code_size;
5107 }
5108
5109 memcpy(ptr, mainb->code, mainb->code_size);
5110 ptr += mainb->code_size;
5111
5112 if (epilog)
5113 memcpy(ptr, epilog->code, epilog->code_size);
5114 else if (mainb->rodata_size > 0)
5115 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5116
5117 sscreen->ws->buffer_unmap(shader->bo->buf);
5118 return 0;
5119 }
5120
5121 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5122 struct pipe_debug_callback *debug,
5123 const char *name, FILE *file)
5124 {
5125 char *line, *p;
5126 unsigned i, count;
5127
5128 if (binary->disasm_string) {
5129 fprintf(file, "Shader %s disassembly:\n", name);
5130 fprintf(file, "%s", binary->disasm_string);
5131
5132 if (debug && debug->debug_message) {
5133 /* Very long debug messages are cut off, so send the
5134 * disassembly one line at a time. This causes more
5135 * overhead, but on the plus side it simplifies
5136 * parsing of resulting logs.
5137 */
5138 pipe_debug_message(debug, SHADER_INFO,
5139 "Shader Disassembly Begin");
5140
5141 line = binary->disasm_string;
5142 while (*line) {
5143 p = util_strchrnul(line, '\n');
5144 count = p - line;
5145
5146 if (count) {
5147 pipe_debug_message(debug, SHADER_INFO,
5148 "%.*s", count, line);
5149 }
5150
5151 if (!*p)
5152 break;
5153 line = p + 1;
5154 }
5155
5156 pipe_debug_message(debug, SHADER_INFO,
5157 "Shader Disassembly End");
5158 }
5159 } else {
5160 fprintf(file, "Shader %s binary:\n", name);
5161 for (i = 0; i < binary->code_size; i += 4) {
5162 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5163 binary->code[i + 3], binary->code[i + 2],
5164 binary->code[i + 1], binary->code[i]);
5165 }
5166 }
5167 }
5168
/* Print shader resource statistics (registers, LDS, scratch, code size)
 * and an estimated per-SIMD wave occupancy to \p file, and report the
 * same numbers through the debug callback.
 *
 * \param check_debug_option  when true, only print to \p file if the
 *                            corresponding debug option is enabled
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 const struct si_shader *shader,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file,
				 bool check_debug_option)
{
	const struct si_shader_config *conf = &shader->config;
	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
	unsigned code_size = si_get_shader_binary_size(shader);
	/* LDS allocation granularity: 512 bytes on CIK+, 256 before. */
	unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves;

	switch (sscreen->info.family) {
	/* These always have 8 waves: */
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
		max_simd_waves = 8;
		break;
	default:
		max_simd_waves = 10;
	}

	/* Compute LDS usage for PS.  Stages other than PS and compute leave
	 * lds_per_wave at 0 (see the comment below). */
	switch (processor) {
	case PIPE_SHADER_FRAGMENT:
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
		break;
	case PIPE_SHADER_COMPUTE:
		if (shader->selector) {
			unsigned max_workgroup_size =
				si_get_max_workgroup_size(shader);
			/* Spread the workgroup's LDS over its waves. */
			lds_per_wave = (conf->lds_size * lds_increment) /
				       DIV_ROUND_UP(max_workgroup_size, 64);
		}
		break;
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* SGPR file per SIMD: 800 entries on VI+, 512 before. */
		if (sscreen->info.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	/* 256 VGPRs per SIMD lane. */
	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
	 * 16KB makes some SIMDs unoccupied). */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (!check_debug_option ||
	    si_can_dump_shader(sscreen, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Private memory VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs,
			conf->private_mem_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Always report via the debug callback, regardless of the option. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs, conf->private_mem_vgprs);
}
5272
5273 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5274 {
5275 switch (processor) {
5276 case PIPE_SHADER_VERTEX:
5277 if (shader->key.as_es)
5278 return "Vertex Shader as ES";
5279 else if (shader->key.as_ls)
5280 return "Vertex Shader as LS";
5281 else
5282 return "Vertex Shader as VS";
5283 case PIPE_SHADER_TESS_CTRL:
5284 return "Tessellation Control Shader";
5285 case PIPE_SHADER_TESS_EVAL:
5286 if (shader->key.as_es)
5287 return "Tessellation Evaluation Shader as ES";
5288 else
5289 return "Tessellation Evaluation Shader as VS";
5290 case PIPE_SHADER_GEOMETRY:
5291 if (shader->is_gs_copy_shader)
5292 return "GS Copy Shader as VS";
5293 else
5294 return "Geometry Shader";
5295 case PIPE_SHADER_FRAGMENT:
5296 return "Pixel Shader";
5297 case PIPE_SHADER_COMPUTE:
5298 return "Compute Shader";
5299 default:
5300 return "Unknown Shader";
5301 }
5302 }
5303
5304 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5305 struct pipe_debug_callback *debug, unsigned processor,
5306 FILE *file, bool check_debug_option)
5307 {
5308 if (!check_debug_option ||
5309 si_can_dump_shader(sscreen, processor))
5310 si_dump_shader_key(processor, shader, file);
5311
5312 if (!check_debug_option && shader->binary.llvm_ir_string) {
5313 if (shader->previous_stage &&
5314 shader->previous_stage->binary.llvm_ir_string) {
5315 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5316 si_get_shader_name(shader, processor));
5317 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5318 }
5319
5320 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5321 si_get_shader_name(shader, processor));
5322 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5323 }
5324
5325 if (!check_debug_option ||
5326 (si_can_dump_shader(sscreen, processor) &&
5327 !(sscreen->debug_flags & DBG(NO_ASM)))) {
5328 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5329
5330 if (shader->prolog)
5331 si_shader_dump_disassembly(&shader->prolog->binary,
5332 debug, "prolog", file);
5333 if (shader->previous_stage)
5334 si_shader_dump_disassembly(&shader->previous_stage->binary,
5335 debug, "previous stage", file);
5336 if (shader->prolog2)
5337 si_shader_dump_disassembly(&shader->prolog2->binary,
5338 debug, "prolog2", file);
5339
5340 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5341
5342 if (shader->epilog)
5343 si_shader_dump_disassembly(&shader->epilog->binary,
5344 debug, "epilog", file);
5345 fprintf(file, "\n");
5346 }
5347
5348 si_shader_dump_stats(sscreen, shader, debug, processor, file,
5349 check_debug_option);
5350 }
5351
5352 static int si_compile_llvm(struct si_screen *sscreen,
5353 struct ac_shader_binary *binary,
5354 struct si_shader_config *conf,
5355 LLVMTargetMachineRef tm,
5356 LLVMModuleRef mod,
5357 struct pipe_debug_callback *debug,
5358 unsigned processor,
5359 const char *name)
5360 {
5361 int r = 0;
5362 unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
5363
5364 if (si_can_dump_shader(sscreen, processor)) {
5365 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5366
5367 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
5368 fprintf(stderr, "%s LLVM IR:\n\n", name);
5369 ac_dump_module(mod);
5370 fprintf(stderr, "\n");
5371 }
5372 }
5373
5374 if (sscreen->record_llvm_ir) {
5375 char *ir = LLVMPrintModuleToString(mod);
5376 binary->llvm_ir_string = strdup(ir);
5377 LLVMDisposeMessage(ir);
5378 }
5379
5380 if (!si_replace_shader(count, binary)) {
5381 r = si_llvm_compile(mod, binary, tm, debug);
5382 if (r)
5383 return r;
5384 }
5385
5386 si_shader_binary_read_config(binary, conf, 0);
5387
5388 /* Enable 64-bit and 16-bit denormals, because there is no performance
5389 * cost.
5390 *
5391 * If denormals are enabled, all floating-point output modifiers are
5392 * ignored.
5393 *
5394 * Don't enable denormals for 32-bit floats, because:
5395 * - Floating-point output modifiers would be ignored by the hw.
5396 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5397 * have to stop using those.
5398 * - SI & CI would be very slow.
5399 */
5400 conf->float_mode |= V_00B028_FP_64_DENORMS;
5401
5402 FREE(binary->config);
5403 FREE(binary->global_symbol_offsets);
5404 binary->config = NULL;
5405 binary->global_symbol_offsets = NULL;
5406
5407 /* Some shaders can't have rodata because their binaries can be
5408 * concatenated.
5409 */
5410 if (binary->rodata_size &&
5411 (processor == PIPE_SHADER_VERTEX ||
5412 processor == PIPE_SHADER_TESS_CTRL ||
5413 processor == PIPE_SHADER_TESS_EVAL ||
5414 processor == PIPE_SHADER_FRAGMENT)) {
5415 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5416 return -EINVAL;
5417 }
5418
5419 return r;
5420 }
5421
5422 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5423 {
5424 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5425 LLVMBuildRetVoid(ctx->ac.builder);
5426 else
5427 LLVMBuildRet(ctx->ac.builder, ret);
5428 }
5429
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The GS copy shader reads the vertices that the GS wrote to the GSVS
 * ring, handles streamout, and exports stream-0 outputs as a regular VS.
 * Returns a newly allocated si_shader, or NULL on failure.
 */
struct si_shader *
si_generate_gs_copy_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader_selector *gs_selector,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_context ctx;
	struct si_shader *shader;
	LLVMBuilderRef builder;
	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs_selector->info;
	int i, r;

	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	if (!outputs)
		return NULL;

	shader = CALLOC_STRUCT(si_shader);
	if (!shader) {
		FREE(outputs);
		return NULL;
	}

	/* We can leave the fence as permanently signaled because the GS copy
	 * shader only becomes visible globally after it has been compiled. */
	util_queue_fence_init(&shader->ready);

	shader->selector = gs_selector;
	shader->is_gs_copy_shader = true;

	/* The copy shader runs as a hardware VS. */
	si_init_shader_ctx(&ctx, sscreen, tm);
	ctx.shader = shader;
	ctx.type = PIPE_SHADER_VERTEX;

	builder = ctx.ac.builder;

	create_function(&ctx);
	preload_ring_buffers(&ctx);

	/* Per-vertex byte offset into the GSVS ring. */
	LLVMValueRef voffset =
		lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);

	/* Fetch the vertex stream ID.*/
	LLVMValueRef stream_id;

	if (gs_selector->so.num_outputs)
		stream_id = unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
	else
		stream_id = ctx.i32_0;

	/* Fill in output information. */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		outputs[i].semantic_name = gsinfo->output_semantic_name[i];
		outputs[i].semantic_index = gsinfo->output_semantic_index[i];

		/* Two bits per channel select the vertex stream. */
		for (int chan = 0; chan < 4; chan++) {
			outputs[i].vertex_stream[chan] =
				(gsinfo->output_streams[i] >> (2 * chan)) & 3;
		}
	}

	/* Branch on the stream ID: one basic block per active stream,
	 * falling through to "end" for inactive ones. */
	LLVMBasicBlockRef end_bb;
	LLVMValueRef switch_inst;

	end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
	switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);

	for (int stream = 0; stream < 4; stream++) {
		LLVMBasicBlockRef bb;
		unsigned offset;

		if (!gsinfo->num_stream_output_components[stream])
			continue;

		/* Streams other than 0 only matter for streamout. */
		if (stream > 0 && !gs_selector->so.num_outputs)
			continue;

		bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
		LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
		LLVMPositionBuilderAtEnd(builder, bb);

		/* Fetch vertex data from GSVS ring */
		offset = 0;
		for (i = 0; i < gsinfo->num_outputs; ++i) {
			for (unsigned chan = 0; chan < 4; chan++) {
				/* Channels not written, or written to another
				 * stream, become undef. */
				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
				    outputs[i].vertex_stream[chan] != stream) {
					outputs[i].values[chan] = ctx.bld_base.base.undef;
					continue;
				}

				LLVMValueRef soffset = LLVMConstInt(ctx.i32,
					offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
				offset++;

				outputs[i].values[chan] =
					ac_build_buffer_load(&ctx.ac,
							     ctx.gsvs_ring[0], 1,
							     ctx.i32_0, voffset,
							     soffset, 0, 1, 1,
							     true, false);
			}
		}

		/* Streamout and exports. */
		if (gs_selector->so.num_outputs) {
			si_llvm_emit_streamout(&ctx, outputs,
					       gsinfo->num_outputs,
					       stream);
		}

		/* Only stream 0 is rasterized, so only it exports. */
		if (stream == 0)
			si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);

		LLVMBuildBr(builder, end_bb);
	}

	LLVMPositionBuilderAtEnd(builder, end_bb);

	LLVMBuildRetVoid(ctx.ac.builder);

	ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
	si_llvm_optimize_module(&ctx);

	r = si_compile_llvm(sscreen, &ctx.shader->binary,
			    &ctx.shader->config, ctx.tm,
			    ctx.gallivm.module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx.shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr, true);
		r = si_shader_binary_upload(sscreen, ctx.shader);
	}

	si_llvm_dispose(&ctx);

	FREE(outputs);

	if (r != 0) {
		FREE(shader);
		shader = NULL;
	}
	return shader;
}
5581
5582 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5583 const struct si_vs_prolog_bits *prolog,
5584 const char *prefix, FILE *f)
5585 {
5586 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5587 prefix, prolog->instance_divisor_is_one);
5588 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5589 prefix, prolog->instance_divisor_is_fetched);
5590 fprintf(f, " %s.ls_vgpr_fix = %u\n",
5591 prefix, prolog->ls_vgpr_fix);
5592
5593 fprintf(f, " mono.vs.fix_fetch = {");
5594 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5595 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5596 fprintf(f, "}\n");
5597 }
5598
/* Print the shader key of "shader" (the precompiled-state variant selector)
 * in human-readable form for debugging. Only the key fields relevant to the
 * given processor (shader stage) are printed. */
static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
			       FILE *f)
{
	const struct si_shader_key *key = &shader->key;

	fprintf(f, "SHADER KEY\n");

	switch (processor) {
	case PIPE_SHADER_VERTEX:
		si_dump_shader_key_vs(key, &key->part.vs.prolog,
				      "part.vs.prolog", f);
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " as_ls = %u\n", key->as_ls);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		/* On GFX9 the LS stage is merged into the TCS, so the TCS key
		 * also carries an LS (VS) prolog. */
		if (shader->selector->screen->info.chip_class >= GFX9) {
			si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
					      "part.tcs.ls_prolog", f);
		}
		fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
		fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " as_es = %u\n", key->as_es);
		fprintf(f, " mono.u.vs_export_prim_id = %u\n",
			key->mono.u.vs_export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
		/* The GS copy shader has no meaningful key of its own. */
		if (shader->is_gs_copy_shader)
			break;

		/* On GFX9 the ES stage is merged into the GS; print the VS
		 * prolog only when the ES half is a vertex shader. */
		if (shader->selector->screen->info.chip_class >= GFX9 &&
		    key->part.gs.es->type == PIPE_SHADER_VERTEX) {
			si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
					      "part.gs.vs_prolog", f);
		}
		fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
		break;

	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
		fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
		fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
		fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
		fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
		fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
		fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
		fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
		fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
		fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
		fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
		fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
		fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
		fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
		fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
		fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}

	/* Output-killing optimizations only apply to the last pre-rasterizer
	 * stage (i.e. not an ES or LS part of a merged shader). */
	if ((processor == PIPE_SHADER_GEOMETRY ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_VERTEX) &&
	    !key->as_es && !key->as_ls) {
		fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
		fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
	}
}
5678
5679 static void si_init_shader_ctx(struct si_shader_context *ctx,
5680 struct si_screen *sscreen,
5681 LLVMTargetMachineRef tm)
5682 {
5683 struct lp_build_tgsi_context *bld_base;
5684
5685 si_llvm_context_init(ctx, sscreen, tm);
5686
5687 bld_base = &ctx->bld_base;
5688 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5689
5690 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5691 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5692 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5693
5694 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5695
5696 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5697
5698 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5699 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5700 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5701 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5702
5703 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5704 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5705 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5706 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5707 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5708 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5709 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5710 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].fetch_args = read_invoc_fetch_args;
5711 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5712
5713 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex;
5714 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5715 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5716 }
5717
5718 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5719 {
5720 struct si_shader *shader = ctx->shader;
5721 struct tgsi_shader_info *info = &shader->selector->info;
5722
5723 if ((ctx->type != PIPE_SHADER_VERTEX &&
5724 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5725 shader->key.as_ls ||
5726 shader->key.as_es)
5727 return;
5728
5729 ac_optimize_vs_outputs(&ctx->ac,
5730 ctx->main_fn,
5731 shader->info.vs_output_param_offset,
5732 info->num_outputs,
5733 &shader->info.nr_param_exports);
5734 }
5735
5736 static void si_count_scratch_private_memory(struct si_shader_context *ctx)
5737 {
5738 ctx->shader->config.private_mem_vgprs = 0;
5739
5740 /* Process all LLVM instructions. */
5741 LLVMBasicBlockRef bb = LLVMGetFirstBasicBlock(ctx->main_fn);
5742 while (bb) {
5743 LLVMValueRef next = LLVMGetFirstInstruction(bb);
5744
5745 while (next) {
5746 LLVMValueRef inst = next;
5747 next = LLVMGetNextInstruction(next);
5748
5749 if (LLVMGetInstructionOpcode(inst) != LLVMAlloca)
5750 continue;
5751
5752 LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
5753 /* No idea why LLVM aligns allocas to 4 elements. */
5754 unsigned alignment = LLVMGetAlignment(inst);
5755 unsigned dw_size = align(ac_get_type_size(type) / 4, alignment);
5756 ctx->shader->config.private_mem_vgprs += dw_size;
5757 }
5758 bb = LLVMGetNextBasicBlock(bb);
5759 }
5760 }
5761
5762 static void si_init_exec_full_mask(struct si_shader_context *ctx)
5763 {
5764 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
5765 lp_build_intrinsic(ctx->ac.builder,
5766 "llvm.amdgcn.init.exec", ctx->voidt,
5767 &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
5768 }
5769
5770 static void si_init_exec_from_input(struct si_shader_context *ctx,
5771 unsigned param, unsigned bitoffset)
5772 {
5773 LLVMValueRef args[] = {
5774 LLVMGetParam(ctx->main_fn, param),
5775 LLVMConstInt(ctx->i32, bitoffset, 0),
5776 };
5777 lp_build_intrinsic(ctx->ac.builder,
5778 "llvm.amdgcn.init.exec.from.input",
5779 ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
5780 }
5781
5782 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5783 const struct si_vs_prolog_bits *key)
5784 {
5785 /* VGPR initialization fixup for Vega10 and Raven is always done in the
5786 * VS prolog. */
5787 return sel->vs_needs_prolog || key->ls_vgpr_fix;
5788 }
5789
/* Build the LLVM IR of the main shader part: set up per-stage callbacks,
 * create the function, preload ring buffers, handle GFX9 merged-shader
 * EXEC setup, then translate the TGSI or NIR shader body.
 *
 * \param is_monolithic  true if prolog/epilog parts are compiled into the
 *                       same LLVM module (affects EXEC initialization).
 * \return false on translation failure.
 */
static bool si_compile_tgsi_main(struct si_shader_context *ctx,
				 bool is_monolithic)
{
	struct si_shader *shader = ctx->shader;
	struct si_shader_selector *sel = shader->selector;
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;

	// TODO clean all this up!
	/* Select input-fetch and epilogue callbacks per shader stage. */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		ctx->load_input = declare_input_vs;
		if (shader->key.as_ls)
			ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
		else if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.as_es)
			ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
		else
			ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		ctx->abi.load_inputs = si_nir_load_input_gs;
		ctx->abi.emit_vertex = si_llvm_emit_vertex;
		ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
		bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx->load_input = declare_input_fs;
		ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
		bld_base->emit_epilogue = si_tgsi_emit_epilogue;
		break;
	case PIPE_SHADER_COMPUTE:
		break;
	default:
		assert(!"Unsupported shader type");
		return false;
	}

	ctx->abi.load_ubo = load_ubo;
	ctx->abi.load_ssbo = load_ssbo;

	create_function(ctx);
	preload_ring_buffers(ctx);

	/* For GFX9 merged shaders:
	 * - Set EXEC for the first shader. If the prolog is present, set
	 * EXEC there instead.
	 * - Add a barrier before the second shader.
	 * - In the second shader, reset EXEC to ~0 and wrap the main part in
	 * an if-statement. This is required for correctness in geometry
	 * shaders, to ensure that empty GS waves do not send GS_EMIT and
	 * GS_CUT messages.
	 *
	 * For monolithic merged shaders, the first shader is wrapped in an
	 * if-block together with its prolog in si_build_wrapper_function.
	 */
	if (ctx->screen->info.chip_class >= GFX9) {
		if (!is_monolithic &&
		    sel->info.num_instructions > 1 && /* not empty shader */
		    (shader->key.as_es || shader->key.as_ls) &&
		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
		     (ctx->type == PIPE_SHADER_VERTEX &&
		      !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
			si_init_exec_from_input(ctx,
						ctx->param_merged_wave_info, 0);
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
			   ctx->type == PIPE_SHADER_GEOMETRY) {
			if (!is_monolithic)
				si_init_exec_full_mask(ctx);

			/* The barrier must execute for all shaders in a
			 * threadgroup.
			 */
			si_llvm_emit_barrier(NULL, bld_base, NULL);

			/* Execute the main body only for threads below the
			 * second-shader thread count (bits 8..15 of the
			 * merged wave info SGPR). */
			LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
			LLVMValueRef ena =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
					    ac_get_thread_id(&ctx->ac), num_threads, "");
			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
		}
	}

	/* Allocate storage for the invocation-0 tess factors when all
	 * invocations define them (see tessfactors_are_def_in_all_invocs). */
	if (ctx->type == PIPE_SHADER_TESS_CTRL &&
	    sel->tcs_info.tessfactors_are_def_in_all_invocs) {
		for (unsigned i = 0; i < 6; i++) {
			ctx->invoc0_tess_factors[i] =
				lp_build_alloca_undef(&ctx->gallivm, ctx->i32, "");
		}
	}

	/* One emitted-vertex counter per GS stream (4 streams). */
	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		int i;
		for (i = 0; i < 4; i++) {
			ctx->gs_next_vertex[i] =
				lp_build_alloca(&ctx->gallivm,
						ctx->i32, "");
		}
	}

	if (sel->force_correct_derivs_after_kill) {
		ctx->postponed_kill = lp_build_alloca_undef(&ctx->gallivm, ctx->i1, "");
		/* true = don't kill. */
		LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0),
			       ctx->postponed_kill);
	}

	/* Translate the shader body: TGSI tokens if present, NIR otherwise. */
	if (sel->tokens) {
		if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
			fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
			return false;
		}
	} else {
		if (!si_nir_build_llvm(ctx, sel->nir)) {
			fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
			return false;
		}
	}

	si_llvm_build_ret(ctx, ctx->return_value);
	return true;
}
5926
5927 /**
5928 * Compute the VS prolog key, which contains all the information needed to
5929 * build the VS prolog function, and set shader->info bits where needed.
5930 *
5931 * \param info Shader info of the vertex shader.
5932 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
5933 * \param prolog_key Key of the VS prolog
5934 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
5935 * \param key Output shader part key.
5936 */
5937 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
5938 unsigned num_input_sgprs,
5939 const struct si_vs_prolog_bits *prolog_key,
5940 struct si_shader *shader_out,
5941 union si_shader_part_key *key)
5942 {
5943 memset(key, 0, sizeof(*key));
5944 key->vs_prolog.states = *prolog_key;
5945 key->vs_prolog.num_input_sgprs = num_input_sgprs;
5946 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
5947 key->vs_prolog.as_ls = shader_out->key.as_ls;
5948 key->vs_prolog.as_es = shader_out->key.as_es;
5949
5950 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
5951 key->vs_prolog.as_ls = 1;
5952 key->vs_prolog.num_merged_next_stage_vgprs = 2;
5953 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
5954 key->vs_prolog.as_es = 1;
5955 key->vs_prolog.num_merged_next_stage_vgprs = 5;
5956 }
5957
5958 /* Enable loading the InstanceID VGPR. */
5959 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
5960
5961 if ((key->vs_prolog.states.instance_divisor_is_one |
5962 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
5963 shader_out->info.uses_instanceid = true;
5964 }
5965
5966 /**
5967 * Compute the PS prolog key, which contains all the information needed to
5968 * build the PS prolog function, and set related bits in shader->config.
5969 */
5970 static void si_get_ps_prolog_key(struct si_shader *shader,
5971 union si_shader_part_key *key,
5972 bool separate_prolog)
5973 {
5974 struct tgsi_shader_info *info = &shader->selector->info;
5975
5976 memset(key, 0, sizeof(*key));
5977 key->ps_prolog.states = shader->key.part.ps.prolog;
5978 key->ps_prolog.colors_read = info->colors_read;
5979 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
5980 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
5981 key->ps_prolog.wqm = info->uses_derivatives &&
5982 (key->ps_prolog.colors_read ||
5983 key->ps_prolog.states.force_persp_sample_interp ||
5984 key->ps_prolog.states.force_linear_sample_interp ||
5985 key->ps_prolog.states.force_persp_center_interp ||
5986 key->ps_prolog.states.force_linear_center_interp ||
5987 key->ps_prolog.states.bc_optimize_for_persp ||
5988 key->ps_prolog.states.bc_optimize_for_linear);
5989 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
5990
5991 if (info->colors_read) {
5992 unsigned *color = shader->selector->color_attr_index;
5993
5994 if (shader->key.part.ps.prolog.color_two_side) {
5995 /* BCOLORs are stored after the last input. */
5996 key->ps_prolog.num_interp_inputs = info->num_inputs;
5997 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
5998 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
5999 }
6000
6001 for (unsigned i = 0; i < 2; i++) {
6002 unsigned interp = info->input_interpolate[color[i]];
6003 unsigned location = info->input_interpolate_loc[color[i]];
6004
6005 if (!(info->colors_read & (0xf << i*4)))
6006 continue;
6007
6008 key->ps_prolog.color_attr_index[i] = color[i];
6009
6010 if (shader->key.part.ps.prolog.flatshade_colors &&
6011 interp == TGSI_INTERPOLATE_COLOR)
6012 interp = TGSI_INTERPOLATE_CONSTANT;
6013
6014 switch (interp) {
6015 case TGSI_INTERPOLATE_CONSTANT:
6016 key->ps_prolog.color_interp_vgpr_index[i] = -1;
6017 break;
6018 case TGSI_INTERPOLATE_PERSPECTIVE:
6019 case TGSI_INTERPOLATE_COLOR:
6020 /* Force the interpolation location for colors here. */
6021 if (shader->key.part.ps.prolog.force_persp_sample_interp)
6022 location = TGSI_INTERPOLATE_LOC_SAMPLE;
6023 if (shader->key.part.ps.prolog.force_persp_center_interp)
6024 location = TGSI_INTERPOLATE_LOC_CENTER;
6025
6026 switch (location) {
6027 case TGSI_INTERPOLATE_LOC_SAMPLE:
6028 key->ps_prolog.color_interp_vgpr_index[i] = 0;
6029 shader->config.spi_ps_input_ena |=
6030 S_0286CC_PERSP_SAMPLE_ENA(1);
6031 break;
6032 case TGSI_INTERPOLATE_LOC_CENTER:
6033 key->ps_prolog.color_interp_vgpr_index[i] = 2;
6034 shader->config.spi_ps_input_ena |=
6035 S_0286CC_PERSP_CENTER_ENA(1);
6036 break;
6037 case TGSI_INTERPOLATE_LOC_CENTROID:
6038 key->ps_prolog.color_interp_vgpr_index[i] = 4;
6039 shader->config.spi_ps_input_ena |=
6040 S_0286CC_PERSP_CENTROID_ENA(1);
6041 break;
6042 default:
6043 assert(0);
6044 }
6045 break;
6046 case TGSI_INTERPOLATE_LINEAR:
6047 /* Force the interpolation location for colors here. */
6048 if (shader->key.part.ps.prolog.force_linear_sample_interp)
6049 location = TGSI_INTERPOLATE_LOC_SAMPLE;
6050 if (shader->key.part.ps.prolog.force_linear_center_interp)
6051 location = TGSI_INTERPOLATE_LOC_CENTER;
6052
6053 /* The VGPR assignment for non-monolithic shaders
6054 * works because InitialPSInputAddr is set on the
6055 * main shader and PERSP_PULL_MODEL is never used.
6056 */
6057 switch (location) {
6058 case TGSI_INTERPOLATE_LOC_SAMPLE:
6059 key->ps_prolog.color_interp_vgpr_index[i] =
6060 separate_prolog ? 6 : 9;
6061 shader->config.spi_ps_input_ena |=
6062 S_0286CC_LINEAR_SAMPLE_ENA(1);
6063 break;
6064 case TGSI_INTERPOLATE_LOC_CENTER:
6065 key->ps_prolog.color_interp_vgpr_index[i] =
6066 separate_prolog ? 8 : 11;
6067 shader->config.spi_ps_input_ena |=
6068 S_0286CC_LINEAR_CENTER_ENA(1);
6069 break;
6070 case TGSI_INTERPOLATE_LOC_CENTROID:
6071 key->ps_prolog.color_interp_vgpr_index[i] =
6072 separate_prolog ? 10 : 13;
6073 shader->config.spi_ps_input_ena |=
6074 S_0286CC_LINEAR_CENTROID_ENA(1);
6075 break;
6076 default:
6077 assert(0);
6078 }
6079 break;
6080 default:
6081 assert(0);
6082 }
6083 }
6084 }
6085 }
6086
6087 /**
6088 * Check whether a PS prolog is required based on the key.
6089 */
6090 static bool si_need_ps_prolog(const union si_shader_part_key *key)
6091 {
6092 return key->ps_prolog.colors_read ||
6093 key->ps_prolog.states.force_persp_sample_interp ||
6094 key->ps_prolog.states.force_linear_sample_interp ||
6095 key->ps_prolog.states.force_persp_center_interp ||
6096 key->ps_prolog.states.force_linear_center_interp ||
6097 key->ps_prolog.states.bc_optimize_for_persp ||
6098 key->ps_prolog.states.bc_optimize_for_linear ||
6099 key->ps_prolog.states.poly_stipple ||
6100 key->ps_prolog.states.samplemask_log_ps_iter;
6101 }
6102
6103 /**
6104 * Compute the PS epilog key, which contains all the information needed to
6105 * build the PS epilog function.
6106 */
6107 static void si_get_ps_epilog_key(struct si_shader *shader,
6108 union si_shader_part_key *key)
6109 {
6110 struct tgsi_shader_info *info = &shader->selector->info;
6111 memset(key, 0, sizeof(*key));
6112 key->ps_epilog.colors_written = info->colors_written;
6113 key->ps_epilog.writes_z = info->writes_z;
6114 key->ps_epilog.writes_stencil = info->writes_stencil;
6115 key->ps_epilog.writes_samplemask = info->writes_samplemask;
6116 key->ps_epilog.states = shader->key.part.ps.epilog;
6117 }
6118
6119 /**
6120 * Build the GS prolog function. Rotate the input vertices for triangle strips
6121 * with adjacency.
6122 */
6123 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
6124 union si_shader_part_key *key)
6125 {
6126 unsigned num_sgprs, num_vgprs;
6127 struct si_function_info fninfo;
6128 LLVMBuilderRef builder = ctx->ac.builder;
6129 LLVMTypeRef returns[48];
6130 LLVMValueRef func, ret;
6131
6132 si_init_function_info(&fninfo);
6133
6134 if (ctx->screen->info.chip_class >= GFX9) {
6135 num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
6136 num_vgprs = 5; /* ES inputs are not needed by GS */
6137 } else {
6138 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
6139 num_vgprs = 8;
6140 }
6141
6142 for (unsigned i = 0; i < num_sgprs; ++i) {
6143 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6144 returns[i] = ctx->i32;
6145 }
6146
6147 for (unsigned i = 0; i < num_vgprs; ++i) {
6148 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6149 returns[num_sgprs + i] = ctx->f32;
6150 }
6151
6152 /* Create the function. */
6153 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
6154 &fninfo, 0);
6155 func = ctx->main_fn;
6156
6157 /* Set the full EXEC mask for the prolog, because we are only fiddling
6158 * with registers here. The main shader part will set the correct EXEC
6159 * mask.
6160 */
6161 if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
6162 si_init_exec_full_mask(ctx);
6163
6164 /* Copy inputs to outputs. This should be no-op, as the registers match,
6165 * but it will prevent the compiler from overwriting them unintentionally.
6166 */
6167 ret = ctx->return_value;
6168 for (unsigned i = 0; i < num_sgprs; i++) {
6169 LLVMValueRef p = LLVMGetParam(func, i);
6170 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
6171 }
6172 for (unsigned i = 0; i < num_vgprs; i++) {
6173 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
6174 p = ac_to_float(&ctx->ac, p);
6175 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
6176 }
6177
6178 if (key->gs_prolog.states.tri_strip_adj_fix) {
6179 /* Remap the input vertices for every other primitive. */
6180 const unsigned gfx6_vtx_params[6] = {
6181 num_sgprs,
6182 num_sgprs + 1,
6183 num_sgprs + 3,
6184 num_sgprs + 4,
6185 num_sgprs + 5,
6186 num_sgprs + 6
6187 };
6188 const unsigned gfx9_vtx_params[3] = {
6189 num_sgprs,
6190 num_sgprs + 1,
6191 num_sgprs + 4,
6192 };
6193 LLVMValueRef vtx_in[6], vtx_out[6];
6194 LLVMValueRef prim_id, rotate;
6195
6196 if (ctx->screen->info.chip_class >= GFX9) {
6197 for (unsigned i = 0; i < 3; i++) {
6198 vtx_in[i*2] = unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
6199 vtx_in[i*2+1] = unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
6200 }
6201 } else {
6202 for (unsigned i = 0; i < 6; i++)
6203 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
6204 }
6205
6206 prim_id = LLVMGetParam(func, num_sgprs + 2);
6207 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
6208
6209 for (unsigned i = 0; i < 6; ++i) {
6210 LLVMValueRef base, rotated;
6211 base = vtx_in[i];
6212 rotated = vtx_in[(i + 4) % 6];
6213 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
6214 }
6215
6216 if (ctx->screen->info.chip_class >= GFX9) {
6217 for (unsigned i = 0; i < 3; i++) {
6218 LLVMValueRef hi, out;
6219
6220 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
6221 LLVMConstInt(ctx->i32, 16, 0), "");
6222 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
6223 out = ac_to_float(&ctx->ac, out);
6224 ret = LLVMBuildInsertValue(builder, ret, out,
6225 gfx9_vtx_params[i], "");
6226 }
6227 } else {
6228 for (unsigned i = 0; i < 6; i++) {
6229 LLVMValueRef out;
6230
6231 out = ac_to_float(&ctx->ac, vtx_out[i]);
6232 ret = LLVMBuildInsertValue(builder, ret, out,
6233 gfx6_vtx_params[i], "");
6234 }
6235 }
6236 }
6237
6238 LLVMBuildRet(builder, ret);
6239 }
6240
6241 /**
6242 * Given a list of shader part functions, build a wrapper function that
6243 * runs them in sequence to form a monolithic shader.
6244 */
6245 static void si_build_wrapper_function(struct si_shader_context *ctx,
6246 LLVMValueRef *parts,
6247 unsigned num_parts,
6248 unsigned main_part,
6249 unsigned next_shader_first_part)
6250 {
6251 LLVMBuilderRef builder = ctx->ac.builder;
6252 /* PS epilog has one arg per color component; gfx9 merged shader
6253 * prologs need to forward 32 user SGPRs.
6254 */
6255 struct si_function_info fninfo;
6256 LLVMValueRef initial[64], out[64];
6257 LLVMTypeRef function_type;
6258 unsigned num_first_params;
6259 unsigned num_out, initial_num_out;
6260 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
6261 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
6262 unsigned num_sgprs, num_vgprs;
6263 unsigned gprs;
6264 struct lp_build_if_state if_state;
6265
6266 si_init_function_info(&fninfo);
6267
6268 for (unsigned i = 0; i < num_parts; ++i) {
6269 lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
6270 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6271 }
6272
6273 /* The parameters of the wrapper function correspond to those of the
6274 * first part in terms of SGPRs and VGPRs, but we use the types of the
6275 * main part to get the right types. This is relevant for the
6276 * dereferenceable attribute on descriptor table pointers.
6277 */
6278 num_sgprs = 0;
6279 num_vgprs = 0;
6280
6281 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6282 num_first_params = LLVMCountParamTypes(function_type);
6283
6284 for (unsigned i = 0; i < num_first_params; ++i) {
6285 LLVMValueRef param = LLVMGetParam(parts[0], i);
6286
6287 if (ac_is_sgpr_param(param)) {
6288 assert(num_vgprs == 0);
6289 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6290 } else {
6291 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6292 }
6293 }
6294
6295 gprs = 0;
6296 while (gprs < num_sgprs + num_vgprs) {
6297 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6298 LLVMTypeRef type = LLVMTypeOf(param);
6299 unsigned size = ac_get_type_size(type) / 4;
6300
6301 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6302
6303 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6304 assert(gprs + size <= num_sgprs + num_vgprs &&
6305 (gprs >= num_sgprs || gprs + size <= num_sgprs));
6306
6307 gprs += size;
6308 }
6309
6310 si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6311 si_get_max_workgroup_size(ctx->shader));
6312
6313 if (is_merged_shader(ctx->shader))
6314 si_init_exec_full_mask(ctx);
6315
6316 /* Record the arguments of the function as if they were an output of
6317 * a previous part.
6318 */
6319 num_out = 0;
6320 num_out_sgpr = 0;
6321
6322 for (unsigned i = 0; i < fninfo.num_params; ++i) {
6323 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6324 LLVMTypeRef param_type = LLVMTypeOf(param);
6325 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6326 unsigned size = ac_get_type_size(param_type) / 4;
6327
6328 if (size == 1) {
6329 if (param_type != out_type)
6330 param = LLVMBuildBitCast(builder, param, out_type, "");
6331 out[num_out++] = param;
6332 } else {
6333 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6334
6335 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6336 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6337 param_type = ctx->i64;
6338 }
6339
6340 if (param_type != vector_type)
6341 param = LLVMBuildBitCast(builder, param, vector_type, "");
6342
6343 for (unsigned j = 0; j < size; ++j)
6344 out[num_out++] = LLVMBuildExtractElement(
6345 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6346 }
6347
6348 if (i < fninfo.num_sgpr_params)
6349 num_out_sgpr = num_out;
6350 }
6351
6352 memcpy(initial, out, sizeof(out));
6353 initial_num_out = num_out;
6354 initial_num_out_sgpr = num_out_sgpr;
6355
6356 /* Now chain the parts. */
6357 for (unsigned part = 0; part < num_parts; ++part) {
6358 LLVMValueRef in[48];
6359 LLVMValueRef ret;
6360 LLVMTypeRef ret_type;
6361 unsigned out_idx = 0;
6362 unsigned num_params = LLVMCountParams(parts[part]);
6363
6364 /* Merged shaders are executed conditionally depending
6365 * on the number of enabled threads passed in the input SGPRs. */
6366 if (is_merged_shader(ctx->shader) && part == 0) {
6367 LLVMValueRef ena, count = initial[3];
6368
6369 count = LLVMBuildAnd(builder, count,
6370 LLVMConstInt(ctx->i32, 0x7f, 0), "");
6371 ena = LLVMBuildICmp(builder, LLVMIntULT,
6372 ac_get_thread_id(&ctx->ac), count, "");
6373 lp_build_if(&if_state, &ctx->gallivm, ena);
6374 }
6375
6376 /* Derive arguments for the next part from outputs of the
6377 * previous one.
6378 */
6379 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6380 LLVMValueRef param;
6381 LLVMTypeRef param_type;
6382 bool is_sgpr;
6383 unsigned param_size;
6384 LLVMValueRef arg = NULL;
6385
6386 param = LLVMGetParam(parts[part], param_idx);
6387 param_type = LLVMTypeOf(param);
6388 param_size = ac_get_type_size(param_type) / 4;
6389 is_sgpr = ac_is_sgpr_param(param);
6390
6391 if (is_sgpr) {
6392 #if HAVE_LLVM < 0x0400
6393 LLVMRemoveAttribute(param, LLVMByValAttribute);
6394 #else
6395 unsigned kind_id = LLVMGetEnumAttributeKindForName("byval", 5);
6396 LLVMRemoveEnumAttributeAtIndex(parts[part], param_idx + 1, kind_id);
6397 #endif
6398 lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
6399 }
6400
6401 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6402 assert(is_sgpr || out_idx >= num_out_sgpr);
6403
6404 if (param_size == 1)
6405 arg = out[out_idx];
6406 else
6407 arg = lp_build_gather_values(&ctx->gallivm, &out[out_idx], param_size);
6408
6409 if (LLVMTypeOf(arg) != param_type) {
6410 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6411 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6412 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6413 } else {
6414 arg = LLVMBuildBitCast(builder, arg, param_type, "");
6415 }
6416 }
6417
6418 in[param_idx] = arg;
6419 out_idx += param_size;
6420 }
6421
6422 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6423
6424 if (is_merged_shader(ctx->shader) &&
6425 part + 1 == next_shader_first_part) {
6426 lp_build_endif(&if_state);
6427
6428 /* The second half of the merged shader should use
6429 * the inputs from the toplevel (wrapper) function,
6430 * not the return value from the last call.
6431 *
6432 * That's because the last call was executed condi-
6433 * tionally, so we can't consume it in the main
6434 * block.
6435 */
6436 memcpy(out, initial, sizeof(initial));
6437 num_out = initial_num_out;
6438 num_out_sgpr = initial_num_out_sgpr;
6439 continue;
6440 }
6441
6442 /* Extract the returned GPRs. */
6443 ret_type = LLVMTypeOf(ret);
6444 num_out = 0;
6445 num_out_sgpr = 0;
6446
6447 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6448 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6449
6450 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6451
6452 for (unsigned i = 0; i < ret_size; ++i) {
6453 LLVMValueRef val =
6454 LLVMBuildExtractValue(builder, ret, i, "");
6455
6456 assert(num_out < ARRAY_SIZE(out));
6457 out[num_out++] = val;
6458
6459 if (LLVMTypeOf(val) == ctx->i32) {
6460 assert(num_out_sgpr + 1 == num_out);
6461 num_out_sgpr = num_out;
6462 }
6463 }
6464 }
6465 }
6466
6467 LLVMBuildRetVoid(builder);
6468 }
6469
/**
 * Compile a shader selector's TGSI (or NIR) representation into hardware
 * bytecode for one specific shader variant.
 *
 * For monolithic variants, the prolog/epilog parts are built inline and
 * stitched together with the main part via si_build_wrapper_function();
 * otherwise only the main part is compiled and separate parts are attached
 * later (ctx.separate_prolog).
 *
 * \param sscreen        screen
 * \param tm             LLVM target machine
 * \param shader         the variant to compile (binary/config/info are filled in)
 * \param is_monolithic  build prologs/epilogs into this one binary
 * \param debug          debug callback for shader dumps/messages
 * \return 0 on success, negative on failure
 */
int si_compile_tgsi_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader *shader,
			   bool is_monolithic,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	int r = -1;

	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
	 * conversion fails. */
	if (si_can_dump_shader(sscreen, sel->info.processor) &&
	    !(sscreen->debug_flags & DBG(NO_TGSI))) {
		if (sel->tokens)
			tgsi_dump(sel->tokens, 0);
		else
			nir_print_shader(sel->nir, stderr);
		si_dump_streamout(&sel->so);
	}

	si_init_shader_ctx(&ctx, sscreen, tm);
	si_llvm_context_set_tgsi(&ctx, shader);
	/* Non-monolithic shaders get their prolog/epilog attached at bind
	 * time, so the main part must follow the separate-prolog ABI. */
	ctx.separate_prolog = !is_monolithic;

	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
	       sizeof(shader->info.vs_output_param_offset));

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
		si_llvm_dispose(&ctx);
		return -1;
	}

	if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
		/* Monolithic VS: optional prolog + main part. */
		LLVMValueRef parts[2];
		bool need_prolog = sel->vs_needs_prolog;

		parts[1] = ctx.main_fn;

		if (need_prolog) {
			union si_shader_part_key prolog_key;
			si_get_vs_prolog_key(&sel->info,
					     shader->info.num_input_sgprs,
					     &shader->key.part.vs.prolog,
					     shader, &prolog_key);
			si_build_vs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_build_wrapper_function(&ctx, parts + !need_prolog,
					  1 + need_prolog, need_prolog, 0);
	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
		if (sscreen->info.chip_class >= GFX9) {
			/* GFX9 merges LS into HS: the wrapper combines
			 * [VS prolog?] + LS main + TCS main + TCS epilog.
			 * Note: the main part was compiled above BEFORE the
			 * prolog/epilog, hence the out-of-order parts[] fills.
			 */
			struct si_shader_selector *ls = shader->key.part.tcs.ls;
			LLVMValueRef parts[4];
			bool vs_needs_prolog =
				si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);

			/* TCS main part */
			parts[2] = ctx.main_fn;

			/* TCS epilog */
			union si_shader_part_key tcs_epilog_key;
			memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
			tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
			parts[3] = ctx.main_fn;

			/* VS prolog */
			if (vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&ls->info,
						     shader->info.num_input_sgprs,
						     &shader->key.part.tcs.ls_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				parts[0] = ctx.main_fn;
			}

			/* VS as LS main part */
			struct si_shader shader_ls = {};
			shader_ls.selector = ls;
			shader_ls.key.as_ls = 1;
			shader_ls.key.mono = shader->key.mono;
			shader_ls.key.opt = shader->key.opt;
			si_llvm_context_set_tgsi(&ctx, &shader_ls);

			if (!si_compile_tgsi_main(&ctx, true)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= ls->info.uses_instanceid;
			parts[1] = ctx.main_fn;

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_TESS_CTRL;

			si_build_wrapper_function(&ctx,
						  parts + !vs_needs_prolog,
						  4 - !vs_needs_prolog, 0,
						  vs_needs_prolog ? 2 : 1);
		} else {
			/* Pre-GFX9 TCS: main part + epilog only. */
			LLVMValueRef parts[2];
			union si_shader_part_key epilog_key;

			parts[0] = ctx.main_fn;

			memset(&epilog_key, 0, sizeof(epilog_key));
			epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
			si_build_tcs_epilog_function(&ctx, &epilog_key);
			parts[1] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
		}
	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
		if (ctx.screen->info.chip_class >= GFX9) {
			/* GFX9 merges ES into GS:
			 * [ES prolog?] + ES main + GS prolog + GS main. */
			struct si_shader_selector *es = shader->key.part.gs.es;
			LLVMValueRef es_prolog = NULL;
			LLVMValueRef es_main = NULL;
			LLVMValueRef gs_prolog = NULL;
			LLVMValueRef gs_main = ctx.main_fn;

			/* GS prolog */
			union si_shader_part_key gs_prolog_key;
			memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
			gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			gs_prolog_key.gs_prolog.is_monolithic = true;
			si_build_gs_prolog_function(&ctx, &gs_prolog_key);
			gs_prolog = ctx.main_fn;

			/* ES prolog */
			if (es->vs_needs_prolog) {
				union si_shader_part_key vs_prolog_key;
				si_get_vs_prolog_key(&es->info,
						     shader->info.num_input_sgprs,
						     &shader->key.part.gs.vs_prolog,
						     shader, &vs_prolog_key);
				vs_prolog_key.vs_prolog.is_monolithic = true;
				si_build_vs_prolog_function(&ctx, &vs_prolog_key);
				es_prolog = ctx.main_fn;
			}

			/* ES main part */
			struct si_shader shader_es = {};
			shader_es.selector = es;
			shader_es.key.as_es = 1;
			shader_es.key.mono = shader->key.mono;
			shader_es.key.opt = shader->key.opt;
			si_llvm_context_set_tgsi(&ctx, &shader_es);

			if (!si_compile_tgsi_main(&ctx, true)) {
				si_llvm_dispose(&ctx);
				return -1;
			}
			shader->info.uses_instanceid |= es->info.uses_instanceid;
			es_main = ctx.main_fn;

			/* Reset the shader context. */
			ctx.shader = shader;
			ctx.type = PIPE_SHADER_GEOMETRY;

			/* Prepare the array of shader parts. */
			LLVMValueRef parts[4];
			unsigned num_parts = 0, main_part, next_first_part;

			if (es_prolog)
				parts[num_parts++] = es_prolog;

			parts[main_part = num_parts++] = es_main;
			parts[next_first_part = num_parts++] = gs_prolog;
			parts[num_parts++] = gs_main;

			si_build_wrapper_function(&ctx, parts, num_parts,
						  main_part, next_first_part);
		} else {
			/* Pre-GFX9 GS: prolog + main part. */
			LLVMValueRef parts[2];
			union si_shader_part_key prolog_key;

			parts[1] = ctx.main_fn;

			memset(&prolog_key, 0, sizeof(prolog_key));
			prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
			si_build_gs_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;

			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
		}
	} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
		/* Monolithic PS: [prolog?] + main part + epilog. */
		LLVMValueRef parts[3];
		union si_shader_part_key prolog_key;
		union si_shader_part_key epilog_key;
		bool need_prolog;

		si_get_ps_prolog_key(shader, &prolog_key, false);
		need_prolog = si_need_ps_prolog(&prolog_key);

		parts[need_prolog ? 1 : 0] = ctx.main_fn;

		if (need_prolog) {
			si_build_ps_prolog_function(&ctx, &prolog_key);
			parts[0] = ctx.main_fn;
		}

		si_get_ps_epilog_key(shader, &epilog_key);
		si_build_ps_epilog_function(&ctx, &epilog_key);
		parts[need_prolog ? 2 : 1] = ctx.main_fn;

		si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
					  need_prolog ? 1 : 0, 0);
	}

	si_llvm_optimize_module(&ctx);

	/* Post-optimization transformations and analysis. */
	si_optimize_vs_outputs(&ctx);

	if ((debug && debug->debug_message) ||
	    si_can_dump_shader(sscreen, ctx.type))
		si_count_scratch_private_memory(&ctx);

	/* Compile to bytecode. */
	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
			    ctx.gallivm.module, debug, ctx.type, "TGSI shader");
	si_llvm_dispose(&ctx);
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		return r;
	}

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned wave_size = 64;
		unsigned max_vgprs = 256;
		unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512;
		unsigned max_sgprs_per_wave = 128;
		unsigned max_block_threads = si_get_max_workgroup_size(shader);
		unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);

		/* Per-wave register budget shrinks as more waves must fit
		 * on a SIMD to satisfy the workgroup size. */
		max_vgprs = max_vgprs / min_waves_per_simd;
		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave && !is_merged_shader(shader))
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs.
	 * The counts per SPI_PS_INPUT_ADDR bit mirror the hardware's VGPR
	 * preloading order (I/J pairs are 2 VGPRs, pull-model is 3, the
	 * scalar system values are 1 each).
	 */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = 0;
		shader->info.face_vgpr_index = -1;
		shader->info.ancillary_vgpr_index = -1;

		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) {
			shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
	}

	return 0;
}
6784
6785 /**
6786 * Create, compile and return a shader part (prolog or epilog).
6787 *
6788 * \param sscreen screen
6789 * \param list list of shader parts of the same category
6790 * \param type shader type
6791 * \param key shader part key
6792 * \param prolog whether the part being requested is a prolog
6793 * \param tm LLVM target machine
6794 * \param debug debug callback
6795 * \param build the callback responsible for building the main function
6796 * \return non-NULL on success
6797 */
6798 static struct si_shader_part *
6799 si_get_shader_part(struct si_screen *sscreen,
6800 struct si_shader_part **list,
6801 enum pipe_shader_type type,
6802 bool prolog,
6803 union si_shader_part_key *key,
6804 LLVMTargetMachineRef tm,
6805 struct pipe_debug_callback *debug,
6806 void (*build)(struct si_shader_context *,
6807 union si_shader_part_key *),
6808 const char *name)
6809 {
6810 struct si_shader_part *result;
6811
6812 mtx_lock(&sscreen->shader_parts_mutex);
6813
6814 /* Find existing. */
6815 for (result = *list; result; result = result->next) {
6816 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6817 mtx_unlock(&sscreen->shader_parts_mutex);
6818 return result;
6819 }
6820 }
6821
6822 /* Compile a new one. */
6823 result = CALLOC_STRUCT(si_shader_part);
6824 result->key = *key;
6825
6826 struct si_shader shader = {};
6827 struct si_shader_context ctx;
6828
6829 si_init_shader_ctx(&ctx, sscreen, tm);
6830 ctx.shader = &shader;
6831 ctx.type = type;
6832
6833 switch (type) {
6834 case PIPE_SHADER_VERTEX:
6835 shader.key.as_ls = key->vs_prolog.as_ls;
6836 shader.key.as_es = key->vs_prolog.as_es;
6837 break;
6838 case PIPE_SHADER_TESS_CTRL:
6839 assert(!prolog);
6840 shader.key.part.tcs.epilog = key->tcs_epilog.states;
6841 break;
6842 case PIPE_SHADER_GEOMETRY:
6843 assert(prolog);
6844 break;
6845 case PIPE_SHADER_FRAGMENT:
6846 if (prolog)
6847 shader.key.part.ps.prolog = key->ps_prolog.states;
6848 else
6849 shader.key.part.ps.epilog = key->ps_epilog.states;
6850 break;
6851 default:
6852 unreachable("bad shader part");
6853 }
6854
6855 build(&ctx, key);
6856
6857 /* Compile. */
6858 si_llvm_optimize_module(&ctx);
6859
6860 if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
6861 ctx.ac.module, debug, ctx.type, name)) {
6862 FREE(result);
6863 result = NULL;
6864 goto out;
6865 }
6866
6867 result->next = *list;
6868 *list = result;
6869
6870 out:
6871 si_llvm_dispose(&ctx);
6872 mtx_unlock(&sscreen->shader_parts_mutex);
6873 return result;
6874 }
6875
6876 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
6877 {
6878 LLVMValueRef ptr[2], list;
6879 bool is_merged_shader =
6880 ctx->screen->info.chip_class >= GFX9 &&
6881 (ctx->type == PIPE_SHADER_TESS_CTRL ||
6882 ctx->type == PIPE_SHADER_GEOMETRY ||
6883 ctx->shader->key.as_ls || ctx->shader->key.as_es);
6884
6885 /* Get the pointer to rw buffers. */
6886 ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
6887 ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS_HI);
6888 list = lp_build_gather_values(&ctx->gallivm, ptr, 2);
6889 list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, "");
6890 list = LLVMBuildIntToPtr(ctx->ac.builder, list,
6891 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
6892 return list;
6893 }
6894
6895 /**
6896 * Build the vertex shader prolog function.
6897 *
6898 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6899 * All inputs are returned unmodified. The vertex load indices are
6900 * stored after them, which will be used by the API VS for fetching inputs.
6901 *
6902 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6903 * input_v0,
6904 * input_v1,
6905 * input_v2,
6906 * input_v3,
6907 * (VertexID + BaseVertex),
6908 * (InstanceID + StartInstance),
6909 * (InstanceID / 2 + StartInstance)
6910 */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct si_function_info fninfo;
	LLVMTypeRef *returns;
	LLVMValueRef ret, func;
	int num_returns, i;
	/* When merged with the next stage (GFX9), its system VGPRs come
	 * first; the 4 VS VGPRs follow them. */
	unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
	unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
	LLVMValueRef input_vgprs[9];
	unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
				      num_input_vgprs;
	/* Merged shaders have 8 leading system SGPRs before the user SGPRs. */
	unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;

	si_init_function_info(&fninfo);

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
			 sizeof(LLVMTypeRef));
	num_returns = 0;

	/* Declare input and output SGPRs. */
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		returns[num_returns++] = ctx->i32;
	}

	/* Preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < num_input_vgprs; i++) {
		add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
		returns[num_returns++] = ctx->f32;
	}

	/* Vertex load indices. */
	for (i = 0; i <= key->vs_prolog.last_input; i++)
		returns[num_returns++] = ctx->f32;

	/* Create the function. */
	si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
	func = ctx->main_fn;

	if (key->vs_prolog.num_merged_next_stage_vgprs) {
		/* In a separate-part build, EXEC comes in via an input SGPR
		 * (param 3, bits 0..7 per si_init_exec_from_input). */
		if (!key->vs_prolog.is_monolithic)
			si_init_exec_from_input(ctx, 3, 0);

		if (key->vs_prolog.as_ls &&
		    ctx->screen->has_ls_vgpr_init_bug) {
			/* If there are no HS threads, SPI loads the LS VGPRs
			 * starting at VGPR 0. Shift them back to where they
			 * belong.
			 */
			LLVMValueRef has_hs_threads =
				LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
					      unpack_param(ctx, 3, 8, 8),
					      ctx->i32_0, "");

			/* Walk downward so each source is read before it is
			 * overwritten by the shift. */
			for (i = 4; i > 0; --i) {
				input_vgprs[i + 1] =
					LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
							input_vgprs[i + 1],
							input_vgprs[i - 1], "");
			}
		}
	}

	/* As LS, InstanceID sits one VGPR further (after rel_patch_id);
	 * see the +2 vs +1 offset below. */
	ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
	ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx->return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
	}
	for (i = 0; i < num_input_vgprs; i++) {
		LLVMValueRef p = input_vgprs[i];
		p = ac_to_float(&ctx->ac, p);
		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
					   key->vs_prolog.num_input_sgprs + i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	LLVMValueRef instance_divisor_constbuf = NULL;

	if (key->vs_prolog.states.instance_divisor_is_fetched) {
		/* Non-trivial divisors are stored in a driver-internal
		 * constant buffer, loaded here once for all inputs. */
		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
		LLVMValueRef buf_index =
			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
		instance_divisor_constbuf =
			ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
	}

	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		bool divisor_is_one =
			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
		bool divisor_is_fetched =
			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
		LLVMValueRef index;

		if (divisor_is_one || divisor_is_fetched) {
			LLVMValueRef divisor = ctx->i32_1;

			if (divisor_is_fetched) {
				/* Each divisor is a dword at offset i*4. */
				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
							    LLVMConstInt(ctx->i32, i * 4, 0));
				divisor = ac_to_integer(&ctx->ac, divisor);
			}

			/* InstanceID / Divisor + StartInstance */
			index = get_instance_index_for_fetch(ctx,
							     user_sgpr_base +
							     SI_SGPR_START_INSTANCE,
							     divisor);
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(ctx->ac.builder,
					     ctx->abi.vertex_id,
					     LLVMGetParam(func, user_sgpr_base +
							  SI_SGPR_BASE_VERTEX), "");
		}

		index = ac_to_float(&ctx->ac, index);
		ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
					   fninfo.num_params + i, "");
	}

	si_llvm_build_ret(ctx, ret);
}
7041
7042 static bool si_get_vs_prolog(struct si_screen *sscreen,
7043 LLVMTargetMachineRef tm,
7044 struct si_shader *shader,
7045 struct pipe_debug_callback *debug,
7046 struct si_shader *main_part,
7047 const struct si_vs_prolog_bits *key)
7048 {
7049 struct si_shader_selector *vs = main_part->selector;
7050
7051 if (!si_vs_needs_prolog(vs, key))
7052 return true;
7053
7054 /* Get the prolog. */
7055 union si_shader_part_key prolog_key;
7056 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
7057 key, shader, &prolog_key);
7058
7059 shader->prolog =
7060 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7061 PIPE_SHADER_VERTEX, true, &prolog_key, tm,
7062 debug, si_build_vs_prolog_function,
7063 "Vertex Shader Prolog");
7064 return shader->prolog != NULL;
7065 }
7066
7067 /**
7068 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7069 */
7070 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7071 LLVMTargetMachineRef tm,
7072 struct si_shader *shader,
7073 struct pipe_debug_callback *debug)
7074 {
7075 return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
7076 &shader->key.part.vs.prolog);
7077 }
7078
7079 /**
7080 * Compile the TCS epilog function. This writes tesselation factors to memory
7081 * based on the output primitive type of the tesselator (determined by TES).
7082 */
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key)
{
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef func;

	si_init_function_info(&fninfo);

	/* Declare the epilog's arguments in the exact register order of the
	 * TCS main part; the unnamed add_arg() calls are placeholders kept
	 * only so the named parameters land at the right indices.
	 * NOTE(review): the placeholder meanings mirror the main TCS
	 * signature elsewhere in this file — verify against it when
	 * changing either side. */
	if (ctx->screen->info.chip_class >= GFX9) {
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	} else {
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		add_arg(&fninfo, ARG_SGPR, ctx->i64);
		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_addr_base64k = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
		ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
	}

	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
	unsigned tess_factors_idx =
		add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
	add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */

	for (unsigned i = 0; i < 6; i++)
		add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */

	/* Create the function. */
	si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
			   ctx->screen->info.chip_class >= CIK ? 128 : 64);
	ac_declare_lds_as_pointer(&ctx->ac);
	func = ctx->main_fn;

	/* The 6 tess-factor VGPRs start 3 parameters after REL_PATCH_ID
	 * (after the invocation ID and the LDS offset). */
	LLVMValueRef invoc0_tess_factors[6];
	for (unsigned i = 0; i < 6; i++)
		invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);

	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, tess_factors_idx),
			      LLVMGetParam(func, tess_factors_idx + 1),
			      LLVMGetParam(func, tess_factors_idx + 2),
			      invoc0_tess_factors, invoc0_tess_factors + 4);

	LLVMBuildRetVoid(ctx->ac.builder);
}
7157
7158 /**
7159 * Select and compile (or reuse) TCS parts (epilog).
7160 */
7161 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7162 LLVMTargetMachineRef tm,
7163 struct si_shader *shader,
7164 struct pipe_debug_callback *debug)
7165 {
7166 if (sscreen->info.chip_class >= GFX9) {
7167 struct si_shader *ls_main_part =
7168 shader->key.part.tcs.ls->main_shader_part_ls;
7169
7170 if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
7171 &shader->key.part.tcs.ls_prolog))
7172 return false;
7173
7174 shader->previous_stage = ls_main_part;
7175 }
7176
7177 /* Get the epilog. */
7178 union si_shader_part_key epilog_key;
7179 memset(&epilog_key, 0, sizeof(epilog_key));
7180 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7181
7182 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7183 PIPE_SHADER_TESS_CTRL, false,
7184 &epilog_key, tm, debug,
7185 si_build_tcs_epilog_function,
7186 "Tessellation Control Shader Epilog");
7187 return shader->epilog != NULL;
7188 }
7189
7190 /**
7191 * Select and compile (or reuse) GS parts (prolog).
7192 */
static bool si_shader_select_gs_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	if (sscreen->info.chip_class >= GFX9) {
		/* On GFX9 the ES stage is merged into the GS; select the ES
		 * main part and, when the ES is a vertex shader, its prolog. */
		struct si_shader *es_main_part =
			shader->key.part.gs.es->main_shader_part_es;

		if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
		    !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
				      &shader->key.part.gs.vs_prolog))
			return false;

		shader->previous_stage = es_main_part;
	}

	/* A GS prolog is only needed for the tri-strip adjacency fixup. */
	if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
		return true;

	union si_shader_part_key prolog_key;
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.gs_prolog.states = shader->key.part.gs.prolog;

	/* Stored in prolog2: the GS prolog runs after the merged ES parts. */
	shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
					     PIPE_SHADER_GEOMETRY, true,
					     &prolog_key, tm, debug,
					     si_build_gs_prolog_function,
					     "Geometry Shader Prolog");
	return shader->prolog2 != NULL;
}
7224
7225 /**
7226 * Build the pixel shader prolog function. This handles:
7227 * - two-side color selection and interpolation
7228 * - overriding interpolation parameters for the API PS
7229 * - polygon stippling
7230 *
7231 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7232 * overriden by other states. (e.g. per-sample interpolation)
7233 * Interpolated colors are stored after the preloaded VGPRs.
7234 */
7235 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7236 union si_shader_part_key *key)
7237 {
7238 struct si_function_info fninfo;
7239 LLVMValueRef ret, func;
7240 int num_returns, i, num_color_channels;
7241
7242 assert(si_need_ps_prolog(key));
7243
7244 si_init_function_info(&fninfo);
7245
7246 /* Declare inputs. */
7247 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7248 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7249
7250 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7251 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7252
7253 /* Declare outputs (same as inputs + add colors if needed) */
7254 num_returns = fninfo.num_params;
7255 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7256 for (i = 0; i < num_color_channels; i++)
7257 fninfo.types[num_returns++] = ctx->f32;
7258
7259 /* Create the function. */
7260 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7261 &fninfo, 0);
7262 func = ctx->main_fn;
7263
7264 /* Copy inputs to outputs. This should be no-op, as the registers match,
7265 * but it will prevent the compiler from overwriting them unintentionally.
7266 */
7267 ret = ctx->return_value;
7268 for (i = 0; i < fninfo.num_params; i++) {
7269 LLVMValueRef p = LLVMGetParam(func, i);
7270 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7271 }
7272
7273 /* Polygon stippling. */
7274 if (key->ps_prolog.states.poly_stipple) {
7275 /* POS_FIXED_PT is always last. */
7276 unsigned pos = key->ps_prolog.num_input_sgprs +
7277 key->ps_prolog.num_input_vgprs - 1;
7278 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7279
7280 si_llvm_emit_polygon_stipple(ctx, list, pos);
7281 }
7282
7283 if (key->ps_prolog.states.bc_optimize_for_persp ||
7284 key->ps_prolog.states.bc_optimize_for_linear) {
7285 unsigned i, base = key->ps_prolog.num_input_sgprs;
7286 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7287
7288 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7289 * The hw doesn't compute CENTROID if the whole wave only
7290 * contains fully-covered quads.
7291 *
7292 * PRIM_MASK is after user SGPRs.
7293 */
7294 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7295 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
7296 LLVMConstInt(ctx->i32, 31, 0), "");
7297 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
7298 ctx->i1, "");
7299
7300 if (key->ps_prolog.states.bc_optimize_for_persp) {
7301 /* Read PERSP_CENTER. */
7302 for (i = 0; i < 2; i++)
7303 center[i] = LLVMGetParam(func, base + 2 + i);
7304 /* Read PERSP_CENTROID. */
7305 for (i = 0; i < 2; i++)
7306 centroid[i] = LLVMGetParam(func, base + 4 + i);
7307 /* Select PERSP_CENTROID. */
7308 for (i = 0; i < 2; i++) {
7309 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7310 center[i], centroid[i], "");
7311 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7312 tmp, base + 4 + i, "");
7313 }
7314 }
7315 if (key->ps_prolog.states.bc_optimize_for_linear) {
7316 /* Read LINEAR_CENTER. */
7317 for (i = 0; i < 2; i++)
7318 center[i] = LLVMGetParam(func, base + 8 + i);
7319 /* Read LINEAR_CENTROID. */
7320 for (i = 0; i < 2; i++)
7321 centroid[i] = LLVMGetParam(func, base + 10 + i);
7322 /* Select LINEAR_CENTROID. */
7323 for (i = 0; i < 2; i++) {
7324 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7325 center[i], centroid[i], "");
7326 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7327 tmp, base + 10 + i, "");
7328 }
7329 }
7330 }
7331
7332 /* Force per-sample interpolation. */
7333 if (key->ps_prolog.states.force_persp_sample_interp) {
7334 unsigned i, base = key->ps_prolog.num_input_sgprs;
7335 LLVMValueRef persp_sample[2];
7336
7337 /* Read PERSP_SAMPLE. */
7338 for (i = 0; i < 2; i++)
7339 persp_sample[i] = LLVMGetParam(func, base + i);
7340 /* Overwrite PERSP_CENTER. */
7341 for (i = 0; i < 2; i++)
7342 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7343 persp_sample[i], base + 2 + i, "");
7344 /* Overwrite PERSP_CENTROID. */
7345 for (i = 0; i < 2; i++)
7346 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7347 persp_sample[i], base + 4 + i, "");
7348 }
7349 if (key->ps_prolog.states.force_linear_sample_interp) {
7350 unsigned i, base = key->ps_prolog.num_input_sgprs;
7351 LLVMValueRef linear_sample[2];
7352
7353 /* Read LINEAR_SAMPLE. */
7354 for (i = 0; i < 2; i++)
7355 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7356 /* Overwrite LINEAR_CENTER. */
7357 for (i = 0; i < 2; i++)
7358 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7359 linear_sample[i], base + 8 + i, "");
7360 /* Overwrite LINEAR_CENTROID. */
7361 for (i = 0; i < 2; i++)
7362 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7363 linear_sample[i], base + 10 + i, "");
7364 }
7365
7366 /* Force center interpolation. */
7367 if (key->ps_prolog.states.force_persp_center_interp) {
7368 unsigned i, base = key->ps_prolog.num_input_sgprs;
7369 LLVMValueRef persp_center[2];
7370
7371 /* Read PERSP_CENTER. */
7372 for (i = 0; i < 2; i++)
7373 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7374 /* Overwrite PERSP_SAMPLE. */
7375 for (i = 0; i < 2; i++)
7376 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7377 persp_center[i], base + i, "");
7378 /* Overwrite PERSP_CENTROID. */
7379 for (i = 0; i < 2; i++)
7380 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7381 persp_center[i], base + 4 + i, "");
7382 }
7383 if (key->ps_prolog.states.force_linear_center_interp) {
7384 unsigned i, base = key->ps_prolog.num_input_sgprs;
7385 LLVMValueRef linear_center[2];
7386
7387 /* Read LINEAR_CENTER. */
7388 for (i = 0; i < 2; i++)
7389 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7390 /* Overwrite LINEAR_SAMPLE. */
7391 for (i = 0; i < 2; i++)
7392 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7393 linear_center[i], base + 6 + i, "");
7394 /* Overwrite LINEAR_CENTROID. */
7395 for (i = 0; i < 2; i++)
7396 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7397 linear_center[i], base + 10 + i, "");
7398 }
7399
7400 /* Interpolate colors. */
7401 unsigned color_out_idx = 0;
7402 for (i = 0; i < 2; i++) {
7403 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7404 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7405 key->ps_prolog.face_vgpr_index;
7406 LLVMValueRef interp[2], color[4];
7407 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7408
7409 if (!writemask)
7410 continue;
7411
7412 /* If the interpolation qualifier is not CONSTANT (-1). */
7413 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7414 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7415 key->ps_prolog.color_interp_vgpr_index[i];
7416
7417 /* Get the (i,j) updated by bc_optimize handling. */
7418 interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7419 interp_vgpr, "");
7420 interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7421 interp_vgpr + 1, "");
7422 interp_ij = lp_build_gather_values(&ctx->gallivm, interp, 2);
7423 }
7424
7425 /* Use the absolute location of the input. */
7426 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7427
7428 if (key->ps_prolog.states.color_two_side) {
7429 face = LLVMGetParam(func, face_vgpr);
7430 face = ac_to_integer(&ctx->ac, face);
7431 }
7432
7433 interp_fs_input(ctx,
7434 key->ps_prolog.color_attr_index[i],
7435 TGSI_SEMANTIC_COLOR, i,
7436 key->ps_prolog.num_interp_inputs,
7437 key->ps_prolog.colors_read, interp_ij,
7438 prim_mask, face, color);
7439
7440 while (writemask) {
7441 unsigned chan = u_bit_scan(&writemask);
7442 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
7443 fninfo.num_params + color_out_idx++, "");
7444 }
7445 }
7446
7447 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
7448 * says:
7449 *
7450 * "When per-sample shading is active due to the use of a fragment
7451 * input qualified by sample or due to the use of the gl_SampleID
7452 * or gl_SamplePosition variables, only the bit for the current
7453 * sample is set in gl_SampleMaskIn. When state specifies multiple
7454 * fragment shader invocations for a given fragment, the sample
7455 * mask for any single fragment shader invocation may specify a
7456 * subset of the covered samples for the fragment. In this case,
7457 * the bit corresponding to each covered sample will be set in
7458 * exactly one fragment shader invocation."
7459 *
7460 * The samplemask loaded by hardware is always the coverage of the
7461 * entire pixel/fragment, so mask bits out based on the sample ID.
7462 */
7463 if (key->ps_prolog.states.samplemask_log_ps_iter) {
7464 /* The bit pattern matches that used by fixed function fragment
7465 * processing. */
7466 static const uint16_t ps_iter_masks[] = {
7467 0xffff, /* not used */
7468 0x5555,
7469 0x1111,
7470 0x0101,
7471 0x0001,
7472 };
7473 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
7474
7475 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
7476 unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs +
7477 key->ps_prolog.ancillary_vgpr_index;
7478 LLVMValueRef sampleid = unpack_param(ctx, ancillary_vgpr, 8, 4);
7479 LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1);
7480
7481 samplemask = ac_to_integer(&ctx->ac, samplemask);
7482 samplemask = LLVMBuildAnd(
7483 ctx->ac.builder,
7484 samplemask,
7485 LLVMBuildShl(ctx->ac.builder,
7486 LLVMConstInt(ctx->i32, ps_iter_mask, false),
7487 sampleid, ""),
7488 "");
7489 samplemask = ac_to_float(&ctx->ac, samplemask);
7490
7491 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
7492 ancillary_vgpr + 1, "");
7493 }
7494
7495 /* Tell LLVM to insert WQM instruction sequence when needed. */
7496 if (key->ps_prolog.wqm) {
7497 LLVMAddTargetDependentFunctionAttr(func,
7498 "amdgpu-ps-wqm-outputs", "");
7499 }
7500
7501 si_llvm_build_ret(ctx, ret);
7502 }
7503
/**
 * Build the pixel shader epilog function. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * The epilog receives the API shader's color/depth/stencil/samplemask outputs
 * in VGPRs and turns them into the actual hardware exports.
 */
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key)
{
	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
	struct si_function_info fninfo;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int i;
	struct si_ps_exports exp = {};

	si_init_function_info(&fninfo);

	/* Declare input SGPRs. */
	ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->i64);
	add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);

	/* Declare input VGPRs: 4 per written color, plus one each for Z,
	 * stencil and samplemask when present. */
	unsigned required_num_params =
		     fninfo.num_sgpr_params +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* NOTE(review): the samplemask VGPR presumably always occupies at
	 * least location PS_EPILOG_SAMPLEMASK_MIN_LOC, so declare params up
	 * to there even if the counts above are smaller — confirm against
	 * the prolog/main-part calling convention. */
	required_num_params = MAX2(required_num_params,
				   fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	while (fninfo.num_params < required_num_params)
		add_arg(&fninfo, ARG_VGPR, ctx->f32);

	/* Create the function. */
	si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
	/* Disable elimination of unused inputs. */
	si_llvm_add_attribute(ctx->main_fn,
			      "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = fninfo.num_sgpr_params;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, so that export can be tagged "done".
	 * Only applicable when no Z/stencil/samplemask export follows. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* Otherwise it's the highest MRT that is both written
			 * and has a non-zero export format. */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Consume 4 VGPRs per written color and emit the MRT export. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx->main_fn, vgpr++);

		/* fninfo.num_params - 1 is the index of the last declared
		 * VGPR (used by the callee as the coverage/samplemask input
		 * — see PS_EPILOG_SAMPLEMASK_MIN_LOC above). */
		si_export_mrt_color(bld_base, color, mrt,
				    fninfo.num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx->main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx->main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		/* The hardware requires at least one export per PS. */
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(ctx->ac.builder);
}
7602
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Also patches spi_ps_input_ena to satisfy hardware requirements and to match
 * the interpolation modes the prolog forces.
 *
 * \return false on compilation failure of a needed part.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;

	/* Get the prolog. */
	si_get_ps_prolog_key(shader, &prolog_key, true);

	/* The prolog is a no-op if these aren't set. */
	if (si_need_ps_prolog(&prolog_key)) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   PIPE_SHADER_FRAGMENT, true,
					   &prolog_key, tm, debug,
					   si_build_ps_prolog_function,
					   "Fragment Shader Prolog");
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. The epilog is always required (it does the exports). */
	si_get_ps_epilog_key(shader, &epilog_key);

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   PIPE_SHADER_FRAGMENT, false,
				   &epilog_key, tm, debug,
				   si_build_ps_epilog_function,
				   "Fragment Shader Epilog");
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.part.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed:
	 * replace the center/centroid (i,j) requests with sample (i,j),
	 * matching what the prolog forces. */
	if (shader->key.part.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	/* Likewise for forced center interpolation: replace sample/centroid
	 * requests with center. */
	if (shader->key.part.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.part.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POS_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* Samplemask fixup requires the sample ID. */
	if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
		shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
		assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.part.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7705
7706 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7707 unsigned *lds_size)
7708 {
7709 /* SPI barrier management bug:
7710 * Make sure we have at least 4k of LDS in use to avoid the bug.
7711 * It applies to workgroup sizes of more than one wavefront.
7712 */
7713 if (sscreen->info.family == CHIP_BONAIRE ||
7714 sscreen->info.family == CHIP_KABINI ||
7715 sscreen->info.family == CHIP_MULLINS)
7716 *lds_size = MAX2(*lds_size, 8);
7717 }
7718
7719 static void si_fix_resource_usage(struct si_screen *sscreen,
7720 struct si_shader *shader)
7721 {
7722 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7723
7724 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7725
7726 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7727 si_get_max_workgroup_size(shader) > 64) {
7728 si_multiwave_lds_size_workaround(sscreen,
7729 &shader->config.lds_size);
7730 }
7731 }
7732
/**
 * Create a shader variant: compile it (monolithic) or assemble it from the
 * precompiled main part plus selected prolog/epilog parts, then fix up
 * register usage, dump it for debugging, and upload the binary to the GPU.
 *
 * \return 0 on success, negative on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	/* The main part compiled for this selector/key, if any. */
	struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 *
	 * Vertex shaders are compiled on demand when a vertex fetch
	 * workaround must be applied.
	 */
	if (shader->is_monolithic) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of several parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 *
		 * Starting with gfx9, geometry and tessellation control
		 * shaders also contain the prolog and user shader parts of
		 * the previous shader stage.
		 */

		if (!mainp)
			return -1;

		/* Copy the compiled TGSI shader data over. The binary stays
		 * owned by the main part (see si_shader_destroy, which skips
		 * cleaning shared binaries). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			/* TES has no prolog/epilog parts. */
			break;
		case PIPE_SHADER_GEOMETRY:
			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the combined shader must
		 * satisfy the register requirements of every part. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->previous_stage) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->previous_stage->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->previous_stage->config.num_vgprs);
			shader->config.spilled_sgprs =
				MAX2(shader->config.spilled_sgprs,
				     shader->previous_stage->config.spilled_sgprs);
			shader->config.spilled_vgprs =
				MAX2(shader->config.spilled_vgprs,
				     shader->previous_stage->config.spilled_vgprs);
			shader->config.private_mem_vgprs =
				MAX2(shader->config.private_mem_vgprs,
				     shader->previous_stage->config.private_mem_vgprs);
			shader->config.scratch_bytes_per_wave =
				MAX2(shader->config.scratch_bytes_per_wave,
				     shader->previous_stage->config.scratch_bytes_per_wave);
			shader->info.uses_instanceid |=
				shader->previous_stage->info.uses_instanceid;
		}
		if (shader->prolog2) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog2->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog2->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_resource_usage(sscreen, shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr, true);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7870
7871 void si_shader_destroy(struct si_shader *shader)
7872 {
7873 if (shader->scratch_bo)
7874 r600_resource_reference(&shader->scratch_bo, NULL);
7875
7876 r600_resource_reference(&shader->bo, NULL);
7877
7878 if (!shader->is_binary_shared)
7879 ac_shader_binary_clean(&shader->binary);
7880
7881 free(shader->shader_log);
7882 }