radeonsi: call si_fix_resource_usage for the GS copy shader as well
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "util/u_memory.h"
26 #include "util/u_string.h"
27 #include "tgsi/tgsi_build.h"
28 #include "tgsi/tgsi_util.h"
29 #include "tgsi/tgsi_dump.h"
30
31 #include "ac_exp_param.h"
32 #include "ac_shader_util.h"
33 #include "ac_llvm_util.h"
34 #include "si_shader_internal.h"
35 #include "si_pipe.h"
36 #include "sid.h"
37
38 #include "compiler/nir/nir.h"
39
/* Symbol-name strings for scratch resource descriptor dwords 0 and 1.
 * Their users are outside the part of the file shown here.
 */
static const char *scratch_rsrc_dword0_symbol = "SCRATCH_RSRC_DWORD0";
static const char *scratch_rsrc_dword1_symbol = "SCRATCH_RSRC_DWORD1";
45
46 struct si_shader_output_values
47 {
48 LLVMValueRef values[4];
49 unsigned semantic_name;
50 unsigned semantic_index;
51 ubyte vertex_stream[4];
52 };
53
54 /**
55 * Used to collect types and other info about arguments of the LLVM function
56 * before the function is created.
57 */
58 struct si_function_info {
59 LLVMTypeRef types[100];
60 LLVMValueRef *assign[100];
61 unsigned num_sgpr_params;
62 unsigned num_params;
63 };
64
/* Register file that a function argument is added to (see add_arg_assign). */
enum si_arg_regfile {
	ARG_SGPR,
	ARG_VGPR
};
69
/* Prototypes of static functions that are referenced before their
 * definitions; the definitions are outside the part of the file shown here.
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct ac_llvm_compiler *compiler);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned processor,
			       const struct si_shader *shader,
			       FILE *f);

/* Builders for shader prolog/epilog parts. */
static void si_build_vs_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
					 union si_shader_part_key *key);
static void si_build_ps_prolog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);
static void si_build_ps_epilog_function(struct si_shader_context *ctx,
					union si_shader_part_key *key);

static void si_fix_resource_usage(struct si_screen *sscreen,
				  struct si_shader *shader);
91
/* Minimum location of the sample mask input of the PS epilog. Ideally the
 * sample mask is passed as v14, which is its usual location, so that the
 * shader doesn't have to add a v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
96
97 static bool llvm_type_is_64bit(struct si_shader_context *ctx,
98 LLVMTypeRef type)
99 {
100 if (type == ctx->ac.i64 || type == ctx->ac.f64)
101 return true;
102
103 return false;
104 }
105
106 static bool is_merged_shader(struct si_shader_context *ctx)
107 {
108 if (ctx->screen->info.chip_class <= VI)
109 return false;
110
111 return ctx->shader->key.as_ls ||
112 ctx->shader->key.as_es ||
113 ctx->type == PIPE_SHADER_TESS_CTRL ||
114 ctx->type == PIPE_SHADER_GEOMETRY;
115 }
116
117 static void si_init_function_info(struct si_function_info *fninfo)
118 {
119 fninfo->num_params = 0;
120 fninfo->num_sgpr_params = 0;
121 }
122
123 static unsigned add_arg_assign(struct si_function_info *fninfo,
124 enum si_arg_regfile regfile, LLVMTypeRef type,
125 LLVMValueRef *assign)
126 {
127 assert(regfile != ARG_SGPR || fninfo->num_sgpr_params == fninfo->num_params);
128
129 unsigned idx = fninfo->num_params++;
130 assert(idx < ARRAY_SIZE(fninfo->types));
131
132 if (regfile == ARG_SGPR)
133 fninfo->num_sgpr_params = fninfo->num_params;
134
135 fninfo->types[idx] = type;
136 fninfo->assign[idx] = assign;
137 return idx;
138 }
139
140 static unsigned add_arg(struct si_function_info *fninfo,
141 enum si_arg_regfile regfile, LLVMTypeRef type)
142 {
143 return add_arg_assign(fninfo, regfile, type, NULL);
144 }
145
146 static void add_arg_assign_checked(struct si_function_info *fninfo,
147 enum si_arg_regfile regfile, LLVMTypeRef type,
148 LLVMValueRef *assign, unsigned idx)
149 {
150 MAYBE_UNUSED unsigned actual = add_arg_assign(fninfo, regfile, type, assign);
151 assert(actual == idx);
152 }
153
154 static void add_arg_checked(struct si_function_info *fninfo,
155 enum si_arg_regfile regfile, LLVMTypeRef type,
156 unsigned idx)
157 {
158 add_arg_assign_checked(fninfo, regfile, type, NULL, idx);
159 }
160
161 /**
162 * Returns a unique index for a per-patch semantic name and index. The index
163 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
164 * can be calculated.
165 */
166 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index)
167 {
168 switch (semantic_name) {
169 case TGSI_SEMANTIC_TESSOUTER:
170 return 0;
171 case TGSI_SEMANTIC_TESSINNER:
172 return 1;
173 case TGSI_SEMANTIC_PATCH:
174 assert(index < 30);
175 return 2 + index;
176
177 default:
178 assert(!"invalid semantic name");
179 return 0;
180 }
181 }
182
183 /**
184 * Returns a unique index for a semantic name and index. The index must be
185 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
186 * calculated.
187 */
188 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
189 unsigned is_varying)
190 {
191 switch (semantic_name) {
192 case TGSI_SEMANTIC_POSITION:
193 return 0;
194 case TGSI_SEMANTIC_GENERIC:
195 /* Since some shader stages use the the highest used IO index
196 * to determine the size to allocate for inputs/outputs
197 * (in LDS, tess and GS rings). GENERIC should be placed right
198 * after POSITION to make that size as small as possible.
199 */
200 if (index < SI_MAX_IO_GENERIC)
201 return 1 + index;
202
203 assert(!"invalid generic index");
204 return 0;
205 case TGSI_SEMANTIC_PSIZE:
206 return SI_MAX_IO_GENERIC + 1;
207 case TGSI_SEMANTIC_CLIPDIST:
208 assert(index <= 1);
209 return SI_MAX_IO_GENERIC + 2 + index;
210 case TGSI_SEMANTIC_FOG:
211 return SI_MAX_IO_GENERIC + 4;
212 case TGSI_SEMANTIC_LAYER:
213 return SI_MAX_IO_GENERIC + 5;
214 case TGSI_SEMANTIC_VIEWPORT_INDEX:
215 return SI_MAX_IO_GENERIC + 6;
216 case TGSI_SEMANTIC_PRIMID:
217 return SI_MAX_IO_GENERIC + 7;
218 case TGSI_SEMANTIC_COLOR:
219 assert(index < 2);
220 return SI_MAX_IO_GENERIC + 8 + index;
221 case TGSI_SEMANTIC_BCOLOR:
222 assert(index < 2);
223 /* If it's a varying, COLOR and BCOLOR alias. */
224 if (is_varying)
225 return SI_MAX_IO_GENERIC + 8 + index;
226 else
227 return SI_MAX_IO_GENERIC + 10 + index;
228 case TGSI_SEMANTIC_TEXCOORD:
229 assert(index < 8);
230 STATIC_ASSERT(SI_MAX_IO_GENERIC + 12 + 8 <= 63);
231 return SI_MAX_IO_GENERIC + 12 + index;
232 case TGSI_SEMANTIC_CLIPVERTEX:
233 return 63;
234 default:
235 fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
236 assert(!"invalid semantic name");
237 return 0;
238 }
239 }
240
241 /**
242 * Get the value of a shader input parameter and extract a bitfield.
243 */
244 static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx,
245 LLVMValueRef value, unsigned rshift,
246 unsigned bitwidth)
247 {
248 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
249 value = ac_to_integer(&ctx->ac, value);
250
251 if (rshift)
252 value = LLVMBuildLShr(ctx->ac.builder, value,
253 LLVMConstInt(ctx->i32, rshift, 0), "");
254
255 if (rshift + bitwidth < 32) {
256 unsigned mask = (1 << bitwidth) - 1;
257 value = LLVMBuildAnd(ctx->ac.builder, value,
258 LLVMConstInt(ctx->i32, mask, 0), "");
259 }
260
261 return value;
262 }
263
264 LLVMValueRef si_unpack_param(struct si_shader_context *ctx,
265 unsigned param, unsigned rshift,
266 unsigned bitwidth)
267 {
268 LLVMValueRef value = LLVMGetParam(ctx->main_fn, param);
269
270 return unpack_llvm_param(ctx, value, rshift, bitwidth);
271 }
272
273 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
274 {
275 switch (ctx->type) {
276 case PIPE_SHADER_TESS_CTRL:
277 return unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 0, 8);
278
279 case PIPE_SHADER_TESS_EVAL:
280 return LLVMGetParam(ctx->main_fn,
281 ctx->param_tes_rel_patch_id);
282
283 default:
284 assert(0);
285 return NULL;
286 }
287 }
288
289 /* Tessellation shaders pass outputs to the next shader using LDS.
290 *
291 * LS outputs = TCS inputs
292 * TCS outputs = TES inputs
293 *
294 * The LDS layout is:
295 * - TCS inputs for patch 0
296 * - TCS inputs for patch 1
297 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
298 * - ...
299 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
300 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
301 * - TCS outputs for patch 1
302 * - Per-patch TCS outputs for patch 1
303 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
304 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
305 * - ...
306 *
307 * All three shaders VS(LS), TCS, TES share the same LDS space.
308 */
309
310 static LLVMValueRef
311 get_tcs_in_patch_stride(struct si_shader_context *ctx)
312 {
313 return si_unpack_param(ctx, ctx->param_vs_state_bits, 8, 13);
314 }
315
316 static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context *ctx)
317 {
318 assert(ctx->type == PIPE_SHADER_TESS_CTRL);
319
320 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
321 return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4;
322
323 return util_last_bit64(ctx->shader->selector->outputs_written) * 4;
324 }
325
326 static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx)
327 {
328 unsigned stride = get_tcs_out_vertex_dw_stride_constant(ctx);
329
330 return LLVMConstInt(ctx->i32, stride, 0);
331 }
332
333 static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx)
334 {
335 if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy)
336 return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 0, 13);
337
338 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
339 unsigned tcs_out_vertices = info->properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
340 unsigned vertex_dw_stride = get_tcs_out_vertex_dw_stride_constant(ctx);
341 unsigned num_patch_outputs = util_last_bit64(ctx->shader->selector->patch_outputs_written);
342 unsigned patch_dw_stride = tcs_out_vertices * vertex_dw_stride +
343 num_patch_outputs * 4;
344 return LLVMConstInt(ctx->i32, patch_dw_stride, 0);
345 }
346
347 static LLVMValueRef
348 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
349 {
350 return LLVMBuildMul(ctx->ac.builder,
351 si_unpack_param(ctx,
352 ctx->param_tcs_out_lds_offsets,
353 0, 16),
354 LLVMConstInt(ctx->i32, 4, 0), "");
355 }
356
357 static LLVMValueRef
358 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
359 {
360 return LLVMBuildMul(ctx->ac.builder,
361 si_unpack_param(ctx,
362 ctx->param_tcs_out_lds_offsets,
363 16, 16),
364 LLVMConstInt(ctx->i32, 4, 0), "");
365 }
366
367 static LLVMValueRef
368 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
369 {
370 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
371 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
372
373 return LLVMBuildMul(ctx->ac.builder, patch_stride, rel_patch_id, "");
374 }
375
376 static LLVMValueRef
377 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
378 {
379 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
380 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
381 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
382
383 return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_offset);
384 }
385
386 static LLVMValueRef
387 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
388 {
389 LLVMValueRef patch0_patch_data_offset =
390 get_tcs_out_patch0_patch_data_offset(ctx);
391 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
392 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
393
394 return ac_build_imad(&ctx->ac, patch_stride, rel_patch_id, patch0_patch_data_offset);
395 }
396
397 static LLVMValueRef get_num_tcs_out_vertices(struct si_shader_context *ctx)
398 {
399 unsigned tcs_out_vertices =
400 ctx->shader->selector ?
401 ctx->shader->selector->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : 0;
402
403 /* If !tcs_out_vertices, it's either the fixed-func TCS or the TCS epilog. */
404 if (ctx->type == PIPE_SHADER_TESS_CTRL && tcs_out_vertices)
405 return LLVMConstInt(ctx->i32, tcs_out_vertices, 0);
406
407 return si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 6, 6);
408 }
409
410 static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
411 {
412 unsigned stride;
413
414 switch (ctx->type) {
415 case PIPE_SHADER_VERTEX:
416 stride = ctx->shader->selector->lshs_vertex_stride / 4;
417 return LLVMConstInt(ctx->i32, stride, 0);
418
419 case PIPE_SHADER_TESS_CTRL:
420 if (ctx->screen->info.chip_class >= GFX9 &&
421 ctx->shader->is_monolithic) {
422 stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
423 return LLVMConstInt(ctx->i32, stride, 0);
424 }
425 return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
426
427 default:
428 assert(0);
429 return NULL;
430 }
431 }
432
433 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
434 * to float. */
435 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
436 LLVMValueRef vec4,
437 unsigned double_index)
438 {
439 LLVMBuilderRef builder = ctx->ac.builder;
440 LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
441 LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
442 LLVMVectorType(f64, 2), "");
443 LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0);
444 LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
445 return LLVMBuildFPTrunc(builder, value, ctx->f32, "");
446 }
447
448 static LLVMValueRef unpack_sint16(struct si_shader_context *ctx,
449 LLVMValueRef i32, unsigned index)
450 {
451 assert(index <= 1);
452
453 if (index == 1)
454 return LLVMBuildAShr(ctx->ac.builder, i32,
455 LLVMConstInt(ctx->i32, 16, 0), "");
456
457 return LLVMBuildSExt(ctx->ac.builder,
458 LLVMBuildTrunc(ctx->ac.builder, i32,
459 ctx->ac.i16, ""),
460 ctx->i32, "");
461 }
462
463 void si_llvm_load_input_vs(
464 struct si_shader_context *ctx,
465 unsigned input_index,
466 LLVMValueRef out[4])
467 {
468 const struct tgsi_shader_info *info = &ctx->shader->selector->info;
469 unsigned vs_blit_property = info->properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
470
471 if (vs_blit_property) {
472 LLVMValueRef vertex_id = ctx->abi.vertex_id;
473 LLVMValueRef sel_x1 = LLVMBuildICmp(ctx->ac.builder,
474 LLVMIntULE, vertex_id,
475 ctx->i32_1, "");
476 /* Use LLVMIntNE, because we have 3 vertices and only
477 * the middle one should use y2.
478 */
479 LLVMValueRef sel_y1 = LLVMBuildICmp(ctx->ac.builder,
480 LLVMIntNE, vertex_id,
481 ctx->i32_1, "");
482
483 if (input_index == 0) {
484 /* Position: */
485 LLVMValueRef x1y1 = LLVMGetParam(ctx->main_fn,
486 ctx->param_vs_blit_inputs);
487 LLVMValueRef x2y2 = LLVMGetParam(ctx->main_fn,
488 ctx->param_vs_blit_inputs + 1);
489
490 LLVMValueRef x1 = unpack_sint16(ctx, x1y1, 0);
491 LLVMValueRef y1 = unpack_sint16(ctx, x1y1, 1);
492 LLVMValueRef x2 = unpack_sint16(ctx, x2y2, 0);
493 LLVMValueRef y2 = unpack_sint16(ctx, x2y2, 1);
494
495 LLVMValueRef x = LLVMBuildSelect(ctx->ac.builder, sel_x1,
496 x1, x2, "");
497 LLVMValueRef y = LLVMBuildSelect(ctx->ac.builder, sel_y1,
498 y1, y2, "");
499
500 out[0] = LLVMBuildSIToFP(ctx->ac.builder, x, ctx->f32, "");
501 out[1] = LLVMBuildSIToFP(ctx->ac.builder, y, ctx->f32, "");
502 out[2] = LLVMGetParam(ctx->main_fn,
503 ctx->param_vs_blit_inputs + 2);
504 out[3] = ctx->ac.f32_1;
505 return;
506 }
507
508 /* Color or texture coordinates: */
509 assert(input_index == 1);
510
511 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
512 for (int i = 0; i < 4; i++) {
513 out[i] = LLVMGetParam(ctx->main_fn,
514 ctx->param_vs_blit_inputs + 3 + i);
515 }
516 } else {
517 assert(vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD);
518 LLVMValueRef x1 = LLVMGetParam(ctx->main_fn,
519 ctx->param_vs_blit_inputs + 3);
520 LLVMValueRef y1 = LLVMGetParam(ctx->main_fn,
521 ctx->param_vs_blit_inputs + 4);
522 LLVMValueRef x2 = LLVMGetParam(ctx->main_fn,
523 ctx->param_vs_blit_inputs + 5);
524 LLVMValueRef y2 = LLVMGetParam(ctx->main_fn,
525 ctx->param_vs_blit_inputs + 6);
526
527 out[0] = LLVMBuildSelect(ctx->ac.builder, sel_x1,
528 x1, x2, "");
529 out[1] = LLVMBuildSelect(ctx->ac.builder, sel_y1,
530 y1, y2, "");
531 out[2] = LLVMGetParam(ctx->main_fn,
532 ctx->param_vs_blit_inputs + 7);
533 out[3] = LLVMGetParam(ctx->main_fn,
534 ctx->param_vs_blit_inputs + 8);
535 }
536 return;
537 }
538
539 unsigned chan;
540 unsigned fix_fetch;
541 unsigned num_fetches;
542 unsigned fetch_stride;
543 unsigned num_channels;
544
545 LLVMValueRef t_list_ptr;
546 LLVMValueRef t_offset;
547 LLVMValueRef t_list;
548 LLVMValueRef vertex_index;
549 LLVMValueRef input[3];
550
551 /* Load the T list */
552 t_list_ptr = LLVMGetParam(ctx->main_fn, ctx->param_vertex_buffers);
553
554 t_offset = LLVMConstInt(ctx->i32, input_index, 0);
555
556 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
557
558 vertex_index = LLVMGetParam(ctx->main_fn,
559 ctx->param_vertex_index0 +
560 input_index);
561
562 fix_fetch = ctx->shader->key.mono.vs_fix_fetch[input_index];
563
564 /* Do multiple loads for special formats. */
565 switch (fix_fetch) {
566 case SI_FIX_FETCH_RGB_64_FLOAT:
567 num_fetches = 3; /* 3 2-dword loads */
568 fetch_stride = 8;
569 num_channels = 2;
570 break;
571 case SI_FIX_FETCH_RGBA_64_FLOAT:
572 num_fetches = 2; /* 2 4-dword loads */
573 fetch_stride = 16;
574 num_channels = 4;
575 break;
576 case SI_FIX_FETCH_RGB_8:
577 case SI_FIX_FETCH_RGB_8_INT:
578 num_fetches = 3;
579 fetch_stride = 1;
580 num_channels = 1;
581 break;
582 case SI_FIX_FETCH_RGB_16:
583 case SI_FIX_FETCH_RGB_16_INT:
584 num_fetches = 3;
585 fetch_stride = 2;
586 num_channels = 1;
587 break;
588 default:
589 num_fetches = 1;
590 fetch_stride = 0;
591 num_channels = util_last_bit(info->input_usage_mask[input_index]);
592 }
593
594 for (unsigned i = 0; i < num_fetches; i++) {
595 LLVMValueRef voffset = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
596
597 input[i] = ac_build_buffer_load_format(&ctx->ac, t_list,
598 vertex_index, voffset,
599 num_channels, false, true);
600 input[i] = ac_build_expand_to_vec4(&ctx->ac, input[i], num_channels);
601 }
602
603 /* Break up the vec4 into individual components */
604 for (chan = 0; chan < 4; chan++) {
605 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, 0);
606 out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
607 input[0], llvm_chan, "");
608 }
609
610 switch (fix_fetch) {
611 case SI_FIX_FETCH_A2_SNORM:
612 case SI_FIX_FETCH_A2_SSCALED:
613 case SI_FIX_FETCH_A2_SINT: {
614 /* The hardware returns an unsigned value; convert it to a
615 * signed one.
616 */
617 LLVMValueRef tmp = out[3];
618 LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
619
620 /* First, recover the sign-extended signed integer value. */
621 if (fix_fetch == SI_FIX_FETCH_A2_SSCALED)
622 tmp = LLVMBuildFPToUI(ctx->ac.builder, tmp, ctx->i32, "");
623 else
624 tmp = ac_to_integer(&ctx->ac, tmp);
625
626 /* For the integer-like cases, do a natural sign extension.
627 *
628 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
629 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
630 * exponent.
631 */
632 tmp = LLVMBuildShl(ctx->ac.builder, tmp,
633 fix_fetch == SI_FIX_FETCH_A2_SNORM ?
634 LLVMConstInt(ctx->i32, 7, 0) : c30, "");
635 tmp = LLVMBuildAShr(ctx->ac.builder, tmp, c30, "");
636
637 /* Convert back to the right type. */
638 if (fix_fetch == SI_FIX_FETCH_A2_SNORM) {
639 LLVMValueRef clamp;
640 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
641 tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
642 clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, tmp, neg_one, "");
643 tmp = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, tmp, "");
644 } else if (fix_fetch == SI_FIX_FETCH_A2_SSCALED) {
645 tmp = LLVMBuildSIToFP(ctx->ac.builder, tmp, ctx->f32, "");
646 }
647
648 out[3] = tmp;
649 break;
650 }
651 case SI_FIX_FETCH_RGBA_32_UNORM:
652 case SI_FIX_FETCH_RGBX_32_UNORM:
653 for (chan = 0; chan < 4; chan++) {
654 out[chan] = ac_to_integer(&ctx->ac, out[chan]);
655 out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
656 out[chan], ctx->f32, "");
657 out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
658 LLVMConstReal(ctx->f32, 1.0 / UINT_MAX), "");
659 }
660 /* RGBX UINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
661 if (fix_fetch == SI_FIX_FETCH_RGBX_32_UNORM)
662 out[3] = LLVMConstReal(ctx->f32, 1);
663 break;
664 case SI_FIX_FETCH_RGBA_32_SNORM:
665 case SI_FIX_FETCH_RGBX_32_SNORM:
666 case SI_FIX_FETCH_RGBA_32_FIXED:
667 case SI_FIX_FETCH_RGBX_32_FIXED: {
668 double scale;
669 if (fix_fetch >= SI_FIX_FETCH_RGBA_32_FIXED)
670 scale = 1.0 / 0x10000;
671 else
672 scale = 1.0 / INT_MAX;
673
674 for (chan = 0; chan < 4; chan++) {
675 out[chan] = ac_to_integer(&ctx->ac, out[chan]);
676 out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
677 out[chan], ctx->f32, "");
678 out[chan] = LLVMBuildFMul(ctx->ac.builder, out[chan],
679 LLVMConstReal(ctx->f32, scale), "");
680 }
681 /* RGBX SINT returns 1 in alpha, which would be rounded to 0 by normalizing. */
682 if (fix_fetch == SI_FIX_FETCH_RGBX_32_SNORM ||
683 fix_fetch == SI_FIX_FETCH_RGBX_32_FIXED)
684 out[3] = LLVMConstReal(ctx->f32, 1);
685 break;
686 }
687 case SI_FIX_FETCH_RGBA_32_USCALED:
688 for (chan = 0; chan < 4; chan++) {
689 out[chan] = ac_to_integer(&ctx->ac, out[chan]);
690 out[chan] = LLVMBuildUIToFP(ctx->ac.builder,
691 out[chan], ctx->f32, "");
692 }
693 break;
694 case SI_FIX_FETCH_RGBA_32_SSCALED:
695 for (chan = 0; chan < 4; chan++) {
696 out[chan] = ac_to_integer(&ctx->ac, out[chan]);
697 out[chan] = LLVMBuildSIToFP(ctx->ac.builder,
698 out[chan], ctx->f32, "");
699 }
700 break;
701 case SI_FIX_FETCH_RG_64_FLOAT:
702 for (chan = 0; chan < 2; chan++)
703 out[chan] = extract_double_to_float(ctx, input[0], chan);
704
705 out[2] = LLVMConstReal(ctx->f32, 0);
706 out[3] = LLVMConstReal(ctx->f32, 1);
707 break;
708 case SI_FIX_FETCH_RGB_64_FLOAT:
709 for (chan = 0; chan < 3; chan++)
710 out[chan] = extract_double_to_float(ctx, input[chan], 0);
711
712 out[3] = LLVMConstReal(ctx->f32, 1);
713 break;
714 case SI_FIX_FETCH_RGBA_64_FLOAT:
715 for (chan = 0; chan < 4; chan++) {
716 out[chan] = extract_double_to_float(ctx, input[chan / 2],
717 chan % 2);
718 }
719 break;
720 case SI_FIX_FETCH_RGB_8:
721 case SI_FIX_FETCH_RGB_8_INT:
722 case SI_FIX_FETCH_RGB_16:
723 case SI_FIX_FETCH_RGB_16_INT:
724 for (chan = 0; chan < 3; chan++) {
725 out[chan] = LLVMBuildExtractElement(ctx->ac.builder,
726 input[chan],
727 ctx->i32_0, "");
728 }
729 if (fix_fetch == SI_FIX_FETCH_RGB_8 ||
730 fix_fetch == SI_FIX_FETCH_RGB_16) {
731 out[3] = LLVMConstReal(ctx->f32, 1);
732 } else {
733 out[3] = ac_to_float(&ctx->ac, ctx->i32_1);
734 }
735 break;
736 }
737 }
738
739 static void declare_input_vs(
740 struct si_shader_context *ctx,
741 unsigned input_index,
742 const struct tgsi_full_declaration *decl,
743 LLVMValueRef out[4])
744 {
745 si_llvm_load_input_vs(ctx, input_index, out);
746 }
747
748 static LLVMValueRef get_primitive_id(struct si_shader_context *ctx,
749 unsigned swizzle)
750 {
751 if (swizzle > 0)
752 return ctx->i32_0;
753
754 switch (ctx->type) {
755 case PIPE_SHADER_VERTEX:
756 return LLVMGetParam(ctx->main_fn,
757 ctx->param_vs_prim_id);
758 case PIPE_SHADER_TESS_CTRL:
759 return ctx->abi.tcs_patch_id;
760 case PIPE_SHADER_TESS_EVAL:
761 return ctx->abi.tes_patch_id;
762 case PIPE_SHADER_GEOMETRY:
763 return ctx->abi.gs_prim_id;
764 default:
765 assert(0);
766 return ctx->i32_0;
767 }
768 }
769
770 /**
771 * Return the value of tgsi_ind_register for indexing.
772 * This is the indirect index with the constant offset added to it.
773 */
774 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
775 const struct tgsi_ind_register *ind,
776 unsigned addr_mul,
777 int rel_index)
778 {
779 LLVMValueRef result;
780
781 if (ind->File == TGSI_FILE_ADDRESS) {
782 result = ctx->addrs[ind->Index][ind->Swizzle];
783 result = LLVMBuildLoad(ctx->ac.builder, result, "");
784 } else {
785 struct tgsi_full_src_register src = {};
786
787 src.Register.File = ind->File;
788 src.Register.Index = ind->Index;
789
790 /* Set the second index to 0 for constants. */
791 if (ind->File == TGSI_FILE_CONSTANT)
792 src.Register.Dimension = 1;
793
794 result = ctx->bld_base.emit_fetch_funcs[ind->File](&ctx->bld_base, &src,
795 TGSI_TYPE_SIGNED,
796 ind->Swizzle);
797 result = ac_to_integer(&ctx->ac, result);
798 }
799
800 return ac_build_imad(&ctx->ac, result, LLVMConstInt(ctx->i32, addr_mul, 0),
801 LLVMConstInt(ctx->i32, rel_index, 0));
802 }
803
804 /**
805 * Like si_get_indirect_index, but restricts the return value to a (possibly
806 * undefined) value inside [0..num).
807 */
808 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
809 const struct tgsi_ind_register *ind,
810 int rel_index, unsigned num)
811 {
812 LLVMValueRef result = si_get_indirect_index(ctx, ind, 1, rel_index);
813
814 return si_llvm_bound_index(ctx, result, num);
815 }
816
817 static LLVMValueRef get_dw_address_from_generic_indices(struct si_shader_context *ctx,
818 LLVMValueRef vertex_dw_stride,
819 LLVMValueRef base_addr,
820 LLVMValueRef vertex_index,
821 LLVMValueRef param_index,
822 unsigned input_index,
823 ubyte *name,
824 ubyte *index,
825 bool is_patch)
826 {
827 if (vertex_dw_stride) {
828 base_addr = ac_build_imad(&ctx->ac, vertex_index,
829 vertex_dw_stride, base_addr);
830 }
831
832 if (param_index) {
833 base_addr = ac_build_imad(&ctx->ac, param_index,
834 LLVMConstInt(ctx->i32, 4, 0), base_addr);
835 }
836
837 int param = is_patch ?
838 si_shader_io_get_unique_index_patch(name[input_index],
839 index[input_index]) :
840 si_shader_io_get_unique_index(name[input_index],
841 index[input_index], false);
842
843 /* Add the base address of the element. */
844 return LLVMBuildAdd(ctx->ac.builder, base_addr,
845 LLVMConstInt(ctx->i32, param * 4, 0), "");
846 }
847
848 /**
849 * Calculate a dword address given an input or output register and a stride.
850 */
851 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
852 const struct tgsi_full_dst_register *dst,
853 const struct tgsi_full_src_register *src,
854 LLVMValueRef vertex_dw_stride,
855 LLVMValueRef base_addr)
856 {
857 struct tgsi_shader_info *info = &ctx->shader->selector->info;
858 ubyte *name, *index, *array_first;
859 int input_index;
860 struct tgsi_full_dst_register reg;
861 LLVMValueRef vertex_index = NULL;
862 LLVMValueRef ind_index = NULL;
863
864 /* Set the register description. The address computation is the same
865 * for sources and destinations. */
866 if (src) {
867 reg.Register.File = src->Register.File;
868 reg.Register.Index = src->Register.Index;
869 reg.Register.Indirect = src->Register.Indirect;
870 reg.Register.Dimension = src->Register.Dimension;
871 reg.Indirect = src->Indirect;
872 reg.Dimension = src->Dimension;
873 reg.DimIndirect = src->DimIndirect;
874 } else
875 reg = *dst;
876
877 /* If the register is 2-dimensional (e.g. an array of vertices
878 * in a primitive), calculate the base address of the vertex. */
879 if (reg.Register.Dimension) {
880 if (reg.Dimension.Indirect)
881 vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
882 1, reg.Dimension.Index);
883 else
884 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
885 }
886
887 /* Get information about the register. */
888 if (reg.Register.File == TGSI_FILE_INPUT) {
889 name = info->input_semantic_name;
890 index = info->input_semantic_index;
891 array_first = info->input_array_first;
892 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
893 name = info->output_semantic_name;
894 index = info->output_semantic_index;
895 array_first = info->output_array_first;
896 } else {
897 assert(0);
898 return NULL;
899 }
900
901 if (reg.Register.Indirect) {
902 /* Add the relative address of the element. */
903 if (reg.Indirect.ArrayID)
904 input_index = array_first[reg.Indirect.ArrayID];
905 else
906 input_index = reg.Register.Index;
907
908 ind_index = si_get_indirect_index(ctx, &reg.Indirect,
909 1, reg.Register.Index - input_index);
910 } else {
911 input_index = reg.Register.Index;
912 }
913
914 return get_dw_address_from_generic_indices(ctx, vertex_dw_stride,
915 base_addr, vertex_index,
916 ind_index, input_index,
917 name, index,
918 !reg.Register.Dimension);
919 }
920
921 /* The offchip buffer layout for TCS->TES is
922 *
923 * - attribute 0 of patch 0 vertex 0
924 * - attribute 0 of patch 0 vertex 1
925 * - attribute 0 of patch 0 vertex 2
926 * ...
927 * - attribute 0 of patch 1 vertex 0
928 * - attribute 0 of patch 1 vertex 1
929 * ...
930 * - attribute 1 of patch 0 vertex 0
931 * - attribute 1 of patch 0 vertex 1
932 * ...
933 * - per patch attribute 0 of patch 0
934 * - per patch attribute 0 of patch 1
935 * ...
936 *
937 * Note that every attribute has 4 components.
938 */
939 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
940 LLVMValueRef rel_patch_id,
941 LLVMValueRef vertex_index,
942 LLVMValueRef param_index)
943 {
944 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
945 LLVMValueRef param_stride, constant16;
946
947 vertices_per_patch = get_num_tcs_out_vertices(ctx);
948 num_patches = si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 0, 6);
949 total_vertices = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
950 num_patches, "");
951
952 constant16 = LLVMConstInt(ctx->i32, 16, 0);
953 if (vertex_index) {
954 base_addr = ac_build_imad(&ctx->ac, rel_patch_id,
955 vertices_per_patch, vertex_index);
956 param_stride = total_vertices;
957 } else {
958 base_addr = rel_patch_id;
959 param_stride = num_patches;
960 }
961
962 base_addr = ac_build_imad(&ctx->ac, param_index, param_stride, base_addr);
963 base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
964
965 if (!vertex_index) {
966 LLVMValueRef patch_data_offset =
967 si_unpack_param(ctx, ctx->param_tcs_offchip_layout, 12, 20);
968
969 base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
970 patch_data_offset, "");
971 }
972 return base_addr;
973 }
974
975 /* This is a generic helper that can be shared by the NIR and TGSI backends */
976 static LLVMValueRef get_tcs_tes_buffer_address_from_generic_indices(
977 struct si_shader_context *ctx,
978 LLVMValueRef vertex_index,
979 LLVMValueRef param_index,
980 unsigned param_base,
981 ubyte *name,
982 ubyte *index,
983 bool is_patch)
984 {
985 unsigned param_index_base;
986
987 param_index_base = is_patch ?
988 si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) :
989 si_shader_io_get_unique_index(name[param_base], index[param_base], false);
990
991 if (param_index) {
992 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
993 LLVMConstInt(ctx->i32, param_index_base, 0),
994 "");
995 } else {
996 param_index = LLVMConstInt(ctx->i32, param_index_base, 0);
997 }
998
999 return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx),
1000 vertex_index, param_index);
1001 }
1002
1003 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
1004 struct si_shader_context *ctx,
1005 const struct tgsi_full_dst_register *dst,
1006 const struct tgsi_full_src_register *src)
1007 {
1008 struct tgsi_shader_info *info = &ctx->shader->selector->info;
1009 ubyte *name, *index, *array_first;
1010 struct tgsi_full_src_register reg;
1011 LLVMValueRef vertex_index = NULL;
1012 LLVMValueRef param_index = NULL;
1013 unsigned param_base;
1014
1015 reg = src ? *src : tgsi_full_src_register_from_dst(dst);
1016
1017 if (reg.Register.Dimension) {
1018
1019 if (reg.Dimension.Indirect)
1020 vertex_index = si_get_indirect_index(ctx, &reg.DimIndirect,
1021 1, reg.Dimension.Index);
1022 else
1023 vertex_index = LLVMConstInt(ctx->i32, reg.Dimension.Index, 0);
1024 }
1025
1026 /* Get information about the register. */
1027 if (reg.Register.File == TGSI_FILE_INPUT) {
1028 name = info->input_semantic_name;
1029 index = info->input_semantic_index;
1030 array_first = info->input_array_first;
1031 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1032 name = info->output_semantic_name;
1033 index = info->output_semantic_index;
1034 array_first = info->output_array_first;
1035 } else {
1036 assert(0);
1037 return NULL;
1038 }
1039
1040 if (reg.Register.Indirect) {
1041 if (reg.Indirect.ArrayID)
1042 param_base = array_first[reg.Indirect.ArrayID];
1043 else
1044 param_base = reg.Register.Index;
1045
1046 param_index = si_get_indirect_index(ctx, &reg.Indirect,
1047 1, reg.Register.Index - param_base);
1048
1049 } else {
1050 param_base = reg.Register.Index;
1051 }
1052
1053 return get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1054 param_index, param_base,
1055 name, index, !reg.Register.Dimension);
1056 }
1057
1058 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
1059 LLVMTypeRef type, unsigned swizzle,
1060 LLVMValueRef buffer, LLVMValueRef offset,
1061 LLVMValueRef base, bool can_speculate)
1062 {
1063 struct si_shader_context *ctx = si_shader_context(bld_base);
1064 LLVMValueRef value, value2;
1065 LLVMTypeRef vec_type = LLVMVectorType(type, 4);
1066
1067 if (swizzle == ~0) {
1068 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1069 0, 1, 0, can_speculate, false);
1070
1071 return LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1072 }
1073
1074 if (!llvm_type_is_64bit(ctx, type)) {
1075 value = ac_build_buffer_load(&ctx->ac, buffer, 4, NULL, base, offset,
1076 0, 1, 0, can_speculate, false);
1077
1078 value = LLVMBuildBitCast(ctx->ac.builder, value, vec_type, "");
1079 return LLVMBuildExtractElement(ctx->ac.builder, value,
1080 LLVMConstInt(ctx->i32, swizzle, 0), "");
1081 }
1082
1083 value = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1084 swizzle * 4, 1, 0, can_speculate, false);
1085
1086 value2 = ac_build_buffer_load(&ctx->ac, buffer, 1, NULL, base, offset,
1087 swizzle * 4 + 4, 1, 0, can_speculate, false);
1088
1089 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1090 }
1091
1092 /**
1093 * Load from LDS.
1094 *
1095 * \param type output value type
1096 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
1097 * \param dw_addr address in dwords
1098 */
1099 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
1100 LLVMTypeRef type, unsigned swizzle,
1101 LLVMValueRef dw_addr)
1102 {
1103 struct si_shader_context *ctx = si_shader_context(bld_base);
1104 LLVMValueRef value;
1105
1106 if (swizzle == ~0) {
1107 LLVMValueRef values[TGSI_NUM_CHANNELS];
1108
1109 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1110 values[chan] = lds_load(bld_base, type, chan, dw_addr);
1111
1112 return ac_build_gather_values(&ctx->ac, values,
1113 TGSI_NUM_CHANNELS);
1114 }
1115
1116 /* Split 64-bit loads. */
1117 if (llvm_type_is_64bit(ctx, type)) {
1118 LLVMValueRef lo, hi;
1119
1120 lo = lds_load(bld_base, ctx->i32, swizzle, dw_addr);
1121 hi = lds_load(bld_base, ctx->i32, swizzle + 1, dw_addr);
1122 return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
1123 }
1124
1125 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1126 LLVMConstInt(ctx->i32, swizzle, 0), "");
1127
1128 value = ac_lds_load(&ctx->ac, dw_addr);
1129
1130 return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1131 }
1132
1133 /**
1134 * Store to LDS.
1135 *
1136 * \param swizzle offset (typically 0..3)
1137 * \param dw_addr address in dwords
1138 * \param value value to store
1139 */
1140 static void lds_store(struct si_shader_context *ctx,
1141 unsigned dw_offset_imm, LLVMValueRef dw_addr,
1142 LLVMValueRef value)
1143 {
1144 dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
1145 LLVMConstInt(ctx->i32, dw_offset_imm, 0), "");
1146
1147 ac_lds_store(&ctx->ac, dw_addr, value);
1148 }
1149
/* Selects which tessellation ring get_tess_ring_descriptor() describes. */
enum si_tess_ring {
	/* Tess factor ring, addressed from the TCS. */
	TCS_FACTOR_RING = 0,
	/* Off-chip ring as addressed from the TCS. */
	TESS_OFFCHIP_RING_TCS = 1,
	/* Off-chip ring as addressed from the TES. */
	TESS_OFFCHIP_RING_TES = 2,
};
1155
1156 static LLVMValueRef get_tess_ring_descriptor(struct si_shader_context *ctx,
1157 enum si_tess_ring ring)
1158 {
1159 LLVMBuilderRef builder = ctx->ac.builder;
1160 unsigned param = ring == TESS_OFFCHIP_RING_TES ? ctx->param_tes_offchip_addr :
1161 ctx->param_tcs_out_lds_layout;
1162 LLVMValueRef addr = LLVMGetParam(ctx->main_fn, param);
1163
1164 /* TCS only receives high 13 bits of the address. */
1165 if (ring == TESS_OFFCHIP_RING_TCS || ring == TCS_FACTOR_RING) {
1166 addr = LLVMBuildAnd(builder, addr,
1167 LLVMConstInt(ctx->i32, 0xfff80000, 0), "");
1168 }
1169
1170 if (ring == TCS_FACTOR_RING) {
1171 unsigned tf_offset = ctx->screen->tess_offchip_ring_size;
1172 addr = LLVMBuildAdd(builder, addr,
1173 LLVMConstInt(ctx->i32, tf_offset, 0), "");
1174 }
1175
1176 LLVMValueRef desc[4];
1177 desc[0] = addr;
1178 desc[1] = LLVMConstInt(ctx->i32,
1179 S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
1180 desc[2] = LLVMConstInt(ctx->i32, 0xffffffff, 0);
1181 desc[3] = LLVMConstInt(ctx->i32,
1182 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
1183 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1184 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
1185 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
1186 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
1187 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0);
1188
1189 return ac_build_gather_values(&ctx->ac, desc, 4);
1190 }
1191
1192 static LLVMValueRef fetch_input_tcs(
1193 struct lp_build_tgsi_context *bld_base,
1194 const struct tgsi_full_src_register *reg,
1195 enum tgsi_opcode_type type, unsigned swizzle_in)
1196 {
1197 struct si_shader_context *ctx = si_shader_context(bld_base);
1198 LLVMValueRef dw_addr, stride;
1199 unsigned swizzle = swizzle_in & 0xffff;
1200 stride = get_tcs_in_vertex_dw_stride(ctx);
1201 dw_addr = get_tcs_in_current_patch_offset(ctx);
1202 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1203
1204 return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1205 }
1206
1207 static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi,
1208 LLVMTypeRef type,
1209 LLVMValueRef vertex_index,
1210 LLVMValueRef param_index,
1211 unsigned const_index,
1212 unsigned location,
1213 unsigned driver_location,
1214 unsigned component,
1215 unsigned num_components,
1216 bool is_patch,
1217 bool is_compact,
1218 bool load_input)
1219 {
1220 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1221 struct tgsi_shader_info *info = &ctx->shader->selector->info;
1222 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1223 LLVMValueRef dw_addr, stride;
1224
1225 driver_location = driver_location / 4;
1226
1227 if (load_input) {
1228 stride = get_tcs_in_vertex_dw_stride(ctx);
1229 dw_addr = get_tcs_in_current_patch_offset(ctx);
1230 } else {
1231 if (is_patch) {
1232 stride = NULL;
1233 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1234 } else {
1235 stride = get_tcs_out_vertex_dw_stride(ctx);
1236 dw_addr = get_tcs_out_current_patch_offset(ctx);
1237 }
1238 }
1239
1240 if (param_index) {
1241 /* Add the constant index to the indirect index */
1242 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1243 LLVMConstInt(ctx->i32, const_index, 0), "");
1244 } else {
1245 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1246 }
1247
1248 ubyte *names;
1249 ubyte *indices;
1250 if (load_input) {
1251 names = info->input_semantic_name;
1252 indices = info->input_semantic_index;
1253 } else {
1254 names = info->output_semantic_name;
1255 indices = info->output_semantic_index;
1256 }
1257
1258 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1259 vertex_index, param_index,
1260 driver_location,
1261 names, indices,
1262 is_patch);
1263
1264 LLVMValueRef value[4];
1265 for (unsigned i = 0; i < num_components; i++) {
1266 unsigned offset = i;
1267 if (llvm_type_is_64bit(ctx, type))
1268 offset *= 2;
1269
1270 offset += component;
1271 value[i + component] = lds_load(bld_base, type, offset, dw_addr);
1272 }
1273
1274 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1275 }
1276
1277 static LLVMValueRef fetch_output_tcs(
1278 struct lp_build_tgsi_context *bld_base,
1279 const struct tgsi_full_src_register *reg,
1280 enum tgsi_opcode_type type, unsigned swizzle_in)
1281 {
1282 struct si_shader_context *ctx = si_shader_context(bld_base);
1283 LLVMValueRef dw_addr, stride;
1284 unsigned swizzle = (swizzle_in & 0xffff);
1285
1286 if (reg->Register.Dimension) {
1287 stride = get_tcs_out_vertex_dw_stride(ctx);
1288 dw_addr = get_tcs_out_current_patch_offset(ctx);
1289 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1290 } else {
1291 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1292 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1293 }
1294
1295 return lds_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle, dw_addr);
1296 }
1297
1298 static LLVMValueRef fetch_input_tes(
1299 struct lp_build_tgsi_context *bld_base,
1300 const struct tgsi_full_src_register *reg,
1301 enum tgsi_opcode_type type, unsigned swizzle_in)
1302 {
1303 struct si_shader_context *ctx = si_shader_context(bld_base);
1304 LLVMValueRef base, addr;
1305 unsigned swizzle = (swizzle_in & 0xffff);
1306
1307 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1308 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1309
1310 return buffer_load(bld_base, tgsi2llvmtype(bld_base, type), swizzle,
1311 ctx->tess_offchip_ring, base, addr, true);
1312 }
1313
1314 LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi,
1315 LLVMTypeRef type,
1316 LLVMValueRef vertex_index,
1317 LLVMValueRef param_index,
1318 unsigned const_index,
1319 unsigned location,
1320 unsigned driver_location,
1321 unsigned component,
1322 unsigned num_components,
1323 bool is_patch,
1324 bool is_compact,
1325 bool load_input)
1326 {
1327 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1328 struct tgsi_shader_info *info = &ctx->shader->selector->info;
1329 LLVMValueRef base, addr;
1330
1331 driver_location = driver_location / 4;
1332
1333 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1334
1335 if (param_index) {
1336 /* Add the constant index to the indirect index */
1337 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1338 LLVMConstInt(ctx->i32, const_index, 0), "");
1339 } else {
1340 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1341 }
1342
1343 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1344 param_index, driver_location,
1345 info->input_semantic_name,
1346 info->input_semantic_index,
1347 is_patch);
1348
1349 /* TODO: This will generate rather ordinary llvm code, although it
1350 * should be easy for the optimiser to fix up. In future we might want
1351 * to refactor buffer_load(), but for now this maximises code sharing
1352 * between the NIR and TGSI backends.
1353 */
1354 LLVMValueRef value[4];
1355 for (unsigned i = 0; i < num_components; i++) {
1356 unsigned offset = i;
1357 if (llvm_type_is_64bit(ctx, type))
1358 offset *= 2;
1359
1360 offset += component;
1361 value[i + component] = buffer_load(&ctx->bld_base, type, offset,
1362 ctx->tess_offchip_ring, base, addr, true);
1363 }
1364
1365 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1366 }
1367
1368 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1369 const struct tgsi_full_instruction *inst,
1370 const struct tgsi_opcode_info *info,
1371 unsigned index,
1372 LLVMValueRef dst[4])
1373 {
1374 struct si_shader_context *ctx = si_shader_context(bld_base);
1375 const struct tgsi_full_dst_register *reg = &inst->Dst[index];
1376 const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info;
1377 unsigned chan_index;
1378 LLVMValueRef dw_addr, stride;
1379 LLVMValueRef buffer, base, buf_addr;
1380 LLVMValueRef values[4];
1381 bool skip_lds_store;
1382 bool is_tess_factor = false, is_tess_inner = false;
1383
1384 /* Only handle per-patch and per-vertex outputs here.
1385 * Vectors will be lowered to scalars and this function will be called again.
1386 */
1387 if (reg->Register.File != TGSI_FILE_OUTPUT ||
1388 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1389 si_llvm_emit_store(bld_base, inst, info, index, dst);
1390 return;
1391 }
1392
1393 if (reg->Register.Dimension) {
1394 stride = get_tcs_out_vertex_dw_stride(ctx);
1395 dw_addr = get_tcs_out_current_patch_offset(ctx);
1396 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1397 skip_lds_store = !sh_info->reads_pervertex_outputs;
1398 } else {
1399 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1400 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1401 skip_lds_store = !sh_info->reads_perpatch_outputs;
1402
1403 if (!reg->Register.Indirect) {
1404 int name = sh_info->output_semantic_name[reg->Register.Index];
1405
1406 /* Always write tess factors into LDS for the TCS epilog. */
1407 if (name == TGSI_SEMANTIC_TESSINNER ||
1408 name == TGSI_SEMANTIC_TESSOUTER) {
1409 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1410 skip_lds_store = !sh_info->reads_tessfactor_outputs &&
1411 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1412 is_tess_factor = true;
1413 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1414 }
1415 }
1416 }
1417
1418 buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1419
1420 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1421 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1422
1423 uint32_t writemask = reg->Register.WriteMask;
1424 while (writemask) {
1425 chan_index = u_bit_scan(&writemask);
1426 LLVMValueRef value = dst[chan_index];
1427
1428 if (inst->Instruction.Saturate)
1429 value = ac_build_clamp(&ctx->ac, value);
1430
1431 /* Skip LDS stores if there is no LDS read of this output. */
1432 if (!skip_lds_store)
1433 lds_store(ctx, chan_index, dw_addr, value);
1434
1435 value = ac_to_integer(&ctx->ac, value);
1436 values[chan_index] = value;
1437
1438 if (reg->Register.WriteMask != 0xF && !is_tess_factor) {
1439 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1440 buf_addr, base,
1441 4 * chan_index, 1, 0, true, false);
1442 }
1443
1444 /* Write tess factors into VGPRs for the epilog. */
1445 if (is_tess_factor &&
1446 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1447 if (!is_tess_inner) {
1448 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1449 ctx->invoc0_tess_factors[chan_index]);
1450 } else if (chan_index < 2) {
1451 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1452 ctx->invoc0_tess_factors[4 + chan_index]);
1453 }
1454 }
1455 }
1456
1457 if (reg->Register.WriteMask == 0xF && !is_tess_factor) {
1458 LLVMValueRef value = ac_build_gather_values(&ctx->ac,
1459 values, 4);
1460 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
1461 base, 0, 1, 0, true, false);
1462 }
1463 }
1464
1465 static void si_nir_store_output_tcs(struct ac_shader_abi *abi,
1466 const struct nir_variable *var,
1467 LLVMValueRef vertex_index,
1468 LLVMValueRef param_index,
1469 unsigned const_index,
1470 LLVMValueRef src,
1471 unsigned writemask)
1472 {
1473 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1474 struct tgsi_shader_info *info = &ctx->shader->selector->info;
1475 const unsigned component = var->data.location_frac;
1476 const bool is_patch = var->data.patch;
1477 unsigned driver_location = var->data.driver_location;
1478 LLVMValueRef dw_addr, stride;
1479 LLVMValueRef buffer, base, addr;
1480 LLVMValueRef values[4];
1481 bool skip_lds_store;
1482 bool is_tess_factor = false, is_tess_inner = false;
1483
1484 driver_location = driver_location / 4;
1485
1486 if (param_index) {
1487 /* Add the constant index to the indirect index */
1488 param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
1489 LLVMConstInt(ctx->i32, const_index, 0), "");
1490 } else {
1491 if (const_index != 0)
1492 param_index = LLVMConstInt(ctx->i32, const_index, 0);
1493 }
1494
1495 if (!is_patch) {
1496 stride = get_tcs_out_vertex_dw_stride(ctx);
1497 dw_addr = get_tcs_out_current_patch_offset(ctx);
1498 dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr,
1499 vertex_index, param_index,
1500 driver_location,
1501 info->output_semantic_name,
1502 info->output_semantic_index,
1503 is_patch);
1504
1505 skip_lds_store = !info->reads_pervertex_outputs;
1506 } else {
1507 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1508 dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr,
1509 vertex_index, param_index,
1510 driver_location,
1511 info->output_semantic_name,
1512 info->output_semantic_index,
1513 is_patch);
1514
1515 skip_lds_store = !info->reads_perpatch_outputs;
1516
1517 if (!param_index) {
1518 int name = info->output_semantic_name[driver_location];
1519
1520 /* Always write tess factors into LDS for the TCS epilog. */
1521 if (name == TGSI_SEMANTIC_TESSINNER ||
1522 name == TGSI_SEMANTIC_TESSOUTER) {
1523 /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
1524 skip_lds_store = !info->reads_tessfactor_outputs &&
1525 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
1526 is_tess_factor = true;
1527 is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
1528 }
1529 }
1530 }
1531
1532 buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
1533
1534 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
1535
1536 addr = get_tcs_tes_buffer_address_from_generic_indices(ctx, vertex_index,
1537 param_index, driver_location,
1538 info->output_semantic_name,
1539 info->output_semantic_index,
1540 is_patch);
1541
1542 for (unsigned chan = 0; chan < 4; chan++) {
1543 if (!(writemask & (1 << chan)))
1544 continue;
1545 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
1546
1547 /* Skip LDS stores if there is no LDS read of this output. */
1548 if (!skip_lds_store)
1549 lds_store(ctx, chan, dw_addr, value);
1550
1551 value = ac_to_integer(&ctx->ac, value);
1552 values[chan] = value;
1553
1554 if (writemask != 0xF && !is_tess_factor) {
1555 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 1,
1556 addr, base,
1557 4 * chan, 1, 0, true, false);
1558 }
1559
1560 /* Write tess factors into VGPRs for the epilog. */
1561 if (is_tess_factor &&
1562 ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
1563 if (!is_tess_inner) {
1564 LLVMBuildStore(ctx->ac.builder, value, /* outer */
1565 ctx->invoc0_tess_factors[chan]);
1566 } else if (chan < 2) {
1567 LLVMBuildStore(ctx->ac.builder, value, /* inner */
1568 ctx->invoc0_tess_factors[4 + chan]);
1569 }
1570 }
1571 }
1572
1573 if (writemask == 0xF && !is_tess_factor) {
1574 LLVMValueRef value = ac_build_gather_values(&ctx->ac,
1575 values, 4);
1576 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
1577 base, 0, 1, 0, true, false);
1578 }
1579 }
1580
1581 LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi,
1582 unsigned input_index,
1583 unsigned vtx_offset_param,
1584 LLVMTypeRef type,
1585 unsigned swizzle)
1586 {
1587 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1588 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
1589 struct si_shader *shader = ctx->shader;
1590 LLVMValueRef vtx_offset, soffset;
1591 struct tgsi_shader_info *info = &shader->selector->info;
1592 unsigned semantic_name = info->input_semantic_name[input_index];
1593 unsigned semantic_index = info->input_semantic_index[input_index];
1594 unsigned param;
1595 LLVMValueRef value;
1596
1597 param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
1598
1599 /* GFX9 has the ESGS ring in LDS. */
1600 if (ctx->screen->info.chip_class >= GFX9) {
1601 unsigned index = vtx_offset_param;
1602
1603 switch (index / 2) {
1604 case 0:
1605 vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx01_offset,
1606 index % 2 ? 16 : 0, 16);
1607 break;
1608 case 1:
1609 vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx23_offset,
1610 index % 2 ? 16 : 0, 16);
1611 break;
1612 case 2:
1613 vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset,
1614 index % 2 ? 16 : 0, 16);
1615 break;
1616 default:
1617 assert(0);
1618 return NULL;
1619 }
1620
1621 vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset,
1622 LLVMConstInt(ctx->i32, param * 4, 0), "");
1623 return lds_load(bld_base, type, swizzle, vtx_offset);
1624 }
1625
1626 /* GFX6: input load from the ESGS ring in memory. */
1627 if (swizzle == ~0) {
1628 LLVMValueRef values[TGSI_NUM_CHANNELS];
1629 unsigned chan;
1630 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1631 values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
1632 type, chan);
1633 }
1634 return ac_build_gather_values(&ctx->ac, values,
1635 TGSI_NUM_CHANNELS);
1636 }
1637
1638 /* Get the vertex offset parameter on GFX6. */
1639 LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param];
1640
1641 vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
1642 LLVMConstInt(ctx->i32, 4, 0), "");
1643
1644 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
1645
1646 value = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1, ctx->i32_0,
1647 vtx_offset, soffset, 0, 1, 0, true, false);
1648 if (llvm_type_is_64bit(ctx, type)) {
1649 LLVMValueRef value2;
1650 soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle + 1) * 256, 0);
1651
1652 value2 = ac_build_buffer_load(&ctx->ac, ctx->esgs_ring, 1,
1653 ctx->i32_0, vtx_offset, soffset,
1654 0, 1, 0, true, false);
1655 return si_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1656 }
1657 return LLVMBuildBitCast(ctx->ac.builder, value, type, "");
1658 }
1659
1660 static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
1661 unsigned location,
1662 unsigned driver_location,
1663 unsigned component,
1664 unsigned num_components,
1665 unsigned vertex_index,
1666 unsigned const_index,
1667 LLVMTypeRef type)
1668 {
1669 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1670
1671 LLVMValueRef value[4];
1672 for (unsigned i = 0; i < num_components; i++) {
1673 unsigned offset = i;
1674 if (llvm_type_is_64bit(ctx, type))
1675 offset *= 2;
1676
1677 offset += component;
1678 value[i + component] = si_llvm_load_input_gs(&ctx->abi, driver_location / 4,
1679 vertex_index, type, offset);
1680 }
1681
1682 return ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
1683 }
1684
1685 static LLVMValueRef fetch_input_gs(
1686 struct lp_build_tgsi_context *bld_base,
1687 const struct tgsi_full_src_register *reg,
1688 enum tgsi_opcode_type type,
1689 unsigned swizzle_in)
1690 {
1691 struct si_shader_context *ctx = si_shader_context(bld_base);
1692 struct tgsi_shader_info *info = &ctx->shader->selector->info;
1693 unsigned swizzle = swizzle_in & 0xffff;
1694
1695 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1696 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1697 return get_primitive_id(ctx, swizzle);
1698
1699 if (!reg->Register.Dimension)
1700 return NULL;
1701
1702 return si_llvm_load_input_gs(&ctx->abi, reg->Register.Index,
1703 reg->Dimension.Index,
1704 tgsi2llvmtype(bld_base, type),
1705 swizzle);
1706 }
1707
1708 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1709 {
1710 switch (interpolate) {
1711 case TGSI_INTERPOLATE_CONSTANT:
1712 return 0;
1713
1714 case TGSI_INTERPOLATE_LINEAR:
1715 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1716 return SI_PARAM_LINEAR_SAMPLE;
1717 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1718 return SI_PARAM_LINEAR_CENTROID;
1719 else
1720 return SI_PARAM_LINEAR_CENTER;
1721 break;
1722 case TGSI_INTERPOLATE_COLOR:
1723 case TGSI_INTERPOLATE_PERSPECTIVE:
1724 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1725 return SI_PARAM_PERSP_SAMPLE;
1726 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1727 return SI_PARAM_PERSP_CENTROID;
1728 else
1729 return SI_PARAM_PERSP_CENTER;
1730 break;
1731 default:
1732 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1733 return -1;
1734 }
1735 }
1736
1737 static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx,
1738 unsigned attr_index, unsigned chan,
1739 LLVMValueRef prim_mask,
1740 LLVMValueRef i, LLVMValueRef j)
1741 {
1742 if (i || j) {
1743 return ac_build_fs_interp(&ctx->ac,
1744 LLVMConstInt(ctx->i32, chan, 0),
1745 LLVMConstInt(ctx->i32, attr_index, 0),
1746 prim_mask, i, j);
1747 }
1748 return ac_build_fs_interp_mov(&ctx->ac,
1749 LLVMConstInt(ctx->i32, 2, 0), /* P0 */
1750 LLVMConstInt(ctx->i32, chan, 0),
1751 LLVMConstInt(ctx->i32, attr_index, 0),
1752 prim_mask);
1753 }
1754
1755 /**
1756 * Interpolate a fragment shader input.
1757 *
1758 * @param ctx context
1759 * @param input_index index of the input in hardware
1760 * @param semantic_name TGSI_SEMANTIC_*
1761 * @param semantic_index semantic index
1762 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1763 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1764 * @param interp_param interpolation weights (i,j)
1765 * @param prim_mask SI_PARAM_PRIM_MASK
1766 * @param face SI_PARAM_FRONT_FACE
1767 * @param result the return value (4 components)
1768 */
1769 static void interp_fs_input(struct si_shader_context *ctx,
1770 unsigned input_index,
1771 unsigned semantic_name,
1772 unsigned semantic_index,
1773 unsigned num_interp_inputs,
1774 unsigned colors_read_mask,
1775 LLVMValueRef interp_param,
1776 LLVMValueRef prim_mask,
1777 LLVMValueRef face,
1778 LLVMValueRef result[4])
1779 {
1780 LLVMValueRef i = NULL, j = NULL;
1781 unsigned chan;
1782
1783 /* fs.constant returns the param from the middle vertex, so it's not
1784 * really useful for flat shading. It's meant to be used for custom
1785 * interpolation (but the intrinsic can't fetch from the other two
1786 * vertices).
1787 *
1788 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1789 * to do the right thing. The only reason we use fs.constant is that
1790 * fs.interp cannot be used on integers, because they can be equal
1791 * to NaN.
1792 *
1793 * When interp is false we will use fs.constant or for newer llvm,
1794 * amdgcn.interp.mov.
1795 */
1796 bool interp = interp_param != NULL;
1797
1798 if (interp) {
1799 interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
1800 LLVMVectorType(ctx->f32, 2), "");
1801
1802 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1803 ctx->i32_0, "");
1804 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param,
1805 ctx->i32_1, "");
1806 }
1807
1808 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1809 ctx->shader->key.part.ps.prolog.color_two_side) {
1810 LLVMValueRef is_face_positive;
1811
1812 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1813 * otherwise it's at offset "num_inputs".
1814 */
1815 unsigned back_attr_offset = num_interp_inputs;
1816 if (semantic_index == 1 && colors_read_mask & 0xf)
1817 back_attr_offset += 1;
1818
1819 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
1820 face, ctx->i32_0, "");
1821
1822 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1823 LLVMValueRef front, back;
1824
1825 front = si_build_fs_interp(ctx,
1826 input_index, chan,
1827 prim_mask, i, j);
1828 back = si_build_fs_interp(ctx,
1829 back_attr_offset, chan,
1830 prim_mask, i, j);
1831
1832 result[chan] = LLVMBuildSelect(ctx->ac.builder,
1833 is_face_positive,
1834 front,
1835 back,
1836 "");
1837 }
1838 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1839 result[0] = si_build_fs_interp(ctx, input_index,
1840 0, prim_mask, i, j);
1841 result[1] =
1842 result[2] = LLVMConstReal(ctx->f32, 0.0f);
1843 result[3] = LLVMConstReal(ctx->f32, 1.0f);
1844 } else {
1845 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1846 result[chan] = si_build_fs_interp(ctx,
1847 input_index, chan,
1848 prim_mask, i, j);
1849 }
1850 }
1851 }
1852
1853 void si_llvm_load_input_fs(
1854 struct si_shader_context *ctx,
1855 unsigned input_index,
1856 LLVMValueRef out[4])
1857 {
1858 struct si_shader *shader = ctx->shader;
1859 struct tgsi_shader_info *info = &shader->selector->info;
1860 LLVMValueRef main_fn = ctx->main_fn;
1861 LLVMValueRef interp_param = NULL;
1862 int interp_param_idx;
1863 enum tgsi_semantic semantic_name = info->input_semantic_name[input_index];
1864 unsigned semantic_index = info->input_semantic_index[input_index];
1865 enum tgsi_interpolate_mode interp_mode = info->input_interpolate[input_index];
1866 enum tgsi_interpolate_loc interp_loc = info->input_interpolate_loc[input_index];
1867
1868 /* Get colors from input VGPRs (set by the prolog). */
1869 if (semantic_name == TGSI_SEMANTIC_COLOR) {
1870 unsigned colors_read = shader->selector->info.colors_read;
1871 unsigned mask = colors_read >> (semantic_index * 4);
1872 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1873 (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
1874 LLVMValueRef undef = LLVMGetUndef(ctx->f32);
1875
1876 out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
1877 out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
1878 out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
1879 out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
1880 return;
1881 }
1882
1883 interp_param_idx = lookup_interp_param_index(interp_mode, interp_loc);
1884 if (interp_param_idx == -1)
1885 return;
1886 else if (interp_param_idx) {
1887 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
1888 }
1889
1890 interp_fs_input(ctx, input_index, semantic_name,
1891 semantic_index, 0, /* this param is unused */
1892 shader->selector->info.colors_read, interp_param,
1893 ctx->abi.prim_mask,
1894 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1895 &out[0]);
1896 }
1897
1898 static void declare_input_fs(
1899 struct si_shader_context *ctx,
1900 unsigned input_index,
1901 const struct tgsi_full_declaration *decl,
1902 LLVMValueRef out[4])
1903 {
1904 si_llvm_load_input_fs(ctx, input_index, out);
1905 }
1906
1907 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
1908 {
1909 return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
1910 }
1911
1912 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
1913 {
1914 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1915
1916 /* For non-indexed draws, the base vertex set by the driver
1917 * (for direct draws) or the CP (for indirect draws) is the
1918 * first vertex ID, but GLSL expects 0 to be returned.
1919 */
1920 LLVMValueRef vs_state = LLVMGetParam(ctx->main_fn,
1921 ctx->param_vs_state_bits);
1922 LLVMValueRef indexed;
1923
1924 indexed = LLVMBuildLShr(ctx->ac.builder, vs_state, ctx->i32_1, "");
1925 indexed = LLVMBuildTrunc(ctx->ac.builder, indexed, ctx->i1, "");
1926
1927 return LLVMBuildSelect(ctx->ac.builder, indexed, ctx->abi.base_vertex,
1928 ctx->i32_0, "");
1929 }
1930
1931 static LLVMValueRef get_block_size(struct ac_shader_abi *abi)
1932 {
1933 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1934
1935 LLVMValueRef values[3];
1936 LLVMValueRef result;
1937 unsigned i;
1938 unsigned *properties = ctx->shader->selector->info.properties;
1939
1940 if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
1941 unsigned sizes[3] = {
1942 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1943 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1944 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1945 };
1946
1947 for (i = 0; i < 3; ++i)
1948 values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
1949
1950 result = ac_build_gather_values(&ctx->ac, values, 3);
1951 } else {
1952 result = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
1953 }
1954
1955 return result;
1956 }
1957
1958 /**
1959 * Load a dword from a constant buffer.
1960 */
1961 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1962 LLVMValueRef resource,
1963 LLVMValueRef offset)
1964 {
1965 return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL,
1966 0, 0, 0, true, true);
1967 }
1968
1969 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
1970 {
1971 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1972 LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
1973 LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
1974 LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
1975
1976 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1977 LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), "");
1978 LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
1979
1980 LLVMValueRef pos[4] = {
1981 buffer_load_const(ctx, resource, offset0),
1982 buffer_load_const(ctx, resource, offset1),
1983 LLVMConstReal(ctx->f32, 0),
1984 LLVMConstReal(ctx->f32, 0)
1985 };
1986
1987 return ac_build_gather_values(&ctx->ac, pos, 4);
1988 }
1989
1990 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
1991 {
1992 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1993 return ac_to_integer(&ctx->ac, abi->sample_coverage);
1994 }
1995
1996 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
1997 {
1998 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
1999 LLVMValueRef coord[4] = {
2000 LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
2001 LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
2002 ctx->ac.f32_0,
2003 ctx->ac.f32_0
2004 };
2005
2006 /* For triangles, the vector should be (u, v, 1-u-v). */
2007 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
2008 PIPE_PRIM_TRIANGLES) {
2009 coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
2010 LLVMBuildFAdd(ctx->ac.builder,
2011 coord[0], coord[1], ""), "");
2012 }
2013 return ac_build_gather_values(&ctx->ac, coord, 4);
2014 }
2015
2016 static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
2017 unsigned semantic_name)
2018 {
2019 LLVMValueRef base, addr;
2020
2021 int param = si_shader_io_get_unique_index_patch(semantic_name, 0);
2022
2023 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
2024 addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL,
2025 LLVMConstInt(ctx->i32, param, 0));
2026
2027 return buffer_load(&ctx->bld_base, ctx->f32,
2028 ~0, ctx->tess_offchip_ring, base, addr, true);
2029
2030 }
2031
2032 static LLVMValueRef si_load_tess_level(struct ac_shader_abi *abi,
2033 unsigned varying_id)
2034 {
2035 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2036 unsigned semantic_name;
2037
2038 switch (varying_id) {
2039 case VARYING_SLOT_TESS_LEVEL_INNER:
2040 semantic_name = TGSI_SEMANTIC_TESSINNER;
2041 break;
2042 case VARYING_SLOT_TESS_LEVEL_OUTER:
2043 semantic_name = TGSI_SEMANTIC_TESSOUTER;
2044 break;
2045 default:
2046 unreachable("unknown tess level");
2047 }
2048
2049 return load_tess_level(ctx, semantic_name);
2050
2051 }
2052
2053 static LLVMValueRef si_load_patch_vertices_in(struct ac_shader_abi *abi)
2054 {
2055 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2056 if (ctx->type == PIPE_SHADER_TESS_CTRL)
2057 return si_unpack_param(ctx, ctx->param_tcs_out_lds_layout, 13, 6);
2058 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
2059 return get_num_tcs_out_vertices(ctx);
2060 else
2061 unreachable("invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
2062 }
2063
2064 void si_load_system_value(struct si_shader_context *ctx,
2065 unsigned index,
2066 const struct tgsi_full_declaration *decl)
2067 {
2068 LLVMValueRef value = 0;
2069
2070 assert(index < RADEON_LLVM_MAX_SYSTEM_VALUES);
2071
2072 switch (decl->Semantic.Name) {
2073 case TGSI_SEMANTIC_INSTANCEID:
2074 value = ctx->abi.instance_id;
2075 break;
2076
2077 case TGSI_SEMANTIC_VERTEXID:
2078 value = LLVMBuildAdd(ctx->ac.builder,
2079 ctx->abi.vertex_id,
2080 ctx->abi.base_vertex, "");
2081 break;
2082
2083 case TGSI_SEMANTIC_VERTEXID_NOBASE:
2084 /* Unused. Clarify the meaning in indexed vs. non-indexed
2085 * draws if this is ever used again. */
2086 assert(false);
2087 break;
2088
2089 case TGSI_SEMANTIC_BASEVERTEX:
2090 value = get_base_vertex(&ctx->abi);
2091 break;
2092
2093 case TGSI_SEMANTIC_BASEINSTANCE:
2094 value = ctx->abi.start_instance;
2095 break;
2096
2097 case TGSI_SEMANTIC_DRAWID:
2098 value = ctx->abi.draw_id;
2099 break;
2100
2101 case TGSI_SEMANTIC_INVOCATIONID:
2102 if (ctx->type == PIPE_SHADER_TESS_CTRL)
2103 value = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
2104 else if (ctx->type == PIPE_SHADER_GEOMETRY)
2105 value = ctx->abi.gs_invocation_id;
2106 else
2107 assert(!"INVOCATIONID not implemented");
2108 break;
2109
2110 case TGSI_SEMANTIC_POSITION:
2111 {
2112 LLVMValueRef pos[4] = {
2113 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
2114 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
2115 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
2116 ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
2117 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)),
2118 };
2119 value = ac_build_gather_values(&ctx->ac, pos, 4);
2120 break;
2121 }
2122
2123 case TGSI_SEMANTIC_FACE:
2124 value = ctx->abi.front_face;
2125 break;
2126
2127 case TGSI_SEMANTIC_SAMPLEID:
2128 value = si_get_sample_id(ctx);
2129 break;
2130
2131 case TGSI_SEMANTIC_SAMPLEPOS: {
2132 LLVMValueRef pos[4] = {
2133 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
2134 LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
2135 LLVMConstReal(ctx->f32, 0),
2136 LLVMConstReal(ctx->f32, 0)
2137 };
2138 pos[0] = ac_build_fract(&ctx->ac, pos[0], 32);
2139 pos[1] = ac_build_fract(&ctx->ac, pos[1], 32);
2140 value = ac_build_gather_values(&ctx->ac, pos, 4);
2141 break;
2142 }
2143
2144 case TGSI_SEMANTIC_SAMPLEMASK:
2145 /* This can only occur with the OpenGL Core profile, which
2146 * doesn't support smoothing.
2147 */
2148 value = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
2149 break;
2150
2151 case TGSI_SEMANTIC_TESSCOORD:
2152 value = si_load_tess_coord(&ctx->abi);
2153 break;
2154
2155 case TGSI_SEMANTIC_VERTICESIN:
2156 value = si_load_patch_vertices_in(&ctx->abi);
2157 break;
2158
2159 case TGSI_SEMANTIC_TESSINNER:
2160 case TGSI_SEMANTIC_TESSOUTER:
2161 value = load_tess_level(ctx, decl->Semantic.Name);
2162 break;
2163
2164 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
2165 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
2166 {
2167 LLVMValueRef buf, slot, val[4];
2168 int i, offset;
2169
2170 slot = LLVMConstInt(ctx->i32, SI_HS_CONST_DEFAULT_TESS_LEVELS, 0);
2171 buf = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2172 buf = ac_build_load_to_sgpr(&ctx->ac, buf, slot);
2173 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
2174
2175 for (i = 0; i < 4; i++)
2176 val[i] = buffer_load_const(ctx, buf,
2177 LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
2178 value = ac_build_gather_values(&ctx->ac, val, 4);
2179 break;
2180 }
2181
2182 case TGSI_SEMANTIC_PRIMID:
2183 value = get_primitive_id(ctx, 0);
2184 break;
2185
2186 case TGSI_SEMANTIC_GRID_SIZE:
2187 value = ctx->abi.num_work_groups;
2188 break;
2189
2190 case TGSI_SEMANTIC_BLOCK_SIZE:
2191 value = get_block_size(&ctx->abi);
2192 break;
2193
2194 case TGSI_SEMANTIC_BLOCK_ID:
2195 {
2196 LLVMValueRef values[3];
2197
2198 for (int i = 0; i < 3; i++) {
2199 values[i] = ctx->i32_0;
2200 if (ctx->abi.workgroup_ids[i]) {
2201 values[i] = ctx->abi.workgroup_ids[i];
2202 }
2203 }
2204 value = ac_build_gather_values(&ctx->ac, values, 3);
2205 break;
2206 }
2207
2208 case TGSI_SEMANTIC_THREAD_ID:
2209 value = ctx->abi.local_invocation_ids;
2210 break;
2211
2212 case TGSI_SEMANTIC_HELPER_INVOCATION:
2213 value = ac_build_intrinsic(&ctx->ac,
2214 "llvm.amdgcn.ps.live",
2215 ctx->i1, NULL, 0,
2216 AC_FUNC_ATTR_READNONE);
2217 value = LLVMBuildNot(ctx->ac.builder, value, "");
2218 value = LLVMBuildSExt(ctx->ac.builder, value, ctx->i32, "");
2219 break;
2220
2221 case TGSI_SEMANTIC_SUBGROUP_SIZE:
2222 value = LLVMConstInt(ctx->i32, 64, 0);
2223 break;
2224
2225 case TGSI_SEMANTIC_SUBGROUP_INVOCATION:
2226 value = ac_get_thread_id(&ctx->ac);
2227 break;
2228
2229 case TGSI_SEMANTIC_SUBGROUP_EQ_MASK:
2230 {
2231 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
2232 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
2233 value = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->i64, 1, 0), id, "");
2234 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
2235 break;
2236 }
2237
2238 case TGSI_SEMANTIC_SUBGROUP_GE_MASK:
2239 case TGSI_SEMANTIC_SUBGROUP_GT_MASK:
2240 case TGSI_SEMANTIC_SUBGROUP_LE_MASK:
2241 case TGSI_SEMANTIC_SUBGROUP_LT_MASK:
2242 {
2243 LLVMValueRef id = ac_get_thread_id(&ctx->ac);
2244 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_GT_MASK ||
2245 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK) {
2246 /* All bits set except LSB */
2247 value = LLVMConstInt(ctx->i64, -2, 0);
2248 } else {
2249 /* All bits set */
2250 value = LLVMConstInt(ctx->i64, -1, 0);
2251 }
2252 id = LLVMBuildZExt(ctx->ac.builder, id, ctx->i64, "");
2253 value = LLVMBuildShl(ctx->ac.builder, value, id, "");
2254 if (decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LE_MASK ||
2255 decl->Semantic.Name == TGSI_SEMANTIC_SUBGROUP_LT_MASK)
2256 value = LLVMBuildNot(ctx->ac.builder, value, "");
2257 value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->v2i32, "");
2258 break;
2259 }
2260
2261 case TGSI_SEMANTIC_CS_USER_DATA:
2262 value = LLVMGetParam(ctx->main_fn, ctx->param_cs_user_data);
2263 break;
2264
2265 default:
2266 assert(!"unknown system value");
2267 return;
2268 }
2269
2270 ctx->system_values[index] = value;
2271 }
2272
2273 void si_declare_compute_memory(struct si_shader_context *ctx)
2274 {
2275 struct si_shader_selector *sel = ctx->shader->selector;
2276 unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
2277
2278 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_ADDR_SPACE_LDS);
2279 LLVMValueRef var;
2280
2281 assert(!ctx->ac.lds);
2282
2283 var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
2284 LLVMArrayType(ctx->i8, lds_size),
2285 "compute_lds",
2286 AC_ADDR_SPACE_LDS);
2287 LLVMSetAlignment(var, 4);
2288
2289 ctx->ac.lds = LLVMBuildBitCast(ctx->ac.builder, var, i8p, "");
2290 }
2291
2292 void si_tgsi_declare_compute_memory(struct si_shader_context *ctx,
2293 const struct tgsi_full_declaration *decl)
2294 {
2295 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
2296 assert(decl->Range.First == decl->Range.Last);
2297
2298 si_declare_compute_memory(ctx);
2299 }
2300
2301 static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
2302 {
2303 LLVMValueRef ptr =
2304 LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2305 struct si_shader_selector *sel = ctx->shader->selector;
2306
2307 /* Do the bounds checking with a descriptor, because
2308 * doing computation and manual bounds checking of 64-bit
2309 * addresses generates horrible VALU code with very high
2310 * VGPR usage and very low SIMD occupancy.
2311 */
2312 ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
2313
2314 LLVMValueRef desc0, desc1;
2315 desc0 = ptr;
2316 desc1 = LLVMConstInt(ctx->i32,
2317 S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
2318
2319 LLVMValueRef desc_elems[] = {
2320 desc0,
2321 desc1,
2322 LLVMConstInt(ctx->i32, (sel->info.const_file_max[0] + 1) * 16, 0),
2323 LLVMConstInt(ctx->i32,
2324 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
2325 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
2326 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
2327 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
2328 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
2329 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0)
2330 };
2331
2332 return ac_build_gather_values(&ctx->ac, desc_elems, 4);
2333 }
2334
2335 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
2336 {
2337 LLVMValueRef list_ptr = LLVMGetParam(ctx->main_fn,
2338 ctx->param_const_and_shader_buffers);
2339
2340 return ac_build_load_to_sgpr(&ctx->ac, list_ptr,
2341 LLVMConstInt(ctx->i32, si_get_constbuf_slot(i), 0));
2342 }
2343
2344 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
2345 {
2346 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2347 struct si_shader_selector *sel = ctx->shader->selector;
2348
2349 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2350
2351 if (sel->info.const_buffers_declared == 1 &&
2352 sel->info.shader_buffers_declared == 0) {
2353 return load_const_buffer_desc_fast_path(ctx);
2354 }
2355
2356 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
2357 index = LLVMBuildAdd(ctx->ac.builder, index,
2358 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2359
2360 return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2361 }
2362
2363 static LLVMValueRef
2364 load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
2365 {
2366 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
2367 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->main_fn,
2368 ctx->param_const_and_shader_buffers);
2369
2370 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
2371 index = LLVMBuildSub(ctx->ac.builder,
2372 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS - 1, 0),
2373 index, "");
2374
2375 return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
2376 }
2377
2378 static LLVMValueRef fetch_constant(
2379 struct lp_build_tgsi_context *bld_base,
2380 const struct tgsi_full_src_register *reg,
2381 enum tgsi_opcode_type type,
2382 unsigned swizzle_in)
2383 {
2384 struct si_shader_context *ctx = si_shader_context(bld_base);
2385 struct si_shader_selector *sel = ctx->shader->selector;
2386 const struct tgsi_ind_register *ireg = &reg->Indirect;
2387 unsigned buf, idx;
2388 unsigned swizzle = swizzle_in & 0xffff;
2389
2390 LLVMValueRef addr, bufp;
2391
2392 if (swizzle_in == LP_CHAN_ALL) {
2393 unsigned chan;
2394 LLVMValueRef values[4];
2395 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
2396 values[chan] = fetch_constant(bld_base, reg, type, chan);
2397
2398 return ac_build_gather_values(&ctx->ac, values, 4);
2399 }
2400
2401 /* Split 64-bit loads. */
2402 if (tgsi_type_is_64bit(type)) {
2403 LLVMValueRef lo, hi;
2404
2405 lo = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, swizzle);
2406 hi = fetch_constant(bld_base, reg, TGSI_TYPE_UNSIGNED, (swizzle_in >> 16));
2407 return si_llvm_emit_fetch_64bit(bld_base, tgsi2llvmtype(bld_base, type),
2408 lo, hi);
2409 }
2410
2411 idx = reg->Register.Index * 4 + swizzle;
2412 if (reg->Register.Indirect) {
2413 addr = si_get_indirect_index(ctx, ireg, 16, idx * 4);
2414 } else {
2415 addr = LLVMConstInt(ctx->i32, idx * 4, 0);
2416 }
2417
2418 /* Fast path when user data SGPRs point to constant buffer 0 directly. */
2419 if (sel->info.const_buffers_declared == 1 &&
2420 sel->info.shader_buffers_declared == 0) {
2421 LLVMValueRef desc = load_const_buffer_desc_fast_path(ctx);
2422 LLVMValueRef result = buffer_load_const(ctx, desc, addr);
2423 return bitcast(bld_base, type, result);
2424 }
2425
2426 assert(reg->Register.Dimension);
2427 buf = reg->Dimension.Index;
2428
2429 if (reg->Dimension.Indirect) {
2430 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_const_and_shader_buffers);
2431 LLVMValueRef index;
2432 index = si_get_bounded_indirect_index(ctx, &reg->DimIndirect,
2433 reg->Dimension.Index,
2434 ctx->num_const_buffers);
2435 index = LLVMBuildAdd(ctx->ac.builder, index,
2436 LLVMConstInt(ctx->i32, SI_NUM_SHADER_BUFFERS, 0), "");
2437 bufp = ac_build_load_to_sgpr(&ctx->ac, ptr, index);
2438 } else
2439 bufp = load_const_buffer_desc(ctx, buf);
2440
2441 return bitcast(bld_base, type, buffer_load_const(ctx, bufp, addr));
2442 }
2443
2444 /* Initialize arguments for the shader export intrinsic */
2445 static void si_llvm_init_export_args(struct si_shader_context *ctx,
2446 LLVMValueRef *values,
2447 unsigned target,
2448 struct ac_export_args *args)
2449 {
2450 LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
2451 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
2452 unsigned chan;
2453 bool is_int8, is_int10;
2454
2455 /* Default is 0xf. Adjusted below depending on the format. */
2456 args->enabled_channels = 0xf; /* writemask */
2457
2458 /* Specify whether the EXEC mask represents the valid mask */
2459 args->valid_mask = 0;
2460
2461 /* Specify whether this is the last export */
2462 args->done = 0;
2463
2464 /* Specify the target we are exporting */
2465 args->target = target;
2466
2467 if (ctx->type == PIPE_SHADER_FRAGMENT) {
2468 const struct si_shader_key *key = &ctx->shader->key;
2469 unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
2470 int cbuf = target - V_008DFC_SQ_EXP_MRT;
2471
2472 assert(cbuf >= 0 && cbuf < 8);
2473 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
2474 is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1;
2475 is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1;
2476 }
2477
2478 args->compr = false;
2479 args->out[0] = f32undef;
2480 args->out[1] = f32undef;
2481 args->out[2] = f32undef;
2482 args->out[3] = f32undef;
2483
2484 LLVMValueRef (*packf)(struct ac_llvm_context *ctx, LLVMValueRef args[2]) = NULL;
2485 LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef args[2],
2486 unsigned bits, bool hi) = NULL;
2487
2488 switch (spi_shader_col_format) {
2489 case V_028714_SPI_SHADER_ZERO:
2490 args->enabled_channels = 0; /* writemask */
2491 args->target = V_008DFC_SQ_EXP_NULL;
2492 break;
2493
2494 case V_028714_SPI_SHADER_32_R:
2495 args->enabled_channels = 1; /* writemask */
2496 args->out[0] = values[0];
2497 break;
2498
2499 case V_028714_SPI_SHADER_32_GR:
2500 args->enabled_channels = 0x3; /* writemask */
2501 args->out[0] = values[0];
2502 args->out[1] = values[1];
2503 break;
2504
2505 case V_028714_SPI_SHADER_32_AR:
2506 args->enabled_channels = 0x9; /* writemask */
2507 args->out[0] = values[0];
2508 args->out[3] = values[3];
2509 break;
2510
2511 case V_028714_SPI_SHADER_FP16_ABGR:
2512 packf = ac_build_cvt_pkrtz_f16;
2513 break;
2514
2515 case V_028714_SPI_SHADER_UNORM16_ABGR:
2516 packf = ac_build_cvt_pknorm_u16;
2517 break;
2518
2519 case V_028714_SPI_SHADER_SNORM16_ABGR:
2520 packf = ac_build_cvt_pknorm_i16;
2521 break;
2522
2523 case V_028714_SPI_SHADER_UINT16_ABGR:
2524 packi = ac_build_cvt_pk_u16;
2525 break;
2526
2527 case V_028714_SPI_SHADER_SINT16_ABGR:
2528 packi = ac_build_cvt_pk_i16;
2529 break;
2530
2531 case V_028714_SPI_SHADER_32_ABGR:
2532 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
2533 break;
2534 }
2535
2536 /* Pack f16 or norm_i16/u16. */
2537 if (packf) {
2538 for (chan = 0; chan < 2; chan++) {
2539 LLVMValueRef pack_args[2] = {
2540 values[2 * chan],
2541 values[2 * chan + 1]
2542 };
2543 LLVMValueRef packed;
2544
2545 packed = packf(&ctx->ac, pack_args);
2546 args->out[chan] = ac_to_float(&ctx->ac, packed);
2547 }
2548 args->compr = 1; /* COMPR flag */
2549 }
2550 /* Pack i16/u16. */
2551 if (packi) {
2552 for (chan = 0; chan < 2; chan++) {
2553 LLVMValueRef pack_args[2] = {
2554 ac_to_integer(&ctx->ac, values[2 * chan]),
2555 ac_to_integer(&ctx->ac, values[2 * chan + 1])
2556 };
2557 LLVMValueRef packed;
2558
2559 packed = packi(&ctx->ac, pack_args,
2560 is_int8 ? 8 : is_int10 ? 10 : 16,
2561 chan == 1);
2562 args->out[chan] = ac_to_float(&ctx->ac, packed);
2563 }
2564 args->compr = 1; /* COMPR flag */
2565 }
2566 }
2567
2568 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2569 LLVMValueRef alpha)
2570 {
2571 struct si_shader_context *ctx = si_shader_context(bld_base);
2572
2573 if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2574 static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
2575 [PIPE_FUNC_LESS] = LLVMRealOLT,
2576 [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
2577 [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
2578 [PIPE_FUNC_GREATER] = LLVMRealOGT,
2579 [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
2580 [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
2581 };
2582 LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func];
2583 assert(cond);
2584
2585 LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn,
2586 SI_PARAM_ALPHA_REF);
2587 LLVMValueRef alpha_pass =
2588 LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
2589 ac_build_kill_if_false(&ctx->ac, alpha_pass);
2590 } else {
2591 ac_build_kill_if_false(&ctx->ac, ctx->i1false);
2592 }
2593 }
2594
2595 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2596 LLVMValueRef alpha,
2597 unsigned samplemask_param)
2598 {
2599 struct si_shader_context *ctx = si_shader_context(bld_base);
2600 LLVMValueRef coverage;
2601
2602 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2603 coverage = LLVMGetParam(ctx->main_fn,
2604 samplemask_param);
2605 coverage = ac_to_integer(&ctx->ac, coverage);
2606
2607 coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32",
2608 ctx->i32,
2609 &coverage, 1, AC_FUNC_ATTR_READNONE);
2610
2611 coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
2612 ctx->f32, "");
2613
2614 coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
2615 LLVMConstReal(ctx->f32,
2616 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2617
2618 return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
2619 }
2620
2621 static void si_llvm_emit_clipvertex(struct si_shader_context *ctx,
2622 struct ac_export_args *pos, LLVMValueRef *out_elts)
2623 {
2624 unsigned reg_index;
2625 unsigned chan;
2626 unsigned const_chan;
2627 LLVMValueRef base_elt;
2628 LLVMValueRef ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
2629 LLVMValueRef constbuf_index = LLVMConstInt(ctx->i32,
2630 SI_VS_CONST_CLIP_PLANES, 0);
2631 LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
2632
2633 for (reg_index = 0; reg_index < 2; reg_index ++) {
2634 struct ac_export_args *args = &pos[2 + reg_index];
2635
2636 args->out[0] =
2637 args->out[1] =
2638 args->out[2] =
2639 args->out[3] = LLVMConstReal(ctx->f32, 0.0f);
2640
2641 /* Compute dot products of position and user clip plane vectors */
2642 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2643 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2644 LLVMValueRef addr =
2645 LLVMConstInt(ctx->i32, ((reg_index * 4 + chan) * 4 +
2646 const_chan) * 4, 0);
2647 base_elt = buffer_load_const(ctx, const_resource,
2648 addr);
2649 args->out[chan] = ac_build_fmad(&ctx->ac, base_elt,
2650 out_elts[const_chan], args->out[chan]);
2651 }
2652 }
2653
2654 args->enabled_channels = 0xf;
2655 args->valid_mask = 0;
2656 args->done = 0;
2657 args->target = V_008DFC_SQ_EXP_POS + 2 + reg_index;
2658 args->compr = 0;
2659 }
2660 }
2661
2662 static void si_dump_streamout(struct pipe_stream_output_info *so)
2663 {
2664 unsigned i;
2665
2666 if (so->num_outputs)
2667 fprintf(stderr, "STREAMOUT\n");
2668
2669 for (i = 0; i < so->num_outputs; i++) {
2670 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2671 so->output[i].start_component;
2672 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2673 i, so->output[i].output_buffer,
2674 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2675 so->output[i].register_index,
2676 mask & 1 ? "x" : "",
2677 mask & 2 ? "y" : "",
2678 mask & 4 ? "z" : "",
2679 mask & 8 ? "w" : "");
2680 }
2681 }
2682
2683 static void emit_streamout_output(struct si_shader_context *ctx,
2684 LLVMValueRef const *so_buffers,
2685 LLVMValueRef const *so_write_offsets,
2686 struct pipe_stream_output *stream_out,
2687 struct si_shader_output_values *shader_out)
2688 {
2689 unsigned buf_idx = stream_out->output_buffer;
2690 unsigned start = stream_out->start_component;
2691 unsigned num_comps = stream_out->num_components;
2692 LLVMValueRef out[4];
2693
2694 assert(num_comps && num_comps <= 4);
2695 if (!num_comps || num_comps > 4)
2696 return;
2697
2698 /* Load the output as int. */
2699 for (int j = 0; j < num_comps; j++) {
2700 assert(stream_out->stream == shader_out->vertex_stream[start + j]);
2701
2702 out[j] = ac_to_integer(&ctx->ac, shader_out->values[start + j]);
2703 }
2704
2705 /* Pack the output. */
2706 LLVMValueRef vdata = NULL;
2707
2708 switch (num_comps) {
2709 case 1: /* as i32 */
2710 vdata = out[0];
2711 break;
2712 case 2: /* as v2i32 */
2713 case 3: /* as v4i32 (aligned to 4) */
2714 out[3] = LLVMGetUndef(ctx->i32);
2715 /* fall through */
2716 case 4: /* as v4i32 */
2717 vdata = ac_build_gather_values(&ctx->ac, out, util_next_power_of_two(num_comps));
2718 break;
2719 }
2720
2721 ac_build_buffer_store_dword(&ctx->ac, so_buffers[buf_idx],
2722 vdata, num_comps,
2723 so_write_offsets[buf_idx],
2724 ctx->i32_0,
2725 stream_out->dst_offset * 4, 1, 1, true, false);
2726 }
2727
2728 /**
2729 * Write streamout data to buffers for vertex stream @p stream (different
2730 * vertex streams can occur for GS copy shaders).
2731 */
2732 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2733 struct si_shader_output_values *outputs,
2734 unsigned noutput, unsigned stream)
2735 {
2736 struct si_shader_selector *sel = ctx->shader->selector;
2737 struct pipe_stream_output_info *so = &sel->so;
2738 LLVMBuilderRef builder = ctx->ac.builder;
2739 int i;
2740 struct lp_build_if_state if_ctx;
2741
2742 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2743 LLVMValueRef so_vtx_count =
2744 si_unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2745
2746 LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
2747
2748 /* can_emit = tid < so_vtx_count; */
2749 LLVMValueRef can_emit =
2750 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2751
2752 /* Emit the streamout code conditionally. This actually avoids
2753 * out-of-bounds buffer access. The hw tells us via the SGPR
2754 * (so_vtx_count) which threads are allowed to emit streamout data. */
2755 lp_build_if(&if_ctx, &ctx->gallivm, can_emit);
2756 {
2757 /* The buffer offset is computed as follows:
2758 * ByteOffset = streamout_offset[buffer_id]*4 +
2759 * (streamout_write_index + thread_id)*stride[buffer_id] +
2760 * attrib_offset
2761 */
2762
2763 LLVMValueRef so_write_index =
2764 LLVMGetParam(ctx->main_fn,
2765 ctx->param_streamout_write_index);
2766
2767 /* Compute (streamout_write_index + thread_id). */
2768 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2769
2770 /* Load the descriptor and compute the write offset for each
2771 * enabled buffer. */
2772 LLVMValueRef so_write_offset[4] = {};
2773 LLVMValueRef so_buffers[4];
2774 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
2775 ctx->param_rw_buffers);
2776
2777 for (i = 0; i < 4; i++) {
2778 if (!so->stride[i])
2779 continue;
2780
2781 LLVMValueRef offset = LLVMConstInt(ctx->i32,
2782 SI_VS_STREAMOUT_BUF0 + i, 0);
2783
2784 so_buffers[i] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
2785
2786 LLVMValueRef so_offset = LLVMGetParam(ctx->main_fn,
2787 ctx->param_streamout_offset[i]);
2788 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2789
2790 so_write_offset[i] = ac_build_imad(&ctx->ac, so_write_index,
2791 LLVMConstInt(ctx->i32, so->stride[i]*4, 0),
2792 so_offset);
2793 }
2794
2795 /* Write streamout data. */
2796 for (i = 0; i < so->num_outputs; i++) {
2797 unsigned reg = so->output[i].register_index;
2798
2799 if (reg >= noutput)
2800 continue;
2801
2802 if (stream != so->output[i].stream)
2803 continue;
2804
2805 emit_streamout_output(ctx, so_buffers, so_write_offset,
2806 &so->output[i], &outputs[reg]);
2807 }
2808 }
2809 lp_build_endif(&if_ctx);
2810 }
2811
2812 static void si_export_param(struct si_shader_context *ctx, unsigned index,
2813 LLVMValueRef *values)
2814 {
2815 struct ac_export_args args;
2816
2817 si_llvm_init_export_args(ctx, values,
2818 V_008DFC_SQ_EXP_PARAM + index, &args);
2819 ac_build_export(&ctx->ac, &args);
2820 }
2821
2822 static void si_build_param_exports(struct si_shader_context *ctx,
2823 struct si_shader_output_values *outputs,
2824 unsigned noutput)
2825 {
2826 struct si_shader *shader = ctx->shader;
2827 unsigned param_count = 0;
2828
2829 for (unsigned i = 0; i < noutput; i++) {
2830 unsigned semantic_name = outputs[i].semantic_name;
2831 unsigned semantic_index = outputs[i].semantic_index;
2832
2833 if (outputs[i].vertex_stream[0] != 0 &&
2834 outputs[i].vertex_stream[1] != 0 &&
2835 outputs[i].vertex_stream[2] != 0 &&
2836 outputs[i].vertex_stream[3] != 0)
2837 continue;
2838
2839 switch (semantic_name) {
2840 case TGSI_SEMANTIC_LAYER:
2841 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2842 case TGSI_SEMANTIC_CLIPDIST:
2843 case TGSI_SEMANTIC_COLOR:
2844 case TGSI_SEMANTIC_BCOLOR:
2845 case TGSI_SEMANTIC_PRIMID:
2846 case TGSI_SEMANTIC_FOG:
2847 case TGSI_SEMANTIC_TEXCOORD:
2848 case TGSI_SEMANTIC_GENERIC:
2849 break;
2850 default:
2851 continue;
2852 }
2853
2854 if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
2855 semantic_index < SI_MAX_IO_GENERIC) &&
2856 shader->key.opt.kill_outputs &
2857 (1ull << si_shader_io_get_unique_index(semantic_name,
2858 semantic_index, true)))
2859 continue;
2860
2861 si_export_param(ctx, param_count, outputs[i].values);
2862
2863 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2864 shader->info.vs_output_param_offset[i] = param_count++;
2865 }
2866
2867 shader->info.nr_param_exports = param_count;
2868 }
2869
2870 /* Generate export instructions for hardware VS shader stage */
2871 static void si_llvm_export_vs(struct si_shader_context *ctx,
2872 struct si_shader_output_values *outputs,
2873 unsigned noutput)
2874 {
2875 struct si_shader *shader = ctx->shader;
2876 struct ac_export_args pos_args[4] = {};
2877 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2878 unsigned pos_idx;
2879 int i;
2880
2881 /* Build position exports. */
2882 for (i = 0; i < noutput; i++) {
2883 switch (outputs[i].semantic_name) {
2884 case TGSI_SEMANTIC_POSITION:
2885 si_llvm_init_export_args(ctx, outputs[i].values,
2886 V_008DFC_SQ_EXP_POS, &pos_args[0]);
2887 break;
2888 case TGSI_SEMANTIC_PSIZE:
2889 psize_value = outputs[i].values[0];
2890 break;
2891 case TGSI_SEMANTIC_LAYER:
2892 layer_value = outputs[i].values[0];
2893 break;
2894 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2895 viewport_index_value = outputs[i].values[0];
2896 break;
2897 case TGSI_SEMANTIC_EDGEFLAG:
2898 edgeflag_value = outputs[i].values[0];
2899 break;
2900 case TGSI_SEMANTIC_CLIPDIST:
2901 if (!shader->key.opt.clip_disable) {
2902 unsigned index = 2 + outputs[i].semantic_index;
2903 si_llvm_init_export_args(ctx, outputs[i].values,
2904 V_008DFC_SQ_EXP_POS + index,
2905 &pos_args[index]);
2906 }
2907 break;
2908 case TGSI_SEMANTIC_CLIPVERTEX:
2909 if (!shader->key.opt.clip_disable) {
2910 si_llvm_emit_clipvertex(ctx, pos_args,
2911 outputs[i].values);
2912 }
2913 break;
2914 }
2915 }
2916
2917 /* We need to add the position output manually if it's missing. */
2918 if (!pos_args[0].out[0]) {
2919 pos_args[0].enabled_channels = 0xf; /* writemask */
2920 pos_args[0].valid_mask = 0; /* EXEC mask */
2921 pos_args[0].done = 0; /* last export? */
2922 pos_args[0].target = V_008DFC_SQ_EXP_POS;
2923 pos_args[0].compr = 0; /* COMPR flag */
2924 pos_args[0].out[0] = ctx->ac.f32_0; /* X */
2925 pos_args[0].out[1] = ctx->ac.f32_0; /* Y */
2926 pos_args[0].out[2] = ctx->ac.f32_0; /* Z */
2927 pos_args[0].out[3] = ctx->ac.f32_1; /* W */
2928 }
2929
2930 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2931 if (shader->selector->info.writes_psize ||
2932 shader->selector->info.writes_edgeflag ||
2933 shader->selector->info.writes_viewport_index ||
2934 shader->selector->info.writes_layer) {
2935 pos_args[1].enabled_channels = shader->selector->info.writes_psize |
2936 (shader->selector->info.writes_edgeflag << 1) |
2937 (shader->selector->info.writes_layer << 2);
2938
2939 pos_args[1].valid_mask = 0; /* EXEC mask */
2940 pos_args[1].done = 0; /* last export? */
2941 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
2942 pos_args[1].compr = 0; /* COMPR flag */
2943 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
2944 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
2945 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
2946 pos_args[1].out[3] = ctx->ac.f32_0; /* W */
2947
2948 if (shader->selector->info.writes_psize)
2949 pos_args[1].out[0] = psize_value;
2950
2951 if (shader->selector->info.writes_edgeflag) {
2952 /* The output is a float, but the hw expects an integer
2953 * with the first bit containing the edge flag. */
2954 edgeflag_value = LLVMBuildFPToUI(ctx->ac.builder,
2955 edgeflag_value,
2956 ctx->i32, "");
2957 edgeflag_value = ac_build_umin(&ctx->ac,
2958 edgeflag_value,
2959 ctx->i32_1);
2960
2961 /* The LLVM intrinsic expects a float. */
2962 pos_args[1].out[1] = ac_to_float(&ctx->ac, edgeflag_value);
2963 }
2964
2965 if (ctx->screen->info.chip_class >= GFX9) {
2966 /* GFX9 has the layer in out.z[10:0] and the viewport
2967 * index in out.z[19:16].
2968 */
2969 if (shader->selector->info.writes_layer)
2970 pos_args[1].out[2] = layer_value;
2971
2972 if (shader->selector->info.writes_viewport_index) {
2973 LLVMValueRef v = viewport_index_value;
2974
2975 v = ac_to_integer(&ctx->ac, v);
2976 v = LLVMBuildShl(ctx->ac.builder, v,
2977 LLVMConstInt(ctx->i32, 16, 0), "");
2978 v = LLVMBuildOr(ctx->ac.builder, v,
2979 ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
2980 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
2981 pos_args[1].enabled_channels |= 1 << 2;
2982 }
2983 } else {
2984 if (shader->selector->info.writes_layer)
2985 pos_args[1].out[2] = layer_value;
2986
2987 if (shader->selector->info.writes_viewport_index) {
2988 pos_args[1].out[3] = viewport_index_value;
2989 pos_args[1].enabled_channels |= 1 << 3;
2990 }
2991 }
2992 }
2993
2994 for (i = 0; i < 4; i++)
2995 if (pos_args[i].out[0])
2996 shader->info.nr_pos_exports++;
2997
2998 pos_idx = 0;
2999 for (i = 0; i < 4; i++) {
3000 if (!pos_args[i].out[0])
3001 continue;
3002
3003 /* Specify the target we are exporting */
3004 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
3005
3006 if (pos_idx == shader->info.nr_pos_exports)
3007 /* Specify that this is the last export */
3008 pos_args[i].done = 1;
3009
3010 ac_build_export(&ctx->ac, &pos_args[i]);
3011 }
3012
3013 /* Build parameter exports. */
3014 si_build_param_exports(ctx, outputs, noutput);
3015 }
3016
3017 /**
3018 * Forward all outputs from the vertex shader to the TES. This is only used
3019 * for the fixed function TCS.
3020 */
3021 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
3022 {
3023 struct si_shader_context *ctx = si_shader_context(bld_base);
3024 LLVMValueRef invocation_id, buffer, buffer_offset;
3025 LLVMValueRef lds_vertex_stride, lds_base;
3026 uint64_t inputs;
3027
3028 invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
3029 buffer = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
3030 buffer_offset = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
3031
3032 lds_vertex_stride = get_tcs_in_vertex_dw_stride(ctx);
3033 lds_base = get_tcs_in_current_patch_offset(ctx);
3034 lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride,
3035 lds_base);
3036
3037 inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy;
3038 while (inputs) {
3039 unsigned i = u_bit_scan64(&inputs);
3040
3041 LLVMValueRef lds_ptr = LLVMBuildAdd(ctx->ac.builder, lds_base,
3042 LLVMConstInt(ctx->i32, 4 * i, 0),
3043 "");
3044
3045 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
3046 get_rel_patch_id(ctx),
3047 invocation_id,
3048 LLVMConstInt(ctx->i32, i, 0));
3049
3050 LLVMValueRef value = lds_load(bld_base, ctx->ac.i32, ~0,
3051 lds_ptr);
3052
3053 ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buffer_addr,
3054 buffer_offset, 0, 1, 0, true, false);
3055 }
3056 }
3057
3058 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
3059 LLVMValueRef rel_patch_id,
3060 LLVMValueRef invocation_id,
3061 LLVMValueRef tcs_out_current_patch_data_offset,
3062 LLVMValueRef invoc0_tf_outer[4],
3063 LLVMValueRef invoc0_tf_inner[2])
3064 {
3065 struct si_shader_context *ctx = si_shader_context(bld_base);
3066 struct si_shader *shader = ctx->shader;
3067 unsigned tess_inner_index, tess_outer_index;
3068 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
3069 LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
3070 unsigned stride, outer_comps, inner_comps, i, offset;
3071 struct lp_build_if_state if_ctx, inner_if_ctx;
3072
3073 /* Add a barrier before loading tess factors from LDS. */
3074 if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
3075 si_llvm_emit_barrier(NULL, bld_base, NULL);
3076
3077 /* Do this only for invocation 0, because the tess levels are per-patch,
3078 * not per-vertex.
3079 *
3080 * This can't jump, because invocation 0 executes this. It should
3081 * at least mask out the loads and stores for other invocations.
3082 */
3083 lp_build_if(&if_ctx, &ctx->gallivm,
3084 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3085 invocation_id, ctx->i32_0, ""));
3086
3087 /* Determine the layout of one tess factor element in the buffer. */
3088 switch (shader->key.part.tcs.epilog.prim_mode) {
3089 case PIPE_PRIM_LINES:
3090 stride = 2; /* 2 dwords, 1 vec2 store */
3091 outer_comps = 2;
3092 inner_comps = 0;
3093 break;
3094 case PIPE_PRIM_TRIANGLES:
3095 stride = 4; /* 4 dwords, 1 vec4 store */
3096 outer_comps = 3;
3097 inner_comps = 1;
3098 break;
3099 case PIPE_PRIM_QUADS:
3100 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
3101 outer_comps = 4;
3102 inner_comps = 2;
3103 break;
3104 default:
3105 assert(0);
3106 return;
3107 }
3108
3109 for (i = 0; i < 4; i++) {
3110 inner[i] = LLVMGetUndef(ctx->i32);
3111 outer[i] = LLVMGetUndef(ctx->i32);
3112 }
3113
3114 if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
3115 /* Tess factors are in VGPRs. */
3116 for (i = 0; i < outer_comps; i++)
3117 outer[i] = out[i] = invoc0_tf_outer[i];
3118 for (i = 0; i < inner_comps; i++)
3119 inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
3120 } else {
3121 /* Load tess_inner and tess_outer from LDS.
3122 * Any invocation can write them, so we can't get them from a temporary.
3123 */
3124 tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
3125 tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
3126
3127 lds_base = tcs_out_current_patch_data_offset;
3128 lds_inner = LLVMBuildAdd(ctx->ac.builder, lds_base,
3129 LLVMConstInt(ctx->i32,
3130 tess_inner_index * 4, 0), "");
3131 lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_base,
3132 LLVMConstInt(ctx->i32,
3133 tess_outer_index * 4, 0), "");
3134
3135 for (i = 0; i < outer_comps; i++) {
3136 outer[i] = out[i] =
3137 lds_load(bld_base, ctx->ac.i32, i, lds_outer);
3138 }
3139 for (i = 0; i < inner_comps; i++) {
3140 inner[i] = out[outer_comps+i] =
3141 lds_load(bld_base, ctx->ac.i32, i, lds_inner);
3142 }
3143 }
3144
3145 if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
3146 /* For isolines, the hardware expects tess factors in the
3147 * reverse order from what GLSL / TGSI specify.
3148 */
3149 LLVMValueRef tmp = out[0];
3150 out[0] = out[1];
3151 out[1] = tmp;
3152 }
3153
3154 /* Convert the outputs to vectors for stores. */
3155 vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
3156 vec1 = NULL;
3157
3158 if (stride > 4)
3159 vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
3160
3161 /* Get the buffer. */
3162 buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
3163
3164 /* Get the offset. */
3165 tf_base = LLVMGetParam(ctx->main_fn,
3166 ctx->param_tcs_factor_offset);
3167 byteoffset = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
3168 LLVMConstInt(ctx->i32, 4 * stride, 0), "");
3169
3170 lp_build_if(&inner_if_ctx, &ctx->gallivm,
3171 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3172 rel_patch_id, ctx->i32_0, ""));
3173
3174 /* Store the dynamic HS control word. */
3175 offset = 0;
3176 if (ctx->screen->info.chip_class <= VI) {
3177 ac_build_buffer_store_dword(&ctx->ac, buffer,
3178 LLVMConstInt(ctx->i32, 0x80000000, 0),
3179 1, ctx->i32_0, tf_base,
3180 offset, 1, 0, true, false);
3181 offset += 4;
3182 }
3183
3184 lp_build_endif(&inner_if_ctx);
3185
3186 /* Store the tessellation factors. */
3187 ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
3188 MIN2(stride, 4), byteoffset, tf_base,
3189 offset, 1, 0, true, false);
3190 offset += 16;
3191 if (vec1)
3192 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
3193 stride - 4, byteoffset, tf_base,
3194 offset, 1, 0, true, false);
3195
3196 /* Store the tess factors into the offchip buffer if TES reads them. */
3197 if (shader->key.part.tcs.epilog.tes_reads_tess_factors) {
3198 LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset;
3199 LLVMValueRef tf_inner_offset;
3200 unsigned param_outer, param_inner;
3201
3202 buf = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TCS);
3203 base = LLVMGetParam(ctx->main_fn, ctx->param_tcs_offchip_offset);
3204
3205 param_outer = si_shader_io_get_unique_index_patch(
3206 TGSI_SEMANTIC_TESSOUTER, 0);
3207 tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3208 LLVMConstInt(ctx->i32, param_outer, 0));
3209
3210 outer_vec = ac_build_gather_values(&ctx->ac, outer,
3211 util_next_power_of_two(outer_comps));
3212
3213 ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
3214 outer_comps, tf_outer_offset,
3215 base, 0, 1, 0, true, false);
3216 if (inner_comps) {
3217 param_inner = si_shader_io_get_unique_index_patch(
3218 TGSI_SEMANTIC_TESSINNER, 0);
3219 tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
3220 LLVMConstInt(ctx->i32, param_inner, 0));
3221
3222 inner_vec = inner_comps == 1 ? inner[0] :
3223 ac_build_gather_values(&ctx->ac, inner, inner_comps);
3224 ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
3225 inner_comps, tf_inner_offset,
3226 base, 0, 1, 0, true, false);
3227 }
3228 }
3229
3230 lp_build_endif(&if_ctx);
3231 }
3232
3233 static LLVMValueRef
3234 si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
3235 unsigned param, unsigned return_index)
3236 {
3237 return LLVMBuildInsertValue(ctx->ac.builder, ret,
3238 LLVMGetParam(ctx->main_fn, param),
3239 return_index, "");
3240 }
3241
3242 static LLVMValueRef
3243 si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
3244 unsigned param, unsigned return_index)
3245 {
3246 LLVMBuilderRef builder = ctx->ac.builder;
3247 LLVMValueRef p = LLVMGetParam(ctx->main_fn, param);
3248
3249 return LLVMBuildInsertValue(builder, ret,
3250 ac_to_float(&ctx->ac, p),
3251 return_index, "");
3252 }
3253
3254 static LLVMValueRef
3255 si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
3256 unsigned param, unsigned return_index)
3257 {
3258 LLVMBuilderRef builder = ctx->ac.builder;
3259 LLVMValueRef ptr, lo, hi;
3260
3261 ptr = LLVMGetParam(ctx->main_fn, param);
3262 ptr = LLVMBuildPtrToInt(builder, ptr, ctx->i32, "");
3263 return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
3264 }
3265
3266 /* This only writes the tessellation factor levels. */
3267 static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi,
3268 unsigned max_outputs,
3269 LLVMValueRef *addrs)
3270 {
3271 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3272 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
3273 LLVMBuilderRef builder = ctx->ac.builder;
3274 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
3275
3276 si_copy_tcs_inputs(bld_base);
3277
3278 rel_patch_id = get_rel_patch_id(ctx);
3279 invocation_id = unpack_llvm_param(ctx, ctx->abi.tcs_rel_ids, 8, 5);
3280 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
3281
3282 if (ctx->screen->info.chip_class >= GFX9) {
3283 LLVMBasicBlockRef blocks[2] = {
3284 LLVMGetInsertBlock(builder),
3285 ctx->merged_wrap_if_state.entry_block
3286 };
3287 LLVMValueRef values[2];
3288
3289 lp_build_endif(&ctx->merged_wrap_if_state);
3290
3291 values[0] = rel_patch_id;
3292 values[1] = LLVMGetUndef(ctx->i32);
3293 rel_patch_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3294
3295 values[0] = tf_lds_offset;
3296 values[1] = LLVMGetUndef(ctx->i32);
3297 tf_lds_offset = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3298
3299 values[0] = invocation_id;
3300 values[1] = ctx->i32_1; /* cause the epilog to skip threads */
3301 invocation_id = ac_build_phi(&ctx->ac, ctx->i32, 2, values, blocks);
3302 }
3303
3304 /* Return epilog parameters from this function. */
3305 LLVMValueRef ret = ctx->return_value;
3306 unsigned vgpr;
3307
3308 if (ctx->screen->info.chip_class >= GFX9) {
3309 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3310 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3311 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3312 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3313 /* Tess offchip and tess factor offsets are at the beginning. */
3314 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3315 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3316 vgpr = 8 + GFX9_SGPR_TCS_OUT_LAYOUT + 1;
3317 } else {
3318 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3319 GFX6_SGPR_TCS_OFFCHIP_LAYOUT);
3320 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3321 GFX6_SGPR_TCS_OUT_LAYOUT);
3322 /* Tess offchip and tess factor offsets are after user SGPRs. */
3323 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset,
3324 GFX6_TCS_NUM_USER_SGPR);
3325 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset,
3326 GFX6_TCS_NUM_USER_SGPR + 1);
3327 vgpr = GFX6_TCS_NUM_USER_SGPR + 2;
3328 }
3329
3330 /* VGPRs */
3331 rel_patch_id = ac_to_float(&ctx->ac, rel_patch_id);
3332 invocation_id = ac_to_float(&ctx->ac, invocation_id);
3333 tf_lds_offset = ac_to_float(&ctx->ac, tf_lds_offset);
3334
3335 /* Leave a hole corresponding to the two input VGPRs. This ensures that
3336 * the invocation_id output does not alias the tcs_rel_ids input,
3337 * which saves a V_MOV on gfx9.
3338 */
3339 vgpr += 2;
3340
3341 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
3342 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
3343
3344 if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
3345 vgpr++; /* skip the tess factor LDS offset */
3346 for (unsigned i = 0; i < 6; i++) {
3347 LLVMValueRef value =
3348 LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], "");
3349 value = ac_to_float(&ctx->ac, value);
3350 ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, "");
3351 }
3352 } else {
3353 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
3354 }
3355 ctx->return_value = ret;
3356 }
3357
3358 /* Pass TCS inputs from LS to TCS on GFX9. */
3359 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
3360 {
3361 LLVMValueRef ret = ctx->return_value;
3362
3363 ret = si_insert_input_ptr(ctx, ret, 0, 0);
3364 ret = si_insert_input_ptr(ctx, ret, 1, 1);
3365 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_offset, 2);
3366 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3367 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_factor_offset, 4);
3368 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3369
3370 ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3371 8 + SI_SGPR_RW_BUFFERS);
3372 ret = si_insert_input_ptr(ctx, ret,
3373 ctx->param_bindless_samplers_and_images,
3374 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3375
3376 ret = si_insert_input_ret(ctx, ret, ctx->param_vs_state_bits,
3377 8 + SI_SGPR_VS_STATE_BITS);
3378
3379 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_offchip_layout,
3380 8 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT);
3381 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_offsets,
3382 8 + GFX9_SGPR_TCS_OUT_OFFSETS);
3383 ret = si_insert_input_ret(ctx, ret, ctx->param_tcs_out_lds_layout,
3384 8 + GFX9_SGPR_TCS_OUT_LAYOUT);
3385
3386 unsigned vgpr = 8 + GFX9_TCS_NUM_USER_SGPR;
3387 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3388 ac_to_float(&ctx->ac, ctx->abi.tcs_patch_id),
3389 vgpr++, "");
3390 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
3391 ac_to_float(&ctx->ac, ctx->abi.tcs_rel_ids),
3392 vgpr++, "");
3393 ctx->return_value = ret;
3394 }
3395
3396 /* Pass GS inputs from ES to GS on GFX9. */
3397 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
3398 {
3399 LLVMValueRef ret = ctx->return_value;
3400
3401 ret = si_insert_input_ptr(ctx, ret, 0, 0);
3402 ret = si_insert_input_ptr(ctx, ret, 1, 1);
3403 ret = si_insert_input_ret(ctx, ret, ctx->param_gs2vs_offset, 2);
3404 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_wave_info, 3);
3405 ret = si_insert_input_ret(ctx, ret, ctx->param_merged_scratch_offset, 5);
3406
3407 ret = si_insert_input_ptr(ctx, ret, ctx->param_rw_buffers,
3408 8 + SI_SGPR_RW_BUFFERS);
3409 ret = si_insert_input_ptr(ctx, ret,
3410 ctx->param_bindless_samplers_and_images,
3411 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
3412
3413 unsigned vgpr;
3414 if (ctx->type == PIPE_SHADER_VERTEX)
3415 vgpr = 8 + GFX9_VSGS_NUM_USER_SGPR;
3416 else
3417 vgpr = 8 + GFX9_TESGS_NUM_USER_SGPR;
3418
3419 for (unsigned i = 0; i < 5; i++) {
3420 unsigned param = ctx->param_gs_vtx01_offset + i;
3421 ret = si_insert_input_ret_float(ctx, ret, param, vgpr++);
3422 }
3423 ctx->return_value = ret;
3424 }
3425
3426 static void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi,
3427 unsigned max_outputs,
3428 LLVMValueRef *addrs)
3429 {
3430 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3431 struct si_shader *shader = ctx->shader;
3432 struct tgsi_shader_info *info = &shader->selector->info;
3433 unsigned i, chan;
3434 LLVMValueRef vertex_id = LLVMGetParam(ctx->main_fn,
3435 ctx->param_rel_auto_id);
3436 LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx);
3437 LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id,
3438 vertex_dw_stride, "");
3439
3440 /* Write outputs to LDS. The next shader (TCS aka HS) will read
3441 * its inputs from it. */
3442 for (i = 0; i < info->num_outputs; i++) {
3443 unsigned name = info->output_semantic_name[i];
3444 unsigned index = info->output_semantic_index[i];
3445
3446 /* The ARB_shader_viewport_layer_array spec contains the
3447 * following issue:
3448 *
3449 * 2) What happens if gl_ViewportIndex or gl_Layer is
3450 * written in the vertex shader and a geometry shader is
3451 * present?
3452 *
3453 * RESOLVED: The value written by the last vertex processing
3454 * stage is used. If the last vertex processing stage
3455 * (vertex, tessellation evaluation or geometry) does not
3456 * statically assign to gl_ViewportIndex or gl_Layer, index
3457 * or layer zero is assumed.
3458 *
3459 * So writes to those outputs in VS-as-LS are simply ignored.
3460 */
3461 if (name == TGSI_SEMANTIC_LAYER ||
3462 name == TGSI_SEMANTIC_VIEWPORT_INDEX)
3463 continue;
3464
3465 int param = si_shader_io_get_unique_index(name, index, false);
3466 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
3467 LLVMConstInt(ctx->i32, param * 4, 0), "");
3468
3469 for (chan = 0; chan < 4; chan++) {
3470 if (!(info->output_usagemask[i] & (1 << chan)))
3471 continue;
3472
3473 lds_store(ctx, chan, dw_addr,
3474 LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""));
3475 }
3476 }
3477
3478 if (ctx->screen->info.chip_class >= GFX9)
3479 si_set_ls_return_value_for_tcs(ctx);
3480 }
3481
3482 static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi,
3483 unsigned max_outputs,
3484 LLVMValueRef *addrs)
3485 {
3486 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3487 struct si_shader *es = ctx->shader;
3488 struct tgsi_shader_info *info = &es->selector->info;
3489 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
3490 ctx->param_es2gs_offset);
3491 LLVMValueRef lds_base = NULL;
3492 unsigned chan;
3493 int i;
3494
3495 if (ctx->screen->info.chip_class >= GFX9 && info->num_outputs) {
3496 unsigned itemsize_dw = es->selector->esgs_itemsize / 4;
3497 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
3498 LLVMValueRef wave_idx = si_unpack_param(ctx, ctx->param_merged_wave_info, 24, 4);
3499 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
3500 LLVMBuildMul(ctx->ac.builder, wave_idx,
3501 LLVMConstInt(ctx->i32, 64, false), ""), "");
3502 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
3503 LLVMConstInt(ctx->i32, itemsize_dw, 0), "");
3504 }
3505
3506 for (i = 0; i < info->num_outputs; i++) {
3507 int param;
3508
3509 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
3510 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
3511 continue;
3512
3513 param = si_shader_io_get_unique_index(info->output_semantic_name[i],
3514 info->output_semantic_index[i], false);
3515
3516 for (chan = 0; chan < 4; chan++) {
3517 if (!(info->output_usagemask[i] & (1 << chan)))
3518 continue;
3519
3520 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
3521 out_val = ac_to_integer(&ctx->ac, out_val);
3522
3523 /* GFX9 has the ESGS ring in LDS. */
3524 if (ctx->screen->info.chip_class >= GFX9) {
3525 lds_store(ctx, param * 4 + chan, lds_base, out_val);
3526 continue;
3527 }
3528
3529 ac_build_buffer_store_dword(&ctx->ac,
3530 ctx->esgs_ring,
3531 out_val, 1, NULL, soffset,
3532 (4 * param + chan) * 4,
3533 1, 1, true, true);
3534 }
3535 }
3536
3537 if (ctx->screen->info.chip_class >= GFX9)
3538 si_set_es_return_value_for_gs(ctx);
3539 }
3540
3541 static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx)
3542 {
3543 if (ctx->screen->info.chip_class >= GFX9)
3544 return si_unpack_param(ctx, ctx->param_merged_wave_info, 16, 8);
3545 else
3546 return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
3547 }
3548
3549 static void emit_gs_epilogue(struct si_shader_context *ctx)
3550 {
3551 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
3552 si_get_gs_wave_id(ctx));
3553
3554 if (ctx->screen->info.chip_class >= GFX9)
3555 lp_build_endif(&ctx->merged_wrap_if_state);
3556 }
3557
3558 static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi,
3559 unsigned max_outputs,
3560 LLVMValueRef *addrs)
3561 {
3562 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3563 struct tgsi_shader_info UNUSED *info = &ctx->shader->selector->info;
3564
3565 assert(info->num_outputs <= max_outputs);
3566
3567 emit_gs_epilogue(ctx);
3568 }
3569
/** GS epilogue entry point taking the TGSI build context. */
static void si_tgsi_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	emit_gs_epilogue(si_shader_context(bld_base));
}
3575
3576 static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi,
3577 unsigned max_outputs,
3578 LLVMValueRef *addrs)
3579 {
3580 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3581 struct tgsi_shader_info *info = &ctx->shader->selector->info;
3582 struct si_shader_output_values *outputs = NULL;
3583 int i,j;
3584
3585 assert(!ctx->shader->is_gs_copy_shader);
3586 assert(info->num_outputs <= max_outputs);
3587
3588 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
3589
3590 /* Vertex color clamping.
3591 *
3592 * This uses a state constant loaded in a user data SGPR and
3593 * an IF statement is added that clamps all colors if the constant
3594 * is true.
3595 */
3596 struct lp_build_if_state if_ctx;
3597 LLVMValueRef cond = NULL;
3598 LLVMValueRef addr, val;
3599
3600 for (i = 0; i < info->num_outputs; i++) {
3601 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
3602 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
3603 continue;
3604
3605 /* We've found a color. */
3606 if (!cond) {
3607 /* The state is in the first bit of the user SGPR. */
3608 cond = LLVMGetParam(ctx->main_fn,
3609 ctx->param_vs_state_bits);
3610 cond = LLVMBuildTrunc(ctx->ac.builder, cond,
3611 ctx->i1, "");
3612 lp_build_if(&if_ctx, &ctx->gallivm, cond);
3613 }
3614
3615 for (j = 0; j < 4; j++) {
3616 addr = addrs[4 * i + j];
3617 val = LLVMBuildLoad(ctx->ac.builder, addr, "");
3618 val = ac_build_clamp(&ctx->ac, val);
3619 LLVMBuildStore(ctx->ac.builder, val, addr);
3620 }
3621 }
3622
3623 if (cond)
3624 lp_build_endif(&if_ctx);
3625
3626 for (i = 0; i < info->num_outputs; i++) {
3627 outputs[i].semantic_name = info->output_semantic_name[i];
3628 outputs[i].semantic_index = info->output_semantic_index[i];
3629
3630 for (j = 0; j < 4; j++) {
3631 outputs[i].values[j] =
3632 LLVMBuildLoad(ctx->ac.builder,
3633 addrs[4 * i + j],
3634 "");
3635 outputs[i].vertex_stream[j] =
3636 (info->output_streams[i] >> (2 * j)) & 3;
3637 }
3638 }
3639
3640 if (ctx->shader->selector->so.num_outputs)
3641 si_llvm_emit_streamout(ctx, outputs, i, 0);
3642
3643 /* Export PrimitiveID. */
3644 if (ctx->shader->key.mono.u.vs_export_prim_id) {
3645 outputs[i].semantic_name = TGSI_SEMANTIC_PRIMID;
3646 outputs[i].semantic_index = 0;
3647 outputs[i].values[0] = ac_to_float(&ctx->ac, get_primitive_id(ctx, 0));
3648 for (j = 1; j < 4; j++)
3649 outputs[i].values[j] = LLVMConstReal(ctx->f32, 0);
3650
3651 memset(outputs[i].vertex_stream, 0,
3652 sizeof(outputs[i].vertex_stream));
3653 i++;
3654 }
3655
3656 si_llvm_export_vs(ctx, outputs, i);
3657 FREE(outputs);
3658 }
3659
3660 static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base)
3661 {
3662 struct si_shader_context *ctx = si_shader_context(bld_base);
3663
3664 ctx->abi.emit_outputs(&ctx->abi, RADEON_LLVM_MAX_OUTPUTS,
3665 &ctx->outputs[0][0]);
3666 }
3667
/**
 * Pixel shader exports collected by the si_export_mrt_* helpers; they are
 * emitted in one go by si_emit_ps_exports().
 */
struct si_ps_exports {
	/* Number of entries of args that have been filled in so far. */
	unsigned num;
	/* Collected export arguments, in emission order. */
	struct ac_export_args args[10];
};
3672
3673 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
3674 LLVMValueRef depth, LLVMValueRef stencil,
3675 LLVMValueRef samplemask, struct si_ps_exports *exp)
3676 {
3677 struct si_shader_context *ctx = si_shader_context(bld_base);
3678 struct ac_export_args args;
3679
3680 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
3681
3682 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3683 }
3684
3685 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3686 LLVMValueRef *color, unsigned index,
3687 unsigned samplemask_param,
3688 bool is_last, struct si_ps_exports *exp)
3689 {
3690 struct si_shader_context *ctx = si_shader_context(bld_base);
3691 int i;
3692
3693 /* Clamp color */
3694 if (ctx->shader->key.part.ps.epilog.clamp_color)
3695 for (i = 0; i < 4; i++)
3696 color[i] = ac_build_clamp(&ctx->ac, color[i]);
3697
3698 /* Alpha to one */
3699 if (ctx->shader->key.part.ps.epilog.alpha_to_one)
3700 color[3] = ctx->ac.f32_1;
3701
3702 /* Alpha test */
3703 if (index == 0 &&
3704 ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3705 si_alpha_test(bld_base, color[3]);
3706
3707 /* Line & polygon smoothing */
3708 if (ctx->shader->key.part.ps.epilog.poly_line_smoothing)
3709 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3710 samplemask_param);
3711
3712 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3713 if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) {
3714 struct ac_export_args args[8];
3715 int c, last = -1;
3716
3717 /* Get the export arguments, also find out what the last one is. */
3718 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3719 si_llvm_init_export_args(ctx, color,
3720 V_008DFC_SQ_EXP_MRT + c, &args[c]);
3721 if (args[c].enabled_channels)
3722 last = c;
3723 }
3724
3725 /* Emit all exports. */
3726 for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
3727 if (is_last && last == c) {
3728 args[c].valid_mask = 1; /* whether the EXEC mask is valid */
3729 args[c].done = 1; /* DONE bit */
3730 } else if (!args[c].enabled_channels)
3731 continue; /* unnecessary NULL export */
3732
3733 memcpy(&exp->args[exp->num++], &args[c], sizeof(args[c]));
3734 }
3735 } else {
3736 struct ac_export_args args;
3737
3738 /* Export */
3739 si_llvm_init_export_args(ctx, color, V_008DFC_SQ_EXP_MRT + index,
3740 &args);
3741 if (is_last) {
3742 args.valid_mask = 1; /* whether the EXEC mask is valid */
3743 args.done = 1; /* DONE bit */
3744 } else if (!args.enabled_channels)
3745 return; /* unnecessary NULL export */
3746
3747 memcpy(&exp->args[exp->num++], &args, sizeof(args));
3748 }
3749 }
3750
3751 static void si_emit_ps_exports(struct si_shader_context *ctx,
3752 struct si_ps_exports *exp)
3753 {
3754 for (unsigned i = 0; i < exp->num; i++)
3755 ac_build_export(&ctx->ac, &exp->args[i]);
3756 }
3757
3758 /**
3759 * Return PS outputs in this order:
3760 *
3761 * v[0:3] = color0.xyzw
3762 * v[4:7] = color1.xyzw
3763 * ...
3764 * vN+0 = Depth
3765 * vN+1 = Stencil
3766 * vN+2 = SampleMask
3767 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3768 *
3769 * The alpha-ref SGPR is returned via its original location.
3770 */
3771 static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi,
3772 unsigned max_outputs,
3773 LLVMValueRef *addrs)
3774 {
3775 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
3776 struct si_shader *shader = ctx->shader;
3777 struct tgsi_shader_info *info = &shader->selector->info;
3778 LLVMBuilderRef builder = ctx->ac.builder;
3779 unsigned i, j, first_vgpr, vgpr;
3780
3781 LLVMValueRef color[8][4] = {};
3782 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3783 LLVMValueRef ret;
3784
3785 if (ctx->postponed_kill)
3786 ac_build_kill_if_false(&ctx->ac, LLVMBuildLoad(builder, ctx->postponed_kill, ""));
3787
3788 /* Read the output values. */
3789 for (i = 0; i < info->num_outputs; i++) {
3790 unsigned semantic_name = info->output_semantic_name[i];
3791 unsigned semantic_index = info->output_semantic_index[i];
3792
3793 switch (semantic_name) {
3794 case TGSI_SEMANTIC_COLOR:
3795 assert(semantic_index < 8);
3796 for (j = 0; j < 4; j++) {
3797 LLVMValueRef ptr = addrs[4 * i + j];
3798 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3799 color[semantic_index][j] = result;
3800 }
3801 break;
3802 case TGSI_SEMANTIC_POSITION:
3803 depth = LLVMBuildLoad(builder,
3804 addrs[4 * i + 2], "");
3805 break;
3806 case TGSI_SEMANTIC_STENCIL:
3807 stencil = LLVMBuildLoad(builder,
3808 addrs[4 * i + 1], "");
3809 break;
3810 case TGSI_SEMANTIC_SAMPLEMASK:
3811 samplemask = LLVMBuildLoad(builder,
3812 addrs[4 * i + 0], "");
3813 break;
3814 default:
3815 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3816 semantic_name);
3817 }
3818 }
3819
3820 /* Fill the return structure. */
3821 ret = ctx->return_value;
3822
3823 /* Set SGPRs. */
3824 ret = LLVMBuildInsertValue(builder, ret,
3825 ac_to_integer(&ctx->ac,
3826 LLVMGetParam(ctx->main_fn,
3827 SI_PARAM_ALPHA_REF)),
3828 SI_SGPR_ALPHA_REF, "");
3829
3830 /* Set VGPRs */
3831 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3832 for (i = 0; i < ARRAY_SIZE(color); i++) {
3833 if (!color[i][0])
3834 continue;
3835
3836 for (j = 0; j < 4; j++)
3837 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3838 }
3839 if (depth)
3840 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3841 if (stencil)
3842 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3843 if (samplemask)
3844 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3845
3846 /* Add the input sample mask for smoothing at the end. */
3847 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3848 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3849 ret = LLVMBuildInsertValue(builder, ret,
3850 LLVMGetParam(ctx->main_fn,
3851 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3852
3853 ctx->return_value = ret;
3854 }
3855
3856 static void membar_emit(
3857 const struct lp_build_tgsi_action *action,
3858 struct lp_build_tgsi_context *bld_base,
3859 struct lp_build_emit_data *emit_data)
3860 {
3861 struct si_shader_context *ctx = si_shader_context(bld_base);
3862 LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0);
3863 unsigned flags = LLVMConstIntGetZExtValue(src0);
3864 unsigned waitcnt = NOOP_WAITCNT;
3865
3866 if (flags & TGSI_MEMBAR_THREAD_GROUP)
3867 waitcnt &= VM_CNT & LGKM_CNT;
3868
3869 if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER |
3870 TGSI_MEMBAR_SHADER_BUFFER |
3871 TGSI_MEMBAR_SHADER_IMAGE))
3872 waitcnt &= VM_CNT;
3873
3874 if (flags & TGSI_MEMBAR_SHARED)
3875 waitcnt &= LGKM_CNT;
3876
3877 if (waitcnt != NOOP_WAITCNT)
3878 ac_build_waitcnt(&ctx->ac, waitcnt);
3879 }
3880
3881 static void clock_emit(
3882 const struct lp_build_tgsi_action *action,
3883 struct lp_build_tgsi_context *bld_base,
3884 struct lp_build_emit_data *emit_data)
3885 {
3886 struct si_shader_context *ctx = si_shader_context(bld_base);
3887 LLVMValueRef tmp = ac_build_shader_clock(&ctx->ac);
3888
3889 emit_data->output[0] =
3890 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_0, "");
3891 emit_data->output[1] =
3892 LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->i32_1, "");
3893 }
3894
3895 static void si_llvm_emit_ddxy(
3896 const struct lp_build_tgsi_action *action,
3897 struct lp_build_tgsi_context *bld_base,
3898 struct lp_build_emit_data *emit_data)
3899 {
3900 struct si_shader_context *ctx = si_shader_context(bld_base);
3901 unsigned opcode = emit_data->info->opcode;
3902 LLVMValueRef val;
3903 int idx;
3904 unsigned mask;
3905
3906 if (opcode == TGSI_OPCODE_DDX_FINE)
3907 mask = AC_TID_MASK_LEFT;
3908 else if (opcode == TGSI_OPCODE_DDY_FINE)
3909 mask = AC_TID_MASK_TOP;
3910 else
3911 mask = AC_TID_MASK_TOP_LEFT;
3912
3913 /* for DDX we want to next X pixel, DDY next Y pixel. */
3914 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
3915
3916 val = ac_to_integer(&ctx->ac, emit_data->args[0]);
3917 val = ac_build_ddxy(&ctx->ac, mask, idx, val);
3918 emit_data->output[emit_data->chan] = val;
3919 }
3920
3921 /*
3922 * this takes an I,J coordinate pair,
3923 * and works out the X and Y derivatives.
3924 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3925 */
3926 static LLVMValueRef si_llvm_emit_ddxy_interp(
3927 struct lp_build_tgsi_context *bld_base,
3928 LLVMValueRef interp_ij)
3929 {
3930 struct si_shader_context *ctx = si_shader_context(bld_base);
3931 LLVMValueRef result[4], a;
3932 unsigned i;
3933
3934 for (i = 0; i < 2; i++) {
3935 a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij,
3936 LLVMConstInt(ctx->i32, i, 0), "");
3937 result[i] = ac_build_ddxy(&ctx->ac, AC_TID_MASK_TOP_LEFT, 1,
3938 ac_to_integer(&ctx->ac, a)); /* DDX */
3939 result[2+i] = ac_build_ddxy(&ctx->ac, AC_TID_MASK_TOP_LEFT, 2,
3940 ac_to_integer(&ctx->ac, a)); /* DDY */
3941 }
3942
3943 return ac_build_gather_values(&ctx->ac, result, 4);
3944 }
3945
3946 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
3947 struct lp_build_tgsi_context *bld_base,
3948 struct lp_build_emit_data *emit_data)
3949 {
3950 struct si_shader_context *ctx = si_shader_context(bld_base);
3951 struct si_shader *shader = ctx->shader;
3952 const struct tgsi_shader_info *info = &shader->selector->info;
3953 LLVMValueRef interp_param;
3954 const struct tgsi_full_instruction *inst = emit_data->inst;
3955 const struct tgsi_full_src_register *input = &inst->Src[0];
3956 int input_base, input_array_size;
3957 int chan;
3958 int i;
3959 LLVMValueRef prim_mask = ctx->abi.prim_mask;
3960 LLVMValueRef array_idx, offset_x = NULL, offset_y = NULL;
3961 int interp_param_idx;
3962 unsigned interp;
3963 unsigned location;
3964
3965 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
3966 /* offset is in second src, first two channels */
3967 offset_x = lp_build_emit_fetch(bld_base, emit_data->inst, 1,
3968 TGSI_CHAN_X);
3969 offset_y = lp_build_emit_fetch(bld_base, emit_data->inst, 1,
3970 TGSI_CHAN_Y);
3971 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
3972 LLVMValueRef sample_position;
3973 LLVMValueRef sample_id;
3974 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3975
3976 /* fetch sample ID, then fetch its sample position,
3977 * and place into first two channels.
3978 */
3979 sample_id = lp_build_emit_fetch(bld_base,
3980 emit_data->inst, 1, TGSI_CHAN_X);
3981 sample_id = ac_to_integer(&ctx->ac, sample_id);
3982
3983 /* Section 8.13.2 (Interpolation Functions) of the OpenGL Shading
3984 * Language 4.50 spec says about interpolateAtSample:
3985 *
3986 * "Returns the value of the input interpolant variable at
3987 * the location of sample number sample. If multisample
3988 * buffers are not available, the input variable will be
3989 * evaluated at the center of the pixel. If sample sample
3990 * does not exist, the position used to interpolate the
3991 * input variable is undefined."
3992 *
3993 * This means that sample_id values outside of the valid are
3994 * in fact valid input, and the usual mechanism for loading the
3995 * sample position doesn't work.
3996 */
3997 if (ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center) {
3998 LLVMValueRef center[4] = {
3999 LLVMConstReal(ctx->f32, 0.5),
4000 LLVMConstReal(ctx->f32, 0.5),
4001 ctx->ac.f32_0,
4002 ctx->ac.f32_0,
4003 };
4004
4005 sample_position = ac_build_gather_values(&ctx->ac, center, 4);
4006 } else {
4007 sample_position = load_sample_position(&ctx->abi, sample_id);
4008 }
4009
4010 offset_x = LLVMBuildExtractElement(ctx->ac.builder, sample_position,
4011 ctx->i32_0, "");
4012
4013 offset_x = LLVMBuildFSub(ctx->ac.builder, offset_x, halfval, "");
4014 offset_y = LLVMBuildExtractElement(ctx->ac.builder, sample_position,
4015 ctx->i32_1, "");
4016 offset_y = LLVMBuildFSub(ctx->ac.builder, offset_y, halfval, "");
4017 }
4018
4019 assert(input->Register.File == TGSI_FILE_INPUT);
4020
4021 if (input->Register.Indirect) {
4022 unsigned array_id = input->Indirect.ArrayID;
4023
4024 if (array_id) {
4025 input_base = info->input_array_first[array_id];
4026 input_array_size = info->input_array_last[array_id] - input_base + 1;
4027 } else {
4028 input_base = inst->Src[0].Register.Index;
4029 input_array_size = info->num_inputs - input_base;
4030 }
4031
4032 array_idx = si_get_indirect_index(ctx, &input->Indirect,
4033 1, input->Register.Index - input_base);
4034 } else {
4035 input_base = inst->Src[0].Register.Index;
4036 input_array_size = 1;
4037 array_idx = ctx->i32_0;
4038 }
4039
4040 interp = shader->selector->info.input_interpolate[input_base];
4041
4042 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4043 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
4044 location = TGSI_INTERPOLATE_LOC_CENTER;
4045 else
4046 location = TGSI_INTERPOLATE_LOC_CENTROID;
4047
4048 interp_param_idx = lookup_interp_param_index(interp, location);
4049 if (interp_param_idx == -1)
4050 return;
4051 else if (interp_param_idx)
4052 interp_param = LLVMGetParam(ctx->main_fn, interp_param_idx);
4053 else
4054 interp_param = NULL;
4055
4056 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4057 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4058 LLVMValueRef ij_out[2];
4059 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
4060
4061 /*
4062 * take the I then J parameters, and the DDX/Y for it, and
4063 * calculate the IJ inputs for the interpolator.
4064 * temp1 = ddx * offset/sample.x + I;
4065 * interp_param.I = ddy * offset/sample.y + temp1;
4066 * temp1 = ddx * offset/sample.x + J;
4067 * interp_param.J = ddy * offset/sample.y + temp1;
4068 */
4069 for (i = 0; i < 2; i++) {
4070 LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, 0);
4071 LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, 0);
4072 LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder,
4073 ddxy_out, ix_ll, "");
4074 LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder,
4075 ddxy_out, iy_ll, "");
4076 LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
4077 interp_param, ix_ll, "");
4078 LLVMValueRef temp;
4079
4080 interp_el = ac_to_float(&ctx->ac, interp_el);
4081
4082 temp = ac_build_fmad(&ctx->ac, ddx_el, offset_x, interp_el);
4083 ij_out[i] = ac_build_fmad(&ctx->ac, ddy_el, offset_y, temp);
4084 }
4085 interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
4086 }
4087
4088 if (interp_param)
4089 interp_param = ac_to_float(&ctx->ac, interp_param);
4090
4091 for (chan = 0; chan < 4; chan++) {
4092 LLVMValueRef gather = LLVMGetUndef(LLVMVectorType(ctx->f32, input_array_size));
4093 unsigned schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
4094
4095 for (unsigned idx = 0; idx < input_array_size; ++idx) {
4096 LLVMValueRef v, i = NULL, j = NULL;
4097
4098 if (interp_param) {
4099 i = LLVMBuildExtractElement(
4100 ctx->ac.builder, interp_param, ctx->i32_0, "");
4101 j = LLVMBuildExtractElement(
4102 ctx->ac.builder, interp_param, ctx->i32_1, "");
4103 }
4104 v = si_build_fs_interp(ctx, input_base + idx, schan,
4105 prim_mask, i, j);
4106
4107 gather = LLVMBuildInsertElement(ctx->ac.builder,
4108 gather, v, LLVMConstInt(ctx->i32, idx, false), "");
4109 }
4110
4111 emit_data->output[chan] = LLVMBuildExtractElement(
4112 ctx->ac.builder, gather, array_idx, "");
4113 }
4114 }
4115
4116 static void vote_all_emit(
4117 const struct lp_build_tgsi_action *action,
4118 struct lp_build_tgsi_context *bld_base,
4119 struct lp_build_emit_data *emit_data)
4120 {
4121 struct si_shader_context *ctx = si_shader_context(bld_base);
4122
4123 LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, emit_data->args[0]);
4124 emit_data->output[emit_data->chan] =
4125 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4126 }
4127
4128 static void vote_any_emit(
4129 const struct lp_build_tgsi_action *action,
4130 struct lp_build_tgsi_context *bld_base,
4131 struct lp_build_emit_data *emit_data)
4132 {
4133 struct si_shader_context *ctx = si_shader_context(bld_base);
4134
4135 LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, emit_data->args[0]);
4136 emit_data->output[emit_data->chan] =
4137 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4138 }
4139
4140 static void vote_eq_emit(
4141 const struct lp_build_tgsi_action *action,
4142 struct lp_build_tgsi_context *bld_base,
4143 struct lp_build_emit_data *emit_data)
4144 {
4145 struct si_shader_context *ctx = si_shader_context(bld_base);
4146
4147 LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, emit_data->args[0]);
4148 emit_data->output[emit_data->chan] =
4149 LLVMBuildSExt(ctx->ac.builder, tmp, ctx->i32, "");
4150 }
4151
4152 static void ballot_emit(
4153 const struct lp_build_tgsi_action *action,
4154 struct lp_build_tgsi_context *bld_base,
4155 struct lp_build_emit_data *emit_data)
4156 {
4157 struct si_shader_context *ctx = si_shader_context(bld_base);
4158 LLVMBuilderRef builder = ctx->ac.builder;
4159 LLVMValueRef tmp;
4160
4161 tmp = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4162 tmp = ac_build_ballot(&ctx->ac, tmp);
4163 tmp = LLVMBuildBitCast(builder, tmp, ctx->v2i32, "");
4164
4165 emit_data->output[0] = LLVMBuildExtractElement(builder, tmp, ctx->i32_0, "");
4166 emit_data->output[1] = LLVMBuildExtractElement(builder, tmp, ctx->i32_1, "");
4167 }
4168
4169 static void read_lane_emit(
4170 const struct lp_build_tgsi_action *action,
4171 struct lp_build_tgsi_context *bld_base,
4172 struct lp_build_emit_data *emit_data)
4173 {
4174 struct si_shader_context *ctx = si_shader_context(bld_base);
4175
4176 if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_READ_INVOC) {
4177 emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
4178 0, emit_data->src_chan);
4179
4180 /* Always read the source invocation (= lane) from the X channel. */
4181 emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
4182 1, TGSI_CHAN_X);
4183 emit_data->arg_count = 2;
4184 }
4185
4186 /* We currently have no other way to prevent LLVM from lifting the icmp
4187 * calls to a dominating basic block.
4188 */
4189 ac_build_optimization_barrier(&ctx->ac, &emit_data->args[0]);
4190
4191 for (unsigned i = 0; i < emit_data->arg_count; ++i)
4192 emit_data->args[i] = ac_to_integer(&ctx->ac, emit_data->args[i]);
4193
4194 emit_data->output[emit_data->chan] =
4195 ac_build_intrinsic(&ctx->ac, action->intr_name,
4196 ctx->i32, emit_data->args, emit_data->arg_count,
4197 AC_FUNC_ATTR_READNONE |
4198 AC_FUNC_ATTR_CONVERGENT);
4199 }
4200
4201 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4202 struct lp_build_emit_data *emit_data)
4203 {
4204 struct si_shader_context *ctx = si_shader_context(bld_base);
4205 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4206 LLVMValueRef imm;
4207 unsigned stream;
4208
4209 assert(src0.File == TGSI_FILE_IMMEDIATE);
4210
4211 imm = ctx->imms[src0.Index * TGSI_NUM_CHANNELS + src0.SwizzleX];
4212 stream = LLVMConstIntGetZExtValue(imm) & 0x3;
4213 return stream;
4214 }
4215
4216 /* Emit one vertex from the geometry shader */
4217 static void si_llvm_emit_vertex(struct ac_shader_abi *abi,
4218 unsigned stream,
4219 LLVMValueRef *addrs)
4220 {
4221 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4222 struct tgsi_shader_info *info = &ctx->shader->selector->info;
4223 struct si_shader *shader = ctx->shader;
4224 struct lp_build_if_state if_state;
4225 LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
4226 ctx->param_gs2vs_offset);
4227 LLVMValueRef gs_next_vertex;
4228 LLVMValueRef can_emit;
4229 unsigned chan, offset;
4230 int i;
4231
4232 /* Write vertex attribute values to GSVS ring */
4233 gs_next_vertex = LLVMBuildLoad(ctx->ac.builder,
4234 ctx->gs_next_vertex[stream],
4235 "");
4236
4237 /* If this thread has already emitted the declared maximum number of
4238 * vertices, skip the write: excessive vertex emissions are not
4239 * supposed to have any effect.
4240 *
4241 * If the shader has no writes to memory, kill it instead. This skips
4242 * further memory loads and may allow LLVM to skip to the end
4243 * altogether.
4244 */
4245 can_emit = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
4246 LLVMConstInt(ctx->i32,
4247 shader->selector->gs_max_out_vertices, 0), "");
4248
4249 bool use_kill = !info->writes_memory;
4250 if (use_kill) {
4251 ac_build_kill_if_false(&ctx->ac, can_emit);
4252 } else {
4253 lp_build_if(&if_state, &ctx->gallivm, can_emit);
4254 }
4255
4256 offset = 0;
4257 for (i = 0; i < info->num_outputs; i++) {
4258 for (chan = 0; chan < 4; chan++) {
4259 if (!(info->output_usagemask[i] & (1 << chan)) ||
4260 ((info->output_streams[i] >> (2 * chan)) & 3) != stream)
4261 continue;
4262
4263 LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
4264 LLVMValueRef voffset =
4265 LLVMConstInt(ctx->i32, offset *
4266 shader->selector->gs_max_out_vertices, 0);
4267 offset++;
4268
4269 voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
4270 voffset = LLVMBuildMul(ctx->ac.builder, voffset,
4271 LLVMConstInt(ctx->i32, 4, 0), "");
4272
4273 out_val = ac_to_integer(&ctx->ac, out_val);
4274
4275 ac_build_buffer_store_dword(&ctx->ac,
4276 ctx->gsvs_ring[stream],
4277 out_val, 1,
4278 voffset, soffset, 0,
4279 1, 1, true, true);
4280 }
4281 }
4282
4283 gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
4284 LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4285
4286 /* Signal vertex emission if vertex data was written. */
4287 if (offset) {
4288 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
4289 si_get_gs_wave_id(ctx));
4290 }
4291
4292 if (!use_kill)
4293 lp_build_endif(&if_state);
4294 }
4295
4296 /* Emit one vertex from the geometry shader */
4297 static void si_tgsi_emit_vertex(
4298 const struct lp_build_tgsi_action *action,
4299 struct lp_build_tgsi_context *bld_base,
4300 struct lp_build_emit_data *emit_data)
4301 {
4302 struct si_shader_context *ctx = si_shader_context(bld_base);
4303 unsigned stream = si_llvm_get_stream(bld_base, emit_data);
4304
4305 si_llvm_emit_vertex(&ctx->abi, stream, ctx->outputs[0]);
4306 }
4307
4308 /* Cut one primitive from the geometry shader */
4309 static void si_llvm_emit_primitive(struct ac_shader_abi *abi,
4310 unsigned stream)
4311 {
4312 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
4313
4314 /* Signal primitive cut */
4315 ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8),
4316 si_get_gs_wave_id(ctx));
4317 }
4318
4319 /* Cut one primitive from the geometry shader */
4320 static void si_tgsi_emit_primitive(
4321 const struct lp_build_tgsi_action *action,
4322 struct lp_build_tgsi_context *bld_base,
4323 struct lp_build_emit_data *emit_data)
4324 {
4325 struct si_shader_context *ctx = si_shader_context(bld_base);
4326
4327 si_llvm_emit_primitive(&ctx->abi, si_llvm_get_stream(bld_base, emit_data));
4328 }
4329
4330 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4331 struct lp_build_tgsi_context *bld_base,
4332 struct lp_build_emit_data *emit_data)
4333 {
4334 struct si_shader_context *ctx = si_shader_context(bld_base);
4335
4336 /* SI only (thanks to a hw bug workaround):
4337 * The real barrier instruction isn’t needed, because an entire patch
4338 * always fits into a single wave.
4339 */
4340 if (ctx->screen->info.chip_class == SI &&
4341 ctx->type == PIPE_SHADER_TESS_CTRL) {
4342 ac_build_waitcnt(&ctx->ac, LGKM_CNT & VM_CNT);
4343 return;
4344 }
4345
4346 ac_build_s_barrier(&ctx->ac);
4347 }
4348
4349 static void si_create_function(struct si_shader_context *ctx,
4350 const char *name,
4351 LLVMTypeRef *returns, unsigned num_returns,
4352 struct si_function_info *fninfo,
4353 unsigned max_workgroup_size)
4354 {
4355 int i;
4356
4357 si_llvm_create_func(ctx, name, returns, num_returns,
4358 fninfo->types, fninfo->num_params);
4359 ctx->return_value = LLVMGetUndef(ctx->return_type);
4360
4361 for (i = 0; i < fninfo->num_sgpr_params; ++i) {
4362 LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
4363
4364 /* The combination of:
4365 * - noalias
4366 * - dereferenceable
4367 * - invariant.load
4368 * allows the optimization passes to move loads and reduces
4369 * SGPR spilling significantly.
4370 */
4371 ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
4372 AC_FUNC_ATTR_INREG);
4373
4374 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4375 ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
4376 AC_FUNC_ATTR_NOALIAS);
4377 ac_add_attr_dereferenceable(P, UINT64_MAX);
4378 }
4379 }
4380
4381 for (i = 0; i < fninfo->num_params; ++i) {
4382 if (fninfo->assign[i])
4383 *fninfo->assign[i] = LLVMGetParam(ctx->main_fn, i);
4384 }
4385
4386 if (ctx->screen->info.address32_hi) {
4387 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4388 "amdgpu-32bit-address-high-bits",
4389 ctx->screen->info.address32_hi);
4390 }
4391
4392 if (max_workgroup_size) {
4393 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4394 "amdgpu-max-work-group-size",
4395 max_workgroup_size);
4396 }
4397 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4398 "no-signed-zeros-fp-math",
4399 "true");
4400
4401 if (ctx->screen->debug_flags & DBG(UNSAFE_MATH)) {
4402 /* These were copied from some LLVM test. */
4403 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4404 "less-precise-fpmad",
4405 "true");
4406 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4407 "no-infs-fp-math",
4408 "true");
4409 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4410 "no-nans-fp-math",
4411 "true");
4412 LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
4413 "unsafe-fp-math",
4414 "true");
4415 }
4416 }
4417
4418 static void declare_streamout_params(struct si_shader_context *ctx,
4419 struct pipe_stream_output_info *so,
4420 struct si_function_info *fninfo)
4421 {
4422 int i;
4423
4424 /* Streamout SGPRs. */
4425 if (so->num_outputs) {
4426 if (ctx->type != PIPE_SHADER_TESS_EVAL)
4427 ctx->param_streamout_config = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4428 else
4429 ctx->param_streamout_config = fninfo->num_params - 1;
4430
4431 ctx->param_streamout_write_index = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4432 }
4433 /* A streamout buffer offset is loaded if the stride is non-zero. */
4434 for (i = 0; i < 4; i++) {
4435 if (!so->stride[i])
4436 continue;
4437
4438 ctx->param_streamout_offset[i] = add_arg(fninfo, ARG_SGPR, ctx->ac.i32);
4439 }
4440 }
4441
4442 static unsigned si_get_max_workgroup_size(const struct si_shader *shader)
4443 {
4444 switch (shader->selector->type) {
4445 case PIPE_SHADER_TESS_CTRL:
4446 /* Return this so that LLVM doesn't remove s_barrier
4447 * instructions on chips where we use s_barrier. */
4448 return shader->selector->screen->info.chip_class >= CIK ? 128 : 64;
4449
4450 case PIPE_SHADER_GEOMETRY:
4451 return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 64;
4452
4453 case PIPE_SHADER_COMPUTE:
4454 break; /* see below */
4455
4456 default:
4457 return 0;
4458 }
4459
4460 const unsigned *properties = shader->selector->info.properties;
4461 unsigned max_work_group_size =
4462 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
4463 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
4464 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
4465
4466 if (!max_work_group_size) {
4467 /* This is a variable group size compute shader,
4468 * compile it for the maximum possible group size.
4469 */
4470 max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
4471 }
4472 return max_work_group_size;
4473 }
4474
4475 static void declare_const_and_shader_buffers(struct si_shader_context *ctx,
4476 struct si_function_info *fninfo,
4477 bool assign_params)
4478 {
4479 LLVMTypeRef const_shader_buf_type;
4480
4481 if (ctx->shader->selector->info.const_buffers_declared == 1 &&
4482 ctx->shader->selector->info.shader_buffers_declared == 0)
4483 const_shader_buf_type = ctx->f32;
4484 else
4485 const_shader_buf_type = ctx->v4i32;
4486
4487 unsigned const_and_shader_buffers =
4488 add_arg(fninfo, ARG_SGPR,
4489 ac_array_in_const32_addr_space(const_shader_buf_type));
4490
4491 if (assign_params)
4492 ctx->param_const_and_shader_buffers = const_and_shader_buffers;
4493 }
4494
4495 static void declare_samplers_and_images(struct si_shader_context *ctx,
4496 struct si_function_info *fninfo,
4497 bool assign_params)
4498 {
4499 unsigned samplers_and_images =
4500 add_arg(fninfo, ARG_SGPR,
4501 ac_array_in_const32_addr_space(ctx->v8i32));
4502
4503 if (assign_params)
4504 ctx->param_samplers_and_images = samplers_and_images;
4505 }
4506
/*
 * Declare the per-stage descriptor pointers: first the constant/shader
 * buffer list, then the sampler/image list. assign_params is passed on
 * to both declarations.
 */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx,
					    struct si_function_info *fninfo,
					    bool assign_params)
{
	/* The declaration order defines the SGPR order. */
	declare_const_and_shader_buffers(ctx, fninfo, assign_params);

	declare_samplers_and_images(ctx, fninfo, assign_params);
}
4514
4515 static void declare_global_desc_pointers(struct si_shader_context *ctx,
4516 struct si_function_info *fninfo)
4517 {
4518 ctx->param_rw_buffers = add_arg(fninfo, ARG_SGPR,
4519 ac_array_in_const32_addr_space(ctx->v4i32));
4520 ctx->param_bindless_samplers_and_images = add_arg(fninfo, ARG_SGPR,
4521 ac_array_in_const32_addr_space(ctx->v8i32));
4522 }
4523
4524 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
4525 struct si_function_info *fninfo)
4526 {
4527 ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
4528 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
4529 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
4530 add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
4531 }
4532
4533 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
4534 struct si_function_info *fninfo,
4535 unsigned *num_prolog_vgprs)
4536 {
4537 struct si_shader *shader = ctx->shader;
4538
4539 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.vertex_id);
4540 if (shader->key.as_ls) {
4541 ctx->param_rel_auto_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4542 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4543 } else {
4544 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.instance_id);
4545 ctx->param_vs_prim_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4546 }
4547 add_arg(fninfo, ARG_VGPR, ctx->i32); /* unused */
4548
4549 if (!shader->is_gs_copy_shader) {
4550 /* Vertex load indices. */
4551 ctx->param_vertex_index0 = fninfo->num_params;
4552 for (unsigned i = 0; i < shader->selector->info.num_inputs; i++)
4553 add_arg(fninfo, ARG_VGPR, ctx->i32);
4554 *num_prolog_vgprs += shader->selector->info.num_inputs;
4555 }
4556 }
4557
4558 static void declare_vs_blit_inputs(struct si_shader_context *ctx,
4559 struct si_function_info *fninfo,
4560 unsigned vs_blit_property)
4561 {
4562 ctx->param_vs_blit_inputs = fninfo->num_params;
4563 add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x1, y1 */
4564 add_arg(fninfo, ARG_SGPR, ctx->i32); /* i16 x2, y2 */
4565 add_arg(fninfo, ARG_SGPR, ctx->f32); /* depth */
4566
4567 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
4568 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color0 */
4569 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color1 */
4570 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color2 */
4571 add_arg(fninfo, ARG_SGPR, ctx->f32); /* color3 */
4572 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
4573 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x1 */
4574 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y1 */
4575 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.x2 */
4576 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.y2 */
4577 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.z */
4578 add_arg(fninfo, ARG_SGPR, ctx->f32); /* texcoord.w */
4579 }
4580 }
4581
4582 static void declare_tes_input_vgprs(struct si_shader_context *ctx,
4583 struct si_function_info *fninfo)
4584 {
4585 ctx->param_tes_u = add_arg(fninfo, ARG_VGPR, ctx->f32);
4586 ctx->param_tes_v = add_arg(fninfo, ARG_VGPR, ctx->f32);
4587 ctx->param_tes_rel_patch_id = add_arg(fninfo, ARG_VGPR, ctx->i32);
4588 add_arg_assign(fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tes_patch_id);
4589 }
4590
4591 enum {
4592 /* Convenient merged shader definitions. */
4593 SI_SHADER_MERGED_VERTEX_TESSCTRL = PIPE_SHADER_TYPES,
4594 SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
4595 };
4596
4597 static void create_function(struct si_shader_context *ctx)
4598 {
4599 struct si_shader *shader = ctx->shader;
4600 struct si_function_info fninfo;
4601 LLVMTypeRef returns[16+32*4];
4602 unsigned i, num_return_sgprs;
4603 unsigned num_returns = 0;
4604 unsigned num_prolog_vgprs = 0;
4605 unsigned type = ctx->type;
4606 unsigned vs_blit_property =
4607 shader->selector->info.properties[TGSI_PROPERTY_VS_BLIT_SGPRS];
4608
4609 si_init_function_info(&fninfo);
4610
4611 /* Set MERGED shaders. */
4612 if (ctx->screen->info.chip_class >= GFX9) {
4613 if (shader->key.as_ls || type == PIPE_SHADER_TESS_CTRL)
4614 type = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
4615 else if (shader->key.as_es || type == PIPE_SHADER_GEOMETRY)
4616 type = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
4617 }
4618
4619 LLVMTypeRef v3i32 = LLVMVectorType(ctx->i32, 3);
4620
4621 switch (type) {
4622 case PIPE_SHADER_VERTEX:
4623 declare_global_desc_pointers(ctx, &fninfo);
4624
4625 if (vs_blit_property) {
4626 declare_vs_blit_inputs(ctx, &fninfo, vs_blit_property);
4627
4628 /* VGPRs */
4629 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4630 break;
4631 }
4632
4633 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4634 declare_vs_specific_input_sgprs(ctx, &fninfo);
4635 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4636 ac_array_in_const32_addr_space(ctx->v4i32));
4637
4638 if (shader->key.as_es) {
4639 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4640 } else if (shader->key.as_ls) {
4641 /* no extra parameters */
4642 } else {
4643 if (shader->is_gs_copy_shader) {
4644 fninfo.num_params = ctx->param_vs_state_bits + 1;
4645 fninfo.num_sgpr_params = fninfo.num_params;
4646 }
4647
4648 /* The locations of the other parameters are assigned dynamically. */
4649 declare_streamout_params(ctx, &shader->selector->so,
4650 &fninfo);
4651 }
4652
4653 /* VGPRs */
4654 declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs);
4655 break;
4656
4657 case PIPE_SHADER_TESS_CTRL: /* SI-CI-VI */
4658 declare_global_desc_pointers(ctx, &fninfo);
4659 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4660 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4661 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4662 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4663 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4664 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4665 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4666
4667 /* VGPRs */
4668 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4669 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4670
4671 /* param_tcs_offchip_offset and param_tcs_factor_offset are
4672 * placed after the user SGPRs.
4673 */
4674 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4675 returns[num_returns++] = ctx->i32; /* SGPRs */
4676 for (i = 0; i < 11; i++)
4677 returns[num_returns++] = ctx->f32; /* VGPRs */
4678 break;
4679
4680 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4681 /* Merged stages have 8 system SGPRs at the beginning. */
4682 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
4683 declare_per_stage_desc_pointers(ctx, &fninfo,
4684 ctx->type == PIPE_SHADER_TESS_CTRL);
4685 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4686 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4687 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4688 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4689 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4690 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
4691
4692 declare_global_desc_pointers(ctx, &fninfo);
4693 declare_per_stage_desc_pointers(ctx, &fninfo,
4694 ctx->type == PIPE_SHADER_VERTEX);
4695 declare_vs_specific_input_sgprs(ctx, &fninfo);
4696
4697 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4698 ctx->param_tcs_out_lds_offsets = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4699 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4700 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4701 ac_array_in_const32_addr_space(ctx->v4i32));
4702
4703 /* VGPRs (first TCS, then VS) */
4704 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_patch_id);
4705 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.tcs_rel_ids);
4706
4707 if (ctx->type == PIPE_SHADER_VERTEX) {
4708 declare_vs_input_vgprs(ctx, &fninfo,
4709 &num_prolog_vgprs);
4710
4711 /* LS return values are inputs to the TCS main shader part. */
4712 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4713 returns[num_returns++] = ctx->i32; /* SGPRs */
4714 for (i = 0; i < 2; i++)
4715 returns[num_returns++] = ctx->f32; /* VGPRs */
4716 } else {
4717 /* TCS return values are inputs to the TCS epilog.
4718 *
4719 * param_tcs_offchip_offset, param_tcs_factor_offset,
4720 * param_tcs_offchip_layout, and param_rw_buffers
4721 * should be passed to the epilog.
4722 */
4723 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
4724 returns[num_returns++] = ctx->i32; /* SGPRs */
4725 for (i = 0; i < 11; i++)
4726 returns[num_returns++] = ctx->f32; /* VGPRs */
4727 }
4728 break;
4729
4730 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
4731 /* Merged stages have 8 system SGPRs at the beginning. */
4732 /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
4733 declare_per_stage_desc_pointers(ctx, &fninfo,
4734 ctx->type == PIPE_SHADER_GEOMETRY);
4735 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4736 ctx->param_merged_wave_info = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4737 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4738 ctx->param_merged_scratch_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4739 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS << 8) */
4740 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
4741
4742 declare_global_desc_pointers(ctx, &fninfo);
4743 declare_per_stage_desc_pointers(ctx, &fninfo,
4744 (ctx->type == PIPE_SHADER_VERTEX ||
4745 ctx->type == PIPE_SHADER_TESS_EVAL));
4746 if (ctx->type == PIPE_SHADER_VERTEX) {
4747 declare_vs_specific_input_sgprs(ctx, &fninfo);
4748 } else {
4749 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4750 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4751 ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4752 /* Declare as many input SGPRs as the VS has. */
4753 }
4754
4755 if (ctx->type == PIPE_SHADER_VERTEX) {
4756 ctx->param_vertex_buffers = add_arg(&fninfo, ARG_SGPR,
4757 ac_array_in_const32_addr_space(ctx->v4i32));
4758 }
4759
4760 /* VGPRs (first GS, then VS/TES) */
4761 ctx->param_gs_vtx01_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4762 ctx->param_gs_vtx23_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4763 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4764 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4765 ctx->param_gs_vtx45_offset = add_arg(&fninfo, ARG_VGPR, ctx->i32);
4766
4767 if (ctx->type == PIPE_SHADER_VERTEX) {
4768 declare_vs_input_vgprs(ctx, &fninfo,
4769 &num_prolog_vgprs);
4770 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
4771 declare_tes_input_vgprs(ctx, &fninfo);
4772 }
4773
4774 if (ctx->type == PIPE_SHADER_VERTEX ||
4775 ctx->type == PIPE_SHADER_TESS_EVAL) {
4776 unsigned num_user_sgprs;
4777
4778 if (ctx->type == PIPE_SHADER_VERTEX)
4779 num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR;
4780 else
4781 num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
4782
4783 /* ES return values are inputs to GS. */
4784 for (i = 0; i < 8 + num_user_sgprs; i++)
4785 returns[num_returns++] = ctx->i32; /* SGPRs */
4786 for (i = 0; i < 5; i++)
4787 returns[num_returns++] = ctx->f32; /* VGPRs */
4788 }
4789 break;
4790
4791 case PIPE_SHADER_TESS_EVAL:
4792 declare_global_desc_pointers(ctx, &fninfo);
4793 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4794 ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4795 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4796 ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4797
4798 if (shader->key.as_es) {
4799 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4800 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4801 ctx->param_es2gs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4802 } else {
4803 add_arg(&fninfo, ARG_SGPR, ctx->i32);
4804 declare_streamout_params(ctx, &shader->selector->so,
4805 &fninfo);
4806 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4807 }
4808
4809 /* VGPRs */
4810 declare_tes_input_vgprs(ctx, &fninfo);
4811 break;
4812
4813 case PIPE_SHADER_GEOMETRY:
4814 declare_global_desc_pointers(ctx, &fninfo);
4815 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4816 ctx->param_gs2vs_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4817 ctx->param_gs_wave_id = add_arg(&fninfo, ARG_SGPR, ctx->i32);
4818
4819 /* VGPRs */
4820 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[0]);
4821 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[1]);
4822 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_prim_id);
4823 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[2]);
4824 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[3]);
4825 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[4]);
4826 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->gs_vtx_offset[5]);
4827 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &ctx->abi.gs_invocation_id);
4828 break;
4829
4830 case PIPE_SHADER_FRAGMENT:
4831 declare_global_desc_pointers(ctx, &fninfo);
4832 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4833 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
4834 add_arg_assign_checked(&fninfo, ARG_SGPR, ctx->i32,
4835 &ctx->abi.prim_mask, SI_PARAM_PRIM_MASK);
4836
4837 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_SAMPLE);
4838 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTER);
4839 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_PERSP_CENTROID);
4840 add_arg_checked(&fninfo, ARG_VGPR, v3i32, SI_PARAM_PERSP_PULL_MODEL);
4841 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_SAMPLE);
4842 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTER);
4843 add_arg_checked(&fninfo, ARG_VGPR, ctx->v2i32, SI_PARAM_LINEAR_CENTROID);
4844 add_arg_checked(&fninfo, ARG_VGPR, ctx->f32, SI_PARAM_LINE_STIPPLE_TEX);
4845 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4846 &ctx->abi.frag_pos[0], SI_PARAM_POS_X_FLOAT);
4847 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4848 &ctx->abi.frag_pos[1], SI_PARAM_POS_Y_FLOAT);
4849 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4850 &ctx->abi.frag_pos[2], SI_PARAM_POS_Z_FLOAT);
4851 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4852 &ctx->abi.frag_pos[3], SI_PARAM_POS_W_FLOAT);
4853 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4854 &ctx->abi.front_face, SI_PARAM_FRONT_FACE);
4855 shader->info.face_vgpr_index = 20;
4856 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->i32,
4857 &ctx->abi.ancillary, SI_PARAM_ANCILLARY);
4858 shader->info.ancillary_vgpr_index = 21;
4859 add_arg_assign_checked(&fninfo, ARG_VGPR, ctx->f32,
4860 &ctx->abi.sample_coverage, SI_PARAM_SAMPLE_COVERAGE);
4861 add_arg_checked(&fninfo, ARG_VGPR, ctx->i32, SI_PARAM_POS_FIXED_PT);
4862
4863 /* Color inputs from the prolog. */
4864 if (shader->selector->info.colors_read) {
4865 unsigned num_color_elements =
4866 util_bitcount(shader->selector->info.colors_read);
4867
4868 assert(fninfo.num_params + num_color_elements <= ARRAY_SIZE(fninfo.types));
4869 for (i = 0; i < num_color_elements; i++)
4870 add_arg(&fninfo, ARG_VGPR, ctx->f32);
4871
4872 num_prolog_vgprs += num_color_elements;
4873 }
4874
4875 /* Outputs for the epilog. */
4876 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
4877 num_returns =
4878 num_return_sgprs +
4879 util_bitcount(shader->selector->info.colors_written) * 4 +
4880 shader->selector->info.writes_z +
4881 shader->selector->info.writes_stencil +
4882 shader->selector->info.writes_samplemask +
4883 1 /* SampleMaskIn */;
4884
4885 num_returns = MAX2(num_returns,
4886 num_return_sgprs +
4887 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
4888
4889 for (i = 0; i < num_return_sgprs; i++)
4890 returns[i] = ctx->i32;
4891 for (; i < num_returns; i++)
4892 returns[i] = ctx->f32;
4893 break;
4894
4895 case PIPE_SHADER_COMPUTE:
4896 declare_global_desc_pointers(ctx, &fninfo);
4897 declare_per_stage_desc_pointers(ctx, &fninfo, true);
4898 if (shader->selector->info.uses_grid_size)
4899 add_arg_assign(&fninfo, ARG_SGPR, v3i32, &ctx->abi.num_work_groups);
4900 if (shader->selector->info.uses_block_size &&
4901 shader->selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
4902 ctx->param_block_size = add_arg(&fninfo, ARG_SGPR, v3i32);
4903
4904 unsigned cs_user_data_dwords =
4905 shader->selector->info.properties[TGSI_PROPERTY_CS_USER_DATA_DWORDS];
4906 if (cs_user_data_dwords) {
4907 ctx->param_cs_user_data = add_arg(&fninfo, ARG_SGPR,
4908 LLVMVectorType(ctx->i32, cs_user_data_dwords));
4909 }
4910
4911 for (i = 0; i < 3; i++) {
4912 ctx->abi.workgroup_ids[i] = NULL;
4913 if (shader->selector->info.uses_block_id[i])
4914 add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ctx->abi.workgroup_ids[i]);
4915 }
4916
4917 add_arg_assign(&fninfo, ARG_VGPR, v3i32, &ctx->abi.local_invocation_ids);
4918 break;
4919 default:
4920 assert(0 && "unimplemented shader");
4921 return;
4922 }
4923
4924 si_create_function(ctx, "main", returns, num_returns, &fninfo,
4925 si_get_max_workgroup_size(shader));
4926
4927 /* Reserve register locations for VGPR inputs the PS prolog may need. */
4928 if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
4929 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
4930 "InitialPSInputAddr",
4931 S_0286D0_PERSP_SAMPLE_ENA(1) |
4932 S_0286D0_PERSP_CENTER_ENA(1) |
4933 S_0286D0_PERSP_CENTROID_ENA(1) |
4934 S_0286D0_LINEAR_SAMPLE_ENA(1) |
4935 S_0286D0_LINEAR_CENTER_ENA(1) |
4936 S_0286D0_LINEAR_CENTROID_ENA(1) |
4937 S_0286D0_FRONT_FACE_ENA(1) |
4938 S_0286D0_ANCILLARY_ENA(1) |
4939 S_0286D0_POS_FIXED_PT_ENA(1));
4940 }
4941
4942 shader->info.num_input_sgprs = 0;
4943 shader->info.num_input_vgprs = 0;
4944
4945 for (i = 0; i < fninfo.num_sgpr_params; ++i)
4946 shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4;
4947
4948 for (; i < fninfo.num_params; ++i)
4949 shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4;
4950
4951 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
4952 shader->info.num_input_vgprs -= num_prolog_vgprs;
4953
4954 if (shader->key.as_ls ||
4955 ctx->type == PIPE_SHADER_TESS_CTRL ||
4956 /* GFX9 has the ESGS ring buffer in LDS. */
4957 type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY)
4958 ac_declare_lds_as_pointer(&ctx->ac);
4959 }
4960
4961 /**
4962 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
4963 * for later use.
4964 */
4965 static void preload_ring_buffers(struct si_shader_context *ctx)
4966 {
4967 LLVMBuilderRef builder = ctx->ac.builder;
4968
4969 LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn,
4970 ctx->param_rw_buffers);
4971
4972 if (ctx->screen->info.chip_class <= VI &&
4973 (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) {
4974 unsigned ring =
4975 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
4976 : SI_ES_RING_ESGS;
4977 LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0);
4978
4979 ctx->esgs_ring =
4980 ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4981 }
4982
4983 if (ctx->shader->is_gs_copy_shader) {
4984 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4985
4986 ctx->gsvs_ring[0] =
4987 ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4988 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
4989 const struct si_shader_selector *sel = ctx->shader->selector;
4990 LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0);
4991 LLVMValueRef base_ring;
4992
4993 base_ring = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset);
4994
4995 /* The conceptual layout of the GSVS ring is
4996 * v0c0 .. vLv0 v0c1 .. vLc1 ..
4997 * but the real memory layout is swizzled across
4998 * threads:
4999 * t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
5000 * t16v0c0 ..
5001 * Override the buffer descriptor accordingly.
5002 */
5003 LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
5004 uint64_t stream_offset = 0;
5005
5006 for (unsigned stream = 0; stream < 4; ++stream) {
5007 unsigned num_components;
5008 unsigned stride;
5009 unsigned num_records;
5010 LLVMValueRef ring, tmp;
5011
5012 num_components = sel->info.num_stream_output_components[stream];
5013 if (!num_components)
5014 continue;
5015
5016 stride = 4 * num_components * sel->gs_max_out_vertices;
5017
5018 /* Limit on the stride field for <= CIK. */
5019 assert(stride < (1 << 14));
5020
5021 num_records = 64;
5022
5023 ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
5024 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_0, "");
5025 tmp = LLVMBuildAdd(builder, tmp,
5026 LLVMConstInt(ctx->i64,
5027 stream_offset, 0), "");
5028 stream_offset += stride * 64;
5029
5030 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_0, "");
5031 ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
5032 tmp = LLVMBuildExtractElement(builder, ring, ctx->i32_1, "");
5033 tmp = LLVMBuildOr(builder, tmp,
5034 LLVMConstInt(ctx->i32,
5035 S_008F04_STRIDE(stride) |
5036 S_008F04_SWIZZLE_ENABLE(1), 0), "");
5037 ring = LLVMBuildInsertElement(builder, ring, tmp, ctx->i32_1, "");
5038 ring = LLVMBuildInsertElement(builder, ring,
5039 LLVMConstInt(ctx->i32, num_records, 0),
5040 LLVMConstInt(ctx->i32, 2, 0), "");
5041 ring = LLVMBuildInsertElement(builder, ring,
5042 LLVMConstInt(ctx->i32,
5043 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5044 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5045 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5046 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
5047 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5048 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
5049 S_008F0C_ELEMENT_SIZE(1) | /* element_size = 4 (bytes) */
5050 S_008F0C_INDEX_STRIDE(1) | /* index_stride = 16 (elements) */
5051 S_008F0C_ADD_TID_ENABLE(1),
5052 0),
5053 LLVMConstInt(ctx->i32, 3, 0), "");
5054
5055 ctx->gsvs_ring[stream] = ring;
5056 }
5057 } else if (ctx->type == PIPE_SHADER_TESS_EVAL) {
5058 ctx->tess_offchip_ring = get_tess_ring_descriptor(ctx, TESS_OFFCHIP_RING_TES);
5059 }
5060 }
5061
5062 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5063 LLVMValueRef param_rw_buffers,
5064 unsigned param_pos_fixed_pt)
5065 {
5066 LLVMBuilderRef builder = ctx->ac.builder;
5067 LLVMValueRef slot, desc, offset, row, bit, address[2];
5068
5069 /* Use the fixed-point gl_FragCoord input.
5070 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5071 * per coordinate to get the repeating effect.
5072 */
5073 address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5074 address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5075
5076 /* Load the buffer descriptor. */
5077 slot = LLVMConstInt(ctx->i32, SI_PS_CONST_POLY_STIPPLE, 0);
5078 desc = ac_build_load_to_sgpr(&ctx->ac, param_rw_buffers, slot);
5079
5080 /* The stipple pattern is 32x32, each row has 32 bits. */
5081 offset = LLVMBuildMul(builder, address[1],
5082 LLVMConstInt(ctx->i32, 4, 0), "");
5083 row = buffer_load_const(ctx, desc, offset);
5084 row = ac_to_integer(&ctx->ac, row);
5085 bit = LLVMBuildLShr(builder, row, address[0], "");
5086 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5087 ac_build_kill_if_false(&ctx->ac, bit);
5088 }
5089
5090 void si_shader_binary_read_config(struct ac_shader_binary *binary,
5091 struct si_shader_config *conf,
5092 unsigned symbol_offset)
5093 {
5094 unsigned i;
5095 const unsigned char *config =
5096 ac_shader_binary_config_start(binary, symbol_offset);
5097 bool really_needs_scratch = false;
5098
5099 /* LLVM adds SGPR spills to the scratch size.
5100 * Find out if we really need the scratch buffer.
5101 */
5102 for (i = 0; i < binary->reloc_count; i++) {
5103 const struct ac_shader_reloc *reloc = &binary->relocs[i];
5104
5105 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5106 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5107 really_needs_scratch = true;
5108 break;
5109 }
5110 }
5111
5112 /* XXX: We may be able to emit some of these values directly rather than
5113 * extracting fields to be emitted later.
5114 */
5115
5116 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5117 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5118 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5119 switch (reg) {
5120 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5121 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5122 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5123 case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
5124 case R_00B848_COMPUTE_PGM_RSRC1:
5125 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5126 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5127 conf->float_mode = G_00B028_FLOAT_MODE(value);
5128 conf->rsrc1 = value;
5129 break;
5130 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5131 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5132 break;
5133 case R_00B84C_COMPUTE_PGM_RSRC2:
5134 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5135 conf->rsrc2 = value;
5136 break;
5137 case R_0286CC_SPI_PS_INPUT_ENA:
5138 conf->spi_ps_input_ena = value;
5139 break;
5140 case R_0286D0_SPI_PS_INPUT_ADDR:
5141 conf->spi_ps_input_addr = value;
5142 break;
5143 case R_0286E8_SPI_TMPRING_SIZE:
5144 case R_00B860_COMPUTE_TMPRING_SIZE:
5145 /* WAVESIZE is in units of 256 dwords. */
5146 if (really_needs_scratch)
5147 conf->scratch_bytes_per_wave =
5148 G_00B860_WAVESIZE(value) * 256 * 4;
5149 break;
5150 case 0x4: /* SPILLED_SGPRS */
5151 conf->spilled_sgprs = value;
5152 break;
5153 case 0x8: /* SPILLED_VGPRS */
5154 conf->spilled_vgprs = value;
5155 break;
5156 default:
5157 {
5158 static bool printed;
5159
5160 if (!printed) {
5161 fprintf(stderr, "Warning: LLVM emitted unknown "
5162 "config register: 0x%x\n", reg);
5163 printed = true;
5164 }
5165 }
5166 break;
5167 }
5168 }
5169
5170 if (!conf->spi_ps_input_addr)
5171 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5172 }
5173
5174 void si_shader_apply_scratch_relocs(struct si_shader *shader,
5175 uint64_t scratch_va)
5176 {
5177 unsigned i;
5178 uint32_t scratch_rsrc_dword0 = scratch_va;
5179 uint32_t scratch_rsrc_dword1 =
5180 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5181
5182 /* Enable scratch coalescing. */
5183 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5184
5185 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5186 const struct ac_shader_reloc *reloc =
5187 &shader->binary.relocs[i];
5188 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5189 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5190 &scratch_rsrc_dword0, 4);
5191 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5192 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5193 &scratch_rsrc_dword1, 4);
5194 }
5195 }
5196 }
5197
5198 /* For the UMR disassembler. */
5199 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
5200 #define DEBUGGER_NUM_MARKERS 5
5201
5202 static unsigned si_get_shader_binary_size(const struct si_shader *shader)
5203 {
5204 unsigned size = shader->binary.code_size;
5205
5206 if (shader->prolog)
5207 size += shader->prolog->binary.code_size;
5208 if (shader->previous_stage)
5209 size += shader->previous_stage->binary.code_size;
5210 if (shader->prolog2)
5211 size += shader->prolog2->binary.code_size;
5212 if (shader->epilog)
5213 size += shader->epilog->binary.code_size;
5214 return size + DEBUGGER_NUM_MARKERS * 4;
5215 }
5216
5217 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5218 {
5219 const struct ac_shader_binary *prolog =
5220 shader->prolog ? &shader->prolog->binary : NULL;
5221 const struct ac_shader_binary *previous_stage =
5222 shader->previous_stage ? &shader->previous_stage->binary : NULL;
5223 const struct ac_shader_binary *prolog2 =
5224 shader->prolog2 ? &shader->prolog2->binary : NULL;
5225 const struct ac_shader_binary *epilog =
5226 shader->epilog ? &shader->epilog->binary : NULL;
5227 const struct ac_shader_binary *mainb = &shader->binary;
5228 unsigned bo_size = si_get_shader_binary_size(shader) +
5229 (!epilog ? mainb->rodata_size : 0);
5230 unsigned char *ptr;
5231
5232 assert(!prolog || !prolog->rodata_size);
5233 assert(!previous_stage || !previous_stage->rodata_size);
5234 assert(!prolog2 || !prolog2->rodata_size);
5235 assert((!prolog && !previous_stage && !prolog2 && !epilog) ||
5236 !mainb->rodata_size);
5237 assert(!epilog || !epilog->rodata_size);
5238
5239 r600_resource_reference(&shader->bo, NULL);
5240 shader->bo = si_aligned_buffer_create(&sscreen->b,
5241 sscreen->cpdma_prefetch_writes_memory ?
5242 0 : SI_RESOURCE_FLAG_READ_ONLY,
5243 PIPE_USAGE_IMMUTABLE,
5244 align(bo_size, SI_CPDMA_ALIGNMENT),
5245 256);
5246 if (!shader->bo)
5247 return -ENOMEM;
5248
5249 /* Upload. */
5250 ptr = sscreen->ws->buffer_map(shader->bo->buf, NULL,
5251 PIPE_TRANSFER_READ_WRITE |
5252 PIPE_TRANSFER_UNSYNCHRONIZED |
5253 RADEON_TRANSFER_TEMPORARY);
5254
5255 /* Don't use util_memcpy_cpu_to_le32. LLVM binaries are
5256 * endian-independent. */
5257 if (prolog) {
5258 memcpy(ptr, prolog->code, prolog->code_size);
5259 ptr += prolog->code_size;
5260 }
5261 if (previous_stage) {
5262 memcpy(ptr, previous_stage->code, previous_stage->code_size);
5263 ptr += previous_stage->code_size;
5264 }
5265 if (prolog2) {
5266 memcpy(ptr, prolog2->code, prolog2->code_size);
5267 ptr += prolog2->code_size;
5268 }
5269
5270 memcpy(ptr, mainb->code, mainb->code_size);
5271 ptr += mainb->code_size;
5272
5273 if (epilog) {
5274 memcpy(ptr, epilog->code, epilog->code_size);
5275 ptr += epilog->code_size;
5276 } else if (mainb->rodata_size > 0) {
5277 memcpy(ptr, mainb->rodata, mainb->rodata_size);
5278 ptr += mainb->rodata_size;
5279 }
5280
5281 /* Add end-of-code markers for the UMR disassembler. */
5282 uint32_t *ptr32 = (uint32_t*)ptr;
5283 for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
5284 ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
5285
5286 sscreen->ws->buffer_unmap(shader->bo->buf);
5287 return 0;
5288 }
5289
5290 static void si_shader_dump_disassembly(const struct ac_shader_binary *binary,
5291 struct pipe_debug_callback *debug,
5292 const char *name, FILE *file)
5293 {
5294 char *line, *p;
5295 unsigned i, count;
5296
5297 if (binary->disasm_string) {
5298 fprintf(file, "Shader %s disassembly:\n", name);
5299 fprintf(file, "%s", binary->disasm_string);
5300
5301 if (debug && debug->debug_message) {
5302 /* Very long debug messages are cut off, so send the
5303 * disassembly one line at a time. This causes more
5304 * overhead, but on the plus side it simplifies
5305 * parsing of resulting logs.
5306 */
5307 pipe_debug_message(debug, SHADER_INFO,
5308 "Shader Disassembly Begin");
5309
5310 line = binary->disasm_string;
5311 while (*line) {
5312 p = util_strchrnul(line, '\n');
5313 count = p - line;
5314
5315 if (count) {
5316 pipe_debug_message(debug, SHADER_INFO,
5317 "%.*s", count, line);
5318 }
5319
5320 if (!*p)
5321 break;
5322 line = p + 1;
5323 }
5324
5325 pipe_debug_message(debug, SHADER_INFO,
5326 "Shader Disassembly End");
5327 }
5328 } else {
5329 fprintf(file, "Shader %s binary:\n", name);
5330 for (i = 0; i < binary->code_size; i += 4) {
5331 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5332 binary->code[i + 3], binary->code[i + 2],
5333 binary->code[i + 1], binary->code[i]);
5334 }
5335 }
5336 }
5337
5338 static void si_calculate_max_simd_waves(struct si_shader *shader)
5339 {
5340 struct si_screen *sscreen = shader->selector->screen;
5341 struct si_shader_config *conf = &shader->config;
5342 unsigned num_inputs = shader->selector->info.num_inputs;
5343 unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : 256;
5344 unsigned lds_per_wave = 0;
5345 unsigned max_simd_waves;
5346
5347 max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
5348
5349 /* Compute LDS usage for PS. */
5350 switch (shader->selector->type) {
5351 case PIPE_SHADER_FRAGMENT:
5352 /* The minimum usage per wave is (num_inputs * 48). The maximum
5353 * usage is (num_inputs * 48 * 16).
5354 * We can get anything in between and it varies between waves.
5355 *
5356 * The 48 bytes per input for a single primitive is equal to
5357 * 4 bytes/component * 4 components/input * 3 points.
5358 *
5359 * Other stages don't know the size at compile time or don't
5360 * allocate LDS per wave, but instead they do it per thread group.
5361 */
5362 lds_per_wave = conf->lds_size * lds_increment +
5363 align(num_inputs * 48, lds_increment);
5364 break;
5365 case PIPE_SHADER_COMPUTE:
5366 if (shader->selector) {
5367 unsigned max_workgroup_size =
5368 si_get_max_workgroup_size(shader);
5369 lds_per_wave = (conf->lds_size * lds_increment) /
5370 DIV_ROUND_UP(max_workgroup_size, 64);
5371 }
5372 break;
5373 }
5374
5375 /* Compute the per-SIMD wave counts. */
5376 if (conf->num_sgprs) {
5377 if (sscreen->info.chip_class >= VI)
5378 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
5379 else
5380 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
5381 }
5382
5383 if (conf->num_vgprs)
5384 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5385
5386 /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
5387 * 16KB makes some SIMDs unoccupied). */
5388 if (lds_per_wave)
5389 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5390
5391 conf->max_simd_waves = max_simd_waves;
5392 }
5393
5394 void si_shader_dump_stats_for_shader_db(const struct si_shader *shader,
5395 struct pipe_debug_callback *debug)
5396 {
5397 const struct si_shader_config *conf = &shader->config;
5398
5399 pipe_debug_message(debug, SHADER_INFO,
5400 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5401 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
5402 "Spilled VGPRs: %d PrivMem VGPRs: %d",
5403 conf->num_sgprs, conf->num_vgprs,
5404 si_get_shader_binary_size(shader),
5405 conf->lds_size, conf->scratch_bytes_per_wave,
5406 conf->max_simd_waves, conf->spilled_sgprs,
5407 conf->spilled_vgprs, conf->private_mem_vgprs);
5408 }
5409
5410 static void si_shader_dump_stats(struct si_screen *sscreen,
5411 const struct si_shader *shader,
5412 unsigned processor,
5413 FILE *file,
5414 bool check_debug_option)
5415 {
5416 const struct si_shader_config *conf = &shader->config;
5417
5418 if (!check_debug_option ||
5419 si_can_dump_shader(sscreen, processor)) {
5420 if (processor == PIPE_SHADER_FRAGMENT) {
5421 fprintf(file, "*** SHADER CONFIG ***\n"
5422 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5423 "SPI_PS_INPUT_ENA = 0x%04x\n",
5424 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5425 }
5426
5427 fprintf(file, "*** SHADER STATS ***\n"
5428 "SGPRS: %d\n"
5429 "VGPRS: %d\n"
5430 "Spilled SGPRs: %d\n"
5431 "Spilled VGPRs: %d\n"
5432 "Private memory VGPRs: %d\n"
5433 "Code Size: %d bytes\n"
5434 "LDS: %d blocks\n"
5435 "Scratch: %d bytes per wave\n"
5436 "Max Waves: %d\n"
5437 "********************\n\n\n",
5438 conf->num_sgprs, conf->num_vgprs,
5439 conf->spilled_sgprs, conf->spilled_vgprs,
5440 conf->private_mem_vgprs,
5441 si_get_shader_binary_size(shader),
5442 conf->lds_size, conf->scratch_bytes_per_wave,
5443 conf->max_simd_waves);
5444 }
5445 }
5446
5447 const char *si_get_shader_name(const struct si_shader *shader, unsigned processor)
5448 {
5449 switch (processor) {
5450 case PIPE_SHADER_VERTEX:
5451 if (shader->key.as_es)
5452 return "Vertex Shader as ES";
5453 else if (shader->key.as_ls)
5454 return "Vertex Shader as LS";
5455 else
5456 return "Vertex Shader as VS";
5457 case PIPE_SHADER_TESS_CTRL:
5458 return "Tessellation Control Shader";
5459 case PIPE_SHADER_TESS_EVAL:
5460 if (shader->key.as_es)
5461 return "Tessellation Evaluation Shader as ES";
5462 else
5463 return "Tessellation Evaluation Shader as VS";
5464 case PIPE_SHADER_GEOMETRY:
5465 if (shader->is_gs_copy_shader)
5466 return "GS Copy Shader as VS";
5467 else
5468 return "Geometry Shader";
5469 case PIPE_SHADER_FRAGMENT:
5470 return "Pixel Shader";
5471 case PIPE_SHADER_COMPUTE:
5472 return "Compute Shader";
5473 default:
5474 return "Unknown Shader";
5475 }
5476 }
5477
5478 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
5479 struct pipe_debug_callback *debug, unsigned processor,
5480 FILE *file, bool check_debug_option)
5481 {
5482 if (!check_debug_option ||
5483 si_can_dump_shader(sscreen, processor))
5484 si_dump_shader_key(processor, shader, file);
5485
5486 if (!check_debug_option && shader->binary.llvm_ir_string) {
5487 if (shader->previous_stage &&
5488 shader->previous_stage->binary.llvm_ir_string) {
5489 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n",
5490 si_get_shader_name(shader, processor));
5491 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
5492 }
5493
5494 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
5495 si_get_shader_name(shader, processor));
5496 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
5497 }
5498
5499 if (!check_debug_option ||
5500 (si_can_dump_shader(sscreen, processor) &&
5501 !(sscreen->debug_flags & DBG(NO_ASM)))) {
5502 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5503
5504 if (shader->prolog)
5505 si_shader_dump_disassembly(&shader->prolog->binary,
5506 debug, "prolog", file);
5507 if (shader->previous_stage)
5508 si_shader_dump_disassembly(&shader->previous_stage->binary,
5509 debug, "previous stage", file);
5510 if (shader->prolog2)
5511 si_shader_dump_disassembly(&shader->prolog2->binary,
5512 debug, "prolog2", file);
5513
5514 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5515
5516 if (shader->epilog)
5517 si_shader_dump_disassembly(&shader->epilog->binary,
5518 debug, "epilog", file);
5519 fprintf(file, "\n");
5520 }
5521
5522 si_shader_dump_stats(sscreen, shader, processor, file,
5523 check_debug_option);
5524 }
5525
5526 static int si_compile_llvm(struct si_screen *sscreen,
5527 struct ac_shader_binary *binary,
5528 struct si_shader_config *conf,
5529 struct ac_llvm_compiler *compiler,
5530 LLVMModuleRef mod,
5531 struct pipe_debug_callback *debug,
5532 unsigned processor,
5533 const char *name,
5534 bool less_optimized)
5535 {
5536 int r = 0;
5537 unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
5538
5539 if (si_can_dump_shader(sscreen, processor)) {
5540 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5541
5542 if (!(sscreen->debug_flags & (DBG(NO_IR) | DBG(PREOPT_IR)))) {
5543 fprintf(stderr, "%s LLVM IR:\n\n", name);
5544 ac_dump_module(mod);
5545 fprintf(stderr, "\n");
5546 }
5547 }
5548
5549 if (sscreen->record_llvm_ir) {
5550 char *ir = LLVMPrintModuleToString(mod);
5551 binary->llvm_ir_string = strdup(ir);
5552 LLVMDisposeMessage(ir);
5553 }
5554
5555 if (!si_replace_shader(count, binary)) {
5556 r = si_llvm_compile(mod, binary, compiler, debug,
5557 less_optimized);
5558 if (r)
5559 return r;
5560 }
5561
5562 si_shader_binary_read_config(binary, conf, 0);
5563
5564 /* Enable 64-bit and 16-bit denormals, because there is no performance
5565 * cost.
5566 *
5567 * If denormals are enabled, all floating-point output modifiers are
5568 * ignored.
5569 *
5570 * Don't enable denormals for 32-bit floats, because:
5571 * - Floating-point output modifiers would be ignored by the hw.
5572 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5573 * have to stop using those.
5574 * - SI & CI would be very slow.
5575 */
5576 conf->float_mode |= V_00B028_FP_64_DENORMS;
5577
5578 FREE(binary->config);
5579 FREE(binary->global_symbol_offsets);
5580 binary->config = NULL;
5581 binary->global_symbol_offsets = NULL;
5582
5583 /* Some shaders can't have rodata because their binaries can be
5584 * concatenated.
5585 */
5586 if (binary->rodata_size &&
5587 (processor == PIPE_SHADER_VERTEX ||
5588 processor == PIPE_SHADER_TESS_CTRL ||
5589 processor == PIPE_SHADER_TESS_EVAL ||
5590 processor == PIPE_SHADER_FRAGMENT)) {
5591 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5592 return -EINVAL;
5593 }
5594
5595 return r;
5596 }
5597
5598 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
5599 {
5600 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
5601 LLVMBuildRetVoid(ctx->ac.builder);
5602 else
5603 LLVMBuildRet(ctx->ac.builder, ret);
5604 }
5605
5606 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5607 struct si_shader *
5608 si_generate_gs_copy_shader(struct si_screen *sscreen,
5609 struct ac_llvm_compiler *compiler,
5610 struct si_shader_selector *gs_selector,
5611 struct pipe_debug_callback *debug)
5612 {
5613 struct si_shader_context ctx;
5614 struct si_shader *shader;
5615 LLVMBuilderRef builder;
5616 struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
5617 struct tgsi_shader_info *gsinfo = &gs_selector->info;
5618 int i, r;
5619
5620
5621 shader = CALLOC_STRUCT(si_shader);
5622 if (!shader)
5623 return NULL;
5624
5625 /* We can leave the fence as permanently signaled because the GS copy
5626 * shader only becomes visible globally after it has been compiled. */
5627 util_queue_fence_init(&shader->ready);
5628
5629 shader->selector = gs_selector;
5630 shader->is_gs_copy_shader = true;
5631
5632 si_init_shader_ctx(&ctx, sscreen, compiler);
5633 ctx.shader = shader;
5634 ctx.type = PIPE_SHADER_VERTEX;
5635
5636 builder = ctx.ac.builder;
5637
5638 create_function(&ctx);
5639 preload_ring_buffers(&ctx);
5640
5641 LLVMValueRef voffset =
5642 LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
5643 LLVMConstInt(ctx.i32, 4, 0), "");
5644
5645 /* Fetch the vertex stream ID.*/
5646 LLVMValueRef stream_id;
5647
5648 if (gs_selector->so.num_outputs)
5649 stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2);
5650 else
5651 stream_id = ctx.i32_0;
5652
5653 /* Fill in output information. */
5654 for (i = 0; i < gsinfo->num_outputs; ++i) {
5655 outputs[i].semantic_name = gsinfo->output_semantic_name[i];
5656 outputs[i].semantic_index = gsinfo->output_semantic_index[i];
5657
5658 for (int chan = 0; chan < 4; chan++) {
5659 outputs[i].vertex_stream[chan] =
5660 (gsinfo->output_streams[i] >> (2 * chan)) & 3;
5661 }
5662 }
5663
5664 LLVMBasicBlockRef end_bb;
5665 LLVMValueRef switch_inst;
5666
5667 end_bb = LLVMAppendBasicBlockInContext(ctx.ac.context, ctx.main_fn, "end");
5668 switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
5669
5670 for (int stream = 0; stream < 4; stream++) {
5671 LLVMBasicBlockRef bb;
5672 unsigned offset;
5673
5674 if (!gsinfo->num_stream_output_components[stream])
5675 continue;
5676
5677 if (stream > 0 && !gs_selector->so.num_outputs)
5678 continue;
5679
5680 bb = LLVMInsertBasicBlockInContext(ctx.ac.context, end_bb, "out");
5681 LLVMAddCase(switch_inst, LLVMConstInt(ctx.i32, stream, 0), bb);
5682 LLVMPositionBuilderAtEnd(builder, bb);
5683
5684 /* Fetch vertex data from GSVS ring */
5685 offset = 0;
5686 for (i = 0; i < gsinfo->num_outputs; ++i) {
5687 for (unsigned chan = 0; chan < 4; chan++) {
5688 if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
5689 outputs[i].vertex_stream[chan] != stream) {
5690 outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
5691 continue;
5692 }
5693
5694 LLVMValueRef soffset = LLVMConstInt(ctx.i32,
5695 offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
5696 offset++;
5697
5698 outputs[i].values[chan] =
5699 ac_build_buffer_load(&ctx.ac,
5700 ctx.gsvs_ring[0], 1,
5701 ctx.i32_0, voffset,
5702 soffset, 0, 1, 1,
5703 true, false);
5704 }
5705 }
5706
5707 /* Streamout and exports. */
5708 if (gs_selector->so.num_outputs) {
5709 si_llvm_emit_streamout(&ctx, outputs,
5710 gsinfo->num_outputs,
5711 stream);
5712 }
5713
5714 if (stream == 0) {
5715 /* Vertex color clamping.
5716 *
5717 * This uses a state constant loaded in a user data SGPR and
5718 * an IF statement is added that clamps all colors if the constant
5719 * is true.
5720 */
5721 struct lp_build_if_state if_ctx;
5722 LLVMValueRef v[2], cond = NULL;
5723 LLVMBasicBlockRef blocks[2];
5724
5725 for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
5726 if (gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
5727 gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
5728 continue;
5729
5730 /* We've found a color. */
5731 if (!cond) {
5732 /* The state is in the first bit of the user SGPR. */
5733 cond = LLVMGetParam(ctx.main_fn,
5734 ctx.param_vs_state_bits);
5735 cond = LLVMBuildTrunc(ctx.ac.builder, cond,
5736 ctx.i1, "");
5737 lp_build_if(&if_ctx, &ctx.gallivm, cond);
5738 /* Remember blocks for Phi. */
5739 blocks[0] = if_ctx.true_block;
5740 blocks[1] = if_ctx.entry_block;
5741 }
5742
5743 for (unsigned j = 0; j < 4; j++) {
5744 /* Insert clamp into the true block. */
5745 v[0] = ac_build_clamp(&ctx.ac, outputs[i].values[j]);
5746 v[1] = outputs[i].values[j];
5747
5748 /* Insert Phi into the endif block. */
5749 LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.merge_block);
5750 outputs[i].values[j] = ac_build_phi(&ctx.ac, ctx.f32, 2, v, blocks);
5751 LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.true_block);
5752 }
5753 }
5754 if (cond)
5755 lp_build_endif(&if_ctx);
5756
5757 si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
5758 }
5759
5760 LLVMBuildBr(builder, end_bb);
5761 }
5762
5763 LLVMPositionBuilderAtEnd(builder, end_bb);
5764
5765 LLVMBuildRetVoid(ctx.ac.builder);
5766
5767 ctx.type = PIPE_SHADER_GEOMETRY; /* override for shader dumping */
5768 si_llvm_optimize_module(&ctx);
5769
5770 r = si_compile_llvm(sscreen, &ctx.shader->binary,
5771 &ctx.shader->config, ctx.compiler,
5772 ctx.ac.module,
5773 debug, PIPE_SHADER_GEOMETRY,
5774 "GS Copy Shader", false);
5775 if (!r) {
5776 if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
5777 fprintf(stderr, "GS Copy Shader:\n");
5778 si_shader_dump(sscreen, ctx.shader, debug,
5779 PIPE_SHADER_GEOMETRY, stderr, true);
5780 r = si_shader_binary_upload(sscreen, ctx.shader);
5781 }
5782
5783 si_llvm_dispose(&ctx);
5784
5785 if (r != 0) {
5786 FREE(shader);
5787 shader = NULL;
5788 } else {
5789 si_fix_resource_usage(sscreen, shader);
5790 }
5791 return shader;
5792 }
5793
5794 static void si_dump_shader_key_vs(const struct si_shader_key *key,
5795 const struct si_vs_prolog_bits *prolog,
5796 const char *prefix, FILE *f)
5797 {
5798 fprintf(f, " %s.instance_divisor_is_one = %u\n",
5799 prefix, prolog->instance_divisor_is_one);
5800 fprintf(f, " %s.instance_divisor_is_fetched = %u\n",
5801 prefix, prolog->instance_divisor_is_fetched);
5802 fprintf(f, " %s.ls_vgpr_fix = %u\n",
5803 prefix, prolog->ls_vgpr_fix);
5804
5805 fprintf(f, " mono.vs.fix_fetch = {");
5806 for (int i = 0; i < SI_MAX_ATTRIBS; i++)
5807 fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
5808 fprintf(f, "}\n");
5809 }
5810
5811 static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
5812 FILE *f)
5813 {
5814 const struct si_shader_key *key = &shader->key;
5815
5816 fprintf(f, "SHADER KEY\n");
5817
5818 switch (processor) {
5819 case PIPE_SHADER_VERTEX:
5820 si_dump_shader_key_vs(key, &key->part.vs.prolog,
5821 "part.vs.prolog", f);
5822 fprintf(f, " as_es = %u\n", key->as_es);
5823 fprintf(f, " as_ls = %u\n", key->as_ls);
5824 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
5825 key->mono.u.vs_export_prim_id);
5826 break;
5827
5828 case PIPE_SHADER_TESS_CTRL:
5829 if (shader->selector->screen->info.chip_class >= GFX9) {
5830 si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog,
5831 "part.tcs.ls_prolog", f);
5832 }
5833 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
5834 fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%"PRIx64"\n", key->mono.u.ff_tcs_inputs_to_copy);
5835 break;
5836
5837 case PIPE_SHADER_TESS_EVAL:
5838 fprintf(f, " as_es = %u\n", key->as_es);
5839 fprintf(f, " mono.u.vs_export_prim_id = %u\n",
5840 key->mono.u.vs_export_prim_id);
5841 break;
5842
5843 case PIPE_SHADER_GEOMETRY:
5844 if (shader->is_gs_copy_shader)
5845 break;
5846
5847 if (shader->selector->screen->info.chip_class >= GFX9 &&
5848 key->part.gs.es->type == PIPE_SHADER_VERTEX) {
5849 si_dump_shader_key_vs(key, &key->part.gs.vs_prolog,
5850 "part.gs.vs_prolog", f);
5851 }
5852 fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", key->part.gs.prolog.tri_strip_adj_fix);
5853 break;
5854
5855 case PIPE_SHADER_COMPUTE:
5856 break;
5857
5858 case PIPE_SHADER_FRAGMENT:
5859 fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
5860 fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
5861 fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
5862 fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", key->part.ps.prolog.force_persp_sample_interp);
5863 fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", key->part.ps.prolog.force_linear_sample_interp);
5864 fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", key->part.ps.prolog.force_persp_center_interp);
5865 fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", key->part.ps.prolog.force_linear_center_interp);
5866 fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", key->part.ps.prolog.bc_optimize_for_persp);
5867 fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", key->part.ps.prolog.bc_optimize_for_linear);
5868 fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", key->part.ps.epilog.spi_shader_col_format);
5869 fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
5870 fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
5871 fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
5872 fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
5873 fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
5874 fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", key->part.ps.epilog.poly_line_smoothing);
5875 fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
5876 break;
5877
5878 default:
5879 assert(0);
5880 }
5881
5882 if ((processor == PIPE_SHADER_GEOMETRY ||
5883 processor == PIPE_SHADER_TESS_EVAL ||
5884 processor == PIPE_SHADER_VERTEX) &&
5885 !key->as_es && !key->as_ls) {
5886 fprintf(f, " opt.kill_outputs = 0x%"PRIx64"\n", key->opt.kill_outputs);
5887 fprintf(f, " opt.clip_disable = %u\n", key->opt.clip_disable);
5888 }
5889 }
5890
5891 static void si_init_shader_ctx(struct si_shader_context *ctx,
5892 struct si_screen *sscreen,
5893 struct ac_llvm_compiler *compiler)
5894 {
5895 struct lp_build_tgsi_context *bld_base;
5896
5897 si_llvm_context_init(ctx, sscreen, compiler);
5898
5899 bld_base = &ctx->bld_base;
5900 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5901
5902 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID].emit = build_interp_intrinsic;
5903 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE].emit = build_interp_intrinsic;
5904 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET].emit = build_interp_intrinsic;
5905
5906 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5907
5908 bld_base->op_actions[TGSI_OPCODE_CLOCK].emit = clock_emit;
5909
5910 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5911 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5912 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5913 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5914
5915 bld_base->op_actions[TGSI_OPCODE_VOTE_ALL].emit = vote_all_emit;
5916 bld_base->op_actions[TGSI_OPCODE_VOTE_ANY].emit = vote_any_emit;
5917 bld_base->op_actions[TGSI_OPCODE_VOTE_EQ].emit = vote_eq_emit;
5918 bld_base->op_actions[TGSI_OPCODE_BALLOT].emit = ballot_emit;
5919 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].intr_name = "llvm.amdgcn.readfirstlane";
5920 bld_base->op_actions[TGSI_OPCODE_READ_FIRST].emit = read_lane_emit;
5921 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].intr_name = "llvm.amdgcn.readlane";
5922 bld_base->op_actions[TGSI_OPCODE_READ_INVOC].emit = read_lane_emit;
5923
5924 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_tgsi_emit_vertex;
5925 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_tgsi_emit_primitive;
5926 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5927 }
5928
5929 static void si_optimize_vs_outputs(struct si_shader_context *ctx)
5930 {
5931 struct si_shader *shader = ctx->shader;
5932 struct tgsi_shader_info *info = &shader->selector->info;
5933
5934 if ((ctx->type != PIPE_SHADER_VERTEX &&
5935 ctx->type != PIPE_SHADER_TESS_EVAL) ||
5936 shader->key.as_ls ||
5937 shader->key.as_es)
5938 return;
5939
5940 ac_optimize_vs_outputs(&ctx->ac,
5941 ctx->main_fn,
5942 shader->info.vs_output_param_offset,
5943 info->num_outputs,
5944 &shader->info.nr_param_exports);
5945 }
5946
5947 static void si_init_exec_from_input(struct si_shader_context *ctx,
5948 unsigned param, unsigned bitoffset)
5949 {
5950 LLVMValueRef args[] = {
5951 LLVMGetParam(ctx->main_fn, param),
5952 LLVMConstInt(ctx->i32, bitoffset, 0),
5953 };
5954 ac_build_intrinsic(&ctx->ac,
5955 "llvm.amdgcn.init.exec.from.input",
5956 ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
5957 }
5958
5959 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
5960 const struct si_vs_prolog_bits *key)
5961 {
5962 /* VGPR initialization fixup for Vega10 and Raven is always done in the
5963 * VS prolog. */
5964 return sel->vs_needs_prolog || key->ls_vgpr_fix;
5965 }
5966
5967 static bool si_compile_tgsi_main(struct si_shader_context *ctx)
5968 {
5969 struct si_shader *shader = ctx->shader;
5970 struct si_shader_selector *sel = shader->selector;
5971 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
5972
5973 // TODO clean all this up!
5974 switch (ctx->type) {
5975 case PIPE_SHADER_VERTEX:
5976 ctx->load_input = declare_input_vs;
5977 if (shader->key.as_ls)
5978 ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue;
5979 else if (shader->key.as_es)
5980 ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
5981 else
5982 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
5983 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5984 ctx->abi.load_base_vertex = get_base_vertex;
5985 break;
5986 case PIPE_SHADER_TESS_CTRL:
5987 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5988 ctx->abi.load_tess_varyings = si_nir_load_tcs_varyings;
5989 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5990 bld_base->emit_store = store_output_tcs;
5991 ctx->abi.store_tcs_outputs = si_nir_store_output_tcs;
5992 ctx->abi.emit_outputs = si_llvm_emit_tcs_epilogue;
5993 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
5994 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
5995 break;
5996 case PIPE_SHADER_TESS_EVAL:
5997 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
5998 ctx->abi.load_tess_varyings = si_nir_load_input_tes;
5999 ctx->abi.load_tess_coord = si_load_tess_coord;
6000 ctx->abi.load_tess_level = si_load_tess_level;
6001 ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in;
6002 if (shader->key.as_es)
6003 ctx->abi.emit_outputs = si_llvm_emit_es_epilogue;
6004 else
6005 ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue;
6006 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
6007 break;
6008 case PIPE_SHADER_GEOMETRY:
6009 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6010 ctx->abi.load_inputs = si_nir_load_input_gs;
6011 ctx->abi.emit_vertex = si_llvm_emit_vertex;
6012 ctx->abi.emit_primitive = si_llvm_emit_primitive;
6013 ctx->abi.emit_outputs = si_llvm_emit_gs_epilogue;
6014 bld_base->emit_epilogue = si_tgsi_emit_gs_epilogue;
6015 break;
6016 case PIPE_SHADER_FRAGMENT:
6017 ctx->load_input = declare_input_fs;
6018 ctx->abi.emit_outputs = si_llvm_return_fs_outputs;
6019 bld_base->emit_epilogue = si_tgsi_emit_epilogue;
6020 ctx->abi.lookup_interp_param = si_nir_lookup_interp_param;
6021 ctx->abi.load_sample_position = load_sample_position;
6022 ctx->abi.load_sample_mask_in = load_sample_mask_in;
6023 ctx->abi.emit_kill = si_llvm_emit_kill;
6024 break;
6025 case PIPE_SHADER_COMPUTE:
6026 ctx->abi.load_local_group_size = get_block_size;
6027 break;
6028 default:
6029 assert(!"Unsupported shader type");
6030 return false;
6031 }
6032
6033 ctx->abi.load_ubo = load_ubo;
6034 ctx->abi.load_ssbo = load_ssbo;
6035
6036 create_function(ctx);
6037 preload_ring_buffers(ctx);
6038
6039 /* For GFX9 merged shaders:
6040 * - Set EXEC for the first shader. If the prolog is present, set
6041 * EXEC there instead.
6042 * - Add a barrier before the second shader.
6043 * - In the second shader, reset EXEC to ~0 and wrap the main part in
6044 * an if-statement. This is required for correctness in geometry
6045 * shaders, to ensure that empty GS waves do not send GS_EMIT and
6046 * GS_CUT messages.
6047 *
6048 * For monolithic merged shaders, the first shader is wrapped in an
6049 * if-block together with its prolog in si_build_wrapper_function.
6050 */
6051 if (ctx->screen->info.chip_class >= GFX9) {
6052 if (!shader->is_monolithic &&
6053 sel->info.num_instructions > 1 && /* not empty shader */
6054 (shader->key.as_es || shader->key.as_ls) &&
6055 (ctx->type == PIPE_SHADER_TESS_EVAL ||
6056 (ctx->type == PIPE_SHADER_VERTEX &&
6057 !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog)))) {
6058 si_init_exec_from_input(ctx,
6059 ctx->param_merged_wave_info, 0);
6060 } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
6061 ctx->type == PIPE_SHADER_GEOMETRY) {
6062 if (!shader->is_monolithic)
6063 ac_init_exec_full_mask(&ctx->ac);
6064
6065 LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
6066 LLVMValueRef ena =
6067 LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
6068 ac_get_thread_id(&ctx->ac), num_threads, "");
6069 lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
6070
6071 /* The barrier must execute for all shaders in a
6072 * threadgroup.
6073 *
6074 * Execute the barrier inside the conditional block,
6075 * so that empty waves can jump directly to s_endpgm,
6076 * which will also signal the barrier.
6077 *
6078 * If the shader is TCS and the TCS epilog is present
6079 * and contains a barrier, it will wait there and then
6080 * reach s_endpgm.
6081 */
6082 si_llvm_emit_barrier(NULL, bld_base, NULL);
6083 }
6084 }
6085
6086 if (ctx->type == PIPE_SHADER_TESS_CTRL &&
6087 sel->tcs_info.tessfactors_are_def_in_all_invocs) {
6088 for (unsigned i = 0; i < 6; i++) {
6089 ctx->invoc0_tess_factors[i] =
6090 ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
6091 }
6092 }
6093
6094 if (ctx->type == PIPE_SHADER_GEOMETRY) {
6095 int i;
6096 for (i = 0; i < 4; i++) {
6097 ctx->gs_next_vertex[i] =
6098 ac_build_alloca(&ctx->ac, ctx->i32, "");
6099 }
6100 }
6101
6102 if (sel->force_correct_derivs_after_kill) {
6103 ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
6104 /* true = don't kill. */
6105 LLVMBuildStore(ctx->ac.builder, ctx->i1true,
6106 ctx->postponed_kill);
6107 }
6108
6109 if (sel->tokens) {
6110 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6111 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6112 return false;
6113 }
6114 } else {
6115 if (!si_nir_build_llvm(ctx, sel->nir)) {
6116 fprintf(stderr, "Failed to translate shader from NIR to LLVM\n");
6117 return false;
6118 }
6119 }
6120
6121 si_llvm_build_ret(ctx, ctx->return_value);
6122 return true;
6123 }
6124
6125 /**
6126 * Compute the VS prolog key, which contains all the information needed to
6127 * build the VS prolog function, and set shader->info bits where needed.
6128 *
6129 * \param info Shader info of the vertex shader.
6130 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
6131 * \param prolog_key Key of the VS prolog
6132 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
6133 * \param key Output shader part key.
6134 */
6135 static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
6136 unsigned num_input_sgprs,
6137 const struct si_vs_prolog_bits *prolog_key,
6138 struct si_shader *shader_out,
6139 union si_shader_part_key *key)
6140 {
6141 memset(key, 0, sizeof(*key));
6142 key->vs_prolog.states = *prolog_key;
6143 key->vs_prolog.num_input_sgprs = num_input_sgprs;
6144 key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6145 key->vs_prolog.as_ls = shader_out->key.as_ls;
6146 key->vs_prolog.as_es = shader_out->key.as_es;
6147
6148 if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
6149 key->vs_prolog.as_ls = 1;
6150 key->vs_prolog.num_merged_next_stage_vgprs = 2;
6151 } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
6152 key->vs_prolog.as_es = 1;
6153 key->vs_prolog.num_merged_next_stage_vgprs = 5;
6154 }
6155
6156 /* Enable loading the InstanceID VGPR. */
6157 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
6158
6159 if ((key->vs_prolog.states.instance_divisor_is_one |
6160 key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
6161 shader_out->info.uses_instanceid = true;
6162 }
6163
6164 /**
6165 * Compute the PS prolog key, which contains all the information needed to
6166 * build the PS prolog function, and set related bits in shader->config.
6167 */
6168 static void si_get_ps_prolog_key(struct si_shader *shader,
6169 union si_shader_part_key *key,
6170 bool separate_prolog)
6171 {
6172 struct tgsi_shader_info *info = &shader->selector->info;
6173
6174 memset(key, 0, sizeof(*key));
6175 key->ps_prolog.states = shader->key.part.ps.prolog;
6176 key->ps_prolog.colors_read = info->colors_read;
6177 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6178 key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
6179 key->ps_prolog.wqm = info->uses_derivatives &&
6180 (key->ps_prolog.colors_read ||
6181 key->ps_prolog.states.force_persp_sample_interp ||
6182 key->ps_prolog.states.force_linear_sample_interp ||
6183 key->ps_prolog.states.force_persp_center_interp ||
6184 key->ps_prolog.states.force_linear_center_interp ||
6185 key->ps_prolog.states.bc_optimize_for_persp ||
6186 key->ps_prolog.states.bc_optimize_for_linear);
6187 key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
6188
6189 if (info->colors_read) {
6190 unsigned *color = shader->selector->color_attr_index;
6191
6192 if (shader->key.part.ps.prolog.color_two_side) {
6193 /* BCOLORs are stored after the last input. */
6194 key->ps_prolog.num_interp_inputs = info->num_inputs;
6195 key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
6196 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
6197 }
6198
6199 for (unsigned i = 0; i < 2; i++) {
6200 unsigned interp = info->input_interpolate[color[i]];
6201 unsigned location = info->input_interpolate_loc[color[i]];
6202
6203 if (!(info->colors_read & (0xf << i*4)))
6204 continue;
6205
6206 key->ps_prolog.color_attr_index[i] = color[i];
6207
6208 if (shader->key.part.ps.prolog.flatshade_colors &&
6209 interp == TGSI_INTERPOLATE_COLOR)
6210 interp = TGSI_INTERPOLATE_CONSTANT;
6211
6212 switch (interp) {
6213 case TGSI_INTERPOLATE_CONSTANT:
6214 key->ps_prolog.color_interp_vgpr_index[i] = -1;
6215 break;
6216 case TGSI_INTERPOLATE_PERSPECTIVE:
6217 case TGSI_INTERPOLATE_COLOR:
6218 /* Force the interpolation location for colors here. */
6219 if (shader->key.part.ps.prolog.force_persp_sample_interp)
6220 location = TGSI_INTERPOLATE_LOC_SAMPLE;
6221 if (shader->key.part.ps.prolog.force_persp_center_interp)
6222 location = TGSI_INTERPOLATE_LOC_CENTER;
6223
6224 switch (location) {
6225 case TGSI_INTERPOLATE_LOC_SAMPLE:
6226 key->ps_prolog.color_interp_vgpr_index[i] = 0;
6227 shader->config.spi_ps_input_ena |=
6228 S_0286CC_PERSP_SAMPLE_ENA(1);
6229 break;
6230 case TGSI_INTERPOLATE_LOC_CENTER:
6231 key->ps_prolog.color_interp_vgpr_index[i] = 2;
6232 shader->config.spi_ps_input_ena |=
6233 S_0286CC_PERSP_CENTER_ENA(1);
6234 break;
6235 case TGSI_INTERPOLATE_LOC_CENTROID:
6236 key->ps_prolog.color_interp_vgpr_index[i] = 4;
6237 shader->config.spi_ps_input_ena |=
6238 S_0286CC_PERSP_CENTROID_ENA(1);
6239 break;
6240 default:
6241 assert(0);
6242 }
6243 break;
6244 case TGSI_INTERPOLATE_LINEAR:
6245 /* Force the interpolation location for colors here. */
6246 if (shader->key.part.ps.prolog.force_linear_sample_interp)
6247 location = TGSI_INTERPOLATE_LOC_SAMPLE;
6248 if (shader->key.part.ps.prolog.force_linear_center_interp)
6249 location = TGSI_INTERPOLATE_LOC_CENTER;
6250
6251 /* The VGPR assignment for non-monolithic shaders
6252 * works because InitialPSInputAddr is set on the
6253 * main shader and PERSP_PULL_MODEL is never used.
6254 */
6255 switch (location) {
6256 case TGSI_INTERPOLATE_LOC_SAMPLE:
6257 key->ps_prolog.color_interp_vgpr_index[i] =
6258 separate_prolog ? 6 : 9;
6259 shader->config.spi_ps_input_ena |=
6260 S_0286CC_LINEAR_SAMPLE_ENA(1);
6261 break;
6262 case TGSI_INTERPOLATE_LOC_CENTER:
6263 key->ps_prolog.color_interp_vgpr_index[i] =
6264 separate_prolog ? 8 : 11;
6265 shader->config.spi_ps_input_ena |=
6266 S_0286CC_LINEAR_CENTER_ENA(1);
6267 break;
6268 case TGSI_INTERPOLATE_LOC_CENTROID:
6269 key->ps_prolog.color_interp_vgpr_index[i] =
6270 separate_prolog ? 10 : 13;
6271 shader->config.spi_ps_input_ena |=
6272 S_0286CC_LINEAR_CENTROID_ENA(1);
6273 break;
6274 default:
6275 assert(0);
6276 }
6277 break;
6278 default:
6279 assert(0);
6280 }
6281 }
6282 }
6283 }
6284
6285 /**
6286 * Check whether a PS prolog is required based on the key.
6287 */
6288 static bool si_need_ps_prolog(const union si_shader_part_key *key)
6289 {
6290 return key->ps_prolog.colors_read ||
6291 key->ps_prolog.states.force_persp_sample_interp ||
6292 key->ps_prolog.states.force_linear_sample_interp ||
6293 key->ps_prolog.states.force_persp_center_interp ||
6294 key->ps_prolog.states.force_linear_center_interp ||
6295 key->ps_prolog.states.bc_optimize_for_persp ||
6296 key->ps_prolog.states.bc_optimize_for_linear ||
6297 key->ps_prolog.states.poly_stipple ||
6298 key->ps_prolog.states.samplemask_log_ps_iter;
6299 }
6300
6301 /**
6302 * Compute the PS epilog key, which contains all the information needed to
6303 * build the PS epilog function.
6304 */
6305 static void si_get_ps_epilog_key(struct si_shader *shader,
6306 union si_shader_part_key *key)
6307 {
6308 struct tgsi_shader_info *info = &shader->selector->info;
6309 memset(key, 0, sizeof(*key));
6310 key->ps_epilog.colors_written = info->colors_written;
6311 key->ps_epilog.writes_z = info->writes_z;
6312 key->ps_epilog.writes_stencil = info->writes_stencil;
6313 key->ps_epilog.writes_samplemask = info->writes_samplemask;
6314 key->ps_epilog.states = shader->key.part.ps.epilog;
6315 }
6316
6317 /**
6318 * Build the GS prolog function. Rotate the input vertices for triangle strips
6319 * with adjacency.
6320 */
6321 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
6322 union si_shader_part_key *key)
6323 {
6324 unsigned num_sgprs, num_vgprs;
6325 struct si_function_info fninfo;
6326 LLVMBuilderRef builder = ctx->ac.builder;
6327 LLVMTypeRef returns[48];
6328 LLVMValueRef func, ret;
6329
6330 si_init_function_info(&fninfo);
6331
6332 if (ctx->screen->info.chip_class >= GFX9) {
6333 if (key->gs_prolog.states.gfx9_prev_is_vs)
6334 num_sgprs = 8 + GFX9_VSGS_NUM_USER_SGPR;
6335 else
6336 num_sgprs = 8 + GFX9_TESGS_NUM_USER_SGPR;
6337 num_vgprs = 5; /* ES inputs are not needed by GS */
6338 } else {
6339 num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
6340 num_vgprs = 8;
6341 }
6342
6343 for (unsigned i = 0; i < num_sgprs; ++i) {
6344 add_arg(&fninfo, ARG_SGPR, ctx->i32);
6345 returns[i] = ctx->i32;
6346 }
6347
6348 for (unsigned i = 0; i < num_vgprs; ++i) {
6349 add_arg(&fninfo, ARG_VGPR, ctx->i32);
6350 returns[num_sgprs + i] = ctx->f32;
6351 }
6352
6353 /* Create the function. */
6354 si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
6355 &fninfo, 0);
6356 func = ctx->main_fn;
6357
6358 /* Set the full EXEC mask for the prolog, because we are only fiddling
6359 * with registers here. The main shader part will set the correct EXEC
6360 * mask.
6361 */
6362 if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
6363 ac_init_exec_full_mask(&ctx->ac);
6364
6365 /* Copy inputs to outputs. This should be no-op, as the registers match,
6366 * but it will prevent the compiler from overwriting them unintentionally.
6367 */
6368 ret = ctx->return_value;
6369 for (unsigned i = 0; i < num_sgprs; i++) {
6370 LLVMValueRef p = LLVMGetParam(func, i);
6371 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
6372 }
6373 for (unsigned i = 0; i < num_vgprs; i++) {
6374 LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
6375 p = ac_to_float(&ctx->ac, p);
6376 ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
6377 }
6378
6379 if (key->gs_prolog.states.tri_strip_adj_fix) {
6380 /* Remap the input vertices for every other primitive. */
6381 const unsigned gfx6_vtx_params[6] = {
6382 num_sgprs,
6383 num_sgprs + 1,
6384 num_sgprs + 3,
6385 num_sgprs + 4,
6386 num_sgprs + 5,
6387 num_sgprs + 6
6388 };
6389 const unsigned gfx9_vtx_params[3] = {
6390 num_sgprs,
6391 num_sgprs + 1,
6392 num_sgprs + 4,
6393 };
6394 LLVMValueRef vtx_in[6], vtx_out[6];
6395 LLVMValueRef prim_id, rotate;
6396
6397 if (ctx->screen->info.chip_class >= GFX9) {
6398 for (unsigned i = 0; i < 3; i++) {
6399 vtx_in[i*2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16);
6400 vtx_in[i*2+1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16);
6401 }
6402 } else {
6403 for (unsigned i = 0; i < 6; i++)
6404 vtx_in[i] = LLVMGetParam(func, gfx6_vtx_params[i]);
6405 }
6406
6407 prim_id = LLVMGetParam(func, num_sgprs + 2);
6408 rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
6409
6410 for (unsigned i = 0; i < 6; ++i) {
6411 LLVMValueRef base, rotated;
6412 base = vtx_in[i];
6413 rotated = vtx_in[(i + 4) % 6];
6414 vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, "");
6415 }
6416
6417 if (ctx->screen->info.chip_class >= GFX9) {
6418 for (unsigned i = 0; i < 3; i++) {
6419 LLVMValueRef hi, out;
6420
6421 hi = LLVMBuildShl(builder, vtx_out[i*2+1],
6422 LLVMConstInt(ctx->i32, 16, 0), "");
6423 out = LLVMBuildOr(builder, vtx_out[i*2], hi, "");
6424 out = ac_to_float(&ctx->ac, out);
6425 ret = LLVMBuildInsertValue(builder, ret, out,
6426 gfx9_vtx_params[i], "");
6427 }
6428 } else {
6429 for (unsigned i = 0; i < 6; i++) {
6430 LLVMValueRef out;
6431
6432 out = ac_to_float(&ctx->ac, vtx_out[i]);
6433 ret = LLVMBuildInsertValue(builder, ret, out,
6434 gfx6_vtx_params[i], "");
6435 }
6436 }
6437 }
6438
6439 LLVMBuildRet(builder, ret);
6440 }
6441
6442 /**
6443 * Given a list of shader part functions, build a wrapper function that
6444 * runs them in sequence to form a monolithic shader.
6445 */
6446 static void si_build_wrapper_function(struct si_shader_context *ctx,
6447 LLVMValueRef *parts,
6448 unsigned num_parts,
6449 unsigned main_part,
6450 unsigned next_shader_first_part)
6451 {
6452 LLVMBuilderRef builder = ctx->ac.builder;
6453 /* PS epilog has one arg per color component; gfx9 merged shader
6454 * prologs need to forward 32 user SGPRs.
6455 */
6456 struct si_function_info fninfo;
6457 LLVMValueRef initial[64], out[64];
6458 LLVMTypeRef function_type;
6459 unsigned num_first_params;
6460 unsigned num_out, initial_num_out;
6461 MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
6462 MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
6463 unsigned num_sgprs, num_vgprs;
6464 unsigned gprs;
6465 struct lp_build_if_state if_state;
6466
6467 si_init_function_info(&fninfo);
6468
6469 for (unsigned i = 0; i < num_parts; ++i) {
6470 ac_add_function_attr(ctx->ac.context, parts[i], -1,
6471 AC_FUNC_ATTR_ALWAYSINLINE);
6472 LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
6473 }
6474
6475 /* The parameters of the wrapper function correspond to those of the
6476 * first part in terms of SGPRs and VGPRs, but we use the types of the
6477 * main part to get the right types. This is relevant for the
6478 * dereferenceable attribute on descriptor table pointers.
6479 */
6480 num_sgprs = 0;
6481 num_vgprs = 0;
6482
6483 function_type = LLVMGetElementType(LLVMTypeOf(parts[0]));
6484 num_first_params = LLVMCountParamTypes(function_type);
6485
6486 for (unsigned i = 0; i < num_first_params; ++i) {
6487 LLVMValueRef param = LLVMGetParam(parts[0], i);
6488
6489 if (ac_is_sgpr_param(param)) {
6490 assert(num_vgprs == 0);
6491 num_sgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6492 } else {
6493 num_vgprs += ac_get_type_size(LLVMTypeOf(param)) / 4;
6494 }
6495 }
6496
6497 gprs = 0;
6498 while (gprs < num_sgprs + num_vgprs) {
6499 LLVMValueRef param = LLVMGetParam(parts[main_part], fninfo.num_params);
6500 LLVMTypeRef type = LLVMTypeOf(param);
6501 unsigned size = ac_get_type_size(type) / 4;
6502
6503 add_arg(&fninfo, gprs < num_sgprs ? ARG_SGPR : ARG_VGPR, type);
6504
6505 assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
6506 assert(gprs + size <= num_sgprs + num_vgprs &&
6507 (gprs >= num_sgprs || gprs + size <= num_sgprs));
6508
6509 gprs += size;
6510 }
6511
6512 si_create_function(ctx, "wrapper", NULL, 0, &fninfo,
6513 si_get_max_workgroup_size(ctx->shader));
6514
6515 if (is_merged_shader(ctx))
6516 ac_init_exec_full_mask(&ctx->ac);
6517
6518 /* Record the arguments of the function as if they were an output of
6519 * a previous part.
6520 */
6521 num_out = 0;
6522 num_out_sgpr = 0;
6523
6524 for (unsigned i = 0; i < fninfo.num_params; ++i) {
6525 LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
6526 LLVMTypeRef param_type = LLVMTypeOf(param);
6527 LLVMTypeRef out_type = i < fninfo.num_sgpr_params ? ctx->i32 : ctx->f32;
6528 unsigned size = ac_get_type_size(param_type) / 4;
6529
6530 if (size == 1) {
6531 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6532 param = LLVMBuildPtrToInt(builder, param, ctx->i32, "");
6533 param_type = ctx->i32;
6534 }
6535
6536 if (param_type != out_type)
6537 param = LLVMBuildBitCast(builder, param, out_type, "");
6538 out[num_out++] = param;
6539 } else {
6540 LLVMTypeRef vector_type = LLVMVectorType(out_type, size);
6541
6542 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6543 param = LLVMBuildPtrToInt(builder, param, ctx->i64, "");
6544 param_type = ctx->i64;
6545 }
6546
6547 if (param_type != vector_type)
6548 param = LLVMBuildBitCast(builder, param, vector_type, "");
6549
6550 for (unsigned j = 0; j < size; ++j)
6551 out[num_out++] = LLVMBuildExtractElement(
6552 builder, param, LLVMConstInt(ctx->i32, j, 0), "");
6553 }
6554
6555 if (i < fninfo.num_sgpr_params)
6556 num_out_sgpr = num_out;
6557 }
6558
6559 memcpy(initial, out, sizeof(out));
6560 initial_num_out = num_out;
6561 initial_num_out_sgpr = num_out_sgpr;
6562
6563 /* Now chain the parts. */
6564 for (unsigned part = 0; part < num_parts; ++part) {
6565 LLVMValueRef in[48];
6566 LLVMValueRef ret;
6567 LLVMTypeRef ret_type;
6568 unsigned out_idx = 0;
6569 unsigned num_params = LLVMCountParams(parts[part]);
6570
6571 /* Merged shaders are executed conditionally depending
6572 * on the number of enabled threads passed in the input SGPRs. */
6573 if (is_merged_shader(ctx) && part == 0) {
6574 LLVMValueRef ena, count = initial[3];
6575
6576 count = LLVMBuildAnd(builder, count,
6577 LLVMConstInt(ctx->i32, 0x7f, 0), "");
6578 ena = LLVMBuildICmp(builder, LLVMIntULT,
6579 ac_get_thread_id(&ctx->ac), count, "");
6580 lp_build_if(&if_state, &ctx->gallivm, ena);
6581 }
6582
6583 /* Derive arguments for the next part from outputs of the
6584 * previous one.
6585 */
6586 for (unsigned param_idx = 0; param_idx < num_params; ++param_idx) {
6587 LLVMValueRef param;
6588 LLVMTypeRef param_type;
6589 bool is_sgpr;
6590 unsigned param_size;
6591 LLVMValueRef arg = NULL;
6592
6593 param = LLVMGetParam(parts[part], param_idx);
6594 param_type = LLVMTypeOf(param);
6595 param_size = ac_get_type_size(param_type) / 4;
6596 is_sgpr = ac_is_sgpr_param(param);
6597
6598 if (is_sgpr) {
6599 ac_add_function_attr(ctx->ac.context, parts[part],
6600 param_idx + 1, AC_FUNC_ATTR_INREG);
6601 } else if (out_idx < num_out_sgpr) {
6602 /* Skip returned SGPRs the current part doesn't
6603 * declare on the input. */
6604 out_idx = num_out_sgpr;
6605 }
6606
6607 assert(out_idx + param_size <= (is_sgpr ? num_out_sgpr : num_out));
6608
6609 if (param_size == 1)
6610 arg = out[out_idx];
6611 else
6612 arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
6613
6614 if (LLVMTypeOf(arg) != param_type) {
6615 if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
6616 if (LLVMGetPointerAddressSpace(param_type) ==
6617 AC_ADDR_SPACE_CONST_32BIT) {
6618 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
6619 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6620 } else {
6621 arg = LLVMBuildBitCast(builder, arg, ctx->i64, "");
6622 arg = LLVMBuildIntToPtr(builder, arg, param_type, "");
6623 }
6624 } else {
6625 arg = LLVMBuildBitCast(builder, arg, param_type, "");
6626 }
6627 }
6628
6629 in[param_idx] = arg;
6630 out_idx += param_size;
6631 }
6632
6633 ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
6634
6635 if (is_merged_shader(ctx) &&
6636 part + 1 == next_shader_first_part) {
6637 lp_build_endif(&if_state);
6638
6639 /* The second half of the merged shader should use
6640 * the inputs from the toplevel (wrapper) function,
6641 * not the return value from the last call.
6642 *
6643 * That's because the last call was executed condi-
6644 * tionally, so we can't consume it in the main
6645 * block.
6646 */
6647 memcpy(out, initial, sizeof(initial));
6648 num_out = initial_num_out;
6649 num_out_sgpr = initial_num_out_sgpr;
6650 continue;
6651 }
6652
6653 /* Extract the returned GPRs. */
6654 ret_type = LLVMTypeOf(ret);
6655 num_out = 0;
6656 num_out_sgpr = 0;
6657
6658 if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
6659 assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
6660
6661 unsigned ret_size = LLVMCountStructElementTypes(ret_type);
6662
6663 for (unsigned i = 0; i < ret_size; ++i) {
6664 LLVMValueRef val =
6665 LLVMBuildExtractValue(builder, ret, i, "");
6666
6667 assert(num_out < ARRAY_SIZE(out));
6668 out[num_out++] = val;
6669
6670 if (LLVMTypeOf(val) == ctx->i32) {
6671 assert(num_out_sgpr + 1 == num_out);
6672 num_out_sgpr = num_out;
6673 }
6674 }
6675 }
6676 }
6677
6678 LLVMBuildRetVoid(builder);
6679 }
6680
6681 static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
6682 struct si_shader_selector *sel)
6683 {
6684 if (!compiler->low_opt_passes)
6685 return false;
6686
6687 /* Assume a slow CPU. */
6688 assert(!sel->screen->info.has_dedicated_vram &&
6689 sel->screen->info.chip_class <= VI);
6690
6691 /* For a crazy dEQP test containing 2597 memory opcodes, mostly
6692 * buffer stores. */
6693 return sel->type == PIPE_SHADER_COMPUTE &&
6694 sel->info.num_memory_instructions > 1000;
6695 }
6696
6697 int si_compile_tgsi_shader(struct si_screen *sscreen,
6698 struct ac_llvm_compiler *compiler,
6699 struct si_shader *shader,
6700 struct pipe_debug_callback *debug)
6701 {
6702 struct si_shader_selector *sel = shader->selector;
6703 struct si_shader_context ctx;
6704 int r = -1;
6705
6706 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6707 * conversion fails. */
6708 if (si_can_dump_shader(sscreen, sel->info.processor) &&
6709 !(sscreen->debug_flags & DBG(NO_TGSI))) {
6710 if (sel->tokens)
6711 tgsi_dump(sel->tokens, 0);
6712 else
6713 nir_print_shader(sel->nir, stderr);
6714 si_dump_streamout(&sel->so);
6715 }
6716
6717 si_init_shader_ctx(&ctx, sscreen, compiler);
6718 si_llvm_context_set_tgsi(&ctx, shader);
6719
6720 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
6721 sizeof(shader->info.vs_output_param_offset));
6722
6723 shader->info.uses_instanceid = sel->info.uses_instanceid;
6724
6725 if (!si_compile_tgsi_main(&ctx)) {
6726 si_llvm_dispose(&ctx);
6727 return -1;
6728 }
6729
6730 if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
6731 LLVMValueRef parts[2];
6732 bool need_prolog = sel->vs_needs_prolog;
6733
6734 parts[1] = ctx.main_fn;
6735
6736 if (need_prolog) {
6737 union si_shader_part_key prolog_key;
6738 si_get_vs_prolog_key(&sel->info,
6739 shader->info.num_input_sgprs,
6740 &shader->key.part.vs.prolog,
6741 shader, &prolog_key);
6742 si_build_vs_prolog_function(&ctx, &prolog_key);
6743 parts[0] = ctx.main_fn;
6744 }
6745
6746 si_build_wrapper_function(&ctx, parts + !need_prolog,
6747 1 + need_prolog, need_prolog, 0);
6748 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
6749 if (sscreen->info.chip_class >= GFX9) {
6750 struct si_shader_selector *ls = shader->key.part.tcs.ls;
6751 LLVMValueRef parts[4];
6752 bool vs_needs_prolog =
6753 si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog);
6754
6755 /* TCS main part */
6756 parts[2] = ctx.main_fn;
6757
6758 /* TCS epilog */
6759 union si_shader_part_key tcs_epilog_key;
6760 memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
6761 tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6762 si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
6763 parts[3] = ctx.main_fn;
6764
6765 /* VS as LS main part */
6766 struct si_shader shader_ls = {};
6767 shader_ls.selector = ls;
6768 shader_ls.key.as_ls = 1;
6769 shader_ls.key.mono = shader->key.mono;
6770 shader_ls.key.opt = shader->key.opt;
6771 shader_ls.is_monolithic = true;
6772 si_llvm_context_set_tgsi(&ctx, &shader_ls);
6773
6774 if (!si_compile_tgsi_main(&ctx)) {
6775 si_llvm_dispose(&ctx);
6776 return -1;
6777 }
6778 shader->info.uses_instanceid |= ls->info.uses_instanceid;
6779 parts[1] = ctx.main_fn;
6780
6781 /* LS prolog */
6782 if (vs_needs_prolog) {
6783 union si_shader_part_key vs_prolog_key;
6784 si_get_vs_prolog_key(&ls->info,
6785 shader_ls.info.num_input_sgprs,
6786 &shader->key.part.tcs.ls_prolog,
6787 shader, &vs_prolog_key);
6788 vs_prolog_key.vs_prolog.is_monolithic = true;
6789 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6790 parts[0] = ctx.main_fn;
6791 }
6792
6793 /* Reset the shader context. */
6794 ctx.shader = shader;
6795 ctx.type = PIPE_SHADER_TESS_CTRL;
6796
6797 si_build_wrapper_function(&ctx,
6798 parts + !vs_needs_prolog,
6799 4 - !vs_needs_prolog, vs_needs_prolog,
6800 vs_needs_prolog ? 2 : 1);
6801 } else {
6802 LLVMValueRef parts[2];
6803 union si_shader_part_key epilog_key;
6804
6805 parts[0] = ctx.main_fn;
6806
6807 memset(&epilog_key, 0, sizeof(epilog_key));
6808 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
6809 si_build_tcs_epilog_function(&ctx, &epilog_key);
6810 parts[1] = ctx.main_fn;
6811
6812 si_build_wrapper_function(&ctx, parts, 2, 0, 0);
6813 }
6814 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
6815 if (ctx.screen->info.chip_class >= GFX9) {
6816 struct si_shader_selector *es = shader->key.part.gs.es;
6817 LLVMValueRef es_prolog = NULL;
6818 LLVMValueRef es_main = NULL;
6819 LLVMValueRef gs_prolog = NULL;
6820 LLVMValueRef gs_main = ctx.main_fn;
6821
6822 /* GS prolog */
6823 union si_shader_part_key gs_prolog_key;
6824 memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
6825 gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6826 gs_prolog_key.gs_prolog.is_monolithic = true;
6827 si_build_gs_prolog_function(&ctx, &gs_prolog_key);
6828 gs_prolog = ctx.main_fn;
6829
6830 /* ES main part */
6831 struct si_shader shader_es = {};
6832 shader_es.selector = es;
6833 shader_es.key.as_es = 1;
6834 shader_es.key.mono = shader->key.mono;
6835 shader_es.key.opt = shader->key.opt;
6836 shader_es.is_monolithic = true;
6837 si_llvm_context_set_tgsi(&ctx, &shader_es);
6838
6839 if (!si_compile_tgsi_main(&ctx)) {
6840 si_llvm_dispose(&ctx);
6841 return -1;
6842 }
6843 shader->info.uses_instanceid |= es->info.uses_instanceid;
6844 es_main = ctx.main_fn;
6845
6846 /* ES prolog */
6847 if (es->vs_needs_prolog) {
6848 union si_shader_part_key vs_prolog_key;
6849 si_get_vs_prolog_key(&es->info,
6850 shader_es.info.num_input_sgprs,
6851 &shader->key.part.gs.vs_prolog,
6852 shader, &vs_prolog_key);
6853 vs_prolog_key.vs_prolog.is_monolithic = true;
6854 si_build_vs_prolog_function(&ctx, &vs_prolog_key);
6855 es_prolog = ctx.main_fn;
6856 }
6857
6858 /* Reset the shader context. */
6859 ctx.shader = shader;
6860 ctx.type = PIPE_SHADER_GEOMETRY;
6861
6862 /* Prepare the array of shader parts. */
6863 LLVMValueRef parts[4];
6864 unsigned num_parts = 0, main_part, next_first_part;
6865
6866 if (es_prolog)
6867 parts[num_parts++] = es_prolog;
6868
6869 parts[main_part = num_parts++] = es_main;
6870 parts[next_first_part = num_parts++] = gs_prolog;
6871 parts[num_parts++] = gs_main;
6872
6873 si_build_wrapper_function(&ctx, parts, num_parts,
6874 main_part, next_first_part);
6875 } else {
6876 LLVMValueRef parts[2];
6877 union si_shader_part_key prolog_key;
6878
6879 parts[1] = ctx.main_fn;
6880
6881 memset(&prolog_key, 0, sizeof(prolog_key));
6882 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
6883 si_build_gs_prolog_function(&ctx, &prolog_key);
6884 parts[0] = ctx.main_fn;
6885
6886 si_build_wrapper_function(&ctx, parts, 2, 1, 0);
6887 }
6888 } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
6889 LLVMValueRef parts[3];
6890 union si_shader_part_key prolog_key;
6891 union si_shader_part_key epilog_key;
6892 bool need_prolog;
6893
6894 si_get_ps_prolog_key(shader, &prolog_key, false);
6895 need_prolog = si_need_ps_prolog(&prolog_key);
6896
6897 parts[need_prolog ? 1 : 0] = ctx.main_fn;
6898
6899 if (need_prolog) {
6900 si_build_ps_prolog_function(&ctx, &prolog_key);
6901 parts[0] = ctx.main_fn;
6902 }
6903
6904 si_get_ps_epilog_key(shader, &epilog_key);
6905 si_build_ps_epilog_function(&ctx, &epilog_key);
6906 parts[need_prolog ? 2 : 1] = ctx.main_fn;
6907
6908 si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
6909 need_prolog ? 1 : 0, 0);
6910 }
6911
6912 si_llvm_optimize_module(&ctx);
6913
6914 /* Post-optimization transformations and analysis. */
6915 si_optimize_vs_outputs(&ctx);
6916
6917 if ((debug && debug->debug_message) ||
6918 si_can_dump_shader(sscreen, ctx.type)) {
6919 ctx.shader->config.private_mem_vgprs =
6920 ac_count_scratch_private_memory(ctx.main_fn);
6921 }
6922
6923 /* Make sure the input is a pointer and not integer followed by inttoptr. */
6924 assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) ==
6925 LLVMPointerTypeKind);
6926
6927 /* Compile to bytecode. */
6928 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
6929 ctx.ac.module, debug, ctx.type,
6930 si_get_shader_name(shader, ctx.type),
6931 si_should_optimize_less(compiler, shader->selector));
6932 si_llvm_dispose(&ctx);
6933 if (r) {
6934 fprintf(stderr, "LLVM failed to compile shader\n");
6935 return r;
6936 }
6937
6938 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6939 * LLVM 3.9svn has this bug.
6940 */
6941 if (sel->type == PIPE_SHADER_COMPUTE) {
6942 unsigned wave_size = 64;
6943 unsigned max_vgprs = 256;
6944 unsigned max_sgprs = sscreen->info.chip_class >= VI ? 800 : 512;
6945 unsigned max_sgprs_per_wave = 128;
6946 unsigned max_block_threads = si_get_max_workgroup_size(shader);
6947 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6948 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6949
6950 max_vgprs = max_vgprs / min_waves_per_simd;
6951 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6952
6953 if (shader->config.num_sgprs > max_sgprs ||
6954 shader->config.num_vgprs > max_vgprs) {
6955 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6956 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6957 shader->config.num_sgprs, shader->config.num_vgprs,
6958 max_sgprs, max_vgprs);
6959
6960 /* Just terminate the process, because dependent
6961 * shaders can hang due to bad input data, but use
6962 * the env var to allow shader-db to work.
6963 */
6964 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6965 abort();
6966 }
6967 }
6968
6969 /* Add the scratch offset to input SGPRs. */
6970 if (shader->config.scratch_bytes_per_wave && !is_merged_shader(&ctx))
6971 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6972
6973 /* Calculate the number of fragment input VGPRs. */
6974 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6975 shader->info.num_input_vgprs = 0;
6976 shader->info.face_vgpr_index = -1;
6977 shader->info.ancillary_vgpr_index = -1;
6978
6979 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6980 shader->info.num_input_vgprs += 2;
6981 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6982 shader->info.num_input_vgprs += 2;
6983 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6984 shader->info.num_input_vgprs += 2;
6985 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6986 shader->info.num_input_vgprs += 3;
6987 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6988 shader->info.num_input_vgprs += 2;
6989 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6990 shader->info.num_input_vgprs += 2;
6991 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6992 shader->info.num_input_vgprs += 2;
6993 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6994 shader->info.num_input_vgprs += 1;
6995 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6996 shader->info.num_input_vgprs += 1;
6997 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6998 shader->info.num_input_vgprs += 1;
6999 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
7000 shader->info.num_input_vgprs += 1;
7001 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
7002 shader->info.num_input_vgprs += 1;
7003 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
7004 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
7005 shader->info.num_input_vgprs += 1;
7006 }
7007 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)) {
7008 shader->info.ancillary_vgpr_index = shader->info.num_input_vgprs;
7009 shader->info.num_input_vgprs += 1;
7010 }
7011 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
7012 shader->info.num_input_vgprs += 1;
7013 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
7014 shader->info.num_input_vgprs += 1;
7015 }
7016
7017 si_calculate_max_simd_waves(shader);
7018 si_shader_dump_stats_for_shader_db(shader, debug);
7019 return 0;
7020 }
7021
7022 /**
7023 * Create, compile and return a shader part (prolog or epilog).
7024 *
7025 * \param sscreen screen
7026 * \param list list of shader parts of the same category
7027 * \param type shader type
7028 * \param key shader part key
7029 * \param prolog whether the part being requested is a prolog
7030 * \param tm LLVM target machine
7031 * \param debug debug callback
7032 * \param build the callback responsible for building the main function
7033 * \return non-NULL on success
7034 */
7035 static struct si_shader_part *
7036 si_get_shader_part(struct si_screen *sscreen,
7037 struct si_shader_part **list,
7038 enum pipe_shader_type type,
7039 bool prolog,
7040 union si_shader_part_key *key,
7041 struct ac_llvm_compiler *compiler,
7042 struct pipe_debug_callback *debug,
7043 void (*build)(struct si_shader_context *,
7044 union si_shader_part_key *),
7045 const char *name)
7046 {
7047 struct si_shader_part *result;
7048
7049 mtx_lock(&sscreen->shader_parts_mutex);
7050
7051 /* Find existing. */
7052 for (result = *list; result; result = result->next) {
7053 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
7054 mtx_unlock(&sscreen->shader_parts_mutex);
7055 return result;
7056 }
7057 }
7058
7059 /* Compile a new one. */
7060 result = CALLOC_STRUCT(si_shader_part);
7061 result->key = *key;
7062
7063 struct si_shader shader = {};
7064 struct si_shader_context ctx;
7065
7066 si_init_shader_ctx(&ctx, sscreen, compiler);
7067 ctx.shader = &shader;
7068 ctx.type = type;
7069
7070 switch (type) {
7071 case PIPE_SHADER_VERTEX:
7072 shader.key.as_ls = key->vs_prolog.as_ls;
7073 shader.key.as_es = key->vs_prolog.as_es;
7074 break;
7075 case PIPE_SHADER_TESS_CTRL:
7076 assert(!prolog);
7077 shader.key.part.tcs.epilog = key->tcs_epilog.states;
7078 break;
7079 case PIPE_SHADER_GEOMETRY:
7080 assert(prolog);
7081 break;
7082 case PIPE_SHADER_FRAGMENT:
7083 if (prolog)
7084 shader.key.part.ps.prolog = key->ps_prolog.states;
7085 else
7086 shader.key.part.ps.epilog = key->ps_epilog.states;
7087 break;
7088 default:
7089 unreachable("bad shader part");
7090 }
7091
7092 build(&ctx, key);
7093
7094 /* Compile. */
7095 si_llvm_optimize_module(&ctx);
7096
7097 if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
7098 ctx.ac.module, debug, ctx.type, name, false)) {
7099 FREE(result);
7100 result = NULL;
7101 goto out;
7102 }
7103
7104 result->next = *list;
7105 *list = result;
7106
7107 out:
7108 si_llvm_dispose(&ctx);
7109 mtx_unlock(&sscreen->shader_parts_mutex);
7110 return result;
7111 }
7112
7113 static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
7114 {
7115 LLVMValueRef ptr[2], list;
7116 bool merged_shader = is_merged_shader(ctx);
7117
7118 ptr[0] = LLVMGetParam(ctx->main_fn, (merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
7119 list = LLVMBuildIntToPtr(ctx->ac.builder, ptr[0],
7120 ac_array_in_const32_addr_space(ctx->v4i32), "");
7121 return list;
7122 }
7123
7124 /**
7125 * Build the vertex shader prolog function.
7126 *
7127 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
7128 * All inputs are returned unmodified. The vertex load indices are
7129 * stored after them, which will be used by the API VS for fetching inputs.
7130 *
7131 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7132 * input_v0,
7133 * input_v1,
7134 * input_v2,
7135 * input_v3,
7136 * (VertexID + BaseVertex),
7137 * (InstanceID + StartInstance),
7138 * (InstanceID / 2 + StartInstance)
7139 */
7140 static void si_build_vs_prolog_function(struct si_shader_context *ctx,
7141 union si_shader_part_key *key)
7142 {
7143 struct si_function_info fninfo;
7144 LLVMTypeRef *returns;
7145 LLVMValueRef ret, func;
7146 int num_returns, i;
7147 unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
7148 unsigned num_input_vgprs = key->vs_prolog.num_merged_next_stage_vgprs + 4;
7149 LLVMValueRef input_vgprs[9];
7150 unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs +
7151 num_input_vgprs;
7152 unsigned user_sgpr_base = key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0;
7153
7154 si_init_function_info(&fninfo);
7155
7156 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
7157 returns = alloca((num_all_input_regs + key->vs_prolog.last_input + 1) *
7158 sizeof(LLVMTypeRef));
7159 num_returns = 0;
7160
7161 /* Declare input and output SGPRs. */
7162 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7163 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7164 returns[num_returns++] = ctx->i32;
7165 }
7166
7167 /* Preloaded VGPRs (outputs must be floats) */
7168 for (i = 0; i < num_input_vgprs; i++) {
7169 add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &input_vgprs[i]);
7170 returns[num_returns++] = ctx->f32;
7171 }
7172
7173 /* Vertex load indices. */
7174 for (i = 0; i <= key->vs_prolog.last_input; i++)
7175 returns[num_returns++] = ctx->f32;
7176
7177 /* Create the function. */
7178 si_create_function(ctx, "vs_prolog", returns, num_returns, &fninfo, 0);
7179 func = ctx->main_fn;
7180
7181 if (key->vs_prolog.num_merged_next_stage_vgprs) {
7182 if (!key->vs_prolog.is_monolithic)
7183 si_init_exec_from_input(ctx, 3, 0);
7184
7185 if (key->vs_prolog.as_ls &&
7186 ctx->screen->has_ls_vgpr_init_bug) {
7187 /* If there are no HS threads, SPI loads the LS VGPRs
7188 * starting at VGPR 0. Shift them back to where they
7189 * belong.
7190 */
7191 LLVMValueRef has_hs_threads =
7192 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE,
7193 si_unpack_param(ctx, 3, 8, 8),
7194 ctx->i32_0, "");
7195
7196 for (i = 4; i > 0; --i) {
7197 input_vgprs[i + 1] =
7198 LLVMBuildSelect(ctx->ac.builder, has_hs_threads,
7199 input_vgprs[i + 1],
7200 input_vgprs[i - 1], "");
7201 }
7202 }
7203 }
7204
7205 ctx->abi.vertex_id = input_vgprs[first_vs_vgpr];
7206 ctx->abi.instance_id = input_vgprs[first_vs_vgpr + (key->vs_prolog.as_ls ? 2 : 1)];
7207
7208 /* Copy inputs to outputs. This should be no-op, as the registers match,
7209 * but it will prevent the compiler from overwriting them unintentionally.
7210 */
7211 ret = ctx->return_value;
7212 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7213 LLVMValueRef p = LLVMGetParam(func, i);
7214 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7215 }
7216 for (i = 0; i < num_input_vgprs; i++) {
7217 LLVMValueRef p = input_vgprs[i];
7218 p = ac_to_float(&ctx->ac, p);
7219 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p,
7220 key->vs_prolog.num_input_sgprs + i, "");
7221 }
7222
7223 /* Compute vertex load indices from instance divisors. */
7224 LLVMValueRef instance_divisor_constbuf = NULL;
7225
7226 if (key->vs_prolog.states.instance_divisor_is_fetched) {
7227 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7228 LLVMValueRef buf_index =
7229 LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
7230 instance_divisor_constbuf =
7231 ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
7232 }
7233
7234 for (i = 0; i <= key->vs_prolog.last_input; i++) {
7235 bool divisor_is_one =
7236 key->vs_prolog.states.instance_divisor_is_one & (1u << i);
7237 bool divisor_is_fetched =
7238 key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
7239 LLVMValueRef index = NULL;
7240
7241 if (divisor_is_one) {
7242 index = ctx->abi.instance_id;
7243 } else if (divisor_is_fetched) {
7244 LLVMValueRef udiv_factors[4];
7245
7246 for (unsigned j = 0; j < 4; j++) {
7247 udiv_factors[j] =
7248 buffer_load_const(ctx, instance_divisor_constbuf,
7249 LLVMConstInt(ctx->i32, i*16 + j*4, 0));
7250 udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
7251 }
7252 /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
7253 * Such InstanceID might not be achievable in a reasonable time though.
7254 */
7255 index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
7256 udiv_factors[0], udiv_factors[1],
7257 udiv_factors[2], udiv_factors[3]);
7258 }
7259
7260 if (divisor_is_one || divisor_is_fetched) {
7261 /* Add StartInstance. */
7262 index = LLVMBuildAdd(ctx->ac.builder, index,
7263 LLVMGetParam(ctx->main_fn, user_sgpr_base +
7264 SI_SGPR_START_INSTANCE), "");
7265 } else {
7266 /* VertexID + BaseVertex */
7267 index = LLVMBuildAdd(ctx->ac.builder,
7268 ctx->abi.vertex_id,
7269 LLVMGetParam(func, user_sgpr_base +
7270 SI_SGPR_BASE_VERTEX), "");
7271 }
7272
7273 index = ac_to_float(&ctx->ac, index);
7274 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
7275 fninfo.num_params + i, "");
7276 }
7277
7278 si_llvm_build_ret(ctx, ret);
7279 }
7280
7281 static bool si_get_vs_prolog(struct si_screen *sscreen,
7282 struct ac_llvm_compiler *compiler,
7283 struct si_shader *shader,
7284 struct pipe_debug_callback *debug,
7285 struct si_shader *main_part,
7286 const struct si_vs_prolog_bits *key)
7287 {
7288 struct si_shader_selector *vs = main_part->selector;
7289
7290 if (!si_vs_needs_prolog(vs, key))
7291 return true;
7292
7293 /* Get the prolog. */
7294 union si_shader_part_key prolog_key;
7295 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs,
7296 key, shader, &prolog_key);
7297
7298 shader->prolog =
7299 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7300 PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
7301 debug, si_build_vs_prolog_function,
7302 "Vertex Shader Prolog");
7303 return shader->prolog != NULL;
7304 }
7305
7306 /**
7307 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7308 */
7309 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7310 struct ac_llvm_compiler *compiler,
7311 struct si_shader *shader,
7312 struct pipe_debug_callback *debug)
7313 {
7314 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
7315 &shader->key.part.vs.prolog);
7316 }
7317
7318 /**
7319 * Compile the TCS epilog function. This writes tesselation factors to memory
7320 * based on the output primitive type of the tesselator (determined by TES).
7321 */
7322 static void si_build_tcs_epilog_function(struct si_shader_context *ctx,
7323 union si_shader_part_key *key)
7324 {
7325 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7326 struct si_function_info fninfo;
7327 LLVMValueRef func;
7328
7329 si_init_function_info(&fninfo);
7330
7331 if (ctx->screen->info.chip_class >= GFX9) {
7332 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7333 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7334 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7335 add_arg(&fninfo, ARG_SGPR, ctx->i32); /* wave info */
7336 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7337 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7338 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7339 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7340 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7341 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7342 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7343 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7344 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7345 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7346 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7347 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7348 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7349 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7350 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7351 } else {
7352 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7353 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7354 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7355 add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7356 ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7357 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7358 ctx->param_tcs_out_lds_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7359 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7360 ctx->param_tcs_offchip_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7361 ctx->param_tcs_factor_offset = add_arg(&fninfo, ARG_SGPR, ctx->i32);
7362 }
7363
7364 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7365 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* VGPR gap */
7366 unsigned tess_factors_idx =
7367 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* patch index within the wave (REL_PATCH_ID) */
7368 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */
7369 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */
7370
7371 for (unsigned i = 0; i < 6; i++)
7372 add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
7373
7374 /* Create the function. */
7375 si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
7376 ctx->screen->info.chip_class >= CIK ? 128 : 64);
7377 ac_declare_lds_as_pointer(&ctx->ac);
7378 func = ctx->main_fn;
7379
7380 LLVMValueRef invoc0_tess_factors[6];
7381 for (unsigned i = 0; i < 6; i++)
7382 invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i);
7383
7384 si_write_tess_factors(bld_base,
7385 LLVMGetParam(func, tess_factors_idx),
7386 LLVMGetParam(func, tess_factors_idx + 1),
7387 LLVMGetParam(func, tess_factors_idx + 2),
7388 invoc0_tess_factors, invoc0_tess_factors + 4);
7389
7390 LLVMBuildRetVoid(ctx->ac.builder);
7391 }
7392
7393 /**
7394 * Select and compile (or reuse) TCS parts (epilog).
7395 */
7396 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7397 struct ac_llvm_compiler *compiler,
7398 struct si_shader *shader,
7399 struct pipe_debug_callback *debug)
7400 {
7401 if (sscreen->info.chip_class >= GFX9) {
7402 struct si_shader *ls_main_part =
7403 shader->key.part.tcs.ls->main_shader_part_ls;
7404
7405 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
7406 &shader->key.part.tcs.ls_prolog))
7407 return false;
7408
7409 shader->previous_stage = ls_main_part;
7410 }
7411
7412 /* Get the epilog. */
7413 union si_shader_part_key epilog_key;
7414 memset(&epilog_key, 0, sizeof(epilog_key));
7415 epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
7416
7417 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7418 PIPE_SHADER_TESS_CTRL, false,
7419 &epilog_key, compiler, debug,
7420 si_build_tcs_epilog_function,
7421 "Tessellation Control Shader Epilog");
7422 return shader->epilog != NULL;
7423 }
7424
7425 /**
7426 * Select and compile (or reuse) GS parts (prolog).
7427 */
7428 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
7429 struct ac_llvm_compiler *compiler,
7430 struct si_shader *shader,
7431 struct pipe_debug_callback *debug)
7432 {
7433 if (sscreen->info.chip_class >= GFX9) {
7434 struct si_shader *es_main_part =
7435 shader->key.part.gs.es->main_shader_part_es;
7436
7437 if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
7438 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
7439 &shader->key.part.gs.vs_prolog))
7440 return false;
7441
7442 shader->previous_stage = es_main_part;
7443 }
7444
7445 if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
7446 return true;
7447
7448 union si_shader_part_key prolog_key;
7449 memset(&prolog_key, 0, sizeof(prolog_key));
7450 prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
7451
7452 shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
7453 PIPE_SHADER_GEOMETRY, true,
7454 &prolog_key, compiler, debug,
7455 si_build_gs_prolog_function,
7456 "Geometry Shader Prolog");
7457 return shader->prolog2 != NULL;
7458 }
7459
7460 /**
7461 * Build the pixel shader prolog function. This handles:
7462 * - two-side color selection and interpolation
7463 * - overriding interpolation parameters for the API PS
7464 * - polygon stippling
7465 *
7466 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7467 * overriden by other states. (e.g. per-sample interpolation)
7468 * Interpolated colors are stored after the preloaded VGPRs.
7469 */
7470 static void si_build_ps_prolog_function(struct si_shader_context *ctx,
7471 union si_shader_part_key *key)
7472 {
7473 struct si_function_info fninfo;
7474 LLVMValueRef ret, func;
7475 int num_returns, i, num_color_channels;
7476
7477 assert(si_need_ps_prolog(key));
7478
7479 si_init_function_info(&fninfo);
7480
7481 /* Declare inputs. */
7482 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7483 add_arg(&fninfo, ARG_SGPR, ctx->i32);
7484
7485 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7486 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7487
7488 /* Declare outputs (same as inputs + add colors if needed) */
7489 num_returns = fninfo.num_params;
7490 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7491 for (i = 0; i < num_color_channels; i++)
7492 fninfo.types[num_returns++] = ctx->f32;
7493
7494 /* Create the function. */
7495 si_create_function(ctx, "ps_prolog", fninfo.types, num_returns,
7496 &fninfo, 0);
7497 func = ctx->main_fn;
7498
7499 /* Copy inputs to outputs. This should be no-op, as the registers match,
7500 * but it will prevent the compiler from overwriting them unintentionally.
7501 */
7502 ret = ctx->return_value;
7503 for (i = 0; i < fninfo.num_params; i++) {
7504 LLVMValueRef p = LLVMGetParam(func, i);
7505 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
7506 }
7507
7508 /* Polygon stippling. */
7509 if (key->ps_prolog.states.poly_stipple) {
7510 /* POS_FIXED_PT is always last. */
7511 unsigned pos = key->ps_prolog.num_input_sgprs +
7512 key->ps_prolog.num_input_vgprs - 1;
7513 LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
7514
7515 si_llvm_emit_polygon_stipple(ctx, list, pos);
7516 }
7517
7518 if (key->ps_prolog.states.bc_optimize_for_persp ||
7519 key->ps_prolog.states.bc_optimize_for_linear) {
7520 unsigned i, base = key->ps_prolog.num_input_sgprs;
7521 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7522
7523 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7524 * The hw doesn't compute CENTROID if the whole wave only
7525 * contains fully-covered quads.
7526 *
7527 * PRIM_MASK is after user SGPRs.
7528 */
7529 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7530 bc_optimize = LLVMBuildLShr(ctx->ac.builder, bc_optimize,
7531 LLVMConstInt(ctx->i32, 31, 0), "");
7532 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize,
7533 ctx->i1, "");
7534
7535 if (key->ps_prolog.states.bc_optimize_for_persp) {
7536 /* Read PERSP_CENTER. */
7537 for (i = 0; i < 2; i++)
7538 center[i] = LLVMGetParam(func, base + 2 + i);
7539 /* Read PERSP_CENTROID. */
7540 for (i = 0; i < 2; i++)
7541 centroid[i] = LLVMGetParam(func, base + 4 + i);
7542 /* Select PERSP_CENTROID. */
7543 for (i = 0; i < 2; i++) {
7544 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7545 center[i], centroid[i], "");
7546 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7547 tmp, base + 4 + i, "");
7548 }
7549 }
7550 if (key->ps_prolog.states.bc_optimize_for_linear) {
7551 /* Read LINEAR_CENTER. */
7552 for (i = 0; i < 2; i++)
7553 center[i] = LLVMGetParam(func, base + 8 + i);
7554 /* Read LINEAR_CENTROID. */
7555 for (i = 0; i < 2; i++)
7556 centroid[i] = LLVMGetParam(func, base + 10 + i);
7557 /* Select LINEAR_CENTROID. */
7558 for (i = 0; i < 2; i++) {
7559 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize,
7560 center[i], centroid[i], "");
7561 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7562 tmp, base + 10 + i, "");
7563 }
7564 }
7565 }
7566
7567 /* Force per-sample interpolation. */
7568 if (key->ps_prolog.states.force_persp_sample_interp) {
7569 unsigned i, base = key->ps_prolog.num_input_sgprs;
7570 LLVMValueRef persp_sample[2];
7571
7572 /* Read PERSP_SAMPLE. */
7573 for (i = 0; i < 2; i++)
7574 persp_sample[i] = LLVMGetParam(func, base + i);
7575 /* Overwrite PERSP_CENTER. */
7576 for (i = 0; i < 2; i++)
7577 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7578 persp_sample[i], base + 2 + i, "");
7579 /* Overwrite PERSP_CENTROID. */
7580 for (i = 0; i < 2; i++)
7581 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7582 persp_sample[i], base + 4 + i, "");
7583 }
7584 if (key->ps_prolog.states.force_linear_sample_interp) {
7585 unsigned i, base = key->ps_prolog.num_input_sgprs;
7586 LLVMValueRef linear_sample[2];
7587
7588 /* Read LINEAR_SAMPLE. */
7589 for (i = 0; i < 2; i++)
7590 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7591 /* Overwrite LINEAR_CENTER. */
7592 for (i = 0; i < 2; i++)
7593 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7594 linear_sample[i], base + 8 + i, "");
7595 /* Overwrite LINEAR_CENTROID. */
7596 for (i = 0; i < 2; i++)
7597 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7598 linear_sample[i], base + 10 + i, "");
7599 }
7600
7601 /* Force center interpolation. */
7602 if (key->ps_prolog.states.force_persp_center_interp) {
7603 unsigned i, base = key->ps_prolog.num_input_sgprs;
7604 LLVMValueRef persp_center[2];
7605
7606 /* Read PERSP_CENTER. */
7607 for (i = 0; i < 2; i++)
7608 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7609 /* Overwrite PERSP_SAMPLE. */
7610 for (i = 0; i < 2; i++)
7611 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7612 persp_center[i], base + i, "");
7613 /* Overwrite PERSP_CENTROID. */
7614 for (i = 0; i < 2; i++)
7615 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7616 persp_center[i], base + 4 + i, "");
7617 }
7618 if (key->ps_prolog.states.force_linear_center_interp) {
7619 unsigned i, base = key->ps_prolog.num_input_sgprs;
7620 LLVMValueRef linear_center[2];
7621
7622 /* Read LINEAR_CENTER. */
7623 for (i = 0; i < 2; i++)
7624 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7625 /* Overwrite LINEAR_SAMPLE. */
7626 for (i = 0; i < 2; i++)
7627 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7628 linear_center[i], base + 6 + i, "");
7629 /* Overwrite LINEAR_CENTROID. */
7630 for (i = 0; i < 2; i++)
7631 ret = LLVMBuildInsertValue(ctx->ac.builder, ret,
7632 linear_center[i], base + 10 + i, "");
7633 }
7634
7635 /* Interpolate colors. */
7636 unsigned color_out_idx = 0;
7637 for (i = 0; i < 2; i++) {
7638 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7639 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7640 key->ps_prolog.face_vgpr_index;
7641 LLVMValueRef interp[2], color[4];
7642 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7643
7644 if (!writemask)
7645 continue;
7646
7647 /* If the interpolation qualifier is not CONSTANT (-1). */
7648 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7649 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7650 key->ps_prolog.color_interp_vgpr_index[i];
7651
7652 /* Get the (i,j) updated by bc_optimize handling. */
7653 interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7654 interp_vgpr, "");
7655 interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
7656 interp_vgpr + 1, "");
7657 interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
7658 }
7659
7660 /* Use the absolute location of the input. */
7661 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7662
7663 if (key->ps_prolog.states.color_two_side) {
7664 face = LLVMGetParam(func, face_vgpr);
7665 face = ac_to_integer(&ctx->ac, face);
7666 }
7667
7668 interp_fs_input(ctx,
7669 key->ps_prolog.color_attr_index[i],
7670 TGSI_SEMANTIC_COLOR, i,
7671 key->ps_prolog.num_interp_inputs,
7672 key->ps_prolog.colors_read, interp_ij,
7673 prim_mask, face, color);
7674
7675 while (writemask) {
7676 unsigned chan = u_bit_scan(&writemask);
7677 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
7678 fninfo.num_params + color_out_idx++, "");
7679 }
7680 }
7681
7682 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
7683 * says:
7684 *
7685 * "When per-sample shading is active due to the use of a fragment
7686 * input qualified by sample or due to the use of the gl_SampleID
7687 * or gl_SamplePosition variables, only the bit for the current
7688 * sample is set in gl_SampleMaskIn. When state specifies multiple
7689 * fragment shader invocations for a given fragment, the sample
7690 * mask for any single fragment shader invocation may specify a
7691 * subset of the covered samples for the fragment. In this case,
7692 * the bit corresponding to each covered sample will be set in
7693 * exactly one fragment shader invocation."
7694 *
7695 * The samplemask loaded by hardware is always the coverage of the
7696 * entire pixel/fragment, so mask bits out based on the sample ID.
7697 */
7698 if (key->ps_prolog.states.samplemask_log_ps_iter) {
7699 /* The bit pattern matches that used by fixed function fragment
7700 * processing. */
7701 static const uint16_t ps_iter_masks[] = {
7702 0xffff, /* not used */
7703 0x5555,
7704 0x1111,
7705 0x0101,
7706 0x0001,
7707 };
7708 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
7709
7710 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
7711 unsigned ancillary_vgpr = key->ps_prolog.num_input_sgprs +
7712 key->ps_prolog.ancillary_vgpr_index;
7713 LLVMValueRef sampleid = si_unpack_param(ctx, ancillary_vgpr, 8, 4);
7714 LLVMValueRef samplemask = LLVMGetParam(func, ancillary_vgpr + 1);
7715
7716 samplemask = ac_to_integer(&ctx->ac, samplemask);
7717 samplemask = LLVMBuildAnd(
7718 ctx->ac.builder,
7719 samplemask,
7720 LLVMBuildShl(ctx->ac.builder,
7721 LLVMConstInt(ctx->i32, ps_iter_mask, false),
7722 sampleid, ""),
7723 "");
7724 samplemask = ac_to_float(&ctx->ac, samplemask);
7725
7726 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask,
7727 ancillary_vgpr + 1, "");
7728 }
7729
7730 /* Tell LLVM to insert WQM instruction sequence when needed. */
7731 if (key->ps_prolog.wqm) {
7732 LLVMAddTargetDependentFunctionAttr(func,
7733 "amdgpu-ps-wqm-outputs", "");
7734 }
7735
7736 si_llvm_build_ret(ctx, ret);
7737 }
7738
7739 /**
7740 * Build the pixel shader epilog function. This handles everything that must be
7741 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7742 */
7743 static void si_build_ps_epilog_function(struct si_shader_context *ctx,
7744 union si_shader_part_key *key)
7745 {
7746 struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
7747 struct si_function_info fninfo;
7748 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7749 int i;
7750 struct si_ps_exports exp = {};
7751
7752 si_init_function_info(&fninfo);
7753
7754 /* Declare input SGPRs. */
7755 ctx->param_rw_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7756 ctx->param_bindless_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7757 ctx->param_const_and_shader_buffers = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7758 ctx->param_samplers_and_images = add_arg(&fninfo, ARG_SGPR, ctx->ac.intptr);
7759 add_arg_checked(&fninfo, ARG_SGPR, ctx->f32, SI_PARAM_ALPHA_REF);
7760
7761 /* Declare input VGPRs. */
7762 unsigned required_num_params =
7763 fninfo.num_sgpr_params +
7764 util_bitcount(key->ps_epilog.colors_written) * 4 +
7765 key->ps_epilog.writes_z +
7766 key->ps_epilog.writes_stencil +
7767 key->ps_epilog.writes_samplemask;
7768
7769 required_num_params = MAX2(required_num_params,
7770 fninfo.num_sgpr_params + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7771
7772 while (fninfo.num_params < required_num_params)
7773 add_arg(&fninfo, ARG_VGPR, ctx->f32);
7774
7775 /* Create the function. */
7776 si_create_function(ctx, "ps_epilog", NULL, 0, &fninfo, 0);
7777 /* Disable elimination of unused inputs. */
7778 ac_llvm_add_target_dep_function_attr(ctx->main_fn,
7779 "InitialPSInputAddr", 0xffffff);
7780
7781 /* Process colors. */
7782 unsigned vgpr = fninfo.num_sgpr_params;
7783 unsigned colors_written = key->ps_epilog.colors_written;
7784 int last_color_export = -1;
7785
7786 /* Find the last color export. */
7787 if (!key->ps_epilog.writes_z &&
7788 !key->ps_epilog.writes_stencil &&
7789 !key->ps_epilog.writes_samplemask) {
7790 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7791
7792 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7793 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7794 /* Just set this if any of the colorbuffers are enabled. */
7795 if (spi_format &
7796 ((1ull << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7797 last_color_export = 0;
7798 } else {
7799 for (i = 0; i < 8; i++)
7800 if (colors_written & (1 << i) &&
7801 (spi_format >> (i * 4)) & 0xf)
7802 last_color_export = i;
7803 }
7804 }
7805
7806 while (colors_written) {
7807 LLVMValueRef color[4];
7808 int mrt = u_bit_scan(&colors_written);
7809
7810 for (i = 0; i < 4; i++)
7811 color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
7812
7813 si_export_mrt_color(bld_base, color, mrt,
7814 fninfo.num_params - 1,
7815 mrt == last_color_export, &exp);
7816 }
7817
7818 /* Process depth, stencil, samplemask. */
7819 if (key->ps_epilog.writes_z)
7820 depth = LLVMGetParam(ctx->main_fn, vgpr++);
7821 if (key->ps_epilog.writes_stencil)
7822 stencil = LLVMGetParam(ctx->main_fn, vgpr++);
7823 if (key->ps_epilog.writes_samplemask)
7824 samplemask = LLVMGetParam(ctx->main_fn, vgpr++);
7825
7826 if (depth || stencil || samplemask)
7827 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7828 else if (last_color_export == -1)
7829 ac_build_export_null(&ctx->ac);
7830
7831 if (exp.num)
7832 si_emit_ps_exports(ctx, &exp);
7833
7834 /* Compile. */
7835 LLVMBuildRetVoid(ctx->ac.builder);
7836 }
7837
7838 /**
7839 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7840 */
7841 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7842 struct ac_llvm_compiler *compiler,
7843 struct si_shader *shader,
7844 struct pipe_debug_callback *debug)
7845 {
7846 union si_shader_part_key prolog_key;
7847 union si_shader_part_key epilog_key;
7848
7849 /* Get the prolog. */
7850 si_get_ps_prolog_key(shader, &prolog_key, true);
7851
7852 /* The prolog is a no-op if these aren't set. */
7853 if (si_need_ps_prolog(&prolog_key)) {
7854 shader->prolog =
7855 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7856 PIPE_SHADER_FRAGMENT, true,
7857 &prolog_key, compiler, debug,
7858 si_build_ps_prolog_function,
7859 "Fragment Shader Prolog");
7860 if (!shader->prolog)
7861 return false;
7862 }
7863
7864 /* Get the epilog. */
7865 si_get_ps_epilog_key(shader, &epilog_key);
7866
7867 shader->epilog =
7868 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7869 PIPE_SHADER_FRAGMENT, false,
7870 &epilog_key, compiler, debug,
7871 si_build_ps_epilog_function,
7872 "Fragment Shader Epilog");
7873 if (!shader->epilog)
7874 return false;
7875
7876 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7877 if (shader->key.part.ps.prolog.poly_stipple) {
7878 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7879 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7880 }
7881
7882 /* Set up the enable bits for per-sample shading if needed. */
7883 if (shader->key.part.ps.prolog.force_persp_sample_interp &&
7884 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7885 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7886 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7887 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7888 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7889 }
7890 if (shader->key.part.ps.prolog.force_linear_sample_interp &&
7891 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7892 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7893 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7894 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7895 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7896 }
7897 if (shader->key.part.ps.prolog.force_persp_center_interp &&
7898 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7899 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7900 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7901 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7902 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7903 }
7904 if (shader->key.part.ps.prolog.force_linear_center_interp &&
7905 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7906 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7907 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7908 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7909 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7910 }
7911
7912 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7913 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7914 !(shader->config.spi_ps_input_ena & 0xf)) {
7915 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7916 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7917 }
7918
7919 /* At least one pair of interpolation weights must be enabled. */
7920 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7921 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7922 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7923 }
7924
7925 /* Samplemask fixup requires the sample ID. */
7926 if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
7927 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
7928 assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
7929 }
7930
7931 /* The sample mask input is always enabled, because the API shader always
7932 * passes it through to the epilog. Disable it here if it's unused.
7933 */
7934 if (!shader->key.part.ps.epilog.poly_line_smoothing &&
7935 !shader->selector->info.reads_samplemask)
7936 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7937
7938 return true;
7939 }
7940
7941 void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
7942 unsigned *lds_size)
7943 {
7944 /* If tessellation is all offchip and on-chip GS isn't used, this
7945 * workaround is not needed.
7946 */
7947 return;
7948
7949 /* SPI barrier management bug:
7950 * Make sure we have at least 4k of LDS in use to avoid the bug.
7951 * It applies to workgroup sizes of more than one wavefront.
7952 */
7953 if (sscreen->info.family == CHIP_BONAIRE ||
7954 sscreen->info.family == CHIP_KABINI ||
7955 sscreen->info.family == CHIP_MULLINS)
7956 *lds_size = MAX2(*lds_size, 8);
7957 }
7958
7959 static void si_fix_resource_usage(struct si_screen *sscreen,
7960 struct si_shader *shader)
7961 {
7962 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7963
7964 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7965
7966 if (shader->selector->type == PIPE_SHADER_COMPUTE &&
7967 si_get_max_workgroup_size(shader) > 64) {
7968 si_multiwave_lds_size_workaround(sscreen,
7969 &shader->config.lds_size);
7970 }
7971 }
7972
7973 int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
7974 struct si_shader *shader,
7975 struct pipe_debug_callback *debug)
7976 {
7977 struct si_shader_selector *sel = shader->selector;
7978 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
7979 int r;
7980
7981 /* LS, ES, VS are compiled on demand if the main part hasn't been
7982 * compiled for that stage.
7983 *
7984 * Vertex shaders are compiled on demand when a vertex fetch
7985 * workaround must be applied.
7986 */
7987 if (shader->is_monolithic) {
7988 /* Monolithic shader (compiled as a whole, has many variants,
7989 * may take a long time to compile).
7990 */
7991 r = si_compile_tgsi_shader(sscreen, compiler, shader, debug);
7992 if (r)
7993 return r;
7994 } else {
7995 /* The shader consists of several parts:
7996 *
7997 * - the middle part is the user shader, it has 1 variant only
7998 * and it was compiled during the creation of the shader
7999 * selector
8000 * - the prolog part is inserted at the beginning
8001 * - the epilog part is inserted at the end
8002 *
8003 * The prolog and epilog have many (but simple) variants.
8004 *
8005 * Starting with gfx9, geometry and tessellation control
8006 * shaders also contain the prolog and user shader parts of
8007 * the previous shader stage.
8008 */
8009
8010 if (!mainp)
8011 return -1;
8012
8013 /* Copy the compiled TGSI shader data over. */
8014 shader->is_binary_shared = true;
8015 shader->binary = mainp->binary;
8016 shader->config = mainp->config;
8017 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
8018 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
8019 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
8020 shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
8021 memcpy(shader->info.vs_output_param_offset,
8022 mainp->info.vs_output_param_offset,
8023 sizeof(mainp->info.vs_output_param_offset));
8024 shader->info.uses_instanceid = mainp->info.uses_instanceid;
8025 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
8026 shader->info.nr_param_exports = mainp->info.nr_param_exports;
8027
8028 /* Select prologs and/or epilogs. */
8029 switch (sel->type) {
8030 case PIPE_SHADER_VERTEX:
8031 if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
8032 return -1;
8033 break;
8034 case PIPE_SHADER_TESS_CTRL:
8035 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
8036 return -1;
8037 break;
8038 case PIPE_SHADER_TESS_EVAL:
8039 break;
8040 case PIPE_SHADER_GEOMETRY:
8041 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
8042 return -1;
8043 break;
8044 case PIPE_SHADER_FRAGMENT:
8045 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
8046 return -1;
8047
8048 /* Make sure we have at least as many VGPRs as there
8049 * are allocated inputs.
8050 */
8051 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8052 shader->info.num_input_vgprs);
8053 break;
8054 }
8055
8056 /* Update SGPR and VGPR counts. */
8057 if (shader->prolog) {
8058 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8059 shader->prolog->config.num_sgprs);
8060 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8061 shader->prolog->config.num_vgprs);
8062 }
8063 if (shader->previous_stage) {
8064 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8065 shader->previous_stage->config.num_sgprs);
8066 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8067 shader->previous_stage->config.num_vgprs);
8068 shader->config.spilled_sgprs =
8069 MAX2(shader->config.spilled_sgprs,
8070 shader->previous_stage->config.spilled_sgprs);
8071 shader->config.spilled_vgprs =
8072 MAX2(shader->config.spilled_vgprs,
8073 shader->previous_stage->config.spilled_vgprs);
8074 shader->config.private_mem_vgprs =
8075 MAX2(shader->config.private_mem_vgprs,
8076 shader->previous_stage->config.private_mem_vgprs);
8077 shader->config.scratch_bytes_per_wave =
8078 MAX2(shader->config.scratch_bytes_per_wave,
8079 shader->previous_stage->config.scratch_bytes_per_wave);
8080 shader->info.uses_instanceid |=
8081 shader->previous_stage->info.uses_instanceid;
8082 }
8083 if (shader->prolog2) {
8084 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8085 shader->prolog2->config.num_sgprs);
8086 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8087 shader->prolog2->config.num_vgprs);
8088 }
8089 if (shader->epilog) {
8090 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
8091 shader->epilog->config.num_sgprs);
8092 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
8093 shader->epilog->config.num_vgprs);
8094 }
8095 si_calculate_max_simd_waves(shader);
8096 }
8097
8098 si_fix_resource_usage(sscreen, shader);
8099 si_shader_dump(sscreen, shader, debug, sel->info.processor,
8100 stderr, true);
8101
8102 /* Upload. */
8103 r = si_shader_binary_upload(sscreen, shader);
8104 if (r) {
8105 fprintf(stderr, "LLVM failed to upload shader\n");
8106 return r;
8107 }
8108
8109 return 0;
8110 }
8111
8112 void si_shader_destroy(struct si_shader *shader)
8113 {
8114 if (shader->scratch_bo)
8115 r600_resource_reference(&shader->scratch_bo, NULL);
8116
8117 r600_resource_reference(&shader->bo, NULL);
8118
8119 if (!shader->is_binary_shared)
8120 ac_shader_binary_clean(&shader->binary);
8121
8122 free(shader->shader_log);
8123 }