radeonsi: add empty lines after shader stats
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "gallivm/lp_bld_misc.h"
37 #include "radeon/r600_cs.h"
38 #include "radeon/radeon_llvm.h"
39 #include "radeon/radeon_elf_util.h"
40 #include "radeon/radeon_llvm_emit.h"
41 #include "util/u_memory.h"
42 #include "util/u_pstipple.h"
43 #include "util/u_string.h"
44 #include "tgsi/tgsi_parse.h"
45 #include "tgsi/tgsi_build.h"
46 #include "tgsi/tgsi_util.h"
47 #include "tgsi/tgsi_dump.h"
48
49 #include "si_pipe.h"
50 #include "si_shader.h"
51 #include "sid.h"
52
53 #include <errno.h>
54
/* ELF symbol names for the two dwords of the scratch buffer resource
 * descriptor; presumably resolved/patched at binary upload time — the
 * resolution code is not visible in this chunk.
 */
55 static const char *scratch_rsrc_dword0_symbol =
56 "SCRATCH_RSRC_DWORD0";
57 
58 static const char *scratch_rsrc_dword1_symbol =
59 "SCRATCH_RSRC_DWORD1";
60
/* One shader output: up to four channel values plus the TGSI semantic
 * name and semantic index that identify it.
 */
61 struct si_shader_output_values
62 {
63 LLVMValueRef values[4];
64 unsigned name;
65 unsigned sid;
66 };
67
/* Per-compilation state for translating one TGSI shader into LLVM IR.
 * Embeds radeon_llvm_context as its first member, which is what makes
 * the si_shader_context() downcast below work.
 */
68 struct si_shader_context
69 {
70 struct radeon_llvm_context radeon_bld;
71 struct si_shader *shader;
72 struct si_screen *screen;
73 
74 unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
75 bool is_gs_copy_shader;
76 
77 /* Whether to generate the optimized shader variant compiled as a whole
78 * (without a prolog and epilog)
79 */
80 bool is_monolithic;
81 
/* Indices of parameters of the LLVM main function; the signature
 * setup that assigns them is not visible in this chunk.
 */
82 int param_streamout_config;
83 int param_streamout_write_index;
84 int param_streamout_offset[4];
85 int param_vertex_id;
86 int param_rel_auto_id;
87 int param_vs_prim_id;
88 int param_instance_id;
89 int param_vertex_index0;
90 int param_tes_u;
91 int param_tes_v;
92 int param_tes_rel_patch_id;
93 int param_tes_patch_id;
94 int param_es2gs_offset;
95 int param_oc_lds;
96 
97 /* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
98 * 0x800000 for VS, 0x1 for ES.
99 */
100 int param_tess_offchip;
101 
102 LLVMTargetMachineRef tm;
103 
/* Metadata kind IDs plus an empty metadata node, used to tag loads as
 * invariant/uniform (see build_indexed_load{,_const} below). */
104 unsigned invariant_load_md_kind;
105 unsigned range_md_kind;
106 unsigned uniform_md_kind;
107 LLVMValueRef empty_md;
108 
/* Cached LLVM values for descriptors, rings, and other resources. */
109 LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
110 LLVMValueRef lds;
111 LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
112 LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
113 LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
114 LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
115 LLVMValueRef fmasks[SI_NUM_SAMPLERS];
116 LLVMValueRef images[SI_NUM_IMAGES];
117 LLVMValueRef so_buffers[4];
118 LLVMValueRef esgs_ring;
119 LLVMValueRef gsvs_ring[4];
120 LLVMValueRef gs_next_vertex[4];
121 LLVMValueRef return_value;
122 
/* Frequently used LLVM types, cached once per context. */
123 LLVMTypeRef voidt;
124 LLVMTypeRef i1;
125 LLVMTypeRef i8;
126 LLVMTypeRef i32;
127 LLVMTypeRef i64;
128 LLVMTypeRef i128;
129 LLVMTypeRef f32;
130 LLVMTypeRef v16i8;
131 LLVMTypeRef v2i32;
132 LLVMTypeRef v4i32;
133 LLVMTypeRef v4f32;
134 LLVMTypeRef v8i32;
135 
136 LLVMValueRef shared_memory;
137 };
138
/* Downcast the generic TGSI build context to the radeonsi context that
 * embeds it.  Same layout assumption as a direct pointer cast: the
 * lp_build_tgsi_context is presumed to live at offset 0 of
 * si_shader_context (via radeon_bld) — TODO confirm against the
 * radeon_llvm_context definition. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	void *ptr = bld_base;

	return (struct si_shader_context *)ptr;
}
144
/* Forward declarations for helpers defined later in this file. */
145 static void si_init_shader_ctx(struct si_shader_context *ctx,
146 struct si_screen *sscreen,
147 struct si_shader *shader,
148 LLVMTargetMachineRef tm);
149 
150 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
151 struct lp_build_tgsi_context *bld_base,
152 struct lp_build_emit_data *emit_data);
153 
154 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
155 FILE *f);
157 /* Ideally pass the sample mask input to the PS epilog as v13, which
158 * is its usual location, so that the shader doesn't have to add v_mov.
159 */
160 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
161 
162 /* The VS location of the PrimitiveID input is the same in the epilog,
163 * so that the main shader part doesn't have to move it.
164 */
165 #define VS_EPILOG_PRIMID_LOC 2
166 
/* Interpolation parameter bases/offsets for fragment shader inputs. */
167 #define PERSPECTIVE_BASE 0
168 #define LINEAR_BASE 9
169 
170 #define SAMPLE_OFFSET 0
171 #define CENTER_OFFSET 2
/* NOTE(review): "OFSET" is a typo, kept because renaming the macro
 * could break uses elsewhere in the tree not visible here. */
172 #define CENTROID_OFSET 4
173 
/* LLVM address spaces used when building pointers. */
174 #define USE_SGPR_MAX_SUFFIX_LEN 5
175 #define CONST_ADDR_SPACE 2
176 #define LOCAL_ADDR_SPACE 3
177 #define USER_SGPR_ADDR_SPACE 8
178 
179 
/* s_sendmsg message types and GS opcode fields (opcode in bits 4..5). */
180 #define SENDMSG_GS 2
181 #define SENDMSG_GS_DONE 3
182 
183 #define SENDMSG_GS_OP_NOP (0 << 4)
184 #define SENDMSG_GS_OP_CUT (1 << 4)
185 #define SENDMSG_GS_OP_EMIT (2 << 4)
186 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
187
188 /**
189 * Returns a unique index for a semantic name and index. The index must be
190 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
191 * calculated.
192 */
193 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
194 {
195 switch (semantic_name) {
196 case TGSI_SEMANTIC_POSITION:
197 return 0;
198 case TGSI_SEMANTIC_PSIZE:
199 return 1;
200 case TGSI_SEMANTIC_CLIPDIST:
201 assert(index <= 1);
202 return 2 + index;
203 case TGSI_SEMANTIC_GENERIC:
204 if (index <= 63-4)
205 return 4 + index;
206 else
207 /* same explanation as in the default statement,
208 * the only user hitting this is st/nine.
209 */
210 return 0;
211
212 /* patch indices are completely separate and thus start from 0 */
213 case TGSI_SEMANTIC_TESSOUTER:
214 return 0;
215 case TGSI_SEMANTIC_TESSINNER:
216 return 1;
217 case TGSI_SEMANTIC_PATCH:
218 return 2 + index;
219
220 default:
221 /* Don't fail here. The result of this function is only used
222 * for LS, TCS, TES, and GS, where legacy GL semantics can't
223 * occur, but this function is called for all vertex shaders
224 * before it's known whether LS will be compiled or not.
225 */
226 return 0;
227 }
228 }
229
230 /**
231 * Get the value of a shader input parameter and extract a bitfield.
 *
 * \param param   index of the main-function parameter to read
 * \param rshift  bit position of the field's LSB
 * \param bitwidth  field width in bits; no mask is emitted when the
 *                  field extends to bit 31 (rshift + bitwidth >= 32)
232 */
233 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
234 unsigned param, unsigned rshift,
235 unsigned bitwidth)
236 {
237 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
238 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
239 param);
240 
/* Parameters may arrive as float; reinterpret the bits as unsigned. */
241 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
242 value = bitcast(&ctx->radeon_bld.soa.bld_base,
243 TGSI_TYPE_UNSIGNED, value);
244 
245 if (rshift)
246 value = LLVMBuildLShr(gallivm->builder, value,
247 lp_build_const_int32(gallivm, rshift), "");
248 
249 if (rshift + bitwidth < 32) {
250 unsigned mask = (1 << bitwidth) - 1;
251 value = LLVMBuildAnd(gallivm->builder, value,
252 lp_build_const_int32(gallivm, mask), "");
253 }
254 
255 return value;
256 }
257
258 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
259 {
260 switch (ctx->type) {
261 case PIPE_SHADER_TESS_CTRL:
262 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
263
264 case PIPE_SHADER_TESS_EVAL:
265 return LLVMGetParam(ctx->radeon_bld.main_fn,
266 ctx->param_tes_rel_patch_id);
267
268 default:
269 assert(0);
270 return NULL;
271 }
272 }
273
274 /* Tessellation shaders pass outputs to the next shader using LDS.
275 *
276 * LS outputs = TCS inputs
277 * TCS outputs = TES inputs
278 *
279 * The LDS layout is:
280 * - TCS inputs for patch 0
281 * - TCS inputs for patch 1
282 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
283 * - ...
284 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
285 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
286 * - TCS outputs for patch 1
287 * - Per-patch TCS outputs for patch 1
288 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
289 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
290 * - ...
291 *
292 * All three shaders VS(LS), TCS, TES share the same LDS space.
293 */
294
295 static LLVMValueRef
296 get_tcs_in_patch_stride(struct si_shader_context *ctx)
297 {
298 if (ctx->type == PIPE_SHADER_VERTEX)
299 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
300 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
301 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
302 else {
303 assert(0);
304 return NULL;
305 }
306 }
307
/* Stride of one TCS output patch in dwords, from bits 0..12 of
 * SI_PARAM_TCS_OUT_LAYOUT. */
308 static LLVMValueRef
309 get_tcs_out_patch_stride(struct si_shader_context *ctx)
310 {
311 return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
312 }
313
/* LDS dword offset where TCS outputs for patch 0 start: the packed
 * 16-bit field (in 4-dword units) from SI_PARAM_TCS_OUT_OFFSETS,
 * scaled by 4 to dwords. */
314 static LLVMValueRef
315 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
316 {
317 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
318 unpack_param(ctx,
319 SI_PARAM_TCS_OUT_OFFSETS,
320 0, 16),
321 4);
322 }
323
/* LDS dword offset of the per-patch TCS outputs for patch 0: upper
 * 16 bits of SI_PARAM_TCS_OUT_OFFSETS (in 4-dword units), scaled by 4. */
324 static LLVMValueRef
325 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
326 {
327 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
328 unpack_param(ctx,
329 SI_PARAM_TCS_OUT_OFFSETS,
330 16, 16),
331 4);
332 }
333
334 static LLVMValueRef
335 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
336 {
337 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
338 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
339 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
340
341 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
342 }
343
/* LDS dword offset of the current patch's TCS outputs:
 * patch0 offset + RelPatchID * output-patch stride. */
344 static LLVMValueRef
345 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
346 {
347 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
348 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
349 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
350 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
351 
352 return LLVMBuildAdd(gallivm->builder, patch0_offset,
353 LLVMBuildMul(gallivm->builder, patch_stride,
354 rel_patch_id, ""),
355 "");
356 }
357
/* LDS dword offset of the current patch's per-patch TCS outputs:
 * per-patch patch0 offset + RelPatchID * output-patch stride. */
358 static LLVMValueRef
359 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
360 {
361 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
362 LLVMValueRef patch0_patch_data_offset =
363 get_tcs_out_patch0_patch_data_offset(ctx);
364 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
365 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
366 
367 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
368 LLVMBuildMul(gallivm->builder, patch_stride,
369 rel_patch_id, ""),
370 "");
371 }
372
/**
 * Build an indexed store: equivalent to base_ptr[index] = value,
 * emitted as LLVMBuildGEP + LLVMBuildStore.  Counterpart of
 * build_indexed_load below.
 */
373 static void build_indexed_store(struct si_shader_context *ctx,
374 LLVMValueRef base_ptr, LLVMValueRef index,
375 LLVMValueRef value)
376 {
377 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
378 struct gallivm_state *gallivm = bld_base->base.gallivm;
379 LLVMValueRef indices[2], pointer;
380 
/* Two GEP indices: 0 steps through the pointer, index selects the
 * array element. */
381 indices[0] = bld_base->uint_bld.zero;
382 indices[1] = index;
383 
384 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
385 LLVMBuildStore(gallivm->builder, value, pointer);
386 }
387
388 /**
389 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
390 * It's equivalent to doing a load from &base_ptr[index].
391 *
392 * \param base_ptr Where the array starts.
393 * \param index The element index into the array.
394 * \param uniform Whether the base_ptr and index can be assumed to be
395 * dynamically uniform
396 */
397 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
398 LLVMValueRef base_ptr, LLVMValueRef index,
399 bool uniform)
400 {
401 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
402 struct gallivm_state *gallivm = bld_base->base.gallivm;
403 LLVMValueRef indices[2], pointer;
404 
405 indices[0] = bld_base->uint_bld.zero;
406 indices[1] = index;
407 
408 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
/* Tag the address as dynamically uniform so the backend can use
 * scalar (SMEM) access where possible. */
409 if (uniform)
410 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
411 return LLVMBuildLoad(gallivm->builder, pointer, "");
412 }
413
414 /**
415 * Do a load from &base_ptr[index], but also add a flag that it's loading
416 * a constant from a dynamically uniform index.
 *
 * The invariant.load metadata lets LLVM assume the loaded memory never
 * changes, enabling CSE of repeated descriptor loads.
417 */
418 static LLVMValueRef build_indexed_load_const(
419 struct si_shader_context *ctx,
420 LLVMValueRef base_ptr, LLVMValueRef index)
421 {
422 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
423 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
424 return result;
425 }
426
/**
 * Compute the buffer index for instanced vertex fetch:
 * InstanceID / divisor + StartInstance.
 *
 * \param param_start_instance  main-function parameter holding
 *                              StartInstance
 * \param divisor               instance divisor (1 = no division)
 */
427 static LLVMValueRef get_instance_index_for_fetch(
428 struct radeon_llvm_context *radeon_bld,
429 unsigned param_start_instance, unsigned divisor)
430 {
431 struct si_shader_context *ctx =
432 si_shader_context(&radeon_bld->soa.bld_base);
433 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
434 
435 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
436 ctx->param_instance_id);
437 
438 /* The division must be done before START_INSTANCE is added. */
439 if (divisor > 1)
440 result = LLVMBuildUDiv(gallivm->builder, result,
441 lp_build_const_int32(gallivm, divisor), "");
442 
443 return LLVMBuildAdd(gallivm->builder, result,
444 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
445 }
446
/**
 * Declare one vertex shader input: fetch it from the vertex buffer via
 * the llvm.SI.vs.load.input intrinsic and store the four channels into
 * the radeon_llvm input slots for this input_index.
 */
447 static void declare_input_vs(
448 struct radeon_llvm_context *radeon_bld,
449 unsigned input_index,
450 const struct tgsi_full_declaration *decl)
451 {
452 struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
453 struct gallivm_state *gallivm = base->gallivm;
454 struct si_shader_context *ctx =
455 si_shader_context(&radeon_bld->soa.bld_base);
456 unsigned divisor =
457 ctx->shader->key.vs.prolog.instance_divisors[input_index];
458 
459 unsigned chan;
460 
461 LLVMValueRef t_list_ptr;
462 LLVMValueRef t_offset;
463 LLVMValueRef t_list;
464 LLVMValueRef attribute_offset;
465 LLVMValueRef buffer_index;
466 LLVMValueRef args[3];
467 LLVMValueRef input;
468 
469 /* Load the T list */
470 t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
471 
472 t_offset = lp_build_const_int32(gallivm, input_index);
473 
474 t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);
475 
476 /* Build the attribute offset */
477 attribute_offset = lp_build_const_int32(gallivm, 0);
478 
/* Non-monolithic shaders receive the final vertex index per input
 * as a parameter computed by the prolog. */
479 if (!ctx->is_monolithic) {
480 buffer_index = LLVMGetParam(radeon_bld->main_fn,
481 ctx->param_vertex_index0 +
482 input_index);
483 } else if (divisor) {
484 /* Build index from instance ID, start instance and divisor */
485 ctx->shader->info.uses_instanceid = true;
486 buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
487 SI_PARAM_START_INSTANCE,
488 divisor);
489 } else {
490 /* Load the buffer index for vertices. */
491 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
492 ctx->param_vertex_id);
493 LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
494 SI_PARAM_BASE_VERTEX);
495 buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
496 }
497 
498 args[0] = t_list;
499 args[1] = attribute_offset;
500 args[2] = buffer_index;
501 input = lp_build_intrinsic(gallivm->builder,
502 "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
503 LLVMReadNoneAttribute);
504 
505 /* Break up the vec4 into individual components */
506 for (chan = 0; chan < 4; chan++) {
507 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
508 /* XXX: Use a helper function for this. There is one in
509 * tgsi_llvm.c. */
510 ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
511 LLVMBuildExtractElement(gallivm->builder,
512 input, llvm_chan, "");
513 }
514 }
515
/**
 * Return the PrimitiveID system value for the current stage.
 * Only channel 0 carries the value; other swizzles read as 0.
 */
516 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
517 unsigned swizzle)
518 {
519 struct si_shader_context *ctx = si_shader_context(bld_base);
520 
521 if (swizzle > 0)
522 return bld_base->uint_bld.zero;
523 
/* The source of PrimitiveID differs per stage: a context-assigned
 * parameter for VS/TES, a fixed SGPR parameter for TCS/GS. */
524 switch (ctx->type) {
525 case PIPE_SHADER_VERTEX:
526 return LLVMGetParam(ctx->radeon_bld.main_fn,
527 ctx->param_vs_prim_id);
528 case PIPE_SHADER_TESS_CTRL:
529 return LLVMGetParam(ctx->radeon_bld.main_fn,
530 SI_PARAM_PATCH_ID);
531 case PIPE_SHADER_TESS_EVAL:
532 return LLVMGetParam(ctx->radeon_bld.main_fn,
533 ctx->param_tes_patch_id);
534 case PIPE_SHADER_GEOMETRY:
535 return LLVMGetParam(ctx->radeon_bld.main_fn,
536 SI_PARAM_PRIMITIVE_ID);
537 default:
538 assert(0);
539 return bld_base->uint_bld.zero;
540 }
541 }
542
543 /**
544 * Return the value of tgsi_ind_register for indexing.
545 * This is the indirect index with the constant offset added to it.
 *
 * \param ind        the TGSI indirect register (address register)
 * \param rel_index  constant offset to add to the loaded address value
546 */
547 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
548 const struct tgsi_ind_register *ind,
549 int rel_index)
550 {
551 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
552 LLVMValueRef result;
553 
/* soa.addr holds allocas for the TGSI ADDR registers; load the
 * current value before adding the constant part. */
554 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
555 result = LLVMBuildLoad(gallivm->builder, result, "");
556 result = LLVMBuildAdd(gallivm->builder, result,
557 lp_build_const_int32(gallivm, rel_index), "");
558 return result;
559 }
560
561 /**
562 * Like get_indirect_index, but restricts the return value to a (possibly
563 * undefined) value inside [0..num).
564 */
565 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
566 const struct tgsi_ind_register *ind,
567 int rel_index, unsigned num)
568 {
569 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
570 LLVMBuilderRef builder = gallivm->builder;
571 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
572 LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
573 LLVMValueRef cc;
574 
575 /* LLVM 3.8: If indirect resource indexing is used:
576 * - SI & CIK hang
577 * - VI crashes
578 */
/* NOTE(review): the index IR above is still emitted before this
 * early-out; it is simply left unused when undef is returned. */
579 if (HAVE_LLVM <= 0x0308)
580 return LLVMGetUndef(ctx->i32);
581 
/* Power-of-two bound: a bitwise AND clamps with a single op. */
582 if (util_is_power_of_two(num)) {
583 result = LLVMBuildAnd(builder, result, c_max, "");
584 } else {
585 /* In theory, this MAX pattern should result in code that is
586 * as good as the bit-wise AND above.
587 *
588 * In practice, LLVM generates worse code (at the time of
589 * writing), because its value tracking is not strong enough.
590 */
591 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
592 result = LLVMBuildSelect(builder, cc, result, c_max, "");
593 }
594 
595 return result;
596 }
597
598
599 /**
600 * Calculate a dword address given an input or output register and a stride.
 *
 * Exactly one of \p dst / \p src must be non-NULL; the address is built
 * on top of \p base_addr.  \p vertex_dw_stride is used only for
 * 2-dimensional registers (per-vertex arrays).
601 */
602 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
603 const struct tgsi_full_dst_register *dst,
604 const struct tgsi_full_src_register *src,
605 LLVMValueRef vertex_dw_stride,
606 LLVMValueRef base_addr)
607 {
608 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
609 struct tgsi_shader_info *info = &ctx->shader->selector->info;
610 ubyte *name, *index, *array_first;
611 int first, param;
612 struct tgsi_full_dst_register reg;
613 
614 /* Set the register description. The address computation is the same
615 * for sources and destinations. */
616 if (src) {
617 reg.Register.File = src->Register.File;
618 reg.Register.Index = src->Register.Index;
619 reg.Register.Indirect = src->Register.Indirect;
620 reg.Register.Dimension = src->Register.Dimension;
621 reg.Indirect = src->Indirect;
622 reg.Dimension = src->Dimension;
623 reg.DimIndirect = src->DimIndirect;
624 } else
625 reg = *dst;
626 
627 /* If the register is 2-dimensional (e.g. an array of vertices
628 * in a primitive), calculate the base address of the vertex. */
629 if (reg.Register.Dimension) {
630 LLVMValueRef index;
631 
632 if (reg.Dimension.Indirect)
633 index = get_indirect_index(ctx, &reg.DimIndirect,
634 reg.Dimension.Index);
635 else
636 index = lp_build_const_int32(gallivm, reg.Dimension.Index);
637 
638 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
639 LLVMBuildMul(gallivm->builder, index,
640 vertex_dw_stride, ""), "");
641 }
642 
643 /* Get information about the register. */
644 if (reg.Register.File == TGSI_FILE_INPUT) {
645 name = info->input_semantic_name;
646 index = info->input_semantic_index;
647 array_first = info->input_array_first;
648 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
649 name = info->output_semantic_name;
650 index = info->output_semantic_index;
651 array_first = info->output_array_first;
652 } else {
653 assert(0);
654 return NULL;
655 }
656 
657 if (reg.Register.Indirect) {
658 /* Add the relative address of the element. */
659 LLVMValueRef ind_index;
660 
/* Indexing relative to the first element of the declared array
 * (if any), otherwise relative to the register itself. */
661 if (reg.Indirect.ArrayID)
662 first = array_first[reg.Indirect.ArrayID];
663 else
664 first = reg.Register.Index;
665 
666 ind_index = get_indirect_index(ctx, &reg.Indirect,
667 reg.Register.Index - first);
668 
/* Each element occupies 4 dwords (one vec4). */
669 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
670 LLVMBuildMul(gallivm->builder, ind_index,
671 lp_build_const_int32(gallivm, 4), ""), "");
672 
673 param = si_shader_io_get_unique_index(name[first], index[first]);
674 } else {
675 param = si_shader_io_get_unique_index(name[reg.Register.Index],
676 index[reg.Register.Index]);
677 }
678 
679 /* Add the base address of the element. */
680 return LLVMBuildAdd(gallivm->builder, base_addr,
681 lp_build_const_int32(gallivm, param * 4), "");
682 }
683
684 /* The offchip buffer layout for TCS->TES is
685 *
686 * - attribute 0 of patch 0 vertex 0
687 * - attribute 0 of patch 0 vertex 1
688 * - attribute 0 of patch 0 vertex 2
689 * ...
690 * - attribute 0 of patch 1 vertex 0
691 * - attribute 0 of patch 1 vertex 1
692 * ...
693 * - attribute 1 of patch 0 vertex 0
694 * - attribute 1 of patch 0 vertex 1
695 * ...
696 * - per patch attribute 0 of patch 0
697 * - per patch attribute 0 of patch 1
698 * ...
699 *
700 * Note that every attribute has 4 components.
701 */
/* Compute a byte address into that layout.  A NULL vertex_index means
 * a per-patch attribute; the stride between attributes then becomes
 * the patch count instead of the total vertex count.
 */
702 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
703 LLVMValueRef vertex_index,
704 LLVMValueRef param_index)
705 {
706 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
707 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
708 LLVMValueRef param_stride, constant16;
709 
/* Layout word: patches in bits 0..8, vertices-per-patch in 9..14. */
710 vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
711 num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
712 total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
713 num_patches, "");
714 
/* 16 bytes = one vec4 attribute. */
715 constant16 = lp_build_const_int32(gallivm, 16);
716 if (vertex_index) {
717 base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
718 vertices_per_patch, "");
719 
720 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
721 vertex_index, "");
722 
723 param_stride = total_vertices;
724 } else {
725 base_addr = get_rel_patch_id(ctx);
726 param_stride = num_patches;
727 }
728 
729 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
730 LLVMBuildMul(gallivm->builder, param_index,
731 param_stride, ""), "");
732 
733 base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
734 
/* Per-patch attributes start at the offset packed in bits 16..31. */
735 if (!vertex_index) {
736 LLVMValueRef patch_data_offset =
737 unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);
738 
739 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
740 patch_data_offset, "");
741 }
742 return base_addr;
743 }
744
/**
 * Like get_tcs_tes_buffer_address, but derive the vertex and parameter
 * indices from a TGSI register description (exactly one of dst/src is
 * non-NULL).
 */
745 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
746 struct si_shader_context *ctx,
747 const struct tgsi_full_dst_register *dst,
748 const struct tgsi_full_src_register *src)
749 {
750 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
751 struct tgsi_shader_info *info = &ctx->shader->selector->info;
752 ubyte *name, *index, *array_first;
753 struct tgsi_full_src_register reg;
754 LLVMValueRef vertex_index = NULL;
755 LLVMValueRef param_index = NULL;
756 unsigned param_index_base, param_base;
757 
758 reg = src ? *src : tgsi_full_src_register_from_dst(dst);
759 
/* A 2-dimensional register addresses a specific vertex in a patch. */
760 if (reg.Register.Dimension) {
761 
762 if (reg.Dimension.Indirect)
763 vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
764 reg.Dimension.Index);
765 else
766 vertex_index = lp_build_const_int32(gallivm,
767 reg.Dimension.Index);
768 }
769 
770 /* Get information about the register. */
771 if (reg.Register.File == TGSI_FILE_INPUT) {
772 name = info->input_semantic_name;
773 index = info->input_semantic_index;
774 array_first = info->input_array_first;
775 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
776 name = info->output_semantic_name;
777 index = info->output_semantic_index;
778 array_first = info->output_array_first;
779 } else {
780 assert(0);
781 return NULL;
782 }
783 
784 if (reg.Register.Indirect) {
785 if (reg.Indirect.ArrayID)
786 param_base = array_first[reg.Indirect.ArrayID];
787 else
788 param_base = reg.Register.Index;
789 
790 param_index = get_indirect_index(ctx, &reg.Indirect,
791 reg.Register.Index - param_base);
792 
793 } else {
794 param_base = reg.Register.Index;
795 param_index = lp_build_const_int32(gallivm, 0);
796 }
797 
798 param_index_base = si_shader_io_get_unique_index(name[param_base],
799 index[param_base]);
800 
801 param_index = LLVMBuildAdd(gallivm->builder, param_index,
802 lp_build_const_int32(gallivm, param_index_base),
803 "");
804 
805 return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
806 }
807
808 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
809 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
810 * or v4i32 (num_channels=3,4). */
811 static void build_tbuffer_store(struct si_shader_context *ctx,
812 LLVMValueRef rsrc,
813 LLVMValueRef vdata,
814 unsigned num_channels,
815 LLVMValueRef vaddr,
816 LLVMValueRef soffset,
817 unsigned inst_offset,
818 unsigned dfmt,
819 unsigned nfmt,
820 unsigned offen,
821 unsigned idxen,
822 unsigned glc,
823 unsigned slc,
824 unsigned tfe)
825 {
826 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
/* Argument order mirrors the llvm.SI.tbuffer.store intrinsic. */
827 LLVMValueRef args[] = {
828 rsrc,
829 vdata,
830 LLVMConstInt(ctx->i32, num_channels, 0),
831 vaddr,
832 soffset,
833 LLVMConstInt(ctx->i32, inst_offset, 0),
834 LLVMConstInt(ctx->i32, dfmt, 0),
835 LLVMConstInt(ctx->i32, nfmt, 0),
836 LLVMConstInt(ctx->i32, offen, 0),
837 LLVMConstInt(ctx->i32, idxen, 0),
838 LLVMConstInt(ctx->i32, glc, 0),
839 LLVMConstInt(ctx->i32, slc, 0),
840 LLVMConstInt(ctx->i32, tfe, 0)
841 };
842 
843 /* The instruction offset field has 12 bits */
844 assert(offen || inst_offset < (1 << 12));
845 
846 /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
/* num_channels 3 and 4 share the v4i32 variant (index clamped to 2). */
847 unsigned func = CLAMP(num_channels, 1, 3) - 1;
848 const char *types[] = {"i32", "v2i32", "v4i32"};
849 char name[256];
850 snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
851 
852 lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
853 args, ARRAY_SIZE(args), 0);
854 }
855
856 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
857 LLVMValueRef rsrc,
858 LLVMValueRef vdata,
859 unsigned num_channels,
860 LLVMValueRef vaddr,
861 LLVMValueRef soffset,
862 unsigned inst_offset)
863 {
864 static unsigned dfmt[] = {
865 V_008F0C_BUF_DATA_FORMAT_32,
866 V_008F0C_BUF_DATA_FORMAT_32_32,
867 V_008F0C_BUF_DATA_FORMAT_32_32_32,
868 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
869 };
870 assert(num_channels >= 1 && num_channels <= 4);
871
872 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
873 inst_offset, dfmt[num_channels-1],
874 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
875 }
876
/**
 * Emit a typed buffer load of 1, 2, or 4 channels (num_channels 3 and 4
 * both use the 4-channel variant).
 *
 * With LLVM >= 3.9 the llvm.amdgcn.buffer.load intrinsic is used, where
 * voffset/soffset are folded into a single offset operand; older LLVM
 * falls back to llvm.SI.buffer.load.dword with separate addressing
 * arguments.
 */
877 static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
878 LLVMValueRef rsrc,
879 int num_channels,
880 LLVMValueRef vindex,
881 LLVMValueRef voffset,
882 LLVMValueRef soffset,
883 unsigned inst_offset,
884 unsigned glc,
885 unsigned slc)
886 {
887 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
888 unsigned func = CLAMP(num_channels, 1, 3) - 1;
889 
890 if (HAVE_LLVM >= 0x309) {
891 LLVMValueRef args[] = {
892 LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
893 vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
894 LLVMConstInt(ctx->i32, inst_offset, 0),
895 LLVMConstInt(ctx->i1, glc, 0),
896 LLVMConstInt(ctx->i1, slc, 0)
897 };
898 
899 LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
900 ctx->v4f32};
901 const char *type_names[] = {"f32", "v2f32", "v4f32"};
902 char name[256];
903 
/* args[2] is the combined byte offset: inst_offset + voffset +
 * soffset. */
904 if (voffset) {
905 args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
906 "");
907 }
908 
909 if (soffset) {
910 args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
911 "");
912 }
913 
914 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
915 type_names[func]);
916 
917 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
918 ARRAY_SIZE(args), LLVMReadOnlyAttribute);
919 } else {
920 LLVMValueRef args[] = {
921 LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
922 voffset ? voffset : vindex,
923 soffset,
924 LLVMConstInt(ctx->i32, inst_offset, 0),
925 LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
926 LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
927 LLVMConstInt(ctx->i32, glc, 0),
928 LLVMConstInt(ctx->i32, slc, 0),
929 LLVMConstInt(ctx->i32, 0, 0), // TFE
930 };
931 
932 LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
933 ctx->v4i32};
934 const char *type_names[] = {"i32", "v2i32", "v4i32"};
935 const char *arg_type = "i32";
936 char name[256];
937 
/* Index + offset addressing needs both values packed into a v2i32
 * address operand. */
938 if (voffset && vindex) {
939 LLVMValueRef vaddr[] = {vindex, voffset};
940 
941 arg_type = "v2i32";
942 args[1] = lp_build_gather_values(gallivm, vaddr, 2);
943 }
944 
945 snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
946 type_names[func], arg_type);
947 
948 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
949 ARRAY_SIZE(args), LLVMReadOnlyAttribute);
950 }
951 }
952
/**
 * Load a TGSI-typed value from a buffer.
 *
 * \param swizzle channel to extract (0..3), or ~0 to return the whole
 *                vec4
 * \param offset  per-lane byte offset
 * \param base    scalar base byte offset
 *
 * 64-bit types are assembled from two adjacent 32-bit loads.
 */
953 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
954 enum tgsi_opcode_type type, unsigned swizzle,
955 LLVMValueRef buffer, LLVMValueRef offset,
956 LLVMValueRef base)
957 {
958 struct si_shader_context *ctx = si_shader_context(bld_base);
959 struct gallivm_state *gallivm = bld_base->base.gallivm;
960 LLVMValueRef value, value2;
961 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
962 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
963 
964 if (swizzle == ~0) {
965 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
966 0, 1, 0);
967 
968 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
969 }
970 
/* 32-bit: load the vec4 and extract the requested channel. */
971 if (!tgsi_type_is_64bit(type)) {
972 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
973 0, 1, 0);
974 
975 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
976 return LLVMBuildExtractElement(gallivm->builder, value,
977 lp_build_const_int32(gallivm, swizzle), "");
978 }
979 
/* 64-bit: two dword loads at swizzle*4 and swizzle*4+4 bytes. */
980 value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
981 swizzle * 4, 1, 0);
982 
983 value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
984 swizzle * 4 + 4, 1, 0);
985 
986 return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
987 }
988
989 /**
990 * Load from LDS.
991 *
992 * \param type output value type
993 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
994 * \param dw_addr address in dwords
995 */
996 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
997 enum tgsi_opcode_type type, unsigned swizzle,
998 LLVMValueRef dw_addr)
999 {
1000 struct si_shader_context *ctx = si_shader_context(bld_base);
1001 struct gallivm_state *gallivm = bld_base->base.gallivm;
1002 LLVMValueRef value;
1003 
/* vec4 request: recurse per channel and gather the results. */
1004 if (swizzle == ~0) {
1005 LLVMValueRef values[TGSI_NUM_CHANNELS];
1006 
1007 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
1008 values[chan] = lds_load(bld_base, type, chan, dw_addr);
1009 
1010 return lp_build_gather_values(bld_base->base.gallivm, values,
1011 TGSI_NUM_CHANNELS);
1012 }
1013 
1014 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1015 lp_build_const_int32(gallivm, swizzle));
1016 
1017 value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
/* 64-bit types occupy two consecutive dwords. */
1018 if (tgsi_type_is_64bit(type)) {
1019 LLVMValueRef value2;
1020 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1021 lp_build_const_int32(gallivm, swizzle + 1));
1022 value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
1023 return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1024 }
1025 
1026 return LLVMBuildBitCast(gallivm->builder, value,
1027 tgsi2llvmtype(bld_base, type), "");
1028 }
1029
1030 /**
1031 * Store to LDS.
1032 *
1033 * \param swizzle offset (typically 0..3)
1034 * \param dw_addr address in dwords
1035 * \param value value to store
1036 */
1037 static void lds_store(struct lp_build_tgsi_context *bld_base,
1038 unsigned swizzle, LLVMValueRef dw_addr,
1039 LLVMValueRef value)
1040 {
1041 struct si_shader_context *ctx = si_shader_context(bld_base);
1042 struct gallivm_state *gallivm = bld_base->base.gallivm;
1043
1044 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1045 lp_build_const_int32(gallivm, swizzle));
1046
1047 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1048 build_indexed_store(ctx, ctx->lds,
1049 dw_addr, value);
1050 }
1051
1052 static LLVMValueRef fetch_input_tcs(
1053 struct lp_build_tgsi_context *bld_base,
1054 const struct tgsi_full_src_register *reg,
1055 enum tgsi_opcode_type type, unsigned swizzle)
1056 {
1057 struct si_shader_context *ctx = si_shader_context(bld_base);
1058 LLVMValueRef dw_addr, stride;
1059
1060 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1061 dw_addr = get_tcs_in_current_patch_offset(ctx);
1062 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1063
1064 return lds_load(bld_base, type, swizzle, dw_addr);
1065 }
1066
1067 static LLVMValueRef fetch_output_tcs(
1068 struct lp_build_tgsi_context *bld_base,
1069 const struct tgsi_full_src_register *reg,
1070 enum tgsi_opcode_type type, unsigned swizzle)
1071 {
1072 struct si_shader_context *ctx = si_shader_context(bld_base);
1073 LLVMValueRef dw_addr, stride;
1074
1075 if (reg->Register.Dimension) {
1076 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1077 dw_addr = get_tcs_out_current_patch_offset(ctx);
1078 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1079 } else {
1080 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1081 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1082 }
1083
1084 return lds_load(bld_base, type, swizzle, dw_addr);
1085 }
1086
1087 static LLVMValueRef fetch_input_tes(
1088 struct lp_build_tgsi_context *bld_base,
1089 const struct tgsi_full_src_register *reg,
1090 enum tgsi_opcode_type type, unsigned swizzle)
1091 {
1092 struct si_shader_context *ctx = si_shader_context(bld_base);
1093 struct gallivm_state *gallivm = bld_base->base.gallivm;
1094 LLVMValueRef rw_buffers, buffer, base, addr;
1095
1096 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1097 SI_PARAM_RW_BUFFERS);
1098 buffer = build_indexed_load_const(ctx, rw_buffers,
1099 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1100
1101 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1102 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1103
1104 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1105 }
1106
/**
 * Store a TCS output both to LDS (for later fetch_output_tcs reads) and
 * to the off-chip tessellation ring buffer (for the TES to read).
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS address of the output. */
	if (reg->Register.Dimension) {
		/* Per-vertex output: needs the output vertex stride. */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Get the off-chip tess ring descriptor and the buffer address. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* Always mirror the value into LDS. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each channel separately. */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: store all four channels with one vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1172
/**
 * Fetch a GS input from the ESGS ring buffer.
 *
 * Per-vertex inputs are addressed by the per-vertex offset parameters
 * (SI_PARAM_VTX*_OFFSET); PRIMID is handled specially via
 * get_primitive_id().
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	/* GS inputs are always per-vertex (2D); anything else is invalid. */
	if (!reg->Register.Dimension)
		return NULL;

	/* ~0 means "load all four channels": recurse per channel. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		/* Vertices 2..5 follow SI_PARAM_VTX2_OFFSET. */
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The offset parameter is in dwords; convert to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one; /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one; /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute);
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		/* Load the second dword of the 64-bit value; only the
		 * constant offset (args[2]) changes.
		 */
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1251
1252 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1253 {
1254 switch (interpolate) {
1255 case TGSI_INTERPOLATE_CONSTANT:
1256 return 0;
1257
1258 case TGSI_INTERPOLATE_LINEAR:
1259 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1260 return SI_PARAM_LINEAR_SAMPLE;
1261 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1262 return SI_PARAM_LINEAR_CENTROID;
1263 else
1264 return SI_PARAM_LINEAR_CENTER;
1265 break;
1266 case TGSI_INTERPOLATE_COLOR:
1267 case TGSI_INTERPOLATE_PERSPECTIVE:
1268 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1269 return SI_PARAM_PERSP_SAMPLE;
1270 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1271 return SI_PARAM_PERSP_CENTROID;
1272 else
1273 return SI_PARAM_PERSP_CENTER;
1274 break;
1275 default:
1276 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1277 return -1;
1278 }
1279 }
1280
1281 /* This shouldn't be used by explicit INTERP opcodes. */
1282 static unsigned select_interp_param(struct si_shader_context *ctx,
1283 unsigned param)
1284 {
1285 if (!ctx->is_monolithic)
1286 return param;
1287
1288 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1289 switch (param) {
1290 case SI_PARAM_PERSP_CENTROID:
1291 case SI_PARAM_PERSP_CENTER:
1292 return SI_PARAM_PERSP_SAMPLE;
1293 }
1294 }
1295 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1296 switch (param) {
1297 case SI_PARAM_LINEAR_CENTROID:
1298 case SI_PARAM_LINEAR_CENTER:
1299 return SI_PARAM_LINEAR_SAMPLE;
1300 }
1301 }
1302 if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1303 switch (param) {
1304 case SI_PARAM_PERSP_CENTROID:
1305 case SI_PARAM_PERSP_SAMPLE:
1306 return SI_PARAM_PERSP_CENTER;
1307 }
1308 }
1309 if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1310 switch (param) {
1311 case SI_PARAM_LINEAR_CENTROID:
1312 case SI_PARAM_LINEAR_SAMPLE:
1313 return SI_PARAM_LINEAR_CENTER;
1314 }
1315 }
1316
1317 return param;
1318 }
1319
1320 /**
1321 * Interpolate a fragment shader input.
1322 *
1323 * @param ctx context
1324 * @param input_index index of the input in hardware
1325 * @param semantic_name TGSI_SEMANTIC_*
1326 * @param semantic_index semantic index
1327 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1328 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1329 * @param interp_param interpolation weights (i,j)
1330 * @param prim_mask SI_PARAM_PRIM_MASK
1331 * @param face SI_PARAM_FRONT_FACE
1332 * @param result the return value (4 components)
1333 */
1334 static void interp_fs_input(struct si_shader_context *ctx,
1335 unsigned input_index,
1336 unsigned semantic_name,
1337 unsigned semantic_index,
1338 unsigned num_interp_inputs,
1339 unsigned colors_read_mask,
1340 LLVMValueRef interp_param,
1341 LLVMValueRef prim_mask,
1342 LLVMValueRef face,
1343 LLVMValueRef result[4])
1344 {
1345 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1346 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1347 struct gallivm_state *gallivm = base->gallivm;
1348 const char *intr_name;
1349 LLVMValueRef attr_number;
1350
1351 unsigned chan;
1352
1353 attr_number = lp_build_const_int32(gallivm, input_index);
1354
1355 /* fs.constant returns the param from the middle vertex, so it's not
1356 * really useful for flat shading. It's meant to be used for custom
1357 * interpolation (but the intrinsic can't fetch from the other two
1358 * vertices).
1359 *
1360 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1361 * to do the right thing. The only reason we use fs.constant is that
1362 * fs.interp cannot be used on integers, because they can be equal
1363 * to NaN.
1364 */
1365 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1366
1367 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1368 ctx->shader->key.ps.prolog.color_two_side) {
1369 LLVMValueRef args[4];
1370 LLVMValueRef is_face_positive;
1371 LLVMValueRef back_attr_number;
1372
1373 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1374 * otherwise it's at offset "num_inputs".
1375 */
1376 unsigned back_attr_offset = num_interp_inputs;
1377 if (semantic_index == 1 && colors_read_mask & 0xf)
1378 back_attr_offset += 1;
1379
1380 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1381
1382 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1383 face, uint->zero, "");
1384
1385 args[2] = prim_mask;
1386 args[3] = interp_param;
1387 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1388 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1389 LLVMValueRef front, back;
1390
1391 args[0] = llvm_chan;
1392 args[1] = attr_number;
1393 front = lp_build_intrinsic(gallivm->builder, intr_name,
1394 ctx->f32, args, args[3] ? 4 : 3,
1395 LLVMReadNoneAttribute);
1396
1397 args[1] = back_attr_number;
1398 back = lp_build_intrinsic(gallivm->builder, intr_name,
1399 ctx->f32, args, args[3] ? 4 : 3,
1400 LLVMReadNoneAttribute);
1401
1402 result[chan] = LLVMBuildSelect(gallivm->builder,
1403 is_face_positive,
1404 front,
1405 back,
1406 "");
1407 }
1408 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1409 LLVMValueRef args[4];
1410
1411 args[0] = uint->zero;
1412 args[1] = attr_number;
1413 args[2] = prim_mask;
1414 args[3] = interp_param;
1415 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1416 ctx->f32, args, args[3] ? 4 : 3,
1417 LLVMReadNoneAttribute);
1418 result[1] =
1419 result[2] = lp_build_const_float(gallivm, 0.0f);
1420 result[3] = lp_build_const_float(gallivm, 1.0f);
1421 } else {
1422 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1423 LLVMValueRef args[4];
1424 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1425
1426 args[0] = llvm_chan;
1427 args[1] = attr_number;
1428 args[2] = prim_mask;
1429 args[3] = interp_param;
1430 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1431 ctx->f32, args, args[3] ? 4 : 3,
1432 LLVMReadNoneAttribute);
1433 }
1434 }
1435 }
1436
/* LLVMGetParam with bc_optimize resolved. */
static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
				     int interp_param_idx)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
	LLVMValueRef param = NULL;

	/* Handle PRIM_MASK[31] (bc_optimize). */
	if (ctx->is_monolithic &&
	    ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
	      interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
	     (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
	      interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 */
		/* Extract bit 31 of PRIM_MASK as an i1. */
		LLVMValueRef bc_optimize =
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
		bc_optimize = LLVMBuildLShr(builder,
					    bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");

		/* Select CENTER weights when the bit is set, otherwise
		 * the regular CENTROID weights.
		 */
		if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
		    interp_param_idx == SI_PARAM_PERSP_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTROID),
						"");
		}
		if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
		    interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTROID),
						"");
		}
	}

	/* No bc_optimize: just return the requested parameter. */
	if (!param)
		param = LLVMGetParam(main_fn, interp_param_idx);
	return param;
}
1486
/**
 * Declare (and interpolate) a fragment shader input, storing the four
 * resulting channel values into radeon_bld->inputs.
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* Each read channel consumes one VGPR; COLOR1's params
		 * start after COLOR0's read channels.
		 */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		/* Non-flat input: fetch the (i,j) weights. */
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = get_interp_param(ctx, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1542
1543 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1544 {
1545 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1546 SI_PARAM_ANCILLARY, 8, 4);
1547 }
1548
1549 /**
1550 * Set range metadata on an instruction. This can only be used on load and
1551 * call instructions. If you know an instruction can only produce the values
1552 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1553 * \p lo is the minimum value inclusive.
1554 * \p hi is the maximum value exclusive.
1555 */
1556 static void set_range_metadata(struct si_shader_context *ctx,
1557 LLVMValueRef value, unsigned lo, unsigned hi)
1558 {
1559 LLVMValueRef range_md, md_args[2];
1560 LLVMTypeRef type = LLVMTypeOf(value);
1561 LLVMContextRef context = LLVMGetTypeContext(type);
1562
1563 md_args[0] = LLVMConstInt(type, lo, false);
1564 md_args[1] = LLVMConstInt(type, hi, false);
1565 range_md = LLVMMDNodeInContext(context, md_args, 2);
1566 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1567 }
1568
1569 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1570 {
1571 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1572 LLVMValueRef tid;
1573
1574 if (HAVE_LLVM < 0x0308) {
1575 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1576 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1577 } else {
1578 LLVMValueRef tid_args[2];
1579 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1580 tid_args[1] = lp_build_const_int32(gallivm, 0);
1581 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1582 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1583 tid_args, 2, LLVMReadNoneAttribute);
1584
1585 tid = lp_build_intrinsic(gallivm->builder,
1586 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1587 tid_args, 2, LLVMReadNoneAttribute);
1588 }
1589 set_range_metadata(ctx, tid, 0, 64);
1590 return tid;
1591 }
1592
1593 /**
1594 * Load a dword from a constant buffer.
1595 */
1596 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1597 LLVMValueRef offset, LLVMTypeRef return_type)
1598 {
1599 LLVMValueRef args[2] = {resource, offset};
1600
1601 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1602 LLVMReadNoneAttribute);
1603 }
1604
1605 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1606 {
1607 struct si_shader_context *ctx =
1608 si_shader_context(&radeon_bld->soa.bld_base);
1609 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1610 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1611 LLVMBuilderRef builder = gallivm->builder;
1612 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1613 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1614 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1615
1616 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1617 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1618 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1619
1620 LLVMValueRef pos[4] = {
1621 buffer_load_const(builder, resource, offset0, ctx->f32),
1622 buffer_load_const(builder, resource, offset1, ctx->f32),
1623 lp_build_const_float(gallivm, 0),
1624 lp_build_const_float(gallivm, 0)
1625 };
1626
1627 return lp_build_gather_values(gallivm, pos, 4);
1628 }
1629
/**
 * Declare a TGSI system value, computing its LLVM value and storing it
 * in radeon_bld->system_values[index].
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID = vertex id VGPR + BASE_VERTEX SGPR */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* Only TCS and GS have invocation IDs. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; W is the reciprocal of the input W. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position = fractional part of the pixel position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count, TCS_OUT_LAYOUT bits 26..31. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tess ring. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dwords 0..3, inner levels at 4..7.
		 */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(gallivm->builder, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4),
						   ctx->f32);
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* Fixed block size comes from compute shader properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* Helper invocation = lane that is NOT live (ps.live). */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1834
1835 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1836 const struct tgsi_full_declaration *decl)
1837 {
1838 struct si_shader_context *ctx =
1839 si_shader_context(&radeon_bld->soa.bld_base);
1840 struct si_shader_selector *sel = ctx->shader->selector;
1841 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1842
1843 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1844 LLVMValueRef var;
1845
1846 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1847 assert(decl->Range.First == decl->Range.Last);
1848 assert(!ctx->shared_memory);
1849
1850 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1851 LLVMArrayType(ctx->i8, sel->local_size),
1852 "compute_lds",
1853 LOCAL_ADDR_SPACE);
1854 LLVMSetAlignment(var, 4);
1855
1856 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1857 }
1858
1859 static LLVMValueRef fetch_constant(
1860 struct lp_build_tgsi_context *bld_base,
1861 const struct tgsi_full_src_register *reg,
1862 enum tgsi_opcode_type type,
1863 unsigned swizzle)
1864 {
1865 struct si_shader_context *ctx = si_shader_context(bld_base);
1866 struct lp_build_context *base = &bld_base->base;
1867 const struct tgsi_ind_register *ireg = &reg->Indirect;
1868 unsigned buf, idx;
1869
1870 LLVMValueRef addr, bufp;
1871 LLVMValueRef result;
1872
1873 if (swizzle == LP_CHAN_ALL) {
1874 unsigned chan;
1875 LLVMValueRef values[4];
1876 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1877 values[chan] = fetch_constant(bld_base, reg, type, chan);
1878
1879 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1880 }
1881
1882 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1883 idx = reg->Register.Index * 4 + swizzle;
1884
1885 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1886 if (!tgsi_type_is_64bit(type))
1887 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1888 else {
1889 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1890 ctx->constants[buf][idx],
1891 ctx->constants[buf][idx + 1]);
1892 }
1893 }
1894
1895 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1896 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1897 LLVMValueRef index;
1898 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1899 reg->Dimension.Index,
1900 SI_NUM_CONST_BUFFERS);
1901 bufp = build_indexed_load_const(ctx, ptr, index);
1902 } else
1903 bufp = ctx->const_buffers[buf];
1904
1905 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1906 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1907 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1908 addr = lp_build_add(&bld_base->uint_bld, addr,
1909 lp_build_const_int32(base->gallivm, idx * 4));
1910
1911 result = buffer_load_const(base->gallivm->builder, bufp,
1912 addr, ctx->f32);
1913
1914 if (!tgsi_type_is_64bit(type))
1915 result = bitcast(bld_base, type, result);
1916 else {
1917 LLVMValueRef addr2, result2;
1918 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1919 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1920 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1921 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1922 lp_build_const_int32(base->gallivm, idx * 4));
1923
1924 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1925 addr2, ctx->f32);
1926
1927 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1928 result, result2);
1929 }
1930 return result;
1931 }
1932
1933 /* Upper 16 bits must be zero. */
1934 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1935 LLVMValueRef val[2])
1936 {
1937 return LLVMBuildOr(gallivm->builder, val[0],
1938 LLVMBuildShl(gallivm->builder, val[1],
1939 lp_build_const_int32(gallivm, 16),
1940 ""), "");
1941 }
1942
1943 /* Upper 16 bits are ignored and will be dropped. */
1944 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1945 LLVMValueRef val[2])
1946 {
1947 LLVMValueRef v[2] = {
1948 LLVMBuildAnd(gallivm->builder, val[0],
1949 lp_build_const_int32(gallivm, 0xffff), ""),
1950 val[1],
1951 };
1952 return si_llvm_pack_two_int16(gallivm, v);
1953 }
1954
/* Initialize arguments for the shader export intrinsic.
 *
 * Fills args[0..8] for llvm.SI.export:
 *   args[0] = writemask, args[1] = valid-mask flag (EXEC), args[2] = done
 *   flag, args[3] = export target, args[4] = COMPR (packed f16) flag,
 *   args[5..8] = the four exported values.
 *
 * For fragment shaders the SPI color format from the shader key decides
 * how "values" is converted/packed; all other stages use the default
 * 32_ABGR (raw 32-bit) path.
 */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef val[4];
	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
	unsigned chan;
	bool is_int8;	/* only set (and only read) on the fragment-shader paths */

	/* Default is 0xf. Adjusted below depending on the format. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	if (ctx->type == PIPE_SHADER_FRAGMENT) {
		const union si_shader_key *key = &ctx->shader->key;
		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		/* Each color buffer has a 4-bit format field in the key. */
		assert(cbuf >= 0 && cbuf < 8);
		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
	}

	args[4] = uint->zero; /* COMPR flag */
	args[5] = base->undef;
	args[6] = base->undef;
	args[7] = base->undef;
	args[8] = base->undef;

	switch (spi_shader_col_format) {
	case V_028714_SPI_SHADER_ZERO:
		/* Nothing is written: export to NULL with an empty writemask. */
		args[0] = uint->zero; /* writemask */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
		break;

	case V_028714_SPI_SHADER_32_R:
		args[0] = uint->one; /* writemask */
		args[5] = values[0];
		break;

	case V_028714_SPI_SHADER_32_GR:
		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
		args[5] = values[0];
		args[6] = values[1];
		break;

	case V_028714_SPI_SHADER_32_AR:
		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
		args[5] = values[0];
		args[8] = values[3];
		break;

	case V_028714_SPI_SHADER_FP16_ABGR:
		/* Pack pairs of f32 into f16x2 via llvm.SI.packf16. */
		args[4] = uint->one; /* COMPR flag */

		for (chan = 0; chan < 2; chan++) {
			LLVMValueRef pack_args[2] = {
				values[2 * chan],
				values[2 * chan + 1]
			};
			LLVMValueRef packed;

			packed = lp_build_intrinsic(base->gallivm->builder,
						    "llvm.SI.packf16",
						    ctx->i32, pack_args, 2,
						    LLVMReadNoneAttribute);
			args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 packed, ctx->f32, "");
		}
		break;

	case V_028714_SPI_SHADER_UNORM16_ABGR:
		/* Clamp to [0,1], scale to 16-bit range, round to nearest. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 65535), "");
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  lp_build_const_float(gallivm, 0.5), "");
			val[chan] = LLVMBuildFPToUI(builder, val[chan],
						    ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_SNORM16_ABGR:
		for (chan = 0; chan < 4; chan++) {
			/* Clamp between [-1, 1]. */
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
							      values[chan],
							      lp_build_const_float(gallivm, 1));
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
							      val[chan],
							      lp_build_const_float(gallivm, -1));
			/* Convert to a signed integer in [-32767, 32767]. */
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 32767), "");
			/* If positive, add 0.5, else add -0.5. */
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  LLVMBuildSelect(builder,
								  LLVMBuildFCmp(builder, LLVMRealOGE,
										val[chan], base->zero, ""),
								  lp_build_const_float(gallivm, 0.5),
								  lp_build_const_float(gallivm, -0.5), ""), "");
			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_UINT16_ABGR: {
		/* 8-bit int color buffers clamp at 255, 16-bit at 65535. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							255 : 65535);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
							      val[chan], max);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_SINT16_ABGR: {
		/* Signed clamp range depends on 8-bit vs 16-bit buffers. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							127 : 32767);
		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
							-128 : -32768);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMIN,
							      val[chan], max);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMAX,
							      val[chan], min);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_32_ABGR:
		/* Raw 32-bit export: copy all four values unchanged. */
		memcpy(&args[5], values, sizeof(values[0]) * 4);
		break;
	}
}
2137
2138 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2139 LLVMValueRef alpha)
2140 {
2141 struct si_shader_context *ctx = si_shader_context(bld_base);
2142 struct gallivm_state *gallivm = bld_base->base.gallivm;
2143
2144 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2145 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2146 SI_PARAM_ALPHA_REF);
2147
2148 LLVMValueRef alpha_pass =
2149 lp_build_cmp(&bld_base->base,
2150 ctx->shader->key.ps.epilog.alpha_func,
2151 alpha, alpha_ref);
2152 LLVMValueRef arg =
2153 lp_build_select(&bld_base->base,
2154 alpha_pass,
2155 lp_build_const_float(gallivm, 1.0f),
2156 lp_build_const_float(gallivm, -1.0f));
2157
2158 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2159 ctx->voidt, &arg, 1, 0);
2160 } else {
2161 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2162 ctx->voidt, NULL, 0, 0);
2163 }
2164 }
2165
2166 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2167 LLVMValueRef alpha,
2168 unsigned samplemask_param)
2169 {
2170 struct si_shader_context *ctx = si_shader_context(bld_base);
2171 struct gallivm_state *gallivm = bld_base->base.gallivm;
2172 LLVMValueRef coverage;
2173
2174 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2175 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2176 samplemask_param);
2177 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2178
2179 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2180 ctx->i32,
2181 &coverage, 1, LLVMReadNoneAttribute);
2182
2183 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2184 ctx->f32, "");
2185
2186 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2187 lp_build_const_float(gallivm,
2188 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2189
2190 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2191 }
2192
/* Compute up to 8 user clip distances from a CLIPVERTEX output.
 *
 * Each clip distance is the dot product of the clip vertex (out_elts,
 * 4 components) with one user clip plane read from the internal
 * SI_VS_CONST_CLIP_PLANES constant buffer.  The results fill the export
 * argument arrays pos_args[2] and pos_args[3] (POS+2 and POS+3 targets,
 * 4 distances each); the caller emits the actual exports.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start all four accumulators (export values) at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* args[1] is reused as a scratch byte offset
				 * into the clip-plane constant buffer; it is
				 * overwritten with the real export arg below. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
							     args[1], ctx->f32);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Fill in the fixed export arguments (writemask, flags, target). */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
2239
2240 static void si_dump_streamout(struct pipe_stream_output_info *so)
2241 {
2242 unsigned i;
2243
2244 if (so->num_outputs)
2245 fprintf(stderr, "STREAMOUT\n");
2246
2247 for (i = 0; i < so->num_outputs; i++) {
2248 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2249 so->output[i].start_component;
2250 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2251 i, so->output[i].output_buffer,
2252 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2253 so->output[i].register_index,
2254 mask & 1 ? "x" : "",
2255 mask & 2 ? "y" : "",
2256 mask & 4 ? "z" : "",
2257 mask & 8 ? "w" : "");
2258 }
2259 }
2260
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Emits code that, for the threads selected by the streamout config SGPR,
 * packs each enabled output into 1/2/4 dwords and stores it to the bound
 * streamout buffer at streamout_offset*4 + (write_index + tid)*stride +
 * attrib_offset.  Stores are additionally predicated on the output's
 * stream matching the currently active stream id.
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Currently selected output stream, bits [25:24] of the config SGPR. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			/* The offset SGPR is in dwords; convert to bytes. */
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			/* Defensive: skip malformed outputs rather than
			 * emitting bogus stores. */
			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store outputs belonging to the active stream. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2377
2378
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if enabled), then one export per output:
 * parameter outputs go to PARAM slots (recording each slot in
 * vs_output_param_offset), position-like outputs (POS, PSIZE, EDGEFLAG,
 * LAYER, VIEWPORT_INDEX, CLIPDIST/CLIPVERTEX) are collected into
 * pos_args[0..3] and exported last so the final position export can carry
 * the "done" bit.  A default position (0,0,0,1) is synthesized when the
 * shader writes none.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

		/* Some semantics are handled by rewriting semantic_name and
		 * jumping back here (LAYER/VIEWPORT_INDEX/CLIPDIST also get
		 * a GENERIC parameter export in addition to their POS use). */
handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector (POS+1) export below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the misc vector (POS+1) export below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Fills pos_args[2]/pos_args[3] with computed clip
			 * distances; no direct export here. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Defer position exports so the last one can be
			 * marked "done" below. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2553
/* Copy TCS inputs that are passed through unmodified from LDS to the
 * off-chip tess ring, so the TES can read them.
 *
 * inputs_to_copy in the TCS epilog key is a bitmask of input slots; each
 * set bit is loaded (4 dwords) from the current patch's LDS region for
 * this invocation and stored to the TESS_OFFCHIP ring buffer.
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	/* Descriptor for the off-chip tessellation ring. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
	                lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);

	/* LDS address of this invocation's vertex within the current patch. */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
	while (inputs) {
		/* Process one set bit (input slot) at a time. */
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                        lp_build_const_int32(gallivm, 4 * i),
		                         "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
		                                      invocation_id,
		                                  lp_build_const_int32(gallivm, i));

		/* Load the whole vec4 from LDS and store it to the ring. */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
		                           buffer_offset, 0);
	}
}
2595
/* Write the tessellation factors for the current patch to the tess factor
 * ring buffer.
 *
 * Loads the TESSINNER/TESSOUTER levels from LDS (any invocation may have
 * written them), packs them according to the patch primitive mode, and
 * stores them - plus, for the first patch only, the dynamic HS control
 * word - to the TESS_FACTOR ring.  Only invocation 0 of each patch does
 * the work.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations' LDS writes are visible first. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer levels first, then inner levels (out[] holds up to 6 dwords). */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only the first patch of the threadgroup writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2701
/* This only writes the tessellation factor levels.
 *
 * Monolithic shaders do the work inline (copy passthrough inputs, write
 * tess factors).  Non-monolithic shaders instead return the values the
 * separately-compiled epilog needs (RW_BUFFERS pointer split into two
 * SGPRs, the tess factor buffer soffset, and three VGPRs) via
 * ctx->return_value.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer: split the 64-bit pointer into two
		 * i32s so it can travel through SGPR return slots 0/1. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs: returned as floats because the return struct's
		 * VGPR slots are f32. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2753
2754 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2755 {
2756 struct si_shader_context *ctx = si_shader_context(bld_base);
2757 struct si_shader *shader = ctx->shader;
2758 struct tgsi_shader_info *info = &shader->selector->info;
2759 struct gallivm_state *gallivm = bld_base->base.gallivm;
2760 unsigned i, chan;
2761 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2762 ctx->param_rel_auto_id);
2763 LLVMValueRef vertex_dw_stride =
2764 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2765 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2766 vertex_dw_stride, "");
2767
2768 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2769 * its inputs from it. */
2770 for (i = 0; i < info->num_outputs; i++) {
2771 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2772 unsigned name = info->output_semantic_name[i];
2773 unsigned index = info->output_semantic_index[i];
2774 int param = si_shader_io_get_unique_index(name, index);
2775 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2776 lp_build_const_int32(gallivm, param * 4), "");
2777
2778 for (chan = 0; chan < 4; chan++) {
2779 lds_store(bld_base, chan, dw_addr,
2780 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2781 }
2782 }
2783 }
2784
/* ES (VS/TES-before-GS) epilogue: store all outputs to the ESGS ring
 * buffer so the following GS stage can read them.
 *
 * VIEWPORT_INDEX and LAYER are skipped; each remaining output channel is
 * bitcast to i32 and stored at (4 * unique_slot + channel) * 4 bytes past
 * the es2gs offset.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		/* These are consumed by the export path, not the GS. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* One-dword store per channel; the voffset is undef
			 * because the immediate offset addresses the slot. */
			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2823
2824 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2825 {
2826 struct si_shader_context *ctx = si_shader_context(bld_base);
2827 struct gallivm_state *gallivm = bld_base->base.gallivm;
2828 LLVMValueRef args[2];
2829
2830 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2831 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2832 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2833 ctx->voidt, args, 2, 0);
2834 }
2835
/* VS epilog: optionally clamp vertex colors (controlled by a state bit in
 * a user SGPR), gather all outputs from their SOA allocas and hand them to
 * si_llvm_export_vs. PrimitiveID is either exported here (monolithic) or
 * returned to a separately-compiled epilog part. */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->is_gs_copy_shader);

	/* +1 reserves a slot for the optional PrimitiveID output added below.
	 * NOTE(review): allocation result is not checked, consistent with
	 * surrounding code. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
						    SI_PARAM_VS_STATE_BITS);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				/* Open the IF once, on the first color found;
				 * all later colors are clamped inside it too. */
				lp_build_if(&if_ctx, gallivm, cond);
			}

			for (j = 0; j < 4; j++) {
				addr = ctx->radeon_bld.soa.outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = radeon_llvm_saturate(bld_base, val);
				/* Store back so the export loop below reads
				 * the clamped value. */
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Read all outputs back from their SOA allocas. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	if (ctx->is_monolithic) {
		/* Export PrimitiveID when PS needs it. */
		if (si_vs_exports_prim_id(ctx->shader)) {
			outputs[i].name = TGSI_SEMANTIC_PRIMID;
			outputs[i].sid = 0;
			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
						       get_primitive_id(bld_base, 0));
			outputs[i].values[1] = bld_base->base.undef;
			outputs[i].values[2] = bld_base->base.undef;
			outputs[i].values[3] = bld_base->base.undef;
			i++;
		}
	} else {
		/* Return the primitive ID from the LLVM function. */
		ctx->return_value =
			LLVMBuildInsertValue(gallivm->builder,
					     ctx->return_value,
					     bitcast(bld_base, TGSI_TYPE_FLOAT,
						     get_primitive_id(bld_base, 0)),
					     VS_EPILOG_PRIMID_LOC, "");
	}

	/* i now counts num_outputs plus the optional PrimitiveID slot. */
	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
2922
/* Queue of "llvm.SI.export" argument lists. Pixel-shader exports are
 * collected here and emitted together at the end of the shader by
 * si_emit_ps_exports(). Each entry holds the 9 intrinsic arguments. */
struct si_ps_exports {
	unsigned num;			/* number of queued exports in args[] */
	LLVMValueRef args[10][9];
};
2927
/* Queue the MRTZ export carrying depth, stencil and/or sample mask.
 * At least one of the three values must be non-NULL. */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;	/* writemask: which of R/G/B is written */

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMP flag */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	if (depth) {
		args[5] = depth;
		mask |= 0x1;
	}

	if (stencil) {
		args[6] = stencil;
		mask |= 0x2;
	}

	if (samplemask) {
		args[7] = samplemask;
		mask |= 0x4;
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	memcpy(exp->args[exp->num++], args, sizeof(args));
}
2978
/* Queue the color export(s) for one color output.
 *
 * Applies the epilog key's post-processing (clamping, alpha-to-one,
 * alpha test, smoothing) to \p color, then queues either one export to
 * MRT \p index, or — when FS_COLOR0_WRITES_ALL_CBUFS is in effect —
 * one export per enabled colorbuffer. \p is_last marks the final color
 * export of the shader so the DONE bit can be set on it. */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only performed on color output 0. */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			/* args[c][0] == 0 means no channels are enabled. */
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		memcpy(exp->args[exp->num++], args, sizeof(args));
	}
}
3045
3046 static void si_emit_ps_exports(struct si_shader_context *ctx,
3047 struct si_ps_exports *exp)
3048 {
3049 for (unsigned i = 0; i < exp->num; i++)
3050 lp_build_intrinsic(ctx->radeon_bld.gallivm.builder,
3051 "llvm.SI.export", ctx->voidt,
3052 exp->args[i], 9, 0);
3053 }
3054
3055 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3056 {
3057 struct si_shader_context *ctx = si_shader_context(bld_base);
3058 struct lp_build_context *base = &bld_base->base;
3059 struct lp_build_context *uint = &bld_base->uint_bld;
3060 LLVMValueRef args[9];
3061
3062 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3063 args[1] = uint->one; /* whether the EXEC mask is valid */
3064 args[2] = uint->one; /* DONE bit */
3065 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3066 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3067 args[5] = uint->undef; /* R */
3068 args[6] = uint->undef; /* G */
3069 args[7] = uint->undef; /* B */
3070 args[8] = uint->undef; /* A */
3071
3072 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3073 ctx->voidt, args, 9, 0);
3074 }
3075
/* PS epilog (monolithic path): read all shader outputs, queue the color
 * and MRTZ exports, and emit them. Exports a NULL export instead when
 * nothing would be written. */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;
	struct si_ps_exports exp = {};

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* Each colorbuffer has a 4-bit format field in
			 * spi_format. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Only .z carries the fragment depth. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i, &exp);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ is queued last, after all color exports. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);

	si_emit_ps_exports(ctx, &exp);
}
3167
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only .z carries the fragment depth. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		/* Skip color outputs that were never written; a written
		 * color always has all 4 channels set above. */
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3261
3262 /**
3263 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3264 * buffer in number of elements and return it as an i32.
3265 */
3266 static LLVMValueRef get_buffer_size(
3267 struct lp_build_tgsi_context *bld_base,
3268 LLVMValueRef descriptor)
3269 {
3270 struct si_shader_context *ctx = si_shader_context(bld_base);
3271 struct gallivm_state *gallivm = bld_base->base.gallivm;
3272 LLVMBuilderRef builder = gallivm->builder;
3273 LLVMValueRef size =
3274 LLVMBuildExtractElement(builder, descriptor,
3275 lp_build_const_int32(gallivm, 6), "");
3276
3277 if (ctx->screen->b.chip_class >= VI) {
3278 /* On VI, the descriptor contains the size in bytes,
3279 * but TXQ must return the size in elements.
3280 * The stride is always non-zero for resources using TXQ.
3281 */
3282 LLVMValueRef stride =
3283 LLVMBuildExtractElement(builder, descriptor,
3284 lp_build_const_int32(gallivm, 5), "");
3285 stride = LLVMBuildLShr(builder, stride,
3286 lp_build_const_int32(gallivm, 16), "");
3287 stride = LLVMBuildAnd(builder, stride,
3288 lp_build_const_int32(gallivm, 0x3FFF), "");
3289
3290 size = LLVMBuildUDiv(builder, size, stride, "");
3291 }
3292
3293 return size;
3294 }
3295
3296 /**
3297 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3298 * intrinsic names).
3299 */
3300 static void build_int_type_name(
3301 LLVMTypeRef type,
3302 char *buf, unsigned bufsize)
3303 {
3304 assert(bufsize >= 6);
3305
3306 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
3307 snprintf(buf, bufsize, "v%ui32",
3308 LLVMGetVectorSize(type));
3309 else
3310 strcpy(buf, "i32");
3311 }
3312
3313 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3314 struct lp_build_tgsi_context *bld_base,
3315 struct lp_build_emit_data *emit_data);
3316
3317 /* Prevent optimizations (at least of memory accesses) across the current
3318 * point in the program by emitting empty inline assembly that is marked as
3319 * having side effects.
3320 */
3321 static void emit_optimization_barrier(struct si_shader_context *ctx)
3322 {
3323 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3324 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3325 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3326 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3327 }
3328
/* Wait for outstanding memory operations to complete before continuing,
 * used to implement coherent/volatile memory semantics. */
static void emit_waitcnt(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef args[1] = {
		/* s_waitcnt immediate 0xf70 — presumably waits only on
		 * vmcnt(0), leaving expcnt/lgkmcnt unwaited; TODO confirm
		 * against the GCN ISA waitcnt bitfield layout. */
		lp_build_const_int32(gallivm, 0xf70)
	};
	lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
			   ctx->voidt, args, 1, 0);
}
3339
/* TGSI MEMBAR: implemented as a full wait on outstanding memory ops. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3349
3350 static LLVMValueRef
3351 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3352 const struct tgsi_full_src_register *reg)
3353 {
3354 LLVMValueRef ind_index;
3355 LLVMValueRef rsrc_ptr;
3356
3357 if (!reg->Register.Indirect)
3358 return ctx->shader_buffers[reg->Register.Index];
3359
3360 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
3361 reg->Register.Index,
3362 SI_NUM_SHADER_BUFFERS);
3363
3364 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
3365 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3366 }
3367
3368 static bool tgsi_is_array_sampler(unsigned target)
3369 {
3370 return target == TGSI_TEXTURE_1D_ARRAY ||
3371 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3372 target == TGSI_TEXTURE_2D_ARRAY ||
3373 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3374 target == TGSI_TEXTURE_CUBE_ARRAY ||
3375 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3376 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3377 }
3378
3379 static bool tgsi_is_array_image(unsigned target)
3380 {
3381 return target == TGSI_TEXTURE_3D ||
3382 target == TGSI_TEXTURE_CUBE ||
3383 target == TGSI_TEXTURE_1D_ARRAY ||
3384 target == TGSI_TEXTURE_2D_ARRAY ||
3385 target == TGSI_TEXTURE_CUBE_ARRAY ||
3386 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3387 }
3388
3389 /**
3390 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3391 *
3392 * At least on Tonga, executing image stores on images with DCC enabled and
3393 * non-trivial can eventually lead to lockups. This can occur when an
3394 * application binds an image as read-only but then uses a shader that writes
3395 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3396 * program termination) in this case, but it doesn't cost much to be a bit
3397 * nicer: disabling DCC in the shader still leads to undefined results but
3398 * avoids the lockup.
3399 */
3400 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3401 LLVMValueRef rsrc)
3402 {
3403 if (ctx->screen->b.chip_class <= CIK) {
3404 return rsrc;
3405 } else {
3406 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3407 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3408 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3409 LLVMValueRef tmp;
3410
3411 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3412 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3413 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3414 }
3415 }
3416
/**
 * Load the resource descriptor for \p image.
 *
 * \param dcc_off  if true, force the descriptor's DCC enable bit off
 *                 (needed for image stores, see force_dcc_off)
 * \param rsrc     receives the v8i32 resource descriptor
 */
static void
image_fetch_rsrc(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *image,
	bool dcc_off,
	LLVMValueRef *rsrc)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	assert(image->Register.File == TGSI_FILE_IMAGE);

	if (!image->Register.Indirect) {
		/* Fast path: use preloaded resources */
		*rsrc = ctx->images[image->Register.Index];
	} else {
		/* Indexing and manual load */
		LLVMValueRef ind_index;
		LLVMValueRef rsrc_ptr;
		LLVMValueRef tmp;

		/* From the GL_ARB_shader_image_load_store extension spec:
		 *
		 *    If a shader performs an image load, store, or atomic
		 *    operation using an image variable declared as an array,
		 *    and if the index used to select an individual element is
		 *    negative or greater than or equal to the size of the
		 *    array, the results of the operation are undefined but may
		 *    not lead to termination.
		 */
		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
						       image->Register.Index,
						       SI_NUM_IMAGES);

		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
		/* NOTE(review): DCC is only forced off on this indirect
		 * path; the preloaded descriptors above are presumably
		 * handled elsewhere when dcc_off is needed — verify. */
		if (dcc_off)
			tmp = force_dcc_off(ctx, tmp);
		*rsrc = tmp;
	}
}
3460
3461 static LLVMValueRef image_fetch_coords(
3462 struct lp_build_tgsi_context *bld_base,
3463 const struct tgsi_full_instruction *inst,
3464 unsigned src)
3465 {
3466 struct gallivm_state *gallivm = bld_base->base.gallivm;
3467 LLVMBuilderRef builder = gallivm->builder;
3468 unsigned target = inst->Memory.Texture;
3469 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3470 LLVMValueRef coords[4];
3471 LLVMValueRef tmp;
3472 int chan;
3473
3474 for (chan = 0; chan < num_coords; ++chan) {
3475 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3476 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3477 coords[chan] = tmp;
3478 }
3479
3480 if (num_coords == 1)
3481 return coords[0];
3482
3483 if (num_coords == 3) {
3484 /* LLVM has difficulties lowering 3-element vectors. */
3485 coords[3] = bld_base->uint_bld.undef;
3486 num_coords = 4;
3487 }
3488
3489 return lp_build_gather_values(gallivm, coords, num_coords);
3490 }
3491
/**
 * Append the extra mode bits that are used by image load and store.
 *
 * Appends, in order: r128, da, glc (non-atomic only), slc.
 */
static void image_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data * emit_data,
		unsigned target,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
	emit_data->args[emit_data->arg_count++] =
		tgsi_is_array_image(target) ? i1true : i1false; /* da */
	if (!atomic) {
		/* Coherent/volatile accesses bypass the L1 cache (glc). */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3515
3516 /**
3517 * Given a 256 bit resource, extract the top half (which stores the buffer
3518 * resource in the case of textures and images).
3519 */
3520 static LLVMValueRef extract_rsrc_top_half(
3521 struct si_shader_context *ctx,
3522 LLVMValueRef rsrc)
3523 {
3524 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3525 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3526 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3527
3528 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3529 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3530 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3531
3532 return rsrc;
3533 }
3534
/**
 * Append the resource and indexing arguments for buffer intrinsics.
 *
 * Appends, in order: rsrc, vindex, voffset, glc (non-atomic only), slc.
 *
 * \param rsrc the v4i32 buffer resource
 * \param index index into the buffer (stride-based)
 * \param offset byte offset into the buffer
 */
static void buffer_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		LLVMValueRef rsrc,
		LLVMValueRef index,
		LLVMValueRef offset,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = rsrc;
	emit_data->args[emit_data->arg_count++] = index; /* vindex */
	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
	if (!atomic) {
		/* Coherent/volatile accesses bypass the L1 cache (glc). */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3564
/* Fetch-args hook for TGSI LOAD: gather the intrinsic arguments for a
 * buffer (SSBO), image-buffer or image load into emit_data->args.
 * Memory (LDS) loads are handled entirely in load_emit instead. */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	/* All loads produce a 4-component float vector. */
	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use the buffer path: the coordinate
			 * acts as the element index (vindex). */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3609
/* Emit an SSBO load using the narrowest amdgcn.buffer.load variant that
 * covers the destination writemask (1, 2 or 4 components; a 3-component
 * mask is rounded up to 4 since there is no v3f32 variant). */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: // 3 & 4
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute);
}
3641
3642 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3643 const struct tgsi_full_instruction *inst,
3644 LLVMTypeRef type, int arg)
3645 {
3646 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3647 LLVMBuilderRef builder = gallivm->builder;
3648 LLVMValueRef offset, ptr;
3649 int addr_space;
3650
3651 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3652 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3653
3654 ptr = ctx->shared_memory;
3655 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3656 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3657 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3658
3659 return ptr;
3660 }
3661
3662 static void load_emit_memory(
3663 struct si_shader_context *ctx,
3664 struct lp_build_emit_data *emit_data)
3665 {
3666 const struct tgsi_full_instruction *inst = emit_data->inst;
3667 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3668 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3669 LLVMBuilderRef builder = gallivm->builder;
3670 unsigned writemask = inst->Dst[0].Register.WriteMask;
3671 LLVMValueRef channels[4], ptr, derived_ptr, index;
3672 int chan;
3673
3674 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3675
3676 for (chan = 0; chan < 4; ++chan) {
3677 if (!(writemask & (1 << chan))) {
3678 channels[chan] = LLVMGetUndef(base->elem_type);
3679 continue;
3680 }
3681
3682 index = lp_build_const_int32(gallivm, chan);
3683 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3684 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3685 }
3686 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3687 }
3688
/* Emit hook for TGSI LOAD: dispatch to the shared-memory, SSBO,
 * buffer-image or image load path. Volatile loads are preceded by a
 * waitcnt to order them against earlier memory operations. */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	} else {
		/* The image.load intrinsic name is suffixed with the
		 * coordinate type, e.g. "llvm.amdgcn.image.load.v4i32". */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	}
}
3734
/* Fetch-args hook for TGSI STORE: gather the value to store (Src[1])
 * plus the destination resource/addressing arguments for a buffer
 * (SSBO), image-buffer or image store into emit_data->args. */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* The value to store is Src[1], gathered into a 4-vector. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination (Dst[0]) names the resource; view it as a
	 * source register for the fetch helpers. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Src[0].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[1] = coords;
			/* dcc_off=true: image stores must not go through DCC
			 * (see force_dcc_off). */
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3793
/* Emit an SSBO store prepared by store_fetch_args.
 *
 * args[0] holds the v4f32 value and args[3] the byte offset.  The
 * destination writemask is split into runs of consecutive channels,
 * and one buffer.store intrinsic is emitted per run.
 */
static void store_emit_buffer(
	struct si_shader_context *ctx,
	struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		/* Pop the lowest run of consecutive set bits. */
		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Put the third channel back for the next iteration. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			/* Full vector: store args[0] unchanged. */
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from channels start and start + 1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first channel
		 * (4 bytes per channel).
		 */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count, 0);
	}
}
3865
3866 static void store_emit_memory(
3867 struct si_shader_context *ctx,
3868 struct lp_build_emit_data *emit_data)
3869 {
3870 const struct tgsi_full_instruction *inst = emit_data->inst;
3871 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3872 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3873 LLVMBuilderRef builder = gallivm->builder;
3874 unsigned writemask = inst->Dst[0].Register.WriteMask;
3875 LLVMValueRef ptr, derived_ptr, data, index;
3876 int chan;
3877
3878 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3879
3880 for (chan = 0; chan < 4; ++chan) {
3881 if (!(writemask & (1 << chan))) {
3882 continue;
3883 }
3884 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3885 index = lp_build_const_int32(gallivm, chan);
3886 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3887 LLVMBuildStore(builder, data, derived_ptr);
3888 }
3889 }
3890
3891 static void store_emit(
3892 const struct lp_build_tgsi_action *action,
3893 struct lp_build_tgsi_context *bld_base,
3894 struct lp_build_emit_data *emit_data)
3895 {
3896 struct si_shader_context *ctx = si_shader_context(bld_base);
3897 struct gallivm_state *gallivm = bld_base->base.gallivm;
3898 LLVMBuilderRef builder = gallivm->builder;
3899 const struct tgsi_full_instruction * inst = emit_data->inst;
3900 unsigned target = inst->Memory.Texture;
3901 char intrinsic_name[32];
3902 char coords_type[8];
3903
3904 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3905 store_emit_memory(ctx, emit_data);
3906 return;
3907 }
3908
3909 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3910 emit_waitcnt(ctx);
3911
3912 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3913 store_emit_buffer(ctx, emit_data);
3914 return;
3915 }
3916
3917 if (target == TGSI_TEXTURE_BUFFER) {
3918 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3919 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3920 emit_data->dst_type, emit_data->args,
3921 emit_data->arg_count, 0);
3922 } else {
3923 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3924 coords_type, sizeof(coords_type));
3925 snprintf(intrinsic_name, sizeof(intrinsic_name),
3926 "llvm.amdgcn.image.store.%s", coords_type);
3927
3928 emit_data->output[emit_data->chan] =
3929 lp_build_intrinsic(
3930 builder, intrinsic_name, emit_data->dst_type,
3931 emit_data->args, emit_data->arg_count, 0);
3932 }
3933 }
3934
/* Prepare the argument list for a TGSI atomic instruction on an SSBO
 * or image.
 *
 * Source operands: 0 = resource, 1 = offset or coordinates,
 * 2 = value, 3 = comparison value (ATOMCAS only).
 */
static void atomic_fetch_args(
	struct lp_build_tgsi_context * bld_base,
	struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	/* Atomics return the value that was in memory before the operation. */
	emit_data->dst_type = bld_base->base.elem_type;

	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Source operand 1 holds the byte offset into the SSBO. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* Buffer images use the buffer-resource form of the
		 * descriptor (extracted below), not the full image one.
		 */
		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3994
3995 static void atomic_emit_memory(struct si_shader_context *ctx,
3996 struct lp_build_emit_data *emit_data) {
3997 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3998 LLVMBuilderRef builder = gallivm->builder;
3999 const struct tgsi_full_instruction * inst = emit_data->inst;
4000 LLVMValueRef ptr, result, arg;
4001
4002 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4003
4004 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
4005 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4006
4007 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4008 LLVMValueRef new_data;
4009 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4010 inst, 3, 0);
4011
4012 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4013
4014 #if HAVE_LLVM >= 0x309
4015 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4016 LLVMAtomicOrderingSequentiallyConsistent,
4017 LLVMAtomicOrderingSequentiallyConsistent,
4018 false);
4019 #endif
4020
4021 result = LLVMBuildExtractValue(builder, result, 0, "");
4022 } else {
4023 LLVMAtomicRMWBinOp op;
4024
4025 switch(inst->Instruction.Opcode) {
4026 case TGSI_OPCODE_ATOMUADD:
4027 op = LLVMAtomicRMWBinOpAdd;
4028 break;
4029 case TGSI_OPCODE_ATOMXCHG:
4030 op = LLVMAtomicRMWBinOpXchg;
4031 break;
4032 case TGSI_OPCODE_ATOMAND:
4033 op = LLVMAtomicRMWBinOpAnd;
4034 break;
4035 case TGSI_OPCODE_ATOMOR:
4036 op = LLVMAtomicRMWBinOpOr;
4037 break;
4038 case TGSI_OPCODE_ATOMXOR:
4039 op = LLVMAtomicRMWBinOpXor;
4040 break;
4041 case TGSI_OPCODE_ATOMUMIN:
4042 op = LLVMAtomicRMWBinOpUMin;
4043 break;
4044 case TGSI_OPCODE_ATOMUMAX:
4045 op = LLVMAtomicRMWBinOpUMax;
4046 break;
4047 case TGSI_OPCODE_ATOMIMIN:
4048 op = LLVMAtomicRMWBinOpMin;
4049 break;
4050 case TGSI_OPCODE_ATOMIMAX:
4051 op = LLVMAtomicRMWBinOpMax;
4052 break;
4053 default:
4054 unreachable("unknown atomic opcode");
4055 }
4056
4057 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4058 LLVMAtomicOrderingSequentiallyConsistent,
4059 false);
4060 }
4061 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4062 }
4063
4064 static void atomic_emit(
4065 const struct lp_build_tgsi_action *action,
4066 struct lp_build_tgsi_context *bld_base,
4067 struct lp_build_emit_data *emit_data)
4068 {
4069 struct si_shader_context *ctx = si_shader_context(bld_base);
4070 struct gallivm_state *gallivm = bld_base->base.gallivm;
4071 LLVMBuilderRef builder = gallivm->builder;
4072 const struct tgsi_full_instruction * inst = emit_data->inst;
4073 char intrinsic_name[40];
4074 LLVMValueRef tmp;
4075
4076 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4077 atomic_emit_memory(ctx, emit_data);
4078 return;
4079 }
4080
4081 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4082 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4083 snprintf(intrinsic_name, sizeof(intrinsic_name),
4084 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4085 } else {
4086 char coords_type[8];
4087
4088 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4089 coords_type, sizeof(coords_type));
4090 snprintf(intrinsic_name, sizeof(intrinsic_name),
4091 "llvm.amdgcn.image.atomic.%s.%s",
4092 action->intr_name, coords_type);
4093 }
4094
4095 tmp = lp_build_intrinsic(
4096 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4097 emit_data->args, emit_data->arg_count, 0);
4098 emit_data->output[emit_data->chan] =
4099 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4100 }
4101
/* Prepare arguments for TGSI RESQ (resource size query).
 *
 * SSBOs and buffer images only need their descriptor; for all other
 * image targets the full llvm.SI.getresinfo.i32 argument list is
 * built (mip level, rsrc, then immediate flags).
 */
static void resq_fetch_args(
	struct lp_build_tgsi_context * bld_base,
	struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	/* The query always yields a 4-component vector. */
	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->args[0] = image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]), emit_data->args[0];
		emit_data->arg_count = 1;
	} else {
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4134
4135 static void resq_emit(
4136 const struct lp_build_tgsi_action *action,
4137 struct lp_build_tgsi_context *bld_base,
4138 struct lp_build_emit_data *emit_data)
4139 {
4140 struct gallivm_state *gallivm = bld_base->base.gallivm;
4141 LLVMBuilderRef builder = gallivm->builder;
4142 const struct tgsi_full_instruction *inst = emit_data->inst;
4143 LLVMValueRef out;
4144
4145 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4146 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4147 lp_build_const_int32(gallivm, 2), "");
4148 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4149 out = get_buffer_size(bld_base, emit_data->args[0]);
4150 } else {
4151 out = lp_build_intrinsic(
4152 builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
4153 emit_data->args, emit_data->arg_count,
4154 LLVMReadNoneAttribute);
4155
4156 /* Divide the number of layers by 6 to get the number of cubes. */
4157 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
4158 LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
4159 LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
4160
4161 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4162 z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
4163 z = LLVMBuildSDiv(builder, z, imm6, "");
4164 z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
4165 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4166 }
4167 }
4168
4169 emit_data->output[emit_data->chan] = out;
4170 }
4171
/* Build the argument list shared by the texture intrinsics.
 *
 * Packs the address vector (padded to a power-of-two element count),
 * the resource, optionally the sampler state, and the trailing
 * immediate flags (dmask, unorm, r128, da, glc, slc, tfe, lwe).
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF and TXQ take no sampler and return integer data. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	/* Rect textures use unnormalized coordinates (unorm flag). */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
							   tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4217
/* Forward declaration; the texture action table is defined later in
 * this file but is already needed by the FMASK TXF in tex_fetch_args.
 */
static const struct lp_build_tgsi_action tex_action;

/* Which descriptor of a sampler unit to load; see
 * get_sampler_desc_custom for the slot layout.
 */
enum desc_type {
	DESC_IMAGE,
	DESC_FMASK,
	DESC_SAMPLER
};
4225
4226 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4227 {
4228 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4229 CONST_ADDR_SPACE);
4230 }
4231
/**
 * Load an image view, fmask view, or sampler state descriptor from the
 * given descriptor list.
 *
 * The case bodies rescale the unit index into an element index of the
 * list; the byte comments give the dword range of each descriptor
 * within its unit.
 */
static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
					    LLVMValueRef list, LLVMValueRef index,
					    enum desc_type type)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	switch (type) {
	case DESC_IMAGE:
		/* The image is at [0:7]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		break;
	case DESC_FMASK:
		/* The FMASK is at [8:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
		break;
	case DESC_SAMPLER:
		/* The sampler state is at [12:15]; recast the list to v4i32
		 * elements so the 4-dword sampler words can be indexed
		 * directly.
		 */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
		list = LLVMBuildPointerCast(builder, list,
					    const_array(ctx->v4i32, 0), "");
		break;
	}

	return build_indexed_load_const(ctx, list, index);
}
4263
4264 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4265 LLVMValueRef index, enum desc_type type)
4266 {
4267 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4268 SI_PARAM_SAMPLERS);
4269
4270 return get_sampler_desc_custom(ctx, list, index, type);
4271 }
4272
/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
 *
 * SI-CI:
 *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
 *   filtering manually. The driver sets img7 to a mask clearing
 *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
 *     s_and_b32 samp0, samp0, img7
 *
 * VI:
 *   The ANISO_OVERRIDE sampler field enables this fix in TA.
 */
static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
					   LLVMValueRef res, LLVMValueRef samp)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef img7, samp0;

	/* The hardware handles this itself on VI and newer. */
	if (ctx->screen->b.chip_class >= VI)
		return samp;

	/* samp0 &= img7, then put the masked word back into the sampler. */
	img7 = LLVMBuildExtractElement(builder, res,
				       LLVMConstInt(ctx->i32, 7, 0), "");
	samp0 = LLVMBuildExtractElement(builder, samp,
					LLVMConstInt(ctx->i32, 0, 0), "");
	samp0 = LLVMBuildAnd(builder, samp0, img7, "");
	return LLVMBuildInsertElement(builder, samp, samp0,
				      LLVMConstInt(ctx->i32, 0, 0), "");
}
4301
/* Resolve the resource, sampler-state and FMASK descriptors for a
 * texture instruction.
 *
 * The sampler operand is always the last source register.  For
 * indirectly indexed samplers the descriptors are loaded from the
 * sampler list; otherwise the preloaded per-unit values from the
 * context are used.  MSAA targets get an FMASK instead of a sampler
 * state.  samp_ptr and fmask_ptr may be NULL if the caller does not
 * need them.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Clamp the computed index so it cannot address outside the
		 * sampler list.
		 */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			/* MSAA: no sampler state, but an FMASK. */
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				/* Apply the SI-CI anisotropy workaround. */
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		/* Direct indexing: use the preloaded per-unit descriptors. */
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4349
4350 static void txq_fetch_args(
4351 struct lp_build_tgsi_context *bld_base,
4352 struct lp_build_emit_data *emit_data)
4353 {
4354 struct si_shader_context *ctx = si_shader_context(bld_base);
4355 struct gallivm_state *gallivm = bld_base->base.gallivm;
4356 LLVMBuilderRef builder = gallivm->builder;
4357 const struct tgsi_full_instruction *inst = emit_data->inst;
4358 unsigned target = inst->Texture.Texture;
4359 LLVMValueRef res_ptr;
4360 LLVMValueRef address;
4361
4362 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4363
4364 if (target == TGSI_TEXTURE_BUFFER) {
4365 /* Read the size from the buffer descriptor directly. */
4366 LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4367 emit_data->args[0] = get_buffer_size(bld_base, res);
4368 return;
4369 }
4370
4371 /* Textures - set the mip level. */
4372 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4373
4374 set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
4375 NULL, &address, 1, 0xf);
4376 }
4377
4378 static void txq_emit(const struct lp_build_tgsi_action *action,
4379 struct lp_build_tgsi_context *bld_base,
4380 struct lp_build_emit_data *emit_data)
4381 {
4382 struct lp_build_context *base = &bld_base->base;
4383 unsigned target = emit_data->inst->Texture.Texture;
4384
4385 if (target == TGSI_TEXTURE_BUFFER) {
4386 /* Just return the buffer size. */
4387 emit_data->output[emit_data->chan] = emit_data->args[0];
4388 return;
4389 }
4390
4391 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4392 base->gallivm->builder, "llvm.SI.getresinfo.i32",
4393 emit_data->dst_type, emit_data->args, emit_data->arg_count,
4394 LLVMReadNoneAttribute);
4395
4396 /* Divide the number of layers by 6 to get the number of cubes. */
4397 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4398 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4399 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4400 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4401 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4402
4403 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4404 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4405 z = LLVMBuildSDiv(builder, z, six, "");
4406
4407 emit_data->output[emit_data->chan] =
4408 LLVMBuildInsertElement(builder, v4, z, two, "");
4409 }
4410 }
4411
4412 static void tex_fetch_args(
4413 struct lp_build_tgsi_context *bld_base,
4414 struct lp_build_emit_data *emit_data)
4415 {
4416 struct si_shader_context *ctx = si_shader_context(bld_base);
4417 struct gallivm_state *gallivm = bld_base->base.gallivm;
4418 const struct tgsi_full_instruction *inst = emit_data->inst;
4419 unsigned opcode = inst->Instruction.Opcode;
4420 unsigned target = inst->Texture.Texture;
4421 LLVMValueRef coords[5], derivs[6];
4422 LLVMValueRef address[16];
4423 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4424 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4425 unsigned count = 0;
4426 unsigned chan;
4427 unsigned num_deriv_channels = 0;
4428 bool has_offset = inst->Texture.NumOffsets > 0;
4429 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4430 unsigned dmask = 0xf;
4431
4432 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4433
4434 if (target == TGSI_TEXTURE_BUFFER) {
4435 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4436
4437 /* Bitcast and truncate v8i32 to v16i8. */
4438 LLVMValueRef res = res_ptr;
4439 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4440 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4441 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4442
4443 emit_data->dst_type = ctx->v4f32;
4444 emit_data->args[0] = res;
4445 emit_data->args[1] = bld_base->uint_bld.zero;
4446 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4447 emit_data->arg_count = 3;
4448 return;
4449 }
4450
4451 /* Fetch and project texture coordinates */
4452 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4453 for (chan = 0; chan < 3; chan++ ) {
4454 coords[chan] = lp_build_emit_fetch(bld_base,
4455 emit_data->inst, 0,
4456 chan);
4457 if (opcode == TGSI_OPCODE_TXP)
4458 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4459 TGSI_OPCODE_DIV,
4460 coords[chan],
4461 coords[3]);
4462 }
4463
4464 if (opcode == TGSI_OPCODE_TXP)
4465 coords[3] = bld_base->base.one;
4466
4467 /* Pack offsets. */
4468 if (has_offset && opcode != TGSI_OPCODE_TXF) {
4469 /* The offsets are six-bit signed integers packed like this:
4470 * X=[5:0], Y=[13:8], and Z=[21:16].
4471 */
4472 LLVMValueRef offset[3], pack;
4473
4474 assert(inst->Texture.NumOffsets == 1);
4475
4476 for (chan = 0; chan < 3; chan++) {
4477 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4478 emit_data->inst, 0, chan);
4479 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4480 lp_build_const_int32(gallivm, 0x3f), "");
4481 if (chan)
4482 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4483 lp_build_const_int32(gallivm, chan*8), "");
4484 }
4485
4486 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4487 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4488 address[count++] = pack;
4489 }
4490
4491 /* Pack LOD bias value */
4492 if (opcode == TGSI_OPCODE_TXB)
4493 address[count++] = coords[3];
4494 if (opcode == TGSI_OPCODE_TXB2)
4495 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4496
4497 /* Pack depth comparison value */
4498 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4499 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4500 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4501 } else {
4502 assert(ref_pos >= 0);
4503 address[count++] = coords[ref_pos];
4504 }
4505 }
4506
4507 /* Pack user derivatives */
4508 if (opcode == TGSI_OPCODE_TXD) {
4509 int param, num_src_deriv_channels;
4510
4511 switch (target) {
4512 case TGSI_TEXTURE_3D:
4513 num_src_deriv_channels = 3;
4514 num_deriv_channels = 3;
4515 break;
4516 case TGSI_TEXTURE_2D:
4517 case TGSI_TEXTURE_SHADOW2D:
4518 case TGSI_TEXTURE_RECT:
4519 case TGSI_TEXTURE_SHADOWRECT:
4520 case TGSI_TEXTURE_2D_ARRAY:
4521 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4522 num_src_deriv_channels = 2;
4523 num_deriv_channels = 2;
4524 break;
4525 case TGSI_TEXTURE_CUBE:
4526 case TGSI_TEXTURE_SHADOWCUBE:
4527 case TGSI_TEXTURE_CUBE_ARRAY:
4528 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4529 /* Cube derivatives will be converted to 2D. */
4530 num_src_deriv_channels = 3;
4531 num_deriv_channels = 2;
4532 break;
4533 case TGSI_TEXTURE_1D:
4534 case TGSI_TEXTURE_SHADOW1D:
4535 case TGSI_TEXTURE_1D_ARRAY:
4536 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4537 num_src_deriv_channels = 1;
4538 num_deriv_channels = 1;
4539 break;
4540 default:
4541 unreachable("invalid target");
4542 }
4543
4544 for (param = 0; param < 2; param++)
4545 for (chan = 0; chan < num_src_deriv_channels; chan++)
4546 derivs[param * num_src_deriv_channels + chan] =
4547 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4548 }
4549
4550 if (target == TGSI_TEXTURE_CUBE ||
4551 target == TGSI_TEXTURE_CUBE_ARRAY ||
4552 target == TGSI_TEXTURE_SHADOWCUBE ||
4553 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4554 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4555
4556 if (opcode == TGSI_OPCODE_TXD)
4557 for (int i = 0; i < num_deriv_channels * 2; i++)
4558 address[count++] = derivs[i];
4559
4560 /* Pack texture coordinates */
4561 address[count++] = coords[0];
4562 if (num_coords > 1)
4563 address[count++] = coords[1];
4564 if (num_coords > 2)
4565 address[count++] = coords[2];
4566
4567 /* Pack LOD or sample index */
4568 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4569 address[count++] = coords[3];
4570 else if (opcode == TGSI_OPCODE_TXL2)
4571 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4572
4573 if (count > 16) {
4574 assert(!"Cannot handle more than 16 texture address parameters");
4575 count = 16;
4576 }
4577
4578 for (chan = 0; chan < count; chan++ ) {
4579 address[chan] = LLVMBuildBitCast(gallivm->builder,
4580 address[chan], ctx->i32, "");
4581 }
4582
4583 /* Adjust the sample index according to FMASK.
4584 *
4585 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4586 * which is the identity mapping. Each nibble says which physical sample
4587 * should be fetched to get that sample.
4588 *
4589 * For example, 0x11111100 means there are only 2 samples stored and
4590 * the second sample covers 3/4 of the pixel. When reading samples 0
4591 * and 1, return physical sample 0 (determined by the first two 0s
4592 * in FMASK), otherwise return physical sample 1.
4593 *
4594 * The sample index should be adjusted as follows:
4595 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4596 */
4597 if (target == TGSI_TEXTURE_2D_MSAA ||
4598 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4599 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4600 struct lp_build_emit_data txf_emit_data = *emit_data;
4601 LLVMValueRef txf_address[4];
4602 unsigned txf_count = count;
4603 struct tgsi_full_instruction inst = {};
4604
4605 memcpy(txf_address, address, sizeof(txf_address));
4606
4607 if (target == TGSI_TEXTURE_2D_MSAA) {
4608 txf_address[2] = bld_base->uint_bld.zero;
4609 }
4610 txf_address[3] = bld_base->uint_bld.zero;
4611
4612 /* Read FMASK using TXF. */
4613 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4614 inst.Texture.Texture = target;
4615 txf_emit_data.inst = &inst;
4616 txf_emit_data.chan = 0;
4617 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4618 target, fmask_ptr, NULL,
4619 txf_address, txf_count, 0xf);
4620 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4621
4622 /* Initialize some constants. */
4623 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4624 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4625
4626 /* Apply the formula. */
4627 LLVMValueRef fmask =
4628 LLVMBuildExtractElement(gallivm->builder,
4629 txf_emit_data.output[0],
4630 uint_bld->zero, "");
4631
4632 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4633
4634 LLVMValueRef sample_index4 =
4635 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4636
4637 LLVMValueRef shifted_fmask =
4638 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4639
4640 LLVMValueRef final_sample =
4641 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4642
4643 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4644 * resource descriptor is 0 (invalid),
4645 */
4646 LLVMValueRef fmask_desc =
4647 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4648 ctx->v8i32, "");
4649
4650 LLVMValueRef fmask_word1 =
4651 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4652 uint_bld->one, "");
4653
4654 LLVMValueRef word1_is_nonzero =
4655 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4656 fmask_word1, uint_bld->zero, "");
4657
4658 /* Replace the MSAA sample index. */
4659 address[sample_chan] =
4660 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4661 final_sample, address[sample_chan], "");
4662 }
4663
4664 if (opcode == TGSI_OPCODE_TXF) {
4665 /* add tex offsets */
4666 if (inst->Texture.NumOffsets) {
4667 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4668 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4669 const struct tgsi_texture_offset *off = inst->TexOffsets;
4670
4671 assert(inst->Texture.NumOffsets == 1);
4672
4673 switch (target) {
4674 case TGSI_TEXTURE_3D:
4675 address[2] = lp_build_add(uint_bld, address[2],
4676 bld->immediates[off->Index][off->SwizzleZ]);
4677 /* fall through */
4678 case TGSI_TEXTURE_2D:
4679 case TGSI_TEXTURE_SHADOW2D:
4680 case TGSI_TEXTURE_RECT:
4681 case TGSI_TEXTURE_SHADOWRECT:
4682 case TGSI_TEXTURE_2D_ARRAY:
4683 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4684 address[1] =
4685 lp_build_add(uint_bld, address[1],
4686 bld->immediates[off->Index][off->SwizzleY]);
4687 /* fall through */
4688 case TGSI_TEXTURE_1D:
4689 case TGSI_TEXTURE_SHADOW1D:
4690 case TGSI_TEXTURE_1D_ARRAY:
4691 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4692 address[0] =
4693 lp_build_add(uint_bld, address[0],
4694 bld->immediates[off->Index][off->SwizzleX]);
4695 break;
4696 /* texture offsets do not apply to other texture targets */
4697 }
4698 }
4699 }
4700
4701 if (opcode == TGSI_OPCODE_TG4) {
4702 unsigned gather_comp = 0;
4703
4704 /* DMASK was repurposed for GATHER4. 4 components are always
4705 * returned and DMASK works like a swizzle - it selects
4706 * the component to fetch. The only valid DMASK values are
4707 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4708 * (red,red,red,red) etc.) The ISA document doesn't mention
4709 * this.
4710 */
4711
4712 /* Get the component index from src1.x for Gather4. */
4713 if (!tgsi_is_shadow_target(target)) {
4714 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4715 LLVMValueRef comp_imm;
4716 struct tgsi_src_register src1 = inst->Src[1].Register;
4717
4718 assert(src1.File == TGSI_FILE_IMMEDIATE);
4719
4720 comp_imm = imms[src1.Index][src1.SwizzleX];
4721 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4722 gather_comp = CLAMP(gather_comp, 0, 3);
4723 }
4724
4725 dmask = 1 << gather_comp;
4726 }
4727
4728 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4729 samp_ptr, address, count, dmask);
4730 }
4731
4732 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4733 struct lp_build_tgsi_context *bld_base,
4734 struct lp_build_emit_data *emit_data)
4735 {
4736 struct si_shader_context *ctx = si_shader_context(bld_base);
4737 struct lp_build_context *base = &bld_base->base;
4738 unsigned opcode = emit_data->inst->Instruction.Opcode;
4739 unsigned target = emit_data->inst->Texture.Texture;
4740 char intr_name[127];
4741 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4742 bool is_shadow = tgsi_is_shadow_target(target);
4743 char type[64];
4744 const char *name = "llvm.SI.image.sample";
4745 const char *infix = "";
4746
4747 if (target == TGSI_TEXTURE_BUFFER) {
4748 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4749 base->gallivm->builder,
4750 "llvm.SI.vs.load.input", emit_data->dst_type,
4751 emit_data->args, emit_data->arg_count,
4752 LLVMReadNoneAttribute);
4753 return;
4754 }
4755
4756 switch (opcode) {
4757 case TGSI_OPCODE_TXF:
4758 name = target == TGSI_TEXTURE_2D_MSAA ||
4759 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4760 "llvm.SI.image.load" :
4761 "llvm.SI.image.load.mip";
4762 is_shadow = false;
4763 has_offset = false;
4764 break;
4765 case TGSI_OPCODE_LODQ:
4766 name = "llvm.SI.getlod";
4767 is_shadow = false;
4768 has_offset = false;
4769 break;
4770 case TGSI_OPCODE_TEX:
4771 case TGSI_OPCODE_TEX2:
4772 case TGSI_OPCODE_TXP:
4773 if (ctx->type != PIPE_SHADER_FRAGMENT)
4774 infix = ".lz";
4775 break;
4776 case TGSI_OPCODE_TXB:
4777 case TGSI_OPCODE_TXB2:
4778 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4779 infix = ".b";
4780 break;
4781 case TGSI_OPCODE_TXL:
4782 case TGSI_OPCODE_TXL2:
4783 infix = ".l";
4784 break;
4785 case TGSI_OPCODE_TXD:
4786 infix = ".d";
4787 break;
4788 case TGSI_OPCODE_TG4:
4789 name = "llvm.SI.gather4";
4790 infix = ".lz";
4791 break;
4792 default:
4793 assert(0);
4794 return;
4795 }
4796
4797 /* Add the type and suffixes .c, .o if needed. */
4798 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4799 sprintf(intr_name, "%s%s%s%s.%s",
4800 name, is_shadow ? ".c" : "", infix,
4801 has_offset ? ".o" : "", type);
4802
4803 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4804 base->gallivm->builder, intr_name, emit_data->dst_type,
4805 emit_data->args, emit_data->arg_count,
4806 LLVMReadNoneAttribute);
4807 }
4808
4809 static void si_llvm_emit_txqs(
4810 const struct lp_build_tgsi_action *action,
4811 struct lp_build_tgsi_context *bld_base,
4812 struct lp_build_emit_data *emit_data)
4813 {
4814 struct si_shader_context *ctx = si_shader_context(bld_base);
4815 struct gallivm_state *gallivm = bld_base->base.gallivm;
4816 LLVMBuilderRef builder = gallivm->builder;
4817 LLVMValueRef res, samples;
4818 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4819
4820 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4821
4822
4823 /* Read the samples from the descriptor directly. */
4824 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4825 samples = LLVMBuildExtractElement(
4826 builder, res,
4827 lp_build_const_int32(gallivm, 3), "");
4828 samples = LLVMBuildLShr(builder, samples,
4829 lp_build_const_int32(gallivm, 16), "");
4830 samples = LLVMBuildAnd(builder, samples,
4831 lp_build_const_int32(gallivm, 0xf), "");
4832 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4833 samples, "");
4834
4835 emit_data->output[emit_data->chan] = samples;
4836 }
4837
4838 /*
4839 * SI implements derivatives using the local data store (LDS)
4840 * All writes to the LDS happen in all executing threads at
4841 * the same time. TID is the Thread ID for the current
4842 * thread and is a value between 0 and 63, representing
4843 * the thread's position in the wavefront.
4844 *
4845 * For the pixel shader threads are grouped into quads of four pixels.
4846 * The TIDs of the pixels of a quad are:
4847 *
4848 * +------+------+
4849 * |4n + 0|4n + 1|
4850 * +------+------+
4851 * |4n + 2|4n + 3|
4852 * +------+------+
4853 *
4854 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4855 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4856 * the current pixel's column, and masking with 0xfffffffe yields the TID
4857 * of the left pixel of the current pixel's row.
4858 *
4859 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4860 * adding 2 yields the TID of the pixel below the top pixel.
4861 */
/* masks for thread ID (see the quad layout diagram above). */
#define TID_MASK_TOP_LEFT 0xfffffffc	/* TID of the quad's top-left pixel */
#define TID_MASK_TOP 0xfffffffd	/* TID of the top pixel of this column */
#define TID_MASK_LEFT 0xfffffffe	/* TID of the left pixel of this row */
4866
/**
 * Emit DDX/DDY/DDX_FINE/DDY_FINE.
 *
 * Each channel's value is exchanged between the pixels of a 2x2 quad:
 * either via an LDS store at this thread's slot followed by loads from
 * the neighbor slots, or (on family >= TONGA with LLVM >= 3.9) via the
 * llvm.amdgcn.ds.bpermute lane-shuffle intrinsic without touching LDS.
 * The derivative is then (right/bottom neighbor) - (left/top base).
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* This thread's own LDS slot. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Choose the "base" pixel of the difference: same row (DDX_FINE),
	 * same column (DDY_FINE), or the quad's top-left (coarse). */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel (+1), for DDY the next Y pixel (+2). */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* If an earlier channel used the same source swizzle,
		 * reuse its result instead of recomputing. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {
			/* bpermute takes the lane selector pre-scaled by 4
			 * (byte addressing), hence TID * 4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Exchange the value with the neighbors through LDS. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4956
4957 /*
4958 * this takes an I,J coordinate pair,
4959 * and works out the X and Y derivatives.
4960 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4961 */
4962 static LLVMValueRef si_llvm_emit_ddxy_interp(
4963 struct lp_build_tgsi_context *bld_base,
4964 LLVMValueRef interp_ij)
4965 {
4966 struct si_shader_context *ctx = si_shader_context(bld_base);
4967 struct gallivm_state *gallivm = bld_base->base.gallivm;
4968 LLVMValueRef indices[2];
4969 LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
4970 LLVMValueRef tl, tr, bl, result[4];
4971 unsigned c;
4972
4973 indices[0] = bld_base->uint_bld.zero;
4974 indices[1] = get_thread_id(ctx);
4975 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4976 indices, 2, "");
4977
4978 temp = LLVMBuildAnd(gallivm->builder, indices[1],
4979 lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
4980
4981 temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
4982 lp_build_const_int32(gallivm, TID_MASK_TOP), "");
4983
4984 indices[1] = temp;
4985 load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
4986 indices, 2, "");
4987
4988 indices[1] = temp2;
4989 load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
4990 indices, 2, "");
4991
4992 indices[1] = LLVMBuildAdd(gallivm->builder, temp,
4993 lp_build_const_int32(gallivm, 1), "");
4994 load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
4995 indices, 2, "");
4996
4997 indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
4998 lp_build_const_int32(gallivm, 2), "");
4999 load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
5000 indices, 2, "");
5001
5002 for (c = 0; c < 2; ++c) {
5003 LLVMValueRef store_val;
5004 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
5005
5006 store_val = LLVMBuildExtractElement(gallivm->builder,
5007 interp_ij, c_ll, "");
5008 LLVMBuildStore(gallivm->builder,
5009 store_val,
5010 store_ptr);
5011
5012 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
5013 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5014
5015 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
5016 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
5017
5018 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
5019
5020 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
5021 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5022
5023 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
5024 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
5025
5026 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
5027 }
5028
5029 return lp_build_gather_values(gallivm, result, 4);
5030 }
5031
5032 static void interp_fetch_args(
5033 struct lp_build_tgsi_context *bld_base,
5034 struct lp_build_emit_data *emit_data)
5035 {
5036 struct si_shader_context *ctx = si_shader_context(bld_base);
5037 struct gallivm_state *gallivm = bld_base->base.gallivm;
5038 const struct tgsi_full_instruction *inst = emit_data->inst;
5039
5040 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5041 /* offset is in second src, first two channels */
5042 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5043 emit_data->inst, 1,
5044 TGSI_CHAN_X);
5045 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5046 emit_data->inst, 1,
5047 TGSI_CHAN_Y);
5048 emit_data->arg_count = 2;
5049 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5050 LLVMValueRef sample_position;
5051 LLVMValueRef sample_id;
5052 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5053
5054 /* fetch sample ID, then fetch its sample position,
5055 * and place into first two channels.
5056 */
5057 sample_id = lp_build_emit_fetch(bld_base,
5058 emit_data->inst, 1, TGSI_CHAN_X);
5059 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5060 ctx->i32, "");
5061 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5062
5063 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5064 sample_position,
5065 lp_build_const_int32(gallivm, 0), "");
5066
5067 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5068 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5069 sample_position,
5070 lp_build_const_int32(gallivm, 1), "");
5071 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5072 emit_data->arg_count = 2;
5073 }
5074 }
5075
5076 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5077 struct lp_build_tgsi_context *bld_base,
5078 struct lp_build_emit_data *emit_data)
5079 {
5080 struct si_shader_context *ctx = si_shader_context(bld_base);
5081 struct si_shader *shader = ctx->shader;
5082 struct gallivm_state *gallivm = bld_base->base.gallivm;
5083 LLVMValueRef interp_param;
5084 const struct tgsi_full_instruction *inst = emit_data->inst;
5085 const char *intr_name;
5086 int input_index = inst->Src[0].Register.Index;
5087 int chan;
5088 int i;
5089 LLVMValueRef attr_number;
5090 LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
5091 int interp_param_idx;
5092 unsigned interp = shader->selector->info.input_interpolate[input_index];
5093 unsigned location;
5094
5095 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5096
5097 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5098 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5099 location = TGSI_INTERPOLATE_LOC_CENTER;
5100 else
5101 location = TGSI_INTERPOLATE_LOC_CENTROID;
5102
5103 interp_param_idx = lookup_interp_param_index(interp, location);
5104 if (interp_param_idx == -1)
5105 return;
5106 else if (interp_param_idx)
5107 interp_param = get_interp_param(ctx, interp_param_idx);
5108 else
5109 interp_param = NULL;
5110
5111 attr_number = lp_build_const_int32(gallivm, input_index);
5112
5113 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5114 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5115 LLVMValueRef ij_out[2];
5116 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5117
5118 /*
5119 * take the I then J parameters, and the DDX/Y for it, and
5120 * calculate the IJ inputs for the interpolator.
5121 * temp1 = ddx * offset/sample.x + I;
5122 * interp_param.I = ddy * offset/sample.y + temp1;
5123 * temp1 = ddx * offset/sample.x + J;
5124 * interp_param.J = ddy * offset/sample.y + temp1;
5125 */
5126 for (i = 0; i < 2; i++) {
5127 LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
5128 LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
5129 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5130 ddxy_out, ix_ll, "");
5131 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5132 ddxy_out, iy_ll, "");
5133 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5134 interp_param, ix_ll, "");
5135 LLVMValueRef temp1, temp2;
5136
5137 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5138 ctx->f32, "");
5139
5140 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5141
5142 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5143
5144 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5145
5146 temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5147
5148 ij_out[i] = LLVMBuildBitCast(gallivm->builder,
5149 temp2, ctx->i32, "");
5150 }
5151 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
5152 }
5153
5154 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
5155 for (chan = 0; chan < 2; chan++) {
5156 LLVMValueRef args[4];
5157 LLVMValueRef llvm_chan;
5158 unsigned schan;
5159
5160 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5161 llvm_chan = lp_build_const_int32(gallivm, schan);
5162
5163 args[0] = llvm_chan;
5164 args[1] = attr_number;
5165 args[2] = params;
5166 args[3] = interp_param;
5167
5168 emit_data->output[chan] =
5169 lp_build_intrinsic(gallivm->builder, intr_name,
5170 ctx->f32, args, args[3] ? 4 : 3,
5171 LLVMReadNoneAttribute);
5172 }
5173 }
5174
5175 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5176 struct lp_build_emit_data *emit_data)
5177 {
5178 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5179 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5180 unsigned stream;
5181
5182 assert(src0.File == TGSI_FILE_IMMEDIATE);
5183
5184 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5185 return stream;
5186 }
5187
5188 /* Emit one vertex from the geometry shader */
5189 static void si_llvm_emit_vertex(
5190 const struct lp_build_tgsi_action *action,
5191 struct lp_build_tgsi_context *bld_base,
5192 struct lp_build_emit_data *emit_data)
5193 {
5194 struct si_shader_context *ctx = si_shader_context(bld_base);
5195 struct lp_build_context *uint = &bld_base->uint_bld;
5196 struct si_shader *shader = ctx->shader;
5197 struct tgsi_shader_info *info = &shader->selector->info;
5198 struct gallivm_state *gallivm = bld_base->base.gallivm;
5199 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
5200 SI_PARAM_GS2VS_OFFSET);
5201 LLVMValueRef gs_next_vertex;
5202 LLVMValueRef can_emit, kill;
5203 LLVMValueRef args[2];
5204 unsigned chan;
5205 int i;
5206 unsigned stream;
5207
5208 stream = si_llvm_get_stream(bld_base, emit_data);
5209
5210 /* Write vertex attribute values to GSVS ring */
5211 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5212 ctx->gs_next_vertex[stream],
5213 "");
5214
5215 /* If this thread has already emitted the declared maximum number of
5216 * vertices, kill it: excessive vertex emissions are not supposed to
5217 * have any effect, and GS threads have no externally observable
5218 * effects other than emitting vertices.
5219 */
5220 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
5221 lp_build_const_int32(gallivm,
5222 shader->selector->gs_max_out_vertices), "");
5223 kill = lp_build_select(&bld_base->base, can_emit,
5224 lp_build_const_float(gallivm, 1.0f),
5225 lp_build_const_float(gallivm, -1.0f));
5226
5227 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
5228 ctx->voidt, &kill, 1, 0);
5229
5230 for (i = 0; i < info->num_outputs; i++) {
5231 LLVMValueRef *out_ptr =
5232 ctx->radeon_bld.soa.outputs[i];
5233
5234 for (chan = 0; chan < 4; chan++) {
5235 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5236 LLVMValueRef voffset =
5237 lp_build_const_int32(gallivm, (i * 4 + chan) *
5238 shader->selector->gs_max_out_vertices);
5239
5240 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5241 voffset = lp_build_mul_imm(uint, voffset, 4);
5242
5243 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5244
5245 build_tbuffer_store(ctx,
5246 ctx->gsvs_ring[stream],
5247 out_val, 1,
5248 voffset, soffset, 0,
5249 V_008F0C_BUF_DATA_FORMAT_32,
5250 V_008F0C_BUF_NUM_FORMAT_UINT,
5251 1, 0, 1, 1, 0);
5252 }
5253 }
5254 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5255 lp_build_const_int32(gallivm, 1));
5256
5257 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5258
5259 /* Signal vertex emission */
5260 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
5261 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5262 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5263 ctx->voidt, args, 2, 0);
5264 }
5265
5266 /* Cut one primitive from the geometry shader */
5267 static void si_llvm_emit_primitive(
5268 const struct lp_build_tgsi_action *action,
5269 struct lp_build_tgsi_context *bld_base,
5270 struct lp_build_emit_data *emit_data)
5271 {
5272 struct si_shader_context *ctx = si_shader_context(bld_base);
5273 struct gallivm_state *gallivm = bld_base->base.gallivm;
5274 LLVMValueRef args[2];
5275 unsigned stream;
5276
5277 /* Signal primitive cut */
5278 stream = si_llvm_get_stream(bld_base, emit_data);
5279 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5280 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5281 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5282 ctx->voidt, args, 2, 0);
5283 }
5284
5285 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5286 struct lp_build_tgsi_context *bld_base,
5287 struct lp_build_emit_data *emit_data)
5288 {
5289 struct si_shader_context *ctx = si_shader_context(bld_base);
5290 struct gallivm_state *gallivm = bld_base->base.gallivm;
5291
5292 /* The real barrier instruction isn’t needed, because an entire patch
5293 * always fits into a single wave.
5294 */
5295 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5296 emit_optimization_barrier(ctx);
5297 return;
5298 }
5299
5300 lp_build_intrinsic(gallivm->builder,
5301 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5302 : "llvm.AMDGPU.barrier.local",
5303 ctx->voidt, NULL, 0, 0);
5304 }
5305
/* TEX and related opcodes: args built by tex_fetch_args, code emitted
 * by build_tex_intrinsic. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};

/* INTERP_* opcodes: args built by interp_fetch_args, code emitted by
 * build_interp_intrinsic. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5315
/**
 * Create the main LLVM function for the shader and decorate its
 * parameters.
 *
 * \param returns      return value types (values handed to an epilog part)
 * \param num_returns  number of entries in \p returns
 * \param params       parameter types
 * \param num_params   number of entries in \p params
 * \param last_sgpr    index of the last SGPR parameter; later parameters
 *                     are VGPRs
 */
static void si_create_function(struct si_shader_context *ctx,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_sgpr)
{
	int i;

	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
				params, num_params);
	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
	/* Start from an undef return value; fields are filled in as the
	 * shader body is emitted. */
	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			LLVMAddAttribute(P, LLVMByValAttribute);
			lp_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			LLVMAddAttribute(P, LLVMInRegAttribute);
	}

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5361
5362 static void create_meta_data(struct si_shader_context *ctx)
5363 {
5364 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5365
5366 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5367 "invariant.load", 14);
5368 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5369 "range", 5);
5370 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5371 "amdgpu.uniform", 14);
5372
5373 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5374 }
5375
5376 static void declare_streamout_params(struct si_shader_context *ctx,
5377 struct pipe_stream_output_info *so,
5378 LLVMTypeRef *params, LLVMTypeRef i32,
5379 unsigned *num_params)
5380 {
5381 int i;
5382
5383 /* Streamout SGPRs. */
5384 if (so->num_outputs) {
5385 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5386 params[ctx->param_streamout_config = (*num_params)++] = i32;
5387 else
5388 ctx->param_streamout_config = ctx->param_tess_offchip;
5389
5390 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5391 }
5392 /* A streamout buffer offset is loaded if the stride is non-zero. */
5393 for (i = 0; i < 4; i++) {
5394 if (!so->stride[i])
5395 continue;
5396
5397 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5398 }
5399 }
5400
5401 static unsigned llvm_get_type_size(LLVMTypeRef type)
5402 {
5403 LLVMTypeKind kind = LLVMGetTypeKind(type);
5404
5405 switch (kind) {
5406 case LLVMIntegerTypeKind:
5407 return LLVMGetIntTypeWidth(type) / 8;
5408 case LLVMFloatTypeKind:
5409 return 4;
5410 case LLVMPointerTypeKind:
5411 return 8;
5412 case LLVMVectorTypeKind:
5413 return LLVMGetVectorSize(type) *
5414 llvm_get_type_size(LLVMGetElementType(type));
5415 default:
5416 assert(0);
5417 return 0;
5418 }
5419 }
5420
5421 static void declare_tess_lds(struct si_shader_context *ctx)
5422 {
5423 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5424 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5425 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5426
5427 /* The actual size is computed outside of the shader to reduce
5428 * the number of shader variants. */
5429 ctx->lds =
5430 LLVMAddGlobalInAddressSpace(gallivm->module,
5431 LLVMArrayType(i32, lds_size / 4),
5432 "tess_lds",
5433 LOCAL_ADDR_SPACE);
5434 }
5435
5436 static void create_function(struct si_shader_context *ctx)
5437 {
5438 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5439 struct gallivm_state *gallivm = bld_base->base.gallivm;
5440 struct si_shader *shader = ctx->shader;
5441 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5442 LLVMTypeRef returns[16+32*4];
5443 unsigned i, last_sgpr, num_params, num_return_sgprs;
5444 unsigned num_returns = 0;
5445
5446 v3i32 = LLVMVectorType(ctx->i32, 3);
5447
5448 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5449 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5450 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5451 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5452 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5453
5454 switch (ctx->type) {
5455 case PIPE_SHADER_VERTEX:
5456 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5457 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5458 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5459 num_params = SI_PARAM_START_INSTANCE+1;
5460
5461 if (shader->key.vs.as_es) {
5462 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5463 } else if (shader->key.vs.as_ls) {
5464 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5465 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5466 } else {
5467 if (ctx->is_gs_copy_shader) {
5468 num_params = SI_PARAM_RW_BUFFERS+1;
5469 } else {
5470 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5471 num_params = SI_PARAM_VS_STATE_BITS+1;
5472 }
5473
5474 /* The locations of the other parameters are assigned dynamically. */
5475 declare_streamout_params(ctx, &shader->selector->so,
5476 params, ctx->i32, &num_params);
5477 }
5478
5479 last_sgpr = num_params-1;
5480
5481 /* VGPRs */
5482 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5483 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5484 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5485 params[ctx->param_instance_id = num_params++] = ctx->i32;
5486
5487 if (!ctx->is_monolithic &&
5488 !ctx->is_gs_copy_shader) {
5489 /* Vertex load indices. */
5490 ctx->param_vertex_index0 = num_params;
5491
5492 for (i = 0; i < shader->selector->info.num_inputs; i++)
5493 params[num_params++] = ctx->i32;
5494
5495 /* PrimitiveID output. */
5496 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5497 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5498 returns[num_returns++] = ctx->f32;
5499 }
5500 break;
5501
5502 case PIPE_SHADER_TESS_CTRL:
5503 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5504 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5505 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5506 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5507 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5508 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5509 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5510
5511 /* VGPRs */
5512 params[SI_PARAM_PATCH_ID] = ctx->i32;
5513 params[SI_PARAM_REL_IDS] = ctx->i32;
5514 num_params = SI_PARAM_REL_IDS+1;
5515
5516 if (!ctx->is_monolithic) {
5517 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5518 * placed after the user SGPRs.
5519 */
5520 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5521 returns[num_returns++] = ctx->i32; /* SGPRs */
5522
5523 for (i = 0; i < 3; i++)
5524 returns[num_returns++] = ctx->f32; /* VGPRs */
5525 }
5526 break;
5527
5528 case PIPE_SHADER_TESS_EVAL:
5529 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5530 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5531
5532 if (shader->key.tes.as_es) {
5533 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5534 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5535 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5536 } else {
5537 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5538 declare_streamout_params(ctx, &shader->selector->so,
5539 params, ctx->i32, &num_params);
5540 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5541 }
5542 last_sgpr = num_params - 1;
5543
5544 /* VGPRs */
5545 params[ctx->param_tes_u = num_params++] = ctx->f32;
5546 params[ctx->param_tes_v = num_params++] = ctx->f32;
5547 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5548 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5549
5550 /* PrimitiveID output. */
5551 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5552 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5553 returns[num_returns++] = ctx->f32;
5554 break;
5555
5556 case PIPE_SHADER_GEOMETRY:
5557 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5558 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5559 last_sgpr = SI_PARAM_GS_WAVE_ID;
5560
5561 /* VGPRs */
5562 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5563 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5564 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5565 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5566 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5567 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5568 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5569 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5570 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5571 break;
5572
5573 case PIPE_SHADER_FRAGMENT:
5574 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5575 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5576 last_sgpr = SI_PARAM_PRIM_MASK;
5577 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5578 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5579 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5580 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5581 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5582 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5583 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5584 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5585 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5586 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5587 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5588 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5589 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5590 params[SI_PARAM_ANCILLARY] = ctx->i32;
5591 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5592 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5593 num_params = SI_PARAM_POS_FIXED_PT+1;
5594
5595 if (!ctx->is_monolithic) {
5596 /* Color inputs from the prolog. */
5597 if (shader->selector->info.colors_read) {
5598 unsigned num_color_elements =
5599 util_bitcount(shader->selector->info.colors_read);
5600
5601 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5602 for (i = 0; i < num_color_elements; i++)
5603 params[num_params++] = ctx->f32;
5604 }
5605
5606 /* Outputs for the epilog. */
5607 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5608 num_returns =
5609 num_return_sgprs +
5610 util_bitcount(shader->selector->info.colors_written) * 4 +
5611 shader->selector->info.writes_z +
5612 shader->selector->info.writes_stencil +
5613 shader->selector->info.writes_samplemask +
5614 1 /* SampleMaskIn */;
5615
5616 num_returns = MAX2(num_returns,
5617 num_return_sgprs +
5618 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5619
5620 for (i = 0; i < num_return_sgprs; i++)
5621 returns[i] = ctx->i32;
5622 for (; i < num_returns; i++)
5623 returns[i] = ctx->f32;
5624 }
5625 break;
5626
5627 case PIPE_SHADER_COMPUTE:
5628 params[SI_PARAM_GRID_SIZE] = v3i32;
5629 params[SI_PARAM_BLOCK_ID] = v3i32;
5630 last_sgpr = SI_PARAM_BLOCK_ID;
5631
5632 params[SI_PARAM_THREAD_ID] = v3i32;
5633 num_params = SI_PARAM_THREAD_ID + 1;
5634 break;
5635 default:
5636 assert(0 && "unimplemented shader");
5637 return;
5638 }
5639
5640 assert(num_params <= ARRAY_SIZE(params));
5641
5642 si_create_function(ctx, returns, num_returns, params,
5643 num_params, last_sgpr);
5644
5645 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5646 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5647 !ctx->is_monolithic) {
5648 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5649 "InitialPSInputAddr",
5650 S_0286D0_PERSP_SAMPLE_ENA(1) |
5651 S_0286D0_PERSP_CENTER_ENA(1) |
5652 S_0286D0_PERSP_CENTROID_ENA(1) |
5653 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5654 S_0286D0_LINEAR_CENTER_ENA(1) |
5655 S_0286D0_LINEAR_CENTROID_ENA(1) |
5656 S_0286D0_FRONT_FACE_ENA(1) |
5657 S_0286D0_POS_FIXED_PT_ENA(1));
5658 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5659 const unsigned *properties = shader->selector->info.properties;
5660 unsigned max_work_group_size =
5661 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5662 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5663 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5664
5665 assert(max_work_group_size);
5666
5667 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5668 "amdgpu-max-work-group-size",
5669 max_work_group_size);
5670 }
5671
5672 shader->info.num_input_sgprs = 0;
5673 shader->info.num_input_vgprs = 0;
5674
5675 for (i = 0; i <= last_sgpr; ++i)
5676 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5677
5678 /* Unused fragment shader inputs are eliminated by the compiler,
5679 * so we don't know yet how many there will be.
5680 */
5681 if (ctx->type != PIPE_SHADER_FRAGMENT)
5682 for (; i < num_params; ++i)
5683 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5684
5685 if (bld_base->info &&
5686 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5687 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5688 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5689 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5690 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5691 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5692 ctx->lds =
5693 LLVMAddGlobalInAddressSpace(gallivm->module,
5694 LLVMArrayType(ctx->i32, 64),
5695 "ddxy_lds",
5696 LOCAL_ADDR_SPACE);
5697
5698 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5699 ctx->type == PIPE_SHADER_TESS_CTRL ||
5700 ctx->type == PIPE_SHADER_TESS_EVAL)
5701 declare_tess_lds(ctx);
5702 }
5703
5704 static void preload_constants(struct si_shader_context *ctx)
5705 {
5706 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5707 struct gallivm_state *gallivm = bld_base->base.gallivm;
5708 const struct tgsi_shader_info *info = bld_base->info;
5709 unsigned buf;
5710 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5711
5712 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5713 unsigned i, num_const = info->const_file_max[buf] + 1;
5714
5715 if (num_const == 0)
5716 continue;
5717
5718 /* Allocate space for the constant values */
5719 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5720
5721 /* Load the resource descriptor */
5722 ctx->const_buffers[buf] =
5723 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5724
5725 /* Load the constants, we rely on the code sinking to do the rest */
5726 for (i = 0; i < num_const * 4; ++i) {
5727 ctx->constants[buf][i] =
5728 buffer_load_const(gallivm->builder,
5729 ctx->const_buffers[buf],
5730 lp_build_const_int32(gallivm, i * 4),
5731 ctx->f32);
5732 }
5733 }
5734 }
5735
5736 static void preload_shader_buffers(struct si_shader_context *ctx)
5737 {
5738 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5739 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5740 int buf, maxbuf;
5741
5742 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5743 SI_NUM_SHADER_BUFFERS - 1);
5744 for (buf = 0; buf <= maxbuf; ++buf) {
5745 ctx->shader_buffers[buf] =
5746 build_indexed_load_const(
5747 ctx, ptr, lp_build_const_int32(gallivm, buf));
5748 }
5749 }
5750
5751 static void preload_samplers(struct si_shader_context *ctx)
5752 {
5753 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5754 struct gallivm_state *gallivm = bld_base->base.gallivm;
5755 const struct tgsi_shader_info *info = bld_base->info;
5756 unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
5757 LLVMValueRef offset;
5758
5759 if (num_samplers == 0)
5760 return;
5761
5762 /* Load the resources and samplers, we rely on the code sinking to do the rest */
5763 for (i = 0; i < num_samplers; ++i) {
5764 /* Resource */
5765 offset = lp_build_const_int32(gallivm, i);
5766 ctx->sampler_views[i] =
5767 get_sampler_desc(ctx, offset, DESC_IMAGE);
5768
5769 /* FMASK resource */
5770 if (info->is_msaa_sampler[i])
5771 ctx->fmasks[i] =
5772 get_sampler_desc(ctx, offset, DESC_FMASK);
5773 else {
5774 ctx->sampler_states[i] =
5775 get_sampler_desc(ctx, offset, DESC_SAMPLER);
5776 ctx->sampler_states[i] =
5777 sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
5778 ctx->sampler_states[i]);
5779 }
5780 }
5781 }
5782
5783 static void preload_images(struct si_shader_context *ctx)
5784 {
5785 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5786 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5787 struct gallivm_state *gallivm = bld_base->base.gallivm;
5788 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5789 LLVMValueRef res_ptr;
5790 unsigned i;
5791
5792 if (num_images == 0)
5793 return;
5794
5795 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5796
5797 for (i = 0; i < num_images; ++i) {
5798 /* Rely on LLVM to shrink the load for buffer resources. */
5799 LLVMValueRef rsrc =
5800 build_indexed_load_const(ctx, res_ptr,
5801 lp_build_const_int32(gallivm, i));
5802
5803 if (info->images_writemask & (1 << i) &&
5804 !(info->images_buffers & (1 << i)))
5805 rsrc = force_dcc_off(ctx, rsrc);
5806
5807 ctx->images[i] = rsrc;
5808 }
5809 }
5810
5811 static void preload_streamout_buffers(struct si_shader_context *ctx)
5812 {
5813 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5814 struct gallivm_state *gallivm = bld_base->base.gallivm;
5815 unsigned i;
5816
5817 /* Streamout can only be used if the shader is compiled as VS. */
5818 if (!ctx->shader->selector->so.num_outputs ||
5819 (ctx->type == PIPE_SHADER_VERTEX &&
5820 (ctx->shader->key.vs.as_es ||
5821 ctx->shader->key.vs.as_ls)) ||
5822 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5823 ctx->shader->key.tes.as_es))
5824 return;
5825
5826 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5827 SI_PARAM_RW_BUFFERS);
5828
5829 /* Load the resources, we rely on the code sinking to do the rest */
5830 for (i = 0; i < 4; ++i) {
5831 if (ctx->shader->selector->so.stride[i]) {
5832 LLVMValueRef offset = lp_build_const_int32(gallivm,
5833 SI_VS_STREAMOUT_BUF0 + i);
5834
5835 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5836 }
5837 }
5838 }
5839
5840 /**
5841 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5842 * for later use.
5843 */
5844 static void preload_ring_buffers(struct si_shader_context *ctx)
5845 {
5846 struct gallivm_state *gallivm =
5847 ctx->radeon_bld.soa.bld_base.base.gallivm;
5848
5849 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5850 SI_PARAM_RW_BUFFERS);
5851
5852 if ((ctx->type == PIPE_SHADER_VERTEX &&
5853 ctx->shader->key.vs.as_es) ||
5854 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5855 ctx->shader->key.tes.as_es) ||
5856 ctx->type == PIPE_SHADER_GEOMETRY) {
5857 unsigned ring =
5858 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5859 : SI_ES_RING_ESGS;
5860 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5861
5862 ctx->esgs_ring =
5863 build_indexed_load_const(ctx, buf_ptr, offset);
5864 }
5865
5866 if (ctx->is_gs_copy_shader) {
5867 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5868
5869 ctx->gsvs_ring[0] =
5870 build_indexed_load_const(ctx, buf_ptr, offset);
5871 }
5872 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5873 int i;
5874 for (i = 0; i < 4; i++) {
5875 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5876
5877 ctx->gsvs_ring[i] =
5878 build_indexed_load_const(ctx, buf_ptr, offset);
5879 }
5880 }
5881 }
5882
5883 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5884 LLVMValueRef param_rw_buffers,
5885 unsigned param_pos_fixed_pt)
5886 {
5887 struct lp_build_tgsi_context *bld_base =
5888 &ctx->radeon_bld.soa.bld_base;
5889 struct gallivm_state *gallivm = bld_base->base.gallivm;
5890 LLVMBuilderRef builder = gallivm->builder;
5891 LLVMValueRef slot, desc, offset, row, bit, address[2];
5892
5893 /* Use the fixed-point gl_FragCoord input.
5894 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5895 * per coordinate to get the repeating effect.
5896 */
5897 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5898 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5899
5900 /* Load the buffer descriptor. */
5901 slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
5902 desc = build_indexed_load_const(ctx, param_rw_buffers, slot);
5903
5904 /* The stipple pattern is 32x32, each row has 32 bits. */
5905 offset = LLVMBuildMul(builder, address[1],
5906 LLVMConstInt(ctx->i32, 4, 0), "");
5907 row = buffer_load_const(builder, desc, offset, ctx->i32);
5908 bit = LLVMBuildLShr(builder, row, address[0], "");
5909 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5910
5911 /* The intrinsic kills the thread if arg < 0. */
5912 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
5913 LLVMConstReal(ctx->f32, -1), "");
5914 lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
5915 }
5916
5917 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5918 struct si_shader_config *conf,
5919 unsigned symbol_offset)
5920 {
5921 unsigned i;
5922 const unsigned char *config =
5923 radeon_shader_binary_config_start(binary, symbol_offset);
5924 bool really_needs_scratch = false;
5925
5926 /* LLVM adds SGPR spills to the scratch size.
5927 * Find out if we really need the scratch buffer.
5928 */
5929 for (i = 0; i < binary->reloc_count; i++) {
5930 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5931
5932 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5933 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5934 really_needs_scratch = true;
5935 break;
5936 }
5937 }
5938
5939 /* XXX: We may be able to emit some of these values directly rather than
5940 * extracting fields to be emitted later.
5941 */
5942
5943 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5944 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5945 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5946 switch (reg) {
5947 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5948 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5949 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5950 case R_00B848_COMPUTE_PGM_RSRC1:
5951 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5952 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5953 conf->float_mode = G_00B028_FLOAT_MODE(value);
5954 conf->rsrc1 = value;
5955 break;
5956 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5957 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5958 break;
5959 case R_00B84C_COMPUTE_PGM_RSRC2:
5960 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5961 conf->rsrc2 = value;
5962 break;
5963 case R_0286CC_SPI_PS_INPUT_ENA:
5964 conf->spi_ps_input_ena = value;
5965 break;
5966 case R_0286D0_SPI_PS_INPUT_ADDR:
5967 conf->spi_ps_input_addr = value;
5968 break;
5969 case R_0286E8_SPI_TMPRING_SIZE:
5970 case R_00B860_COMPUTE_TMPRING_SIZE:
5971 /* WAVESIZE is in units of 256 dwords. */
5972 if (really_needs_scratch)
5973 conf->scratch_bytes_per_wave =
5974 G_00B860_WAVESIZE(value) * 256 * 4;
5975 break;
5976 case 0x4: /* SPILLED_SGPRS */
5977 conf->spilled_sgprs = value;
5978 break;
5979 case 0x8: /* SPILLED_VGPRS */
5980 conf->spilled_vgprs = value;
5981 break;
5982 default:
5983 {
5984 static bool printed;
5985
5986 if (!printed) {
5987 fprintf(stderr, "Warning: LLVM emitted unknown "
5988 "config register: 0x%x\n", reg);
5989 printed = true;
5990 }
5991 }
5992 break;
5993 }
5994
5995 if (!conf->spi_ps_input_addr)
5996 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5997 }
5998 }
5999
6000 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6001 struct si_shader *shader,
6002 struct si_shader_config *config,
6003 uint64_t scratch_va)
6004 {
6005 unsigned i;
6006 uint32_t scratch_rsrc_dword0 = scratch_va;
6007 uint32_t scratch_rsrc_dword1 =
6008 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6009
6010 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6011 * correctly.
6012 */
6013 if (HAVE_LLVM >= 0x0309)
6014 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6015 else
6016 scratch_rsrc_dword1 |=
6017 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6018
6019 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6020 const struct radeon_shader_reloc *reloc =
6021 &shader->binary.relocs[i];
6022 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6023 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6024 &scratch_rsrc_dword0, 4);
6025 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6026 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6027 &scratch_rsrc_dword1, 4);
6028 }
6029 }
6030 }
6031
6032 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6033 {
6034 unsigned size = shader->binary.code_size;
6035
6036 if (shader->prolog)
6037 size += shader->prolog->binary.code_size;
6038 if (shader->epilog)
6039 size += shader->epilog->binary.code_size;
6040 return size;
6041 }
6042
6043 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6044 {
6045 const struct radeon_shader_binary *prolog =
6046 shader->prolog ? &shader->prolog->binary : NULL;
6047 const struct radeon_shader_binary *epilog =
6048 shader->epilog ? &shader->epilog->binary : NULL;
6049 const struct radeon_shader_binary *mainb = &shader->binary;
6050 unsigned bo_size = si_get_shader_binary_size(shader) +
6051 (!epilog ? mainb->rodata_size : 0);
6052 unsigned char *ptr;
6053
6054 assert(!prolog || !prolog->rodata_size);
6055 assert((!prolog && !epilog) || !mainb->rodata_size);
6056 assert(!epilog || !epilog->rodata_size);
6057
6058 r600_resource_reference(&shader->bo, NULL);
6059 shader->bo = si_resource_create_custom(&sscreen->b.b,
6060 PIPE_USAGE_IMMUTABLE,
6061 bo_size);
6062 if (!shader->bo)
6063 return -ENOMEM;
6064
6065 /* Upload. */
6066 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6067 PIPE_TRANSFER_READ_WRITE);
6068
6069 if (prolog) {
6070 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6071 ptr += prolog->code_size;
6072 }
6073
6074 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6075 ptr += mainb->code_size;
6076
6077 if (epilog)
6078 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6079 else if (mainb->rodata_size > 0)
6080 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6081
6082 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6083 return 0;
6084 }
6085
6086 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
6087 struct pipe_debug_callback *debug,
6088 const char *name, FILE *file)
6089 {
6090 char *line, *p;
6091 unsigned i, count;
6092
6093 if (binary->disasm_string) {
6094 fprintf(file, "Shader %s disassembly:\n", name);
6095 fprintf(file, "%s", binary->disasm_string);
6096
6097 if (debug && debug->debug_message) {
6098 /* Very long debug messages are cut off, so send the
6099 * disassembly one line at a time. This causes more
6100 * overhead, but on the plus side it simplifies
6101 * parsing of resulting logs.
6102 */
6103 pipe_debug_message(debug, SHADER_INFO,
6104 "Shader Disassembly Begin");
6105
6106 line = binary->disasm_string;
6107 while (*line) {
6108 p = util_strchrnul(line, '\n');
6109 count = p - line;
6110
6111 if (count) {
6112 pipe_debug_message(debug, SHADER_INFO,
6113 "%.*s", count, line);
6114 }
6115
6116 if (!*p)
6117 break;
6118 line = p + 1;
6119 }
6120
6121 pipe_debug_message(debug, SHADER_INFO,
6122 "Shader Disassembly End");
6123 }
6124 } else {
6125 fprintf(file, "Shader %s binary:\n", name);
6126 for (i = 0; i < binary->code_size; i += 4) {
6127 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6128 binary->code[i + 3], binary->code[i + 2],
6129 binary->code[i + 1], binary->code[i]);
6130 }
6131 }
6132 }
6133
/**
 * Print shader statistics (register usage, code size, LDS, scratch, and the
 * derived per-SIMD wave count) to "file", and emit a one-line summary via
 * the debug callback.
 *
 * \param sscreen     screen, used for chip_class and the dump-enable check
 * \param conf        hardware config parsed from the compiled binary
 * \param num_inputs  number of shader inputs (used for the PS LDS estimate)
 * \param code_size   total machine-code size in bytes
 * \param debug       debug callback receiving the summary line
 * \param processor   PIPE_SHADER_* stage
 * \param file        stream for the human-readable dump
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
			         struct si_shader_config *conf,
				 unsigned num_inputs,
				 unsigned code_size,
			         struct pipe_debug_callback *debug,
			         unsigned processor,
				 FILE *file)
{
	/* LDS allocation granularity: 512 bytes on CIK+, 256 before. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	/* 10 waves per SIMD is the hardware ceiling. */
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	if (processor == PIPE_SHADER_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* SGPR file per SIMD: 800 on VI+, 512 before. */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	/* 256 VGPRs per SIMD lane. */
	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
		        "Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Machine-readable one-liner for tooling that parses debug output. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs);
}
6213
6214 static const char *si_get_shader_name(struct si_shader *shader,
6215 unsigned processor)
6216 {
6217 switch (processor) {
6218 case PIPE_SHADER_VERTEX:
6219 if (shader->key.vs.as_es)
6220 return "Vertex Shader as ES";
6221 else if (shader->key.vs.as_ls)
6222 return "Vertex Shader as LS";
6223 else
6224 return "Vertex Shader as VS";
6225 case PIPE_SHADER_TESS_CTRL:
6226 return "Tessellation Control Shader";
6227 case PIPE_SHADER_TESS_EVAL:
6228 if (shader->key.tes.as_es)
6229 return "Tessellation Evaluation Shader as ES";
6230 else
6231 return "Tessellation Evaluation Shader as VS";
6232 case PIPE_SHADER_GEOMETRY:
6233 if (shader->gs_copy_shader == NULL)
6234 return "GS Copy Shader as VS";
6235 else
6236 return "Geometry Shader";
6237 case PIPE_SHADER_FRAGMENT:
6238 return "Pixel Shader";
6239 case PIPE_SHADER_COMPUTE:
6240 return "Compute Shader";
6241 default:
6242 return "Unknown Shader";
6243 }
6244 }
6245
6246 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6247 struct pipe_debug_callback *debug, unsigned processor,
6248 FILE *file)
6249 {
6250 if (file != stderr ||
6251 r600_can_dump_shader(&sscreen->b, processor))
6252 si_dump_shader_key(processor, &shader->key, file);
6253
6254 if (file != stderr && shader->binary.llvm_ir_string) {
6255 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6256 si_get_shader_name(shader, processor));
6257 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6258 }
6259
6260 if (file != stderr ||
6261 (r600_can_dump_shader(&sscreen->b, processor) &&
6262 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6263 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6264
6265 if (shader->prolog)
6266 si_shader_dump_disassembly(&shader->prolog->binary,
6267 debug, "prolog", file);
6268
6269 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6270
6271 if (shader->epilog)
6272 si_shader_dump_disassembly(&shader->epilog->binary,
6273 debug, "epilog", file);
6274 fprintf(file, "\n");
6275 }
6276
6277 si_shader_dump_stats(sscreen, &shader->config,
6278 shader->selector ? shader->selector->info.num_inputs : 0,
6279 si_get_shader_binary_size(shader), debug, processor,
6280 file);
6281 }
6282
6283 int si_compile_llvm(struct si_screen *sscreen,
6284 struct radeon_shader_binary *binary,
6285 struct si_shader_config *conf,
6286 LLVMTargetMachineRef tm,
6287 LLVMModuleRef mod,
6288 struct pipe_debug_callback *debug,
6289 unsigned processor,
6290 const char *name)
6291 {
6292 int r = 0;
6293 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6294
6295 if (r600_can_dump_shader(&sscreen->b, processor)) {
6296 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6297
6298 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6299 fprintf(stderr, "%s LLVM IR:\n\n", name);
6300 LLVMDumpModule(mod);
6301 fprintf(stderr, "\n");
6302 }
6303 }
6304
6305 if (sscreen->record_llvm_ir) {
6306 char *ir = LLVMPrintModuleToString(mod);
6307 binary->llvm_ir_string = strdup(ir);
6308 LLVMDisposeMessage(ir);
6309 }
6310
6311 if (!si_replace_shader(count, binary)) {
6312 r = radeon_llvm_compile(mod, binary, tm, debug);
6313 if (r)
6314 return r;
6315 }
6316
6317 si_shader_binary_read_config(binary, conf, 0);
6318
6319 /* Enable 64-bit and 16-bit denormals, because there is no performance
6320 * cost.
6321 *
6322 * If denormals are enabled, all floating-point output modifiers are
6323 * ignored.
6324 *
6325 * Don't enable denormals for 32-bit floats, because:
6326 * - Floating-point output modifiers would be ignored by the hw.
6327 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6328 * have to stop using those.
6329 * - SI & CI would be very slow.
6330 */
6331 conf->float_mode |= V_00B028_FP_64_DENORMS;
6332
6333 FREE(binary->config);
6334 FREE(binary->global_symbol_offsets);
6335 binary->config = NULL;
6336 binary->global_symbol_offsets = NULL;
6337
6338 /* Some shaders can't have rodata because their binaries can be
6339 * concatenated.
6340 */
6341 if (binary->rodata_size &&
6342 (processor == PIPE_SHADER_VERTEX ||
6343 processor == PIPE_SHADER_TESS_CTRL ||
6344 processor == PIPE_SHADER_TESS_EVAL ||
6345 processor == PIPE_SHADER_FRAGMENT)) {
6346 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6347 return -EINVAL;
6348 }
6349
6350 return r;
6351 }
6352
6353 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6354 {
6355 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6356 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6357 else
6358 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6359 }
6360
6361 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6362 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6363 struct si_shader_context *ctx,
6364 struct si_shader *gs,
6365 struct pipe_debug_callback *debug)
6366 {
6367 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6368 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6369 struct lp_build_context *uint = &bld_base->uint_bld;
6370 struct si_shader_output_values *outputs;
6371 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6372 LLVMValueRef args[9];
6373 int i, r;
6374
6375 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6376
6377 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6378 ctx->type = PIPE_SHADER_VERTEX;
6379 ctx->is_gs_copy_shader = true;
6380
6381 create_meta_data(ctx);
6382 create_function(ctx);
6383 preload_streamout_buffers(ctx);
6384 preload_ring_buffers(ctx);
6385
6386 args[0] = ctx->gsvs_ring[0];
6387 args[1] = lp_build_mul_imm(uint,
6388 LLVMGetParam(ctx->radeon_bld.main_fn,
6389 ctx->param_vertex_id),
6390 4);
6391 args[3] = uint->zero;
6392 args[4] = uint->one; /* OFFEN */
6393 args[5] = uint->zero; /* IDXEN */
6394 args[6] = uint->one; /* GLC */
6395 args[7] = uint->one; /* SLC */
6396 args[8] = uint->zero; /* TFE */
6397
6398 /* Fetch vertex data from GSVS ring */
6399 for (i = 0; i < gsinfo->num_outputs; ++i) {
6400 unsigned chan;
6401
6402 outputs[i].name = gsinfo->output_semantic_name[i];
6403 outputs[i].sid = gsinfo->output_semantic_index[i];
6404
6405 for (chan = 0; chan < 4; chan++) {
6406 args[2] = lp_build_const_int32(gallivm,
6407 (i * 4 + chan) *
6408 gs->selector->gs_max_out_vertices * 16 * 4);
6409
6410 outputs[i].values[chan] =
6411 LLVMBuildBitCast(gallivm->builder,
6412 lp_build_intrinsic(gallivm->builder,
6413 "llvm.SI.buffer.load.dword.i32.i32",
6414 ctx->i32, args, 9,
6415 LLVMReadOnlyAttribute),
6416 ctx->f32, "");
6417 }
6418 }
6419
6420 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6421
6422 LLVMBuildRetVoid(gallivm->builder);
6423
6424 /* Dump LLVM IR before any optimization passes */
6425 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6426 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6427 LLVMDumpModule(bld_base->base.gallivm->module);
6428
6429 radeon_llvm_finalize_module(&ctx->radeon_bld);
6430
6431 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6432 &ctx->shader->config, ctx->tm,
6433 bld_base->base.gallivm->module,
6434 debug, PIPE_SHADER_GEOMETRY,
6435 "GS Copy Shader");
6436 if (!r) {
6437 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6438 fprintf(stderr, "GS Copy Shader:\n");
6439 si_shader_dump(sscreen, ctx->shader, debug,
6440 PIPE_SHADER_GEOMETRY, stderr);
6441 r = si_shader_binary_upload(sscreen, ctx->shader);
6442 }
6443
6444 radeon_llvm_dispose(&ctx->radeon_bld);
6445
6446 FREE(outputs);
6447 return r;
6448 }
6449
/**
 * Print the per-stage shader key (variant state) to "f" for debugging.
 *
 * \param shader  PIPE_SHADER_* stage that selects which key union member
 *                is valid
 * \param key     the shader key to dump
 * \param f       output stream
 */
static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
			       FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		/* One instance divisor per vertex element. */
		fprintf(f, "  instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	/* GS and CS have no key state worth dumping. */
	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
		fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
		fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
		fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}
}
6505
/**
 * Initialize a si_shader_context for TGSI->LLVM translation and register
 * the radeonsi-specific TGSI opcode handlers.
 *
 * \param ctx      context to initialize (fully overwritten)
 * \param sscreen  screen
 * \param shader   shader being compiled, or NULL when compiling a shader
 *                 part that has no selector (e.g. a VS epilog)
 * \param tm       LLVM target machine to compile with
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;
	struct lp_build_tgsi_action tmpl = {};

	memset(ctx, 0, sizeof(*ctx));
	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
	ctx->tm = tm;
	ctx->screen = sscreen;
	if (shader && shader->selector)
		ctx->type = shader->selector->info.processor;
	else
		ctx->type = -1;	/* unknown; callers set ctx->type explicitly */
	ctx->shader = shader;

	/* Cache commonly used LLVM types for this context. */
	ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
	ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);

	bld_base = &ctx->radeon_bld.soa.bld_base;
	if (shader && shader->selector)
		bld_base->info = &shader->selector->info;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes. */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	/* Texture opcodes. */
	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;

	/* Shader buffer / image load, store and query opcodes. */
	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;

	/* Atomics share fetch/emit callbacks and differ only in the
	 * intrinsic name suffix. */
	tmpl.fetch_args = atomic_fetch_args;
	tmpl.emit = atomic_emit;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	/* Derivatives. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Geometry shader vertex/primitive emission and barriers. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;

	/* MIN/MAX map to IEEE minnum/maxnum intrinsics. */
	bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
	bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
}
6607
6608 int si_compile_tgsi_shader(struct si_screen *sscreen,
6609 LLVMTargetMachineRef tm,
6610 struct si_shader *shader,
6611 bool is_monolithic,
6612 struct pipe_debug_callback *debug)
6613 {
6614 struct si_shader_selector *sel = shader->selector;
6615 struct si_shader_context ctx;
6616 struct lp_build_tgsi_context *bld_base;
6617 LLVMModuleRef mod;
6618 int r = 0;
6619
6620 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6621 * conversion fails. */
6622 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6623 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6624 tgsi_dump(sel->tokens, 0);
6625 si_dump_streamout(&sel->so);
6626 }
6627
6628 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6629 ctx.is_monolithic = is_monolithic;
6630
6631 shader->info.uses_instanceid = sel->info.uses_instanceid;
6632
6633 bld_base = &ctx.radeon_bld.soa.bld_base;
6634 ctx.radeon_bld.load_system_value = declare_system_value;
6635
6636 switch (ctx.type) {
6637 case PIPE_SHADER_VERTEX:
6638 ctx.radeon_bld.load_input = declare_input_vs;
6639 if (shader->key.vs.as_ls)
6640 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6641 else if (shader->key.vs.as_es)
6642 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6643 else
6644 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6645 break;
6646 case PIPE_SHADER_TESS_CTRL:
6647 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6648 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6649 bld_base->emit_store = store_output_tcs;
6650 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6651 break;
6652 case PIPE_SHADER_TESS_EVAL:
6653 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6654 if (shader->key.tes.as_es)
6655 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6656 else
6657 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6658 break;
6659 case PIPE_SHADER_GEOMETRY:
6660 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6661 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6662 break;
6663 case PIPE_SHADER_FRAGMENT:
6664 ctx.radeon_bld.load_input = declare_input_fs;
6665 if (is_monolithic)
6666 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6667 else
6668 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6669 break;
6670 case PIPE_SHADER_COMPUTE:
6671 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6672 break;
6673 default:
6674 assert(!"Unsupported shader type");
6675 return -1;
6676 }
6677
6678 create_meta_data(&ctx);
6679 create_function(&ctx);
6680 preload_constants(&ctx);
6681 preload_shader_buffers(&ctx);
6682 preload_samplers(&ctx);
6683 preload_images(&ctx);
6684 preload_streamout_buffers(&ctx);
6685 preload_ring_buffers(&ctx);
6686
6687 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6688 shader->key.ps.prolog.poly_stipple) {
6689 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6690 SI_PARAM_RW_BUFFERS);
6691 si_llvm_emit_polygon_stipple(&ctx, list,
6692 SI_PARAM_POS_FIXED_PT);
6693 }
6694
6695 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6696 int i;
6697 for (i = 0; i < 4; i++) {
6698 ctx.gs_next_vertex[i] =
6699 lp_build_alloca(bld_base->base.gallivm,
6700 ctx.i32, "");
6701 }
6702 }
6703
6704 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6705 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6706 goto out;
6707 }
6708
6709 si_llvm_build_ret(&ctx, ctx.return_value);
6710 mod = bld_base->base.gallivm->module;
6711
6712 /* Dump LLVM IR before any optimization passes */
6713 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6714 r600_can_dump_shader(&sscreen->b, ctx.type))
6715 LLVMDumpModule(mod);
6716
6717 radeon_llvm_finalize_module(&ctx.radeon_bld);
6718
6719 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6720 mod, debug, ctx.type, "TGSI shader");
6721 if (r) {
6722 fprintf(stderr, "LLVM failed to compile shader\n");
6723 goto out;
6724 }
6725
6726 radeon_llvm_dispose(&ctx.radeon_bld);
6727
6728 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6729 * LLVM 3.9svn has this bug.
6730 */
6731 if (sel->type == PIPE_SHADER_COMPUTE) {
6732 unsigned *props = sel->info.properties;
6733 unsigned wave_size = 64;
6734 unsigned max_vgprs = 256;
6735 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6736 unsigned max_sgprs_per_wave = 128;
6737 unsigned min_waves_per_cu =
6738 DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
6739 props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
6740 props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
6741 wave_size);
6742 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6743
6744 max_vgprs = max_vgprs / min_waves_per_simd;
6745 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6746
6747 if (shader->config.num_sgprs > max_sgprs ||
6748 shader->config.num_vgprs > max_vgprs) {
6749 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6750 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6751 shader->config.num_sgprs, shader->config.num_vgprs,
6752 max_sgprs, max_vgprs);
6753
6754 /* Just terminate the process, because dependent
6755 * shaders can hang due to bad input data, but use
6756 * the env var to allow shader-db to work.
6757 */
6758 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6759 abort();
6760 }
6761 }
6762
6763 /* Add the scratch offset to input SGPRs. */
6764 if (shader->config.scratch_bytes_per_wave)
6765 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6766
6767 /* Calculate the number of fragment input VGPRs. */
6768 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6769 shader->info.num_input_vgprs = 0;
6770 shader->info.face_vgpr_index = -1;
6771
6772 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6773 shader->info.num_input_vgprs += 2;
6774 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6775 shader->info.num_input_vgprs += 2;
6776 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6777 shader->info.num_input_vgprs += 2;
6778 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6779 shader->info.num_input_vgprs += 3;
6780 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6781 shader->info.num_input_vgprs += 2;
6782 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6783 shader->info.num_input_vgprs += 2;
6784 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6785 shader->info.num_input_vgprs += 2;
6786 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6787 shader->info.num_input_vgprs += 1;
6788 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6789 shader->info.num_input_vgprs += 1;
6790 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6791 shader->info.num_input_vgprs += 1;
6792 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6793 shader->info.num_input_vgprs += 1;
6794 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6795 shader->info.num_input_vgprs += 1;
6796 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6797 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6798 shader->info.num_input_vgprs += 1;
6799 }
6800 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6801 shader->info.num_input_vgprs += 1;
6802 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6803 shader->info.num_input_vgprs += 1;
6804 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6805 shader->info.num_input_vgprs += 1;
6806 }
6807
6808 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6809 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6810 shader->gs_copy_shader->selector = shader->selector;
6811 ctx.shader = shader->gs_copy_shader;
6812 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6813 shader, debug))) {
6814 free(shader->gs_copy_shader);
6815 shader->gs_copy_shader = NULL;
6816 goto out;
6817 }
6818 }
6819
6820 out:
6821 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6822 FREE(ctx.constants[i]);
6823 return r;
6824 }
6825
6826 /**
6827 * Create, compile and return a shader part (prolog or epilog).
6828 *
6829 * \param sscreen screen
6830 * \param list list of shader parts of the same category
6831 * \param key shader part key
6832 * \param tm LLVM target machine
6833 * \param debug debug callback
6834 * \param compile the callback responsible for compilation
6835 * \return non-NULL on success
6836 */
6837 static struct si_shader_part *
6838 si_get_shader_part(struct si_screen *sscreen,
6839 struct si_shader_part **list,
6840 union si_shader_part_key *key,
6841 LLVMTargetMachineRef tm,
6842 struct pipe_debug_callback *debug,
6843 bool (*compile)(struct si_screen *,
6844 LLVMTargetMachineRef,
6845 struct pipe_debug_callback *,
6846 struct si_shader_part *))
6847 {
6848 struct si_shader_part *result;
6849
6850 pipe_mutex_lock(sscreen->shader_parts_mutex);
6851
6852 /* Find existing. */
6853 for (result = *list; result; result = result->next) {
6854 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6855 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6856 return result;
6857 }
6858 }
6859
6860 /* Compile a new one. */
6861 result = CALLOC_STRUCT(si_shader_part);
6862 result->key = *key;
6863 if (!compile(sscreen, tm, debug, result)) {
6864 FREE(result);
6865 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6866 return NULL;
6867 }
6868
6869 result->next = *list;
6870 *list = result;
6871 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6872 return result;
6873 }
6874
6875 /**
6876 * Create a vertex shader prolog.
6877 *
6878 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6879 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
6881 *
6882 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6883 * input_v0,
6884 * input_v1,
6885 * input_v2,
6886 * input_v3,
6887 * (VertexID + BaseVertex),
6888 * (InstanceID + StartInstance),
6889 * (InstanceID / 2 + StartInstance)
6890 */
6891 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6892 LLVMTargetMachineRef tm,
6893 struct pipe_debug_callback *debug,
6894 struct si_shader_part *out)
6895 {
6896 union si_shader_part_key *key = &out->key;
6897 struct si_shader shader = {};
6898 struct si_shader_context ctx;
6899 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6900 LLVMTypeRef *params, *returns;
6901 LLVMValueRef ret, func;
6902 int last_sgpr, num_params, num_returns, i;
6903 bool status = true;
6904
6905 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6906 ctx.type = PIPE_SHADER_VERTEX;
6907 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6908 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6909
6910 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6911 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6912 sizeof(LLVMTypeRef));
6913 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6914 key->vs_prolog.last_input + 1) *
6915 sizeof(LLVMTypeRef));
6916 num_params = 0;
6917 num_returns = 0;
6918
6919 /* Declare input and output SGPRs. */
6920 num_params = 0;
6921 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6922 params[num_params++] = ctx.i32;
6923 returns[num_returns++] = ctx.i32;
6924 }
6925 last_sgpr = num_params - 1;
6926
6927 /* 4 preloaded VGPRs (outputs must be floats) */
6928 for (i = 0; i < 4; i++) {
6929 params[num_params++] = ctx.i32;
6930 returns[num_returns++] = ctx.f32;
6931 }
6932
6933 /* Vertex load indices. */
6934 for (i = 0; i <= key->vs_prolog.last_input; i++)
6935 returns[num_returns++] = ctx.f32;
6936
6937 /* Create the function. */
6938 si_create_function(&ctx, returns, num_returns, params,
6939 num_params, last_sgpr);
6940 func = ctx.radeon_bld.main_fn;
6941
6942 /* Copy inputs to outputs. This should be no-op, as the registers match,
6943 * but it will prevent the compiler from overwriting them unintentionally.
6944 */
6945 ret = ctx.return_value;
6946 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6947 LLVMValueRef p = LLVMGetParam(func, i);
6948 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6949 }
6950 for (i = num_params - 4; i < num_params; i++) {
6951 LLVMValueRef p = LLVMGetParam(func, i);
6952 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6953 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6954 }
6955
6956 /* Compute vertex load indices from instance divisors. */
6957 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6958 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6959 LLVMValueRef index;
6960
6961 if (divisor) {
6962 /* InstanceID / Divisor + StartInstance */
6963 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6964 SI_SGPR_START_INSTANCE,
6965 divisor);
6966 } else {
6967 /* VertexID + BaseVertex */
6968 index = LLVMBuildAdd(gallivm->builder,
6969 LLVMGetParam(func, ctx.param_vertex_id),
6970 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6971 }
6972
6973 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6974 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6975 num_params++, "");
6976 }
6977
6978 /* Compile. */
6979 si_llvm_build_ret(&ctx, ret);
6980 radeon_llvm_finalize_module(&ctx.radeon_bld);
6981
6982 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6983 gallivm->module, debug, ctx.type,
6984 "Vertex Shader Prolog"))
6985 status = false;
6986
6987 radeon_llvm_dispose(&ctx.radeon_bld);
6988 return status;
6989 }
6990
6991 /**
6992 * Compile the vertex shader epilog. This is also used by the tessellation
6993 * evaluation shader compiled as VS.
6994 *
6995 * The input is PrimitiveID.
6996 *
6997 * If PrimitiveID is required by the pixel shader, export it.
6998 * Otherwise, do nothing.
6999 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Taking the address of a ctx member before si_init_shader_ctx is
	 * fine: only the pointer is formed here, it's used after init. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. */
	num_params = key->vs_epilog.states.export_prim_id ?
		     (VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		LLVMValueRef args[9];

		/* Export PrimitiveID as a 32-bit parameter for the PS. */
		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7062
7063 /**
7064 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7065 */
static bool si_get_vs_epilog(struct si_screen *sscreen,
			     LLVMTargetMachineRef tm,
			     struct si_shader *shader,
			     struct pipe_debug_callback *debug,
			     struct si_vs_epilog_bits *states)
{
	union si_shader_part_key epilog_key;

	/* Zero the whole key: si_get_shader_part compares keys with memcmp. */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.vs_epilog.states = *states;

	/* Set up the PrimitiveID output. */
	if (shader->key.vs.epilog.export_prim_id) {
		/* PrimitiveID goes into a fresh param export slot right
		 * after the regular outputs. */
		unsigned index = shader->selector->info.num_outputs;
		unsigned offset = shader->info.nr_param_exports++;

		epilog_key.vs_epilog.prim_id_param_offset = offset;
		assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
		shader->info.vs_output_param_offset[index] = offset;
	}

	/* Compile (or fetch a cached) epilog for this key. */
	shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
					    &epilog_key, tm, debug,
					    si_compile_vs_epilog);
	return shader->epilog != NULL;
}
7092
7093 /**
7094 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7095 */
static bool si_shader_select_vs_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	unsigned i;

	/* Get the prolog. */
	/* Zero the whole key: si_get_shader_part compares keys with memcmp. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.vs_prolog.states = shader->key.vs.prolog;
	prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;

	/* The prolog is a no-op if there are no inputs. */
	if (info->num_inputs) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->vs_prologs,
					   &prolog_key, tm, debug,
					   si_compile_vs_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. */
	/* LS and ES variants feed fixed-function stages and need no epilog. */
	if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
	    !si_get_vs_epilog(sscreen, tm, shader, debug,
			      &shader->key.vs.epilog))
		return false;

	/* Set the instanceID flag. */
	/* NOTE(review): assumes instance_divisors[] has at least
	 * info->num_inputs entries — confirm against the key definition. */
	for (i = 0; i < info->num_inputs; i++)
		if (prolog_key.vs_prolog.states.instance_divisors[i])
			shader->info.uses_instanceid = true;

	return true;
}
7134
7135 /**
7136 * Select and compile (or reuse) TES parts (epilog).
7137 */
7138 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7139 LLVMTargetMachineRef tm,
7140 struct si_shader *shader,
7141 struct pipe_debug_callback *debug)
7142 {
7143 if (shader->key.tes.as_es)
7144 return true;
7145
7146 /* TES compiled as VS. */
7147 return si_get_vs_epilog(sscreen, tm, shader, debug,
7148 &shader->key.tes.epilog);
7149 }
7150
7151 /**
 * Compile the TCS epilog. This writes tessellation factors to memory based on
 * the output primitive type of the tessellator (determined by TES).
7154 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
	/* The full TCS SGPR layout is declared anyway so the parameter
	 * indices match the main shader part. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* Three VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors using the three VGPR inputs above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7215
7216 /**
7217 * Select and compile (or reuse) TCS parts (epilog).
7218 */
7219 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7220 LLVMTargetMachineRef tm,
7221 struct si_shader *shader,
7222 struct pipe_debug_callback *debug)
7223 {
7224 union si_shader_part_key epilog_key;
7225
7226 /* Get the epilog. */
7227 memset(&epilog_key, 0, sizeof(epilog_key));
7228 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7229
7230 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7231 &epilog_key, tm, debug,
7232 si_compile_tcs_epilog);
7233 return shader->epilog != NULL;
7234 }
7235
7236 /**
7237 * Compile the pixel shader prolog. This handles:
7238 * - two-side color selection and interpolation
7239 * - overriding interpolation parameters for the API PS
7240 * - polygon stippling
7241 *
7242 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
7244 * Interpolated colors are stored after the preloaded VGPRs.
7245 */
7246 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7247 LLVMTargetMachineRef tm,
7248 struct pipe_debug_callback *debug,
7249 struct si_shader_part *out)
7250 {
7251 union si_shader_part_key *key = &out->key;
7252 struct si_shader shader = {};
7253 struct si_shader_context ctx;
7254 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7255 LLVMTypeRef *params;
7256 LLVMValueRef ret, func;
7257 int last_sgpr, num_params, num_returns, i, num_color_channels;
7258 bool status = true;
7259
7260 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7261 ctx.type = PIPE_SHADER_FRAGMENT;
7262 shader.key.ps.prolog = key->ps_prolog.states;
7263
7264 /* Number of inputs + 8 color elements. */
7265 params = alloca((key->ps_prolog.num_input_sgprs +
7266 key->ps_prolog.num_input_vgprs + 8) *
7267 sizeof(LLVMTypeRef));
7268
7269 /* Declare inputs. */
7270 num_params = 0;
7271 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7272 params[num_params++] = ctx.i32;
7273 last_sgpr = num_params - 1;
7274
7275 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7276 params[num_params++] = ctx.f32;
7277
7278 /* Declare outputs (same as inputs + add colors if needed) */
7279 num_returns = num_params;
7280 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7281 for (i = 0; i < num_color_channels; i++)
7282 params[num_returns++] = ctx.f32;
7283
7284 /* Create the function. */
7285 si_create_function(&ctx, params, num_returns, params,
7286 num_params, last_sgpr);
7287 func = ctx.radeon_bld.main_fn;
7288
7289 /* Copy inputs to outputs. This should be no-op, as the registers match,
7290 * but it will prevent the compiler from overwriting them unintentionally.
7291 */
7292 ret = ctx.return_value;
7293 for (i = 0; i < num_params; i++) {
7294 LLVMValueRef p = LLVMGetParam(func, i);
7295 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7296 }
7297
7298 /* Polygon stippling. */
7299 if (key->ps_prolog.states.poly_stipple) {
7300 /* POS_FIXED_PT is always last. */
7301 unsigned pos = key->ps_prolog.num_input_sgprs +
7302 key->ps_prolog.num_input_vgprs - 1;
7303 LLVMValueRef ptr[2], list;
7304
7305 /* Get the pointer to rw buffers. */
7306 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7307 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7308 list = lp_build_gather_values(gallivm, ptr, 2);
7309 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7310 list = LLVMBuildIntToPtr(gallivm->builder, list,
7311 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7312
7313 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7314 }
7315
7316 if (key->ps_prolog.states.bc_optimize_for_persp ||
7317 key->ps_prolog.states.bc_optimize_for_linear) {
7318 unsigned i, base = key->ps_prolog.num_input_sgprs;
7319 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7320
7321 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7322 * The hw doesn't compute CENTROID if the whole wave only
7323 * contains fully-covered quads.
7324 *
7325 * PRIM_MASK is after user SGPRs.
7326 */
7327 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7328 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7329 LLVMConstInt(ctx.i32, 31, 0), "");
7330 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7331 ctx.i1, "");
7332
7333 if (key->ps_prolog.states.bc_optimize_for_persp) {
7334 /* Read PERSP_CENTER. */
7335 for (i = 0; i < 2; i++)
7336 center[i] = LLVMGetParam(func, base + 2 + i);
7337 /* Read PERSP_CENTROID. */
7338 for (i = 0; i < 2; i++)
7339 centroid[i] = LLVMGetParam(func, base + 4 + i);
7340 /* Select PERSP_CENTROID. */
7341 for (i = 0; i < 2; i++) {
7342 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7343 center[i], centroid[i], "");
7344 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7345 tmp, base + 4 + i, "");
7346 }
7347 }
7348 if (key->ps_prolog.states.bc_optimize_for_linear) {
7349 /* Read LINEAR_CENTER. */
7350 for (i = 0; i < 2; i++)
7351 center[i] = LLVMGetParam(func, base + 8 + i);
7352 /* Read LINEAR_CENTROID. */
7353 for (i = 0; i < 2; i++)
7354 centroid[i] = LLVMGetParam(func, base + 10 + i);
7355 /* Select LINEAR_CENTROID. */
7356 for (i = 0; i < 2; i++) {
7357 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7358 center[i], centroid[i], "");
7359 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7360 tmp, base + 10 + i, "");
7361 }
7362 }
7363 }
7364
7365 /* Interpolate colors. */
7366 for (i = 0; i < 2; i++) {
7367 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7368 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7369 key->ps_prolog.face_vgpr_index;
7370 LLVMValueRef interp[2], color[4];
7371 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7372
7373 if (!writemask)
7374 continue;
7375
7376 /* If the interpolation qualifier is not CONSTANT (-1). */
7377 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7378 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7379 key->ps_prolog.color_interp_vgpr_index[i];
7380
7381 /* Get the (i,j) updated by bc_optimize handling. */
7382 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7383 interp_vgpr, "");
7384 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7385 interp_vgpr + 1, "");
7386 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7387 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7388 ctx.v2i32, "");
7389 }
7390
7391 /* Use the absolute location of the input. */
7392 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7393
7394 if (key->ps_prolog.states.color_two_side) {
7395 face = LLVMGetParam(func, face_vgpr);
7396 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7397 }
7398
7399 interp_fs_input(&ctx,
7400 key->ps_prolog.color_attr_index[i],
7401 TGSI_SEMANTIC_COLOR, i,
7402 key->ps_prolog.num_interp_inputs,
7403 key->ps_prolog.colors_read, interp_ij,
7404 prim_mask, face, color);
7405
7406 while (writemask) {
7407 unsigned chan = u_bit_scan(&writemask);
7408 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7409 num_params++, "");
7410 }
7411 }
7412
7413 /* Force per-sample interpolation. */
7414 if (key->ps_prolog.states.force_persp_sample_interp) {
7415 unsigned i, base = key->ps_prolog.num_input_sgprs;
7416 LLVMValueRef persp_sample[2];
7417
7418 /* Read PERSP_SAMPLE. */
7419 for (i = 0; i < 2; i++)
7420 persp_sample[i] = LLVMGetParam(func, base + i);
7421 /* Overwrite PERSP_CENTER. */
7422 for (i = 0; i < 2; i++)
7423 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7424 persp_sample[i], base + 2 + i, "");
7425 /* Overwrite PERSP_CENTROID. */
7426 for (i = 0; i < 2; i++)
7427 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7428 persp_sample[i], base + 4 + i, "");
7429 }
7430 if (key->ps_prolog.states.force_linear_sample_interp) {
7431 unsigned i, base = key->ps_prolog.num_input_sgprs;
7432 LLVMValueRef linear_sample[2];
7433
7434 /* Read LINEAR_SAMPLE. */
7435 for (i = 0; i < 2; i++)
7436 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7437 /* Overwrite LINEAR_CENTER. */
7438 for (i = 0; i < 2; i++)
7439 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7440 linear_sample[i], base + 8 + i, "");
7441 /* Overwrite LINEAR_CENTROID. */
7442 for (i = 0; i < 2; i++)
7443 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7444 linear_sample[i], base + 10 + i, "");
7445 }
7446
7447 /* Force center interpolation. */
7448 if (key->ps_prolog.states.force_persp_center_interp) {
7449 unsigned i, base = key->ps_prolog.num_input_sgprs;
7450 LLVMValueRef persp_center[2];
7451
7452 /* Read PERSP_CENTER. */
7453 for (i = 0; i < 2; i++)
7454 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7455 /* Overwrite PERSP_SAMPLE. */
7456 for (i = 0; i < 2; i++)
7457 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7458 persp_center[i], base + i, "");
7459 /* Overwrite PERSP_CENTROID. */
7460 for (i = 0; i < 2; i++)
7461 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7462 persp_center[i], base + 4 + i, "");
7463 }
7464 if (key->ps_prolog.states.force_linear_center_interp) {
7465 unsigned i, base = key->ps_prolog.num_input_sgprs;
7466 LLVMValueRef linear_center[2];
7467
7468 /* Read LINEAR_CENTER. */
7469 for (i = 0; i < 2; i++)
7470 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7471 /* Overwrite LINEAR_SAMPLE. */
7472 for (i = 0; i < 2; i++)
7473 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7474 linear_center[i], base + 6 + i, "");
7475 /* Overwrite LINEAR_CENTROID. */
7476 for (i = 0; i < 2; i++)
7477 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7478 linear_center[i], base + 10 + i, "");
7479 }
7480
7481 /* Tell LLVM to insert WQM instruction sequence when needed. */
7482 if (key->ps_prolog.wqm) {
7483 LLVMAddTargetDependentFunctionAttr(func,
7484 "amdgpu-ps-wqm-outputs", "");
7485 }
7486
7487 /* Compile. */
7488 si_llvm_build_ret(&ctx, ret);
7489 radeon_llvm_finalize_module(&ctx.radeon_bld);
7490
7491 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7492 gallivm->module, debug, ctx.type,
7493 "Fragment Shader Prolog"))
7494 status = false;
7495
7496 radeon_llvm_dispose(&ctx.radeon_bld);
7497 return status;
7498 }
7499
7500 /**
7501 * Compile the pixel shader epilog. This handles everything that must be
7502 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7503 */
7504 static bool si_compile_ps_epilog(struct si_screen *sscreen,
7505 LLVMTargetMachineRef tm,
7506 struct pipe_debug_callback *debug,
7507 struct si_shader_part *out)
7508 {
7509 union si_shader_part_key *key = &out->key;
7510 struct si_shader shader = {};
7511 struct si_shader_context ctx;
7512 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7513 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7514 LLVMTypeRef params[16+8*4+3];
7515 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7516 int last_sgpr, num_params, i;
7517 bool status = true;
7518 struct si_ps_exports exp = {};
7519
7520 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7521 ctx.type = PIPE_SHADER_FRAGMENT;
7522 shader.key.ps.epilog = key->ps_epilog.states;
7523
7524 /* Declare input SGPRs. */
7525 params[SI_PARAM_RW_BUFFERS] = ctx.i64;
7526 params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
7527 params[SI_PARAM_SAMPLERS] = ctx.i64;
7528 params[SI_PARAM_IMAGES] = ctx.i64;
7529 params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
7530 params[SI_PARAM_ALPHA_REF] = ctx.f32;
7531 last_sgpr = SI_PARAM_ALPHA_REF;
7532
7533 /* Declare input VGPRs. */
7534 num_params = (last_sgpr + 1) +
7535 util_bitcount(key->ps_epilog.colors_written) * 4 +
7536 key->ps_epilog.writes_z +
7537 key->ps_epilog.writes_stencil +
7538 key->ps_epilog.writes_samplemask;
7539
7540 num_params = MAX2(num_params,
7541 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7542
7543 assert(num_params <= ARRAY_SIZE(params));
7544
7545 for (i = last_sgpr + 1; i < num_params; i++)
7546 params[i] = ctx.f32;
7547
7548 /* Create the function. */
7549 si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
7550 /* Disable elimination of unused inputs. */
7551 radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
7552 "InitialPSInputAddr", 0xffffff);
7553
7554 /* Process colors. */
7555 unsigned vgpr = last_sgpr + 1;
7556 unsigned colors_written = key->ps_epilog.colors_written;
7557 int last_color_export = -1;
7558
7559 /* Find the last color export. */
7560 if (!key->ps_epilog.writes_z &&
7561 !key->ps_epilog.writes_stencil &&
7562 !key->ps_epilog.writes_samplemask) {
7563 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7564
7565 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7566 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7567 /* Just set this if any of the colorbuffers are enabled. */
7568 if (spi_format &
7569 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7570 last_color_export = 0;
7571 } else {
7572 for (i = 0; i < 8; i++)
7573 if (colors_written & (1 << i) &&
7574 (spi_format >> (i * 4)) & 0xf)
7575 last_color_export = i;
7576 }
7577 }
7578
7579 while (colors_written) {
7580 LLVMValueRef color[4];
7581 int mrt = u_bit_scan(&colors_written);
7582
7583 for (i = 0; i < 4; i++)
7584 color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7585
7586 si_export_mrt_color(bld_base, color, mrt,
7587 num_params - 1,
7588 mrt == last_color_export, &exp);
7589 }
7590
7591 /* Process depth, stencil, samplemask. */
7592 if (key->ps_epilog.writes_z)
7593 depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7594 if (key->ps_epilog.writes_stencil)
7595 stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7596 if (key->ps_epilog.writes_samplemask)
7597 samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7598
7599 if (depth || stencil || samplemask)
7600 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7601 else if (last_color_export == -1)
7602 si_export_null(bld_base);
7603
7604 if (exp.num)
7605 si_emit_ps_exports(&ctx, &exp);
7606
7607 /* Compile. */
7608 LLVMBuildRetVoid(gallivm->builder);
7609 radeon_llvm_finalize_module(&ctx.radeon_bld);
7610
7611 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7612 gallivm->module, debug, ctx.type,
7613 "Fragment Shader Epilog"))
7614 status = false;
7615
7616 radeon_llvm_dispose(&ctx.radeon_bld);
7617 return status;
7618 }
7619
7620 /**
7621 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7622 */
7623 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7624 LLVMTargetMachineRef tm,
7625 struct si_shader *shader,
7626 struct pipe_debug_callback *debug)
7627 {
7628 struct tgsi_shader_info *info = &shader->selector->info;
7629 union si_shader_part_key prolog_key;
7630 union si_shader_part_key epilog_key;
7631 unsigned i;
7632
7633 /* Get the prolog. */
7634 memset(&prolog_key, 0, sizeof(prolog_key));
7635 prolog_key.ps_prolog.states = shader->key.ps.prolog;
7636 prolog_key.ps_prolog.colors_read = info->colors_read;
7637 prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7638 prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7639 prolog_key.ps_prolog.wqm = info->uses_derivatives &&
7640 (prolog_key.ps_prolog.colors_read ||
7641 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7642 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7643 prolog_key.ps_prolog.states.force_persp_center_interp ||
7644 prolog_key.ps_prolog.states.force_linear_center_interp ||
7645 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7646 prolog_key.ps_prolog.states.bc_optimize_for_linear);
7647
7648 if (info->colors_read) {
7649 unsigned *color = shader->selector->color_attr_index;
7650
7651 if (shader->key.ps.prolog.color_two_side) {
7652 /* BCOLORs are stored after the last input. */
7653 prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
7654 prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7655 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7656 }
7657
7658 for (i = 0; i < 2; i++) {
7659 unsigned interp = info->input_interpolate[color[i]];
7660 unsigned location = info->input_interpolate_loc[color[i]];
7661
7662 if (!(info->colors_read & (0xf << i*4)))
7663 continue;
7664
7665 prolog_key.ps_prolog.color_attr_index[i] = color[i];
7666
7667 if (shader->key.ps.prolog.flatshade_colors &&
7668 interp == TGSI_INTERPOLATE_COLOR)
7669 interp = TGSI_INTERPOLATE_CONSTANT;
7670
7671 switch (interp) {
7672 case TGSI_INTERPOLATE_CONSTANT:
7673 prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
7674 break;
7675 case TGSI_INTERPOLATE_PERSPECTIVE:
7676 case TGSI_INTERPOLATE_COLOR:
7677 /* Force the interpolation location for colors here. */
7678 if (shader->key.ps.prolog.force_persp_sample_interp)
7679 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7680 if (shader->key.ps.prolog.force_persp_center_interp)
7681 location = TGSI_INTERPOLATE_LOC_CENTER;
7682
7683 switch (location) {
7684 case TGSI_INTERPOLATE_LOC_SAMPLE:
7685 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
7686 shader->config.spi_ps_input_ena |=
7687 S_0286CC_PERSP_SAMPLE_ENA(1);
7688 break;
7689 case TGSI_INTERPOLATE_LOC_CENTER:
7690 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
7691 shader->config.spi_ps_input_ena |=
7692 S_0286CC_PERSP_CENTER_ENA(1);
7693 break;
7694 case TGSI_INTERPOLATE_LOC_CENTROID:
7695 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
7696 shader->config.spi_ps_input_ena |=
7697 S_0286CC_PERSP_CENTROID_ENA(1);
7698 break;
7699 default:
7700 assert(0);
7701 }
7702 break;
7703 case TGSI_INTERPOLATE_LINEAR:
7704 /* Force the interpolation location for colors here. */
7705 if (shader->key.ps.prolog.force_linear_sample_interp)
7706 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7707 if (shader->key.ps.prolog.force_linear_center_interp)
7708 location = TGSI_INTERPOLATE_LOC_CENTER;
7709
7710 switch (location) {
7711 case TGSI_INTERPOLATE_LOC_SAMPLE:
7712 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
7713 shader->config.spi_ps_input_ena |=
7714 S_0286CC_LINEAR_SAMPLE_ENA(1);
7715 break;
7716 case TGSI_INTERPOLATE_LOC_CENTER:
7717 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
7718 shader->config.spi_ps_input_ena |=
7719 S_0286CC_LINEAR_CENTER_ENA(1);
7720 break;
7721 case TGSI_INTERPOLATE_LOC_CENTROID:
7722 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
7723 shader->config.spi_ps_input_ena |=
7724 S_0286CC_LINEAR_CENTROID_ENA(1);
7725 break;
7726 default:
7727 assert(0);
7728 }
7729 break;
7730 default:
7731 assert(0);
7732 }
7733 }
7734 }
7735
7736 /* The prolog is a no-op if these aren't set. */
7737 if (prolog_key.ps_prolog.colors_read ||
7738 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7739 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7740 prolog_key.ps_prolog.states.force_persp_center_interp ||
7741 prolog_key.ps_prolog.states.force_linear_center_interp ||
7742 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7743 prolog_key.ps_prolog.states.bc_optimize_for_linear ||
7744 prolog_key.ps_prolog.states.poly_stipple) {
7745 shader->prolog =
7746 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7747 &prolog_key, tm, debug,
7748 si_compile_ps_prolog);
7749 if (!shader->prolog)
7750 return false;
7751 }
7752
7753 /* Get the epilog. */
7754 memset(&epilog_key, 0, sizeof(epilog_key));
7755 epilog_key.ps_epilog.colors_written = info->colors_written;
7756 epilog_key.ps_epilog.writes_z = info->writes_z;
7757 epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
7758 epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
7759 epilog_key.ps_epilog.states = shader->key.ps.epilog;
7760
7761 shader->epilog =
7762 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7763 &epilog_key, tm, debug,
7764 si_compile_ps_epilog);
7765 if (!shader->epilog)
7766 return false;
7767
7768 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7769 if (shader->key.ps.prolog.poly_stipple) {
7770 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7771 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7772 }
7773
7774 /* Set up the enable bits for per-sample shading if needed. */
7775 if (shader->key.ps.prolog.force_persp_sample_interp &&
7776 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7777 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7778 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7779 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7780 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7781 }
7782 if (shader->key.ps.prolog.force_linear_sample_interp &&
7783 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7784 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7785 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7786 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7787 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7788 }
7789 if (shader->key.ps.prolog.force_persp_center_interp &&
7790 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7791 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7792 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7793 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7794 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7795 }
7796 if (shader->key.ps.prolog.force_linear_center_interp &&
7797 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7798 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7799 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7800 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7801 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7802 }
7803
7804 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7805 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7806 !(shader->config.spi_ps_input_ena & 0xf)) {
7807 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7808 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7809 }
7810
7811 /* At least one pair of interpolation weights must be enabled. */
7812 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7813 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7814 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7815 }
7816
7817 /* The sample mask input is always enabled, because the API shader always
7818 * passes it through to the epilog. Disable it here if it's unused.
7819 */
7820 if (!shader->key.ps.epilog.poly_line_smoothing &&
7821 !shader->selector->info.reads_samplemask)
7822 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7823
7824 return true;
7825 }
7826
7827 static void si_fix_num_sgprs(struct si_shader *shader)
7828 {
7829 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7830
7831 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7832 }
7833
/**
 * Create a shader variant: either compile it monolithically or assemble it
 * from the precompiled main part plus selected prolog/epilog parts, then
 * dump stats and upload the binary to GPU-accessible memory.
 *
 * \return 0 on success, a non-zero error code on failure
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. */
		shader->is_binary_shared = true; /* binary stays owned by mainp */
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts to cover the prolog/epilog too. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7939
7940 void si_shader_destroy(struct si_shader *shader)
7941 {
7942 if (shader->gs_copy_shader) {
7943 si_shader_destroy(shader->gs_copy_shader);
7944 FREE(shader->gs_copy_shader);
7945 }
7946
7947 if (shader->scratch_bo)
7948 r600_resource_reference(&shader->scratch_bo, NULL);
7949
7950 r600_resource_reference(&shader->bo, NULL);
7951
7952 if (!shader->is_binary_shared)
7953 radeon_shader_binary_clean(&shader->binary);
7954 }