radeonsi: drop unnecessary u_pstipple.h include
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "gallivm/lp_bld_misc.h"
37 #include "radeon/r600_cs.h"
38 #include "radeon/radeon_llvm.h"
39 #include "radeon/radeon_elf_util.h"
40 #include "radeon/radeon_llvm_emit.h"
41 #include "util/u_memory.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* ELF symbol names for the two dwords of the scratch buffer resource
 * descriptor; patched into the shader binary outside this file
 * (NOTE(review): presumably during upload — confirm against
 * radeon_elf_util users). */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
59
/* One shader output: its four channel values plus its TGSI semantic. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one LLVM value per component (x,y,z,w) */
	unsigned name;		/* TGSI_SEMANTIC_* */
	unsigned sid;		/* semantic index */
};
66
/* Per-compilation state threaded through all the emit helpers in this file. */
struct si_shader_context
{
	/* Must be first: si_shader_context() recovers this struct by casting
	 * the embedded lp_build_tgsi_context pointer. */
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of the main function's input SGPR/VGPR parameters, assigned
	 * at function creation and read by the fetch helpers below. */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;

	/* LLVM metadata kind IDs used to tag loads; see build_indexed_load()
	 * and build_indexed_load_const(). */
	unsigned invariant_load_md_kind;
	unsigned range_md_kind;
	unsigned uniform_md_kind;
	LLVMValueRef empty_md;

	/* Cached resource/descriptor values. */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;	/* LDS base used by lds_load()/lds_store() */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types, cached once per context. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
137
/* Recover the si_shader_context from its embedded TGSI build context.
 * Valid because radeon_bld (and its soa.bld_base) is the first member. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = (struct si_shader_context *)bld_base;
	return ctx;
}
143
/* Forward declarations of helpers defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
			       FILE *f);
155
/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
/* NOTE(review): "OFSET" is a long-standing typo; kept as-is because other
 * code may reference this exact identifier. */
#define CENTROID_OFSET 4

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* LLVM AMDGPU address spaces (presumably const/local/user-SGPR — confirm
 * against the backend's address-space numbering). */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* s_sendmsg message types; the GS op is encoded in bits [5:4]. */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
186
187 /**
188 * Returns a unique index for a semantic name and index. The index must be
189 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
190 * calculated.
191 */
192 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
193 {
194 switch (semantic_name) {
195 case TGSI_SEMANTIC_POSITION:
196 return 0;
197 case TGSI_SEMANTIC_PSIZE:
198 return 1;
199 case TGSI_SEMANTIC_CLIPDIST:
200 assert(index <= 1);
201 return 2 + index;
202 case TGSI_SEMANTIC_GENERIC:
203 if (index <= 63-4)
204 return 4 + index;
205 else
206 /* same explanation as in the default statement,
207 * the only user hitting this is st/nine.
208 */
209 return 0;
210
211 /* patch indices are completely separate and thus start from 0 */
212 case TGSI_SEMANTIC_TESSOUTER:
213 return 0;
214 case TGSI_SEMANTIC_TESSINNER:
215 return 1;
216 case TGSI_SEMANTIC_PATCH:
217 return 2 + index;
218
219 default:
220 /* Don't fail here. The result of this function is only used
221 * for LS, TCS, TES, and GS, where legacy GL semantics can't
222 * occur, but this function is called for all vertex shaders
223 * before it's known whether LS will be compiled or not.
224 */
225 return 0;
226 }
227 }
228
229 /**
230 * Get the value of a shader input parameter and extract a bitfield.
231 */
232 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
233 unsigned param, unsigned rshift,
234 unsigned bitwidth)
235 {
236 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
237 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
238 param);
239
240 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
241 value = bitcast(&ctx->radeon_bld.soa.bld_base,
242 TGSI_TYPE_UNSIGNED, value);
243
244 if (rshift)
245 value = LLVMBuildLShr(gallivm->builder, value,
246 lp_build_const_int32(gallivm, rshift), "");
247
248 if (rshift + bitwidth < 32) {
249 unsigned mask = (1 << bitwidth) - 1;
250 value = LLVMBuildAnd(gallivm->builder, value,
251 lp_build_const_int32(gallivm, mask), "");
252 }
253
254 return value;
255 }
256
257 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
258 {
259 switch (ctx->type) {
260 case PIPE_SHADER_TESS_CTRL:
261 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
262
263 case PIPE_SHADER_TESS_EVAL:
264 return LLVMGetParam(ctx->radeon_bld.main_fn,
265 ctx->param_tes_rel_patch_id);
266
267 default:
268 assert(0);
269 return NULL;
270 }
271 }
272
273 /* Tessellation shaders pass outputs to the next shader using LDS.
274 *
275 * LS outputs = TCS inputs
276 * TCS outputs = TES inputs
277 *
278 * The LDS layout is:
279 * - TCS inputs for patch 0
280 * - TCS inputs for patch 1
281 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
282 * - ...
283 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
284 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
285 * - TCS outputs for patch 1
286 * - Per-patch TCS outputs for patch 1
287 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
288 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
289 * - ...
290 *
291 * All three shaders VS(LS), TCS, TES share the same LDS space.
292 */
293
294 static LLVMValueRef
295 get_tcs_in_patch_stride(struct si_shader_context *ctx)
296 {
297 if (ctx->type == PIPE_SHADER_VERTEX)
298 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
299 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
300 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
301 else {
302 assert(0);
303 return NULL;
304 }
305 }
306
/* Stride of one patch's TCS outputs, unpacked from the output-layout SGPR. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
312
313 static LLVMValueRef
314 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
315 {
316 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
317 unpack_param(ctx,
318 SI_PARAM_TCS_OUT_OFFSETS,
319 0, 16),
320 4);
321 }
322
323 static LLVMValueRef
324 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
325 {
326 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
327 unpack_param(ctx,
328 SI_PARAM_TCS_OUT_OFFSETS,
329 16, 16),
330 4);
331 }
332
333 static LLVMValueRef
334 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
335 {
336 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
337 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
338 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
339
340 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
341 }
342
343 static LLVMValueRef
344 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
345 {
346 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
347 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
348 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
349 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
350
351 return LLVMBuildAdd(gallivm->builder, patch0_offset,
352 LLVMBuildMul(gallivm->builder, patch_stride,
353 rel_patch_id, ""),
354 "");
355 }
356
357 static LLVMValueRef
358 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
359 {
360 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
361 LLVMValueRef patch0_patch_data_offset =
362 get_tcs_out_patch0_patch_data_offset(ctx);
363 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
364 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
365
366 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
367 LLVMBuildMul(gallivm->builder, patch_stride,
368 rel_patch_id, ""),
369 "");
370 }
371
372 static void build_indexed_store(struct si_shader_context *ctx,
373 LLVMValueRef base_ptr, LLVMValueRef index,
374 LLVMValueRef value)
375 {
376 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
377 struct gallivm_state *gallivm = bld_base->base.gallivm;
378 LLVMValueRef indices[2], pointer;
379
380 indices[0] = bld_base->uint_bld.zero;
381 indices[1] = index;
382
383 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
384 LLVMBuildStore(gallivm->builder, value, pointer);
385 }
386
387 /**
388 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
389 * It's equivalent to doing a load from &base_ptr[index].
390 *
391 * \param base_ptr Where the array starts.
392 * \param index The element index into the array.
393 * \param uniform Whether the base_ptr and index can be assumed to be
394 * dynamically uniform
395 */
396 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
397 LLVMValueRef base_ptr, LLVMValueRef index,
398 bool uniform)
399 {
400 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
401 struct gallivm_state *gallivm = bld_base->base.gallivm;
402 LLVMValueRef indices[2], pointer;
403
404 indices[0] = bld_base->uint_bld.zero;
405 indices[1] = index;
406
407 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
408 if (uniform)
409 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
410 return LLVMBuildLoad(gallivm->builder, pointer, "");
411 }
412
413 /**
414 * Do a load from &base_ptr[index], but also add a flag that it's loading
415 * a constant from a dynamically uniform index.
416 */
417 static LLVMValueRef build_indexed_load_const(
418 struct si_shader_context *ctx,
419 LLVMValueRef base_ptr, LLVMValueRef index)
420 {
421 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
422 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
423 return result;
424 }
425
426 static LLVMValueRef get_instance_index_for_fetch(
427 struct radeon_llvm_context *radeon_bld,
428 unsigned param_start_instance, unsigned divisor)
429 {
430 struct si_shader_context *ctx =
431 si_shader_context(&radeon_bld->soa.bld_base);
432 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
433
434 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
435 ctx->param_instance_id);
436
437 /* The division must be done before START_INSTANCE is added. */
438 if (divisor > 1)
439 result = LLVMBuildUDiv(gallivm->builder, result,
440 lp_build_const_int32(gallivm, divisor), "");
441
442 return LLVMBuildAdd(gallivm->builder, result,
443 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
444 }
445
/**
 * Declare one vertex-shader input: fetch the attribute via the
 * llvm.SI.vs.load.input intrinsic and store its four components into the
 * radeon_llvm input array.
 *
 * The fetch index comes from one of three places:
 * - non-monolithic shaders: a dedicated input parameter set up by the prolog,
 * - instanced attributes (divisor != 0): instance id / divisor + start
 *   instance,
 * - otherwise: base vertex + vertex id.
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	/* Per-attribute instance divisor from the shader key (0 = per-vertex). */
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* The prolog computed the index; read it from the parameter. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
514
515 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
516 unsigned swizzle)
517 {
518 struct si_shader_context *ctx = si_shader_context(bld_base);
519
520 if (swizzle > 0)
521 return bld_base->uint_bld.zero;
522
523 switch (ctx->type) {
524 case PIPE_SHADER_VERTEX:
525 return LLVMGetParam(ctx->radeon_bld.main_fn,
526 ctx->param_vs_prim_id);
527 case PIPE_SHADER_TESS_CTRL:
528 return LLVMGetParam(ctx->radeon_bld.main_fn,
529 SI_PARAM_PATCH_ID);
530 case PIPE_SHADER_TESS_EVAL:
531 return LLVMGetParam(ctx->radeon_bld.main_fn,
532 ctx->param_tes_patch_id);
533 case PIPE_SHADER_GEOMETRY:
534 return LLVMGetParam(ctx->radeon_bld.main_fn,
535 SI_PARAM_PRIMITIVE_ID);
536 default:
537 assert(0);
538 return bld_base->uint_bld.zero;
539 }
540 }
541
542 /**
543 * Return the value of tgsi_ind_register for indexing.
544 * This is the indirect index with the constant offset added to it.
545 */
546 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
547 const struct tgsi_ind_register *ind,
548 int rel_index)
549 {
550 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
551 LLVMValueRef result;
552
553 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
554 result = LLVMBuildLoad(gallivm->builder, result, "");
555 result = LLVMBuildAdd(gallivm->builder, result,
556 lp_build_const_int32(gallivm, rel_index), "");
557 return result;
558 }
559
/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
	LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
	LLVMValueRef cc;

	/* LLVM 3.8: If indirect resource indexing is used:
	 * - SI & CIK hang
	 * - VI crashes
	 */
	if (HAVE_LLVM <= 0x0308)
		return LLVMGetUndef(ctx->i32);

	if (util_is_power_of_two(num)) {
		/* Power-of-two bound: a bit-wise AND suffices. */
		result = LLVMBuildAnd(builder, result, c_max, "");
	} else {
		/* In theory, this MAX pattern should result in code that is
		 * as good as the bit-wise AND above.
		 *
		 * In practice, LLVM generates worse code (at the time of
		 * writing), because its value tracking is not strong enough.
		 */
		cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
		result = LLVMBuildSelect(builder, cc, result, c_max, "");
	}

	return result;
}
596

/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst		destination register (used when \p src is NULL)
 * \param src		source register; takes precedence over \p dst
 * \param vertex_dw_stride  dword stride between vertices of a 2D register
 * \param base_addr	starting dword address to add onto
 * \return base_addr + vertex offset + indirect offset + semantic slot offset
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* For declared arrays, the relative index is against the
		 * array's first register, not the register itself. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register is 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				    LLVMBuildMul(gallivm->builder, ind_index,
						 lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
682
/* The offchip buffer layout for TCS->TES is
 *
 * - attribute 0 of patch 0 vertex 0
 * - attribute 0 of patch 0 vertex 1
 * - attribute 0 of patch 0 vertex 2
 *   ...
 * - attribute 0 of patch 1 vertex 0
 * - attribute 0 of patch 1 vertex 1
 *   ...
 * - attribute 1 of patch 0 vertex 0
 * - attribute 1 of patch 0 vertex 1
 *   ...
 * - per patch attribute 0 of patch 0
 * - per patch attribute 0 of patch 1
 *   ...
 *
 * Note that every attribute has 4 components.
 */
/**
 * Compute the byte address of an attribute in the offchip buffer above.
 *
 * \param vertex_index	per-vertex attribute index, or NULL for a per-patch
 *			attribute (which also adds the patch-data offset)
 * \param param_index	attribute slot index
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Patch counts/sizes are packed in the offchip-layout SGPR. */
	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex: element index = rel_patch_id * verts/patch
		 * + vertex_index; attributes stride by total vertex count. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: one element per patch. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* Each attribute is a vec4 = 16 bytes. */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data starts after all per-vertex attributes. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
743
/**
 * Like get_tcs_tes_buffer_address, but derive the vertex and parameter
 * indices from a TGSI register (\p src takes precedence over \p dst).
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
				struct si_shader_context *ctx,
				const struct tgsi_full_dst_register *dst,
				const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* A 2-dimensional register selects a vertex within the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
							    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect addressing is relative to the array's first
		 * register when an ArrayID is present. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	/* Convert the semantic to its unique attribute slot and add the
	 * dynamic part of the index. */
	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   lp_build_const_int32(gallivm, param_index_base),
				   "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
806
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * \param rsrc		buffer resource descriptor
 * \param vdata		the data to store
 * \param vaddr		per-thread offset (used when offen/idxen are set)
 * \param soffset	scalar offset
 * \param inst_offset	immediate offset (12-bit field unless offen is set)
 * \param dfmt,nfmt	V_008F0C_BUF_{DATA,NUM}_FORMAT_* encodings
 * \param offen,idxen,glc,slc,tfe  instruction modifier bits (0 or 1)
 */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, ARRAY_SIZE(args), 0);
}
854
855 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
856 LLVMValueRef rsrc,
857 LLVMValueRef vdata,
858 unsigned num_channels,
859 LLVMValueRef vaddr,
860 LLVMValueRef soffset,
861 unsigned inst_offset)
862 {
863 static unsigned dfmt[] = {
864 V_008F0C_BUF_DATA_FORMAT_32,
865 V_008F0C_BUF_DATA_FORMAT_32_32,
866 V_008F0C_BUF_DATA_FORMAT_32_32_32,
867 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
868 };
869 assert(num_channels >= 1 && num_channels <= 4);
870
871 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
872 inst_offset, dfmt[num_channels-1],
873 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
874 }
875
/**
 * Emit a buffer load of 1, 2, or 4 channels.
 *
 * Uses llvm.amdgcn.buffer.load.* on LLVM >= 3.9 and the older
 * llvm.SI.buffer.load.dword.* intrinsic otherwise.
 *
 * \param rsrc		buffer resource descriptor
 * \param num_channels	1, 2, or 4 (3 is clamped to the vec4 variant)
 * \param vindex	element index, or NULL
 * \param voffset	per-thread byte offset, or NULL
 * \param soffset	scalar byte offset, or NULL (new path folds it into
 *			the immediate offset argument)
 * \param glc,slc	cache-control bits
 */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		/* The new intrinsic takes a single offset operand; fold the
		 * optional offsets into it. */
		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	} else {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		/* Both offsets present: pass them as a v2i32 address. */
		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	}
}
951
/**
 * Load a TGSI-typed value from a memory buffer.
 *
 * \param type		TGSI type of the result
 * \param swizzle	component (0..3), or ~0 to load a full vec4
 * \param buffer	buffer resource descriptor
 * \param offset	passed through as soffset
 * \param base		passed through as voffset
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* Load the vec4 and extract the requested component. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
				    lp_build_const_int32(gallivm, swizzle), "");
	}

	/* 64-bit: load two consecutive dwords and merge them. */
	value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
			          swizzle * 4, 1, 0);

	value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
			           swizzle * 4 + 4, 1, 0);

	return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
987
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* vec4: recurse once per channel and gather the results. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			    lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit: load the adjacent dword too and merge the pair. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
1028
1029 /**
1030 * Store to LDS.
1031 *
1032 * \param swizzle offset (typically 0..3)
1033 * \param dw_addr address in dwords
1034 * \param value value to store
1035 */
1036 static void lds_store(struct lp_build_tgsi_context *bld_base,
1037 unsigned swizzle, LLVMValueRef dw_addr,
1038 LLVMValueRef value)
1039 {
1040 struct si_shader_context *ctx = si_shader_context(bld_base);
1041 struct gallivm_state *gallivm = bld_base->base.gallivm;
1042
1043 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1044 lp_build_const_int32(gallivm, swizzle));
1045
1046 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1047 build_indexed_store(ctx, ctx->lds,
1048 dw_addr, value);
1049 }
1050
1051 static LLVMValueRef fetch_input_tcs(
1052 struct lp_build_tgsi_context *bld_base,
1053 const struct tgsi_full_src_register *reg,
1054 enum tgsi_opcode_type type, unsigned swizzle)
1055 {
1056 struct si_shader_context *ctx = si_shader_context(bld_base);
1057 LLVMValueRef dw_addr, stride;
1058
1059 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1060 dw_addr = get_tcs_in_current_patch_offset(ctx);
1061 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1062
1063 return lds_load(bld_base, type, swizzle, dw_addr);
1064 }
1065
1066 static LLVMValueRef fetch_output_tcs(
1067 struct lp_build_tgsi_context *bld_base,
1068 const struct tgsi_full_src_register *reg,
1069 enum tgsi_opcode_type type, unsigned swizzle)
1070 {
1071 struct si_shader_context *ctx = si_shader_context(bld_base);
1072 LLVMValueRef dw_addr, stride;
1073
1074 if (reg->Register.Dimension) {
1075 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1076 dw_addr = get_tcs_out_current_patch_offset(ctx);
1077 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1078 } else {
1079 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1080 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1081 }
1082
1083 return lds_load(bld_base, type, swizzle, dw_addr);
1084 }
1085
1086 static LLVMValueRef fetch_input_tes(
1087 struct lp_build_tgsi_context *bld_base,
1088 const struct tgsi_full_src_register *reg,
1089 enum tgsi_opcode_type type, unsigned swizzle)
1090 {
1091 struct si_shader_context *ctx = si_shader_context(bld_base);
1092 struct gallivm_state *gallivm = bld_base->base.gallivm;
1093 LLVMValueRef rw_buffers, buffer, base, addr;
1094
1095 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1096 SI_PARAM_RW_BUFFERS);
1097 buffer = build_indexed_load_const(ctx, rw_buffers,
1098 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1099
1100 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1101 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1102
1103 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1104 }
1105
/* Store a TCS output both to LDS (for reads by other TCS invocations)
 * and to the off-chip TESS_OFFCHIP ring buffer (for reads by the TES).
 *
 * Fully-written vec4s are stored to the ring with one 4-dword store;
 * partially-written registers fall back to one 1-dword store per channel.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS dword address of the output. */
	if (reg->Register.Dimension) {
		/* Per-vertex output. */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Fetch the TESS_OFFCHIP ring descriptor and the buffer address. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* Always mirror the value into LDS. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial write: store this channel individually. */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full write: emit one combined 4-dword store instead. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1171
/* Fetch a GS input from the ESGS ring buffer.
 *
 * GS inputs are written to the ring by the preceding ES stage; each
 * vertex's offset into the ring arrives in a dedicated VGPR
 * (SI_PARAM_VTX*_OFFSET). PRIMID is handled specially via
 * get_primitive_id() since it is not stored in the ring.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	/* GS inputs are always 2-dimensional (vertex, register). */
	if (!reg->Register.Dimension)
		return NULL;

	/* Vec4 request: recurse per channel and gather the results. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* Ring offsets arrive in dwords; convert to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	/* Build the llvm.SI.buffer.load.dword argument list. */
	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute);
	/* 64-bit types need a second dword from the next slot. */
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1250
1251 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1252 {
1253 switch (interpolate) {
1254 case TGSI_INTERPOLATE_CONSTANT:
1255 return 0;
1256
1257 case TGSI_INTERPOLATE_LINEAR:
1258 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1259 return SI_PARAM_LINEAR_SAMPLE;
1260 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1261 return SI_PARAM_LINEAR_CENTROID;
1262 else
1263 return SI_PARAM_LINEAR_CENTER;
1264 break;
1265 case TGSI_INTERPOLATE_COLOR:
1266 case TGSI_INTERPOLATE_PERSPECTIVE:
1267 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1268 return SI_PARAM_PERSP_SAMPLE;
1269 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1270 return SI_PARAM_PERSP_CENTROID;
1271 else
1272 return SI_PARAM_PERSP_CENTER;
1273 break;
1274 default:
1275 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1276 return -1;
1277 }
1278 }
1279
1280 /* This shouldn't be used by explicit INTERP opcodes. */
1281 static unsigned select_interp_param(struct si_shader_context *ctx,
1282 unsigned param)
1283 {
1284 if (!ctx->is_monolithic)
1285 return param;
1286
1287 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1288 switch (param) {
1289 case SI_PARAM_PERSP_CENTROID:
1290 case SI_PARAM_PERSP_CENTER:
1291 return SI_PARAM_PERSP_SAMPLE;
1292 }
1293 }
1294 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1295 switch (param) {
1296 case SI_PARAM_LINEAR_CENTROID:
1297 case SI_PARAM_LINEAR_CENTER:
1298 return SI_PARAM_LINEAR_SAMPLE;
1299 }
1300 }
1301 if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1302 switch (param) {
1303 case SI_PARAM_PERSP_CENTROID:
1304 case SI_PARAM_PERSP_SAMPLE:
1305 return SI_PARAM_PERSP_CENTER;
1306 }
1307 }
1308 if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1309 switch (param) {
1310 case SI_PARAM_LINEAR_CENTROID:
1311 case SI_PARAM_LINEAR_SAMPLE:
1312 return SI_PARAM_LINEAR_CENTER;
1313 }
1314 }
1315
1316 return param;
1317 }
1318
1319 /**
1320 * Interpolate a fragment shader input.
1321 *
1322 * @param ctx context
1323 * @param input_index index of the input in hardware
1324 * @param semantic_name TGSI_SEMANTIC_*
1325 * @param semantic_index semantic index
1326 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1327 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1328 * @param interp_param interpolation weights (i,j)
1329 * @param prim_mask SI_PARAM_PRIM_MASK
1330 * @param face SI_PARAM_FRONT_FACE
1331 * @param result the return value (4 components)
1332 */
1333 static void interp_fs_input(struct si_shader_context *ctx,
1334 unsigned input_index,
1335 unsigned semantic_name,
1336 unsigned semantic_index,
1337 unsigned num_interp_inputs,
1338 unsigned colors_read_mask,
1339 LLVMValueRef interp_param,
1340 LLVMValueRef prim_mask,
1341 LLVMValueRef face,
1342 LLVMValueRef result[4])
1343 {
1344 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1345 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1346 struct gallivm_state *gallivm = base->gallivm;
1347 const char *intr_name;
1348 LLVMValueRef attr_number;
1349
1350 unsigned chan;
1351
1352 attr_number = lp_build_const_int32(gallivm, input_index);
1353
1354 /* fs.constant returns the param from the middle vertex, so it's not
1355 * really useful for flat shading. It's meant to be used for custom
1356 * interpolation (but the intrinsic can't fetch from the other two
1357 * vertices).
1358 *
1359 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1360 * to do the right thing. The only reason we use fs.constant is that
1361 * fs.interp cannot be used on integers, because they can be equal
1362 * to NaN.
1363 */
1364 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1365
1366 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1367 ctx->shader->key.ps.prolog.color_two_side) {
1368 LLVMValueRef args[4];
1369 LLVMValueRef is_face_positive;
1370 LLVMValueRef back_attr_number;
1371
1372 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1373 * otherwise it's at offset "num_inputs".
1374 */
1375 unsigned back_attr_offset = num_interp_inputs;
1376 if (semantic_index == 1 && colors_read_mask & 0xf)
1377 back_attr_offset += 1;
1378
1379 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1380
1381 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1382 face, uint->zero, "");
1383
1384 args[2] = prim_mask;
1385 args[3] = interp_param;
1386 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1387 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1388 LLVMValueRef front, back;
1389
1390 args[0] = llvm_chan;
1391 args[1] = attr_number;
1392 front = lp_build_intrinsic(gallivm->builder, intr_name,
1393 ctx->f32, args, args[3] ? 4 : 3,
1394 LLVMReadNoneAttribute);
1395
1396 args[1] = back_attr_number;
1397 back = lp_build_intrinsic(gallivm->builder, intr_name,
1398 ctx->f32, args, args[3] ? 4 : 3,
1399 LLVMReadNoneAttribute);
1400
1401 result[chan] = LLVMBuildSelect(gallivm->builder,
1402 is_face_positive,
1403 front,
1404 back,
1405 "");
1406 }
1407 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1408 LLVMValueRef args[4];
1409
1410 args[0] = uint->zero;
1411 args[1] = attr_number;
1412 args[2] = prim_mask;
1413 args[3] = interp_param;
1414 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1415 ctx->f32, args, args[3] ? 4 : 3,
1416 LLVMReadNoneAttribute);
1417 result[1] =
1418 result[2] = lp_build_const_float(gallivm, 0.0f);
1419 result[3] = lp_build_const_float(gallivm, 1.0f);
1420 } else {
1421 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1422 LLVMValueRef args[4];
1423 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1424
1425 args[0] = llvm_chan;
1426 args[1] = attr_number;
1427 args[2] = prim_mask;
1428 args[3] = interp_param;
1429 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1430 ctx->f32, args, args[3] ? 4 : 3,
1431 LLVMReadNoneAttribute);
1432 }
1433 }
1434 }
1435
/* LLVMGetParam with bc_optimize resolved.
 *
 * When bc_optimize is enabled for a CENTROID interpolation parameter,
 * the hardware may skip computing CENTROID weights for waves that only
 * contain fully-covered quads; bit 31 of PRIM_MASK tells us whether to
 * use the CENTER weights instead.
 */
static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
				     int interp_param_idx)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
	LLVMValueRef param = NULL;

	/* Handle PRIM_MASK[31] (bc_optimize). */
	if (ctx->is_monolithic &&
	    ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
	      interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
	     (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
	      interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 */
		/* Extract bit 31 of PRIM_MASK as an i1 condition. */
		LLVMValueRef bc_optimize =
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
		bc_optimize = LLVMBuildLShr(builder,
					    bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");

		/* select(bc_optimize, CENTER, CENTROID) */
		if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
		    interp_param_idx == SI_PARAM_PERSP_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTROID),
						"");
		}
		if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
		    interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTROID),
						"");
		}
	}

	/* No bc_optimize handling needed: plain parameter fetch. */
	if (!param)
		param = LLVMGetParam(main_fn, interp_param_idx);
	return param;
}
1485
/* Declare (interpolate) a fragment shader input.
 *
 * For non-monolithic shaders, COLOR inputs have already been
 * interpolated by the PS prolog and arrive as plain VGPRs; everything
 * else is interpolated here via interp_fs_input().
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* COLOR1's VGPRs come after however many COLOR0
		 * components were read. */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		/* Unread components become undef. */
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = get_interp_param(ctx, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1541
1542 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1543 {
1544 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1545 SI_PARAM_ANCILLARY, 8, 4);
1546 }
1547
1548 /**
1549 * Set range metadata on an instruction. This can only be used on load and
1550 * call instructions. If you know an instruction can only produce the values
1551 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1552 * \p lo is the minimum value inclusive.
1553 * \p hi is the maximum value exclusive.
1554 */
1555 static void set_range_metadata(struct si_shader_context *ctx,
1556 LLVMValueRef value, unsigned lo, unsigned hi)
1557 {
1558 LLVMValueRef range_md, md_args[2];
1559 LLVMTypeRef type = LLVMTypeOf(value);
1560 LLVMContextRef context = LLVMGetTypeContext(type);
1561
1562 md_args[0] = LLVMConstInt(type, lo, false);
1563 md_args[1] = LLVMConstInt(type, hi, false);
1564 range_md = LLVMMDNodeInContext(context, md_args, 2);
1565 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1566 }
1567
1568 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1569 {
1570 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1571 LLVMValueRef tid;
1572
1573 if (HAVE_LLVM < 0x0308) {
1574 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1575 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1576 } else {
1577 LLVMValueRef tid_args[2];
1578 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1579 tid_args[1] = lp_build_const_int32(gallivm, 0);
1580 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1581 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1582 tid_args, 2, LLVMReadNoneAttribute);
1583
1584 tid = lp_build_intrinsic(gallivm->builder,
1585 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1586 tid_args, 2, LLVMReadNoneAttribute);
1587 }
1588 set_range_metadata(ctx, tid, 0, 64);
1589 return tid;
1590 }
1591
1592 /**
1593 * Load a dword from a constant buffer.
1594 */
1595 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1596 LLVMValueRef resource,
1597 LLVMValueRef offset)
1598 {
1599 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1600 LLVMValueRef args[2] = {resource, offset};
1601
1602 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1603 LLVMReadNoneAttribute);
1604 }
1605
1606 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1607 {
1608 struct si_shader_context *ctx =
1609 si_shader_context(&radeon_bld->soa.bld_base);
1610 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1611 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1612 LLVMBuilderRef builder = gallivm->builder;
1613 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1614 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1615 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1616
1617 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1618 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1619 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1620
1621 LLVMValueRef pos[4] = {
1622 buffer_load_const(ctx, resource, offset0),
1623 buffer_load_const(ctx, resource, offset1),
1624 lp_build_const_float(gallivm, 0),
1625 lp_build_const_float(gallivm, 0)
1626 };
1627
1628 return lp_build_gather_values(gallivm, pos, 4);
1629 }
1630
/* Declare a TGSI system value: compute its LLVM value from shader input
 * SGPRs/VGPRs (or ring/constant buffers) and store it in
 * radeon_bld->system_values[index] for later fetches.
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID = input vertex ID + base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* TCS gets the invocation ID from REL_IDS; GS has a
		 * dedicated SGPR. Other stages don't support it. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; W is delivered as 1/W, so apply RCP. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position within the pixel = fractional part of
		 * the fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tess ring. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dword 0, inner levels at dword 4. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* Block size is a compile-time constant taken from the
		 * CS_FIXED_BLOCK_* properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* Helper invocations are the lanes that are NOT live. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1834
1835 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1836 const struct tgsi_full_declaration *decl)
1837 {
1838 struct si_shader_context *ctx =
1839 si_shader_context(&radeon_bld->soa.bld_base);
1840 struct si_shader_selector *sel = ctx->shader->selector;
1841 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1842
1843 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1844 LLVMValueRef var;
1845
1846 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1847 assert(decl->Range.First == decl->Range.Last);
1848 assert(!ctx->shared_memory);
1849
1850 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1851 LLVMArrayType(ctx->i8, sel->local_size),
1852 "compute_lds",
1853 LOCAL_ADDR_SPACE);
1854 LLVMSetAlignment(var, 4);
1855
1856 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1857 }
1858
1859 static LLVMValueRef fetch_constant(
1860 struct lp_build_tgsi_context *bld_base,
1861 const struct tgsi_full_src_register *reg,
1862 enum tgsi_opcode_type type,
1863 unsigned swizzle)
1864 {
1865 struct si_shader_context *ctx = si_shader_context(bld_base);
1866 struct lp_build_context *base = &bld_base->base;
1867 const struct tgsi_ind_register *ireg = &reg->Indirect;
1868 unsigned buf, idx;
1869
1870 LLVMValueRef addr, bufp;
1871 LLVMValueRef result;
1872
1873 if (swizzle == LP_CHAN_ALL) {
1874 unsigned chan;
1875 LLVMValueRef values[4];
1876 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1877 values[chan] = fetch_constant(bld_base, reg, type, chan);
1878
1879 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1880 }
1881
1882 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1883 idx = reg->Register.Index * 4 + swizzle;
1884
1885 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1886 if (!tgsi_type_is_64bit(type))
1887 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1888 else {
1889 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1890 ctx->constants[buf][idx],
1891 ctx->constants[buf][idx + 1]);
1892 }
1893 }
1894
1895 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1896 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1897 LLVMValueRef index;
1898 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1899 reg->Dimension.Index,
1900 SI_NUM_CONST_BUFFERS);
1901 bufp = build_indexed_load_const(ctx, ptr, index);
1902 } else
1903 bufp = ctx->const_buffers[buf];
1904
1905 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1906 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1907 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1908 addr = lp_build_add(&bld_base->uint_bld, addr,
1909 lp_build_const_int32(base->gallivm, idx * 4));
1910
1911 result = buffer_load_const(ctx, bufp, addr);
1912
1913 if (!tgsi_type_is_64bit(type))
1914 result = bitcast(bld_base, type, result);
1915 else {
1916 LLVMValueRef addr2, result2;
1917 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1918 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1919 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1920 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1921 lp_build_const_int32(base->gallivm, idx * 4));
1922
1923 result2 = buffer_load_const(ctx, ctx->const_buffers[buf],
1924 addr2);
1925
1926 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1927 result, result2);
1928 }
1929 return result;
1930 }
1931
1932 /* Upper 16 bits must be zero. */
1933 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1934 LLVMValueRef val[2])
1935 {
1936 return LLVMBuildOr(gallivm->builder, val[0],
1937 LLVMBuildShl(gallivm->builder, val[1],
1938 lp_build_const_int32(gallivm, 16),
1939 ""), "");
1940 }
1941
1942 /* Upper 16 bits are ignored and will be dropped. */
1943 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1944 LLVMValueRef val[2])
1945 {
1946 LLVMValueRef v[2] = {
1947 LLVMBuildAnd(gallivm->builder, val[0],
1948 lp_build_const_int32(gallivm, 0xffff), ""),
1949 val[1],
1950 };
1951 return si_llvm_pack_two_int16(gallivm, v);
1952 }
1953
/* Initialize arguments for the shader export intrinsic (llvm.SI.export).
 *
 * The 9 arguments are:
 *   args[0]    component writemask
 *   args[1]    whether the EXEC mask represents the valid mask
 *   args[2]    whether this is the last export
 *   args[3]    export target (MRT index, POS, PARAM, or NULL)
 *   args[4]    COMPR flag (two 16-bit values packed per 32-bit channel)
 *   args[5..8] the four channel values
 *
 * For fragment shaders the values are converted according to the
 * SPI_SHADER_COL_FORMAT of the target color buffer, taken from the
 * PS epilog shader key. Non-fragment stages always use the default
 * 32_ABGR format, so is_int8 is only read on fragment-shader paths.
 */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
				&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef val[4];
	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
	unsigned chan;
	bool is_int8;

	/* Default is 0xf. Adjusted below depending on the format. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	if (ctx->type == PIPE_SHADER_FRAGMENT) {
		/* Each MRT has a 4-bit format field in the packed
		 * spi_shader_col_format word of the epilog key. */
		const union si_shader_key *key = &ctx->shader->key;
		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		assert(cbuf >= 0 && cbuf < 8);
		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
	}

	args[4] = uint->zero; /* COMPR flag */
	args[5] = base->undef;
	args[6] = base->undef;
	args[7] = base->undef;
	args[8] = base->undef;

	switch (spi_shader_col_format) {
	case V_028714_SPI_SHADER_ZERO:
		/* Nothing is consumed: turn this into a null export
		 * with an empty writemask. */
		args[0] = uint->zero; /* writemask */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
		break;

	case V_028714_SPI_SHADER_32_R:
		/* Export only the red channel. */
		args[0] = uint->one; /* writemask */
		args[5] = values[0];
		break;

	case V_028714_SPI_SHADER_32_GR:
		/* Export red and green. */
		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
		args[5] = values[0];
		args[6] = values[1];
		break;

	case V_028714_SPI_SHADER_32_AR:
		/* Export red and alpha. */
		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
		args[5] = values[0];
		args[8] = values[3];
		break;

	case V_028714_SPI_SHADER_FP16_ABGR:
		/* Two half-floats packed into each 32-bit export channel. */
		args[4] = uint->one; /* COMPR flag */

		for (chan = 0; chan < 2; chan++) {
			LLVMValueRef pack_args[2] = {
				values[2 * chan],
				values[2 * chan + 1]
			};
			LLVMValueRef packed;

			packed = lp_build_intrinsic(base->gallivm->builder,
						    "llvm.SI.packf16",
						    ctx->i32, pack_args, 2,
						    LLVMReadNoneAttribute);
			args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 packed, ctx->f32, "");
		}
		break;

	case V_028714_SPI_SHADER_UNORM16_ABGR:
		/* Clamp to [0,1], scale to 16-bit unorm and round to
		 * nearest by adding 0.5 before truncation. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 65535), "");
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  lp_build_const_float(gallivm, 0.5), "");
			val[chan] = LLVMBuildFPToUI(builder, val[chan],
						    ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_SNORM16_ABGR:
		for (chan = 0; chan < 4; chan++) {
			/* Clamp between [-1, 1]. */
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
							      values[chan],
							      lp_build_const_float(gallivm, 1));
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
							      val[chan],
							      lp_build_const_float(gallivm, -1));
			/* Convert to a signed integer in [-32767, 32767]. */
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 32767), "");
			/* If positive, add 0.5, else add -0.5. */
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  LLVMBuildSelect(builder,
								  LLVMBuildFCmp(builder, LLVMRealOGE,
										val[chan], base->zero, ""),
								  lp_build_const_float(gallivm, 0.5),
								  lp_build_const_float(gallivm, -0.5), ""), "");
			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_UINT16_ABGR: {
		/* Upper bound depends on whether the color buffer is
		 * 8-bit or 16-bit integer. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							255 : 65535);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
							      val[chan], max);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_SINT16_ABGR: {
		/* Signed bounds: [-128,127] for int8, [-32768,32767]
		 * for int16 color buffers. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							127 : 32767);
		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
							-128 : -32768);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMIN,
							      val[chan], max);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMAX,
							      val[chan], min);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_32_ABGR:
		/* Plain 32-bit export of all four channels. */
		memcpy(&args[5], values, sizeof(values[0]) * 4);
		break;
	}
}
2136
2137 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2138 LLVMValueRef alpha)
2139 {
2140 struct si_shader_context *ctx = si_shader_context(bld_base);
2141 struct gallivm_state *gallivm = bld_base->base.gallivm;
2142
2143 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2144 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2145 SI_PARAM_ALPHA_REF);
2146
2147 LLVMValueRef alpha_pass =
2148 lp_build_cmp(&bld_base->base,
2149 ctx->shader->key.ps.epilog.alpha_func,
2150 alpha, alpha_ref);
2151 LLVMValueRef arg =
2152 lp_build_select(&bld_base->base,
2153 alpha_pass,
2154 lp_build_const_float(gallivm, 1.0f),
2155 lp_build_const_float(gallivm, -1.0f));
2156
2157 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2158 ctx->voidt, &arg, 1, 0);
2159 } else {
2160 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2161 ctx->voidt, NULL, 0, 0);
2162 }
2163 }
2164
2165 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2166 LLVMValueRef alpha,
2167 unsigned samplemask_param)
2168 {
2169 struct si_shader_context *ctx = si_shader_context(bld_base);
2170 struct gallivm_state *gallivm = bld_base->base.gallivm;
2171 LLVMValueRef coverage;
2172
2173 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2174 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2175 samplemask_param);
2176 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2177
2178 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2179 ctx->i32,
2180 &coverage, 1, LLVMReadNoneAttribute);
2181
2182 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2183 ctx->f32, "");
2184
2185 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2186 lp_build_const_float(gallivm,
2187 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2188
2189 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2190 }
2191
/* Turn a CLIPVERTEX output into user clip distances and fill in the two
 * clip-distance position exports (pos[2] and pos[3]).
 *
 * The clip plane coefficients are loaded from the SI_VS_CONST_CLIP_PLANES
 * internal constant buffer; each of the 8 clip distances is the dot
 * product of the clip vertex with one plane.
 *
 * \param pos       export argument arrays; pos[2] and pos[3] are written
 * \param out_elts  the 4 components of the CLIPVERTEX output
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	/* Each export register holds 4 clip distances; 2 registers
	 * cover all 8 user clip planes. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Accumulators for the 4 distances of this register. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* args[1] is reused here as a scratch byte
				 * offset: plane (reg_index*4 + chan),
				 * component const_chan, 4 bytes per float.
				 * It is overwritten with its real meaning
				 * after the loop. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(ctx, const_resource,
							     args[1]);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Standard export arguments: full writemask, POS target. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
2238
2239 static void si_dump_streamout(struct pipe_stream_output_info *so)
2240 {
2241 unsigned i;
2242
2243 if (so->num_outputs)
2244 fprintf(stderr, "STREAMOUT\n");
2245
2246 for (i = 0; i < so->num_outputs; i++) {
2247 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2248 so->output[i].start_component;
2249 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2250 i, so->output[i].output_buffer,
2251 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2252 so->output[i].register_index,
2253 mask & 1 ? "x" : "",
2254 mask & 2 ? "y" : "",
2255 mask & 4 ? "z" : "",
2256 mask & 8 ? "w" : "");
2257 }
2258 }
2259
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Emits guarded tbuffer stores of the shader outputs listed in the
 * pipe_stream_output_info of the current selector. Stores are masked
 * twice: once by the per-thread vertex budget (so_vtx_count from an
 * SGPR) and once by the stream each output belongs to.
 *
 * \param outputs  array of computed output values, indexed by register
 * \param noutput  number of valid entries in \p outputs
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* The stream being captured, from bits [25:24] of the config SGPR. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			/* The offset SGPR counts dwords; scale to bytes. */
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			/* Skip outputs the shader did not actually write. */
			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Store only when this output's stream matches
			 * the stream currently being captured. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2376
2377
/* Generate export instructions for hardware VS shader stage.
 *
 * Dispatches every shader output to its export target:
 *  - POSITION / CLIPDIST / CLIPVERTEX go to the POS targets,
 *  - PSIZE / EDGEFLAG / LAYER / VIEWPORT_INDEX are collected into the
 *    "misc" position vector (pos[1]),
 *  - everything else becomes a PARAM export consumed by the PS.
 * Position exports are buffered in pos_args[] so the last one can be
 * flagged "done", which the hardware requires.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
				&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	/* Write transform feedback data before the exports. */
	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Deferred into the misc vector (pos[1]) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Goes into the misc vector AND is re-dispatched
			 * as a generic parameter for the PS. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Expanded into clip distances in pos[2..3]. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		/* Buffer position exports; emit everything else now. */
		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			/* Also export clip distances as a generic param
			 * so the PS can read them. */
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one; /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	/* Count the position exports before emitting, so the last one
	 * can be tagged below. */
	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2552
/* Copy the TCS inputs named by the inputs_to_copy bitmask in the TCS
 * epilog shader key from LDS to the off-chip tessellation buffer, where
 * the TES will read them.
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	/* Invocation ID from bits [12:8] of the REL_IDS SGPR. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	/* Descriptor of the off-chip tessellation ring buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);

	/* LDS dword address of this invocation's inputs within the
	 * current patch. */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Copy each flagged input: load a 4-dword vector from LDS and
	 * store it to the buffer at the matching TES address. */
	inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                            lp_build_const_int32(gallivm, 4 * i),
		                             "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
		                                      invocation_id,
		                                      lp_build_const_int32(gallivm, i));

		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
					   buffer_offset, 0);
	}
}
2594
/* Write the tessellation factors to the tess-factor ring buffer.
 *
 * Reads TESSINNER/TESSOUTER from LDS (any invocation may have written
 * them, hence the barrier), packs them into one or two vectors based on
 * the primitive mode, and stores them at a per-patch offset. Only
 * invocation 0 performs the stores; patch 0 additionally writes the
 * dynamic HS control word at offset 0.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all LDS writes by other invocations are visible. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer factors come first in the buffer element. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* Only the 6-dword quad layout needs a second vector. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only the first patch writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2700
/* This only writes the tessellation factor levels.
 *
 * In the monolithic case it copies the TCS inputs and writes the tess
 * factors directly. In the non-monolithic case it instead packs the
 * values a separately-compiled epilog needs (RW_BUFFERS pointer, tess
 * factor soffset, and the three VGPRs) into the function return value.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer, split into two i32 SGPR halves
		 * so it can travel through the return value. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		/* VGPR returns follow the SGPR slots. */
		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2752
/* LS epilogue: store all vertex shader outputs to LDS at this vertex's
 * slot, laid out by unique I/O index, 4 dwords per output.
 */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride in dwords, from bits [20:13] of the
	 * LS_OUT_LAYOUT SGPR. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];
		/* Slot shared with the TCS input layout. */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					lp_build_const_int32(gallivm, param * 4), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}
}
2783
/* ES epilogue: store all outputs to the ES->GS ring buffer, one dword
 * per component, addressed by unique I/O index. VIEWPORT_INDEX and
 * LAYER are skipped because the GS stage handles them itself.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* One dword per component at
			 * (4 * param_index + chan) * 4 bytes. */
			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2822
2823 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2824 {
2825 struct si_shader_context *ctx = si_shader_context(bld_base);
2826 struct gallivm_state *gallivm = bld_base->base.gallivm;
2827 LLVMValueRef args[2];
2828
2829 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2830 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2831 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2832 ctx->voidt, args, 2, 0);
2833 }
2834
/* Epilogue for a VS/TES that feeds primitive assembly directly: read back
 * all outputs, optionally clamp vertex colors, handle PrimitiveID, and
 * emit the position/parameter exports via si_llvm_export_vs.
 */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->is_gs_copy_shader);

	/* +1 leaves room for the PrimitiveID output possibly appended below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR.
				 * Open the IF lazily so shaders without colors
				 * don't pay for it. */
				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
						    SI_PARAM_VS_STATE_BITS);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Saturate all 4 components in place. */
			for (j = 0; j < 4; j++) {
				addr = ctx->radeon_bld.soa.outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = radeon_llvm_saturate(bld_base, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Gather all declared outputs for export. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	if (ctx->is_monolithic) {
		/* Export PrimitiveID when PS needs it. */
		if (si_vs_exports_prim_id(ctx->shader)) {
			outputs[i].name = TGSI_SEMANTIC_PRIMID;
			outputs[i].sid = 0;
			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
						       get_primitive_id(bld_base, 0));
			outputs[i].values[1] = bld_base->base.undef;
			outputs[i].values[2] = bld_base->base.undef;
			outputs[i].values[3] = bld_base->base.undef;
			i++;
		}
	} else {
		/* Return the primitive ID from the LLVM function, so a
		 * separately compiled epilog part can export it. */
		ctx->return_value =
			LLVMBuildInsertValue(gallivm->builder,
					     ctx->return_value,
					     bitcast(bld_base, TGSI_TYPE_FLOAT,
						     get_primitive_id(bld_base, 0)),
					     VS_EPILOG_PRIMID_LOC, "");
	}

	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
2921
/* Deferred "llvm.SI.export" calls for a pixel shader: argument lists are
 * collected here and emitted together by si_emit_ps_exports.  Each entry
 * holds the 9 export arguments; capacity is 10 entries (presumably
 * 8 MRTs + MRTZ plus slack — TODO confirm). */
struct si_ps_exports {
	unsigned num;
	LLVMValueRef args[10][9];
};
2926
/* Build the MRTZ export (depth / stencil / sample mask) and append its
 * argument list to \p exp.  Nothing is emitted here; see
 * si_emit_ps_exports.  At least one of the three values must be non-NULL.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	if (depth) {
		args[5] = depth;
		mask |= 0x1;
	}

	if (stencil) {
		args[6] = stencil;
		mask |= 0x2;
	}

	if (samplemask) {
		args[7] = samplemask;
		mask |= 0x4;
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	memcpy(exp->args[exp->num++], args, sizeof(args));
}
2977
/* Apply the per-color epilog fixups (clamp, alpha-to-one, alpha test,
 * smoothing) to \p color and append its MRT export(s) to \p exp.
 *
 * \param color           the 4 color channels (may be modified in place)
 * \param index           color buffer index (semantic index)
 * \param samplemask_param function parameter index of the sample coverage
 * \param is_last         whether this is the final color export (gets the
 *                        DONE/valid-EXEC bits)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only performed against color 0's alpha. */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		memcpy(exp->args[exp->num++], args, sizeof(args));
	}
}
3044
3045 static void si_emit_ps_exports(struct si_shader_context *ctx,
3046 struct si_ps_exports *exp)
3047 {
3048 for (unsigned i = 0; i < exp->num; i++)
3049 lp_build_intrinsic(ctx->radeon_bld.gallivm.builder,
3050 "llvm.SI.export", ctx->voidt,
3051 exp->args[i], 9, 0);
3052 }
3053
/* Emit a NULL export: a pixel shader must perform at least one export,
 * so this is used when no color/depth output exists (all channels
 * disabled, target NULL, DONE bit set).
 */
static void si_export_null(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];

	args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
	args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
	args[5] = uint->undef; /* R */
	args[6] = uint->undef; /* G */
	args[7] = uint->undef; /* B */
	args[8] = uint->undef; /* A */

	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
			   ctx->voidt, args, 9, 0);
}
3074
/* Monolithic pixel shader epilogue: collect all PS outputs and emit the
 * color (MRT) and MRTZ exports directly, including the NULL export when
 * nothing is written.
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;
	struct si_ps_exports exp = {};

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* 4 format bits per color buffer. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of gl_FragDepth is used. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i, &exp);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ goes last (after all color exports). */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);

	si_emit_ps_exports(ctx, &exp);
}
3166
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 *
 * Used for non-monolithic shaders: a separately compiled epilog part
 * receives these return values and performs the actual exports.
 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of gl_FragDepth is used. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs: written colors are packed densely, so the epilog must
	 * agree on which colors are present (it knows from the shader key). */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3260
/**
 * Given a v8i32 resource descriptor for a buffer, extract the size of the
 * buffer in number of elements and return it as an i32.
 *
 * Dword 6 of the descriptor holds NUM_RECORDS; on VI that field is in
 * bytes, so it is divided by the stride (dword 5, bits 16..29) to get
 * elements.
 */
static LLVMValueRef get_buffer_size(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef descriptor)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef size =
		LLVMBuildExtractElement(builder, descriptor,
					lp_build_const_int32(gallivm, 6), "");

	if (ctx->screen->b.chip_class >= VI) {
		/* On VI, the descriptor contains the size in bytes,
		 * but TXQ must return the size in elements.
		 * The stride is always non-zero for resources using TXQ.
		 */
		LLVMValueRef stride =
			LLVMBuildExtractElement(builder, descriptor,
						lp_build_const_int32(gallivm, 5), "");
		stride = LLVMBuildLShr(builder, stride,
				       lp_build_const_int32(gallivm, 16), "");
		stride = LLVMBuildAnd(builder, stride,
				      lp_build_const_int32(gallivm, 0x3FFF), "");

		size = LLVMBuildUDiv(builder, size, stride, "");
	}

	return size;
}
3294
3295 /**
3296 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3297 * intrinsic names).
3298 */
3299 static void build_int_type_name(
3300 LLVMTypeRef type,
3301 char *buf, unsigned bufsize)
3302 {
3303 assert(bufsize >= 6);
3304
3305 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
3306 snprintf(buf, bufsize, "v%ui32",
3307 LLVMGetVectorSize(type));
3308 else
3309 strcpy(buf, "i32");
3310 }
3311
3312 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3313 struct lp_build_tgsi_context *bld_base,
3314 struct lp_build_emit_data *emit_data);
3315
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 */
static void emit_optimization_barrier(struct si_shader_context *ctx)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	/* void asm sideeffect "", ""() */
	LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
	LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
	LLVMBuildCall(builder, inlineasm, NULL, 0, "");
}
3327
/* Emit an s_waitcnt instruction via llvm.amdgcn.s.waitcnt.
 *
 * The immediate 0xf70 selects which hardware counters to wait on;
 * NOTE(review): this appears to wait only for outstanding vector memory
 * operations (vmcnt) — confirm the vmcnt/expcnt/lgkmcnt bitfield encoding
 * against the GCN ISA documentation.
 */
static void emit_waitcnt(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef args[1] = {
		lp_build_const_int32(gallivm, 0xf70)
	};
	lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
			   ctx->voidt, args, 1, 0);
}
3338
/* TGSI MEMBAR opcode: implemented as a memory wait (s_waitcnt). */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3348
/* Return the v4i32 resource descriptor for a shader storage buffer
 * register, either from the preloaded array (direct index) or by loading
 * it from the descriptor array with a bounds-clamped index (indirect).
 */
static LLVMValueRef
shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
			 const struct tgsi_full_src_register *reg)
{
	LLVMValueRef ind_index;
	LLVMValueRef rsrc_ptr;

	if (!reg->Register.Indirect)
		return ctx->shader_buffers[reg->Register.Index];

	/* Clamp the index to avoid out-of-bounds descriptor loads. */
	ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
					       reg->Register.Index,
					       SI_NUM_SHADER_BUFFERS);

	rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
	return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
}
3366
3367 static bool tgsi_is_array_sampler(unsigned target)
3368 {
3369 return target == TGSI_TEXTURE_1D_ARRAY ||
3370 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3371 target == TGSI_TEXTURE_2D_ARRAY ||
3372 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3373 target == TGSI_TEXTURE_CUBE_ARRAY ||
3374 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3375 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3376 }
3377
3378 static bool tgsi_is_array_image(unsigned target)
3379 {
3380 return target == TGSI_TEXTURE_3D ||
3381 target == TGSI_TEXTURE_CUBE ||
3382 target == TGSI_TEXTURE_1D_ARRAY ||
3383 target == TGSI_TEXTURE_2D_ARRAY ||
3384 target == TGSI_TEXTURE_CUBE_ARRAY ||
3385 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3386 }
3387
/**
 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
 *
 * At least on Tonga, executing image stores on images with DCC enabled and
 * non-trivial can eventually lead to lockups. This can occur when an
 * application binds an image as read-only but then uses a shader that writes
 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
 * program termination) in this case, but it doesn't cost much to be a bit
 * nicer: disabling DCC in the shader still leads to undefined results but
 * avoids the lockup.
 */
static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
				  LLVMValueRef rsrc)
{
	if (ctx->screen->b.chip_class <= CIK) {
		/* No DCC before VI; nothing to clear. */
		return rsrc;
	} else {
		LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
		LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
		LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
		LLVMValueRef tmp;

		/* Clear COMPRESSION_EN in dword 6 of the descriptor. */
		tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
		tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
		return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
	}
}
3415
/**
 * Load the resource descriptor for \p image.
 *
 * \param dcc_off if true, clear the descriptor's DCC enable bit
 *                (required for image stores, see force_dcc_off)
 * \param rsrc    receives the descriptor value
 */
static void
image_fetch_rsrc(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *image,
	bool dcc_off,
	LLVMValueRef *rsrc)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	assert(image->Register.File == TGSI_FILE_IMAGE);

	if (!image->Register.Indirect) {
		/* Fast path: use preloaded resources */
		*rsrc = ctx->images[image->Register.Index];
	} else {
		/* Indexing and manual load */
		LLVMValueRef ind_index;
		LLVMValueRef rsrc_ptr;
		LLVMValueRef tmp;

		/* From the GL_ARB_shader_image_load_store extension spec:
		 *
		 *    If a shader performs an image load, store, or atomic
		 *    operation using an image variable declared as an array,
		 *    and if the index used to select an individual element is
		 *    negative or greater than or equal to the size of the
		 *    array, the results of the operation are undefined but may
		 *    not lead to termination.
		 */
		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
						       image->Register.Index,
						       SI_NUM_IMAGES);

		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
		if (dcc_off)
			tmp = force_dcc_off(ctx, tmp);
		*rsrc = tmp;
	}
}
3459
/* Fetch the coordinate operand of an image instruction and return it as
 * an i32 scalar (1 coordinate) or an i32 vector.  3-component coordinates
 * are padded to 4 with undef.
 *
 * \param src index of the source operand holding the coordinates
 */
static LLVMValueRef image_fetch_coords(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_instruction *inst,
		unsigned src)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned target = inst->Memory.Texture;
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	LLVMValueRef coords[4];
	LLVMValueRef tmp;
	int chan;

	for (chan = 0; chan < num_coords; ++chan) {
		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
		coords[chan] = tmp;
	}

	if (num_coords == 1)
		return coords[0];

	if (num_coords == 3) {
		/* LLVM has difficulties lowering 3-element vectors. */
		coords[3] = bld_base->uint_bld.undef;
		num_coords = 4;
	}

	return lp_build_gather_values(gallivm, coords, num_coords);
}
3490
/**
 * Append the extra mode bits that are used by image load and store.
 *
 * Appends r128, da, glc (loads/stores only) and slc flags in the order
 * the llvm.amdgcn.image.* intrinsics expect them.
 */
static void image_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data * emit_data,
		unsigned target,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
	emit_data->args[emit_data->arg_count++] =
		tgsi_is_array_image(target) ? i1true : i1false; /* da */
	if (!atomic) {
		/* Coherent/volatile accesses bypass the cache via glc. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3514
/**
 * Given a 256 bit resource, extract the top half (which stores the buffer
 * resource in the case of textures and images).
 *
 * Implemented as a bitcast to <2 x i128>, extracting element 1, and
 * bitcasting back to v4i32.
 */
static LLVMValueRef extract_rsrc_top_half(
		struct si_shader_context *ctx,
		LLVMValueRef rsrc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);

	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
	rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");

	return rsrc;
}
3533
/**
 * Append the resource and indexing arguments for buffer intrinsics.
 *
 * \param rsrc the v4i32 buffer resource
 * \param index index into the buffer (stride-based)
 * \param offset byte offset into the buffer
 * \param atomic if true, omit the glc flag (atomics don't take it)
 */
static void buffer_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		LLVMValueRef rsrc,
		LLVMValueRef index,
		LLVMValueRef offset,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = rsrc;
	emit_data->args[emit_data->arg_count++] = index; /* vindex */
	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
	if (!atomic) {
		/* Coherent/volatile accesses bypass the cache via glc. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3563
/* Gather the intrinsic arguments for a TGSI LOAD instruction, for all
 * three source kinds: SSBO (buffer), buffer image, and regular image.
 * The actual intrinsic is emitted by load_emit.
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Source operand 1 holds the byte offset. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use the buffer intrinsic path with
			 * the coordinate as the element index. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3608
/* Emit an SSBO load using llvm.amdgcn.buffer.load, sized by the
 * destination writemask (1, 2 or 4 dwords; 3 is rounded up to 4).
 */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: /* 3 & 4: no v3f32 variant, load a full vec4 */
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute);
}
3640
/* Compute a pointer into LDS (shared memory) for a TGSI MEMORY access:
 * fetch the dword offset from source operand \p arg, GEP into the shared
 * memory array, and cast the result to a pointer to \p type in the same
 * address space.
 */
static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
				   const struct tgsi_full_instruction *inst,
				   LLVMTypeRef type, int arg)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef offset, ptr;
	int addr_space;

	offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
	offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");

	ptr = ctx->shared_memory;
	ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
	addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
	ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");

	return ptr;
}
3660
/* Emit a load from LDS (TGSI_FILE_MEMORY): load each channel selected by
 * the writemask from shared memory and gather them into a vec4 (unwritten
 * channels are undef).
 */
static void load_emit_memory(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned writemask = inst->Dst[0].Register.WriteMask;
	LLVMValueRef channels[4], ptr, derived_ptr, index;
	int chan;

	ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);

	for (chan = 0; chan < 4; ++chan) {
		if (!(writemask & (1 << chan))) {
			channels[chan] = LLVMGetUndef(base->elem_type);
			continue;
		}

		index = lp_build_const_int32(gallivm, chan);
		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
		channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
	}
	emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
}
3687
/* Emit a TGSI LOAD instruction, dispatching on the source file:
 * shared memory, SSBO, buffer image, or regular image (with arguments
 * prepared by load_fetch_args).
 */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile loads must see prior memory operations complete. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	} else {
		/* The image.load intrinsic name is suffixed with the
		 * coordinate vector type, e.g. "v4i32". */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	}
}
3733
/* Gather the intrinsic arguments for a TGSI STORE instruction.  The data
 * to store always comes first; the destination (SSBO, buffer image, or
 * regular image) determines the remaining arguments.  The intrinsic
 * itself is emitted by the store_emit* functions.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* The value to store is source operand 1, gathered into a vec4. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register, viewed as a source for rsrc fetching. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Source operand 0 holds the byte offset. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[1] = coords;
			/* dcc_off=true: stores to DCC-compressed images can
			 * hang the GPU (see force_dcc_off). */
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3792
/**
 * Emit the buffer.store intrinsics for a STORE to a shader buffer.
 *
 * The destination writemask is scanned for runs of consecutive enabled
 * channels and one store intrinsic (f32 / v2f32 / v4f32) is emitted per
 * run. emit_data->args[0] (data) and args[3] (offset) are rewritten for
 * each emitted store; the remaining arguments are reused unchanged.
 */
static void store_emit_buffer(
	struct si_shader_context *ctx,
	struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Put the third channel back so the next loop
			 * iteration emits it as a 1-element store. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Extract the two channels of the run into a v2f32. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first channel
		 * (4 bytes per channel). */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count, 0);
	}
}
3864
3865 static void store_emit_memory(
3866 struct si_shader_context *ctx,
3867 struct lp_build_emit_data *emit_data)
3868 {
3869 const struct tgsi_full_instruction *inst = emit_data->inst;
3870 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3871 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3872 LLVMBuilderRef builder = gallivm->builder;
3873 unsigned writemask = inst->Dst[0].Register.WriteMask;
3874 LLVMValueRef ptr, derived_ptr, data, index;
3875 int chan;
3876
3877 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3878
3879 for (chan = 0; chan < 4; ++chan) {
3880 if (!(writemask & (1 << chan))) {
3881 continue;
3882 }
3883 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3884 index = lp_build_const_int32(gallivm, chan);
3885 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3886 LLVMBuildStore(builder, data, derived_ptr);
3887 }
3888 }
3889
3890 static void store_emit(
3891 const struct lp_build_tgsi_action *action,
3892 struct lp_build_tgsi_context *bld_base,
3893 struct lp_build_emit_data *emit_data)
3894 {
3895 struct si_shader_context *ctx = si_shader_context(bld_base);
3896 struct gallivm_state *gallivm = bld_base->base.gallivm;
3897 LLVMBuilderRef builder = gallivm->builder;
3898 const struct tgsi_full_instruction * inst = emit_data->inst;
3899 unsigned target = inst->Memory.Texture;
3900 char intrinsic_name[32];
3901 char coords_type[8];
3902
3903 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3904 store_emit_memory(ctx, emit_data);
3905 return;
3906 }
3907
3908 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3909 emit_waitcnt(ctx);
3910
3911 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3912 store_emit_buffer(ctx, emit_data);
3913 return;
3914 }
3915
3916 if (target == TGSI_TEXTURE_BUFFER) {
3917 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3918 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3919 emit_data->dst_type, emit_data->args,
3920 emit_data->arg_count, 0);
3921 } else {
3922 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3923 coords_type, sizeof(coords_type));
3924 snprintf(intrinsic_name, sizeof(intrinsic_name),
3925 "llvm.amdgcn.image.store.%s", coords_type);
3926
3927 emit_data->output[emit_data->chan] =
3928 lp_build_intrinsic(
3929 builder, intrinsic_name, emit_data->dst_type,
3930 emit_data->args, emit_data->arg_count, 0);
3931 }
3932 }
3933
/**
 * Fetch intrinsic arguments for the TGSI atomic opcodes on buffers and
 * images.
 *
 * The data operands go first — for ATOMCAS the src3 operand precedes
 * the src2 operand (reversed relative to TGSI, see below) — followed by
 * the resource/offset or resource/coordinate arguments appended by
 * buffer_append_args or image_append_args.
 */
static void atomic_fetch_args(
	struct lp_build_tgsi_context * bld_base,
	struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	/* src2: the data operand, bitcast to i32 for the intrinsic. */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		/* src3: the second ATOMCAS operand. */
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* src1.x: byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* src1: image coordinates. */
		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images take the buffer.atomic path using the
			 * top (buffer) half of the descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3993
3994 static void atomic_emit_memory(struct si_shader_context *ctx,
3995 struct lp_build_emit_data *emit_data) {
3996 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3997 LLVMBuilderRef builder = gallivm->builder;
3998 const struct tgsi_full_instruction * inst = emit_data->inst;
3999 LLVMValueRef ptr, result, arg;
4000
4001 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4002
4003 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
4004 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4005
4006 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4007 LLVMValueRef new_data;
4008 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4009 inst, 3, 0);
4010
4011 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4012
4013 #if HAVE_LLVM >= 0x309
4014 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4015 LLVMAtomicOrderingSequentiallyConsistent,
4016 LLVMAtomicOrderingSequentiallyConsistent,
4017 false);
4018 #endif
4019
4020 result = LLVMBuildExtractValue(builder, result, 0, "");
4021 } else {
4022 LLVMAtomicRMWBinOp op;
4023
4024 switch(inst->Instruction.Opcode) {
4025 case TGSI_OPCODE_ATOMUADD:
4026 op = LLVMAtomicRMWBinOpAdd;
4027 break;
4028 case TGSI_OPCODE_ATOMXCHG:
4029 op = LLVMAtomicRMWBinOpXchg;
4030 break;
4031 case TGSI_OPCODE_ATOMAND:
4032 op = LLVMAtomicRMWBinOpAnd;
4033 break;
4034 case TGSI_OPCODE_ATOMOR:
4035 op = LLVMAtomicRMWBinOpOr;
4036 break;
4037 case TGSI_OPCODE_ATOMXOR:
4038 op = LLVMAtomicRMWBinOpXor;
4039 break;
4040 case TGSI_OPCODE_ATOMUMIN:
4041 op = LLVMAtomicRMWBinOpUMin;
4042 break;
4043 case TGSI_OPCODE_ATOMUMAX:
4044 op = LLVMAtomicRMWBinOpUMax;
4045 break;
4046 case TGSI_OPCODE_ATOMIMIN:
4047 op = LLVMAtomicRMWBinOpMin;
4048 break;
4049 case TGSI_OPCODE_ATOMIMAX:
4050 op = LLVMAtomicRMWBinOpMax;
4051 break;
4052 default:
4053 unreachable("unknown atomic opcode");
4054 }
4055
4056 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4057 LLVMAtomicOrderingSequentiallyConsistent,
4058 false);
4059 }
4060 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4061 }
4062
4063 static void atomic_emit(
4064 const struct lp_build_tgsi_action *action,
4065 struct lp_build_tgsi_context *bld_base,
4066 struct lp_build_emit_data *emit_data)
4067 {
4068 struct si_shader_context *ctx = si_shader_context(bld_base);
4069 struct gallivm_state *gallivm = bld_base->base.gallivm;
4070 LLVMBuilderRef builder = gallivm->builder;
4071 const struct tgsi_full_instruction * inst = emit_data->inst;
4072 char intrinsic_name[40];
4073 LLVMValueRef tmp;
4074
4075 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4076 atomic_emit_memory(ctx, emit_data);
4077 return;
4078 }
4079
4080 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4081 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4082 snprintf(intrinsic_name, sizeof(intrinsic_name),
4083 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4084 } else {
4085 char coords_type[8];
4086
4087 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4088 coords_type, sizeof(coords_type));
4089 snprintf(intrinsic_name, sizeof(intrinsic_name),
4090 "llvm.amdgcn.image.atomic.%s.%s",
4091 action->intr_name, coords_type);
4092 }
4093
4094 tmp = lp_build_intrinsic(
4095 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4096 emit_data->args, emit_data->arg_count, 0);
4097 emit_data->output[emit_data->chan] =
4098 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4099 }
4100
/**
 * Fetch arguments for RESQ (resource size query).
 *
 * Shader buffers and buffer images only need the resource descriptor;
 * other images build a full getresinfo-style argument list querying
 * mip level 0.
 */
static void resq_fetch_args(
	struct lp_build_tgsi_context * bld_base,
	struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4133
4134 static void resq_emit(
4135 const struct lp_build_tgsi_action *action,
4136 struct lp_build_tgsi_context *bld_base,
4137 struct lp_build_emit_data *emit_data)
4138 {
4139 struct gallivm_state *gallivm = bld_base->base.gallivm;
4140 LLVMBuilderRef builder = gallivm->builder;
4141 const struct tgsi_full_instruction *inst = emit_data->inst;
4142 LLVMValueRef out;
4143
4144 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4145 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4146 lp_build_const_int32(gallivm, 2), "");
4147 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4148 out = get_buffer_size(bld_base, emit_data->args[0]);
4149 } else {
4150 out = lp_build_intrinsic(
4151 builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
4152 emit_data->args, emit_data->arg_count,
4153 LLVMReadNoneAttribute);
4154
4155 /* Divide the number of layers by 6 to get the number of cubes. */
4156 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
4157 LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
4158 LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
4159
4160 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4161 z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
4162 z = LLVMBuildSDiv(builder, z, imm6, "");
4163 z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
4164 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4165 }
4166 }
4167
4168 emit_data->output[emit_data->chan] = out;
4169 }
4170
/**
 * Fill emit_data with the common argument list used by the texture /
 * getresinfo-style intrinsics: coordinates, resource, (sampler), dmask
 * and the unorm/r128/da/glc/slc/tfe/lwe flags, in that order.
 *
 * \param param  coordinate components; padded with undef up to the next
 *               power of two before being packed into a vector
 * \param count  number of valid entries in \p param
 * \param dmask  bitmask of result components to enable
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF/TXQ return raw integer data and take no sampler state. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
					  tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4216
/* Forward declaration; the action struct is defined later in this file. */
static const struct lp_build_tgsi_action tex_action;

/* Which descriptor to load from a sampler descriptor list
 * (see get_sampler_desc_custom for the slot layout). */
enum desc_type {
	DESC_IMAGE,
	DESC_FMASK,
	DESC_SAMPLER
};
4224
4225 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4226 {
4227 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4228 CONST_ADDR_SPACE);
4229 }
4230
4231 /**
4232 * Load an image view, fmask view. or sampler state descriptor.
4233 */
4234 static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
4235 LLVMValueRef list, LLVMValueRef index,
4236 enum desc_type type)
4237 {
4238 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4239 LLVMBuilderRef builder = gallivm->builder;
4240
4241 switch (type) {
4242 case DESC_IMAGE:
4243 /* The image is at [0:7]. */
4244 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4245 break;
4246 case DESC_FMASK:
4247 /* The FMASK is at [8:15]. */
4248 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4249 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4250 break;
4251 case DESC_SAMPLER:
4252 /* The sampler state is at [12:15]. */
4253 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4254 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4255 list = LLVMBuildPointerCast(builder, list,
4256 const_array(ctx->v4i32, 0), "");
4257 break;
4258 }
4259
4260 return build_indexed_load_const(ctx, list, index);
4261 }
4262
4263 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4264 LLVMValueRef index, enum desc_type type)
4265 {
4266 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4267 SI_PARAM_SAMPLERS);
4268
4269 return get_sampler_desc_custom(ctx, list, index, type);
4270 }
4271
4272 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4273 *
4274 * SI-CI:
4275 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4276 * filtering manually. The driver sets img7 to a mask clearing
4277 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4278 * s_and_b32 samp0, samp0, img7
4279 *
4280 * VI:
4281 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4282 */
4283 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4284 LLVMValueRef res, LLVMValueRef samp)
4285 {
4286 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4287 LLVMValueRef img7, samp0;
4288
4289 if (ctx->screen->b.chip_class >= VI)
4290 return samp;
4291
4292 img7 = LLVMBuildExtractElement(builder, res,
4293 LLVMConstInt(ctx->i32, 7, 0), "");
4294 samp0 = LLVMBuildExtractElement(builder, samp,
4295 LLVMConstInt(ctx->i32, 0, 0), "");
4296 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4297 return LLVMBuildInsertElement(builder, samp, samp0,
4298 LLVMConstInt(ctx->i32, 0, 0), "");
4299 }
4300
/**
 * Resolve the resource, sampler-state, and FMASK descriptors for a
 * texture instruction.
 *
 * The sampler operand is the instruction's last source register.  With
 * indirect indexing the descriptors are loaded from the descriptor
 * list; otherwise the values preloaded into the context are used.
 *
 * \param samp_ptr  may be NULL if the caller does not need it; set to
 *                  NULL for MSAA targets (indirect case)
 * \param fmask_ptr may be NULL if the caller does not need it; set to
 *                  NULL for non-MSAA targets (indirect case)
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Clamp the computed index to the declared sampler range. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			/* MSAA targets use an FMASK and no sampler state. */
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		/* Direct indexing: use the preloaded descriptors. */
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4348
4349 static void txq_fetch_args(
4350 struct lp_build_tgsi_context *bld_base,
4351 struct lp_build_emit_data *emit_data)
4352 {
4353 struct si_shader_context *ctx = si_shader_context(bld_base);
4354 struct gallivm_state *gallivm = bld_base->base.gallivm;
4355 LLVMBuilderRef builder = gallivm->builder;
4356 const struct tgsi_full_instruction *inst = emit_data->inst;
4357 unsigned target = inst->Texture.Texture;
4358 LLVMValueRef res_ptr;
4359 LLVMValueRef address;
4360
4361 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4362
4363 if (target == TGSI_TEXTURE_BUFFER) {
4364 /* Read the size from the buffer descriptor directly. */
4365 LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4366 emit_data->args[0] = get_buffer_size(bld_base, res);
4367 return;
4368 }
4369
4370 /* Textures - set the mip level. */
4371 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4372
4373 set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
4374 NULL, &address, 1, 0xf);
4375 }
4376
4377 static void txq_emit(const struct lp_build_tgsi_action *action,
4378 struct lp_build_tgsi_context *bld_base,
4379 struct lp_build_emit_data *emit_data)
4380 {
4381 struct lp_build_context *base = &bld_base->base;
4382 unsigned target = emit_data->inst->Texture.Texture;
4383
4384 if (target == TGSI_TEXTURE_BUFFER) {
4385 /* Just return the buffer size. */
4386 emit_data->output[emit_data->chan] = emit_data->args[0];
4387 return;
4388 }
4389
4390 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4391 base->gallivm->builder, "llvm.SI.getresinfo.i32",
4392 emit_data->dst_type, emit_data->args, emit_data->arg_count,
4393 LLVMReadNoneAttribute);
4394
4395 /* Divide the number of layers by 6 to get the number of cubes. */
4396 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4397 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4398 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4399 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4400 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4401
4402 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4403 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4404 z = LLVMBuildSDiv(builder, z, six, "");
4405
4406 emit_data->output[emit_data->chan] =
4407 LLVMBuildInsertElement(builder, v4, z, two, "");
4408 }
4409 }
4410
4411 static void tex_fetch_args(
4412 struct lp_build_tgsi_context *bld_base,
4413 struct lp_build_emit_data *emit_data)
4414 {
4415 struct si_shader_context *ctx = si_shader_context(bld_base);
4416 struct gallivm_state *gallivm = bld_base->base.gallivm;
4417 const struct tgsi_full_instruction *inst = emit_data->inst;
4418 unsigned opcode = inst->Instruction.Opcode;
4419 unsigned target = inst->Texture.Texture;
4420 LLVMValueRef coords[5], derivs[6];
4421 LLVMValueRef address[16];
4422 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4423 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4424 unsigned count = 0;
4425 unsigned chan;
4426 unsigned num_deriv_channels = 0;
4427 bool has_offset = inst->Texture.NumOffsets > 0;
4428 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4429 unsigned dmask = 0xf;
4430
4431 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4432
4433 if (target == TGSI_TEXTURE_BUFFER) {
4434 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4435
4436 /* Bitcast and truncate v8i32 to v16i8. */
4437 LLVMValueRef res = res_ptr;
4438 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4439 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4440 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4441
4442 emit_data->dst_type = ctx->v4f32;
4443 emit_data->args[0] = res;
4444 emit_data->args[1] = bld_base->uint_bld.zero;
4445 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4446 emit_data->arg_count = 3;
4447 return;
4448 }
4449
4450 /* Fetch and project texture coordinates */
4451 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4452 for (chan = 0; chan < 3; chan++ ) {
4453 coords[chan] = lp_build_emit_fetch(bld_base,
4454 emit_data->inst, 0,
4455 chan);
4456 if (opcode == TGSI_OPCODE_TXP)
4457 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4458 TGSI_OPCODE_DIV,
4459 coords[chan],
4460 coords[3]);
4461 }
4462
4463 if (opcode == TGSI_OPCODE_TXP)
4464 coords[3] = bld_base->base.one;
4465
4466 /* Pack offsets. */
4467 if (has_offset && opcode != TGSI_OPCODE_TXF) {
4468 /* The offsets are six-bit signed integers packed like this:
4469 * X=[5:0], Y=[13:8], and Z=[21:16].
4470 */
4471 LLVMValueRef offset[3], pack;
4472
4473 assert(inst->Texture.NumOffsets == 1);
4474
4475 for (chan = 0; chan < 3; chan++) {
4476 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4477 emit_data->inst, 0, chan);
4478 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4479 lp_build_const_int32(gallivm, 0x3f), "");
4480 if (chan)
4481 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4482 lp_build_const_int32(gallivm, chan*8), "");
4483 }
4484
4485 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4486 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4487 address[count++] = pack;
4488 }
4489
4490 /* Pack LOD bias value */
4491 if (opcode == TGSI_OPCODE_TXB)
4492 address[count++] = coords[3];
4493 if (opcode == TGSI_OPCODE_TXB2)
4494 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4495
4496 /* Pack depth comparison value */
4497 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4498 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4499 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4500 } else {
4501 assert(ref_pos >= 0);
4502 address[count++] = coords[ref_pos];
4503 }
4504 }
4505
4506 /* Pack user derivatives */
4507 if (opcode == TGSI_OPCODE_TXD) {
4508 int param, num_src_deriv_channels;
4509
4510 switch (target) {
4511 case TGSI_TEXTURE_3D:
4512 num_src_deriv_channels = 3;
4513 num_deriv_channels = 3;
4514 break;
4515 case TGSI_TEXTURE_2D:
4516 case TGSI_TEXTURE_SHADOW2D:
4517 case TGSI_TEXTURE_RECT:
4518 case TGSI_TEXTURE_SHADOWRECT:
4519 case TGSI_TEXTURE_2D_ARRAY:
4520 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4521 num_src_deriv_channels = 2;
4522 num_deriv_channels = 2;
4523 break;
4524 case TGSI_TEXTURE_CUBE:
4525 case TGSI_TEXTURE_SHADOWCUBE:
4526 case TGSI_TEXTURE_CUBE_ARRAY:
4527 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4528 /* Cube derivatives will be converted to 2D. */
4529 num_src_deriv_channels = 3;
4530 num_deriv_channels = 2;
4531 break;
4532 case TGSI_TEXTURE_1D:
4533 case TGSI_TEXTURE_SHADOW1D:
4534 case TGSI_TEXTURE_1D_ARRAY:
4535 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4536 num_src_deriv_channels = 1;
4537 num_deriv_channels = 1;
4538 break;
4539 default:
4540 unreachable("invalid target");
4541 }
4542
4543 for (param = 0; param < 2; param++)
4544 for (chan = 0; chan < num_src_deriv_channels; chan++)
4545 derivs[param * num_src_deriv_channels + chan] =
4546 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4547 }
4548
4549 if (target == TGSI_TEXTURE_CUBE ||
4550 target == TGSI_TEXTURE_CUBE_ARRAY ||
4551 target == TGSI_TEXTURE_SHADOWCUBE ||
4552 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4553 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4554
4555 if (opcode == TGSI_OPCODE_TXD)
4556 for (int i = 0; i < num_deriv_channels * 2; i++)
4557 address[count++] = derivs[i];
4558
4559 /* Pack texture coordinates */
4560 address[count++] = coords[0];
4561 if (num_coords > 1)
4562 address[count++] = coords[1];
4563 if (num_coords > 2)
4564 address[count++] = coords[2];
4565
4566 /* Pack LOD or sample index */
4567 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4568 address[count++] = coords[3];
4569 else if (opcode == TGSI_OPCODE_TXL2)
4570 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4571
4572 if (count > 16) {
4573 assert(!"Cannot handle more than 16 texture address parameters");
4574 count = 16;
4575 }
4576
4577 for (chan = 0; chan < count; chan++ ) {
4578 address[chan] = LLVMBuildBitCast(gallivm->builder,
4579 address[chan], ctx->i32, "");
4580 }
4581
4582 /* Adjust the sample index according to FMASK.
4583 *
4584 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4585 * which is the identity mapping. Each nibble says which physical sample
4586 * should be fetched to get that sample.
4587 *
4588 * For example, 0x11111100 means there are only 2 samples stored and
4589 * the second sample covers 3/4 of the pixel. When reading samples 0
4590 * and 1, return physical sample 0 (determined by the first two 0s
4591 * in FMASK), otherwise return physical sample 1.
4592 *
4593 * The sample index should be adjusted as follows:
4594 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4595 */
4596 if (target == TGSI_TEXTURE_2D_MSAA ||
4597 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4598 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4599 struct lp_build_emit_data txf_emit_data = *emit_data;
4600 LLVMValueRef txf_address[4];
4601 unsigned txf_count = count;
4602 struct tgsi_full_instruction inst = {};
4603
4604 memcpy(txf_address, address, sizeof(txf_address));
4605
4606 if (target == TGSI_TEXTURE_2D_MSAA) {
4607 txf_address[2] = bld_base->uint_bld.zero;
4608 }
4609 txf_address[3] = bld_base->uint_bld.zero;
4610
4611 /* Read FMASK using TXF. */
4612 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4613 inst.Texture.Texture = target;
4614 txf_emit_data.inst = &inst;
4615 txf_emit_data.chan = 0;
4616 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4617 target, fmask_ptr, NULL,
4618 txf_address, txf_count, 0xf);
4619 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4620
4621 /* Initialize some constants. */
4622 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4623 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4624
4625 /* Apply the formula. */
4626 LLVMValueRef fmask =
4627 LLVMBuildExtractElement(gallivm->builder,
4628 txf_emit_data.output[0],
4629 uint_bld->zero, "");
4630
4631 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4632
4633 LLVMValueRef sample_index4 =
4634 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4635
4636 LLVMValueRef shifted_fmask =
4637 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4638
4639 LLVMValueRef final_sample =
4640 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4641
4642 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4643 * resource descriptor is 0 (invalid),
4644 */
4645 LLVMValueRef fmask_desc =
4646 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4647 ctx->v8i32, "");
4648
4649 LLVMValueRef fmask_word1 =
4650 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4651 uint_bld->one, "");
4652
4653 LLVMValueRef word1_is_nonzero =
4654 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4655 fmask_word1, uint_bld->zero, "");
4656
4657 /* Replace the MSAA sample index. */
4658 address[sample_chan] =
4659 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4660 final_sample, address[sample_chan], "");
4661 }
4662
4663 if (opcode == TGSI_OPCODE_TXF) {
4664 /* add tex offsets */
4665 if (inst->Texture.NumOffsets) {
4666 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4667 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4668 const struct tgsi_texture_offset *off = inst->TexOffsets;
4669
4670 assert(inst->Texture.NumOffsets == 1);
4671
4672 switch (target) {
4673 case TGSI_TEXTURE_3D:
4674 address[2] = lp_build_add(uint_bld, address[2],
4675 bld->immediates[off->Index][off->SwizzleZ]);
4676 /* fall through */
4677 case TGSI_TEXTURE_2D:
4678 case TGSI_TEXTURE_SHADOW2D:
4679 case TGSI_TEXTURE_RECT:
4680 case TGSI_TEXTURE_SHADOWRECT:
4681 case TGSI_TEXTURE_2D_ARRAY:
4682 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4683 address[1] =
4684 lp_build_add(uint_bld, address[1],
4685 bld->immediates[off->Index][off->SwizzleY]);
4686 /* fall through */
4687 case TGSI_TEXTURE_1D:
4688 case TGSI_TEXTURE_SHADOW1D:
4689 case TGSI_TEXTURE_1D_ARRAY:
4690 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4691 address[0] =
4692 lp_build_add(uint_bld, address[0],
4693 bld->immediates[off->Index][off->SwizzleX]);
4694 break;
4695 /* texture offsets do not apply to other texture targets */
4696 }
4697 }
4698 }
4699
4700 if (opcode == TGSI_OPCODE_TG4) {
4701 unsigned gather_comp = 0;
4702
4703 /* DMASK was repurposed for GATHER4. 4 components are always
4704 * returned and DMASK works like a swizzle - it selects
4705 * the component to fetch. The only valid DMASK values are
4706 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4707 * (red,red,red,red) etc.) The ISA document doesn't mention
4708 * this.
4709 */
4710
4711 /* Get the component index from src1.x for Gather4. */
4712 if (!tgsi_is_shadow_target(target)) {
4713 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4714 LLVMValueRef comp_imm;
4715 struct tgsi_src_register src1 = inst->Src[1].Register;
4716
4717 assert(src1.File == TGSI_FILE_IMMEDIATE);
4718
4719 comp_imm = imms[src1.Index][src1.SwizzleX];
4720 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4721 gather_comp = CLAMP(gather_comp, 0, 3);
4722 }
4723
4724 dmask = 1 << gather_comp;
4725 }
4726
4727 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4728 samp_ptr, address, count, dmask);
4729 }
4730
4731 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4732 struct lp_build_tgsi_context *bld_base,
4733 struct lp_build_emit_data *emit_data)
4734 {
4735 struct si_shader_context *ctx = si_shader_context(bld_base);
4736 struct lp_build_context *base = &bld_base->base;
4737 unsigned opcode = emit_data->inst->Instruction.Opcode;
4738 unsigned target = emit_data->inst->Texture.Texture;
4739 char intr_name[127];
4740 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4741 bool is_shadow = tgsi_is_shadow_target(target);
4742 char type[64];
4743 const char *name = "llvm.SI.image.sample";
4744 const char *infix = "";
4745
4746 if (target == TGSI_TEXTURE_BUFFER) {
4747 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4748 base->gallivm->builder,
4749 "llvm.SI.vs.load.input", emit_data->dst_type,
4750 emit_data->args, emit_data->arg_count,
4751 LLVMReadNoneAttribute);
4752 return;
4753 }
4754
4755 switch (opcode) {
4756 case TGSI_OPCODE_TXF:
4757 name = target == TGSI_TEXTURE_2D_MSAA ||
4758 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4759 "llvm.SI.image.load" :
4760 "llvm.SI.image.load.mip";
4761 is_shadow = false;
4762 has_offset = false;
4763 break;
4764 case TGSI_OPCODE_LODQ:
4765 name = "llvm.SI.getlod";
4766 is_shadow = false;
4767 has_offset = false;
4768 break;
4769 case TGSI_OPCODE_TEX:
4770 case TGSI_OPCODE_TEX2:
4771 case TGSI_OPCODE_TXP:
4772 if (ctx->type != PIPE_SHADER_FRAGMENT)
4773 infix = ".lz";
4774 break;
4775 case TGSI_OPCODE_TXB:
4776 case TGSI_OPCODE_TXB2:
4777 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4778 infix = ".b";
4779 break;
4780 case TGSI_OPCODE_TXL:
4781 case TGSI_OPCODE_TXL2:
4782 infix = ".l";
4783 break;
4784 case TGSI_OPCODE_TXD:
4785 infix = ".d";
4786 break;
4787 case TGSI_OPCODE_TG4:
4788 name = "llvm.SI.gather4";
4789 infix = ".lz";
4790 break;
4791 default:
4792 assert(0);
4793 return;
4794 }
4795
4796 /* Add the type and suffixes .c, .o if needed. */
4797 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4798 sprintf(intr_name, "%s%s%s%s.%s",
4799 name, is_shadow ? ".c" : "", infix,
4800 has_offset ? ".o" : "", type);
4801
4802 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4803 base->gallivm->builder, intr_name, emit_data->dst_type,
4804 emit_data->args, emit_data->arg_count,
4805 LLVMReadNoneAttribute);
4806 }
4807
4808 static void si_llvm_emit_txqs(
4809 const struct lp_build_tgsi_action *action,
4810 struct lp_build_tgsi_context *bld_base,
4811 struct lp_build_emit_data *emit_data)
4812 {
4813 struct si_shader_context *ctx = si_shader_context(bld_base);
4814 struct gallivm_state *gallivm = bld_base->base.gallivm;
4815 LLVMBuilderRef builder = gallivm->builder;
4816 LLVMValueRef res, samples;
4817 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4818
4819 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4820
4821
4822 /* Read the samples from the descriptor directly. */
4823 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4824 samples = LLVMBuildExtractElement(
4825 builder, res,
4826 lp_build_const_int32(gallivm, 3), "");
4827 samples = LLVMBuildLShr(builder, samples,
4828 lp_build_const_int32(gallivm, 16), "");
4829 samples = LLVMBuildAnd(builder, samples,
4830 lp_build_const_int32(gallivm, 0xf), "");
4831 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4832 samples, "");
4833
4834 emit_data->output[emit_data->chan] = samples;
4835 }
4836
4837 /*
4838 * SI implements derivatives using the local data store (LDS)
4839 * All writes to the LDS happen in all executing threads at
4840 * the same time. TID is the Thread ID for the current
4841 * thread and is a value between 0 and 63, representing
4842 * the thread's position in the wavefront.
4843 *
4844 * For the pixel shader threads are grouped into quads of four pixels.
4845 * The TIDs of the pixels of a quad are:
4846 *
4847 * +------+------+
4848 * |4n + 0|4n + 1|
4849 * +------+------+
4850 * |4n + 2|4n + 3|
4851 * +------+------+
4852 *
4853 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4854 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4855 * the current pixel's column, and masking with 0xfffffffe yields the TID
4856 * of the left pixel of the current pixel's row.
4857 *
4858 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4859 * adding 2 yields the TID of the pixel below the top pixel.
4860 */
4861 /* masks for thread ID. */
4862 #define TID_MASK_TOP_LEFT 0xfffffffc
4863 #define TID_MASK_TOP 0xfffffffd
4864 #define TID_MASK_LEFT 0xfffffffe
4865
/* Emit TGSI DDX/DDY/DDX_FINE/DDY_FINE.
 *
 * Computes screen-space derivatives by exchanging values between the
 * pixels of a quad (see the TID layout diagram above).  On Tonga+ with a
 * new enough LLVM the exchange uses the ds.bpermute intrinsic; otherwise
 * each thread stores its value to its own LDS slot and reads its
 * neighbors' slots.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* This thread's own LDS slot, indexed by thread ID. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Coarse derivatives use the quad's top-left pixel as the base;
	 * fine derivatives use the pixel's own row/column. */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	/* TID (and LDS slot) of the base pixel. */
	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel (+1), for DDY the next Y pixel (+2). */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* Reuse the result of an earlier channel with the same
		 * source swizzle instead of recomputing it. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {
			/* ds.bpermute takes a byte address, hence TID * 4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* LDS fallback: publish the value, then read the
			 * base pixel's and the neighbor pixel's slots. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		/* Derivative = neighbor - base. */
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4955
/*
 * This takes an I,J coordinate pair and works out the X and Y screen-space
 * derivatives via LDS neighbor exchange within the quad.
 * It returns DDX(I), DDX(J), DDY(I), DDY(J) as a 4-element vector.
 */
static LLVMValueRef si_llvm_emit_ddxy_interp(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef interp_ij)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
	LLVMValueRef tl, tr, bl, result[4];
	unsigned c;

	/* Each thread writes to its own LDS slot (indexed by TID) and reads
	 * its quad neighbors' slots. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* TID of the left pixel of this pixel's row. */
	temp = LLVMBuildAnd(gallivm->builder, indices[1],
			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");

	/* TID of the top pixel of this pixel's column. */
	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");

	indices[1] = temp;
	load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	indices[1] = temp2;
	load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	/* Pixel to the right of the left pixel (+1 in X). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
				  lp_build_const_int32(gallivm, 1), "");
	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	/* Pixel below the top pixel (+2 is one row down within the quad). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
				  lp_build_const_int32(gallivm, 2), "");
	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	/* c == 0 processes I, c == 1 processes J. */
	for (c = 0; c < 2; ++c) {
		LLVMValueRef store_val;
		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);

		/* Publish this thread's coefficient so neighbors can read it. */
		store_val = LLVMBuildExtractElement(gallivm->builder,
						    interp_ij, c_ll, "");
		LLVMBuildStore(gallivm->builder,
			       store_val,
			       store_ptr);

		/* DDX = right neighbor - left pixel. */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
		tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");

		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");

		/* DDY = bottom neighbor - top pixel. */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
		bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");

		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
	}

	return lp_build_gather_values(gallivm, result, 4);
}
5030
5031 static void interp_fetch_args(
5032 struct lp_build_tgsi_context *bld_base,
5033 struct lp_build_emit_data *emit_data)
5034 {
5035 struct si_shader_context *ctx = si_shader_context(bld_base);
5036 struct gallivm_state *gallivm = bld_base->base.gallivm;
5037 const struct tgsi_full_instruction *inst = emit_data->inst;
5038
5039 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5040 /* offset is in second src, first two channels */
5041 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5042 emit_data->inst, 1,
5043 TGSI_CHAN_X);
5044 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5045 emit_data->inst, 1,
5046 TGSI_CHAN_Y);
5047 emit_data->arg_count = 2;
5048 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5049 LLVMValueRef sample_position;
5050 LLVMValueRef sample_id;
5051 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5052
5053 /* fetch sample ID, then fetch its sample position,
5054 * and place into first two channels.
5055 */
5056 sample_id = lp_build_emit_fetch(bld_base,
5057 emit_data->inst, 1, TGSI_CHAN_X);
5058 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5059 ctx->i32, "");
5060 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5061
5062 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5063 sample_position,
5064 lp_build_const_int32(gallivm, 0), "");
5065
5066 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5067 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5068 sample_position,
5069 lp_build_const_int32(gallivm, 1), "");
5070 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5071 emit_data->arg_count = 2;
5072 }
5073 }
5074
/* Emit TGSI INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET.
 *
 * Evaluates the fragment shader input selected by src0 at the requested
 * location.  For INTERP_SAMPLE and INTERP_OFFSET, new I/J interpolation
 * coefficients are derived from the center-based coefficients and their
 * screen-space derivatives before invoking the interpolator.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Offset/sample interpolation starts from center-based
	 * coefficients; INTERP_CENTROID uses the centroid ones directly. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* -1: unsupported mode; 0: constant interpolation (no I/J needed). */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = get_interp_param(ctx, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			/* The interpolator intrinsic takes I/J as i32 bits. */
			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	/* With no I/J coefficients, use the flat/constant interpolator. */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute);
	}
}
5173
5174 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5175 struct lp_build_emit_data *emit_data)
5176 {
5177 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5178 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5179 unsigned stream;
5180
5181 assert(src0.File == TGSI_FILE_IMMEDIATE);
5182
5183 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5184 return stream;
5185 }
5186
/* Emit one vertex from the geometry shader:
 * store all declared outputs to the GSVS ring buffer for the selected
 * stream, bump the per-stream vertex counter, and send the EMIT message
 * to the GS hardware.
 */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	/* AMDGPU.kill discards the thread when its argument is negative. */
	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Ring layout is component-major: each of the
			 * 4*num_outputs components gets a run of
			 * gs_max_out_vertices dword slots. */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, 0);
}
5264
5265 /* Cut one primitive from the geometry shader */
5266 static void si_llvm_emit_primitive(
5267 const struct lp_build_tgsi_action *action,
5268 struct lp_build_tgsi_context *bld_base,
5269 struct lp_build_emit_data *emit_data)
5270 {
5271 struct si_shader_context *ctx = si_shader_context(bld_base);
5272 struct gallivm_state *gallivm = bld_base->base.gallivm;
5273 LLVMValueRef args[2];
5274 unsigned stream;
5275
5276 /* Signal primitive cut */
5277 stream = si_llvm_get_stream(bld_base, emit_data);
5278 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5279 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5280 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5281 ctx->voidt, args, 2, 0);
5282 }
5283
5284 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5285 struct lp_build_tgsi_context *bld_base,
5286 struct lp_build_emit_data *emit_data)
5287 {
5288 struct si_shader_context *ctx = si_shader_context(bld_base);
5289 struct gallivm_state *gallivm = bld_base->base.gallivm;
5290
5291 /* The real barrier instruction isn’t needed, because an entire patch
5292 * always fits into a single wave.
5293 */
5294 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5295 emit_optimization_barrier(ctx);
5296 return;
5297 }
5298
5299 lp_build_intrinsic(gallivm->builder,
5300 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5301 : "llvm.AMDGPU.barrier.local",
5302 ctx->voidt, NULL, 0, 0);
5303 }
5304
/* Action table entry for TGSI texturing opcodes. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5309
/* Action table entry for TGSI INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5314
5315 static void si_create_function(struct si_shader_context *ctx,
5316 LLVMTypeRef *returns, unsigned num_returns,
5317 LLVMTypeRef *params, unsigned num_params,
5318 int last_sgpr)
5319 {
5320 int i;
5321
5322 radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
5323 params, num_params);
5324 radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
5325 ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
5326
5327 for (i = 0; i <= last_sgpr; ++i) {
5328 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
5329
5330 /* The combination of:
5331 * - ByVal
5332 * - dereferenceable
5333 * - invariant.load
5334 * allows the optimization passes to move loads and reduces
5335 * SGPR spilling significantly.
5336 */
5337 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5338 LLVMAddAttribute(P, LLVMByValAttribute);
5339 lp_add_attr_dereferenceable(P, UINT64_MAX);
5340 } else
5341 LLVMAddAttribute(P, LLVMInRegAttribute);
5342 }
5343
5344 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5345 /* These were copied from some LLVM test. */
5346 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5347 "less-precise-fpmad",
5348 "true");
5349 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5350 "no-infs-fp-math",
5351 "true");
5352 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5353 "no-nans-fp-math",
5354 "true");
5355 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5356 "unsafe-fp-math",
5357 "true");
5358 }
5359 }
5360
5361 static void create_meta_data(struct si_shader_context *ctx)
5362 {
5363 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5364
5365 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5366 "invariant.load", 14);
5367 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5368 "range", 5);
5369 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5370 "amdgpu.uniform", 14);
5371
5372 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5373 }
5374
5375 static void declare_streamout_params(struct si_shader_context *ctx,
5376 struct pipe_stream_output_info *so,
5377 LLVMTypeRef *params, LLVMTypeRef i32,
5378 unsigned *num_params)
5379 {
5380 int i;
5381
5382 /* Streamout SGPRs. */
5383 if (so->num_outputs) {
5384 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5385 params[ctx->param_streamout_config = (*num_params)++] = i32;
5386 else
5387 ctx->param_streamout_config = ctx->param_tess_offchip;
5388
5389 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5390 }
5391 /* A streamout buffer offset is loaded if the stride is non-zero. */
5392 for (i = 0; i < 4; i++) {
5393 if (!so->stride[i])
5394 continue;
5395
5396 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5397 }
5398 }
5399
5400 static unsigned llvm_get_type_size(LLVMTypeRef type)
5401 {
5402 LLVMTypeKind kind = LLVMGetTypeKind(type);
5403
5404 switch (kind) {
5405 case LLVMIntegerTypeKind:
5406 return LLVMGetIntTypeWidth(type) / 8;
5407 case LLVMFloatTypeKind:
5408 return 4;
5409 case LLVMPointerTypeKind:
5410 return 8;
5411 case LLVMVectorTypeKind:
5412 return LLVMGetVectorSize(type) *
5413 llvm_get_type_size(LLVMGetElementType(type));
5414 default:
5415 assert(0);
5416 return 0;
5417 }
5418 }
5419
5420 static void declare_tess_lds(struct si_shader_context *ctx)
5421 {
5422 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5423 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5424 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5425
5426 /* The actual size is computed outside of the shader to reduce
5427 * the number of shader variants. */
5428 ctx->lds =
5429 LLVMAddGlobalInAddressSpace(gallivm->module,
5430 LLVMArrayType(i32, lds_size / 4),
5431 "tess_lds",
5432 LOCAL_ADDR_SPACE);
5433 }
5434
5435 static void create_function(struct si_shader_context *ctx)
5436 {
5437 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5438 struct gallivm_state *gallivm = bld_base->base.gallivm;
5439 struct si_shader *shader = ctx->shader;
5440 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5441 LLVMTypeRef returns[16+32*4];
5442 unsigned i, last_sgpr, num_params, num_return_sgprs;
5443 unsigned num_returns = 0;
5444
5445 v3i32 = LLVMVectorType(ctx->i32, 3);
5446
5447 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5448 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5449 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5450 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5451 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5452
5453 switch (ctx->type) {
5454 case PIPE_SHADER_VERTEX:
5455 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5456 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5457 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5458 num_params = SI_PARAM_START_INSTANCE+1;
5459
5460 if (shader->key.vs.as_es) {
5461 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5462 } else if (shader->key.vs.as_ls) {
5463 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5464 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5465 } else {
5466 if (ctx->is_gs_copy_shader) {
5467 num_params = SI_PARAM_RW_BUFFERS+1;
5468 } else {
5469 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5470 num_params = SI_PARAM_VS_STATE_BITS+1;
5471 }
5472
5473 /* The locations of the other parameters are assigned dynamically. */
5474 declare_streamout_params(ctx, &shader->selector->so,
5475 params, ctx->i32, &num_params);
5476 }
5477
5478 last_sgpr = num_params-1;
5479
5480 /* VGPRs */
5481 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5482 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5483 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5484 params[ctx->param_instance_id = num_params++] = ctx->i32;
5485
5486 if (!ctx->is_monolithic &&
5487 !ctx->is_gs_copy_shader) {
5488 /* Vertex load indices. */
5489 ctx->param_vertex_index0 = num_params;
5490
5491 for (i = 0; i < shader->selector->info.num_inputs; i++)
5492 params[num_params++] = ctx->i32;
5493
5494 /* PrimitiveID output. */
5495 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5496 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5497 returns[num_returns++] = ctx->f32;
5498 }
5499 break;
5500
5501 case PIPE_SHADER_TESS_CTRL:
5502 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5503 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5504 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5505 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5506 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5507 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5508 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5509
5510 /* VGPRs */
5511 params[SI_PARAM_PATCH_ID] = ctx->i32;
5512 params[SI_PARAM_REL_IDS] = ctx->i32;
5513 num_params = SI_PARAM_REL_IDS+1;
5514
5515 if (!ctx->is_monolithic) {
5516 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5517 * placed after the user SGPRs.
5518 */
5519 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5520 returns[num_returns++] = ctx->i32; /* SGPRs */
5521
5522 for (i = 0; i < 3; i++)
5523 returns[num_returns++] = ctx->f32; /* VGPRs */
5524 }
5525 break;
5526
5527 case PIPE_SHADER_TESS_EVAL:
5528 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5529 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5530
5531 if (shader->key.tes.as_es) {
5532 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5533 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5534 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5535 } else {
5536 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5537 declare_streamout_params(ctx, &shader->selector->so,
5538 params, ctx->i32, &num_params);
5539 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5540 }
5541 last_sgpr = num_params - 1;
5542
5543 /* VGPRs */
5544 params[ctx->param_tes_u = num_params++] = ctx->f32;
5545 params[ctx->param_tes_v = num_params++] = ctx->f32;
5546 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5547 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5548
5549 /* PrimitiveID output. */
5550 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5551 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5552 returns[num_returns++] = ctx->f32;
5553 break;
5554
5555 case PIPE_SHADER_GEOMETRY:
5556 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5557 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5558 last_sgpr = SI_PARAM_GS_WAVE_ID;
5559
5560 /* VGPRs */
5561 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5562 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5563 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5564 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5565 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5566 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5567 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5568 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5569 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5570 break;
5571
5572 case PIPE_SHADER_FRAGMENT:
5573 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5574 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5575 last_sgpr = SI_PARAM_PRIM_MASK;
5576 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5577 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5578 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5579 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5580 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5581 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5582 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5583 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5584 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5585 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5586 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5587 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5588 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5589 params[SI_PARAM_ANCILLARY] = ctx->i32;
5590 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5591 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5592 num_params = SI_PARAM_POS_FIXED_PT+1;
5593
5594 if (!ctx->is_monolithic) {
5595 /* Color inputs from the prolog. */
5596 if (shader->selector->info.colors_read) {
5597 unsigned num_color_elements =
5598 util_bitcount(shader->selector->info.colors_read);
5599
5600 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5601 for (i = 0; i < num_color_elements; i++)
5602 params[num_params++] = ctx->f32;
5603 }
5604
5605 /* Outputs for the epilog. */
5606 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5607 num_returns =
5608 num_return_sgprs +
5609 util_bitcount(shader->selector->info.colors_written) * 4 +
5610 shader->selector->info.writes_z +
5611 shader->selector->info.writes_stencil +
5612 shader->selector->info.writes_samplemask +
5613 1 /* SampleMaskIn */;
5614
5615 num_returns = MAX2(num_returns,
5616 num_return_sgprs +
5617 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5618
5619 for (i = 0; i < num_return_sgprs; i++)
5620 returns[i] = ctx->i32;
5621 for (; i < num_returns; i++)
5622 returns[i] = ctx->f32;
5623 }
5624 break;
5625
5626 case PIPE_SHADER_COMPUTE:
5627 params[SI_PARAM_GRID_SIZE] = v3i32;
5628 params[SI_PARAM_BLOCK_ID] = v3i32;
5629 last_sgpr = SI_PARAM_BLOCK_ID;
5630
5631 params[SI_PARAM_THREAD_ID] = v3i32;
5632 num_params = SI_PARAM_THREAD_ID + 1;
5633 break;
5634 default:
5635 assert(0 && "unimplemented shader");
5636 return;
5637 }
5638
5639 assert(num_params <= ARRAY_SIZE(params));
5640
5641 si_create_function(ctx, returns, num_returns, params,
5642 num_params, last_sgpr);
5643
5644 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5645 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5646 !ctx->is_monolithic) {
5647 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5648 "InitialPSInputAddr",
5649 S_0286D0_PERSP_SAMPLE_ENA(1) |
5650 S_0286D0_PERSP_CENTER_ENA(1) |
5651 S_0286D0_PERSP_CENTROID_ENA(1) |
5652 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5653 S_0286D0_LINEAR_CENTER_ENA(1) |
5654 S_0286D0_LINEAR_CENTROID_ENA(1) |
5655 S_0286D0_FRONT_FACE_ENA(1) |
5656 S_0286D0_POS_FIXED_PT_ENA(1));
5657 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5658 const unsigned *properties = shader->selector->info.properties;
5659 unsigned max_work_group_size =
5660 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5661 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5662 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5663
5664 assert(max_work_group_size);
5665
5666 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5667 "amdgpu-max-work-group-size",
5668 max_work_group_size);
5669 }
5670
5671 shader->info.num_input_sgprs = 0;
5672 shader->info.num_input_vgprs = 0;
5673
5674 for (i = 0; i <= last_sgpr; ++i)
5675 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5676
5677 /* Unused fragment shader inputs are eliminated by the compiler,
5678 * so we don't know yet how many there will be.
5679 */
5680 if (ctx->type != PIPE_SHADER_FRAGMENT)
5681 for (; i < num_params; ++i)
5682 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5683
5684 if (bld_base->info &&
5685 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5686 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5687 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5688 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5689 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5690 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5691 ctx->lds =
5692 LLVMAddGlobalInAddressSpace(gallivm->module,
5693 LLVMArrayType(ctx->i32, 64),
5694 "ddxy_lds",
5695 LOCAL_ADDR_SPACE);
5696
5697 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5698 ctx->type == PIPE_SHADER_TESS_CTRL ||
5699 ctx->type == PIPE_SHADER_TESS_EVAL)
5700 declare_tess_lds(ctx);
5701 }
5702
5703 static void preload_constants(struct si_shader_context *ctx)
5704 {
5705 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5706 struct gallivm_state *gallivm = bld_base->base.gallivm;
5707 const struct tgsi_shader_info *info = bld_base->info;
5708 unsigned buf;
5709 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5710
5711 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5712 unsigned i, num_const = info->const_file_max[buf] + 1;
5713
5714 if (num_const == 0)
5715 continue;
5716
5717 /* Allocate space for the constant values */
5718 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5719
5720 /* Load the resource descriptor */
5721 ctx->const_buffers[buf] =
5722 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5723
5724 /* Load the constants, we rely on the code sinking to do the rest */
5725 for (i = 0; i < num_const * 4; ++i) {
5726 ctx->constants[buf][i] =
5727 buffer_load_const(ctx,
5728 ctx->const_buffers[buf],
5729 lp_build_const_int32(gallivm, i * 4));
5730 }
5731 }
5732 }
5733
5734 static void preload_shader_buffers(struct si_shader_context *ctx)
5735 {
5736 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5737 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5738 int buf, maxbuf;
5739
5740 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5741 SI_NUM_SHADER_BUFFERS - 1);
5742 for (buf = 0; buf <= maxbuf; ++buf) {
5743 ctx->shader_buffers[buf] =
5744 build_indexed_load_const(
5745 ctx, ptr, lp_build_const_int32(gallivm, buf));
5746 }
5747 }
5748
/* Preload texture resource, FMASK, and sampler-state descriptors for every
 * sampler the shader declares.
 */
static void preload_samplers(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
	LLVMValueRef offset;

	if (num_samplers == 0)
		return;

	/* Load the resources and samplers, we rely on the code sinking to do the rest */
	for (i = 0; i < num_samplers; ++i) {
		/* Resource */
		offset = lp_build_const_int32(gallivm, i);
		ctx->sampler_views[i] =
			get_sampler_desc(ctx, offset, DESC_IMAGE);

		/* FMASK resource */
		if (info->is_msaa_sampler[i])
			ctx->fmasks[i] =
				get_sampler_desc(ctx, offset, DESC_FMASK);
		else {
			/* NOTE(review): sampler state is only loaded for
			 * non-MSAA samplers — presumably MSAA texel fetches
			 * don't go through sampler state; confirm intent.
			 */
			ctx->sampler_states[i] =
				get_sampler_desc(ctx, offset, DESC_SAMPLER);
			/* Work around SI/CI anisotropic-filtering issues using
			 * the paired resource descriptor.
			 */
			ctx->sampler_states[i] =
				sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
						       ctx->sampler_states[i]);
		}
	}
}
5780
5781 static void preload_images(struct si_shader_context *ctx)
5782 {
5783 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5784 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5785 struct gallivm_state *gallivm = bld_base->base.gallivm;
5786 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5787 LLVMValueRef res_ptr;
5788 unsigned i;
5789
5790 if (num_images == 0)
5791 return;
5792
5793 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5794
5795 for (i = 0; i < num_images; ++i) {
5796 /* Rely on LLVM to shrink the load for buffer resources. */
5797 LLVMValueRef rsrc =
5798 build_indexed_load_const(ctx, res_ptr,
5799 lp_build_const_int32(gallivm, i));
5800
5801 if (info->images_writemask & (1 << i) &&
5802 !(info->images_buffers & (1 << i)))
5803 rsrc = force_dcc_off(ctx, rsrc);
5804
5805 ctx->images[i] = rsrc;
5806 }
5807 }
5808
5809 static void preload_streamout_buffers(struct si_shader_context *ctx)
5810 {
5811 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5812 struct gallivm_state *gallivm = bld_base->base.gallivm;
5813 unsigned i;
5814
5815 /* Streamout can only be used if the shader is compiled as VS. */
5816 if (!ctx->shader->selector->so.num_outputs ||
5817 (ctx->type == PIPE_SHADER_VERTEX &&
5818 (ctx->shader->key.vs.as_es ||
5819 ctx->shader->key.vs.as_ls)) ||
5820 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5821 ctx->shader->key.tes.as_es))
5822 return;
5823
5824 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5825 SI_PARAM_RW_BUFFERS);
5826
5827 /* Load the resources, we rely on the code sinking to do the rest */
5828 for (i = 0; i < 4; ++i) {
5829 if (ctx->shader->selector->so.stride[i]) {
5830 LLVMValueRef offset = lp_build_const_int32(gallivm,
5831 SI_VS_STREAMOUT_BUF0 + i);
5832
5833 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5834 }
5835 }
5836 }
5837
5838 /**
5839 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5840 * for later use.
5841 */
5842 static void preload_ring_buffers(struct si_shader_context *ctx)
5843 {
5844 struct gallivm_state *gallivm =
5845 ctx->radeon_bld.soa.bld_base.base.gallivm;
5846
5847 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5848 SI_PARAM_RW_BUFFERS);
5849
5850 if ((ctx->type == PIPE_SHADER_VERTEX &&
5851 ctx->shader->key.vs.as_es) ||
5852 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5853 ctx->shader->key.tes.as_es) ||
5854 ctx->type == PIPE_SHADER_GEOMETRY) {
5855 unsigned ring =
5856 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5857 : SI_ES_RING_ESGS;
5858 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5859
5860 ctx->esgs_ring =
5861 build_indexed_load_const(ctx, buf_ptr, offset);
5862 }
5863
5864 if (ctx->is_gs_copy_shader) {
5865 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5866
5867 ctx->gsvs_ring[0] =
5868 build_indexed_load_const(ctx, buf_ptr, offset);
5869 }
5870 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5871 int i;
5872 for (i = 0; i < 4; i++) {
5873 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5874
5875 ctx->gsvs_ring[i] =
5876 build_indexed_load_const(ctx, buf_ptr, offset);
5877 }
5878 }
5879 }
5880
/* Emit code that kills the fragment if its position falls on a zero bit of
 * the 32x32 polygon stipple pattern stored in a constant buffer.
 *
 * \param param_rw_buffers     function parameter holding the RW buffer list
 * \param param_pos_fixed_pt   index of the fixed-point position VGPR input
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
	desc = build_indexed_load_const(ctx, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(ctx, desc, offset);
	row = LLVMBuildBitCast(builder, row, ctx->i32, "");
	/* Extract the bit selected by the X coordinate. */
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
5915
5916 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5917 struct si_shader_config *conf,
5918 unsigned symbol_offset)
5919 {
5920 unsigned i;
5921 const unsigned char *config =
5922 radeon_shader_binary_config_start(binary, symbol_offset);
5923 bool really_needs_scratch = false;
5924
5925 /* LLVM adds SGPR spills to the scratch size.
5926 * Find out if we really need the scratch buffer.
5927 */
5928 for (i = 0; i < binary->reloc_count; i++) {
5929 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5930
5931 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5932 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5933 really_needs_scratch = true;
5934 break;
5935 }
5936 }
5937
5938 /* XXX: We may be able to emit some of these values directly rather than
5939 * extracting fields to be emitted later.
5940 */
5941
5942 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5943 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5944 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5945 switch (reg) {
5946 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5947 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5948 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5949 case R_00B848_COMPUTE_PGM_RSRC1:
5950 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5951 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5952 conf->float_mode = G_00B028_FLOAT_MODE(value);
5953 conf->rsrc1 = value;
5954 break;
5955 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5956 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5957 break;
5958 case R_00B84C_COMPUTE_PGM_RSRC2:
5959 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5960 conf->rsrc2 = value;
5961 break;
5962 case R_0286CC_SPI_PS_INPUT_ENA:
5963 conf->spi_ps_input_ena = value;
5964 break;
5965 case R_0286D0_SPI_PS_INPUT_ADDR:
5966 conf->spi_ps_input_addr = value;
5967 break;
5968 case R_0286E8_SPI_TMPRING_SIZE:
5969 case R_00B860_COMPUTE_TMPRING_SIZE:
5970 /* WAVESIZE is in units of 256 dwords. */
5971 if (really_needs_scratch)
5972 conf->scratch_bytes_per_wave =
5973 G_00B860_WAVESIZE(value) * 256 * 4;
5974 break;
5975 case 0x4: /* SPILLED_SGPRS */
5976 conf->spilled_sgprs = value;
5977 break;
5978 case 0x8: /* SPILLED_VGPRS */
5979 conf->spilled_vgprs = value;
5980 break;
5981 default:
5982 {
5983 static bool printed;
5984
5985 if (!printed) {
5986 fprintf(stderr, "Warning: LLVM emitted unknown "
5987 "config register: 0x%x\n", reg);
5988 printed = true;
5989 }
5990 }
5991 break;
5992 }
5993
5994 if (!conf->spi_ps_input_addr)
5995 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5996 }
5997 }
5998
/* Patch the scratch buffer resource descriptor (two dwords) into the shader
 * binary at every relocation site that references the scratch symbols.
 *
 * \param scratch_va  GPU virtual address of the scratch buffer
 */
void si_shader_apply_scratch_relocs(struct si_context *sctx,
			struct si_shader *shader,
			struct si_shader_config *config,
			uint64_t scratch_va)
{
	unsigned i;
	/* Dword 0 is the low 32 bits of the base address. */
	uint32_t scratch_rsrc_dword0 = scratch_va;
	uint32_t scratch_rsrc_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);

	/* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
	 * correctly.
	 */
	if (HAVE_LLVM >= 0x0309)
		scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
	else
		scratch_rsrc_dword1 |=
			S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);

	/* Write the descriptor dwords (in little endian) over each
	 * matching relocation in the binary.
	 */
	for (i = 0 ; i < shader->binary.reloc_count; i++) {
		const struct radeon_shader_reloc *reloc =
					&shader->binary.relocs[i];
		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword0, 4);
		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword1, 4);
		}
	}
}
6030
6031 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6032 {
6033 unsigned size = shader->binary.code_size;
6034
6035 if (shader->prolog)
6036 size += shader->prolog->binary.code_size;
6037 if (shader->epilog)
6038 size += shader->epilog->binary.code_size;
6039 return size;
6040 }
6041
6042 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6043 {
6044 const struct radeon_shader_binary *prolog =
6045 shader->prolog ? &shader->prolog->binary : NULL;
6046 const struct radeon_shader_binary *epilog =
6047 shader->epilog ? &shader->epilog->binary : NULL;
6048 const struct radeon_shader_binary *mainb = &shader->binary;
6049 unsigned bo_size = si_get_shader_binary_size(shader) +
6050 (!epilog ? mainb->rodata_size : 0);
6051 unsigned char *ptr;
6052
6053 assert(!prolog || !prolog->rodata_size);
6054 assert((!prolog && !epilog) || !mainb->rodata_size);
6055 assert(!epilog || !epilog->rodata_size);
6056
6057 r600_resource_reference(&shader->bo, NULL);
6058 shader->bo = si_resource_create_custom(&sscreen->b.b,
6059 PIPE_USAGE_IMMUTABLE,
6060 bo_size);
6061 if (!shader->bo)
6062 return -ENOMEM;
6063
6064 /* Upload. */
6065 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6066 PIPE_TRANSFER_READ_WRITE);
6067
6068 if (prolog) {
6069 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6070 ptr += prolog->code_size;
6071 }
6072
6073 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6074 ptr += mainb->code_size;
6075
6076 if (epilog)
6077 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6078 else if (mainb->rodata_size > 0)
6079 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6080
6081 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6082 return 0;
6083 }
6084
/* Print the disassembly of one shader part to \p file and, when a debug
 * callback is set, forward it line by line through pipe_debug_message.
 * Falls back to a raw hex dump when no disassembly string is available.
 *
 * \param name  human-readable part name ("prolog", "main", "epilog")
 */
static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				/* Points at the next '\n' or the final NUL. */
				p = util_strchrnul(line, '\n');
				count = p - line;

				/* Skip empty lines. */
				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		/* No disassembly: dump the machine code dwords as hex. */
		fprintf(file, "Shader %s binary:\n", name);
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
6132
/* Print shader resource-usage statistics (registers, LDS, scratch, and the
 * resulting per-SIMD wave limit) to \p file and to the debug callback.
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
			         struct si_shader_config *conf,
				 unsigned num_inputs,
				 unsigned code_size,
			         struct pipe_debug_callback *debug,
			         unsigned processor,
				 FILE *file)
{
	/* LDS allocation granularity: 512 bytes on CIK+, 256 before. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	if (processor == PIPE_SHADER_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* NOTE(review): 800/512 look like the per-SIMD SGPR pool
		 * sizes for VI vs. older chips — confirm against the ISA
		 * docs.
		 */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
		        "Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Always report through the debug callback regardless of file. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs);
}
6212
6213 static const char *si_get_shader_name(struct si_shader *shader,
6214 unsigned processor)
6215 {
6216 switch (processor) {
6217 case PIPE_SHADER_VERTEX:
6218 if (shader->key.vs.as_es)
6219 return "Vertex Shader as ES";
6220 else if (shader->key.vs.as_ls)
6221 return "Vertex Shader as LS";
6222 else
6223 return "Vertex Shader as VS";
6224 case PIPE_SHADER_TESS_CTRL:
6225 return "Tessellation Control Shader";
6226 case PIPE_SHADER_TESS_EVAL:
6227 if (shader->key.tes.as_es)
6228 return "Tessellation Evaluation Shader as ES";
6229 else
6230 return "Tessellation Evaluation Shader as VS";
6231 case PIPE_SHADER_GEOMETRY:
6232 if (shader->gs_copy_shader == NULL)
6233 return "GS Copy Shader as VS";
6234 else
6235 return "Geometry Shader";
6236 case PIPE_SHADER_FRAGMENT:
6237 return "Pixel Shader";
6238 case PIPE_SHADER_COMPUTE:
6239 return "Compute Shader";
6240 default:
6241 return "Unknown Shader";
6242 }
6243 }
6244
6245 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6246 struct pipe_debug_callback *debug, unsigned processor,
6247 FILE *file)
6248 {
6249 if (file != stderr ||
6250 r600_can_dump_shader(&sscreen->b, processor))
6251 si_dump_shader_key(processor, &shader->key, file);
6252
6253 if (file != stderr && shader->binary.llvm_ir_string) {
6254 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6255 si_get_shader_name(shader, processor));
6256 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6257 }
6258
6259 if (file != stderr ||
6260 (r600_can_dump_shader(&sscreen->b, processor) &&
6261 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6262 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6263
6264 if (shader->prolog)
6265 si_shader_dump_disassembly(&shader->prolog->binary,
6266 debug, "prolog", file);
6267
6268 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6269
6270 if (shader->epilog)
6271 si_shader_dump_disassembly(&shader->epilog->binary,
6272 debug, "epilog", file);
6273 fprintf(file, "\n");
6274 }
6275
6276 si_shader_dump_stats(sscreen, &shader->config,
6277 shader->selector ? shader->selector->info.num_inputs : 0,
6278 si_get_shader_binary_size(shader), debug, processor,
6279 file);
6280 }
6281
/* Compile an LLVM module to a shader binary, read its config section, and
 * apply driver-side config fixups.
 *
 * \param binary     receives the compiled code and metadata
 * \param conf       receives the parsed shader config
 * \param name       shader name used in debug output
 * \return 0 on success, non-zero LLVM compile error otherwise
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	/* Global compile counter, also used by si_replace_shader below. */
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	if (sscreen->record_llvm_ir) {
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* si_replace_shader may substitute a binary from disk for debugging;
	 * only compile when no replacement was loaded.
	 */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary, tm, debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The raw config data has been parsed; free it. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
6351
6352 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6353 {
6354 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6355 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6356 else
6357 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6358 }
6359
6360 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6361 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6362 struct si_shader_context *ctx,
6363 struct si_shader *gs,
6364 struct pipe_debug_callback *debug)
6365 {
6366 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6367 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6368 struct lp_build_context *uint = &bld_base->uint_bld;
6369 struct si_shader_output_values *outputs;
6370 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6371 LLVMValueRef args[9];
6372 int i, r;
6373
6374 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6375
6376 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6377 ctx->type = PIPE_SHADER_VERTEX;
6378 ctx->is_gs_copy_shader = true;
6379
6380 create_meta_data(ctx);
6381 create_function(ctx);
6382 preload_streamout_buffers(ctx);
6383 preload_ring_buffers(ctx);
6384
6385 args[0] = ctx->gsvs_ring[0];
6386 args[1] = lp_build_mul_imm(uint,
6387 LLVMGetParam(ctx->radeon_bld.main_fn,
6388 ctx->param_vertex_id),
6389 4);
6390 args[3] = uint->zero;
6391 args[4] = uint->one; /* OFFEN */
6392 args[5] = uint->zero; /* IDXEN */
6393 args[6] = uint->one; /* GLC */
6394 args[7] = uint->one; /* SLC */
6395 args[8] = uint->zero; /* TFE */
6396
6397 /* Fetch vertex data from GSVS ring */
6398 for (i = 0; i < gsinfo->num_outputs; ++i) {
6399 unsigned chan;
6400
6401 outputs[i].name = gsinfo->output_semantic_name[i];
6402 outputs[i].sid = gsinfo->output_semantic_index[i];
6403
6404 for (chan = 0; chan < 4; chan++) {
6405 args[2] = lp_build_const_int32(gallivm,
6406 (i * 4 + chan) *
6407 gs->selector->gs_max_out_vertices * 16 * 4);
6408
6409 outputs[i].values[chan] =
6410 LLVMBuildBitCast(gallivm->builder,
6411 lp_build_intrinsic(gallivm->builder,
6412 "llvm.SI.buffer.load.dword.i32.i32",
6413 ctx->i32, args, 9,
6414 LLVMReadOnlyAttribute),
6415 ctx->f32, "");
6416 }
6417 }
6418
6419 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6420
6421 LLVMBuildRetVoid(gallivm->builder);
6422
6423 /* Dump LLVM IR before any optimization passes */
6424 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6425 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6426 LLVMDumpModule(bld_base->base.gallivm->module);
6427
6428 radeon_llvm_finalize_module(&ctx->radeon_bld);
6429
6430 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6431 &ctx->shader->config, ctx->tm,
6432 bld_base->base.gallivm->module,
6433 debug, PIPE_SHADER_GEOMETRY,
6434 "GS Copy Shader");
6435 if (!r) {
6436 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6437 fprintf(stderr, "GS Copy Shader:\n");
6438 si_shader_dump(sscreen, ctx->shader, debug,
6439 PIPE_SHADER_GEOMETRY, stderr);
6440 r = si_shader_binary_upload(sscreen, ctx->shader);
6441 }
6442
6443 radeon_llvm_dispose(&ctx->radeon_bld);
6444
6445 FREE(outputs);
6446 return r;
6447 }
6448
/* Print the shader key fields relevant to the given shader stage to \p f,
 * for debugging dumps.
 */
static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
			       FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		fprintf(f, "  instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	/* GS and CS have no key fields worth printing. */
	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
		fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
		fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
		fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}
}
6504
6505 static void si_init_shader_ctx(struct si_shader_context *ctx,
6506 struct si_screen *sscreen,
6507 struct si_shader *shader,
6508 LLVMTargetMachineRef tm)
6509 {
6510 struct lp_build_tgsi_context *bld_base;
6511 struct lp_build_tgsi_action tmpl = {};
6512
6513 memset(ctx, 0, sizeof(*ctx));
6514 radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
6515 ctx->tm = tm;
6516 ctx->screen = sscreen;
6517 if (shader && shader->selector)
6518 ctx->type = shader->selector->info.processor;
6519 else
6520 ctx->type = -1;
6521 ctx->shader = shader;
6522
6523 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6524 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6525 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6526 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6527 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6528 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6529 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6530 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6531 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6532 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6533 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6534 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6535
6536 bld_base = &ctx->radeon_bld.soa.bld_base;
6537 if (shader && shader->selector)
6538 bld_base->info = &shader->selector->info;
6539 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6540
6541 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6542 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6543 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6544
6545 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6546 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6547 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6548 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6549 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6550 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6551 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6552 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6553 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6554 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6555 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6556 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6557 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6558 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6559
6560 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6561 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6562 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6563 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6564 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6565 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6566
6567 tmpl.fetch_args = atomic_fetch_args;
6568 tmpl.emit = atomic_emit;
6569 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6570 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6571 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6572 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6573 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6574 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6575 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6576 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6577 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6578 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6579 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6580 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6581 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6582 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6583 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6584 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6585 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6586 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6587 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6588 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6589
6590 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6591
6592 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6593 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6594 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6595 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6596
6597 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6598 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6599 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6600
6601 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6602 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6603 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6604 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6605 }
6606
6607 int si_compile_tgsi_shader(struct si_screen *sscreen,
6608 LLVMTargetMachineRef tm,
6609 struct si_shader *shader,
6610 bool is_monolithic,
6611 struct pipe_debug_callback *debug)
6612 {
6613 struct si_shader_selector *sel = shader->selector;
6614 struct si_shader_context ctx;
6615 struct lp_build_tgsi_context *bld_base;
6616 LLVMModuleRef mod;
6617 int r = 0;
6618
6619 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6620 * conversion fails. */
6621 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6622 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6623 tgsi_dump(sel->tokens, 0);
6624 si_dump_streamout(&sel->so);
6625 }
6626
6627 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6628 ctx.is_monolithic = is_monolithic;
6629
6630 shader->info.uses_instanceid = sel->info.uses_instanceid;
6631
6632 bld_base = &ctx.radeon_bld.soa.bld_base;
6633 ctx.radeon_bld.load_system_value = declare_system_value;
6634
6635 switch (ctx.type) {
6636 case PIPE_SHADER_VERTEX:
6637 ctx.radeon_bld.load_input = declare_input_vs;
6638 if (shader->key.vs.as_ls)
6639 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6640 else if (shader->key.vs.as_es)
6641 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6642 else
6643 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6644 break;
6645 case PIPE_SHADER_TESS_CTRL:
6646 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6647 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6648 bld_base->emit_store = store_output_tcs;
6649 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6650 break;
6651 case PIPE_SHADER_TESS_EVAL:
6652 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6653 if (shader->key.tes.as_es)
6654 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6655 else
6656 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6657 break;
6658 case PIPE_SHADER_GEOMETRY:
6659 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6660 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6661 break;
6662 case PIPE_SHADER_FRAGMENT:
6663 ctx.radeon_bld.load_input = declare_input_fs;
6664 if (is_monolithic)
6665 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6666 else
6667 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6668 break;
6669 case PIPE_SHADER_COMPUTE:
6670 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6671 break;
6672 default:
6673 assert(!"Unsupported shader type");
6674 return -1;
6675 }
6676
6677 create_meta_data(&ctx);
6678 create_function(&ctx);
6679 preload_constants(&ctx);
6680 preload_shader_buffers(&ctx);
6681 preload_samplers(&ctx);
6682 preload_images(&ctx);
6683 preload_streamout_buffers(&ctx);
6684 preload_ring_buffers(&ctx);
6685
6686 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6687 shader->key.ps.prolog.poly_stipple) {
6688 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6689 SI_PARAM_RW_BUFFERS);
6690 si_llvm_emit_polygon_stipple(&ctx, list,
6691 SI_PARAM_POS_FIXED_PT);
6692 }
6693
6694 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6695 int i;
6696 for (i = 0; i < 4; i++) {
6697 ctx.gs_next_vertex[i] =
6698 lp_build_alloca(bld_base->base.gallivm,
6699 ctx.i32, "");
6700 }
6701 }
6702
6703 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6704 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6705 goto out;
6706 }
6707
6708 si_llvm_build_ret(&ctx, ctx.return_value);
6709 mod = bld_base->base.gallivm->module;
6710
6711 /* Dump LLVM IR before any optimization passes */
6712 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6713 r600_can_dump_shader(&sscreen->b, ctx.type))
6714 LLVMDumpModule(mod);
6715
6716 radeon_llvm_finalize_module(&ctx.radeon_bld);
6717
6718 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6719 mod, debug, ctx.type, "TGSI shader");
6720 if (r) {
6721 fprintf(stderr, "LLVM failed to compile shader\n");
6722 goto out;
6723 }
6724
6725 radeon_llvm_dispose(&ctx.radeon_bld);
6726
6727 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6728 * LLVM 3.9svn has this bug.
6729 */
6730 if (sel->type == PIPE_SHADER_COMPUTE) {
6731 unsigned *props = sel->info.properties;
6732 unsigned wave_size = 64;
6733 unsigned max_vgprs = 256;
6734 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6735 unsigned max_sgprs_per_wave = 128;
6736 unsigned min_waves_per_cu =
6737 DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
6738 props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
6739 props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
6740 wave_size);
6741 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6742
6743 max_vgprs = max_vgprs / min_waves_per_simd;
6744 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6745
6746 if (shader->config.num_sgprs > max_sgprs ||
6747 shader->config.num_vgprs > max_vgprs) {
6748 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6749 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6750 shader->config.num_sgprs, shader->config.num_vgprs,
6751 max_sgprs, max_vgprs);
6752
6753 /* Just terminate the process, because dependent
6754 * shaders can hang due to bad input data, but use
6755 * the env var to allow shader-db to work.
6756 */
6757 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6758 abort();
6759 }
6760 }
6761
6762 /* Add the scratch offset to input SGPRs. */
6763 if (shader->config.scratch_bytes_per_wave)
6764 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6765
6766 /* Calculate the number of fragment input VGPRs. */
6767 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6768 shader->info.num_input_vgprs = 0;
6769 shader->info.face_vgpr_index = -1;
6770
6771 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6772 shader->info.num_input_vgprs += 2;
6773 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6774 shader->info.num_input_vgprs += 2;
6775 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6776 shader->info.num_input_vgprs += 2;
6777 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6778 shader->info.num_input_vgprs += 3;
6779 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6780 shader->info.num_input_vgprs += 2;
6781 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6782 shader->info.num_input_vgprs += 2;
6783 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6784 shader->info.num_input_vgprs += 2;
6785 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6786 shader->info.num_input_vgprs += 1;
6787 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6788 shader->info.num_input_vgprs += 1;
6789 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6790 shader->info.num_input_vgprs += 1;
6791 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6792 shader->info.num_input_vgprs += 1;
6793 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6794 shader->info.num_input_vgprs += 1;
6795 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6796 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6797 shader->info.num_input_vgprs += 1;
6798 }
6799 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6800 shader->info.num_input_vgprs += 1;
6801 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6802 shader->info.num_input_vgprs += 1;
6803 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6804 shader->info.num_input_vgprs += 1;
6805 }
6806
6807 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6808 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6809 shader->gs_copy_shader->selector = shader->selector;
6810 ctx.shader = shader->gs_copy_shader;
6811 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6812 shader, debug))) {
6813 free(shader->gs_copy_shader);
6814 shader->gs_copy_shader = NULL;
6815 goto out;
6816 }
6817 }
6818
6819 out:
6820 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6821 FREE(ctx.constants[i]);
6822 return r;
6823 }
6824
6825 /**
6826 * Create, compile and return a shader part (prolog or epilog).
6827 *
6828 * \param sscreen screen
6829 * \param list list of shader parts of the same category
6830 * \param key shader part key
6831 * \param tm LLVM target machine
6832 * \param debug debug callback
6833 * \param compile the callback responsible for compilation
6834 * \return non-NULL on success
6835 */
6836 static struct si_shader_part *
6837 si_get_shader_part(struct si_screen *sscreen,
6838 struct si_shader_part **list,
6839 union si_shader_part_key *key,
6840 LLVMTargetMachineRef tm,
6841 struct pipe_debug_callback *debug,
6842 bool (*compile)(struct si_screen *,
6843 LLVMTargetMachineRef,
6844 struct pipe_debug_callback *,
6845 struct si_shader_part *))
6846 {
6847 struct si_shader_part *result;
6848
6849 pipe_mutex_lock(sscreen->shader_parts_mutex);
6850
6851 /* Find existing. */
6852 for (result = *list; result; result = result->next) {
6853 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6854 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6855 return result;
6856 }
6857 }
6858
6859 /* Compile a new one. */
6860 result = CALLOC_STRUCT(si_shader_part);
6861 result->key = *key;
6862 if (!compile(sscreen, tm, debug, result)) {
6863 FREE(result);
6864 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6865 return NULL;
6866 }
6867
6868 result->next = *list;
6869 *list = result;
6870 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6871 return result;
6872 }
6873
6874 /**
6875 * Create a vertex shader prolog.
6876 *
6877 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6878 * All inputs are returned unmodified. The vertex load indices are
6879 * stored after them, which will used by the API VS for fetching inputs.
6880 *
6881 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6882 * input_v0,
6883 * input_v1,
6884 * input_v2,
6885 * input_v3,
6886 * (VertexID + BaseVertex),
6887 * (InstanceID + StartInstance),
6888 * (InstanceID / 2 + StartInstance)
6889 */
6890 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6891 LLVMTargetMachineRef tm,
6892 struct pipe_debug_callback *debug,
6893 struct si_shader_part *out)
6894 {
6895 union si_shader_part_key *key = &out->key;
6896 struct si_shader shader = {};
6897 struct si_shader_context ctx;
6898 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6899 LLVMTypeRef *params, *returns;
6900 LLVMValueRef ret, func;
6901 int last_sgpr, num_params, num_returns, i;
6902 bool status = true;
6903
6904 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6905 ctx.type = PIPE_SHADER_VERTEX;
6906 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6907 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6908
6909 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6910 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6911 sizeof(LLVMTypeRef));
6912 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6913 key->vs_prolog.last_input + 1) *
6914 sizeof(LLVMTypeRef));
6915 num_params = 0;
6916 num_returns = 0;
6917
6918 /* Declare input and output SGPRs. */
6919 num_params = 0;
6920 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6921 params[num_params++] = ctx.i32;
6922 returns[num_returns++] = ctx.i32;
6923 }
6924 last_sgpr = num_params - 1;
6925
6926 /* 4 preloaded VGPRs (outputs must be floats) */
6927 for (i = 0; i < 4; i++) {
6928 params[num_params++] = ctx.i32;
6929 returns[num_returns++] = ctx.f32;
6930 }
6931
6932 /* Vertex load indices. */
6933 for (i = 0; i <= key->vs_prolog.last_input; i++)
6934 returns[num_returns++] = ctx.f32;
6935
6936 /* Create the function. */
6937 si_create_function(&ctx, returns, num_returns, params,
6938 num_params, last_sgpr);
6939 func = ctx.radeon_bld.main_fn;
6940
6941 /* Copy inputs to outputs. This should be no-op, as the registers match,
6942 * but it will prevent the compiler from overwriting them unintentionally.
6943 */
6944 ret = ctx.return_value;
6945 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6946 LLVMValueRef p = LLVMGetParam(func, i);
6947 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6948 }
6949 for (i = num_params - 4; i < num_params; i++) {
6950 LLVMValueRef p = LLVMGetParam(func, i);
6951 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6952 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6953 }
6954
6955 /* Compute vertex load indices from instance divisors. */
6956 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6957 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6958 LLVMValueRef index;
6959
6960 if (divisor) {
6961 /* InstanceID / Divisor + StartInstance */
6962 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6963 SI_SGPR_START_INSTANCE,
6964 divisor);
6965 } else {
6966 /* VertexID + BaseVertex */
6967 index = LLVMBuildAdd(gallivm->builder,
6968 LLVMGetParam(func, ctx.param_vertex_id),
6969 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6970 }
6971
6972 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6973 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6974 num_params++, "");
6975 }
6976
6977 /* Compile. */
6978 si_llvm_build_ret(&ctx, ret);
6979 radeon_llvm_finalize_module(&ctx.radeon_bld);
6980
6981 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6982 gallivm->module, debug, ctx.type,
6983 "Vertex Shader Prolog"))
6984 status = false;
6985
6986 radeon_llvm_dispose(&ctx.radeon_bld);
6987 return status;
6988 }
6989
6990 /**
6991 * Compile the vertex shader epilog. This is also used by the tessellation
6992 * evaluation shader compiled as VS.
6993 *
6994 * The input is PrimitiveID.
6995 *
6996 * If PrimitiveID is required by the pixel shader, export it.
6997 * Otherwise, do nothing.
6998 */
6999 static bool si_compile_vs_epilog(struct si_screen *sscreen,
7000 LLVMTargetMachineRef tm,
7001 struct pipe_debug_callback *debug,
7002 struct si_shader_part *out)
7003 {
7004 union si_shader_part_key *key = &out->key;
7005 struct si_shader_context ctx;
7006 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7007 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7008 LLVMTypeRef params[5];
7009 int num_params, i;
7010 bool status = true;
7011
7012 si_init_shader_ctx(&ctx, sscreen, NULL, tm);
7013 ctx.type = PIPE_SHADER_VERTEX;
7014
7015 /* Declare input VGPRs. */
7016 num_params = key->vs_epilog.states.export_prim_id ?
7017 (VS_EPILOG_PRIMID_LOC + 1) : 0;
7018 assert(num_params <= ARRAY_SIZE(params));
7019
7020 for (i = 0; i < num_params; i++)
7021 params[i] = ctx.f32;
7022
7023 /* Create the function. */
7024 si_create_function(&ctx, NULL, 0, params, num_params, -1);
7025
7026 /* Emit exports. */
7027 if (key->vs_epilog.states.export_prim_id) {
7028 struct lp_build_context *base = &bld_base->base;
7029 struct lp_build_context *uint = &bld_base->uint_bld;
7030 LLVMValueRef args[9];
7031
7032 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
7033 args[1] = uint->zero; /* whether the EXEC mask is valid */
7034 args[2] = uint->zero; /* DONE bit */
7035 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
7036 key->vs_epilog.prim_id_param_offset);
7037 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
7038 args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
7039 VS_EPILOG_PRIMID_LOC); /* X */
7040 args[6] = uint->undef; /* Y */
7041 args[7] = uint->undef; /* Z */
7042 args[8] = uint->undef; /* W */
7043
7044 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
7045 LLVMVoidTypeInContext(base->gallivm->context),
7046 args, 9, 0);
7047 }
7048
7049 /* Compile. */
7050 LLVMBuildRetVoid(gallivm->builder);
7051 radeon_llvm_finalize_module(&ctx.radeon_bld);
7052
7053 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7054 gallivm->module, debug, ctx.type,
7055 "Vertex Shader Epilog"))
7056 status = false;
7057
7058 radeon_llvm_dispose(&ctx.radeon_bld);
7059 return status;
7060 }
7061
7062 /**
7063 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7064 */
7065 static bool si_get_vs_epilog(struct si_screen *sscreen,
7066 LLVMTargetMachineRef tm,
7067 struct si_shader *shader,
7068 struct pipe_debug_callback *debug,
7069 struct si_vs_epilog_bits *states)
7070 {
7071 union si_shader_part_key epilog_key;
7072
7073 memset(&epilog_key, 0, sizeof(epilog_key));
7074 epilog_key.vs_epilog.states = *states;
7075
7076 /* Set up the PrimitiveID output. */
7077 if (shader->key.vs.epilog.export_prim_id) {
7078 unsigned index = shader->selector->info.num_outputs;
7079 unsigned offset = shader->info.nr_param_exports++;
7080
7081 epilog_key.vs_epilog.prim_id_param_offset = offset;
7082 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7083 shader->info.vs_output_param_offset[index] = offset;
7084 }
7085
7086 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7087 &epilog_key, tm, debug,
7088 si_compile_vs_epilog);
7089 return shader->epilog != NULL;
7090 }
7091
7092 /**
7093 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7094 */
7095 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7096 LLVMTargetMachineRef tm,
7097 struct si_shader *shader,
7098 struct pipe_debug_callback *debug)
7099 {
7100 struct tgsi_shader_info *info = &shader->selector->info;
7101 union si_shader_part_key prolog_key;
7102 unsigned i;
7103
7104 /* Get the prolog. */
7105 memset(&prolog_key, 0, sizeof(prolog_key));
7106 prolog_key.vs_prolog.states = shader->key.vs.prolog;
7107 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7108 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7109
7110 /* The prolog is a no-op if there are no inputs. */
7111 if (info->num_inputs) {
7112 shader->prolog =
7113 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7114 &prolog_key, tm, debug,
7115 si_compile_vs_prolog);
7116 if (!shader->prolog)
7117 return false;
7118 }
7119
7120 /* Get the epilog. */
7121 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7122 !si_get_vs_epilog(sscreen, tm, shader, debug,
7123 &shader->key.vs.epilog))
7124 return false;
7125
7126 /* Set the instanceID flag. */
7127 for (i = 0; i < info->num_inputs; i++)
7128 if (prolog_key.vs_prolog.states.instance_divisors[i])
7129 shader->info.uses_instanceid = true;
7130
7131 return true;
7132 }
7133
7134 /**
7135 * Select and compile (or reuse) TES parts (epilog).
7136 */
7137 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7138 LLVMTargetMachineRef tm,
7139 struct si_shader *shader,
7140 struct pipe_debug_callback *debug)
7141 {
7142 if (shader->key.tes.as_es)
7143 return true;
7144
7145 /* TES compiled as VS. */
7146 return si_get_vs_epilog(sscreen, tm, shader, debug,
7147 &shader->key.tes.epilog);
7148 }
7149
7150 /**
7151 * Compile the TCS epilog. This writes tesselation factors to memory based on
7152 * the output primitive type of the tesselator (determined by TES).
7153 */
/**
 * Compile the TCS epilog. This writes tesselation factors to memory based on
 * the output primitive type of the tesselator (determined by TES).
 *
 * The function declares the full TCS SGPR layout (SI_PARAM_*) so that
 * parameter indices line up with the main TCS part, but only RW_BUFFERS
 * and TESS_FACTOR_OFFSET are actually used, plus three VGPRs appended
 * after the SGPRs (patch index, invocation ID, LDS offset).
 *
 * \param sscreen  screen
 * \param tm       LLVM target machine
 * \param debug    debug callback
 * \param out      shader part; key is read, binary/config are written
 * \return true on success
 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	/* si_write_tess_factors reads the epilog state from the shader key. */
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* Three VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors using the three VGPR inputs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7214
7215 /**
7216 * Select and compile (or reuse) TCS parts (epilog).
7217 */
7218 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7219 LLVMTargetMachineRef tm,
7220 struct si_shader *shader,
7221 struct pipe_debug_callback *debug)
7222 {
7223 union si_shader_part_key epilog_key;
7224
7225 /* Get the epilog. */
7226 memset(&epilog_key, 0, sizeof(epilog_key));
7227 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7228
7229 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7230 &epilog_key, tm, debug,
7231 si_compile_tcs_epilog);
7232 return shader->epilog != NULL;
7233 }
7234
7235 /**
7236 * Compile the pixel shader prolog. This handles:
7237 * - two-side color selection and interpolation
7238 * - overriding interpolation parameters for the API PS
7239 * - polygon stippling
7240 *
7241 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7242 * overriden by other states. (e.g. per-sample interpolation)
7243 * Interpolated colors are stored after the preloaded VGPRs.
7244 */
7245 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7246 LLVMTargetMachineRef tm,
7247 struct pipe_debug_callback *debug,
7248 struct si_shader_part *out)
7249 {
7250 union si_shader_part_key *key = &out->key;
7251 struct si_shader shader = {};
7252 struct si_shader_context ctx;
7253 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7254 LLVMTypeRef *params;
7255 LLVMValueRef ret, func;
7256 int last_sgpr, num_params, num_returns, i, num_color_channels;
7257 bool status = true;
7258
7259 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7260 ctx.type = PIPE_SHADER_FRAGMENT;
7261 shader.key.ps.prolog = key->ps_prolog.states;
7262
7263 /* Number of inputs + 8 color elements. */
7264 params = alloca((key->ps_prolog.num_input_sgprs +
7265 key->ps_prolog.num_input_vgprs + 8) *
7266 sizeof(LLVMTypeRef));
7267
7268 /* Declare inputs. */
7269 num_params = 0;
7270 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7271 params[num_params++] = ctx.i32;
7272 last_sgpr = num_params - 1;
7273
7274 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7275 params[num_params++] = ctx.f32;
7276
7277 /* Declare outputs (same as inputs + add colors if needed) */
7278 num_returns = num_params;
7279 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7280 for (i = 0; i < num_color_channels; i++)
7281 params[num_returns++] = ctx.f32;
7282
7283 /* Create the function. */
7284 si_create_function(&ctx, params, num_returns, params,
7285 num_params, last_sgpr);
7286 func = ctx.radeon_bld.main_fn;
7287
7288 /* Copy inputs to outputs. This should be no-op, as the registers match,
7289 * but it will prevent the compiler from overwriting them unintentionally.
7290 */
7291 ret = ctx.return_value;
7292 for (i = 0; i < num_params; i++) {
7293 LLVMValueRef p = LLVMGetParam(func, i);
7294 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7295 }
7296
7297 /* Polygon stippling. */
7298 if (key->ps_prolog.states.poly_stipple) {
7299 /* POS_FIXED_PT is always last. */
7300 unsigned pos = key->ps_prolog.num_input_sgprs +
7301 key->ps_prolog.num_input_vgprs - 1;
7302 LLVMValueRef ptr[2], list;
7303
7304 /* Get the pointer to rw buffers. */
7305 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7306 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7307 list = lp_build_gather_values(gallivm, ptr, 2);
7308 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7309 list = LLVMBuildIntToPtr(gallivm->builder, list,
7310 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7311
7312 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7313 }
7314
7315 if (key->ps_prolog.states.bc_optimize_for_persp ||
7316 key->ps_prolog.states.bc_optimize_for_linear) {
7317 unsigned i, base = key->ps_prolog.num_input_sgprs;
7318 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7319
7320 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7321 * The hw doesn't compute CENTROID if the whole wave only
7322 * contains fully-covered quads.
7323 *
7324 * PRIM_MASK is after user SGPRs.
7325 */
7326 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7327 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7328 LLVMConstInt(ctx.i32, 31, 0), "");
7329 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7330 ctx.i1, "");
7331
7332 if (key->ps_prolog.states.bc_optimize_for_persp) {
7333 /* Read PERSP_CENTER. */
7334 for (i = 0; i < 2; i++)
7335 center[i] = LLVMGetParam(func, base + 2 + i);
7336 /* Read PERSP_CENTROID. */
7337 for (i = 0; i < 2; i++)
7338 centroid[i] = LLVMGetParam(func, base + 4 + i);
7339 /* Select PERSP_CENTROID. */
7340 for (i = 0; i < 2; i++) {
7341 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7342 center[i], centroid[i], "");
7343 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7344 tmp, base + 4 + i, "");
7345 }
7346 }
7347 if (key->ps_prolog.states.bc_optimize_for_linear) {
7348 /* Read LINEAR_CENTER. */
7349 for (i = 0; i < 2; i++)
7350 center[i] = LLVMGetParam(func, base + 8 + i);
7351 /* Read LINEAR_CENTROID. */
7352 for (i = 0; i < 2; i++)
7353 centroid[i] = LLVMGetParam(func, base + 10 + i);
7354 /* Select LINEAR_CENTROID. */
7355 for (i = 0; i < 2; i++) {
7356 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7357 center[i], centroid[i], "");
7358 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7359 tmp, base + 10 + i, "");
7360 }
7361 }
7362 }
7363
7364 /* Interpolate colors. */
7365 for (i = 0; i < 2; i++) {
7366 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7367 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7368 key->ps_prolog.face_vgpr_index;
7369 LLVMValueRef interp[2], color[4];
7370 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7371
7372 if (!writemask)
7373 continue;
7374
7375 /* If the interpolation qualifier is not CONSTANT (-1). */
7376 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7377 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7378 key->ps_prolog.color_interp_vgpr_index[i];
7379
7380 /* Get the (i,j) updated by bc_optimize handling. */
7381 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7382 interp_vgpr, "");
7383 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7384 interp_vgpr + 1, "");
7385 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7386 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7387 ctx.v2i32, "");
7388 }
7389
7390 /* Use the absolute location of the input. */
7391 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7392
7393 if (key->ps_prolog.states.color_two_side) {
7394 face = LLVMGetParam(func, face_vgpr);
7395 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7396 }
7397
7398 interp_fs_input(&ctx,
7399 key->ps_prolog.color_attr_index[i],
7400 TGSI_SEMANTIC_COLOR, i,
7401 key->ps_prolog.num_interp_inputs,
7402 key->ps_prolog.colors_read, interp_ij,
7403 prim_mask, face, color);
7404
7405 while (writemask) {
7406 unsigned chan = u_bit_scan(&writemask);
7407 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7408 num_params++, "");
7409 }
7410 }
7411
7412 /* Force per-sample interpolation. */
7413 if (key->ps_prolog.states.force_persp_sample_interp) {
7414 unsigned i, base = key->ps_prolog.num_input_sgprs;
7415 LLVMValueRef persp_sample[2];
7416
7417 /* Read PERSP_SAMPLE. */
7418 for (i = 0; i < 2; i++)
7419 persp_sample[i] = LLVMGetParam(func, base + i);
7420 /* Overwrite PERSP_CENTER. */
7421 for (i = 0; i < 2; i++)
7422 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7423 persp_sample[i], base + 2 + i, "");
7424 /* Overwrite PERSP_CENTROID. */
7425 for (i = 0; i < 2; i++)
7426 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7427 persp_sample[i], base + 4 + i, "");
7428 }
7429 if (key->ps_prolog.states.force_linear_sample_interp) {
7430 unsigned i, base = key->ps_prolog.num_input_sgprs;
7431 LLVMValueRef linear_sample[2];
7432
7433 /* Read LINEAR_SAMPLE. */
7434 for (i = 0; i < 2; i++)
7435 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7436 /* Overwrite LINEAR_CENTER. */
7437 for (i = 0; i < 2; i++)
7438 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7439 linear_sample[i], base + 8 + i, "");
7440 /* Overwrite LINEAR_CENTROID. */
7441 for (i = 0; i < 2; i++)
7442 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7443 linear_sample[i], base + 10 + i, "");
7444 }
7445
7446 /* Force center interpolation. */
7447 if (key->ps_prolog.states.force_persp_center_interp) {
7448 unsigned i, base = key->ps_prolog.num_input_sgprs;
7449 LLVMValueRef persp_center[2];
7450
7451 /* Read PERSP_CENTER. */
7452 for (i = 0; i < 2; i++)
7453 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7454 /* Overwrite PERSP_SAMPLE. */
7455 for (i = 0; i < 2; i++)
7456 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7457 persp_center[i], base + i, "");
7458 /* Overwrite PERSP_CENTROID. */
7459 for (i = 0; i < 2; i++)
7460 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7461 persp_center[i], base + 4 + i, "");
7462 }
7463 if (key->ps_prolog.states.force_linear_center_interp) {
7464 unsigned i, base = key->ps_prolog.num_input_sgprs;
7465 LLVMValueRef linear_center[2];
7466
7467 /* Read LINEAR_CENTER. */
7468 for (i = 0; i < 2; i++)
7469 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7470 /* Overwrite LINEAR_SAMPLE. */
7471 for (i = 0; i < 2; i++)
7472 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7473 linear_center[i], base + 6 + i, "");
7474 /* Overwrite LINEAR_CENTROID. */
7475 for (i = 0; i < 2; i++)
7476 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7477 linear_center[i], base + 10 + i, "");
7478 }
7479
7480 /* Tell LLVM to insert WQM instruction sequence when needed. */
7481 if (key->ps_prolog.wqm) {
7482 LLVMAddTargetDependentFunctionAttr(func,
7483 "amdgpu-ps-wqm-outputs", "");
7484 }
7485
7486 /* Compile. */
7487 si_llvm_build_ret(&ctx, ret);
7488 radeon_llvm_finalize_module(&ctx.radeon_bld);
7489
7490 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7491 gallivm->module, debug, ctx.type,
7492 "Fragment Shader Prolog"))
7493 status = false;
7494
7495 radeon_llvm_dispose(&ctx.radeon_bld);
7496 return status;
7497 }
7498
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * The epilog is a stand-alone shader part: it receives the API shader's
 * outputs (colors, Z, stencil, sample mask) as VGPR parameters and performs
 * the actual export instructions. \p out->key selects the variant; the
 * compiled binary and register config are written into \p out.
 *
 * \param sscreen  screen owning the shader-part caches
 * \param tm       LLVM target machine used for compilation
 * \param debug    debug callback for compiler diagnostics
 * \param out      shader part to fill in (key is already set by the caller)
 * \return         true on success, false if LLVM compilation failed
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Address-of only here; ctx itself is initialized by
	 * si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* Worst case: SGPR params + 8 MRTs * 4 channels + Z/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params, i;
	bool status = true;
	struct si_ps_exports exp = {};

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: one slot per written color channel plus one
	 * each for Z, stencil and sample mask if written. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Ensure the sample-mask VGPR slot exists even when few outputs are
	 * written — si_export_mrt_color reads it via num_params - 1 below. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, so it can carry the "done" flag.
	 * Only relevant when no Z/stencil/samplemask export follows. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* Last MRT that is both written and has a non-zero
			 * export format (4 format bits per MRT). */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Emit one MRT export per written color output; vgpr walks the
	 * parameter list in declaration order. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		/* num_params - 1 is the sample-mask VGPR index guaranteed by
		 * the MAX2 above (NOTE(review): used for coverage-based
		 * alpha — confirm against si_export_mrt_color). */
		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	/* The hardware requires at least one export; emit a null export
	 * when there is neither a Z-like nor a color export. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(&ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7618
7619 /**
7620 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7621 */
7622 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7623 LLVMTargetMachineRef tm,
7624 struct si_shader *shader,
7625 struct pipe_debug_callback *debug)
7626 {
7627 struct tgsi_shader_info *info = &shader->selector->info;
7628 union si_shader_part_key prolog_key;
7629 union si_shader_part_key epilog_key;
7630 unsigned i;
7631
7632 /* Get the prolog. */
7633 memset(&prolog_key, 0, sizeof(prolog_key));
7634 prolog_key.ps_prolog.states = shader->key.ps.prolog;
7635 prolog_key.ps_prolog.colors_read = info->colors_read;
7636 prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7637 prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7638 prolog_key.ps_prolog.wqm = info->uses_derivatives &&
7639 (prolog_key.ps_prolog.colors_read ||
7640 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7641 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7642 prolog_key.ps_prolog.states.force_persp_center_interp ||
7643 prolog_key.ps_prolog.states.force_linear_center_interp ||
7644 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7645 prolog_key.ps_prolog.states.bc_optimize_for_linear);
7646
7647 if (info->colors_read) {
7648 unsigned *color = shader->selector->color_attr_index;
7649
7650 if (shader->key.ps.prolog.color_two_side) {
7651 /* BCOLORs are stored after the last input. */
7652 prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
7653 prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7654 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7655 }
7656
7657 for (i = 0; i < 2; i++) {
7658 unsigned interp = info->input_interpolate[color[i]];
7659 unsigned location = info->input_interpolate_loc[color[i]];
7660
7661 if (!(info->colors_read & (0xf << i*4)))
7662 continue;
7663
7664 prolog_key.ps_prolog.color_attr_index[i] = color[i];
7665
7666 if (shader->key.ps.prolog.flatshade_colors &&
7667 interp == TGSI_INTERPOLATE_COLOR)
7668 interp = TGSI_INTERPOLATE_CONSTANT;
7669
7670 switch (interp) {
7671 case TGSI_INTERPOLATE_CONSTANT:
7672 prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
7673 break;
7674 case TGSI_INTERPOLATE_PERSPECTIVE:
7675 case TGSI_INTERPOLATE_COLOR:
7676 /* Force the interpolation location for colors here. */
7677 if (shader->key.ps.prolog.force_persp_sample_interp)
7678 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7679 if (shader->key.ps.prolog.force_persp_center_interp)
7680 location = TGSI_INTERPOLATE_LOC_CENTER;
7681
7682 switch (location) {
7683 case TGSI_INTERPOLATE_LOC_SAMPLE:
7684 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
7685 shader->config.spi_ps_input_ena |=
7686 S_0286CC_PERSP_SAMPLE_ENA(1);
7687 break;
7688 case TGSI_INTERPOLATE_LOC_CENTER:
7689 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
7690 shader->config.spi_ps_input_ena |=
7691 S_0286CC_PERSP_CENTER_ENA(1);
7692 break;
7693 case TGSI_INTERPOLATE_LOC_CENTROID:
7694 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
7695 shader->config.spi_ps_input_ena |=
7696 S_0286CC_PERSP_CENTROID_ENA(1);
7697 break;
7698 default:
7699 assert(0);
7700 }
7701 break;
7702 case TGSI_INTERPOLATE_LINEAR:
7703 /* Force the interpolation location for colors here. */
7704 if (shader->key.ps.prolog.force_linear_sample_interp)
7705 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7706 if (shader->key.ps.prolog.force_linear_center_interp)
7707 location = TGSI_INTERPOLATE_LOC_CENTER;
7708
7709 switch (location) {
7710 case TGSI_INTERPOLATE_LOC_SAMPLE:
7711 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
7712 shader->config.spi_ps_input_ena |=
7713 S_0286CC_LINEAR_SAMPLE_ENA(1);
7714 break;
7715 case TGSI_INTERPOLATE_LOC_CENTER:
7716 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
7717 shader->config.spi_ps_input_ena |=
7718 S_0286CC_LINEAR_CENTER_ENA(1);
7719 break;
7720 case TGSI_INTERPOLATE_LOC_CENTROID:
7721 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
7722 shader->config.spi_ps_input_ena |=
7723 S_0286CC_LINEAR_CENTROID_ENA(1);
7724 break;
7725 default:
7726 assert(0);
7727 }
7728 break;
7729 default:
7730 assert(0);
7731 }
7732 }
7733 }
7734
7735 /* The prolog is a no-op if these aren't set. */
7736 if (prolog_key.ps_prolog.colors_read ||
7737 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7738 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7739 prolog_key.ps_prolog.states.force_persp_center_interp ||
7740 prolog_key.ps_prolog.states.force_linear_center_interp ||
7741 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7742 prolog_key.ps_prolog.states.bc_optimize_for_linear ||
7743 prolog_key.ps_prolog.states.poly_stipple) {
7744 shader->prolog =
7745 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7746 &prolog_key, tm, debug,
7747 si_compile_ps_prolog);
7748 if (!shader->prolog)
7749 return false;
7750 }
7751
7752 /* Get the epilog. */
7753 memset(&epilog_key, 0, sizeof(epilog_key));
7754 epilog_key.ps_epilog.colors_written = info->colors_written;
7755 epilog_key.ps_epilog.writes_z = info->writes_z;
7756 epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
7757 epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
7758 epilog_key.ps_epilog.states = shader->key.ps.epilog;
7759
7760 shader->epilog =
7761 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7762 &epilog_key, tm, debug,
7763 si_compile_ps_epilog);
7764 if (!shader->epilog)
7765 return false;
7766
7767 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7768 if (shader->key.ps.prolog.poly_stipple) {
7769 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7770 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7771 }
7772
7773 /* Set up the enable bits for per-sample shading if needed. */
7774 if (shader->key.ps.prolog.force_persp_sample_interp &&
7775 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7776 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7777 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7778 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7779 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7780 }
7781 if (shader->key.ps.prolog.force_linear_sample_interp &&
7782 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7783 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7784 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7785 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7786 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7787 }
7788 if (shader->key.ps.prolog.force_persp_center_interp &&
7789 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7790 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7791 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7792 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7793 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7794 }
7795 if (shader->key.ps.prolog.force_linear_center_interp &&
7796 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7797 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7798 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7799 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7800 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7801 }
7802
7803 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7804 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7805 !(shader->config.spi_ps_input_ena & 0xf)) {
7806 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7807 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7808 }
7809
7810 /* At least one pair of interpolation weights must be enabled. */
7811 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7812 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7813 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7814 }
7815
7816 /* The sample mask input is always enabled, because the API shader always
7817 * passes it through to the epilog. Disable it here if it's unused.
7818 */
7819 if (!shader->key.ps.epilog.poly_line_smoothing &&
7820 !shader->selector->info.reads_samplemask)
7821 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7822
7823 return true;
7824 }
7825
7826 static void si_fix_num_sgprs(struct si_shader *shader)
7827 {
7828 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7829
7830 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7831 }
7832
7833 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7834 struct si_shader *shader,
7835 struct pipe_debug_callback *debug)
7836 {
7837 struct si_shader *mainp = shader->selector->main_shader_part;
7838 int r;
7839
7840 /* LS, ES, VS are compiled on demand if the main part hasn't been
7841 * compiled for that stage.
7842 */
7843 if (!mainp ||
7844 (shader->selector->type == PIPE_SHADER_VERTEX &&
7845 (shader->key.vs.as_es != mainp->key.vs.as_es ||
7846 shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
7847 (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
7848 shader->key.tes.as_es != mainp->key.tes.as_es) ||
7849 (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
7850 shader->key.tcs.epilog.inputs_to_copy) ||
7851 shader->selector->type == PIPE_SHADER_COMPUTE) {
7852 /* Monolithic shader (compiled as a whole, has many variants,
7853 * may take a long time to compile).
7854 */
7855 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7856 if (r)
7857 return r;
7858 } else {
7859 /* The shader consists of 2-3 parts:
7860 *
7861 * - the middle part is the user shader, it has 1 variant only
7862 * and it was compiled during the creation of the shader
7863 * selector
7864 * - the prolog part is inserted at the beginning
7865 * - the epilog part is inserted at the end
7866 *
7867 * The prolog and epilog have many (but simple) variants.
7868 */
7869
7870 /* Copy the compiled TGSI shader data over. */
7871 shader->is_binary_shared = true;
7872 shader->binary = mainp->binary;
7873 shader->config = mainp->config;
7874 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7875 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7876 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7877 memcpy(shader->info.vs_output_param_offset,
7878 mainp->info.vs_output_param_offset,
7879 sizeof(mainp->info.vs_output_param_offset));
7880 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7881 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7882 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7883
7884 /* Select prologs and/or epilogs. */
7885 switch (shader->selector->type) {
7886 case PIPE_SHADER_VERTEX:
7887 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7888 return -1;
7889 break;
7890 case PIPE_SHADER_TESS_CTRL:
7891 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7892 return -1;
7893 break;
7894 case PIPE_SHADER_TESS_EVAL:
7895 if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
7896 return -1;
7897 break;
7898 case PIPE_SHADER_FRAGMENT:
7899 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7900 return -1;
7901
7902 /* Make sure we have at least as many VGPRs as there
7903 * are allocated inputs.
7904 */
7905 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7906 shader->info.num_input_vgprs);
7907 break;
7908 }
7909
7910 /* Update SGPR and VGPR counts. */
7911 if (shader->prolog) {
7912 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7913 shader->prolog->config.num_sgprs);
7914 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7915 shader->prolog->config.num_vgprs);
7916 }
7917 if (shader->epilog) {
7918 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7919 shader->epilog->config.num_sgprs);
7920 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7921 shader->epilog->config.num_vgprs);
7922 }
7923 }
7924
7925 si_fix_num_sgprs(shader);
7926 si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
7927 stderr);
7928
7929 /* Upload. */
7930 r = si_shader_binary_upload(sscreen, shader);
7931 if (r) {
7932 fprintf(stderr, "LLVM failed to upload shader\n");
7933 return r;
7934 }
7935
7936 return 0;
7937 }
7938
/* Release all GPU buffers and CPU allocations owned by a shader variant.
 * Does not free the si_shader struct itself — that is the caller's job. */
void si_shader_destroy(struct si_shader *shader)
{
	/* Destroy the VS->GS copy shader variant first, if one was created. */
	if (shader->gs_copy_shader) {
		si_shader_destroy(shader->gs_copy_shader);
		FREE(shader->gs_copy_shader);
	}

	if (shader->scratch_bo)
		r600_resource_reference(&shader->scratch_bo, NULL);

	r600_resource_reference(&shader->bo, NULL);

	/* Two-part shaders share the binary with the selector's main part;
	 * only clean it when this variant owns it. */
	if (!shader->is_binary_shared)
		radeon_shader_binary_clean(&shader->binary);

	free(shader->shader_log);
}