gallium/radeon: allocate temps array info in radeon_llvm_context_init
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "gallivm/lp_bld_misc.h"
37 #include "radeon/r600_cs.h"
38 #include "radeon/radeon_llvm.h"
39 #include "radeon/radeon_elf_util.h"
40 #include "radeon/radeon_llvm_emit.h"
41 #include "util/u_memory.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
54 static const char *scratch_rsrc_dword0_symbol =
55 "SCRATCH_RSRC_DWORD0";
56
57 static const char *scratch_rsrc_dword1_symbol =
58 "SCRATCH_RSRC_DWORD1";
59
/* One shader output as gathered before export: the four component values
 * plus the TGSI semantic that identifies which output this is. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one LLVM value per vec4 component */
	unsigned name;		/* TGSI_SEMANTIC_* */
	unsigned sid;		/* semantic index */
};
66
struct si_shader_context
{
	/* NOTE: must be the first member — si_shader_context() casts an
	 * lp_build_tgsi_context pointer (radeon_bld.soa.bld_base) straight
	 * to this struct. */
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of main-function parameters, used with LLVMGetParam(). */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;

	/* Metadata kind IDs used to annotate loads (see build_indexed_load). */
	unsigned invariant_load_md_kind;
	unsigned range_md_kind;
	unsigned uniform_md_kind;
	LLVMValueRef empty_md;	/* empty metadata node attached with the kinds above */

	/* Cached descriptor/resource values for the current function. */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;	/* pointer to the LDS array used by lds_load/lds_store */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types, cached once per context. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
137
/* Recover the enclosing si_shader_context from a TGSI build context.
 * Valid because radeon_bld (and its soa.bld_base) is the first member
 * of struct si_shader_context, so the pointers coincide. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
143
144 static void si_init_shader_ctx(struct si_shader_context *ctx,
145 struct si_screen *sscreen,
146 struct si_shader *shader,
147 LLVMTargetMachineRef tm);
148
149 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
150 struct lp_build_tgsi_context *bld_base,
151 struct lp_build_emit_data *emit_data);
152
153 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
154 FILE *f);
155
156 /* Ideally pass the sample mask input to the PS epilog as v13, which
157 * is its usual location, so that the shader doesn't have to add v_mov.
158 */
159 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
160
161 /* The VS location of the PrimitiveID input is the same in the epilog,
162 * so that the main shader part doesn't have to move it.
163 */
164 #define VS_EPILOG_PRIMID_LOC 2
165
166 #define PERSPECTIVE_BASE 0
167 #define LINEAR_BASE 9
168
169 #define SAMPLE_OFFSET 0
170 #define CENTER_OFFSET 2
171 #define CENTROID_OFSET 4
172
173 #define USE_SGPR_MAX_SUFFIX_LEN 5
174 #define CONST_ADDR_SPACE 2
175 #define LOCAL_ADDR_SPACE 3
176 #define USER_SGPR_ADDR_SPACE 8
177
178
179 #define SENDMSG_GS 2
180 #define SENDMSG_GS_DONE 3
181
182 #define SENDMSG_GS_OP_NOP (0 << 4)
183 #define SENDMSG_GS_OP_CUT (1 << 4)
184 #define SENDMSG_GS_OP_EMIT (2 << 4)
185 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
186
187 /**
188 * Returns a unique index for a semantic name and index. The index must be
189 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
190 * calculated.
191 */
192 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
193 {
194 switch (semantic_name) {
195 case TGSI_SEMANTIC_POSITION:
196 return 0;
197 case TGSI_SEMANTIC_PSIZE:
198 return 1;
199 case TGSI_SEMANTIC_CLIPDIST:
200 assert(index <= 1);
201 return 2 + index;
202 case TGSI_SEMANTIC_GENERIC:
203 if (index <= 63-4)
204 return 4 + index;
205 else
206 /* same explanation as in the default statement,
207 * the only user hitting this is st/nine.
208 */
209 return 0;
210
211 /* patch indices are completely separate and thus start from 0 */
212 case TGSI_SEMANTIC_TESSOUTER:
213 return 0;
214 case TGSI_SEMANTIC_TESSINNER:
215 return 1;
216 case TGSI_SEMANTIC_PATCH:
217 return 2 + index;
218
219 default:
220 /* Don't fail here. The result of this function is only used
221 * for LS, TCS, TES, and GS, where legacy GL semantics can't
222 * occur, but this function is called for all vertex shaders
223 * before it's known whether LS will be compiled or not.
224 */
225 return 0;
226 }
227 }
228
229 /**
230 * Get the value of a shader input parameter and extract a bitfield.
231 */
232 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
233 unsigned param, unsigned rshift,
234 unsigned bitwidth)
235 {
236 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
237 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
238 param);
239
240 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
241 value = bitcast(&ctx->radeon_bld.soa.bld_base,
242 TGSI_TYPE_UNSIGNED, value);
243
244 if (rshift)
245 value = LLVMBuildLShr(gallivm->builder, value,
246 lp_build_const_int32(gallivm, rshift), "");
247
248 if (rshift + bitwidth < 32) {
249 unsigned mask = (1 << bitwidth) - 1;
250 value = LLVMBuildAnd(gallivm->builder, value,
251 lp_build_const_int32(gallivm, mask), "");
252 }
253
254 return value;
255 }
256
257 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
258 {
259 switch (ctx->type) {
260 case PIPE_SHADER_TESS_CTRL:
261 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
262
263 case PIPE_SHADER_TESS_EVAL:
264 return LLVMGetParam(ctx->radeon_bld.main_fn,
265 ctx->param_tes_rel_patch_id);
266
267 default:
268 assert(0);
269 return NULL;
270 }
271 }
272
273 /* Tessellation shaders pass outputs to the next shader using LDS.
274 *
275 * LS outputs = TCS inputs
276 * TCS outputs = TES inputs
277 *
278 * The LDS layout is:
279 * - TCS inputs for patch 0
280 * - TCS inputs for patch 1
281 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
282 * - ...
283 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
284 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
285 * - TCS outputs for patch 1
286 * - Per-patch TCS outputs for patch 1
287 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
288 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
289 * - ...
290 *
291 * All three shaders VS(LS), TCS, TES share the same LDS space.
292 */
293
294 static LLVMValueRef
295 get_tcs_in_patch_stride(struct si_shader_context *ctx)
296 {
297 if (ctx->type == PIPE_SHADER_VERTEX)
298 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
299 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
300 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
301 else {
302 assert(0);
303 return NULL;
304 }
305 }
306
/* LDS stride (in dwords) of one patch worth of TCS outputs:
 * bits [0:12] of SI_PARAM_TCS_OUT_LAYOUT. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
312
313 static LLVMValueRef
314 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
315 {
316 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
317 unpack_param(ctx,
318 SI_PARAM_TCS_OUT_OFFSETS,
319 0, 16),
320 4);
321 }
322
323 static LLVMValueRef
324 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
325 {
326 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
327 unpack_param(ctx,
328 SI_PARAM_TCS_OUT_OFFSETS,
329 16, 16),
330 4);
331 }
332
333 static LLVMValueRef
334 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
335 {
336 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
337 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
338 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
339
340 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
341 }
342
343 static LLVMValueRef
344 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
345 {
346 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
347 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
348 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
349 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
350
351 return LLVMBuildAdd(gallivm->builder, patch0_offset,
352 LLVMBuildMul(gallivm->builder, patch_stride,
353 rel_patch_id, ""),
354 "");
355 }
356
357 static LLVMValueRef
358 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
359 {
360 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
361 LLVMValueRef patch0_patch_data_offset =
362 get_tcs_out_patch0_patch_data_offset(ctx);
363 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
364 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
365
366 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
367 LLVMBuildMul(gallivm->builder, patch_stride,
368 rel_patch_id, ""),
369 "");
370 }
371
372 static void build_indexed_store(struct si_shader_context *ctx,
373 LLVMValueRef base_ptr, LLVMValueRef index,
374 LLVMValueRef value)
375 {
376 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
377 struct gallivm_state *gallivm = bld_base->base.gallivm;
378 LLVMValueRef indices[2], pointer;
379
380 indices[0] = bld_base->uint_bld.zero;
381 indices[1] = index;
382
383 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
384 LLVMBuildStore(gallivm->builder, value, pointer);
385 }
386
387 /**
388 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
389 * It's equivalent to doing a load from &base_ptr[index].
390 *
391 * \param base_ptr Where the array starts.
392 * \param index The element index into the array.
393 * \param uniform Whether the base_ptr and index can be assumed to be
394 * dynamically uniform
395 */
396 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
397 LLVMValueRef base_ptr, LLVMValueRef index,
398 bool uniform)
399 {
400 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
401 struct gallivm_state *gallivm = bld_base->base.gallivm;
402 LLVMValueRef indices[2], pointer;
403
404 indices[0] = bld_base->uint_bld.zero;
405 indices[1] = index;
406
407 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
408 if (uniform)
409 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
410 return LLVMBuildLoad(gallivm->builder, pointer, "");
411 }
412
413 /**
414 * Do a load from &base_ptr[index], but also add a flag that it's loading
415 * a constant from a dynamically uniform index.
416 */
417 static LLVMValueRef build_indexed_load_const(
418 struct si_shader_context *ctx,
419 LLVMValueRef base_ptr, LLVMValueRef index)
420 {
421 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
422 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
423 return result;
424 }
425
426 static LLVMValueRef get_instance_index_for_fetch(
427 struct radeon_llvm_context *radeon_bld,
428 unsigned param_start_instance, unsigned divisor)
429 {
430 struct si_shader_context *ctx =
431 si_shader_context(&radeon_bld->soa.bld_base);
432 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
433
434 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
435 ctx->param_instance_id);
436
437 /* The division must be done before START_INSTANCE is added. */
438 if (divisor > 1)
439 result = LLVMBuildUDiv(gallivm->builder, result,
440 lp_build_const_int32(gallivm, divisor), "");
441
442 return LLVMBuildAdd(gallivm->builder, result,
443 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
444 }
445
/* Declare one vertex shader input attribute: fetch it from the vertex
 * buffer with llvm.SI.vs.load.input and store the four components into
 * the SOA input slots.
 *
 * The fetch index comes from one of three places:
 * - the prolog-provided vertex index parameter (non-monolithic shaders),
 * - InstanceID/divisor + StartInstance (instanced attributes),
 * - BaseVertex + VertexID (ordinary per-vertex attributes).
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* The prolog computes one index parameter per attribute. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
514
515 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
516 unsigned swizzle)
517 {
518 struct si_shader_context *ctx = si_shader_context(bld_base);
519
520 if (swizzle > 0)
521 return bld_base->uint_bld.zero;
522
523 switch (ctx->type) {
524 case PIPE_SHADER_VERTEX:
525 return LLVMGetParam(ctx->radeon_bld.main_fn,
526 ctx->param_vs_prim_id);
527 case PIPE_SHADER_TESS_CTRL:
528 return LLVMGetParam(ctx->radeon_bld.main_fn,
529 SI_PARAM_PATCH_ID);
530 case PIPE_SHADER_TESS_EVAL:
531 return LLVMGetParam(ctx->radeon_bld.main_fn,
532 ctx->param_tes_patch_id);
533 case PIPE_SHADER_GEOMETRY:
534 return LLVMGetParam(ctx->radeon_bld.main_fn,
535 SI_PARAM_PRIMITIVE_ID);
536 default:
537 assert(0);
538 return bld_base->uint_bld.zero;
539 }
540 }
541
542 /**
543 * Return the value of tgsi_ind_register for indexing.
544 * This is the indirect index with the constant offset added to it.
545 */
546 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
547 const struct tgsi_ind_register *ind,
548 int rel_index)
549 {
550 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
551 LLVMValueRef result;
552
553 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
554 result = LLVMBuildLoad(gallivm->builder, result, "");
555 result = LLVMBuildAdd(gallivm->builder, result,
556 lp_build_const_int32(gallivm, rel_index), "");
557 return result;
558 }
559
560 /**
561 * Like get_indirect_index, but restricts the return value to a (possibly
562 * undefined) value inside [0..num).
563 */
564 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
565 const struct tgsi_ind_register *ind,
566 int rel_index, unsigned num)
567 {
568 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
569 LLVMBuilderRef builder = gallivm->builder;
570 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
571 LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
572 LLVMValueRef cc;
573
574 /* LLVM 3.8: If indirect resource indexing is used:
575 * - SI & CIK hang
576 * - VI crashes
577 */
578 if (HAVE_LLVM <= 0x0308)
579 return LLVMGetUndef(ctx->i32);
580
581 if (util_is_power_of_two(num)) {
582 result = LLVMBuildAnd(builder, result, c_max, "");
583 } else {
584 /* In theory, this MAX pattern should result in code that is
585 * as good as the bit-wise AND above.
586 *
587 * In practice, LLVM generates worse code (at the time of
588 * writing), because its value tracking is not strong enough.
589 */
590 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
591 result = LLVMBuildSelect(builder, cc, result, c_max, "");
592 }
593
594 return result;
595 }
596
597
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst  destination register (used when \p src is NULL)
 * \param src  source register; takes precedence over \p dst when non-NULL
 * \param vertex_dw_stride  dword stride between vertices of a 2D register
 * \param base_addr  starting dword address to add onto
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		/* base_addr += vertex index * per-vertex stride */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* Indirect accesses into a declared array are relative to
		 * the array's first register; otherwise to the register
		 * itself. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register is 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
682
683 /* The offchip buffer layout for TCS->TES is
684 *
685 * - attribute 0 of patch 0 vertex 0
686 * - attribute 0 of patch 0 vertex 1
687 * - attribute 0 of patch 0 vertex 2
688 * ...
689 * - attribute 0 of patch 1 vertex 0
690 * - attribute 0 of patch 1 vertex 1
691 * ...
692 * - attribute 1 of patch 0 vertex 0
693 * - attribute 1 of patch 0 vertex 1
694 * ...
695 * - per patch attribute 0 of patch 0
696 * - per patch attribute 0 of patch 1
697 * ...
698 *
699 * Note that every attribute has 4 components.
700 */
/* Compute a byte address into the TCS->TES offchip buffer (layout described
 * in the comment above).
 *
 * \param vertex_index  per-vertex attribute index, or NULL to address a
 *                      per-patch attribute
 * \param param_index   attribute index within the patch/vertex
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Layout fields packed into SI_PARAM_TCS_OFFCHIP_LAYOUT. */
	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	/* Every attribute is a vec4 = 16 bytes. */
	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex: base = RelPatchID * verts_per_patch + vertex,
		 * and consecutive attributes are total_vertices apart. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: one slot per patch per attribute. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch attributes start after all per-vertex ones;
		 * the offset is in bits [16:31] of the layout word. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
743
/* Build a TCS->TES offchip buffer address from a TGSI register reference.
 * Exactly one of \p dst / \p src is used (src takes precedence). */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
			struct si_shader_context *ctx,
			const struct tgsi_full_dst_register *dst,
			const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* A 2D register addresses a specific vertex within the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
							    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect accesses are relative to the array's first
		 * register if the register belongs to a declared array. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	/* Translate the semantic into the unique attribute slot, then add
	 * the (possibly dynamic) relative index. */
	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   lp_build_const_int32(gallivm, param_index_base),
				   "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
806
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * The dfmt/nfmt arguments are V_008F0C_BUF_*_FORMAT_* values; offen/idxen/
 * glc/slc/tfe are the corresponding MTBUF instruction bits passed through
 * to the llvm.SI.tbuffer.store intrinsic. */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, ARRAY_SIZE(args), 0);
}
854
855 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
856 LLVMValueRef rsrc,
857 LLVMValueRef vdata,
858 unsigned num_channels,
859 LLVMValueRef vaddr,
860 LLVMValueRef soffset,
861 unsigned inst_offset)
862 {
863 static unsigned dfmt[] = {
864 V_008F0C_BUF_DATA_FORMAT_32,
865 V_008F0C_BUF_DATA_FORMAT_32_32,
866 V_008F0C_BUF_DATA_FORMAT_32_32_32,
867 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
868 };
869 assert(num_channels >= 1 && num_channels <= 4);
870
871 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
872 inst_offset, dfmt[num_channels-1],
873 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
874 }
875
/* Emit a buffer load of 1, 2, or 4 dwords.
 *
 * Uses llvm.amdgcn.buffer.load.* on LLVM >= 3.9 and the legacy
 * llvm.SI.buffer.load.dword.* intrinsic otherwise. The two intrinsics
 * have different ABIs: the new one folds voffset/soffset into a single
 * offset operand, the old one passes them separately plus explicit
 * offen/idxen/tfe flags.
 *
 * \param vindex   buffer element index, or NULL
 * \param voffset  additional dynamic byte offset, or NULL
 * \param soffset  scalar byte offset, or NULL (new path) / required (old path)
 */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Maps num_channels 1/2/3+4 to the x/xy/xyzw intrinsic variant. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		/* Fold dynamic offsets into the single offset operand. */
		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	} else {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		/* With both an index and an offset, the address is a v2i32
		 * pair and the intrinsic name gets a second type suffix. */
		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	}
}
951
/* Load one component (or, with swizzle == ~0, a whole vec4) of the given
 * TGSI type from a buffer at base+offset. 64-bit types are assembled from
 * two adjacent dword loads. */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		/* Whole vec4: load 4 dwords and bitcast to the target type. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* 32-bit scalar: load the vec4, then extract the component. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       lp_build_const_int32(gallivm, swizzle), "");
	}

	/* 64-bit scalar: load the two dwords at swizzle and swizzle+1
	 * and combine them. */
	value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				  swizzle * 4, 1, 0);

	value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				   swizzle * 4 + 4, 1, 0);

	return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
987
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* vec4: recurse once per channel and gather the results. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values span two consecutive dwords. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
1028
1029 /**
1030 * Store to LDS.
1031 *
1032 * \param swizzle offset (typically 0..3)
1033 * \param dw_addr address in dwords
1034 * \param value value to store
1035 */
1036 static void lds_store(struct lp_build_tgsi_context *bld_base,
1037 unsigned swizzle, LLVMValueRef dw_addr,
1038 LLVMValueRef value)
1039 {
1040 struct si_shader_context *ctx = si_shader_context(bld_base);
1041 struct gallivm_state *gallivm = bld_base->base.gallivm;
1042
1043 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1044 lp_build_const_int32(gallivm, swizzle));
1045
1046 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1047 build_indexed_store(ctx, ctx->lds,
1048 dw_addr, value);
1049 }
1050
1051 static LLVMValueRef fetch_input_tcs(
1052 struct lp_build_tgsi_context *bld_base,
1053 const struct tgsi_full_src_register *reg,
1054 enum tgsi_opcode_type type, unsigned swizzle)
1055 {
1056 struct si_shader_context *ctx = si_shader_context(bld_base);
1057 LLVMValueRef dw_addr, stride;
1058
1059 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1060 dw_addr = get_tcs_in_current_patch_offset(ctx);
1061 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1062
1063 return lds_load(bld_base, type, swizzle, dw_addr);
1064 }
1065
1066 static LLVMValueRef fetch_output_tcs(
1067 struct lp_build_tgsi_context *bld_base,
1068 const struct tgsi_full_src_register *reg,
1069 enum tgsi_opcode_type type, unsigned swizzle)
1070 {
1071 struct si_shader_context *ctx = si_shader_context(bld_base);
1072 LLVMValueRef dw_addr, stride;
1073
1074 if (reg->Register.Dimension) {
1075 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1076 dw_addr = get_tcs_out_current_patch_offset(ctx);
1077 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1078 } else {
1079 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1080 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1081 }
1082
1083 return lds_load(bld_base, type, swizzle, dw_addr);
1084 }
1085
1086 static LLVMValueRef fetch_input_tes(
1087 struct lp_build_tgsi_context *bld_base,
1088 const struct tgsi_full_src_register *reg,
1089 enum tgsi_opcode_type type, unsigned swizzle)
1090 {
1091 struct si_shader_context *ctx = si_shader_context(bld_base);
1092 struct gallivm_state *gallivm = bld_base->base.gallivm;
1093 LLVMValueRef rw_buffers, buffer, base, addr;
1094
1095 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1096 SI_PARAM_RW_BUFFERS);
1097 buffer = build_indexed_load_const(ctx, rw_buffers,
1098 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1099
1100 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1101 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1102
1103 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1104 }
1105
/**
 * Store a TCS output.
 *
 * Every scalar output is written twice: to LDS (so later TCS reads via
 * fetch_output_tcs see it) and to the off-chip TESS_OFFCHIP ring buffer
 * (for the TES). With a full xyzw writemask the buffer store is emitted
 * as one vec4 store instead of four dword stores.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS address of the destination. */
	if (reg->Register.Dimension) {
		/* Per-vertex output: stride = output vertex size from
		 * TCS_OUT_LAYOUT (8 bits at bit 13). */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output: no per-vertex stride. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Fetch the TESS_OFFCHIP ring descriptor and the buffer address. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* LDS copy for subsequent TCS-side reads. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled dword on its own. */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: one vec4 store covers all four channels. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1171
/**
 * Fetch a GS input from the ESGS ring.
 *
 * The per-vertex byte offset comes from one of the VTX*_OFFSET shader
 * parameters, selected by the register's dimension index; the attribute
 * itself lives at (param * 4 + channel) * 256 within the ring.
 * PRIMID is special-cased, and swizzle == ~0 fetches all four channels.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* The primitive ID is not read from the ring. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	if (swizzle == ~0) {
		/* Gather a full vec4 by fetching each channel separately. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The parameter holds a dword offset; scale to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	/* Raw buffer-load argument list: rsrc, vaddr, imm offset, soffset,
	 * then the addressing/cache-policy flags. */
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one; /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one; /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute);
	if (tgsi_type_is_64bit(type)) {
		/* Fetch the second dword of a 64-bit value from the next
		 * channel slot and merge the halves. */
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1250
1251 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1252 {
1253 switch (interpolate) {
1254 case TGSI_INTERPOLATE_CONSTANT:
1255 return 0;
1256
1257 case TGSI_INTERPOLATE_LINEAR:
1258 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1259 return SI_PARAM_LINEAR_SAMPLE;
1260 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1261 return SI_PARAM_LINEAR_CENTROID;
1262 else
1263 return SI_PARAM_LINEAR_CENTER;
1264 break;
1265 case TGSI_INTERPOLATE_COLOR:
1266 case TGSI_INTERPOLATE_PERSPECTIVE:
1267 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1268 return SI_PARAM_PERSP_SAMPLE;
1269 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1270 return SI_PARAM_PERSP_CENTROID;
1271 else
1272 return SI_PARAM_PERSP_CENTER;
1273 break;
1274 default:
1275 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1276 return -1;
1277 }
1278 }
1279
1280 /* This shouldn't be used by explicit INTERP opcodes. */
1281 static unsigned select_interp_param(struct si_shader_context *ctx,
1282 unsigned param)
1283 {
1284 if (!ctx->is_monolithic)
1285 return param;
1286
1287 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1288 switch (param) {
1289 case SI_PARAM_PERSP_CENTROID:
1290 case SI_PARAM_PERSP_CENTER:
1291 return SI_PARAM_PERSP_SAMPLE;
1292 }
1293 }
1294 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1295 switch (param) {
1296 case SI_PARAM_LINEAR_CENTROID:
1297 case SI_PARAM_LINEAR_CENTER:
1298 return SI_PARAM_LINEAR_SAMPLE;
1299 }
1300 }
1301 if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1302 switch (param) {
1303 case SI_PARAM_PERSP_CENTROID:
1304 case SI_PARAM_PERSP_SAMPLE:
1305 return SI_PARAM_PERSP_CENTER;
1306 }
1307 }
1308 if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1309 switch (param) {
1310 case SI_PARAM_LINEAR_CENTROID:
1311 case SI_PARAM_LINEAR_SAMPLE:
1312 return SI_PARAM_LINEAR_CENTER;
1313 }
1314 }
1315
1316 return param;
1317 }
1318
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j); NULL selects flat
 *				(fs.constant) fetching
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	const char *intr_name;
	LLVMValueRef attr_number;

	unsigned chan;

	attr_number = lp_build_const_int32(gallivm, input_index);

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and back
		 * color attributes, then select per fragment based on the
		 * FRONT_FACE input. */
		LLVMValueRef args[4];
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, uint->zero, "");

		/* args = { channel, attribute, prim_mask, (i,j) }; the last
		 * argument is omitted for fs.constant (when interp_param is
		 * NULL). */
		args[2] = prim_mask;
		args[3] = interp_param;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
			LLVMValueRef front, back;

			args[0] = llvm_chan;
			args[1] = attr_number;
			front = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute);

			args[1] = back_attr_number;
			back = lp_build_intrinsic(gallivm->builder, intr_name,
					       ctx->f32, args, args[3] ? 4 : 3,
					       LLVMReadNoneAttribute);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						is_face_positive,
						front,
						back,
						"");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* FOG: only .x is interpolated; .yz = 0 and .w = 1. */
		LLVMValueRef args[4];

		args[0] = uint->zero;
		args[1] = attr_number;
		args[2] = prim_mask;
		args[3] = interp_param;
		result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
					ctx->f32, args, args[3] ? 4 : 3,
					LLVMReadNoneAttribute);
		result[1] =
		result[2] = lp_build_const_float(gallivm, 0.0f);
		result[3] = lp_build_const_float(gallivm, 1.0f);
	} else {
		/* Common case: interpolate all four channels. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef args[4];
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);

			args[0] = llvm_chan;
			args[1] = attr_number;
			args[2] = prim_mask;
			args[3] = interp_param;
			result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute);
		}
	}
}
1435
1436 /* LLVMGetParam with bc_optimize resolved. */
1437 static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
1438 int interp_param_idx)
1439 {
1440 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1441 LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
1442 LLVMValueRef param = NULL;
1443
1444 /* Handle PRIM_MASK[31] (bc_optimize). */
1445 if (ctx->is_monolithic &&
1446 ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1447 interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
1448 (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1449 interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
1450 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
1451 * The hw doesn't compute CENTROID if the whole wave only
1452 * contains fully-covered quads.
1453 */
1454 LLVMValueRef bc_optimize =
1455 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
1456 bc_optimize = LLVMBuildLShr(builder,
1457 bc_optimize,
1458 LLVMConstInt(ctx->i32, 31, 0), "");
1459 bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");
1460
1461 if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1462 interp_param_idx == SI_PARAM_PERSP_CENTROID) {
1463 param = LLVMBuildSelect(builder, bc_optimize,
1464 LLVMGetParam(main_fn,
1465 SI_PARAM_PERSP_CENTER),
1466 LLVMGetParam(main_fn,
1467 SI_PARAM_PERSP_CENTROID),
1468 "");
1469 }
1470 if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1471 interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
1472 param = LLVMBuildSelect(builder, bc_optimize,
1473 LLVMGetParam(main_fn,
1474 SI_PARAM_LINEAR_CENTER),
1475 LLVMGetParam(main_fn,
1476 SI_PARAM_LINEAR_CENTROID),
1477 "");
1478 }
1479 }
1480
1481 if (!param)
1482 param = LLVMGetParam(main_fn, interp_param_idx);
1483 return param;
1484 }
1485
/**
 * Declare (and interpolate) one fragment shader input, storing the four
 * resulting channels into radeon_bld->inputs.
 *
 * For non-monolithic shaders, COLOR inputs are not interpolated here:
 * the prolog has already done that and passes the values in as extra
 * parameters after SI_PARAM_POS_FIXED_PT.
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* COLOR1's parameters follow however many COLOR0 components
		 * were actually read. Unread components stay undef. */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		/* Nonzero index: fetch the (i,j) weights, with monolithic
		 * interpolation overrides and bc_optimize applied. */
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = get_interp_param(ctx, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1541
1542 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1543 {
1544 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1545 SI_PARAM_ANCILLARY, 8, 4);
1546 }
1547
1548 /**
1549 * Set range metadata on an instruction. This can only be used on load and
1550 * call instructions. If you know an instruction can only produce the values
1551 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1552 * \p lo is the minimum value inclusive.
1553 * \p hi is the maximum value exclusive.
1554 */
1555 static void set_range_metadata(struct si_shader_context *ctx,
1556 LLVMValueRef value, unsigned lo, unsigned hi)
1557 {
1558 LLVMValueRef range_md, md_args[2];
1559 LLVMTypeRef type = LLVMTypeOf(value);
1560 LLVMContextRef context = LLVMGetTypeContext(type);
1561
1562 md_args[0] = LLVMConstInt(type, lo, false);
1563 md_args[1] = LLVMConstInt(type, hi, false);
1564 range_md = LLVMMDNodeInContext(context, md_args, 2);
1565 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1566 }
1567
1568 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1569 {
1570 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1571 LLVMValueRef tid;
1572
1573 if (HAVE_LLVM < 0x0308) {
1574 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1575 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1576 } else {
1577 LLVMValueRef tid_args[2];
1578 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1579 tid_args[1] = lp_build_const_int32(gallivm, 0);
1580 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1581 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1582 tid_args, 2, LLVMReadNoneAttribute);
1583
1584 tid = lp_build_intrinsic(gallivm->builder,
1585 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1586 tid_args, 2, LLVMReadNoneAttribute);
1587 }
1588 set_range_metadata(ctx, tid, 0, 64);
1589 return tid;
1590 }
1591
1592 /**
1593 * Load a dword from a constant buffer.
1594 */
1595 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1596 LLVMValueRef resource,
1597 LLVMValueRef offset)
1598 {
1599 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1600 LLVMValueRef args[2] = {resource, offset};
1601
1602 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1603 LLVMReadNoneAttribute);
1604 }
1605
1606 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1607 {
1608 struct si_shader_context *ctx =
1609 si_shader_context(&radeon_bld->soa.bld_base);
1610 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1611 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1612 LLVMBuilderRef builder = gallivm->builder;
1613 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1614 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1615 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1616
1617 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1618 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1619 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1620
1621 LLVMValueRef pos[4] = {
1622 buffer_load_const(ctx, resource, offset0),
1623 buffer_load_const(ctx, resource, offset1),
1624 lp_build_const_float(gallivm, 0),
1625 lp_build_const_float(gallivm, 0)
1626 };
1627
1628 return lp_build_gather_values(gallivm, pos, 4);
1629 }
1630
/**
 * Declare a TGSI system value, materializing it from the shader's input
 * parameters and storing the result in radeon_bld->system_values[index].
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID = relative vertex id + base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_START_INSTANCE);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_DRAWID);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* Source of the invocation ID depends on the stage:
		 * TCS unpacks it from REL_IDS, GS gets a dedicated param. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord: (x, y, z, 1/w). */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position within the pixel = fractional part of the
		 * fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count: 6 bits at bit 26 of TCS_OUT_LAYOUT. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tess buffer
		 * at the slot assigned to this semantic. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dword 0, inner levels at dword 4. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* The block size is compile-time constant for compute
		 * shaders; emit it as constants from the TGSI properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* ps.live is false for helper invocations; invert and
		 * sign-extend to get the TGSI true/false convention. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1844
1845 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1846 const struct tgsi_full_declaration *decl)
1847 {
1848 struct si_shader_context *ctx =
1849 si_shader_context(&radeon_bld->soa.bld_base);
1850 struct si_shader_selector *sel = ctx->shader->selector;
1851 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1852
1853 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1854 LLVMValueRef var;
1855
1856 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1857 assert(decl->Range.First == decl->Range.Last);
1858 assert(!ctx->shared_memory);
1859
1860 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1861 LLVMArrayType(ctx->i8, sel->local_size),
1862 "compute_lds",
1863 LOCAL_ADDR_SPACE);
1864 LLVMSetAlignment(var, 4);
1865
1866 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1867 }
1868
1869 static LLVMValueRef fetch_constant(
1870 struct lp_build_tgsi_context *bld_base,
1871 const struct tgsi_full_src_register *reg,
1872 enum tgsi_opcode_type type,
1873 unsigned swizzle)
1874 {
1875 struct si_shader_context *ctx = si_shader_context(bld_base);
1876 struct lp_build_context *base = &bld_base->base;
1877 const struct tgsi_ind_register *ireg = &reg->Indirect;
1878 unsigned buf, idx;
1879
1880 LLVMValueRef addr, bufp;
1881 LLVMValueRef result;
1882
1883 if (swizzle == LP_CHAN_ALL) {
1884 unsigned chan;
1885 LLVMValueRef values[4];
1886 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1887 values[chan] = fetch_constant(bld_base, reg, type, chan);
1888
1889 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1890 }
1891
1892 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1893 idx = reg->Register.Index * 4 + swizzle;
1894
1895 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1896 if (!tgsi_type_is_64bit(type))
1897 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1898 else {
1899 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1900 ctx->constants[buf][idx],
1901 ctx->constants[buf][idx + 1]);
1902 }
1903 }
1904
1905 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1906 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1907 LLVMValueRef index;
1908 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1909 reg->Dimension.Index,
1910 SI_NUM_CONST_BUFFERS);
1911 bufp = build_indexed_load_const(ctx, ptr, index);
1912 } else
1913 bufp = ctx->const_buffers[buf];
1914
1915 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1916 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1917 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1918 addr = lp_build_add(&bld_base->uint_bld, addr,
1919 lp_build_const_int32(base->gallivm, idx * 4));
1920
1921 result = buffer_load_const(ctx, bufp, addr);
1922
1923 if (!tgsi_type_is_64bit(type))
1924 result = bitcast(bld_base, type, result);
1925 else {
1926 LLVMValueRef addr2, result2;
1927 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1928 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1929 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1930 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1931 lp_build_const_int32(base->gallivm, idx * 4));
1932
1933 result2 = buffer_load_const(ctx, ctx->const_buffers[buf],
1934 addr2);
1935
1936 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1937 result, result2);
1938 }
1939 return result;
1940 }
1941
1942 /* Upper 16 bits must be zero. */
1943 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1944 LLVMValueRef val[2])
1945 {
1946 return LLVMBuildOr(gallivm->builder, val[0],
1947 LLVMBuildShl(gallivm->builder, val[1],
1948 lp_build_const_int32(gallivm, 16),
1949 ""), "");
1950 }
1951
1952 /* Upper 16 bits are ignored and will be dropped. */
1953 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1954 LLVMValueRef val[2])
1955 {
1956 LLVMValueRef v[2] = {
1957 LLVMBuildAnd(gallivm->builder, val[0],
1958 lp_build_const_int32(gallivm, 0xffff), ""),
1959 val[1],
1960 };
1961 return si_llvm_pack_two_int16(gallivm, v);
1962 }
1963
/* Initialize arguments for the shader export intrinsic.
 *
 * args[0..8] follow the llvm.SI.export argument layout:
 *   [0] writemask, [1] valid-mask flag, [2] "last export" flag,
 *   [3] export target, [4] COMPR flag, [5..8] the four data channels.
 * values[] are the four 32-bit output channels to export.
 * For fragment shaders the channel encoding is chosen from the
 * per-MRT spi_shader_col_format in the shader key; otherwise the
 * default 32_ABGR (raw 32-bit) path is used.
 */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
				&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef val[4];
	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
	unsigned chan;
	bool is_int8;

	/* Default is 0xf. Adjusted below depending on the format. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	/* Fragment shaders select the export format of the targeted MRT
	 * from the shader key; is_int8 is only set (and only read) on
	 * this path. */
	if (ctx->type == PIPE_SHADER_FRAGMENT) {
		const union si_shader_key *key = &ctx->shader->key;
		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		assert(cbuf >= 0 && cbuf < 8);
		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
	}

	args[4] = uint->zero; /* COMPR flag */
	args[5] = base->undef;
	args[6] = base->undef;
	args[7] = base->undef;
	args[8] = base->undef;

	switch (spi_shader_col_format) {
	case V_028714_SPI_SHADER_ZERO:
		/* Nothing is written: turn this into a null export. */
		args[0] = uint->zero; /* writemask */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
		break;

	case V_028714_SPI_SHADER_32_R:
		args[0] = uint->one; /* writemask */
		args[5] = values[0];
		break;

	case V_028714_SPI_SHADER_32_GR:
		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
		args[5] = values[0];
		args[6] = values[1];
		break;

	case V_028714_SPI_SHADER_32_AR:
		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
		args[5] = values[0];
		args[8] = values[3];
		break;

	case V_028714_SPI_SHADER_FP16_ABGR:
		args[4] = uint->one; /* COMPR flag */

		/* Pack channel pairs into f16x2 via llvm.SI.packf16. */
		for (chan = 0; chan < 2; chan++) {
			LLVMValueRef pack_args[2] = {
				values[2 * chan],
				values[2 * chan + 1]
			};
			LLVMValueRef packed;

			packed = lp_build_intrinsic(base->gallivm->builder,
						    "llvm.SI.packf16",
						    ctx->i32, pack_args, 2,
						    LLVMReadNoneAttribute);
			args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 packed, ctx->f32, "");
		}
		break;

	case V_028714_SPI_SHADER_UNORM16_ABGR:
		/* Clamp to [0,1], scale to [0,65535], round to nearest,
		 * then pack pairs of 16-bit values. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 65535), "");
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  lp_build_const_float(gallivm, 0.5), "");
			val[chan] = LLVMBuildFPToUI(builder, val[chan],
						    ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_SNORM16_ABGR:
		for (chan = 0; chan < 4; chan++) {
			/* Clamp between [-1, 1]. */
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
							      values[chan],
							      lp_build_const_float(gallivm, 1));
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
							      val[chan],
							      lp_build_const_float(gallivm, -1));
			/* Convert to a signed integer in [-32767, 32767]. */
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 32767), "");
			/* If positive, add 0.5, else add -0.5. */
			val[chan] = LLVMBuildFAdd(builder, val[chan],
					LLVMBuildSelect(builder,
						LLVMBuildFCmp(builder, LLVMRealOGE,
							      val[chan], base->zero, ""),
						lp_build_const_float(gallivm, 0.5),
						lp_build_const_float(gallivm, -0.5), ""), "");
			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_UINT16_ABGR: {
		/* 8-bit int color buffers clamp at 255, 16-bit at 65535. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							255 : 65535);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
							      val[chan], max);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_SINT16_ABGR: {
		/* Signed ranges: [-128,127] for int8, [-32768,32767] for int16. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							127 : 32767);
		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
							-128 : -32768);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMIN,
							      val[chan], max);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMAX,
							      val[chan], min);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_32_ABGR:
		/* Raw 32-bit export of all four channels. */
		memcpy(&args[5], values, sizeof(values[0]) * 4);
		break;
	}
}
2146
2147 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2148 LLVMValueRef alpha)
2149 {
2150 struct si_shader_context *ctx = si_shader_context(bld_base);
2151 struct gallivm_state *gallivm = bld_base->base.gallivm;
2152
2153 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2154 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2155 SI_PARAM_ALPHA_REF);
2156
2157 LLVMValueRef alpha_pass =
2158 lp_build_cmp(&bld_base->base,
2159 ctx->shader->key.ps.epilog.alpha_func,
2160 alpha, alpha_ref);
2161 LLVMValueRef arg =
2162 lp_build_select(&bld_base->base,
2163 alpha_pass,
2164 lp_build_const_float(gallivm, 1.0f),
2165 lp_build_const_float(gallivm, -1.0f));
2166
2167 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2168 ctx->voidt, &arg, 1, 0);
2169 } else {
2170 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2171 ctx->voidt, NULL, 0, 0);
2172 }
2173 }
2174
2175 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2176 LLVMValueRef alpha,
2177 unsigned samplemask_param)
2178 {
2179 struct si_shader_context *ctx = si_shader_context(bld_base);
2180 struct gallivm_state *gallivm = bld_base->base.gallivm;
2181 LLVMValueRef coverage;
2182
2183 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2184 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2185 samplemask_param);
2186 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2187
2188 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2189 ctx->i32,
2190 &coverage, 1, LLVMReadNoneAttribute);
2191
2192 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2193 ctx->f32, "");
2194
2195 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2196 lp_build_const_float(gallivm,
2197 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2198
2199 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2200 }
2201
/* Fill in the export args for the two clip-distance position exports
 * (POS+2 and POS+3) by dotting the clip-vertex output (out_elts) with
 * the user clip planes stored in the SI_VS_CONST_CLIP_PLANES constant
 * buffer. pos[2] and pos[3] receive complete llvm.SI.export argument
 * sets; the caller emits the actual export intrinsics.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start the four clip distances of this export at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of component const_chan of clip
				 * plane (reg_index * 4 + chan), 4 bytes each.
				 * args[1] is reused as a scratch slot here and
				 * overwritten with its real value below. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(ctx, const_resource,
							     args[1]);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Remaining export arguments: writemask, flags and target. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
2248
2249 static void si_dump_streamout(struct pipe_stream_output_info *so)
2250 {
2251 unsigned i;
2252
2253 if (so->num_outputs)
2254 fprintf(stderr, "STREAMOUT\n");
2255
2256 for (i = 0; i < so->num_outputs; i++) {
2257 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2258 so->output[i].start_component;
2259 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2260 i, so->output[i].output_buffer,
2261 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2262 so->output[i].register_index,
2263 mask & 1 ? "x" : "",
2264 mask & 2 ? "y" : "",
2265 mask & 4 ? "z" : "",
2266 mask & 8 ? "w" : "");
2267 }
2268 }
2269
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers. */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Active stream id: bits [25:24] of the streamout config SGPR. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			/* Vertex slot within the buffer, in bytes. */
			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			/* Defensive: skip malformed or out-of-range entries. */
			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store when the output's stream matches the
			 * currently selected stream. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2386
2387
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if enabled), one parameter export per generic
 * output, and the position exports (position, misc vector with
 * psize/edgeflag/layer/viewport, clip distances). Also records
 * nr_param_exports / nr_pos_exports and the per-output parameter slots
 * in shader->info for the state tracker.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

		/* Some semantics are remapped to GENERIC and re-dispatched
		 * through this label (layer, viewport index, clipdist). */
handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector (POS+1) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the misc vector (POS+1) below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Saved for POS+1, but also exported as a param. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			/* Saved for POS+1, but also exported as a param. */
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Expanded into clip-distance exports in pos_args. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Position exports are collected and emitted last,
			 * so the "last export" bit can be set correctly. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			/* Clip distances are additionally exported as params. */
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	/* Count the position exports so we know which one is last. */
	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2562
2563 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2564 {
2565 struct si_shader_context *ctx = si_shader_context(bld_base);
2566 struct gallivm_state *gallivm = bld_base->base.gallivm;
2567 LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2568 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2569 uint64_t inputs;
2570
2571 invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2572
2573 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
2574 buffer = build_indexed_load_const(ctx, rw_buffers,
2575 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
2576
2577 buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
2578
2579 lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
2580 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2581 lds_vertex_stride, "");
2582 lds_base = get_tcs_in_current_patch_offset(ctx);
2583 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2584
2585 inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
2586 while (inputs) {
2587 unsigned i = u_bit_scan64(&inputs);
2588
2589 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2590 lp_build_const_int32(gallivm, 4 * i),
2591 "");
2592
2593 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2594 invocation_id,
2595 lp_build_const_int32(gallivm, i));
2596
2597 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2598 lds_ptr);
2599
2600 build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
2601 buffer_offset, 0);
2602 }
2603 }
2604
/* Read the tessellation levels (TESSINNER/TESSOUTER) of the current
 * patch from LDS and store them to the tess factor ring buffer.
 * Only invocation 0 performs the stores; patch 0 additionally writes
 * the dynamic HS control word at offset 0.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have written their outputs to LDS
	 * before invocation 0 reads the tess levels back. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Gather the components: outer levels first, then inner. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* A 6-dword element needs a second vec2 store. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only the thread handling patch 0 writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2710
/* This only writes the tessellation factor levels.
 *
 * In the non-monolithic case the actual work is deferred to a separately
 * compiled epilog: this function only packs the values the epilog needs
 * (RW_BUFFERS pointer halves, tess factor soffset, and the three VGPRs)
 * into ctx->return_value in the exact slot order the epilog expects.
 * In the monolithic case it performs the copy and the factor writes
 * directly.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer: split the 64-bit pointer into two
		 * i32 halves so it fits the SGPR return slots 0 and 1. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs: returned as floats, so bitcast from i32. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2762
2763 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2764 {
2765 struct si_shader_context *ctx = si_shader_context(bld_base);
2766 struct si_shader *shader = ctx->shader;
2767 struct tgsi_shader_info *info = &shader->selector->info;
2768 struct gallivm_state *gallivm = bld_base->base.gallivm;
2769 unsigned i, chan;
2770 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2771 ctx->param_rel_auto_id);
2772 LLVMValueRef vertex_dw_stride =
2773 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2774 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2775 vertex_dw_stride, "");
2776
2777 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2778 * its inputs from it. */
2779 for (i = 0; i < info->num_outputs; i++) {
2780 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2781 unsigned name = info->output_semantic_name[i];
2782 unsigned index = info->output_semantic_index[i];
2783 int param = si_shader_io_get_unique_index(name, index);
2784 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2785 lp_build_const_int32(gallivm, param * 4), "");
2786
2787 for (chan = 0; chan < 4; chan++) {
2788 lds_store(bld_base, chan, dw_addr,
2789 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2790 }
2791 }
2792 }
2793
/* Epilog of the ES stage (shader feeding the GS): store all outputs to the
 * ESGS ring buffer, one dword per (slot, channel), from where the geometry
 * shader will read them.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	/* Base offset into the ESGS ring for this shader invocation. */
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		/* Viewport index and layer are not written through the ring
		 * here — presumably handled elsewhere; confirm with callers. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* Byte offset: 4 dwords per slot, 4 bytes per dword. */
			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2832
2833 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2834 {
2835 struct si_shader_context *ctx = si_shader_context(bld_base);
2836 struct gallivm_state *gallivm = bld_base->base.gallivm;
2837 LLVMValueRef args[2];
2838
2839 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2840 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2841 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2842 ctx->voidt, args, 2, 0);
2843 }
2844
2845 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2846 {
2847 struct si_shader_context *ctx = si_shader_context(bld_base);
2848 struct gallivm_state *gallivm = bld_base->base.gallivm;
2849 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2850 struct si_shader_output_values *outputs = NULL;
2851 int i,j;
2852
2853 assert(!ctx->is_gs_copy_shader);
2854
2855 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2856
2857 /* Vertex color clamping.
2858 *
2859 * This uses a state constant loaded in a user data SGPR and
2860 * an IF statement is added that clamps all colors if the constant
2861 * is true.
2862 */
2863 if (ctx->type == PIPE_SHADER_VERTEX) {
2864 struct lp_build_if_state if_ctx;
2865 LLVMValueRef cond = NULL;
2866 LLVMValueRef addr, val;
2867
2868 for (i = 0; i < info->num_outputs; i++) {
2869 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2870 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2871 continue;
2872
2873 /* We've found a color. */
2874 if (!cond) {
2875 /* The state is in the first bit of the user SGPR. */
2876 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2877 SI_PARAM_VS_STATE_BITS);
2878 cond = LLVMBuildTrunc(gallivm->builder, cond,
2879 ctx->i1, "");
2880 lp_build_if(&if_ctx, gallivm, cond);
2881 }
2882
2883 for (j = 0; j < 4; j++) {
2884 addr = ctx->radeon_bld.soa.outputs[i][j];
2885 val = LLVMBuildLoad(gallivm->builder, addr, "");
2886 val = radeon_llvm_saturate(bld_base, val);
2887 LLVMBuildStore(gallivm->builder, val, addr);
2888 }
2889 }
2890
2891 if (cond)
2892 lp_build_endif(&if_ctx);
2893 }
2894
2895 for (i = 0; i < info->num_outputs; i++) {
2896 outputs[i].name = info->output_semantic_name[i];
2897 outputs[i].sid = info->output_semantic_index[i];
2898
2899 for (j = 0; j < 4; j++)
2900 outputs[i].values[j] =
2901 LLVMBuildLoad(gallivm->builder,
2902 ctx->radeon_bld.soa.outputs[i][j],
2903 "");
2904 }
2905
2906 if (ctx->is_monolithic) {
2907 /* Export PrimitiveID when PS needs it. */
2908 if (si_vs_exports_prim_id(ctx->shader)) {
2909 outputs[i].name = TGSI_SEMANTIC_PRIMID;
2910 outputs[i].sid = 0;
2911 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2912 get_primitive_id(bld_base, 0));
2913 outputs[i].values[1] = bld_base->base.undef;
2914 outputs[i].values[2] = bld_base->base.undef;
2915 outputs[i].values[3] = bld_base->base.undef;
2916 i++;
2917 }
2918 } else {
2919 /* Return the primitive ID from the LLVM function. */
2920 ctx->return_value =
2921 LLVMBuildInsertValue(gallivm->builder,
2922 ctx->return_value,
2923 bitcast(bld_base, TGSI_TYPE_FLOAT,
2924 get_primitive_id(bld_base, 0)),
2925 VS_EPILOG_PRIMID_LOC, "");
2926 }
2927
2928 si_llvm_export_vs(bld_base, outputs, i);
2929 FREE(outputs);
2930 }
2931
/* Queue of pending "llvm.SI.export" calls for the pixel shader epilog:
 * up to 10 exports of 9 arguments each, flushed by si_emit_ps_exports(). */
struct si_ps_exports {
	unsigned num;		/* number of queued exports */
	LLVMValueRef args[10][9];
};
2936
/* Queue the MRTZ export (depth / stencil / sample mask) into \p exp.
 * At least one of the three values must be non-NULL.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMP flag */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	if (depth) {
		args[5] = depth;
		mask |= 0x1;
	}

	if (stencil) {
		args[6] = stencil;
		mask |= 0x2;
	}

	if (samplemask) {
		args[7] = samplemask;
		mask |= 0x4;
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	/* Defer emission; si_emit_ps_exports() flushes the queue. */
	memcpy(exp->args[exp->num++], args, sizeof(args));
}
2987
/* Queue the color export(s) for color output \p index into \p exp,
 * applying the shader key's epilog state: color clamping, alpha-to-one,
 * alpha test, and line/polygon smoothing.
 *
 * \param is_last  whether this is the last color export of the shader
 *                 (sets the EXEC-valid and DONE bits on the final export)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		memcpy(exp->args[exp->num++], args, sizeof(args));
	}
}
3054
3055 static void si_emit_ps_exports(struct si_shader_context *ctx,
3056 struct si_ps_exports *exp)
3057 {
3058 for (unsigned i = 0; i < exp->num; i++)
3059 lp_build_intrinsic(ctx->radeon_bld.gallivm.builder,
3060 "llvm.SI.export", ctx->voidt,
3061 exp->args[i], 9, 0);
3062 }
3063
3064 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3065 {
3066 struct si_shader_context *ctx = si_shader_context(bld_base);
3067 struct lp_build_context *base = &bld_base->base;
3068 struct lp_build_context *uint = &bld_base->uint_bld;
3069 LLVMValueRef args[9];
3070
3071 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3072 args[1] = uint->one; /* whether the EXEC mask is valid */
3073 args[2] = uint->one; /* DONE bit */
3074 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3075 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3076 args[5] = uint->undef; /* R */
3077 args[6] = uint->undef; /* G */
3078 args[7] = uint->undef; /* B */
3079 args[8] = uint->undef; /* A */
3080
3081 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3082 ctx->voidt, args, 9, 0);
3083 }
3084
/* Epilog of a monolithic pixel shader: read all TGSI outputs and emit the
 * color and MRTZ exports (or a NULL export if nothing is written).
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;
	struct si_ps_exports exp = {};

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* 4 format bits per color buffer in spi_format. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	/* Read the outputs and queue the exports. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i, &exp);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ always goes last (after all color exports). */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);

	si_emit_ps_exports(ctx, &exp);
}
3176
3177 /**
3178 * Return PS outputs in this order:
3179 *
3180 * v[0:3] = color0.xyzw
3181 * v[4:7] = color1.xyzw
3182 * ...
3183 * vN+0 = Depth
3184 * vN+1 = Stencil
3185 * vN+2 = SampleMask
3186 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3187 *
3188 * The alpha-ref SGPR is returned via its original location.
3189 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	/* Pack only the written colors; order matches the layout documented
	 * in the function comment above. */
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3270
3271 /**
3272 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3273 * buffer in number of elements and return it as an i32.
3274 */
static LLVMValueRef get_buffer_size(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef descriptor)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	/* Dword 6 of the v8i32 descriptor holds the size (NUM_RECORDS). */
	LLVMValueRef size =
		LLVMBuildExtractElement(builder, descriptor,
					lp_build_const_int32(gallivm, 6), "");

	if (ctx->screen->b.chip_class >= VI) {
		/* On VI, the descriptor contains the size in bytes,
		 * but TXQ must return the size in elements.
		 * The stride is always non-zero for resources using TXQ.
		 */
		/* Element stride lives in bits [29:16] of dword 5. */
		LLVMValueRef stride =
			LLVMBuildExtractElement(builder, descriptor,
						lp_build_const_int32(gallivm, 5), "");
		stride = LLVMBuildLShr(builder, stride,
				       lp_build_const_int32(gallivm, 16), "");
		stride = LLVMBuildAnd(builder, stride,
				      lp_build_const_int32(gallivm, 0x3FFF), "");

		size = LLVMBuildUDiv(builder, size, stride, "");
	}

	return size;
}
3304
3305 /**
3306 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3307 * intrinsic names).
3308 */
static void build_int_type_name(
	LLVMTypeRef type,
	char *buf, unsigned bufsize)
{
	/* NOTE(review): a two-digit vector size ("v16i32") needs 7 bytes
	 * including the terminator, so bufsize == 6 would silently truncate
	 * via snprintf; all current callers pass at least 8. */
	assert(bufsize >= 6);

	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
		snprintf(buf, bufsize, "v%ui32",
			 LLVMGetVectorSize(type));
	else
		strcpy(buf, "i32");
}
3321
3322 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3323 struct lp_build_tgsi_context *bld_base,
3324 struct lp_build_emit_data *emit_data);
3325
3326 /* Prevent optimizations (at least of memory accesses) across the current
3327 * point in the program by emitting empty inline assembly that is marked as
3328 * having side effects.
3329 */
3330 static void emit_optimization_barrier(struct si_shader_context *ctx)
3331 {
3332 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3333 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3334 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3335 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3336 }
3337
3338 static void emit_waitcnt(struct si_shader_context *ctx)
3339 {
3340 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3341 LLVMBuilderRef builder = gallivm->builder;
3342 LLVMValueRef args[1] = {
3343 lp_build_const_int32(gallivm, 0xf70)
3344 };
3345 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3346 ctx->voidt, args, 1, 0);
3347 }
3348
/* TGSI MEMBAR: implemented as a wait on outstanding memory operations. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3358
/* Return the buffer resource descriptor for a TGSI_FILE_BUFFER source:
 * a preloaded descriptor for direct indices, otherwise a bounds-clamped
 * indirect load from the SI_PARAM_SHADER_BUFFERS descriptor array.
 */
static LLVMValueRef
shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
			 const struct tgsi_full_src_register *reg)
{
	LLVMValueRef ind_index;
	LLVMValueRef rsrc_ptr;

	if (!reg->Register.Indirect)
		return ctx->shader_buffers[reg->Register.Index];

	/* Clamp the index so an out-of-bounds indirect access cannot read
	 * past the descriptor array. */
	ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
					       reg->Register.Index,
					       SI_NUM_SHADER_BUFFERS);

	rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
	return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
}
3376
3377 static bool tgsi_is_array_sampler(unsigned target)
3378 {
3379 return target == TGSI_TEXTURE_1D_ARRAY ||
3380 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3381 target == TGSI_TEXTURE_2D_ARRAY ||
3382 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3383 target == TGSI_TEXTURE_CUBE_ARRAY ||
3384 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3385 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3386 }
3387
3388 static bool tgsi_is_array_image(unsigned target)
3389 {
3390 return target == TGSI_TEXTURE_3D ||
3391 target == TGSI_TEXTURE_CUBE ||
3392 target == TGSI_TEXTURE_1D_ARRAY ||
3393 target == TGSI_TEXTURE_2D_ARRAY ||
3394 target == TGSI_TEXTURE_CUBE_ARRAY ||
3395 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3396 }
3397
3398 /**
3399 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3400 *
3401 * At least on Tonga, executing image stores on images with DCC enabled and
3402 * non-trivial can eventually lead to lockups. This can occur when an
3403 * application binds an image as read-only but then uses a shader that writes
3404 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3405 * program termination) in this case, but it doesn't cost much to be a bit
3406 * nicer: disabling DCC in the shader still leads to undefined results but
3407 * avoids the lockup.
3408 */
3409 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3410 LLVMValueRef rsrc)
3411 {
3412 if (ctx->screen->b.chip_class <= CIK) {
3413 return rsrc;
3414 } else {
3415 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3416 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3417 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3418 LLVMValueRef tmp;
3419
3420 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3421 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3422 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3423 }
3424 }
3425
3426 /**
3427 * Load the resource descriptor for \p image.
3428 */
static void
image_fetch_rsrc(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *image,
	bool dcc_off,		/* clear the DCC bit (for image stores) */
	LLVMValueRef *rsrc)	/* out: v8i32 resource descriptor */
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	assert(image->Register.File == TGSI_FILE_IMAGE);

	if (!image->Register.Indirect) {
		/* Fast path: use preloaded resources */
		*rsrc = ctx->images[image->Register.Index];
	} else {
		/* Indexing and manual load */
		LLVMValueRef ind_index;
		LLVMValueRef rsrc_ptr;
		LLVMValueRef tmp;

		/* From the GL_ARB_shader_image_load_store extension spec:
		 *
		 *    If a shader performs an image load, store, or atomic
		 *    operation using an image variable declared as an array,
		 *    and if the index used to select an individual element is
		 *    negative or greater than or equal to the size of the
		 *    array, the results of the operation are undefined but may
		 *    not lead to termination.
		 */
		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
						       image->Register.Index,
						       SI_NUM_IMAGES);

		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
		/* For stores: force DCC off to avoid lockups (see force_dcc_off). */
		if (dcc_off)
			tmp = force_dcc_off(ctx, tmp);
		*rsrc = tmp;
	}
}
3469
/* Fetch the coordinate operands of an image instruction from source
 * operand \p src and pack them into a single i32 or <N x i32> value.
 */
static LLVMValueRef image_fetch_coords(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_instruction *inst,
		unsigned src)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned target = inst->Memory.Texture;
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	LLVMValueRef coords[4];
	LLVMValueRef tmp;
	int chan;

	for (chan = 0; chan < num_coords; ++chan) {
		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
		/* Coordinates are integers; reinterpret the fetched float bits. */
		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
		coords[chan] = tmp;
	}

	/* A single coordinate stays scalar. */
	if (num_coords == 1)
		return coords[0];

	if (num_coords == 3) {
		/* LLVM has difficulties lowering 3-element vectors. */
		coords[3] = bld_base->uint_bld.undef;
		num_coords = 4;
	}

	return lp_build_gather_values(gallivm, coords, num_coords);
}
3500
3501 /**
3502 * Append the extra mode bits that are used by image load and store.
3503 */
static void image_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data * emit_data,
		unsigned target,
		bool atomic)	/* atomics take no glc operand */
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
	emit_data->args[emit_data->arg_count++] =
		tgsi_is_array_image(target) ? i1true : i1false; /* da */
	if (!atomic) {
		/* glc: bypass the L1 cache for coherent/volatile accesses. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3524
3525 /**
3526 * Given a 256 bit resource, extract the top half (which stores the buffer
3527 * resource in the case of textures and images).
3528 */
3529 static LLVMValueRef extract_rsrc_top_half(
3530 struct si_shader_context *ctx,
3531 LLVMValueRef rsrc)
3532 {
3533 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3534 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3535 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3536
3537 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3538 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3539 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3540
3541 return rsrc;
3542 }
3543
3544 /**
3545 * Append the resource and indexing arguments for buffer intrinsics.
3546 *
3547 * \param rsrc the v4i32 buffer resource
3548 * \param index index into the buffer (stride-based)
3549 * \param offset byte offset into the buffer
3550 */
static void buffer_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		LLVMValueRef rsrc,
		LLVMValueRef index,
		LLVMValueRef offset,
		bool atomic)	/* atomics take no glc operand */
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = rsrc;
	emit_data->args[emit_data->arg_count++] = index; /* vindex */
	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
	if (!atomic) {
		/* glc: bypass the L1 cache for coherent/volatile accesses. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3573
/* TGSI LOAD: collect the intrinsic arguments for SSBO, buffer-image, or
 * image loads into emit_data; load_emit() picks the intrinsic later.
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Source operand 1 holds the byte offset. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use the buffer.load path with the
			 * coordinate as vindex. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3618
/* Emit an SSBO load using the narrowest amdgcn.buffer.load variant that
 * covers the instruction's writemask.
 */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	/* Number of contiguous channels to load (highest set bit). */
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: // 3 & 4
		/* There is no 3-channel variant; load 4 and let the
		 * writemask discard the extra channel. */
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute);
}
3650
3651 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3652 const struct tgsi_full_instruction *inst,
3653 LLVMTypeRef type, int arg)
3654 {
3655 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3656 LLVMBuilderRef builder = gallivm->builder;
3657 LLVMValueRef offset, ptr;
3658 int addr_space;
3659
3660 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3661 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3662
3663 ptr = ctx->shared_memory;
3664 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3665 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3666 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3667
3668 return ptr;
3669 }
3670
/* Emit a TGSI LOAD from shared memory: load each written channel and
 * gather the result into a 4-component vector (unwritten channels are
 * undef).
 */
static void load_emit_memory(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned writemask = inst->Dst[0].Register.WriteMask;
	LLVMValueRef channels[4], ptr, derived_ptr, index;
	int chan;

	ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);

	for (chan = 0; chan < 4; ++chan) {
		if (!(writemask & (1 << chan))) {
			channels[chan] = LLVMGetUndef(base->elem_type);
			continue;
		}

		index = lp_build_const_int32(gallivm, chan);
		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
		channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
	}
	emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
}
3697
/* Emit a TGSI LOAD, dispatching on the source file: shared memory, SSBO,
 * buffer image, or regular image (args were set up by load_fetch_args).
 */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile loads must not be reordered with earlier memory ops. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	} else {
		/* The image.load intrinsic is overloaded on the coordinate
		 * type; build the matching name suffix. */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	}
}
3743
/* TGSI STORE: collect the intrinsic arguments (data first, then resource,
 * coordinates and mode bits) for SSBO, buffer-image, or image stores.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* The value to store comes from source operand 1. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register names the buffer/image resource. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Source operand 0 holds the byte offset. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[1] = coords;
			/* dcc_off=true: image stores with DCC can lock up
			 * the GPU (see force_dcc_off). */
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3802
/* Emit a TGSI STORE to a shader storage buffer.
 *
 * emit_data->args was set up by store_fetch_args:
 *   args[0] = value to store (v4f32), args[3] = byte offset.
 * The writemask is split into runs of consecutive channels and one
 * buffer.store intrinsic (f32/v2f32/v4f32) is emitted per run.
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from channels start, start+1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the first channel of the run
		 * (4 bytes per channel). */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count, 0);
	}
}
3874
3875 static void store_emit_memory(
3876 struct si_shader_context *ctx,
3877 struct lp_build_emit_data *emit_data)
3878 {
3879 const struct tgsi_full_instruction *inst = emit_data->inst;
3880 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3881 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3882 LLVMBuilderRef builder = gallivm->builder;
3883 unsigned writemask = inst->Dst[0].Register.WriteMask;
3884 LLVMValueRef ptr, derived_ptr, data, index;
3885 int chan;
3886
3887 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3888
3889 for (chan = 0; chan < 4; ++chan) {
3890 if (!(writemask & (1 << chan))) {
3891 continue;
3892 }
3893 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3894 index = lp_build_const_int32(gallivm, chan);
3895 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3896 LLVMBuildStore(builder, data, derived_ptr);
3897 }
3898 }
3899
3900 static void store_emit(
3901 const struct lp_build_tgsi_action *action,
3902 struct lp_build_tgsi_context *bld_base,
3903 struct lp_build_emit_data *emit_data)
3904 {
3905 struct si_shader_context *ctx = si_shader_context(bld_base);
3906 struct gallivm_state *gallivm = bld_base->base.gallivm;
3907 LLVMBuilderRef builder = gallivm->builder;
3908 const struct tgsi_full_instruction * inst = emit_data->inst;
3909 unsigned target = inst->Memory.Texture;
3910 char intrinsic_name[32];
3911 char coords_type[8];
3912
3913 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3914 store_emit_memory(ctx, emit_data);
3915 return;
3916 }
3917
3918 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3919 emit_waitcnt(ctx);
3920
3921 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3922 store_emit_buffer(ctx, emit_data);
3923 return;
3924 }
3925
3926 if (target == TGSI_TEXTURE_BUFFER) {
3927 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3928 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3929 emit_data->dst_type, emit_data->args,
3930 emit_data->arg_count, 0);
3931 } else {
3932 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3933 coords_type, sizeof(coords_type));
3934 snprintf(intrinsic_name, sizeof(intrinsic_name),
3935 "llvm.amdgcn.image.store.%s", coords_type);
3936
3937 emit_data->output[emit_data->chan] =
3938 lp_build_intrinsic(
3939 builder, intrinsic_name, emit_data->dst_type,
3940 emit_data->args, emit_data->arg_count, 0);
3941 }
3942 }
3943
/* Fetch arguments for a TGSI atomic opcode on a buffer or image.
 *
 * Argument layout: [swap value (ATOMCAS only),] data value, then the
 * resource and addressing arguments appended by buffer_append_args or
 * image_append_args.
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	/* The result is the value before the atomic operation. */
	emit_data->dst_type = bld_base->base.elem_type;

	/* Src[2].x: the operand (for ATOMCAS, the comparison value). */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		/* Src[3].x: the value stored on a successful compare. */
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use only the second half of the
			 * combined descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
4003
4004 static void atomic_emit_memory(struct si_shader_context *ctx,
4005 struct lp_build_emit_data *emit_data) {
4006 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4007 LLVMBuilderRef builder = gallivm->builder;
4008 const struct tgsi_full_instruction * inst = emit_data->inst;
4009 LLVMValueRef ptr, result, arg;
4010
4011 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4012
4013 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
4014 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4015
4016 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4017 LLVMValueRef new_data;
4018 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4019 inst, 3, 0);
4020
4021 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4022
4023 #if HAVE_LLVM >= 0x309
4024 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4025 LLVMAtomicOrderingSequentiallyConsistent,
4026 LLVMAtomicOrderingSequentiallyConsistent,
4027 false);
4028 #endif
4029
4030 result = LLVMBuildExtractValue(builder, result, 0, "");
4031 } else {
4032 LLVMAtomicRMWBinOp op;
4033
4034 switch(inst->Instruction.Opcode) {
4035 case TGSI_OPCODE_ATOMUADD:
4036 op = LLVMAtomicRMWBinOpAdd;
4037 break;
4038 case TGSI_OPCODE_ATOMXCHG:
4039 op = LLVMAtomicRMWBinOpXchg;
4040 break;
4041 case TGSI_OPCODE_ATOMAND:
4042 op = LLVMAtomicRMWBinOpAnd;
4043 break;
4044 case TGSI_OPCODE_ATOMOR:
4045 op = LLVMAtomicRMWBinOpOr;
4046 break;
4047 case TGSI_OPCODE_ATOMXOR:
4048 op = LLVMAtomicRMWBinOpXor;
4049 break;
4050 case TGSI_OPCODE_ATOMUMIN:
4051 op = LLVMAtomicRMWBinOpUMin;
4052 break;
4053 case TGSI_OPCODE_ATOMUMAX:
4054 op = LLVMAtomicRMWBinOpUMax;
4055 break;
4056 case TGSI_OPCODE_ATOMIMIN:
4057 op = LLVMAtomicRMWBinOpMin;
4058 break;
4059 case TGSI_OPCODE_ATOMIMAX:
4060 op = LLVMAtomicRMWBinOpMax;
4061 break;
4062 default:
4063 unreachable("unknown atomic opcode");
4064 }
4065
4066 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4067 LLVMAtomicOrderingSequentiallyConsistent,
4068 false);
4069 }
4070 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4071 }
4072
4073 static void atomic_emit(
4074 const struct lp_build_tgsi_action *action,
4075 struct lp_build_tgsi_context *bld_base,
4076 struct lp_build_emit_data *emit_data)
4077 {
4078 struct si_shader_context *ctx = si_shader_context(bld_base);
4079 struct gallivm_state *gallivm = bld_base->base.gallivm;
4080 LLVMBuilderRef builder = gallivm->builder;
4081 const struct tgsi_full_instruction * inst = emit_data->inst;
4082 char intrinsic_name[40];
4083 LLVMValueRef tmp;
4084
4085 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4086 atomic_emit_memory(ctx, emit_data);
4087 return;
4088 }
4089
4090 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4091 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4092 snprintf(intrinsic_name, sizeof(intrinsic_name),
4093 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4094 } else {
4095 char coords_type[8];
4096
4097 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4098 coords_type, sizeof(coords_type));
4099 snprintf(intrinsic_name, sizeof(intrinsic_name),
4100 "llvm.amdgcn.image.atomic.%s.%s",
4101 action->intr_name, coords_type);
4102 }
4103
4104 tmp = lp_build_intrinsic(
4105 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4106 emit_data->args, emit_data->arg_count, 0);
4107 emit_data->output[emit_data->chan] =
4108 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4109 }
4110
/* Fetch arguments for a TGSI RESQ (resource size query) opcode. */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		/* Shader storage buffer: only the descriptor is needed. */
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* Buffer image: the size is derived from the descriptor. */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		/* Other images: build the llvm.SI.getresinfo.i32 argument
		 * list (see resq_emit). */
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4143
/* Emit a TGSI RESQ opcode using the arguments set up by resq_fetch_args. */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Buffer size in bytes is element 2 of the descriptor. */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
4180
/* Fill emit_data->args for a texture sample/fetch/query intrinsic.
 *
 * \param param  the texture address components; may be padded in place
 *               with undefs up to the next power of two
 * \param count  number of valid entries in \p param
 * \param dmask  4-bit mask of result components to enable
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF/TXQ return raw integers and take no sampler state. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
					tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4226
4227 static const struct lp_build_tgsi_action tex_action;
4228
/* Kinds of descriptors that can be loaded from the combined
 * sampler+image descriptor list (see get_sampler_desc_custom). */
enum desc_type {
	DESC_IMAGE,
	DESC_FMASK,
	DESC_SAMPLER
};
4234
4235 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4236 {
4237 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4238 CONST_ADDR_SPACE);
4239 }
4240
4241 /**
4242 * Load an image view, fmask view. or sampler state descriptor.
4243 */
4244 static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
4245 LLVMValueRef list, LLVMValueRef index,
4246 enum desc_type type)
4247 {
4248 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4249 LLVMBuilderRef builder = gallivm->builder;
4250
4251 switch (type) {
4252 case DESC_IMAGE:
4253 /* The image is at [0:7]. */
4254 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4255 break;
4256 case DESC_FMASK:
4257 /* The FMASK is at [8:15]. */
4258 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4259 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4260 break;
4261 case DESC_SAMPLER:
4262 /* The sampler state is at [12:15]. */
4263 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4264 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4265 list = LLVMBuildPointerCast(builder, list,
4266 const_array(ctx->v4i32, 0), "");
4267 break;
4268 }
4269
4270 return build_indexed_load_const(ctx, list, index);
4271 }
4272
4273 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4274 LLVMValueRef index, enum desc_type type)
4275 {
4276 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4277 SI_PARAM_SAMPLERS);
4278
4279 return get_sampler_desc_custom(ctx, list, index, type);
4280 }
4281
4282 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4283 *
4284 * SI-CI:
4285 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4286 * filtering manually. The driver sets img7 to a mask clearing
4287 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4288 * s_and_b32 samp0, samp0, img7
4289 *
4290 * VI:
4291 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4292 */
4293 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4294 LLVMValueRef res, LLVMValueRef samp)
4295 {
4296 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4297 LLVMValueRef img7, samp0;
4298
4299 if (ctx->screen->b.chip_class >= VI)
4300 return samp;
4301
4302 img7 = LLVMBuildExtractElement(builder, res,
4303 LLVMConstInt(ctx->i32, 7, 0), "");
4304 samp0 = LLVMBuildExtractElement(builder, samp,
4305 LLVMConstInt(ctx->i32, 0, 0), "");
4306 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4307 return LLVMBuildInsertElement(builder, samp, samp0,
4308 LLVMConstInt(ctx->i32, 0, 0), "");
4309 }
4310
/* Fetch the resource, sampler state and FMASK descriptors for a texture
 * instruction, handling both direct and indirect sampler indexing.
 *
 * samp_ptr and fmask_ptr may be NULL if the caller does not need them.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	/* The sampler is always the last source operand. */
	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		/* Indirect indexing: load the descriptors at runtime. */
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Bound the index to avoid reading past the descriptor array. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			/* MSAA targets use an FMASK and no sampler state. */
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		/* Direct indexing: use the preloaded descriptor values. */
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4358
/* Fetch arguments for a TGSI TXQ (texture size query) opcode. */
static void txq_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef res_ptr;
	LLVMValueRef address;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Read the size from the buffer descriptor directly. */
		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
		emit_data->args[0] = get_buffer_size(bld_base, res);
		return;
	}

	/* Textures - set the mip level. */
	address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

	set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
			   NULL, &address, 1, 0xf);
}
4386
/* Emit a TGSI TXQ opcode using the arguments set up by txq_fetch_args. */
static void txq_emit(const struct lp_build_tgsi_action *action,
		     struct lp_build_tgsi_context *bld_base,
		     struct lp_build_emit_data *emit_data)
{
	struct lp_build_context *base = &bld_base->base;
	unsigned target = emit_data->inst->Texture.Texture;

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Just return the buffer size. */
		emit_data->output[emit_data->chan] = emit_data->args[0];
		return;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
		base->gallivm->builder, "llvm.SI.getresinfo.i32",
		emit_data->dst_type, emit_data->args, emit_data->arg_count,
		LLVMReadNoneAttribute);

	/* Divide the number of layers by 6 to get the number of cubes. */
	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
		LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);

		LLVMValueRef v4 = emit_data->output[emit_data->chan];
		LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
		z = LLVMBuildSDiv(builder, z, six, "");

		emit_data->output[emit_data->chan] =
			LLVMBuildInsertElement(builder, v4, z, two, "");
	}
}
4420
4421 static void tex_fetch_args(
4422 struct lp_build_tgsi_context *bld_base,
4423 struct lp_build_emit_data *emit_data)
4424 {
4425 struct si_shader_context *ctx = si_shader_context(bld_base);
4426 struct gallivm_state *gallivm = bld_base->base.gallivm;
4427 const struct tgsi_full_instruction *inst = emit_data->inst;
4428 unsigned opcode = inst->Instruction.Opcode;
4429 unsigned target = inst->Texture.Texture;
4430 LLVMValueRef coords[5], derivs[6];
4431 LLVMValueRef address[16];
4432 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4433 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4434 unsigned count = 0;
4435 unsigned chan;
4436 unsigned num_deriv_channels = 0;
4437 bool has_offset = inst->Texture.NumOffsets > 0;
4438 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4439 unsigned dmask = 0xf;
4440
4441 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4442
4443 if (target == TGSI_TEXTURE_BUFFER) {
4444 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4445
4446 /* Bitcast and truncate v8i32 to v16i8. */
4447 LLVMValueRef res = res_ptr;
4448 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4449 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4450 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4451
4452 emit_data->dst_type = ctx->v4f32;
4453 emit_data->args[0] = res;
4454 emit_data->args[1] = bld_base->uint_bld.zero;
4455 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4456 emit_data->arg_count = 3;
4457 return;
4458 }
4459
4460 /* Fetch and project texture coordinates */
4461 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4462 for (chan = 0; chan < 3; chan++ ) {
4463 coords[chan] = lp_build_emit_fetch(bld_base,
4464 emit_data->inst, 0,
4465 chan);
4466 if (opcode == TGSI_OPCODE_TXP)
4467 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4468 TGSI_OPCODE_DIV,
4469 coords[chan],
4470 coords[3]);
4471 }
4472
4473 if (opcode == TGSI_OPCODE_TXP)
4474 coords[3] = bld_base->base.one;
4475
4476 /* Pack offsets. */
4477 if (has_offset && opcode != TGSI_OPCODE_TXF) {
4478 /* The offsets are six-bit signed integers packed like this:
4479 * X=[5:0], Y=[13:8], and Z=[21:16].
4480 */
4481 LLVMValueRef offset[3], pack;
4482
4483 assert(inst->Texture.NumOffsets == 1);
4484
4485 for (chan = 0; chan < 3; chan++) {
4486 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4487 emit_data->inst, 0, chan);
4488 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4489 lp_build_const_int32(gallivm, 0x3f), "");
4490 if (chan)
4491 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4492 lp_build_const_int32(gallivm, chan*8), "");
4493 }
4494
4495 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4496 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4497 address[count++] = pack;
4498 }
4499
4500 /* Pack LOD bias value */
4501 if (opcode == TGSI_OPCODE_TXB)
4502 address[count++] = coords[3];
4503 if (opcode == TGSI_OPCODE_TXB2)
4504 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4505
4506 /* Pack depth comparison value */
4507 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4508 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4509 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4510 } else {
4511 assert(ref_pos >= 0);
4512 address[count++] = coords[ref_pos];
4513 }
4514 }
4515
4516 /* Pack user derivatives */
4517 if (opcode == TGSI_OPCODE_TXD) {
4518 int param, num_src_deriv_channels;
4519
4520 switch (target) {
4521 case TGSI_TEXTURE_3D:
4522 num_src_deriv_channels = 3;
4523 num_deriv_channels = 3;
4524 break;
4525 case TGSI_TEXTURE_2D:
4526 case TGSI_TEXTURE_SHADOW2D:
4527 case TGSI_TEXTURE_RECT:
4528 case TGSI_TEXTURE_SHADOWRECT:
4529 case TGSI_TEXTURE_2D_ARRAY:
4530 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4531 num_src_deriv_channels = 2;
4532 num_deriv_channels = 2;
4533 break;
4534 case TGSI_TEXTURE_CUBE:
4535 case TGSI_TEXTURE_SHADOWCUBE:
4536 case TGSI_TEXTURE_CUBE_ARRAY:
4537 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4538 /* Cube derivatives will be converted to 2D. */
4539 num_src_deriv_channels = 3;
4540 num_deriv_channels = 2;
4541 break;
4542 case TGSI_TEXTURE_1D:
4543 case TGSI_TEXTURE_SHADOW1D:
4544 case TGSI_TEXTURE_1D_ARRAY:
4545 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4546 num_src_deriv_channels = 1;
4547 num_deriv_channels = 1;
4548 break;
4549 default:
4550 unreachable("invalid target");
4551 }
4552
4553 for (param = 0; param < 2; param++)
4554 for (chan = 0; chan < num_src_deriv_channels; chan++)
4555 derivs[param * num_src_deriv_channels + chan] =
4556 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4557 }
4558
4559 if (target == TGSI_TEXTURE_CUBE ||
4560 target == TGSI_TEXTURE_CUBE_ARRAY ||
4561 target == TGSI_TEXTURE_SHADOWCUBE ||
4562 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4563 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4564
4565 if (opcode == TGSI_OPCODE_TXD)
4566 for (int i = 0; i < num_deriv_channels * 2; i++)
4567 address[count++] = derivs[i];
4568
4569 /* Pack texture coordinates */
4570 address[count++] = coords[0];
4571 if (num_coords > 1)
4572 address[count++] = coords[1];
4573 if (num_coords > 2)
4574 address[count++] = coords[2];
4575
4576 /* Pack LOD or sample index */
4577 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4578 address[count++] = coords[3];
4579 else if (opcode == TGSI_OPCODE_TXL2)
4580 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4581
4582 if (count > 16) {
4583 assert(!"Cannot handle more than 16 texture address parameters");
4584 count = 16;
4585 }
4586
4587 for (chan = 0; chan < count; chan++ ) {
4588 address[chan] = LLVMBuildBitCast(gallivm->builder,
4589 address[chan], ctx->i32, "");
4590 }
4591
4592 /* Adjust the sample index according to FMASK.
4593 *
4594 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4595 * which is the identity mapping. Each nibble says which physical sample
4596 * should be fetched to get that sample.
4597 *
4598 * For example, 0x11111100 means there are only 2 samples stored and
4599 * the second sample covers 3/4 of the pixel. When reading samples 0
4600 * and 1, return physical sample 0 (determined by the first two 0s
4601 * in FMASK), otherwise return physical sample 1.
4602 *
4603 * The sample index should be adjusted as follows:
4604 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4605 */
4606 if (target == TGSI_TEXTURE_2D_MSAA ||
4607 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4608 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4609 struct lp_build_emit_data txf_emit_data = *emit_data;
4610 LLVMValueRef txf_address[4];
4611 unsigned txf_count = count;
4612 struct tgsi_full_instruction inst = {};
4613
4614 memcpy(txf_address, address, sizeof(txf_address));
4615
4616 if (target == TGSI_TEXTURE_2D_MSAA) {
4617 txf_address[2] = bld_base->uint_bld.zero;
4618 }
4619 txf_address[3] = bld_base->uint_bld.zero;
4620
4621 /* Read FMASK using TXF. */
4622 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4623 inst.Texture.Texture = target;
4624 txf_emit_data.inst = &inst;
4625 txf_emit_data.chan = 0;
4626 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4627 target, fmask_ptr, NULL,
4628 txf_address, txf_count, 0xf);
4629 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4630
4631 /* Initialize some constants. */
4632 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4633 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4634
4635 /* Apply the formula. */
4636 LLVMValueRef fmask =
4637 LLVMBuildExtractElement(gallivm->builder,
4638 txf_emit_data.output[0],
4639 uint_bld->zero, "");
4640
4641 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4642
4643 LLVMValueRef sample_index4 =
4644 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4645
4646 LLVMValueRef shifted_fmask =
4647 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4648
4649 LLVMValueRef final_sample =
4650 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4651
4652 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4653 * resource descriptor is 0 (invalid),
4654 */
4655 LLVMValueRef fmask_desc =
4656 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4657 ctx->v8i32, "");
4658
4659 LLVMValueRef fmask_word1 =
4660 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4661 uint_bld->one, "");
4662
4663 LLVMValueRef word1_is_nonzero =
4664 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4665 fmask_word1, uint_bld->zero, "");
4666
4667 /* Replace the MSAA sample index. */
4668 address[sample_chan] =
4669 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4670 final_sample, address[sample_chan], "");
4671 }
4672
4673 if (opcode == TGSI_OPCODE_TXF) {
4674 /* add tex offsets */
4675 if (inst->Texture.NumOffsets) {
4676 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4677 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4678 const struct tgsi_texture_offset *off = inst->TexOffsets;
4679
4680 assert(inst->Texture.NumOffsets == 1);
4681
4682 switch (target) {
4683 case TGSI_TEXTURE_3D:
4684 address[2] = lp_build_add(uint_bld, address[2],
4685 bld->immediates[off->Index][off->SwizzleZ]);
4686 /* fall through */
4687 case TGSI_TEXTURE_2D:
4688 case TGSI_TEXTURE_SHADOW2D:
4689 case TGSI_TEXTURE_RECT:
4690 case TGSI_TEXTURE_SHADOWRECT:
4691 case TGSI_TEXTURE_2D_ARRAY:
4692 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4693 address[1] =
4694 lp_build_add(uint_bld, address[1],
4695 bld->immediates[off->Index][off->SwizzleY]);
4696 /* fall through */
4697 case TGSI_TEXTURE_1D:
4698 case TGSI_TEXTURE_SHADOW1D:
4699 case TGSI_TEXTURE_1D_ARRAY:
4700 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4701 address[0] =
4702 lp_build_add(uint_bld, address[0],
4703 bld->immediates[off->Index][off->SwizzleX]);
4704 break;
4705 /* texture offsets do not apply to other texture targets */
4706 }
4707 }
4708 }
4709
4710 if (opcode == TGSI_OPCODE_TG4) {
4711 unsigned gather_comp = 0;
4712
4713 /* DMASK was repurposed for GATHER4. 4 components are always
4714 * returned and DMASK works like a swizzle - it selects
4715 * the component to fetch. The only valid DMASK values are
4716 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4717 * (red,red,red,red) etc.) The ISA document doesn't mention
4718 * this.
4719 */
4720
4721 /* Get the component index from src1.x for Gather4. */
4722 if (!tgsi_is_shadow_target(target)) {
4723 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4724 LLVMValueRef comp_imm;
4725 struct tgsi_src_register src1 = inst->Src[1].Register;
4726
4727 assert(src1.File == TGSI_FILE_IMMEDIATE);
4728
4729 comp_imm = imms[src1.Index][src1.SwizzleX];
4730 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4731 gather_comp = CLAMP(gather_comp, 0, 3);
4732 }
4733
4734 dmask = 1 << gather_comp;
4735 }
4736
4737 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4738 samp_ptr, address, count, dmask);
4739 }
4740
4741 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4742 struct lp_build_tgsi_context *bld_base,
4743 struct lp_build_emit_data *emit_data)
4744 {
4745 struct si_shader_context *ctx = si_shader_context(bld_base);
4746 struct lp_build_context *base = &bld_base->base;
4747 unsigned opcode = emit_data->inst->Instruction.Opcode;
4748 unsigned target = emit_data->inst->Texture.Texture;
4749 char intr_name[127];
4750 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4751 bool is_shadow = tgsi_is_shadow_target(target);
4752 char type[64];
4753 const char *name = "llvm.SI.image.sample";
4754 const char *infix = "";
4755
4756 if (target == TGSI_TEXTURE_BUFFER) {
4757 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4758 base->gallivm->builder,
4759 "llvm.SI.vs.load.input", emit_data->dst_type,
4760 emit_data->args, emit_data->arg_count,
4761 LLVMReadNoneAttribute);
4762 return;
4763 }
4764
4765 switch (opcode) {
4766 case TGSI_OPCODE_TXF:
4767 name = target == TGSI_TEXTURE_2D_MSAA ||
4768 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4769 "llvm.SI.image.load" :
4770 "llvm.SI.image.load.mip";
4771 is_shadow = false;
4772 has_offset = false;
4773 break;
4774 case TGSI_OPCODE_LODQ:
4775 name = "llvm.SI.getlod";
4776 is_shadow = false;
4777 has_offset = false;
4778 break;
4779 case TGSI_OPCODE_TEX:
4780 case TGSI_OPCODE_TEX2:
4781 case TGSI_OPCODE_TXP:
4782 if (ctx->type != PIPE_SHADER_FRAGMENT)
4783 infix = ".lz";
4784 break;
4785 case TGSI_OPCODE_TXB:
4786 case TGSI_OPCODE_TXB2:
4787 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4788 infix = ".b";
4789 break;
4790 case TGSI_OPCODE_TXL:
4791 case TGSI_OPCODE_TXL2:
4792 infix = ".l";
4793 break;
4794 case TGSI_OPCODE_TXD:
4795 infix = ".d";
4796 break;
4797 case TGSI_OPCODE_TG4:
4798 name = "llvm.SI.gather4";
4799 infix = ".lz";
4800 break;
4801 default:
4802 assert(0);
4803 return;
4804 }
4805
4806 /* Add the type and suffixes .c, .o if needed. */
4807 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4808 sprintf(intr_name, "%s%s%s%s.%s",
4809 name, is_shadow ? ".c" : "", infix,
4810 has_offset ? ".o" : "", type);
4811
4812 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4813 base->gallivm->builder, intr_name, emit_data->dst_type,
4814 emit_data->args, emit_data->arg_count,
4815 LLVMReadNoneAttribute);
4816 }
4817
4818 static void si_llvm_emit_txqs(
4819 const struct lp_build_tgsi_action *action,
4820 struct lp_build_tgsi_context *bld_base,
4821 struct lp_build_emit_data *emit_data)
4822 {
4823 struct si_shader_context *ctx = si_shader_context(bld_base);
4824 struct gallivm_state *gallivm = bld_base->base.gallivm;
4825 LLVMBuilderRef builder = gallivm->builder;
4826 LLVMValueRef res, samples;
4827 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4828
4829 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4830
4831
4832 /* Read the samples from the descriptor directly. */
4833 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4834 samples = LLVMBuildExtractElement(
4835 builder, res,
4836 lp_build_const_int32(gallivm, 3), "");
4837 samples = LLVMBuildLShr(builder, samples,
4838 lp_build_const_int32(gallivm, 16), "");
4839 samples = LLVMBuildAnd(builder, samples,
4840 lp_build_const_int32(gallivm, 0xf), "");
4841 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4842 samples, "");
4843
4844 emit_data->output[emit_data->chan] = samples;
4845 }
4846
4847 /*
4848 * SI implements derivatives using the local data store (LDS)
4849 * All writes to the LDS happen in all executing threads at
4850 * the same time. TID is the Thread ID for the current
4851 * thread and is a value between 0 and 63, representing
4852 * the thread's position in the wavefront.
4853 *
4854 * For the pixel shader threads are grouped into quads of four pixels.
4855 * The TIDs of the pixels of a quad are:
4856 *
4857 * +------+------+
4858 * |4n + 0|4n + 1|
4859 * +------+------+
4860 * |4n + 2|4n + 3|
4861 * +------+------+
4862 *
4863 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4864 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4865 * the current pixel's column, and masking with 0xfffffffe yields the TID
4866 * of the left pixel of the current pixel's row.
4867 *
4868 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4869 * adding 2 yields the TID of the pixel below the top pixel.
4870 */
4871 /* masks for thread ID. */
4872 #define TID_MASK_TOP_LEFT 0xfffffffc
4873 #define TID_MASK_TOP 0xfffffffd
4874 #define TID_MASK_LEFT 0xfffffffe
4875
/**
 * Emit DDX/DDY (coarse and fine derivatives).
 *
 * Each pixel in a quad exchanges its source value with the relevant
 * neighbor and computes result = neighbor_value - base_value.
 * With LLVM >= 3.9 on Tonga and later the exchange uses the
 * llvm.amdgcn.ds.bpermute intrinsic; otherwise each thread writes its
 * value to its LDS slot and reads back the neighbors' slots.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* This thread's own LDS slot. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Pick the base pixel of the difference: left pixel for fine DDX,
	 * top pixel for fine DDY, the quad's top-left pixel for the
	 * coarse variants (see the TID masks above). */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel, for DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* Reuse an earlier channel's result if it reads the same
		 * source swizzle. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* ds_bpermute takes byte addresses, hence tid * 4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Exchange through LDS: every thread stores its
			 * value, then loads the neighbors' slots. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4965
4966 /*
4967 * this takes an I,J coordinate pair,
4968 * and works out the X and Y derivatives.
4969 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4970 */
4971 static LLVMValueRef si_llvm_emit_ddxy_interp(
4972 struct lp_build_tgsi_context *bld_base,
4973 LLVMValueRef interp_ij)
4974 {
4975 struct si_shader_context *ctx = si_shader_context(bld_base);
4976 struct gallivm_state *gallivm = bld_base->base.gallivm;
4977 LLVMValueRef indices[2];
4978 LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
4979 LLVMValueRef tl, tr, bl, result[4];
4980 unsigned c;
4981
4982 indices[0] = bld_base->uint_bld.zero;
4983 indices[1] = get_thread_id(ctx);
4984 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4985 indices, 2, "");
4986
4987 temp = LLVMBuildAnd(gallivm->builder, indices[1],
4988 lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
4989
4990 temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
4991 lp_build_const_int32(gallivm, TID_MASK_TOP), "");
4992
4993 indices[1] = temp;
4994 load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
4995 indices, 2, "");
4996
4997 indices[1] = temp2;
4998 load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
4999 indices, 2, "");
5000
5001 indices[1] = LLVMBuildAdd(gallivm->builder, temp,
5002 lp_build_const_int32(gallivm, 1), "");
5003 load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
5004 indices, 2, "");
5005
5006 indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
5007 lp_build_const_int32(gallivm, 2), "");
5008 load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
5009 indices, 2, "");
5010
5011 for (c = 0; c < 2; ++c) {
5012 LLVMValueRef store_val;
5013 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
5014
5015 store_val = LLVMBuildExtractElement(gallivm->builder,
5016 interp_ij, c_ll, "");
5017 LLVMBuildStore(gallivm->builder,
5018 store_val,
5019 store_ptr);
5020
5021 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
5022 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5023
5024 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
5025 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
5026
5027 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
5028
5029 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
5030 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5031
5032 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
5033 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
5034
5035 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
5036 }
5037
5038 return lp_build_gather_values(gallivm, result, 4);
5039 }
5040
5041 static void interp_fetch_args(
5042 struct lp_build_tgsi_context *bld_base,
5043 struct lp_build_emit_data *emit_data)
5044 {
5045 struct si_shader_context *ctx = si_shader_context(bld_base);
5046 struct gallivm_state *gallivm = bld_base->base.gallivm;
5047 const struct tgsi_full_instruction *inst = emit_data->inst;
5048
5049 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5050 /* offset is in second src, first two channels */
5051 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5052 emit_data->inst, 1,
5053 TGSI_CHAN_X);
5054 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5055 emit_data->inst, 1,
5056 TGSI_CHAN_Y);
5057 emit_data->arg_count = 2;
5058 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5059 LLVMValueRef sample_position;
5060 LLVMValueRef sample_id;
5061 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5062
5063 /* fetch sample ID, then fetch its sample position,
5064 * and place into first two channels.
5065 */
5066 sample_id = lp_build_emit_fetch(bld_base,
5067 emit_data->inst, 1, TGSI_CHAN_X);
5068 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5069 ctx->i32, "");
5070 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5071
5072 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5073 sample_position,
5074 lp_build_const_int32(gallivm, 0), "");
5075
5076 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5077 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5078 sample_position,
5079 lp_build_const_int32(gallivm, 1), "");
5080 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5081 emit_data->arg_count = 2;
5082 }
5083 }
5084
/**
 * Emit TGSI_OPCODE_INTERP_{CENTROID,OFFSET,SAMPLE} for a fragment input.
 *
 * Selects the barycentric (I,J) parameter matching the input's
 * interpolation mode and the opcode's location (center for OFFSET and
 * SAMPLE, centroid otherwise). For OFFSET/SAMPLE, the (I,J) pair is
 * re-derived by applying the requested offset to its screen-space
 * derivatives, then each output channel is produced with
 * llvm.SI.fs.interp (or llvm.SI.fs.constant for flat inputs).
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* OFFSET and SAMPLE start from the pixel center. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = get_interp_param(ctx, interp_param_idx);
	else
		interp_param = NULL; /* flat input: no (I,J) needed */

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	/* NOTE(review): only output channels 0-1 are written here — confirm
	 * callers never consume the .zw components of INTERP_* results. */
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute);
	}
}
5183
5184 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5185 struct lp_build_emit_data *emit_data)
5186 {
5187 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5188 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5189 unsigned stream;
5190
5191 assert(src0.File == TGSI_FILE_IMMEDIATE);
5192
5193 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5194 return stream;
5195 }
5196
/* Emit one vertex from the geometry shader */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	/* Store every declared output of this vertex, one dword at a
	 * time, into the stream's GSVS ring buffer. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Each (output, channel) pair owns a contiguous
			 * gs_max_out_vertices-element region of the ring;
			 * the slot within it is the vertex index. */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	/* Advance the per-stream emitted-vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, 0);
}
5274
5275 /* Cut one primitive from the geometry shader */
5276 static void si_llvm_emit_primitive(
5277 const struct lp_build_tgsi_action *action,
5278 struct lp_build_tgsi_context *bld_base,
5279 struct lp_build_emit_data *emit_data)
5280 {
5281 struct si_shader_context *ctx = si_shader_context(bld_base);
5282 struct gallivm_state *gallivm = bld_base->base.gallivm;
5283 LLVMValueRef args[2];
5284 unsigned stream;
5285
5286 /* Signal primitive cut */
5287 stream = si_llvm_get_stream(bld_base, emit_data);
5288 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5289 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5290 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5291 ctx->voidt, args, 2, 0);
5292 }
5293
5294 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5295 struct lp_build_tgsi_context *bld_base,
5296 struct lp_build_emit_data *emit_data)
5297 {
5298 struct si_shader_context *ctx = si_shader_context(bld_base);
5299 struct gallivm_state *gallivm = bld_base->base.gallivm;
5300
5301 /* The real barrier instruction isn’t needed, because an entire patch
5302 * always fits into a single wave.
5303 */
5304 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5305 emit_optimization_barrier(ctx);
5306 return;
5307 }
5308
5309 lp_build_intrinsic(gallivm->builder,
5310 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5311 : "llvm.AMDGPU.barrier.local",
5312 ctx->voidt, NULL, 0, 0);
5313 }
5314
/* Action table for TGSI texture opcodes. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5319
/* Action table for TGSI_OPCODE_INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5324
/**
 * Create the main LLVM function for the current shader stage and
 * annotate its parameters.
 *
 * Parameters 0..last_sgpr are annotated as uniform inputs: pointer
 * parameters get ByVal + dereferenceable, everything else gets InReg.
 * The remaining parameters (VGPR inputs) are left unannotated.
 */
static void si_create_function(struct si_shader_context *ctx,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_sgpr)
{
	int i;

	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
				params, num_params);
	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
	/* Start from an undef return value; epilog code fills it in. */
	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			LLVMAddAttribute(P, LLVMByValAttribute);
			lp_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			LLVMAddAttribute(P, LLVMInRegAttribute);
	}

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5370
5371 static void create_meta_data(struct si_shader_context *ctx)
5372 {
5373 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5374
5375 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5376 "invariant.load", 14);
5377 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5378 "range", 5);
5379 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5380 "amdgpu.uniform", 14);
5381
5382 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5383 }
5384
5385 static void declare_streamout_params(struct si_shader_context *ctx,
5386 struct pipe_stream_output_info *so,
5387 LLVMTypeRef *params, LLVMTypeRef i32,
5388 unsigned *num_params)
5389 {
5390 int i;
5391
5392 /* Streamout SGPRs. */
5393 if (so->num_outputs) {
5394 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5395 params[ctx->param_streamout_config = (*num_params)++] = i32;
5396 else
5397 ctx->param_streamout_config = ctx->param_tess_offchip;
5398
5399 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5400 }
5401 /* A streamout buffer offset is loaded if the stride is non-zero. */
5402 for (i = 0; i < 4; i++) {
5403 if (!so->stride[i])
5404 continue;
5405
5406 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5407 }
5408 }
5409
5410 static unsigned llvm_get_type_size(LLVMTypeRef type)
5411 {
5412 LLVMTypeKind kind = LLVMGetTypeKind(type);
5413
5414 switch (kind) {
5415 case LLVMIntegerTypeKind:
5416 return LLVMGetIntTypeWidth(type) / 8;
5417 case LLVMFloatTypeKind:
5418 return 4;
5419 case LLVMPointerTypeKind:
5420 return 8;
5421 case LLVMVectorTypeKind:
5422 return LLVMGetVectorSize(type) *
5423 llvm_get_type_size(LLVMGetElementType(type));
5424 default:
5425 assert(0);
5426 return 0;
5427 }
5428 }
5429
5430 static void declare_tess_lds(struct si_shader_context *ctx)
5431 {
5432 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5433 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5434 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5435
5436 /* The actual size is computed outside of the shader to reduce
5437 * the number of shader variants. */
5438 ctx->lds =
5439 LLVMAddGlobalInAddressSpace(gallivm->module,
5440 LLVMArrayType(i32, lds_size / 4),
5441 "tess_lds",
5442 LOCAL_ADDR_SPACE);
5443 }
5444
5445 static void create_function(struct si_shader_context *ctx)
5446 {
5447 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5448 struct gallivm_state *gallivm = bld_base->base.gallivm;
5449 struct si_shader *shader = ctx->shader;
5450 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5451 LLVMTypeRef returns[16+32*4];
5452 unsigned i, last_sgpr, num_params, num_return_sgprs;
5453 unsigned num_returns = 0;
5454
5455 v3i32 = LLVMVectorType(ctx->i32, 3);
5456
5457 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5458 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5459 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5460 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5461 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5462
5463 switch (ctx->type) {
5464 case PIPE_SHADER_VERTEX:
5465 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5466 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5467 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5468 params[SI_PARAM_DRAWID] = ctx->i32;
5469 num_params = SI_PARAM_DRAWID+1;
5470
5471 if (shader->key.vs.as_es) {
5472 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5473 } else if (shader->key.vs.as_ls) {
5474 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5475 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5476 } else {
5477 if (ctx->is_gs_copy_shader) {
5478 num_params = SI_PARAM_RW_BUFFERS+1;
5479 } else {
5480 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5481 num_params = SI_PARAM_VS_STATE_BITS+1;
5482 }
5483
5484 /* The locations of the other parameters are assigned dynamically. */
5485 declare_streamout_params(ctx, &shader->selector->so,
5486 params, ctx->i32, &num_params);
5487 }
5488
5489 last_sgpr = num_params-1;
5490
5491 /* VGPRs */
5492 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5493 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5494 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5495 params[ctx->param_instance_id = num_params++] = ctx->i32;
5496
5497 if (!ctx->is_monolithic &&
5498 !ctx->is_gs_copy_shader) {
5499 /* Vertex load indices. */
5500 ctx->param_vertex_index0 = num_params;
5501
5502 for (i = 0; i < shader->selector->info.num_inputs; i++)
5503 params[num_params++] = ctx->i32;
5504
5505 /* PrimitiveID output. */
5506 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5507 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5508 returns[num_returns++] = ctx->f32;
5509 }
5510 break;
5511
5512 case PIPE_SHADER_TESS_CTRL:
5513 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5514 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5515 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5516 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5517 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5518 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5519 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5520
5521 /* VGPRs */
5522 params[SI_PARAM_PATCH_ID] = ctx->i32;
5523 params[SI_PARAM_REL_IDS] = ctx->i32;
5524 num_params = SI_PARAM_REL_IDS+1;
5525
5526 if (!ctx->is_monolithic) {
5527 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5528 * placed after the user SGPRs.
5529 */
5530 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5531 returns[num_returns++] = ctx->i32; /* SGPRs */
5532
5533 for (i = 0; i < 3; i++)
5534 returns[num_returns++] = ctx->f32; /* VGPRs */
5535 }
5536 break;
5537
5538 case PIPE_SHADER_TESS_EVAL:
5539 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5540 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5541
5542 if (shader->key.tes.as_es) {
5543 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5544 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5545 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5546 } else {
5547 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5548 declare_streamout_params(ctx, &shader->selector->so,
5549 params, ctx->i32, &num_params);
5550 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5551 }
5552 last_sgpr = num_params - 1;
5553
5554 /* VGPRs */
5555 params[ctx->param_tes_u = num_params++] = ctx->f32;
5556 params[ctx->param_tes_v = num_params++] = ctx->f32;
5557 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5558 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5559
5560 /* PrimitiveID output. */
5561 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5562 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5563 returns[num_returns++] = ctx->f32;
5564 break;
5565
5566 case PIPE_SHADER_GEOMETRY:
5567 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5568 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5569 last_sgpr = SI_PARAM_GS_WAVE_ID;
5570
5571 /* VGPRs */
5572 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5573 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5574 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5575 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5576 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5577 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5578 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5579 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5580 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5581 break;
5582
5583 case PIPE_SHADER_FRAGMENT:
5584 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5585 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5586 last_sgpr = SI_PARAM_PRIM_MASK;
5587 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5588 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5589 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5590 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5591 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5592 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5593 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5594 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5595 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5596 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5597 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5598 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5599 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5600 params[SI_PARAM_ANCILLARY] = ctx->i32;
5601 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5602 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5603 num_params = SI_PARAM_POS_FIXED_PT+1;
5604
5605 if (!ctx->is_monolithic) {
5606 /* Color inputs from the prolog. */
5607 if (shader->selector->info.colors_read) {
5608 unsigned num_color_elements =
5609 util_bitcount(shader->selector->info.colors_read);
5610
5611 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5612 for (i = 0; i < num_color_elements; i++)
5613 params[num_params++] = ctx->f32;
5614 }
5615
5616 /* Outputs for the epilog. */
5617 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5618 num_returns =
5619 num_return_sgprs +
5620 util_bitcount(shader->selector->info.colors_written) * 4 +
5621 shader->selector->info.writes_z +
5622 shader->selector->info.writes_stencil +
5623 shader->selector->info.writes_samplemask +
5624 1 /* SampleMaskIn */;
5625
5626 num_returns = MAX2(num_returns,
5627 num_return_sgprs +
5628 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5629
5630 for (i = 0; i < num_return_sgprs; i++)
5631 returns[i] = ctx->i32;
5632 for (; i < num_returns; i++)
5633 returns[i] = ctx->f32;
5634 }
5635 break;
5636
5637 case PIPE_SHADER_COMPUTE:
5638 params[SI_PARAM_GRID_SIZE] = v3i32;
5639 params[SI_PARAM_BLOCK_ID] = v3i32;
5640 last_sgpr = SI_PARAM_BLOCK_ID;
5641
5642 params[SI_PARAM_THREAD_ID] = v3i32;
5643 num_params = SI_PARAM_THREAD_ID + 1;
5644 break;
5645 default:
5646 assert(0 && "unimplemented shader");
5647 return;
5648 }
5649
5650 assert(num_params <= ARRAY_SIZE(params));
5651
5652 si_create_function(ctx, returns, num_returns, params,
5653 num_params, last_sgpr);
5654
5655 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5656 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5657 !ctx->is_monolithic) {
5658 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5659 "InitialPSInputAddr",
5660 S_0286D0_PERSP_SAMPLE_ENA(1) |
5661 S_0286D0_PERSP_CENTER_ENA(1) |
5662 S_0286D0_PERSP_CENTROID_ENA(1) |
5663 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5664 S_0286D0_LINEAR_CENTER_ENA(1) |
5665 S_0286D0_LINEAR_CENTROID_ENA(1) |
5666 S_0286D0_FRONT_FACE_ENA(1) |
5667 S_0286D0_POS_FIXED_PT_ENA(1));
5668 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5669 const unsigned *properties = shader->selector->info.properties;
5670 unsigned max_work_group_size =
5671 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5672 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5673 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5674
5675 assert(max_work_group_size);
5676
5677 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5678 "amdgpu-max-work-group-size",
5679 max_work_group_size);
5680 }
5681
5682 shader->info.num_input_sgprs = 0;
5683 shader->info.num_input_vgprs = 0;
5684
5685 for (i = 0; i <= last_sgpr; ++i)
5686 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5687
5688 /* Unused fragment shader inputs are eliminated by the compiler,
5689 * so we don't know yet how many there will be.
5690 */
5691 if (ctx->type != PIPE_SHADER_FRAGMENT)
5692 for (; i < num_params; ++i)
5693 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5694
5695 if (bld_base->info &&
5696 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5697 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5698 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5699 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5700 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5701 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5702 ctx->lds =
5703 LLVMAddGlobalInAddressSpace(gallivm->module,
5704 LLVMArrayType(ctx->i32, 64),
5705 "ddxy_lds",
5706 LOCAL_ADDR_SPACE);
5707
5708 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5709 ctx->type == PIPE_SHADER_TESS_CTRL ||
5710 ctx->type == PIPE_SHADER_TESS_EVAL)
5711 declare_tess_lds(ctx);
5712 }
5713
/* Preload all declared constant buffers: load each buffer's descriptor and
 * every dword constant up front, relying on LLVM code sinking to eliminate
 * the loads that end up unused.
 */
static void preload_constants(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned buf;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);

	for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
		/* const_file_max is the highest index used, -1 if unused. */
		unsigned i, num_const = info->const_file_max[buf] + 1;

		if (num_const == 0)
			continue;

		/* Allocate space for the constant values */
		/* NOTE(review): the CALLOC result is not checked — an OOM
		 * here leads to a NULL dereference in the loop below. */
		ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));

		/* Load the resource descriptor */
		ctx->const_buffers[buf] =
			build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));

		/* Load the constants, we rely on the code sinking to do the rest */
		for (i = 0; i < num_const * 4; ++i) {
			ctx->constants[buf][i] =
				buffer_load_const(ctx,
					ctx->const_buffers[buf],
					lp_build_const_int32(gallivm, i * 4));
		}
	}
}
5744
5745 static void preload_shader_buffers(struct si_shader_context *ctx)
5746 {
5747 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5748 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5749 int buf, maxbuf;
5750
5751 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5752 SI_NUM_SHADER_BUFFERS - 1);
5753 for (buf = 0; buf <= maxbuf; ++buf) {
5754 ctx->shader_buffers[buf] =
5755 build_indexed_load_const(
5756 ctx, ptr, lp_build_const_int32(gallivm, buf));
5757 }
5758 }
5759
/* Preload resource, FMASK, and sampler-state descriptors for every sampler
 * the shader declares; LLVM code sinking removes the unused loads.
 */
static void preload_samplers(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
	LLVMValueRef offset;

	if (num_samplers == 0)
		return;

	/* Load the resources and samplers, we rely on the code sinking to do the rest */
	for (i = 0; i < num_samplers; ++i) {
		/* Resource */
		offset = lp_build_const_int32(gallivm, i);
		ctx->sampler_views[i] =
			get_sampler_desc(ctx, offset, DESC_IMAGE);

		/* FMASK resource */
		if (info->is_msaa_sampler[i])
			ctx->fmasks[i] =
				get_sampler_desc(ctx, offset, DESC_FMASK);
		else {
			/* NOTE(review): sampler_states[i] is only loaded for
			 * non-MSAA samplers — presumably MSAA fetches go
			 * through FMASK and don't need a sampler state;
			 * confirm against the texture fetch path. */
			ctx->sampler_states[i] =
				get_sampler_desc(ctx, offset, DESC_SAMPLER);
			/* Patch the sampler state against the view; the helper
			 * name suggests an SI/CI anisotropy fixup. */
			ctx->sampler_states[i] =
				sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
						       ctx->sampler_states[i]);
		}
	}
}
5791
5792 static void preload_images(struct si_shader_context *ctx)
5793 {
5794 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5795 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5796 struct gallivm_state *gallivm = bld_base->base.gallivm;
5797 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5798 LLVMValueRef res_ptr;
5799 unsigned i;
5800
5801 if (num_images == 0)
5802 return;
5803
5804 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5805
5806 for (i = 0; i < num_images; ++i) {
5807 /* Rely on LLVM to shrink the load for buffer resources. */
5808 LLVMValueRef rsrc =
5809 build_indexed_load_const(ctx, res_ptr,
5810 lp_build_const_int32(gallivm, i));
5811
5812 if (info->images_writemask & (1 << i) &&
5813 !(info->images_buffers & (1 << i)))
5814 rsrc = force_dcc_off(ctx, rsrc);
5815
5816 ctx->images[i] = rsrc;
5817 }
5818 }
5819
5820 static void preload_streamout_buffers(struct si_shader_context *ctx)
5821 {
5822 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5823 struct gallivm_state *gallivm = bld_base->base.gallivm;
5824 unsigned i;
5825
5826 /* Streamout can only be used if the shader is compiled as VS. */
5827 if (!ctx->shader->selector->so.num_outputs ||
5828 (ctx->type == PIPE_SHADER_VERTEX &&
5829 (ctx->shader->key.vs.as_es ||
5830 ctx->shader->key.vs.as_ls)) ||
5831 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5832 ctx->shader->key.tes.as_es))
5833 return;
5834
5835 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5836 SI_PARAM_RW_BUFFERS);
5837
5838 /* Load the resources, we rely on the code sinking to do the rest */
5839 for (i = 0; i < 4; ++i) {
5840 if (ctx->shader->selector->so.stride[i]) {
5841 LLVMValueRef offset = lp_build_const_int32(gallivm,
5842 SI_VS_STREAMOUT_BUF0 + i);
5843
5844 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5845 }
5846 }
5847 }
5848
5849 /**
5850 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5851 * for later use.
5852 */
5853 static void preload_ring_buffers(struct si_shader_context *ctx)
5854 {
5855 struct gallivm_state *gallivm =
5856 ctx->radeon_bld.soa.bld_base.base.gallivm;
5857
5858 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5859 SI_PARAM_RW_BUFFERS);
5860
5861 if ((ctx->type == PIPE_SHADER_VERTEX &&
5862 ctx->shader->key.vs.as_es) ||
5863 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5864 ctx->shader->key.tes.as_es) ||
5865 ctx->type == PIPE_SHADER_GEOMETRY) {
5866 unsigned ring =
5867 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5868 : SI_ES_RING_ESGS;
5869 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5870
5871 ctx->esgs_ring =
5872 build_indexed_load_const(ctx, buf_ptr, offset);
5873 }
5874
5875 if (ctx->is_gs_copy_shader) {
5876 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5877
5878 ctx->gsvs_ring[0] =
5879 build_indexed_load_const(ctx, buf_ptr, offset);
5880 }
5881 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5882 int i;
5883 for (i = 0; i < 4; i++) {
5884 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5885
5886 ctx->gsvs_ring[i] =
5887 build_indexed_load_const(ctx, buf_ptr, offset);
5888 }
5889 }
5890 }
5891
/* Emit code that kills the fragment when the corresponding bit in the 32x32
 * polygon stipple pattern is zero.
 *
 * \param param_rw_buffers    function parameter holding the RW buffer
 *                            descriptor array
 * \param param_pos_fixed_pt  function parameter holding the fixed-point
 *                            fragment position
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
	desc = build_indexed_load_const(ctx, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(ctx, desc, offset);
	row = LLVMBuildBitCast(builder, row, ctx->i32, "");
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	/* bit set -> 0 (keep), bit clear -> -1 (kill). */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
5926
/* Parse the config section LLVM emitted into the shader binary and fill in
 * the shader config (register values, SGPR/VGPR counts, LDS and scratch
 * sizes).
 *
 * \param binary         compiled shader binary containing the config section
 * \param conf           output; several fields accumulate with MAX2, so this
 *                       may be called for multiple symbols/parts
 * \param symbol_offset  offset of the symbol whose config is read
 */
void si_shader_binary_read_config(struct radeon_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		radeon_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct radeon_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* The config section is a sequence of little-endian
	 * (register, value) dword pairs. */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* SGPRS/VGPRS are encoded in groups of 8/4 regs. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode = G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			/* Ignore LLVM's scratch size if only spills caused it
			 * (see really_needs_scratch above). */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn only once per process about unknown registers. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* Fall back to INPUT_ENA if LLVM didn't emit INPUT_ADDR. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
6009
/* Patch the two scratch buffer resource descriptor dwords into the shader
 * binary at every relocation that references the scratch rsrc symbols.
 *
 * \param scratch_va  GPU virtual address of the scratch buffer
 */
void si_shader_apply_scratch_relocs(struct si_context *sctx,
			struct si_shader *shader,
			struct si_shader_config *config,
			uint64_t scratch_va)
{
	unsigned i;
	uint32_t scratch_rsrc_dword0 = scratch_va;
	uint32_t scratch_rsrc_dword1 =
		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);

	/* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
	 * correctly.
	 */
	if (HAVE_LLVM >= 0x0309)
		scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
	else
		/* Older LLVM: encode the per-wave stride instead. */
		scratch_rsrc_dword1 |=
			S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);

	/* Write both descriptor dwords wherever the relocations point. */
	for (i = 0 ; i < shader->binary.reloc_count; i++) {
		const struct radeon_shader_reloc *reloc =
					&shader->binary.relocs[i];
		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword0, 4);
		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
			&scratch_rsrc_dword1, 4);
		}
	}
}
6041
6042 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6043 {
6044 unsigned size = shader->binary.code_size;
6045
6046 if (shader->prolog)
6047 size += shader->prolog->binary.code_size;
6048 if (shader->epilog)
6049 size += shader->epilog->binary.code_size;
6050 return size;
6051 }
6052
6053 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6054 {
6055 const struct radeon_shader_binary *prolog =
6056 shader->prolog ? &shader->prolog->binary : NULL;
6057 const struct radeon_shader_binary *epilog =
6058 shader->epilog ? &shader->epilog->binary : NULL;
6059 const struct radeon_shader_binary *mainb = &shader->binary;
6060 unsigned bo_size = si_get_shader_binary_size(shader) +
6061 (!epilog ? mainb->rodata_size : 0);
6062 unsigned char *ptr;
6063
6064 assert(!prolog || !prolog->rodata_size);
6065 assert((!prolog && !epilog) || !mainb->rodata_size);
6066 assert(!epilog || !epilog->rodata_size);
6067
6068 r600_resource_reference(&shader->bo, NULL);
6069 shader->bo = si_resource_create_custom(&sscreen->b.b,
6070 PIPE_USAGE_IMMUTABLE,
6071 bo_size);
6072 if (!shader->bo)
6073 return -ENOMEM;
6074
6075 /* Upload. */
6076 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6077 PIPE_TRANSFER_READ_WRITE);
6078
6079 if (prolog) {
6080 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6081 ptr += prolog->code_size;
6082 }
6083
6084 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6085 ptr += mainb->code_size;
6086
6087 if (epilog)
6088 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6089 else if (mainb->rodata_size > 0)
6090 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6091
6092 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6093 return 0;
6094 }
6095
/* Print the disassembly of one shader part to \p file and forward it line by
 * line to the debug callback; falls back to a raw hex dump of the binary when
 * no disassembly string is available.
 *
 * NOTE(review): the hex-dump loop reads code[i+1..i+3] in steps of 4, which
 * assumes code_size is dword-aligned — confirm that guarantee.
 */
static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				p = util_strchrnul(line, '\n');
				count = p - line;

				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		fprintf(file, "Shader %s binary:\n", name);
		/* Dump one little-endian dword per line. */
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
6143
/* Compute and print shader resource statistics (register usage, LDS,
 * scratch, estimated max waves per SIMD) to \p file and to the debug
 * callback.
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
			         struct si_shader_config *conf,
				 unsigned num_inputs,
				 unsigned code_size,
			         struct pipe_debug_callback *debug,
			         unsigned processor,
				 FILE *file)
{
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	if (processor == PIPE_SHADER_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		/* 256 VGPRs per SIMD are shared between all waves. */
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs);
}
6223
6224 static const char *si_get_shader_name(struct si_shader *shader,
6225 unsigned processor)
6226 {
6227 switch (processor) {
6228 case PIPE_SHADER_VERTEX:
6229 if (shader->key.vs.as_es)
6230 return "Vertex Shader as ES";
6231 else if (shader->key.vs.as_ls)
6232 return "Vertex Shader as LS";
6233 else
6234 return "Vertex Shader as VS";
6235 case PIPE_SHADER_TESS_CTRL:
6236 return "Tessellation Control Shader";
6237 case PIPE_SHADER_TESS_EVAL:
6238 if (shader->key.tes.as_es)
6239 return "Tessellation Evaluation Shader as ES";
6240 else
6241 return "Tessellation Evaluation Shader as VS";
6242 case PIPE_SHADER_GEOMETRY:
6243 if (shader->gs_copy_shader == NULL)
6244 return "GS Copy Shader as VS";
6245 else
6246 return "Geometry Shader";
6247 case PIPE_SHADER_FRAGMENT:
6248 return "Pixel Shader";
6249 case PIPE_SHADER_COMPUTE:
6250 return "Compute Shader";
6251 default:
6252 return "Unknown Shader";
6253 }
6254 }
6255
/* Dump everything known about a shader to \p file: the shader key, the saved
 * LLVM IR (if recorded), the disassembly of all parts, and the resource
 * statistics.  Output to stderr is gated on the per-stage debug flags.
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
		    struct pipe_debug_callback *debug, unsigned processor,
		    FILE *file)
{
	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor))
		si_dump_shader_key(processor, &shader->key, file);

	if (file != stderr && shader->binary.llvm_ir_string) {
		fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
			si_get_shader_name(shader, processor));
		fprintf(file, "%s\n", shader->binary.llvm_ir_string);
	}

	if (file != stderr ||
	    (r600_can_dump_shader(&sscreen->b, processor) &&
	     !(sscreen->b.debug_flags & DBG_NO_ASM))) {
		fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));

		if (shader->prolog)
			si_shader_dump_disassembly(&shader->prolog->binary,
						   debug, "prolog", file);

		si_shader_dump_disassembly(&shader->binary, debug, "main", file);

		if (shader->epilog)
			si_shader_dump_disassembly(&shader->epilog->binary,
						   debug, "epilog", file);
		fprintf(file, "\n");
	}

	si_shader_dump_stats(sscreen, &shader->config,
			     shader->selector ? shader->selector->info.num_inputs : 0,
			     si_get_shader_binary_size(shader), debug, processor,
			     file);
}
6292
/* Compile an LLVM module to a shader binary and parse its config section.
 *
 * \param binary     output; receives the compiled code and (optionally) the
 *                   saved LLVM IR string
 * \param conf       output; filled from the binary's config section
 * \param name       human-readable shader name used in debug output
 * \return 0 on success, non-zero on compile failure, -EINVAL when the shader
 *         has rodata but its binary may be concatenated with other parts
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	if (sscreen->record_llvm_ir) {
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* si_replace_shader may substitute a binary loaded from disk;
	 * compile only if it didn't. */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary, tm, debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The raw config data has been parsed; drop it. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
6362
6363 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6364 {
6365 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6366 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6367 else
6368 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6369 }
6370
6371 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6372 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6373 struct si_shader_context *ctx,
6374 struct si_shader *gs,
6375 struct pipe_debug_callback *debug)
6376 {
6377 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6378 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6379 struct lp_build_context *uint = &bld_base->uint_bld;
6380 struct si_shader_output_values *outputs;
6381 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6382 LLVMValueRef args[9];
6383 int i, r;
6384
6385 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6386
6387 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6388 ctx->type = PIPE_SHADER_VERTEX;
6389 ctx->is_gs_copy_shader = true;
6390
6391 create_meta_data(ctx);
6392 create_function(ctx);
6393 preload_streamout_buffers(ctx);
6394 preload_ring_buffers(ctx);
6395
6396 args[0] = ctx->gsvs_ring[0];
6397 args[1] = lp_build_mul_imm(uint,
6398 LLVMGetParam(ctx->radeon_bld.main_fn,
6399 ctx->param_vertex_id),
6400 4);
6401 args[3] = uint->zero;
6402 args[4] = uint->one; /* OFFEN */
6403 args[5] = uint->zero; /* IDXEN */
6404 args[6] = uint->one; /* GLC */
6405 args[7] = uint->one; /* SLC */
6406 args[8] = uint->zero; /* TFE */
6407
6408 /* Fetch vertex data from GSVS ring */
6409 for (i = 0; i < gsinfo->num_outputs; ++i) {
6410 unsigned chan;
6411
6412 outputs[i].name = gsinfo->output_semantic_name[i];
6413 outputs[i].sid = gsinfo->output_semantic_index[i];
6414
6415 for (chan = 0; chan < 4; chan++) {
6416 args[2] = lp_build_const_int32(gallivm,
6417 (i * 4 + chan) *
6418 gs->selector->gs_max_out_vertices * 16 * 4);
6419
6420 outputs[i].values[chan] =
6421 LLVMBuildBitCast(gallivm->builder,
6422 lp_build_intrinsic(gallivm->builder,
6423 "llvm.SI.buffer.load.dword.i32.i32",
6424 ctx->i32, args, 9,
6425 LLVMReadOnlyAttribute),
6426 ctx->f32, "");
6427 }
6428 }
6429
6430 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6431
6432 LLVMBuildRetVoid(gallivm->builder);
6433
6434 /* Dump LLVM IR before any optimization passes */
6435 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6436 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6437 LLVMDumpModule(bld_base->base.gallivm->module);
6438
6439 radeon_llvm_finalize_module(&ctx->radeon_bld);
6440
6441 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6442 &ctx->shader->config, ctx->tm,
6443 bld_base->base.gallivm->module,
6444 debug, PIPE_SHADER_GEOMETRY,
6445 "GS Copy Shader");
6446 if (!r) {
6447 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6448 fprintf(stderr, "GS Copy Shader:\n");
6449 si_shader_dump(sscreen, ctx->shader, debug,
6450 PIPE_SHADER_GEOMETRY, stderr);
6451 r = si_shader_binary_upload(sscreen, ctx->shader);
6452 }
6453
6454 radeon_llvm_dispose(&ctx->radeon_bld);
6455
6456 FREE(outputs);
6457 return r;
6458 }
6459
/* Print the shader variant key for one shader stage to \p f, for debugging.
 * Each stage only has the key fields relevant to it.
 */
static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
			       FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		fprintf(f, "  instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		/* GS and CS have no variant key fields to print. */
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
		fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
		fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
		fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}
}
6515
6516 static void si_init_shader_ctx(struct si_shader_context *ctx,
6517 struct si_screen *sscreen,
6518 struct si_shader *shader,
6519 LLVMTargetMachineRef tm)
6520 {
6521 struct lp_build_tgsi_context *bld_base;
6522 struct lp_build_tgsi_action tmpl = {};
6523
6524 memset(ctx, 0, sizeof(*ctx));
6525 radeon_llvm_context_init(
6526 &ctx->radeon_bld, "amdgcn--",
6527 (shader && shader->selector) ? &shader->selector->info : NULL);
6528 ctx->tm = tm;
6529 ctx->screen = sscreen;
6530 if (shader && shader->selector)
6531 ctx->type = shader->selector->info.processor;
6532 else
6533 ctx->type = -1;
6534 ctx->shader = shader;
6535
6536 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6537 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6538 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6539 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6540 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6541 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6542 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6543 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6544 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6545 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6546 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6547 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6548
6549 bld_base = &ctx->radeon_bld.soa.bld_base;
6550 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6551
6552 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6553 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6554 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6555
6556 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6557 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6558 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6559 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6560 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6561 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6562 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6563 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6564 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6565 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6566 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6567 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6568 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6569 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6570
6571 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6572 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6573 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6574 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6575 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6576 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6577
6578 tmpl.fetch_args = atomic_fetch_args;
6579 tmpl.emit = atomic_emit;
6580 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6581 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6582 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6583 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6584 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6585 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6586 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6587 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6588 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6589 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6590 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6591 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6592 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6593 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6594 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6595 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6596 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6597 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6598 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6599 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6600
6601 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6602
6603 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6604 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6605 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6606 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6607
6608 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6609 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6610 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6611
6612 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6613 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6614 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6615 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6616 }
6617
/**
 * Translate a complete TGSI shader to LLVM IR and compile it to machine code.
 *
 * \param sscreen	screen
 * \param tm		LLVM target machine
 * \param shader	the shader to compile; shader->selector must be set
 * \param is_monolithic	whether prolog/epilog code is baked into the main part
 * \param debug		debug callback
 * \return		0 on success, non-zero on failure
 */
int si_compile_tgsi_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader *shader,
			   bool is_monolithic,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	struct lp_build_tgsi_context *bld_base;
	LLVMModuleRef mod;
	int r = 0;

	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
	 * conversion fails. */
	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
		tgsi_dump(sel->tokens, 0);
		si_dump_streamout(&sel->so);
	}

	si_init_shader_ctx(&ctx, sscreen, shader, tm);
	ctx.is_monolithic = is_monolithic;

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	bld_base = &ctx.radeon_bld.soa.bld_base;
	ctx.radeon_bld.load_system_value = declare_system_value;

	/* Pick per-stage input-fetch and epilogue-emission callbacks. */
	switch (ctx.type) {
	case PIPE_SHADER_VERTEX:
		ctx.radeon_bld.load_input = declare_input_vs;
		if (shader->key.vs.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.vs.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.tes.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx.radeon_bld.load_input = declare_input_fs;
		if (is_monolithic)
			bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_return_fs_outputs;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx.radeon_bld.declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return -1;
	}

	/* Declare the LLVM function and preload descriptor/resource values. */
	create_meta_data(&ctx);
	create_function(&ctx);
	preload_constants(&ctx);
	preload_shader_buffers(&ctx);
	preload_samplers(&ctx);
	preload_images(&ctx);
	preload_streamout_buffers(&ctx);
	preload_ring_buffers(&ctx);

	/* Monolithic fragment shaders emit polygon stippling inline instead
	 * of in a separate prolog part. */
	if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
	    shader->key.ps.prolog.poly_stipple) {
		LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
						 SI_PARAM_RW_BUFFERS);
		si_llvm_emit_polygon_stipple(&ctx, list,
					     SI_PARAM_POS_FIXED_PT);
	}

	/* One emitted-vertex counter per GS stream (4 allocas). */
	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		int i;
		for (i = 0; i < 4; i++) {
			ctx.gs_next_vertex[i] =
				lp_build_alloca(bld_base->base.gallivm,
						ctx.i32, "");
		}
	}

	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
		/* NOTE(review): this path jumps to "out" without calling
		 * radeon_llvm_dispose(), which looks like it leaks the LLVM
		 * context/module — confirm who owns the cleanup here. */
		goto out;
	}

	si_llvm_build_ret(&ctx, ctx.return_value);
	mod = bld_base->base.gallivm->module;

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, ctx.type))
		LLVMDumpModule(mod);

	radeon_llvm_finalize_module(&ctx.radeon_bld);

	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
			    mod, debug, ctx.type, "TGSI shader");
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		/* NOTE(review): radeon_llvm_dispose() is also skipped on this
		 * error path — confirm. */
		goto out;
	}

	radeon_llvm_dispose(&ctx.radeon_bld);

	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
	 * LLVM 3.9svn has this bug.
	 */
	if (sel->type == PIPE_SHADER_COMPUTE) {
		unsigned *props = sel->info.properties;
		unsigned wave_size = 64;
		unsigned max_vgprs = 256;
		unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
		unsigned max_sgprs_per_wave = 128;
		unsigned min_waves_per_cu =
			DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
				     wave_size);
		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);

		/* Registers are divided among the waves resident on a SIMD. */
		max_vgprs = max_vgprs / min_waves_per_simd;
		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);

		if (shader->config.num_sgprs > max_sgprs ||
		    shader->config.num_vgprs > max_vgprs) {
			fprintf(stderr, "LLVM failed to compile a shader correctly: "
				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
				shader->config.num_sgprs, shader->config.num_vgprs,
				max_sgprs, max_vgprs);

			/* Just terminate the process, because dependent
			 * shaders can hang due to bad input data, but use
			 * the env var to allow shader-db to work.
			 */
			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
				abort();
		}
	}

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave)
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = 0;
		shader->info.face_vgpr_index = -1;

		/* Count VGPRs in SPI_PS_INPUT_ADDR bit order. */
		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
			/* Remember where the face VGPR lives for the prolog. */
			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
	}

	/* Geometry shaders also need a copy shader, compiled with the same
	 * context. */
	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
		/* NOTE(review): no NULL check before the dereference below —
		 * confirm whether allocation failure must be handled here. */
		shader->gs_copy_shader->selector = shader->selector;
		ctx.shader = shader->gs_copy_shader;
		if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
						    shader, debug))) {
			free(shader->gs_copy_shader);
			shader->gs_copy_shader = NULL;
			goto out;
		}
	}

out:
	/* Free the per-const-buffer arrays held in the context. */
	for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
		FREE(ctx.constants[i]);
	return r;
}
6835
6836 /**
6837 * Create, compile and return a shader part (prolog or epilog).
6838 *
6839 * \param sscreen screen
6840 * \param list list of shader parts of the same category
6841 * \param key shader part key
6842 * \param tm LLVM target machine
6843 * \param debug debug callback
6844 * \param compile the callback responsible for compilation
6845 * \return non-NULL on success
6846 */
6847 static struct si_shader_part *
6848 si_get_shader_part(struct si_screen *sscreen,
6849 struct si_shader_part **list,
6850 union si_shader_part_key *key,
6851 LLVMTargetMachineRef tm,
6852 struct pipe_debug_callback *debug,
6853 bool (*compile)(struct si_screen *,
6854 LLVMTargetMachineRef,
6855 struct pipe_debug_callback *,
6856 struct si_shader_part *))
6857 {
6858 struct si_shader_part *result;
6859
6860 pipe_mutex_lock(sscreen->shader_parts_mutex);
6861
6862 /* Find existing. */
6863 for (result = *list; result; result = result->next) {
6864 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6865 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6866 return result;
6867 }
6868 }
6869
6870 /* Compile a new one. */
6871 result = CALLOC_STRUCT(si_shader_part);
6872 result->key = *key;
6873 if (!compile(sscreen, tm, debug, result)) {
6874 FREE(result);
6875 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6876 return NULL;
6877 }
6878
6879 result->next = *list;
6880 *list = result;
6881 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6882 return result;
6883 }
6884
6885 /**
6886 * Create a vertex shader prolog.
6887 *
6888 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6889 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
6891 *
6892 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6893 * input_v0,
6894 * input_v1,
6895 * input_v2,
6896 * input_v3,
6897 * (VertexID + BaseVertex),
6898 * (InstanceID + StartInstance),
6899 * (InstanceID / 2 + StartInstance)
6900 */
6901 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6902 LLVMTargetMachineRef tm,
6903 struct pipe_debug_callback *debug,
6904 struct si_shader_part *out)
6905 {
6906 union si_shader_part_key *key = &out->key;
6907 struct si_shader shader = {};
6908 struct si_shader_context ctx;
6909 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6910 LLVMTypeRef *params, *returns;
6911 LLVMValueRef ret, func;
6912 int last_sgpr, num_params, num_returns, i;
6913 bool status = true;
6914
6915 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6916 ctx.type = PIPE_SHADER_VERTEX;
6917 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6918 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6919
6920 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6921 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6922 sizeof(LLVMTypeRef));
6923 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6924 key->vs_prolog.last_input + 1) *
6925 sizeof(LLVMTypeRef));
6926 num_params = 0;
6927 num_returns = 0;
6928
6929 /* Declare input and output SGPRs. */
6930 num_params = 0;
6931 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6932 params[num_params++] = ctx.i32;
6933 returns[num_returns++] = ctx.i32;
6934 }
6935 last_sgpr = num_params - 1;
6936
6937 /* 4 preloaded VGPRs (outputs must be floats) */
6938 for (i = 0; i < 4; i++) {
6939 params[num_params++] = ctx.i32;
6940 returns[num_returns++] = ctx.f32;
6941 }
6942
6943 /* Vertex load indices. */
6944 for (i = 0; i <= key->vs_prolog.last_input; i++)
6945 returns[num_returns++] = ctx.f32;
6946
6947 /* Create the function. */
6948 si_create_function(&ctx, returns, num_returns, params,
6949 num_params, last_sgpr);
6950 func = ctx.radeon_bld.main_fn;
6951
6952 /* Copy inputs to outputs. This should be no-op, as the registers match,
6953 * but it will prevent the compiler from overwriting them unintentionally.
6954 */
6955 ret = ctx.return_value;
6956 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6957 LLVMValueRef p = LLVMGetParam(func, i);
6958 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6959 }
6960 for (i = num_params - 4; i < num_params; i++) {
6961 LLVMValueRef p = LLVMGetParam(func, i);
6962 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6963 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6964 }
6965
6966 /* Compute vertex load indices from instance divisors. */
6967 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6968 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6969 LLVMValueRef index;
6970
6971 if (divisor) {
6972 /* InstanceID / Divisor + StartInstance */
6973 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6974 SI_SGPR_START_INSTANCE,
6975 divisor);
6976 } else {
6977 /* VertexID + BaseVertex */
6978 index = LLVMBuildAdd(gallivm->builder,
6979 LLVMGetParam(func, ctx.param_vertex_id),
6980 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6981 }
6982
6983 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6984 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6985 num_params++, "");
6986 }
6987
6988 /* Compile. */
6989 si_llvm_build_ret(&ctx, ret);
6990 radeon_llvm_finalize_module(&ctx.radeon_bld);
6991
6992 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6993 gallivm->module, debug, ctx.type,
6994 "Vertex Shader Prolog"))
6995 status = false;
6996
6997 radeon_llvm_dispose(&ctx.radeon_bld);
6998 return status;
6999 }
7000
7001 /**
7002 * Compile the vertex shader epilog. This is also used by the tessellation
7003 * evaluation shader compiled as VS.
7004 *
7005 * The input is PrimitiveID.
7006 *
7007 * If PrimitiveID is required by the pixel shader, export it.
7008 * Otherwise, do nothing.
7009 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Address of a member of the still-uninitialized ctx; the pointee is
	 * filled in by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	/* Set to false if compilation fails. */
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. */
	num_params = key->vs_epilog.states.export_prim_id ?
		(VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function (no return values, no user SGPRs). */
	si_create_function(&ctx, NULL, 0, params, num_params, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		/* Argument layout of the llvm.SI.export intrinsic. */
		LLVMValueRef args[9];

		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7072
7073 /**
7074 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7075 */
7076 static bool si_get_vs_epilog(struct si_screen *sscreen,
7077 LLVMTargetMachineRef tm,
7078 struct si_shader *shader,
7079 struct pipe_debug_callback *debug,
7080 struct si_vs_epilog_bits *states)
7081 {
7082 union si_shader_part_key epilog_key;
7083
7084 memset(&epilog_key, 0, sizeof(epilog_key));
7085 epilog_key.vs_epilog.states = *states;
7086
7087 /* Set up the PrimitiveID output. */
7088 if (shader->key.vs.epilog.export_prim_id) {
7089 unsigned index = shader->selector->info.num_outputs;
7090 unsigned offset = shader->info.nr_param_exports++;
7091
7092 epilog_key.vs_epilog.prim_id_param_offset = offset;
7093 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7094 shader->info.vs_output_param_offset[index] = offset;
7095 }
7096
7097 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7098 &epilog_key, tm, debug,
7099 si_compile_vs_epilog);
7100 return shader->epilog != NULL;
7101 }
7102
7103 /**
7104 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7105 */
7106 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7107 LLVMTargetMachineRef tm,
7108 struct si_shader *shader,
7109 struct pipe_debug_callback *debug)
7110 {
7111 struct tgsi_shader_info *info = &shader->selector->info;
7112 union si_shader_part_key prolog_key;
7113 unsigned i;
7114
7115 /* Get the prolog. */
7116 memset(&prolog_key, 0, sizeof(prolog_key));
7117 prolog_key.vs_prolog.states = shader->key.vs.prolog;
7118 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7119 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7120
7121 /* The prolog is a no-op if there are no inputs. */
7122 if (info->num_inputs) {
7123 shader->prolog =
7124 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7125 &prolog_key, tm, debug,
7126 si_compile_vs_prolog);
7127 if (!shader->prolog)
7128 return false;
7129 }
7130
7131 /* Get the epilog. */
7132 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7133 !si_get_vs_epilog(sscreen, tm, shader, debug,
7134 &shader->key.vs.epilog))
7135 return false;
7136
7137 /* Set the instanceID flag. */
7138 for (i = 0; i < info->num_inputs; i++)
7139 if (prolog_key.vs_prolog.states.instance_divisors[i])
7140 shader->info.uses_instanceid = true;
7141
7142 return true;
7143 }
7144
7145 /**
7146 * Select and compile (or reuse) TES parts (epilog).
7147 */
7148 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7149 LLVMTargetMachineRef tm,
7150 struct si_shader *shader,
7151 struct pipe_debug_callback *debug)
7152 {
7153 if (shader->key.tes.as_es)
7154 return true;
7155
7156 /* TES compiled as VS. */
7157 return si_get_vs_epilog(sscreen, tm, shader, debug,
7158 &shader->key.tes.epilog);
7159 }
7160
7161 /**
 * Compile the TCS epilog. This writes tessellation factors to memory based on
 * the output primitive type of the tessellator (determined by TES).
7164 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Address of a member of the still-uninitialized ctx; the pointee is
	 * filled in by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_sgpr, num_params;
	/* Set to false if compilation fails. */
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* Three VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors using the three VGPR inputs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7225
7226 /**
7227 * Select and compile (or reuse) TCS parts (epilog).
7228 */
7229 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7230 LLVMTargetMachineRef tm,
7231 struct si_shader *shader,
7232 struct pipe_debug_callback *debug)
7233 {
7234 union si_shader_part_key epilog_key;
7235
7236 /* Get the epilog. */
7237 memset(&epilog_key, 0, sizeof(epilog_key));
7238 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7239
7240 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7241 &epilog_key, tm, debug,
7242 si_compile_tcs_epilog);
7243 return shader->epilog != NULL;
7244 }
7245
7246 /**
7247 * Compile the pixel shader prolog. This handles:
7248 * - two-side color selection and interpolation
7249 * - overriding interpolation parameters for the API PS
7250 * - polygon stippling
7251 *
7252 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
7254 * Interpolated colors are stored after the preloaded VGPRs.
7255 */
7256 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7257 LLVMTargetMachineRef tm,
7258 struct pipe_debug_callback *debug,
7259 struct si_shader_part *out)
7260 {
7261 union si_shader_part_key *key = &out->key;
7262 struct si_shader shader = {};
7263 struct si_shader_context ctx;
7264 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7265 LLVMTypeRef *params;
7266 LLVMValueRef ret, func;
7267 int last_sgpr, num_params, num_returns, i, num_color_channels;
7268 bool status = true;
7269
7270 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7271 ctx.type = PIPE_SHADER_FRAGMENT;
7272 shader.key.ps.prolog = key->ps_prolog.states;
7273
7274 /* Number of inputs + 8 color elements. */
7275 params = alloca((key->ps_prolog.num_input_sgprs +
7276 key->ps_prolog.num_input_vgprs + 8) *
7277 sizeof(LLVMTypeRef));
7278
7279 /* Declare inputs. */
7280 num_params = 0;
7281 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7282 params[num_params++] = ctx.i32;
7283 last_sgpr = num_params - 1;
7284
7285 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7286 params[num_params++] = ctx.f32;
7287
7288 /* Declare outputs (same as inputs + add colors if needed) */
7289 num_returns = num_params;
7290 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7291 for (i = 0; i < num_color_channels; i++)
7292 params[num_returns++] = ctx.f32;
7293
7294 /* Create the function. */
7295 si_create_function(&ctx, params, num_returns, params,
7296 num_params, last_sgpr);
7297 func = ctx.radeon_bld.main_fn;
7298
7299 /* Copy inputs to outputs. This should be no-op, as the registers match,
7300 * but it will prevent the compiler from overwriting them unintentionally.
7301 */
7302 ret = ctx.return_value;
7303 for (i = 0; i < num_params; i++) {
7304 LLVMValueRef p = LLVMGetParam(func, i);
7305 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7306 }
7307
7308 /* Polygon stippling. */
7309 if (key->ps_prolog.states.poly_stipple) {
7310 /* POS_FIXED_PT is always last. */
7311 unsigned pos = key->ps_prolog.num_input_sgprs +
7312 key->ps_prolog.num_input_vgprs - 1;
7313 LLVMValueRef ptr[2], list;
7314
7315 /* Get the pointer to rw buffers. */
7316 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7317 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7318 list = lp_build_gather_values(gallivm, ptr, 2);
7319 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7320 list = LLVMBuildIntToPtr(gallivm->builder, list,
7321 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7322
7323 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7324 }
7325
7326 if (key->ps_prolog.states.bc_optimize_for_persp ||
7327 key->ps_prolog.states.bc_optimize_for_linear) {
7328 unsigned i, base = key->ps_prolog.num_input_sgprs;
7329 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7330
7331 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7332 * The hw doesn't compute CENTROID if the whole wave only
7333 * contains fully-covered quads.
7334 *
7335 * PRIM_MASK is after user SGPRs.
7336 */
7337 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7338 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7339 LLVMConstInt(ctx.i32, 31, 0), "");
7340 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7341 ctx.i1, "");
7342
7343 if (key->ps_prolog.states.bc_optimize_for_persp) {
7344 /* Read PERSP_CENTER. */
7345 for (i = 0; i < 2; i++)
7346 center[i] = LLVMGetParam(func, base + 2 + i);
7347 /* Read PERSP_CENTROID. */
7348 for (i = 0; i < 2; i++)
7349 centroid[i] = LLVMGetParam(func, base + 4 + i);
7350 /* Select PERSP_CENTROID. */
7351 for (i = 0; i < 2; i++) {
7352 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7353 center[i], centroid[i], "");
7354 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7355 tmp, base + 4 + i, "");
7356 }
7357 }
7358 if (key->ps_prolog.states.bc_optimize_for_linear) {
7359 /* Read LINEAR_CENTER. */
7360 for (i = 0; i < 2; i++)
7361 center[i] = LLVMGetParam(func, base + 8 + i);
7362 /* Read LINEAR_CENTROID. */
7363 for (i = 0; i < 2; i++)
7364 centroid[i] = LLVMGetParam(func, base + 10 + i);
7365 /* Select LINEAR_CENTROID. */
7366 for (i = 0; i < 2; i++) {
7367 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7368 center[i], centroid[i], "");
7369 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7370 tmp, base + 10 + i, "");
7371 }
7372 }
7373 }
7374
7375 /* Interpolate colors. */
7376 for (i = 0; i < 2; i++) {
7377 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7378 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7379 key->ps_prolog.face_vgpr_index;
7380 LLVMValueRef interp[2], color[4];
7381 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7382
7383 if (!writemask)
7384 continue;
7385
7386 /* If the interpolation qualifier is not CONSTANT (-1). */
7387 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7388 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7389 key->ps_prolog.color_interp_vgpr_index[i];
7390
7391 /* Get the (i,j) updated by bc_optimize handling. */
7392 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7393 interp_vgpr, "");
7394 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7395 interp_vgpr + 1, "");
7396 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7397 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7398 ctx.v2i32, "");
7399 }
7400
7401 /* Use the absolute location of the input. */
7402 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7403
7404 if (key->ps_prolog.states.color_two_side) {
7405 face = LLVMGetParam(func, face_vgpr);
7406 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7407 }
7408
7409 interp_fs_input(&ctx,
7410 key->ps_prolog.color_attr_index[i],
7411 TGSI_SEMANTIC_COLOR, i,
7412 key->ps_prolog.num_interp_inputs,
7413 key->ps_prolog.colors_read, interp_ij,
7414 prim_mask, face, color);
7415
7416 while (writemask) {
7417 unsigned chan = u_bit_scan(&writemask);
7418 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7419 num_params++, "");
7420 }
7421 }
7422
7423 /* Force per-sample interpolation. */
7424 if (key->ps_prolog.states.force_persp_sample_interp) {
7425 unsigned i, base = key->ps_prolog.num_input_sgprs;
7426 LLVMValueRef persp_sample[2];
7427
7428 /* Read PERSP_SAMPLE. */
7429 for (i = 0; i < 2; i++)
7430 persp_sample[i] = LLVMGetParam(func, base + i);
7431 /* Overwrite PERSP_CENTER. */
7432 for (i = 0; i < 2; i++)
7433 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7434 persp_sample[i], base + 2 + i, "");
7435 /* Overwrite PERSP_CENTROID. */
7436 for (i = 0; i < 2; i++)
7437 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7438 persp_sample[i], base + 4 + i, "");
7439 }
7440 if (key->ps_prolog.states.force_linear_sample_interp) {
7441 unsigned i, base = key->ps_prolog.num_input_sgprs;
7442 LLVMValueRef linear_sample[2];
7443
7444 /* Read LINEAR_SAMPLE. */
7445 for (i = 0; i < 2; i++)
7446 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7447 /* Overwrite LINEAR_CENTER. */
7448 for (i = 0; i < 2; i++)
7449 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7450 linear_sample[i], base + 8 + i, "");
7451 /* Overwrite LINEAR_CENTROID. */
7452 for (i = 0; i < 2; i++)
7453 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7454 linear_sample[i], base + 10 + i, "");
7455 }
7456
7457 /* Force center interpolation. */
7458 if (key->ps_prolog.states.force_persp_center_interp) {
7459 unsigned i, base = key->ps_prolog.num_input_sgprs;
7460 LLVMValueRef persp_center[2];
7461
7462 /* Read PERSP_CENTER. */
7463 for (i = 0; i < 2; i++)
7464 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7465 /* Overwrite PERSP_SAMPLE. */
7466 for (i = 0; i < 2; i++)
7467 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7468 persp_center[i], base + i, "");
7469 /* Overwrite PERSP_CENTROID. */
7470 for (i = 0; i < 2; i++)
7471 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7472 persp_center[i], base + 4 + i, "");
7473 }
7474 if (key->ps_prolog.states.force_linear_center_interp) {
7475 unsigned i, base = key->ps_prolog.num_input_sgprs;
7476 LLVMValueRef linear_center[2];
7477
7478 /* Read LINEAR_CENTER. */
7479 for (i = 0; i < 2; i++)
7480 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7481 /* Overwrite LINEAR_SAMPLE. */
7482 for (i = 0; i < 2; i++)
7483 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7484 linear_center[i], base + 6 + i, "");
7485 /* Overwrite LINEAR_CENTROID. */
7486 for (i = 0; i < 2; i++)
7487 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7488 linear_center[i], base + 10 + i, "");
7489 }
7490
7491 /* Tell LLVM to insert WQM instruction sequence when needed. */
7492 if (key->ps_prolog.wqm) {
7493 LLVMAddTargetDependentFunctionAttr(func,
7494 "amdgpu-ps-wqm-outputs", "");
7495 }
7496
7497 /* Compile. */
7498 si_llvm_build_ret(&ctx, ret);
7499 radeon_llvm_finalize_module(&ctx.radeon_bld);
7500
7501 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7502 gallivm->module, debug, ctx.type,
7503 "Fragment Shader Prolog"))
7504 status = false;
7505
7506 radeon_llvm_dispose(&ctx.radeon_bld);
7507 return status;
7508 }
7509
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * \param sscreen  screen the part is compiled for
 * \param tm       LLVM target machine to compile with
 * \param debug    debug callback for compiler diagnostics
 * \param out      shader part; supplies the epilog key (out->key) and
 *                 receives the compiled binary and config
 * \return true on success, false if LLVM compilation failed
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* 16 SGPR slots + 8 MRTs * 4 channels + Z, stencil, samplemask VGPRs */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params, i;
	bool status = true;
	struct si_ps_exports exp = {};

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: 4 channels per written color, plus one each
	 * for Z, stencil and samplemask when they are written. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Make sure the declaration reaches at least up to the samplemask
	 * slot, which the API shader always passes through (see the
	 * num_params - 1 use below). */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export. Only needed when Z/stencil/samplemask
	 * aren't exported, because then a color export is the final one. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* Pick the highest MRT that is both written and has a
			 * non-NONE export format (4 bits per MRT). */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Read 4 consecutive VGPRs per written color and export it. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1, /* samplemask param index */
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		/* Nothing else was exported; emit a null export. */
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(&ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7629
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Builds the prolog and epilog keys from the shader key and TGSI info,
 * fetches (or compiles) matching parts via si_get_shader_part, and then
 * fixes up spi_ps_input_ena so the enabled interpolation inputs match
 * what the selected parts actually consume.
 *
 * \return true on success, false if a part failed to compile
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* wqm makes the prolog request whole-quad-mode handling (it sets the
	 * "amdgpu-ps-wqm-outputs" attribute in si_compile_ps_prolog) when a
	 * derivative-using shader has any prolog-computed interpolants. */
	prolog_key.ps_prolog.wqm = info->uses_derivatives &&
		(prolog_key.ps_prolog.colors_read ||
		 prolog_key.ps_prolog.states.force_persp_sample_interp ||
		 prolog_key.ps_prolog.states.force_linear_sample_interp ||
		 prolog_key.ps_prolog.states.force_persp_center_interp ||
		 prolog_key.ps_prolog.states.force_linear_center_interp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Per color input: pick which (i,j) VGPR pair the prolog
		 * reads (-1 = constant, no interpolation) and enable the
		 * matching SPI input. The indices 0/2/4 and 6/8/10 are the
		 * PERSP and LINEAR sample/center/centroid pair offsets used
		 * by si_compile_ps_prolog. */
		for (i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persp_sample_interp ||
	    prolog_key.ps_prolog.states.force_linear_sample_interp ||
	    prolog_key.ps_prolog.states.force_persp_center_interp ||
	    prolog_key.ps_prolog.states.force_linear_center_interp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_persp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_linear ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. The epilog is always needed (unconditional). */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed:
	 * when an interpolation mode is forced, replace the original
	 * center/centroid/sample enables with the forced one. */
	if (shader->key.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7836
7837 static void si_fix_num_sgprs(struct si_shader *shader)
7838 {
7839 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7840
7841 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7842 }
7843
/**
 * Create a shader variant: either compile the whole TGSI shader
 * monolithically, or reuse the precompiled main part and attach
 * prolog/epilog parts, then dump and upload the final binary.
 *
 * \param sscreen  screen the shader belongs to
 * \param tm       LLVM target machine used for any compilation
 * \param shader   shader variant to create; shader->selector and
 *                 shader->key must be set
 * \param debug    debug callback for compiler diagnostics
 * \return 0 on success, non-zero on compile or upload failure
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. The binary is
		 * shared with the main part, so it must not be freed here
		 * (see is_binary_shared in si_shader_destroy). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. Stages without parts
		 * (e.g. GS) intentionally fall through. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the combined shader must
		 * satisfy the largest requirement of any of its parts. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7949
7950 void si_shader_destroy(struct si_shader *shader)
7951 {
7952 if (shader->gs_copy_shader) {
7953 si_shader_destroy(shader->gs_copy_shader);
7954 FREE(shader->gs_copy_shader);
7955 }
7956
7957 if (shader->scratch_bo)
7958 r600_resource_reference(&shader->scratch_bo, NULL);
7959
7960 r600_resource_reference(&shader->bo, NULL);
7961
7962 if (!shader->is_binary_shared)
7963 radeon_shader_binary_clean(&shader->binary);
7964
7965 free(shader->shader_log);
7966 }