radeonsi: clean up shader value metadata code
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "radeon/r600_cs.h"
37 #include "radeon/radeon_llvm.h"
38 #include "radeon/radeon_elf_util.h"
39 #include "radeon/radeon_llvm_emit.h"
40 #include "util/u_memory.h"
41 #include "util/u_pstipple.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* Relocation symbols for the two dwords of the scratch buffer resource
 * descriptor. NOTE(review): these appear to be patched with real values
 * when the shader binary is uploaded — confirm against the ELF/upload code.
 */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
/* One shader output register: its 4 channel values plus the TGSI semantic
 * (name + index) it belongs to. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one value per component (x, y, z, w) */
	unsigned name;		/* TGSI_SEMANTIC_* */
	unsigned sid;		/* semantic index */
};
66
struct si_shader_context
{
	/* Must be the first member: si_shader_context() casts a
	 * lp_build_tgsi_context pointer (embedded inside radeon_bld)
	 * directly to si_shader_context. */
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;	/* the shader being compiled */
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of the main function's input parameters; which ones are
	 * valid depends on the shader type. */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;

	/* LLVM metadata kind IDs and shared metadata nodes used to annotate
	 * loads (e.g. build_indexed_load tags uniform pointers, and
	 * build_indexed_load_const tags constant loads with TBAA metadata). */
	unsigned range_md_kind;
	unsigned tbaa_md_kind;
	unsigned uniform_md_kind;
	LLVMValueRef tbaa_const_md;
	LLVMValueRef empty_md;

	/* Cached descriptor/resource values, looked up once per function. */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently-used LLVM types, cached to avoid repeated lookups. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
138
/* Downcast the generic TGSI build context to our shader context.
 * Valid because radeon_bld is the first member of si_shader_context
 * (and bld_base is presumably at offset 0 inside radeon_llvm_context —
 * confirm in radeon_llvm.h). */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
144
/* Forward declarations for helpers defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

/* Bases/offsets presumably for the PS barycentric interpolation inputs —
 * confirm against the users below. */
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
/* NOTE(review): "OFSET" is a typo for "OFFSET"; renaming would require
 * updating every user of this macro, so it is left as-is here. */
#define CENTROID_OFSET 4

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* LLVM AMDGPU target address spaces. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* s_sendmsg message types and GS operations (op goes in bits [5:4]). */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
184
185 /**
186 * Returns a unique index for a semantic name and index. The index must be
187 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
188 * calculated.
189 */
190 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
191 {
192 switch (semantic_name) {
193 case TGSI_SEMANTIC_POSITION:
194 return 0;
195 case TGSI_SEMANTIC_PSIZE:
196 return 1;
197 case TGSI_SEMANTIC_CLIPDIST:
198 assert(index <= 1);
199 return 2 + index;
200 case TGSI_SEMANTIC_GENERIC:
201 if (index <= 63-4)
202 return 4 + index;
203 else
204 /* same explanation as in the default statement,
205 * the only user hitting this is st/nine.
206 */
207 return 0;
208
209 /* patch indices are completely separate and thus start from 0 */
210 case TGSI_SEMANTIC_TESSOUTER:
211 return 0;
212 case TGSI_SEMANTIC_TESSINNER:
213 return 1;
214 case TGSI_SEMANTIC_PATCH:
215 return 2 + index;
216
217 default:
218 /* Don't fail here. The result of this function is only used
219 * for LS, TCS, TES, and GS, where legacy GL semantics can't
220 * occur, but this function is called for all vertex shaders
221 * before it's known whether LS will be compiled or not.
222 */
223 return 0;
224 }
225 }
226
227 /**
228 * Get the value of a shader input parameter and extract a bitfield.
229 */
230 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
231 unsigned param, unsigned rshift,
232 unsigned bitwidth)
233 {
234 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
235 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
236 param);
237
238 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
239 value = bitcast(&ctx->radeon_bld.soa.bld_base,
240 TGSI_TYPE_UNSIGNED, value);
241
242 if (rshift)
243 value = LLVMBuildLShr(gallivm->builder, value,
244 lp_build_const_int32(gallivm, rshift), "");
245
246 if (rshift + bitwidth < 32) {
247 unsigned mask = (1 << bitwidth) - 1;
248 value = LLVMBuildAnd(gallivm->builder, value,
249 lp_build_const_int32(gallivm, mask), "");
250 }
251
252 return value;
253 }
254
255 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
256 {
257 switch (ctx->type) {
258 case PIPE_SHADER_TESS_CTRL:
259 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
260
261 case PIPE_SHADER_TESS_EVAL:
262 return LLVMGetParam(ctx->radeon_bld.main_fn,
263 ctx->param_tes_rel_patch_id);
264
265 default:
266 assert(0);
267 return NULL;
268 }
269 }
270
271 /* Tessellation shaders pass outputs to the next shader using LDS.
272 *
273 * LS outputs = TCS inputs
274 * TCS outputs = TES inputs
275 *
276 * The LDS layout is:
277 * - TCS inputs for patch 0
278 * - TCS inputs for patch 1
279 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
280 * - ...
281 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
282 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
283 * - TCS outputs for patch 1
284 * - Per-patch TCS outputs for patch 1
285 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
286 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
287 * - ...
288 *
289 * All three shaders VS(LS), TCS, TES share the same LDS space.
290 */
291
292 static LLVMValueRef
293 get_tcs_in_patch_stride(struct si_shader_context *ctx)
294 {
295 if (ctx->type == PIPE_SHADER_VERTEX)
296 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
297 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
298 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
299 else {
300 assert(0);
301 return NULL;
302 }
303 }
304
/* LDS stride of one TCS output patch, in dwords (13-bit layout field). */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
310
311 static LLVMValueRef
312 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
313 {
314 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
315 unpack_param(ctx,
316 SI_PARAM_TCS_OUT_OFFSETS,
317 0, 16),
318 4);
319 }
320
321 static LLVMValueRef
322 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
323 {
324 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
325 unpack_param(ctx,
326 SI_PARAM_TCS_OUT_OFFSETS,
327 16, 16),
328 4);
329 }
330
331 static LLVMValueRef
332 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
333 {
334 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
335 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
336 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
337
338 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
339 }
340
341 static LLVMValueRef
342 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
343 {
344 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
345 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
346 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
347 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
348
349 return LLVMBuildAdd(gallivm->builder, patch0_offset,
350 LLVMBuildMul(gallivm->builder, patch_stride,
351 rel_patch_id, ""),
352 "");
353 }
354
355 static LLVMValueRef
356 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
357 {
358 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
359 LLVMValueRef patch0_patch_data_offset =
360 get_tcs_out_patch0_patch_data_offset(ctx);
361 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
362 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
363
364 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
365 LLVMBuildMul(gallivm->builder, patch_stride,
366 rel_patch_id, ""),
367 "");
368 }
369
370 static void build_indexed_store(struct si_shader_context *ctx,
371 LLVMValueRef base_ptr, LLVMValueRef index,
372 LLVMValueRef value)
373 {
374 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
375 struct gallivm_state *gallivm = bld_base->base.gallivm;
376 LLVMValueRef indices[2], pointer;
377
378 indices[0] = bld_base->uint_bld.zero;
379 indices[1] = index;
380
381 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
382 LLVMBuildStore(gallivm->builder, value, pointer);
383 }
384
385 /**
386 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
387 * It's equivalent to doing a load from &base_ptr[index].
388 *
389 * \param base_ptr Where the array starts.
390 * \param index The element index into the array.
391 * \param uniform Whether the base_ptr and index can be assumed to be
392 * dynamically uniform
393 */
394 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
395 LLVMValueRef base_ptr, LLVMValueRef index,
396 bool uniform)
397 {
398 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
399 struct gallivm_state *gallivm = bld_base->base.gallivm;
400 LLVMValueRef indices[2], pointer;
401
402 indices[0] = bld_base->uint_bld.zero;
403 indices[1] = index;
404
405 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
406 if (uniform)
407 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
408 return LLVMBuildLoad(gallivm->builder, pointer, "");
409 }
410
411 /**
412 * Do a load from &base_ptr[index], but also add a flag that it's loading
413 * a constant from a dynamically uniform index.
414 */
415 static LLVMValueRef build_indexed_load_const(
416 struct si_shader_context *ctx,
417 LLVMValueRef base_ptr, LLVMValueRef index)
418 {
419 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
420 LLVMSetMetadata(result, ctx->tbaa_md_kind, ctx->tbaa_const_md);
421 return result;
422 }
423
424 static LLVMValueRef get_instance_index_for_fetch(
425 struct radeon_llvm_context *radeon_bld,
426 unsigned param_start_instance, unsigned divisor)
427 {
428 struct si_shader_context *ctx =
429 si_shader_context(&radeon_bld->soa.bld_base);
430 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
431
432 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
433 ctx->param_instance_id);
434
435 /* The division must be done before START_INSTANCE is added. */
436 if (divisor > 1)
437 result = LLVMBuildUDiv(gallivm->builder, result,
438 lp_build_const_int32(gallivm, divisor), "");
439
440 return LLVMBuildAdd(gallivm->builder, result,
441 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
442 }
443
/**
 * Declare one vertex shader input: fetch the vec4 attribute with
 * llvm.SI.vs.load.input and split it into the four SoA input channels.
 *
 * The buffer index is computed one of three ways:
 * - non-monolithic: taken from an extra input parameter filled in by
 *   the VS prolog,
 * - instanced attribute (divisor != 0): derived from InstanceID,
 * - otherwise: BaseVertex + VertexID.
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	/* Vertex buffer descriptor for this attribute. */
	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* The prolog computed the per-attribute index and passed it
		 * as an extra parameter starting at param_vertex_index0. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
512
513 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
514 unsigned swizzle)
515 {
516 struct si_shader_context *ctx = si_shader_context(bld_base);
517
518 if (swizzle > 0)
519 return bld_base->uint_bld.zero;
520
521 switch (ctx->type) {
522 case PIPE_SHADER_VERTEX:
523 return LLVMGetParam(ctx->radeon_bld.main_fn,
524 ctx->param_vs_prim_id);
525 case PIPE_SHADER_TESS_CTRL:
526 return LLVMGetParam(ctx->radeon_bld.main_fn,
527 SI_PARAM_PATCH_ID);
528 case PIPE_SHADER_TESS_EVAL:
529 return LLVMGetParam(ctx->radeon_bld.main_fn,
530 ctx->param_tes_patch_id);
531 case PIPE_SHADER_GEOMETRY:
532 return LLVMGetParam(ctx->radeon_bld.main_fn,
533 SI_PARAM_PRIMITIVE_ID);
534 default:
535 assert(0);
536 return bld_base->uint_bld.zero;
537 }
538 }
539
540 /**
541 * Return the value of tgsi_ind_register for indexing.
542 * This is the indirect index with the constant offset added to it.
543 */
544 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
545 const struct tgsi_ind_register *ind,
546 int rel_index)
547 {
548 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
549 LLVMValueRef result;
550
551 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
552 result = LLVMBuildLoad(gallivm->builder, result, "");
553 result = LLVMBuildAdd(gallivm->builder, result,
554 lp_build_const_int32(gallivm, rel_index), "");
555 return result;
556 }
557
558 /**
559 * Like get_indirect_index, but restricts the return value to a (possibly
560 * undefined) value inside [0..num).
561 */
562 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
563 const struct tgsi_ind_register *ind,
564 int rel_index, unsigned num)
565 {
566 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
567 LLVMBuilderRef builder = gallivm->builder;
568 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
569 LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
570 LLVMValueRef cc;
571
572 /* LLVM 3.8: If indirect resource indexing is used:
573 * - SI & CIK hang
574 * - VI crashes
575 */
576 if (HAVE_LLVM <= 0x0308)
577 return LLVMGetUndef(ctx->i32);
578
579 if (util_is_power_of_two(num)) {
580 result = LLVMBuildAnd(builder, result, c_max, "");
581 } else {
582 /* In theory, this MAX pattern should result in code that is
583 * as good as the bit-wise AND above.
584 *
585 * In practice, LLVM generates worse code (at the time of
586 * writing), because its value tracking is not strong enough.
587 */
588 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
589 result = LLVMBuildSelect(builder, cc, result, c_max, "");
590 }
591
592 return result;
593 }
594
595
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * Handles 2-dimensional registers (arrays of vertices) as well as indirect
 * addressing on both the vertex dimension and the register index.
 *
 * \param dst	destination register (used when \p src is NULL)
 * \param src	source register; takes precedence over \p dst
 * \param vertex_dw_stride	dword stride between consecutive vertices
 * \param base_addr	starting dword address that offsets are added onto
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		/* base_addr += vertex_index * vertex_dw_stride */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* For accesses into a declared array, the semantic is taken
		 * from the array's first element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register slot occupies 4 dwords. */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
680
681 /* The offchip buffer layout for TCS->TES is
682 *
683 * - attribute 0 of patch 0 vertex 0
684 * - attribute 0 of patch 0 vertex 1
685 * - attribute 0 of patch 0 vertex 2
686 * ...
687 * - attribute 0 of patch 1 vertex 0
688 * - attribute 0 of patch 1 vertex 1
689 * ...
690 * - attribute 1 of patch 0 vertex 0
691 * - attribute 1 of patch 0 vertex 1
692 * ...
693 * - per patch attribute 0 of patch 0
694 * - per patch attribute 0 of patch 1
695 * ...
696 *
697 * Note that every attribute has 4 components.
698 */
/**
 * Compute the byte address of one attribute in the TCS->TES offchip buffer,
 * following the layout described in the comment above (16 bytes per vec4
 * attribute).
 *
 * \param vertex_index	if non-NULL, address a per-vertex attribute;
 *			otherwise a per-patch attribute
 * \param param_index	index of the attribute (parameter)
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Counts are packed into SI_PARAM_TCS_OFFCHIP_LAYOUT. */
	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex: index by the global vertex number; one
		 * attribute's slots are total_vertices apart. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: index by the patch number; one attribute's
		 * slots are num_patches apart. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* Scale slots to bytes: 16 bytes per vec4 attribute. */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch attributes live after all per-vertex attributes;
		 * their start offset comes from the same layout parameter. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
741
/**
 * Compute the TCS->TES offchip buffer address for a TGSI register:
 * resolve the (possibly indirect) vertex and parameter indices, then
 * delegate to get_tcs_tes_buffer_address().
 *
 * Exactly one of \p dst / \p src is used; \p src takes precedence.
 */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
	struct si_shader_context *ctx,
	const struct tgsi_full_dst_register *dst,
	const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	/* Normalize to a source register; dst has the same addressing. */
	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2-dimensional register: the first dimension is the vertex index. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
							    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* For accesses into a declared array, the semantic is taken
		 * from the array's first element. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	/* Translate the semantic into the unique I/O slot index and add it
	 * to the (relative) parameter index. */
	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   lp_build_const_int32(gallivm, param_index_base),
				   "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
804
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * The remaining parameters map 1:1 onto the llvm.SI.tbuffer.store.*
 * intrinsic operands: resource descriptor, data, channel count, VGPR/imm
 * offsets, data/number formats and the offen/idxen/glc/slc/tfe bits.
 */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Argument order is fixed by the intrinsic definition. */
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;	/* 3 and 4 channels share v4i32 */
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, ARRAY_SIZE(args), 0);
}
852
853 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
854 LLVMValueRef rsrc,
855 LLVMValueRef vdata,
856 unsigned num_channels,
857 LLVMValueRef vaddr,
858 LLVMValueRef soffset,
859 unsigned inst_offset)
860 {
861 static unsigned dfmt[] = {
862 V_008F0C_BUF_DATA_FORMAT_32,
863 V_008F0C_BUF_DATA_FORMAT_32_32,
864 V_008F0C_BUF_DATA_FORMAT_32_32_32,
865 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
866 };
867 assert(num_channels >= 1 && num_channels <= 4);
868
869 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
870 inst_offset, dfmt[num_channels-1],
871 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
872 }
873
/**
 * Build a buffer load of 1, 2 or 4 dwords (3 is rounded up to 4).
 *
 * On LLVM >= 3.9 this emits llvm.amdgcn.buffer.load.{f32,v2f32,v4f32},
 * which takes one combined byte offset, so voffset/soffset/inst_offset
 * are summed into a single operand. On older LLVM it emits
 * llvm.SI.buffer.load.dword.* with explicit offen/idxen operands.
 *
 * \param vindex	optional element index into the resource
 * \param voffset	optional dynamic byte offset
 * \param soffset	optional scalar byte offset
 * \param inst_offset	constant byte offset
 * \param glc, slc	cache policy bits
 */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* 1 -> 0, 2 -> 1, 3/4 -> 2: index into the type tables below. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		/* Fold the dynamic and scalar offsets into the single
		 * offset operand (args[2]). */
		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	} else {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		/* With both an index and an offset, the address operand
		 * becomes a v2i32 <vindex, voffset> pair. */
		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	}
}
949
/**
 * Load a value of a TGSI type from a buffer resource.
 *
 * \param type		destination TGSI type
 * \param swizzle	component to load, or ~0 to load a whole vec4
 * \param buffer	buffer resource descriptor
 * \param offset	offset passed as soffset to build_buffer_load
 * \param base		offset passed as voffset to build_buffer_load
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		/* Whole vec4 requested: one 4-dword load (glc=1). */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* Load the vec4 and extract the requested component. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       lp_build_const_int32(gallivm, swizzle), "");
	}

	/* 64-bit type: load the two dwords at byte offsets swizzle*4 and
	 * swizzle*4 + 4 and combine them. */
	value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				  swizzle * 4, 1, 0);

	value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				   swizzle * 4 + 4, 1, 0);

	return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
985
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* Gather a whole vec4 by loading each channel separately. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values occupy two dwords. NOTE(review): the second
		 * address is (dw_addr + swizzle) + swizzle + 1 — presumably
		 * matching the 64-bit channel layout expected by
		 * radeon_llvm_emit_fetch_64bit's callers; confirm there. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
1026
1027 /**
1028 * Store to LDS.
1029 *
1030 * \param swizzle offset (typically 0..3)
1031 * \param dw_addr address in dwords
1032 * \param value value to store
1033 */
1034 static void lds_store(struct lp_build_tgsi_context *bld_base,
1035 unsigned swizzle, LLVMValueRef dw_addr,
1036 LLVMValueRef value)
1037 {
1038 struct si_shader_context *ctx = si_shader_context(bld_base);
1039 struct gallivm_state *gallivm = bld_base->base.gallivm;
1040
1041 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1042 lp_build_const_int32(gallivm, swizzle));
1043
1044 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1045 build_indexed_store(ctx, ctx->lds,
1046 dw_addr, value);
1047 }
1048
1049 static LLVMValueRef fetch_input_tcs(
1050 struct lp_build_tgsi_context *bld_base,
1051 const struct tgsi_full_src_register *reg,
1052 enum tgsi_opcode_type type, unsigned swizzle)
1053 {
1054 struct si_shader_context *ctx = si_shader_context(bld_base);
1055 LLVMValueRef dw_addr, stride;
1056
1057 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1058 dw_addr = get_tcs_in_current_patch_offset(ctx);
1059 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1060
1061 return lds_load(bld_base, type, swizzle, dw_addr);
1062 }
1063
1064 static LLVMValueRef fetch_output_tcs(
1065 struct lp_build_tgsi_context *bld_base,
1066 const struct tgsi_full_src_register *reg,
1067 enum tgsi_opcode_type type, unsigned swizzle)
1068 {
1069 struct si_shader_context *ctx = si_shader_context(bld_base);
1070 LLVMValueRef dw_addr, stride;
1071
1072 if (reg->Register.Dimension) {
1073 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1074 dw_addr = get_tcs_out_current_patch_offset(ctx);
1075 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1076 } else {
1077 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1078 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1079 }
1080
1081 return lds_load(bld_base, type, swizzle, dw_addr);
1082 }
1083
1084 static LLVMValueRef fetch_input_tes(
1085 struct lp_build_tgsi_context *bld_base,
1086 const struct tgsi_full_src_register *reg,
1087 enum tgsi_opcode_type type, unsigned swizzle)
1088 {
1089 struct si_shader_context *ctx = si_shader_context(bld_base);
1090 struct gallivm_state *gallivm = bld_base->base.gallivm;
1091 LLVMValueRef rw_buffers, buffer, base, addr;
1092
1093 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1094 SI_PARAM_RW_BUFFERS);
1095 buffer = build_indexed_load_const(ctx, rw_buffers,
1096 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1097
1098 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1099 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1100
1101 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1102 }
1103
/* Store a TCS output both to LDS (for readback by the TCS itself) and to
 * the off-chip tessellation ring buffer (for the TES).
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS dword address: per-vertex outputs need the
	 * output vertex stride from TCS_OUT_LAYOUT[20:13]; per-patch
	 * outputs live in the patch data area and have no stride.
	 */
	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Get the off-chip ring descriptor and the buffer address for the
	 * same output.
	 */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* Always store to LDS. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: emit one dword store per enabled
		 * channel.
		 */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: combine the channels into one vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1169
/* Fetch a GS input from the ESGS ring buffer, where the ES (VS or TES)
 * stored its outputs.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* The primitive ID is a system value, not stored in the ring. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	/* ~0 means "load all four channels": recurse per channel. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The offset VGPR is in dwords; convert to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit values occupy two consecutive channels; load the
		 * second dword and merge the two halves.
		 */
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1248
1249 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1250 {
1251 switch (interpolate) {
1252 case TGSI_INTERPOLATE_CONSTANT:
1253 return 0;
1254
1255 case TGSI_INTERPOLATE_LINEAR:
1256 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1257 return SI_PARAM_LINEAR_SAMPLE;
1258 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1259 return SI_PARAM_LINEAR_CENTROID;
1260 else
1261 return SI_PARAM_LINEAR_CENTER;
1262 break;
1263 case TGSI_INTERPOLATE_COLOR:
1264 case TGSI_INTERPOLATE_PERSPECTIVE:
1265 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1266 return SI_PARAM_PERSP_SAMPLE;
1267 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1268 return SI_PARAM_PERSP_CENTROID;
1269 else
1270 return SI_PARAM_PERSP_CENTER;
1271 break;
1272 default:
1273 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1274 return -1;
1275 }
1276 }
1277
1278 /* This shouldn't be used by explicit INTERP opcodes. */
1279 static unsigned select_interp_param(struct si_shader_context *ctx,
1280 unsigned param)
1281 {
1282 if (!ctx->is_monolithic)
1283 return param;
1284
1285 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1286 switch (param) {
1287 case SI_PARAM_PERSP_CENTROID:
1288 case SI_PARAM_PERSP_CENTER:
1289 return SI_PARAM_PERSP_SAMPLE;
1290 }
1291 }
1292 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1293 switch (param) {
1294 case SI_PARAM_LINEAR_CENTROID:
1295 case SI_PARAM_LINEAR_CENTER:
1296 return SI_PARAM_LINEAR_SAMPLE;
1297 }
1298 }
1299 if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1300 switch (param) {
1301 case SI_PARAM_PERSP_CENTROID:
1302 case SI_PARAM_PERSP_SAMPLE:
1303 return SI_PARAM_PERSP_CENTER;
1304 }
1305 }
1306 if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1307 switch (param) {
1308 case SI_PARAM_LINEAR_CENTROID:
1309 case SI_PARAM_LINEAR_SAMPLE:
1310 return SI_PARAM_LINEAR_CENTER;
1311 }
1312 }
1313
1314 return param;
1315 }
1316
1317 /**
1318 * Interpolate a fragment shader input.
1319 *
1320 * @param ctx context
1321 * @param input_index index of the input in hardware
1322 * @param semantic_name TGSI_SEMANTIC_*
1323 * @param semantic_index semantic index
1324 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1325 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1326 * @param interp_param interpolation weights (i,j)
1327 * @param prim_mask SI_PARAM_PRIM_MASK
1328 * @param face SI_PARAM_FRONT_FACE
1329 * @param result the return value (4 components)
1330 */
1331 static void interp_fs_input(struct si_shader_context *ctx,
1332 unsigned input_index,
1333 unsigned semantic_name,
1334 unsigned semantic_index,
1335 unsigned num_interp_inputs,
1336 unsigned colors_read_mask,
1337 LLVMValueRef interp_param,
1338 LLVMValueRef prim_mask,
1339 LLVMValueRef face,
1340 LLVMValueRef result[4])
1341 {
1342 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1343 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1344 struct gallivm_state *gallivm = base->gallivm;
1345 const char *intr_name;
1346 LLVMValueRef attr_number;
1347
1348 unsigned chan;
1349
1350 attr_number = lp_build_const_int32(gallivm, input_index);
1351
1352 /* fs.constant returns the param from the middle vertex, so it's not
1353 * really useful for flat shading. It's meant to be used for custom
1354 * interpolation (but the intrinsic can't fetch from the other two
1355 * vertices).
1356 *
1357 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1358 * to do the right thing. The only reason we use fs.constant is that
1359 * fs.interp cannot be used on integers, because they can be equal
1360 * to NaN.
1361 */
1362 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1363
1364 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1365 ctx->shader->key.ps.prolog.color_two_side) {
1366 LLVMValueRef args[4];
1367 LLVMValueRef is_face_positive;
1368 LLVMValueRef back_attr_number;
1369
1370 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1371 * otherwise it's at offset "num_inputs".
1372 */
1373 unsigned back_attr_offset = num_interp_inputs;
1374 if (semantic_index == 1 && colors_read_mask & 0xf)
1375 back_attr_offset += 1;
1376
1377 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1378
1379 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1380 face, uint->zero, "");
1381
1382 args[2] = prim_mask;
1383 args[3] = interp_param;
1384 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1385 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1386 LLVMValueRef front, back;
1387
1388 args[0] = llvm_chan;
1389 args[1] = attr_number;
1390 front = lp_build_intrinsic(gallivm->builder, intr_name,
1391 ctx->f32, args, args[3] ? 4 : 3,
1392 LLVMReadNoneAttribute);
1393
1394 args[1] = back_attr_number;
1395 back = lp_build_intrinsic(gallivm->builder, intr_name,
1396 ctx->f32, args, args[3] ? 4 : 3,
1397 LLVMReadNoneAttribute);
1398
1399 result[chan] = LLVMBuildSelect(gallivm->builder,
1400 is_face_positive,
1401 front,
1402 back,
1403 "");
1404 }
1405 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1406 LLVMValueRef args[4];
1407
1408 args[0] = uint->zero;
1409 args[1] = attr_number;
1410 args[2] = prim_mask;
1411 args[3] = interp_param;
1412 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1413 ctx->f32, args, args[3] ? 4 : 3,
1414 LLVMReadNoneAttribute);
1415 result[1] =
1416 result[2] = lp_build_const_float(gallivm, 0.0f);
1417 result[3] = lp_build_const_float(gallivm, 1.0f);
1418 } else {
1419 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1420 LLVMValueRef args[4];
1421 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1422
1423 args[0] = llvm_chan;
1424 args[1] = attr_number;
1425 args[2] = prim_mask;
1426 args[3] = interp_param;
1427 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1428 ctx->f32, args, args[3] ? 4 : 3,
1429 LLVMReadNoneAttribute);
1430 }
1431 }
1432 }
1433
/* LLVMGetParam with bc_optimize resolved.
 *
 * When bc_optimize is enabled, the CENTROID (i,j) VGPRs may not have been
 * computed by the hardware, so a CENTROID request must fall back to CENTER
 * based on bit 31 of PRIM_MASK.
 */
static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
				     int interp_param_idx)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
	LLVMValueRef param = NULL;

	/* Handle PRIM_MASK[31] (bc_optimize). */
	if (ctx->is_monolithic &&
	    ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
	      interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
	     (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
	      interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 */
		LLVMValueRef bc_optimize =
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
		/* Extract bit 31 as an i1 condition. */
		bc_optimize = LLVMBuildLShr(builder,
					    bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");

		if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
		    interp_param_idx == SI_PARAM_PERSP_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTROID),
						"");
		}
		if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
		    interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTROID),
						"");
		}
	}

	/* No bc_optimize handling required: return the parameter directly. */
	if (!param)
		param = LLVMGetParam(main_fn, interp_param_idx);
	return param;
}
1483
/* Declare a fragment shader input: either forward the color VGPRs set up
 * by the prolog, or emit interpolation code for the input.
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* The color VGPRs follow SI_PARAM_POS_FIXED_PT; COLOR1's
		 * components come after however many COLOR0 components
		 * were read.
		 */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		/* Unread components become undef. */
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		/* Non-flat input: resolve forced interpolation modes and
		 * fetch the (i,j) weights.
		 */
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = get_interp_param(ctx, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1539
1540 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1541 {
1542 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1543 SI_PARAM_ANCILLARY, 8, 4);
1544 }
1545
1546 /**
1547 * Set range metadata on an instruction. This can only be used on load and
1548 * call instructions. If you know an instruction can only produce the values
1549 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1550 * \p lo is the minimum value inclusive.
1551 * \p hi is the maximum value exclusive.
1552 */
1553 static void set_range_metadata(struct si_shader_context *ctx,
1554 LLVMValueRef value, unsigned lo, unsigned hi)
1555 {
1556 LLVMValueRef range_md, md_args[2];
1557 LLVMTypeRef type = LLVMTypeOf(value);
1558 LLVMContextRef context = LLVMGetTypeContext(type);
1559
1560 md_args[0] = LLVMConstInt(type, lo, false);
1561 md_args[1] = LLVMConstInt(type, hi, false);
1562 range_md = LLVMMDNodeInContext(context, md_args, 2);
1563 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1564 }
1565
/* Return the thread index within the wave (0..63), with range metadata
 * attached so LLVM can optimize based on it.
 */
static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef tid;

	if (HAVE_LLVM < 0x0308) {
		/* Old LLVM: dedicated tid intrinsic. */
		tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
				ctx->i32, NULL, 0, LLVMReadNoneAttribute);
	} else {
		/* LLVM >= 3.8: count the bits set in EXEC below this lane
		 * with mbcnt.lo, then feed the partial count into mbcnt.hi.
		 */
		LLVMValueRef tid_args[2];
		tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
		tid_args[1] = lp_build_const_int32(gallivm, 0);
		/* tid_args[1] is the accumulator: start at 0, then chain. */
		tid_args[1] = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.lo", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);

		tid = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.hi", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);
	}
	set_range_metadata(ctx, tid, 0, 64);
	return tid;
}
1589
1590 /**
1591 * Load a dword from a constant buffer.
1592 */
1593 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1594 LLVMValueRef offset, LLVMTypeRef return_type)
1595 {
1596 LLVMValueRef args[2] = {resource, offset};
1597
1598 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1599 LLVMReadNoneAttribute);
1600 }
1601
1602 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1603 {
1604 struct si_shader_context *ctx =
1605 si_shader_context(&radeon_bld->soa.bld_base);
1606 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1607 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1608 LLVMBuilderRef builder = gallivm->builder;
1609 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1610 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1611 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1612
1613 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1614 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1615 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1616
1617 LLVMValueRef pos[4] = {
1618 buffer_load_const(builder, resource, offset0, ctx->f32),
1619 buffer_load_const(builder, resource, offset1, ctx->f32),
1620 lp_build_const_float(gallivm, 0),
1621 lp_build_const_float(gallivm, 0)
1622 };
1623
1624 return lp_build_gather_values(gallivm, pos, 4);
1625 }
1626
/* Declare a TGSI system value and store the resulting LLVM value in
 * radeon_bld->system_values[index]. Most values come straight from shader
 * input registers; some are assembled from several parameters or loaded
 * from internal buffers.
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID includes the base vertex of indexed draws. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* Only TCS and GS have an invocation ID. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; W is received as 1/W, so take its
		 * reciprocal to recover W.
		 */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The sample position is the fractional part of the
		 * fragment position; build (frac(x), frac(y), 0, 0).
		 */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count is in TCS_OUT_LAYOUT[31:26]. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tessellation
		 * ring where the TCS stored them.
		 */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels (no TCS) come from an internal
		 * constant buffer: outer at dwords 0-3, inner at 4-7.
		 */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(gallivm->builder, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4),
						   ctx->f32);
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* The block size is compile-time constant, taken from the
		 * CS_FIXED_BLOCK_* properties.
		 */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* A helper invocation is one that is not "live";
		 * ps.live returns true for real invocations.
		 */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1831
1832 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1833 const struct tgsi_full_declaration *decl)
1834 {
1835 struct si_shader_context *ctx =
1836 si_shader_context(&radeon_bld->soa.bld_base);
1837 struct si_shader_selector *sel = ctx->shader->selector;
1838 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1839
1840 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1841 LLVMValueRef var;
1842
1843 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1844 assert(decl->Range.First == decl->Range.Last);
1845 assert(!ctx->shared_memory);
1846
1847 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1848 LLVMArrayType(ctx->i8, sel->local_size),
1849 "compute_lds",
1850 LOCAL_ADDR_SPACE);
1851 LLVMSetAlignment(var, 4);
1852
1853 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1854 }
1855
1856 static LLVMValueRef fetch_constant(
1857 struct lp_build_tgsi_context *bld_base,
1858 const struct tgsi_full_src_register *reg,
1859 enum tgsi_opcode_type type,
1860 unsigned swizzle)
1861 {
1862 struct si_shader_context *ctx = si_shader_context(bld_base);
1863 struct lp_build_context *base = &bld_base->base;
1864 const struct tgsi_ind_register *ireg = &reg->Indirect;
1865 unsigned buf, idx;
1866
1867 LLVMValueRef addr, bufp;
1868 LLVMValueRef result;
1869
1870 if (swizzle == LP_CHAN_ALL) {
1871 unsigned chan;
1872 LLVMValueRef values[4];
1873 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1874 values[chan] = fetch_constant(bld_base, reg, type, chan);
1875
1876 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1877 }
1878
1879 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1880 idx = reg->Register.Index * 4 + swizzle;
1881
1882 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1883 if (!tgsi_type_is_64bit(type))
1884 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1885 else {
1886 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1887 ctx->constants[buf][idx],
1888 ctx->constants[buf][idx + 1]);
1889 }
1890 }
1891
1892 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1893 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1894 LLVMValueRef index;
1895 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1896 reg->Dimension.Index,
1897 SI_NUM_CONST_BUFFERS);
1898 bufp = build_indexed_load_const(ctx, ptr, index);
1899 } else
1900 bufp = ctx->const_buffers[buf];
1901
1902 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1903 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1904 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1905 addr = lp_build_add(&bld_base->uint_bld, addr,
1906 lp_build_const_int32(base->gallivm, idx * 4));
1907
1908 result = buffer_load_const(base->gallivm->builder, bufp,
1909 addr, ctx->f32);
1910
1911 if (!tgsi_type_is_64bit(type))
1912 result = bitcast(bld_base, type, result);
1913 else {
1914 LLVMValueRef addr2, result2;
1915 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1916 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1917 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1918 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1919 lp_build_const_int32(base->gallivm, idx * 4));
1920
1921 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1922 addr2, ctx->f32);
1923
1924 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1925 result, result2);
1926 }
1927 return result;
1928 }
1929
1930 /* Upper 16 bits must be zero. */
1931 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1932 LLVMValueRef val[2])
1933 {
1934 return LLVMBuildOr(gallivm->builder, val[0],
1935 LLVMBuildShl(gallivm->builder, val[1],
1936 lp_build_const_int32(gallivm, 16),
1937 ""), "");
1938 }
1939
1940 /* Upper 16 bits are ignored and will be dropped. */
1941 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1942 LLVMValueRef val[2])
1943 {
1944 LLVMValueRef v[2] = {
1945 LLVMBuildAnd(gallivm->builder, val[0],
1946 lp_build_const_int32(gallivm, 0xffff), ""),
1947 val[1],
1948 };
1949 return si_llvm_pack_two_int16(gallivm, v);
1950 }
1951
1952 /* Initialize arguments for the shader export intrinsic */
1953 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1954 LLVMValueRef *values,
1955 unsigned target,
1956 LLVMValueRef *args)
1957 {
1958 struct si_shader_context *ctx = si_shader_context(bld_base);
1959 struct lp_build_context *uint =
1960 &ctx->radeon_bld.soa.bld_base.uint_bld;
1961 struct lp_build_context *base = &bld_base->base;
1962 struct gallivm_state *gallivm = base->gallivm;
1963 LLVMBuilderRef builder = base->gallivm->builder;
1964 LLVMValueRef val[4];
1965 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1966 unsigned chan;
1967 bool is_int8;
1968
1969 /* Default is 0xf. Adjusted below depending on the format. */
1970 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1971
1972 /* Specify whether the EXEC mask represents the valid mask */
1973 args[1] = uint->zero;
1974
1975 /* Specify whether this is the last export */
1976 args[2] = uint->zero;
1977
1978 /* Specify the target we are exporting */
1979 args[3] = lp_build_const_int32(base->gallivm, target);
1980
1981 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1982 const union si_shader_key *key = &ctx->shader->key;
1983 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1984 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1985
1986 assert(cbuf >= 0 && cbuf < 8);
1987 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1988 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1989 }
1990
1991 args[4] = uint->zero; /* COMPR flag */
1992 args[5] = base->undef;
1993 args[6] = base->undef;
1994 args[7] = base->undef;
1995 args[8] = base->undef;
1996
1997 switch (spi_shader_col_format) {
1998 case V_028714_SPI_SHADER_ZERO:
1999 args[0] = uint->zero; /* writemask */
2000 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2001 break;
2002
2003 case V_028714_SPI_SHADER_32_R:
2004 args[0] = uint->one; /* writemask */
2005 args[5] = values[0];
2006 break;
2007
2008 case V_028714_SPI_SHADER_32_GR:
2009 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
2010 args[5] = values[0];
2011 args[6] = values[1];
2012 break;
2013
2014 case V_028714_SPI_SHADER_32_AR:
2015 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
2016 args[5] = values[0];
2017 args[8] = values[3];
2018 break;
2019
2020 case V_028714_SPI_SHADER_FP16_ABGR:
2021 args[4] = uint->one; /* COMPR flag */
2022
2023 for (chan = 0; chan < 2; chan++) {
2024 LLVMValueRef pack_args[2] = {
2025 values[2 * chan],
2026 values[2 * chan + 1]
2027 };
2028 LLVMValueRef packed;
2029
2030 packed = lp_build_intrinsic(base->gallivm->builder,
2031 "llvm.SI.packf16",
2032 ctx->i32, pack_args, 2,
2033 LLVMReadNoneAttribute);
2034 args[chan + 5] =
2035 LLVMBuildBitCast(base->gallivm->builder,
2036 packed, ctx->f32, "");
2037 }
2038 break;
2039
2040 case V_028714_SPI_SHADER_UNORM16_ABGR:
2041 for (chan = 0; chan < 4; chan++) {
2042 val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
2043 val[chan] = LLVMBuildFMul(builder, val[chan],
2044 lp_build_const_float(gallivm, 65535), "");
2045 val[chan] = LLVMBuildFAdd(builder, val[chan],
2046 lp_build_const_float(gallivm, 0.5), "");
2047 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2048 ctx->i32, "");
2049 }
2050
2051 args[4] = uint->one; /* COMPR flag */
2052 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2053 si_llvm_pack_two_int16(gallivm, val));
2054 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2055 si_llvm_pack_two_int16(gallivm, val+2));
2056 break;
2057
2058 case V_028714_SPI_SHADER_SNORM16_ABGR:
2059 for (chan = 0; chan < 4; chan++) {
2060 /* Clamp between [-1, 1]. */
2061 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2062 values[chan],
2063 lp_build_const_float(gallivm, 1));
2064 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2065 val[chan],
2066 lp_build_const_float(gallivm, -1));
2067 /* Convert to a signed integer in [-32767, 32767]. */
2068 val[chan] = LLVMBuildFMul(builder, val[chan],
2069 lp_build_const_float(gallivm, 32767), "");
2070 /* If positive, add 0.5, else add -0.5. */
2071 val[chan] = LLVMBuildFAdd(builder, val[chan],
2072 LLVMBuildSelect(builder,
2073 LLVMBuildFCmp(builder, LLVMRealOGE,
2074 val[chan], base->zero, ""),
2075 lp_build_const_float(gallivm, 0.5),
2076 lp_build_const_float(gallivm, -0.5), ""), "");
2077 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2078 }
2079
2080 args[4] = uint->one; /* COMPR flag */
2081 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2082 si_llvm_pack_two_int32_as_int16(gallivm, val));
2083 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2084 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2085 break;
2086
2087 case V_028714_SPI_SHADER_UINT16_ABGR: {
2088 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2089 255 : 65535);
2090 /* Clamp. */
2091 for (chan = 0; chan < 4; chan++) {
2092 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2093 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2094 val[chan], max);
2095 }
2096
2097 args[4] = uint->one; /* COMPR flag */
2098 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2099 si_llvm_pack_two_int16(gallivm, val));
2100 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2101 si_llvm_pack_two_int16(gallivm, val+2));
2102 break;
2103 }
2104
2105 case V_028714_SPI_SHADER_SINT16_ABGR: {
2106 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2107 127 : 32767);
2108 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
2109 -128 : -32768);
2110 /* Clamp. */
2111 for (chan = 0; chan < 4; chan++) {
2112 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2113 val[chan] = lp_build_emit_llvm_binary(bld_base,
2114 TGSI_OPCODE_IMIN,
2115 val[chan], max);
2116 val[chan] = lp_build_emit_llvm_binary(bld_base,
2117 TGSI_OPCODE_IMAX,
2118 val[chan], min);
2119 }
2120
2121 args[4] = uint->one; /* COMPR flag */
2122 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2123 si_llvm_pack_two_int32_as_int16(gallivm, val));
2124 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2125 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2126 break;
2127 }
2128
2129 case V_028714_SPI_SHADER_32_ABGR:
2130 memcpy(&args[5], values, sizeof(values[0]) * 4);
2131 break;
2132 }
2133 }
2134
2135 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2136 LLVMValueRef alpha)
2137 {
2138 struct si_shader_context *ctx = si_shader_context(bld_base);
2139 struct gallivm_state *gallivm = bld_base->base.gallivm;
2140
2141 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2142 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2143 SI_PARAM_ALPHA_REF);
2144
2145 LLVMValueRef alpha_pass =
2146 lp_build_cmp(&bld_base->base,
2147 ctx->shader->key.ps.epilog.alpha_func,
2148 alpha, alpha_ref);
2149 LLVMValueRef arg =
2150 lp_build_select(&bld_base->base,
2151 alpha_pass,
2152 lp_build_const_float(gallivm, 1.0f),
2153 lp_build_const_float(gallivm, -1.0f));
2154
2155 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2156 ctx->voidt, &arg, 1, 0);
2157 } else {
2158 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2159 ctx->voidt, NULL, 0, 0);
2160 }
2161 }
2162
2163 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2164 LLVMValueRef alpha,
2165 unsigned samplemask_param)
2166 {
2167 struct si_shader_context *ctx = si_shader_context(bld_base);
2168 struct gallivm_state *gallivm = bld_base->base.gallivm;
2169 LLVMValueRef coverage;
2170
2171 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2172 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2173 samplemask_param);
2174 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2175
2176 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2177 ctx->i32,
2178 &coverage, 1, LLVMReadNoneAttribute);
2179
2180 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2181 ctx->f32, "");
2182
2183 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2184 lp_build_const_float(gallivm,
2185 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2186
2187 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2188 }
2189
/* Compute the clip distances for CLIPVERTEX and fill the export args
 * for the two clip-distance position exports (pos[2] and pos[3]).
 *
 * Each of the 8 user clip planes is loaded from the
 * SI_VS_CONST_CLIP_PLANES constant buffer and dotted with the clip
 * vertex in out_elts[0..3].
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	/* Two exports of 4 clip distances each. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start all four distances at 0 and accumulate. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of plane[reg_index*4+chan].const_chan;
				 * args[1] is reused as a scratch slot here and
				 * overwritten with the real export arg below. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
						      args[1], ctx->f32);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Standard position-export arguments. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
2236
2237 static void si_dump_streamout(struct pipe_stream_output_info *so)
2238 {
2239 unsigned i;
2240
2241 if (so->num_outputs)
2242 fprintf(stderr, "STREAMOUT\n");
2243
2244 for (i = 0; i < so->num_outputs; i++) {
2245 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2246 so->output[i].start_component;
2247 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2248 i, so->output[i].output_buffer,
2249 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2250 so->output[i].register_index,
2251 mask & 1 ? "x" : "",
2252 mask & 2 ? "y" : "",
2253 mask & 4 ? "z" : "",
2254 mask & 8 ? "w" : "");
2255 }
2256 }
2257
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Emits code that stores the selected vertex outputs to the bound
 * streamout buffers, guarded so that only threads holding valid
 * vertices (per the streamout config SGPR) perform the stores, and
 * only for outputs whose vertex stream matches the active stream. */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Active vertex stream, bits [25:24] of the streamout config. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
                 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			/* Skip declarations that refer to outputs the
			 * shader doesn't actually write. */
			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store when this output's stream is the
			 * one currently being emitted. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2374
2375
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if enabled), one parameter export per
 * generic/color output, and up to four position exports:
 *   pos[0] = position, pos[1] = misc vector (psize/edgeflag/layer/
 *   viewport), pos[2..3] = clip distances.
 * Position exports are buffered in pos_args and emitted last so the
 * final one can be marked "done". Also records nr_param_exports,
 * nr_pos_exports and vs_output_param_offset in shader->info. */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

		/* Some semantics are exported twice (e.g. clip distances as
		 * both position and parameter exports); they jump back here
		 * with semantic_name rewritten to GENERIC. */
handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		/* Buffer position exports; emit everything else now. */
		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		/* Clip distances are exported as parameters too. */
		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2550
/* Copy TCS inputs selected by the shader key from LDS to the off-chip
 * tess ring buffer, so the TES can read them. One vec4 is stored per
 * set bit in inputs_to_copy, for the current invocation's vertex. */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	/* Invocation ID = bits [12:8] of REL_IDS. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	/* Descriptor of the off-chip tess ring. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
	                lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);

	/* LDS address of this invocation's vertex within the current patch. */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Each set bit selects one vec4 input to copy. */
	inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                        lp_build_const_int32(gallivm, 4 * i),
		                         "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
		                              invocation_id,
		                              lp_build_const_int32(gallivm, i));

		/* Load the whole vec4 from LDS and store it to the ring. */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
		                           buffer_offset, 0);
	}
}
2592
/* Read the tessellation factors (inner/outer levels) from LDS and
 * write them to the tess factor ring buffer, including the dynamic
 * HS control word for the first patch. Only invocation 0 of each
 * patch performs the writes; the layout depends on the patch
 * primitive mode from the shader key. */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations' LDS writes have landed. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer levels first, then inner levels (buffer layout). */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only the thread processing the first patch writes the
	 * control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2698
/* This only writes the tessellation factor levels.
 *
 * In the monolithic case the factors are written directly; otherwise
 * the values the separate epilog part needs (RW_BUFFERS descriptor,
 * tess factor soffset, and the three VGPRs) are packed into the
 * function's return value at fixed slot positions. */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer */
		/* Split the 64-bit pointer into two i32 SGPR slots. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs */
		/* VGPR return slots must be floats. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2750
/* LS (local shader, i.e. VS before tessellation) epilogue: store all
 * vertex outputs to LDS at the slot assigned to each semantic, where
 * the TCS stage reads them as inputs. */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex output stride in dwords, bits [20:13] of the layout SGPR. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];
		/* Each semantic gets a fixed vec4 slot (4 dwords). */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					lp_build_const_int32(gallivm, param * 4), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}
}
2781
/* ES (export shader, i.e. VS/TES before a GS) epilogue: store all
 * outputs to the ESGS ring buffer, one dword per component, where the
 * geometry shader reads them. VIEWPORT_INDEX and LAYER are skipped
 * because the GS doesn't consume them as inputs. */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		/* Each semantic gets a fixed vec4 slot in the ring. */
		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2820
2821 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2822 {
2823 struct si_shader_context *ctx = si_shader_context(bld_base);
2824 struct gallivm_state *gallivm = bld_base->base.gallivm;
2825 LLVMValueRef args[2];
2826
2827 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2828 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2829 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2830 ctx->voidt, args, 2, 0);
2831 }
2832
/* VS/TES epilogue: optionally clamp vertex colors, gather all output
 * values, handle the PrimitiveID output, and emit the vertex exports.
 */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->is_gs_copy_shader);

	/* +1 leaves room for the extra PrimitiveID output added below
	 * in the monolithic case. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR.
				 * The IF is opened lazily, only when a color
				 * output actually exists. */
				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
						    SI_PARAM_VS_STATE_BITS);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all 4 channels to [0, 1] in place. */
			for (j = 0; j < 4; j++) {
				addr = ctx->radeon_bld.soa.outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = radeon_llvm_saturate(bld_base, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Read back all outputs into the export list. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	if (ctx->is_monolithic) {
		/* Export PrimitiveID when PS needs it. */
		if (si_vs_exports_prim_id(ctx->shader)) {
			outputs[i].name = TGSI_SEMANTIC_PRIMID;
			outputs[i].sid = 0;
			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
						       get_primitive_id(bld_base, 0));
			outputs[i].values[1] = bld_base->base.undef;
			outputs[i].values[2] = bld_base->base.undef;
			outputs[i].values[3] = bld_base->base.undef;
			i++;
		}
	} else {
		/* Return the primitive ID from the LLVM function. */
		ctx->return_value =
			LLVMBuildInsertValue(gallivm->builder,
					     ctx->return_value,
					     bitcast(bld_base, TGSI_TYPE_FLOAT,
						     get_primitive_id(bld_base, 0)),
					     VS_EPILOG_PRIMID_LOC, "");
	}

	/* i is num_outputs, plus one if PrimitiveID was appended above. */
	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
2919
/* Emit the MRTZ export carrying depth, stencil and/or sample mask.
 * At least one of the three values must be non-NULL.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	if (depth) {
		args[5] = depth;
		mask |= 0x1;
	}

	if (stencil) {
		args[6] = stencil;
		mask |= 0x2;
	}

	if (samplemask) {
		args[7] = samplemask;
		mask |= 0x4;
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
			   ctx->voidt, args, 9, 0);
}
2971
/* Export one color output of a pixel shader, applying the shader key's
 * epilog color processing (clamp, alpha-to-one, alpha test, smoothing)
 * first.
 *
 * \param color            the 4 color channels (may be modified in place)
 * \param index            color buffer (MRT) index
 * \param samplemask_param function parameter index of the sample coverage
 * \param is_last          whether this call emits the shader's final export
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test (only applied to color output 0) */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			/* args[c][0] == 0 means no channels enabled (NULL export). */
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				/* Only the very last export carries the DONE bit. */
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
					   ctx->voidt, args[c], 9, 0);
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, args, 9, 0);
	}
}
3040
3041 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3042 {
3043 struct si_shader_context *ctx = si_shader_context(bld_base);
3044 struct lp_build_context *base = &bld_base->base;
3045 struct lp_build_context *uint = &bld_base->uint_bld;
3046 LLVMValueRef args[9];
3047
3048 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3049 args[1] = uint->one; /* whether the EXEC mask is valid */
3050 args[2] = uint->one; /* DONE bit */
3051 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3052 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3053 args[5] = uint->undef; /* R */
3054 args[6] = uint->undef; /* G */
3055 args[7] = uint->undef; /* B */
3056 args[8] = uint->undef; /* A */
3057
3058 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3059 ctx->voidt, args, 9, 0);
3060 }
3061
/* PS epilogue: emit the MRT color exports and, if depth/stencil/sample
 * mask are written, a final MRTZ export. Falls back to a NULL export if
 * nothing would be exported at all.
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* 4 format bits per color buffer in spi_shader_col_format. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Depth comes from the Z channel of the position output. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ, when present, is emitted after all color exports. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
}
3150
3151 /**
3152 * Return PS outputs in this order:
3153 *
3154 * v[0:3] = color0.xyzw
3155 * v[4:7] = color1.xyzw
3156 * ...
3157 * vN+0 = Depth
3158 * vN+1 = Stencil
3159 * vN+2 = SampleMask
3160 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3161 *
3162 * The alpha-ref SGPR is returned via its original location.
3163 */
3164 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3165 {
3166 struct si_shader_context *ctx = si_shader_context(bld_base);
3167 struct si_shader *shader = ctx->shader;
3168 struct lp_build_context *base = &bld_base->base;
3169 struct tgsi_shader_info *info = &shader->selector->info;
3170 LLVMBuilderRef builder = base->gallivm->builder;
3171 unsigned i, j, first_vgpr, vgpr;
3172
3173 LLVMValueRef color[8][4] = {};
3174 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3175 LLVMValueRef ret;
3176
3177 /* Read the output values. */
3178 for (i = 0; i < info->num_outputs; i++) {
3179 unsigned semantic_name = info->output_semantic_name[i];
3180 unsigned semantic_index = info->output_semantic_index[i];
3181
3182 switch (semantic_name) {
3183 case TGSI_SEMANTIC_COLOR:
3184 assert(semantic_index < 8);
3185 for (j = 0; j < 4; j++) {
3186 LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
3187 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3188 color[semantic_index][j] = result;
3189 }
3190 break;
3191 case TGSI_SEMANTIC_POSITION:
3192 depth = LLVMBuildLoad(builder,
3193 ctx->radeon_bld.soa.outputs[i][2], "");
3194 break;
3195 case TGSI_SEMANTIC_STENCIL:
3196 stencil = LLVMBuildLoad(builder,
3197 ctx->radeon_bld.soa.outputs[i][1], "");
3198 break;
3199 case TGSI_SEMANTIC_SAMPLEMASK:
3200 samplemask = LLVMBuildLoad(builder,
3201 ctx->radeon_bld.soa.outputs[i][0], "");
3202 break;
3203 default:
3204 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3205 semantic_name);
3206 }
3207 }
3208
3209 /* Fill the return structure. */
3210 ret = ctx->return_value;
3211
3212 /* Set SGPRs. */
3213 ret = LLVMBuildInsertValue(builder, ret,
3214 bitcast(bld_base, TGSI_TYPE_SIGNED,
3215 LLVMGetParam(ctx->radeon_bld.main_fn,
3216 SI_PARAM_ALPHA_REF)),
3217 SI_SGPR_ALPHA_REF, "");
3218
3219 /* Set VGPRs */
3220 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3221 for (i = 0; i < ARRAY_SIZE(color); i++) {
3222 if (!color[i][0])
3223 continue;
3224
3225 for (j = 0; j < 4; j++)
3226 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3227 }
3228 if (depth)
3229 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3230 if (stencil)
3231 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3232 if (samplemask)
3233 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3234
3235 /* Add the input sample mask for smoothing at the end. */
3236 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3237 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3238 ret = LLVMBuildInsertValue(builder, ret,
3239 LLVMGetParam(ctx->radeon_bld.main_fn,
3240 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3241
3242 ctx->return_value = ret;
3243 }
3244
3245 /**
3246 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3247 * buffer in number of elements and return it as an i32.
3248 */
3249 static LLVMValueRef get_buffer_size(
3250 struct lp_build_tgsi_context *bld_base,
3251 LLVMValueRef descriptor)
3252 {
3253 struct si_shader_context *ctx = si_shader_context(bld_base);
3254 struct gallivm_state *gallivm = bld_base->base.gallivm;
3255 LLVMBuilderRef builder = gallivm->builder;
3256 LLVMValueRef size =
3257 LLVMBuildExtractElement(builder, descriptor,
3258 lp_build_const_int32(gallivm, 6), "");
3259
3260 if (ctx->screen->b.chip_class >= VI) {
3261 /* On VI, the descriptor contains the size in bytes,
3262 * but TXQ must return the size in elements.
3263 * The stride is always non-zero for resources using TXQ.
3264 */
3265 LLVMValueRef stride =
3266 LLVMBuildExtractElement(builder, descriptor,
3267 lp_build_const_int32(gallivm, 5), "");
3268 stride = LLVMBuildLShr(builder, stride,
3269 lp_build_const_int32(gallivm, 16), "");
3270 stride = LLVMBuildAnd(builder, stride,
3271 lp_build_const_int32(gallivm, 0x3FFF), "");
3272
3273 size = LLVMBuildUDiv(builder, size, stride, "");
3274 }
3275
3276 return size;
3277 }
3278
3279 /**
3280 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3281 * intrinsic names).
3282 */
3283 static void build_int_type_name(
3284 LLVMTypeRef type,
3285 char *buf, unsigned bufsize)
3286 {
3287 assert(bufsize >= 6);
3288
3289 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
3290 snprintf(buf, bufsize, "v%ui32",
3291 LLVMGetVectorSize(type));
3292 else
3293 strcpy(buf, "i32");
3294 }
3295
3296 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3297 struct lp_build_tgsi_context *bld_base,
3298 struct lp_build_emit_data *emit_data);
3299
3300 /* Prevent optimizations (at least of memory accesses) across the current
3301 * point in the program by emitting empty inline assembly that is marked as
3302 * having side effects.
3303 */
3304 static void emit_optimization_barrier(struct si_shader_context *ctx)
3305 {
3306 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3307 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3308 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3309 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3310 }
3311
3312 static void emit_waitcnt(struct si_shader_context *ctx)
3313 {
3314 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3315 LLVMBuilderRef builder = gallivm->builder;
3316 LLVMValueRef args[1] = {
3317 lp_build_const_int32(gallivm, 0xf70)
3318 };
3319 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3320 ctx->voidt, args, 1, 0);
3321 }
3322
/* TGSI MEMBAR: implemented by waiting for all in-flight memory
 * operations to finish.
 */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3332
3333 static LLVMValueRef
3334 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3335 const struct tgsi_full_src_register *reg)
3336 {
3337 LLVMValueRef ind_index;
3338 LLVMValueRef rsrc_ptr;
3339
3340 if (!reg->Register.Indirect)
3341 return ctx->shader_buffers[reg->Register.Index];
3342
3343 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
3344 reg->Register.Index,
3345 SI_NUM_SHADER_BUFFERS);
3346
3347 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
3348 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3349 }
3350
3351 static bool tgsi_is_array_sampler(unsigned target)
3352 {
3353 return target == TGSI_TEXTURE_1D_ARRAY ||
3354 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3355 target == TGSI_TEXTURE_2D_ARRAY ||
3356 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3357 target == TGSI_TEXTURE_CUBE_ARRAY ||
3358 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3359 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3360 }
3361
3362 static bool tgsi_is_array_image(unsigned target)
3363 {
3364 return target == TGSI_TEXTURE_3D ||
3365 target == TGSI_TEXTURE_CUBE ||
3366 target == TGSI_TEXTURE_1D_ARRAY ||
3367 target == TGSI_TEXTURE_2D_ARRAY ||
3368 target == TGSI_TEXTURE_CUBE_ARRAY ||
3369 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3370 }
3371
3372 /**
3373 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3374 *
3375 * At least on Tonga, executing image stores on images with DCC enabled and
3376 * non-trivial can eventually lead to lockups. This can occur when an
3377 * application binds an image as read-only but then uses a shader that writes
3378 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3379 * program termination) in this case, but it doesn't cost much to be a bit
3380 * nicer: disabling DCC in the shader still leads to undefined results but
3381 * avoids the lockup.
3382 */
3383 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3384 LLVMValueRef rsrc)
3385 {
3386 if (ctx->screen->b.chip_class <= CIK) {
3387 return rsrc;
3388 } else {
3389 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3390 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3391 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3392 LLVMValueRef tmp;
3393
3394 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3395 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3396 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3397 }
3398 }
3399
3400 /**
3401 * Load the resource descriptor for \p image.
3402 */
3403 static void
3404 image_fetch_rsrc(
3405 struct lp_build_tgsi_context *bld_base,
3406 const struct tgsi_full_src_register *image,
3407 bool dcc_off,
3408 LLVMValueRef *rsrc)
3409 {
3410 struct si_shader_context *ctx = si_shader_context(bld_base);
3411
3412 assert(image->Register.File == TGSI_FILE_IMAGE);
3413
3414 if (!image->Register.Indirect) {
3415 /* Fast path: use preloaded resources */
3416 *rsrc = ctx->images[image->Register.Index];
3417 } else {
3418 /* Indexing and manual load */
3419 LLVMValueRef ind_index;
3420 LLVMValueRef rsrc_ptr;
3421 LLVMValueRef tmp;
3422
3423 /* From the GL_ARB_shader_image_load_store extension spec:
3424 *
3425 * If a shader performs an image load, store, or atomic
3426 * operation using an image variable declared as an array,
3427 * and if the index used to select an individual element is
3428 * negative or greater than or equal to the size of the
3429 * array, the results of the operation are undefined but may
3430 * not lead to termination.
3431 */
3432 ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
3433 image->Register.Index,
3434 SI_NUM_IMAGES);
3435
3436 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
3437 tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3438 if (dcc_off)
3439 tmp = force_dcc_off(ctx, tmp);
3440 *rsrc = tmp;
3441 }
3442 }
3443
3444 static LLVMValueRef image_fetch_coords(
3445 struct lp_build_tgsi_context *bld_base,
3446 const struct tgsi_full_instruction *inst,
3447 unsigned src)
3448 {
3449 struct gallivm_state *gallivm = bld_base->base.gallivm;
3450 LLVMBuilderRef builder = gallivm->builder;
3451 unsigned target = inst->Memory.Texture;
3452 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3453 LLVMValueRef coords[4];
3454 LLVMValueRef tmp;
3455 int chan;
3456
3457 for (chan = 0; chan < num_coords; ++chan) {
3458 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3459 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3460 coords[chan] = tmp;
3461 }
3462
3463 if (num_coords == 1)
3464 return coords[0];
3465
3466 if (num_coords == 3) {
3467 /* LLVM has difficulties lowering 3-element vectors. */
3468 coords[3] = bld_base->uint_bld.undef;
3469 num_coords = 4;
3470 }
3471
3472 return lp_build_gather_values(gallivm, coords, num_coords);
3473 }
3474
3475 /**
3476 * Append the extra mode bits that are used by image load and store.
3477 */
3478 static void image_append_args(
3479 struct si_shader_context *ctx,
3480 struct lp_build_emit_data * emit_data,
3481 unsigned target,
3482 bool atomic)
3483 {
3484 const struct tgsi_full_instruction *inst = emit_data->inst;
3485 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3486 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3487
3488 emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3489 emit_data->args[emit_data->arg_count++] =
3490 tgsi_is_array_image(target) ? i1true : i1false; /* da */
3491 if (!atomic) {
3492 emit_data->args[emit_data->arg_count++] =
3493 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3494 i1true : i1false; /* glc */
3495 }
3496 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3497 }
3498
3499 /**
3500 * Given a 256 bit resource, extract the top half (which stores the buffer
3501 * resource in the case of textures and images).
3502 */
3503 static LLVMValueRef extract_rsrc_top_half(
3504 struct si_shader_context *ctx,
3505 LLVMValueRef rsrc)
3506 {
3507 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3508 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3509 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3510
3511 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3512 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3513 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3514
3515 return rsrc;
3516 }
3517
3518 /**
3519 * Append the resource and indexing arguments for buffer intrinsics.
3520 *
3521 * \param rsrc the v4i32 buffer resource
3522 * \param index index into the buffer (stride-based)
3523 * \param offset byte offset into the buffer
3524 */
3525 static void buffer_append_args(
3526 struct si_shader_context *ctx,
3527 struct lp_build_emit_data *emit_data,
3528 LLVMValueRef rsrc,
3529 LLVMValueRef index,
3530 LLVMValueRef offset,
3531 bool atomic)
3532 {
3533 const struct tgsi_full_instruction *inst = emit_data->inst;
3534 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3535 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3536
3537 emit_data->args[emit_data->arg_count++] = rsrc;
3538 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3539 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3540 if (!atomic) {
3541 emit_data->args[emit_data->arg_count++] =
3542 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3543 i1true : i1false; /* glc */
3544 }
3545 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3546 }
3547
/* Gather the intrinsic arguments for a TGSI LOAD from a shader buffer
 * or an image; load_emit() consumes them.
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	/* Loads always produce a 4-component result. */
	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x holds the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images take the buffer path, using the
			 * v4i32 half of the 256-bit descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3592
/* Emit a shader-buffer load via the amdgcn buffer intrinsics, choosing
 * the vector width from the destination writemask.
 */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: // 3 & 4
		/* 3-component loads are rounded up to the 4-component
		 * intrinsic; the extra channel goes unused. */
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute);
}
3624
3625 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3626 const struct tgsi_full_instruction *inst,
3627 LLVMTypeRef type, int arg)
3628 {
3629 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3630 LLVMBuilderRef builder = gallivm->builder;
3631 LLVMValueRef offset, ptr;
3632 int addr_space;
3633
3634 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3635 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3636
3637 ptr = ctx->shared_memory;
3638 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3639 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3640 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3641
3642 return ptr;
3643 }
3644
/* Emit a TGSI LOAD from shared memory (ctx->shared_memory). */
static void load_emit_memory(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned writemask = inst->Dst[0].Register.WriteMask;
	LLVMValueRef channels[4], ptr, derived_ptr, index;
	int chan;

	ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);

	for (chan = 0; chan < 4; ++chan) {
		/* Channels masked out of the destination stay undef. */
		if (!(writemask & (1 << chan))) {
			channels[chan] = LLVMGetUndef(base->elem_type);
			continue;
		}

		index = lp_build_const_int32(gallivm, chan);
		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
		channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
	}
	emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
}
3671
/* Emit a TGSI LOAD from shared memory, a shader buffer, or an image. */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	/* Shared-memory loads don't use the buffer/image intrinsics. */
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile loads first wait for all pending memory operations. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	} else {
		/* The image.load intrinsic name is parameterized by the
		 * coordinate type (i32, v2i32 or v4i32). */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	}
}
3717
/* Gather the intrinsic arguments for a TGSI STORE to a shader buffer or
 * an image. The value to store is always the first argument.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* Gather the 4 channels of the value to store (Src[1]). */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register names the memory resource; view it as
	 * a source register for the resource-fetch helpers. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Src[0].x holds the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[1] = coords;
			/* dcc_off: image stores with DCC enabled can hang;
			 * see force_dcc_off(). */
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3776
/**
 * Emit llvm.amdgcn.buffer.store intrinsics for a STORE to a shader buffer.
 *
 * The destination writemask is decomposed into runs of consecutive
 * channels; each run is emitted as one f32/v2f32/v4f32 store whose offset
 * is adjusted by the run's starting channel.
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	/* args[0] = full v4f32 data, args[3] = base byte offset, as set up by
	 * store_fetch_args(); both are rewritten per run below. */
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Re-queue the third channel for the next iteration. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from channels start and start+1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset by 4 bytes per skipped channel. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count, 0);
	}
}
3848
/**
 * Emit a STORE to the TGSI_FILE_MEMORY file (pointer-based memory,
 * accessed through get_memory_ptr) as plain LLVM stores, one per
 * channel enabled in the destination writemask.
 */
static void store_emit_memory(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned writemask = inst->Dst[0].Register.WriteMask;
	LLVMValueRef ptr, derived_ptr, data, index;
	int chan;

	ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);

	for (chan = 0; chan < 4; ++chan) {
		if (!(writemask & (1 << chan))) {
			continue;
		}
		/* Store Src[1].chan at ptr[chan]. */
		data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
		index = lp_build_const_int32(gallivm, chan);
		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
		LLVMBuildStore(builder, data, derived_ptr);
	}
}
3873
/**
 * Emit a TGSI STORE instruction, dispatching on the destination file:
 * shared memory, shader buffer, buffer image, or regular image.
 * Arguments were prepared earlier by store_fetch_args().
 */
static void store_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
		store_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile stores must not be reordered past earlier memory ops. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		store_emit_buffer(ctx, emit_data);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, "llvm.amdgcn.buffer.store.format.v4f32",
			emit_data->dst_type, emit_data->args,
			emit_data->arg_count, 0);
	} else {
		/* The image intrinsic is overloaded on the coordinate type
		 * (args[1], set up by store_fetch_args). */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.store.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count, 0);
	}
}
3917
/**
 * Fetch the arguments for a TGSI atomic instruction (ATOM*).
 *
 * Packs the data operand(s) first — in hardware order, which for ATOMCAS
 * is reversed relative to TGSI — then appends the buffer or image
 * resource arguments.
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	/* Atomics return the pre-op value as a single scalar. */
	emit_data->dst_type = bld_base->base.elem_type;

	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1] holds the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use only the v4i32 half of the
			 * descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3977
3978 static void atomic_emit_memory(struct si_shader_context *ctx,
3979 struct lp_build_emit_data *emit_data) {
3980 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3981 LLVMBuilderRef builder = gallivm->builder;
3982 const struct tgsi_full_instruction * inst = emit_data->inst;
3983 LLVMValueRef ptr, result, arg;
3984
3985 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3986
3987 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3988 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3989
3990 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3991 LLVMValueRef new_data;
3992 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
3993 inst, 3, 0);
3994
3995 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3996
3997 #if HAVE_LLVM >= 0x309
3998 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3999 LLVMAtomicOrderingSequentiallyConsistent,
4000 LLVMAtomicOrderingSequentiallyConsistent,
4001 false);
4002 #endif
4003
4004 result = LLVMBuildExtractValue(builder, result, 0, "");
4005 } else {
4006 LLVMAtomicRMWBinOp op;
4007
4008 switch(inst->Instruction.Opcode) {
4009 case TGSI_OPCODE_ATOMUADD:
4010 op = LLVMAtomicRMWBinOpAdd;
4011 break;
4012 case TGSI_OPCODE_ATOMXCHG:
4013 op = LLVMAtomicRMWBinOpXchg;
4014 break;
4015 case TGSI_OPCODE_ATOMAND:
4016 op = LLVMAtomicRMWBinOpAnd;
4017 break;
4018 case TGSI_OPCODE_ATOMOR:
4019 op = LLVMAtomicRMWBinOpOr;
4020 break;
4021 case TGSI_OPCODE_ATOMXOR:
4022 op = LLVMAtomicRMWBinOpXor;
4023 break;
4024 case TGSI_OPCODE_ATOMUMIN:
4025 op = LLVMAtomicRMWBinOpUMin;
4026 break;
4027 case TGSI_OPCODE_ATOMUMAX:
4028 op = LLVMAtomicRMWBinOpUMax;
4029 break;
4030 case TGSI_OPCODE_ATOMIMIN:
4031 op = LLVMAtomicRMWBinOpMin;
4032 break;
4033 case TGSI_OPCODE_ATOMIMAX:
4034 op = LLVMAtomicRMWBinOpMax;
4035 break;
4036 default:
4037 unreachable("unknown atomic opcode");
4038 }
4039
4040 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4041 LLVMAtomicOrderingSequentiallyConsistent,
4042 false);
4043 }
4044 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4045 }
4046
/**
 * Emit a TGSI atomic instruction.  Shared memory uses native LLVM
 * atomics (atomic_emit_memory); buffers and images use the
 * llvm.amdgcn.{buffer,image}.atomic.* intrinsics, with the op name taken
 * from action->intr_name.
 */
static void atomic_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[40];
	LLVMValueRef tmp;

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		atomic_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
	} else {
		char coords_type[8];

		/* The image intrinsic is overloaded on the coordinate type
		 * (args[1], set up by atomic_fetch_args). */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.atomic.%s.%s",
			 action->intr_name, coords_type);
	}

	/* The intrinsic returns the pre-op value as i32; bitcast to the
	 * float result type TGSI expects. */
	tmp = lp_build_intrinsic(
		builder, intrinsic_name, bld_base->uint_bld.elem_type,
		emit_data->args, emit_data->arg_count, 0);
	emit_data->output[emit_data->chan] =
		LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
}
4084
/**
 * Fetch the arguments for a TGSI RESQ (resource query) instruction.
 *
 * Buffers and buffer images only need the resource descriptor; other
 * image targets get the full llvm.SI.getresinfo argument list (mip
 * level, rsrc, dmask, and flags).
 */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4117
/**
 * Emit a TGSI RESQ instruction.
 *
 * For buffers, the size is dword 2 of the descriptor; for buffer images,
 * get_buffer_size computes it; otherwise llvm.SI.getresinfo is used, with
 * the layer count of cube arrays corrected from layers to cubes.
 */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Dword 2 of a buffer descriptor is NUM_RECORDS (the size). */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
4154
/**
 * Fill emit_data->args with the standard argument list of the llvm.SI
 * texture intrinsics: packed coordinates, resource, (sampler), dmask,
 * and the unorm/r128/da/glc/slc/tfe/lwe flags.
 *
 * \param param  the address/coordinate components to pack
 * \param count  number of valid entries in \p param (padded to a power
 *               of two with undef)
 * \param dmask  component write/select mask passed to the intrinsic
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF/TXQ take no sampler and return integer data. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
							   tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4200
4201 static const struct lp_build_tgsi_action tex_action;
4202
/* Kind of descriptor to load from the sampler list; see
 * get_sampler_desc_custom() for the layout of each kind within a
 * sampler record. */
enum desc_type {
	DESC_IMAGE,	/* image view descriptor */
	DESC_FMASK,	/* FMASK descriptor (MSAA) */
	DESC_SAMPLER	/* sampler state descriptor */
};
4208
4209 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4210 {
4211 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4212 CONST_ADDR_SPACE);
4213 }
4214
4215 /**
4216 * Load an image view, fmask view. or sampler state descriptor.
4217 */
4218 static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
4219 LLVMValueRef list, LLVMValueRef index,
4220 enum desc_type type)
4221 {
4222 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4223 LLVMBuilderRef builder = gallivm->builder;
4224
4225 switch (type) {
4226 case DESC_IMAGE:
4227 /* The image is at [0:7]. */
4228 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4229 break;
4230 case DESC_FMASK:
4231 /* The FMASK is at [8:15]. */
4232 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4233 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4234 break;
4235 case DESC_SAMPLER:
4236 /* The sampler state is at [12:15]. */
4237 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4238 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4239 list = LLVMBuildPointerCast(builder, list,
4240 const_array(ctx->v4i32, 0), "");
4241 break;
4242 }
4243
4244 return build_indexed_load_const(ctx, list, index);
4245 }
4246
4247 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4248 LLVMValueRef index, enum desc_type type)
4249 {
4250 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4251 SI_PARAM_SAMPLERS);
4252
4253 return get_sampler_desc_custom(ctx, list, index, type);
4254 }
4255
4256 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4257 *
4258 * SI-CI:
4259 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4260 * filtering manually. The driver sets img7 to a mask clearing
4261 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4262 * s_and_b32 samp0, samp0, img7
4263 *
4264 * VI:
4265 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4266 */
static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
					   LLVMValueRef res, LLVMValueRef samp)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef img7, samp0;

	/* VI handles this in hardware (ANISO_OVERRIDE); see the comment
	 * above this function. */
	if (ctx->screen->b.chip_class >= VI)
		return samp;

	/* samp0 &= img7, as required by the SI-CI workaround. */
	img7 = LLVMBuildExtractElement(builder, res,
				       LLVMConstInt(ctx->i32, 7, 0), "");
	samp0 = LLVMBuildExtractElement(builder, samp,
					LLVMConstInt(ctx->i32, 0, 0), "");
	samp0 = LLVMBuildAnd(builder, samp0, img7, "");
	return LLVMBuildInsertElement(builder, samp, samp0,
				      LLVMConstInt(ctx->i32, 0, 0), "");
}
4284
/**
 * Fetch the resource, sampler-state and FMASK descriptors for a texture
 * instruction.  The sampler operand is always the last source register.
 *
 * Any of samp_ptr/fmask_ptr may be NULL if the caller doesn't need it;
 * for MSAA targets *samp_ptr is set to NULL (no sampler is used) and
 * for non-MSAA targets *fmask_ptr is set to NULL.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		/* Dynamically indexed sampler: load the descriptors through
		 * a bounded indirect index. */
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				/* Apply the SI-CI anisotropic filtering fix. */
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		/* Constant index: use the preloaded descriptors. */
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4332
/**
 * Fetch the arguments for a TGSI TXQ (texture query) instruction.
 *
 * Buffer textures read their size straight from the descriptor; other
 * targets pass the requested mip level (Src[0].x) to getresinfo via
 * set_tex_fetch_args.
 */
static void txq_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef res_ptr;
	LLVMValueRef address;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Read the size from the buffer descriptor directly. */
		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
		emit_data->args[0] = get_buffer_size(bld_base, res);
		return;
	}

	/* Textures - set the mip level. */
	address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

	set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
			   NULL, &address, 1, 0xf);
}
4360
/**
 * Emit a TGSI TXQ instruction.  Buffer textures were fully resolved in
 * txq_fetch_args; everything else calls llvm.SI.getresinfo and fixes up
 * the cube-array layer count (layers -> cubes).
 */
static void txq_emit(const struct lp_build_tgsi_action *action,
		     struct lp_build_tgsi_context *bld_base,
		     struct lp_build_emit_data *emit_data)
{
	struct lp_build_context *base = &bld_base->base;
	unsigned target = emit_data->inst->Texture.Texture;

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Just return the buffer size. */
		emit_data->output[emit_data->chan] = emit_data->args[0];
		return;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
		base->gallivm->builder, "llvm.SI.getresinfo.i32",
		emit_data->dst_type, emit_data->args, emit_data->arg_count,
		LLVMReadNoneAttribute);

	/* Divide the number of layers by 6 to get the number of cubes. */
	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
		LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);

		LLVMValueRef v4 = emit_data->output[emit_data->chan];
		LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
		z = LLVMBuildSDiv(builder, z, six, "");

		emit_data->output[emit_data->chan] =
			LLVMBuildInsertElement(builder, v4, z, two, "");
	}
}
4394
/**
 * Fetch the arguments for a TGSI texture instruction (TEX, TXB, TXL,
 * TXD, TXF, TXP, TG4, LODQ, ...).
 *
 * Builds the packed address vector in the order the hardware expects:
 * offsets, LOD bias, depth-compare reference, derivatives, coordinates,
 * then LOD / sample index.  Also handles TXP projection, cube coordinate
 * preparation, FMASK-based MSAA sample remapping, TXF texel offsets and
 * the TG4 component-select dmask, before handing everything to
 * set_tex_fetch_args().
 */
static void tex_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef coords[5], derivs[6];
	LLVMValueRef address[16];
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
	unsigned count = 0;
	unsigned chan;
	unsigned num_deriv_channels = 0;
	bool has_offset = inst->Texture.NumOffsets > 0;
	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
	unsigned dmask = 0xf;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);

	if (target == TGSI_TEXTURE_BUFFER) {
		LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);

		/* Bitcast and truncate v8i32 to v16i8. */
		LLVMValueRef res = res_ptr;
		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
		res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");

		/* Buffer fetch: rsrc, zero offset, element index from
		 * Src[0].x — nothing else to pack. */
		emit_data->dst_type = ctx->v4f32;
		emit_data->args[0] = res;
		emit_data->args[1] = bld_base->uint_bld.zero;
		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
		emit_data->arg_count = 3;
		return;
	}

	/* Fetch and project texture coordinates */
	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
	for (chan = 0; chan < 3; chan++ ) {
		coords[chan] = lp_build_emit_fetch(bld_base,
						   emit_data->inst, 0,
						   chan);
		if (opcode == TGSI_OPCODE_TXP)
			coords[chan] = lp_build_emit_llvm_binary(bld_base,
								 TGSI_OPCODE_DIV,
								 coords[chan],
								 coords[3]);
	}

	if (opcode == TGSI_OPCODE_TXP)
		coords[3] = bld_base->base.one;

	/* Pack offsets. */
	if (has_offset && opcode != TGSI_OPCODE_TXF) {
		/* The offsets are six-bit signed integers packed like this:
		 * X=[5:0], Y=[13:8], and Z=[21:16].
		 */
		LLVMValueRef offset[3], pack;

		assert(inst->Texture.NumOffsets == 1);

		for (chan = 0; chan < 3; chan++) {
			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
								     emit_data->inst, 0, chan);
			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
						    lp_build_const_int32(gallivm, 0x3f), "");
			if (chan)
				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
							    lp_build_const_int32(gallivm, chan*8), "");
		}

		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
		address[count++] = pack;
	}

	/* Pack LOD bias value */
	if (opcode == TGSI_OPCODE_TXB)
		address[count++] = coords[3];
	if (opcode == TGSI_OPCODE_TXB2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	/* Pack depth comparison value */
	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			/* Shadow cube arrays run out of coord channels;
			 * the reference comes from Src[1].x. */
			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
		} else {
			assert(ref_pos >= 0);
			address[count++] = coords[ref_pos];
		}
	}

	/* Pack user derivatives */
	if (opcode == TGSI_OPCODE_TXD) {
		int param, num_src_deriv_channels;

		switch (target) {
		case TGSI_TEXTURE_3D:
			num_src_deriv_channels = 3;
			num_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_RECT:
		case TGSI_TEXTURE_SHADOWRECT:
		case TGSI_TEXTURE_2D_ARRAY:
		case TGSI_TEXTURE_SHADOW2D_ARRAY:
			num_src_deriv_channels = 2;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_CUBE:
		case TGSI_TEXTURE_SHADOWCUBE:
		case TGSI_TEXTURE_CUBE_ARRAY:
		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
			/* Cube derivatives will be converted to 2D. */
			num_src_deriv_channels = 3;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_1D:
		case TGSI_TEXTURE_SHADOW1D:
		case TGSI_TEXTURE_1D_ARRAY:
		case TGSI_TEXTURE_SHADOW1D_ARRAY:
			num_src_deriv_channels = 1;
			num_deriv_channels = 1;
			break;
		default:
			unreachable("invalid target");
		}

		/* Src[1] = ddx, Src[2] = ddy. */
		for (param = 0; param < 2; param++)
			for (chan = 0; chan < num_src_deriv_channels; chan++)
				derivs[param * num_src_deriv_channels + chan] =
					lp_build_emit_fetch(bld_base, inst, param+1, chan);
	}

	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);

	if (opcode == TGSI_OPCODE_TXD)
		for (int i = 0; i < num_deriv_channels * 2; i++)
			address[count++] = derivs[i];

	/* Pack texture coordinates */
	address[count++] = coords[0];
	if (num_coords > 1)
		address[count++] = coords[1];
	if (num_coords > 2)
		address[count++] = coords[2];

	/* Pack LOD or sample index */
	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
		address[count++] = coords[3];
	else if (opcode == TGSI_OPCODE_TXL2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	if (count > 16) {
		assert(!"Cannot handle more than 16 texture address parameters");
		count = 16;
	}

	/* The intrinsic takes integer address components. */
	for (chan = 0; chan < count; chan++ ) {
		address[chan] = LLVMBuildBitCast(gallivm->builder,
						 address[chan], ctx->i32, "");
	}

	/* Adjust the sample index according to FMASK.
	 *
	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * For example, 0x11111100 means there are only 2 samples stored and
	 * the second sample covers 3/4 of the pixel. When reading samples 0
	 * and 1, return physical sample 0 (determined by the first two 0s
	 * in FMASK), otherwise return physical sample 1.
	 *
	 * The sample index should be adjusted as follows:
	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
	 */
	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		struct lp_build_context *uint_bld = &bld_base->uint_bld;
		struct lp_build_emit_data txf_emit_data = *emit_data;
		LLVMValueRef txf_address[4];
		unsigned txf_count = count;
		struct tgsi_full_instruction inst = {};

		memcpy(txf_address, address, sizeof(txf_address));

		/* The FMASK fetch always uses LOD/sample 0. */
		if (target == TGSI_TEXTURE_2D_MSAA) {
			txf_address[2] = bld_base->uint_bld.zero;
		}
		txf_address[3] = bld_base->uint_bld.zero;

		/* Read FMASK using TXF. */
		inst.Instruction.Opcode = TGSI_OPCODE_TXF;
		inst.Texture.Texture = target;
		txf_emit_data.inst = &inst;
		txf_emit_data.chan = 0;
		set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
				   target, fmask_ptr, NULL,
				   txf_address, txf_count, 0xf);
		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);

		/* Initialize some constants. */
		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);

		/* Apply the formula. */
		LLVMValueRef fmask =
			LLVMBuildExtractElement(gallivm->builder,
						txf_emit_data.output[0],
						uint_bld->zero, "");

		/* The sample index lives in coord channel 2 (2D) or 3
		 * (2D array). */
		unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;

		LLVMValueRef sample_index4 =
			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");

		LLVMValueRef shifted_fmask =
			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");

		LLVMValueRef final_sample =
			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");

		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
		 * resource descriptor is 0 (invalid),
		 */
		LLVMValueRef fmask_desc =
			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
					 ctx->v8i32, "");

		LLVMValueRef fmask_word1 =
			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
						uint_bld->one, "");

		LLVMValueRef word1_is_nonzero =
			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
				      fmask_word1, uint_bld->zero, "");

		/* Replace the MSAA sample index. */
		address[sample_chan] =
			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
					final_sample, address[sample_chan], "");
	}

	if (opcode == TGSI_OPCODE_TXF) {
		/* add tex offsets */
		if (inst->Texture.NumOffsets) {
			struct lp_build_context *uint_bld = &bld_base->uint_bld;
			struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			assert(inst->Texture.NumOffsets == 1);

			/* TXF offsets are immediates added directly to the
			 * integer coordinates; cases deliberately fall
			 * through from higher to lower dimensionality. */
			switch (target) {
			case TGSI_TEXTURE_3D:
				address[2] = lp_build_add(uint_bld, address[2],
						bld->immediates[off->Index][off->SwizzleZ]);
				/* fall through */
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				address[1] =
					lp_build_add(uint_bld, address[1],
						bld->immediates[off->Index][off->SwizzleY]);
				/* fall through */
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				address[0] =
					lp_build_add(uint_bld, address[0],
						bld->immediates[off->Index][off->SwizzleX]);
				break;
				/* texture offsets do not apply to other texture targets */
			}
		}
	}

	if (opcode == TGSI_OPCODE_TG4) {
		unsigned gather_comp = 0;

		/* DMASK was repurposed for GATHER4. 4 components are always
		 * returned and DMASK works like a swizzle - it selects
		 * the component to fetch. The only valid DMASK values are
		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
		 * (red,red,red,red) etc.) The ISA document doesn't mention
		 * this.
		 */

		/* Get the component index from src1.x for Gather4. */
		if (!tgsi_is_shadow_target(target)) {
			LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
			LLVMValueRef comp_imm;
			struct tgsi_src_register src1 = inst->Src[1].Register;

			assert(src1.File == TGSI_FILE_IMMEDIATE);

			comp_imm = imms[src1.Index][src1.SwizzleX];
			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
			gather_comp = CLAMP(gather_comp, 0, 3);
		}

		dmask = 1 << gather_comp;
	}

	set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
			   samp_ptr, address, count, dmask);
}
4714
4715 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4716 struct lp_build_tgsi_context *bld_base,
4717 struct lp_build_emit_data *emit_data)
4718 {
4719 struct si_shader_context *ctx = si_shader_context(bld_base);
4720 struct lp_build_context *base = &bld_base->base;
4721 unsigned opcode = emit_data->inst->Instruction.Opcode;
4722 unsigned target = emit_data->inst->Texture.Texture;
4723 char intr_name[127];
4724 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4725 bool is_shadow = tgsi_is_shadow_target(target);
4726 char type[64];
4727 const char *name = "llvm.SI.image.sample";
4728 const char *infix = "";
4729
4730 if (target == TGSI_TEXTURE_BUFFER) {
4731 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4732 base->gallivm->builder,
4733 "llvm.SI.vs.load.input", emit_data->dst_type,
4734 emit_data->args, emit_data->arg_count,
4735 LLVMReadNoneAttribute);
4736 return;
4737 }
4738
4739 switch (opcode) {
4740 case TGSI_OPCODE_TXF:
4741 name = target == TGSI_TEXTURE_2D_MSAA ||
4742 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4743 "llvm.SI.image.load" :
4744 "llvm.SI.image.load.mip";
4745 is_shadow = false;
4746 has_offset = false;
4747 break;
4748 case TGSI_OPCODE_LODQ:
4749 name = "llvm.SI.getlod";
4750 is_shadow = false;
4751 has_offset = false;
4752 break;
4753 case TGSI_OPCODE_TEX:
4754 case TGSI_OPCODE_TEX2:
4755 case TGSI_OPCODE_TXP:
4756 if (ctx->type != PIPE_SHADER_FRAGMENT)
4757 infix = ".lz";
4758 break;
4759 case TGSI_OPCODE_TXB:
4760 case TGSI_OPCODE_TXB2:
4761 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4762 infix = ".b";
4763 break;
4764 case TGSI_OPCODE_TXL:
4765 case TGSI_OPCODE_TXL2:
4766 infix = ".l";
4767 break;
4768 case TGSI_OPCODE_TXD:
4769 infix = ".d";
4770 break;
4771 case TGSI_OPCODE_TG4:
4772 name = "llvm.SI.gather4";
4773 infix = ".lz";
4774 break;
4775 default:
4776 assert(0);
4777 return;
4778 }
4779
4780 /* Add the type and suffixes .c, .o if needed. */
4781 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4782 sprintf(intr_name, "%s%s%s%s.%s",
4783 name, is_shadow ? ".c" : "", infix,
4784 has_offset ? ".o" : "", type);
4785
4786 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4787 base->gallivm->builder, intr_name, emit_data->dst_type,
4788 emit_data->args, emit_data->arg_count,
4789 LLVMReadNoneAttribute);
4790 }
4791
4792 static void si_llvm_emit_txqs(
4793 const struct lp_build_tgsi_action *action,
4794 struct lp_build_tgsi_context *bld_base,
4795 struct lp_build_emit_data *emit_data)
4796 {
4797 struct si_shader_context *ctx = si_shader_context(bld_base);
4798 struct gallivm_state *gallivm = bld_base->base.gallivm;
4799 LLVMBuilderRef builder = gallivm->builder;
4800 LLVMValueRef res, samples;
4801 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4802
4803 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4804
4805
4806 /* Read the samples from the descriptor directly. */
4807 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4808 samples = LLVMBuildExtractElement(
4809 builder, res,
4810 lp_build_const_int32(gallivm, 3), "");
4811 samples = LLVMBuildLShr(builder, samples,
4812 lp_build_const_int32(gallivm, 16), "");
4813 samples = LLVMBuildAnd(builder, samples,
4814 lp_build_const_int32(gallivm, 0xf), "");
4815 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4816 samples, "");
4817
4818 emit_data->output[emit_data->chan] = samples;
4819 }
4820
4821 /*
4822 * SI implements derivatives using the local data store (LDS)
4823 * All writes to the LDS happen in all executing threads at
4824 * the same time. TID is the Thread ID for the current
4825 * thread and is a value between 0 and 63, representing
4826 * the thread's position in the wavefront.
4827 *
4828 * For the pixel shader threads are grouped into quads of four pixels.
4829 * The TIDs of the pixels of a quad are:
4830 *
4831 * +------+------+
4832 * |4n + 0|4n + 1|
4833 * +------+------+
4834 * |4n + 2|4n + 3|
4835 * +------+------+
4836 *
4837 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4838 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4839 * the current pixel's column, and masking with 0xfffffffe yields the TID
4840 * of the left pixel of the current pixel's row.
4841 *
4842 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4843 * adding 2 yields the TID of the pixel below the top pixel.
4844 */
4845 /* masks for thread ID. */
4846 #define TID_MASK_TOP_LEFT 0xfffffffc
4847 #define TID_MASK_TOP 0xfffffffd
4848 #define TID_MASK_LEFT 0xfffffffe
4849
/* Emit DDX/DDY (and their _FINE variants) by exchanging per-pixel values
 * across the 2x2 quad and subtracting neighbors.
 *
 * Values are exchanged either with llvm.amdgcn.ds.bpermute (LLVM >= 3.9
 * on VI/Tonga+) or by bouncing them through LDS, using the quad TID
 * layout documented above the TID_MASK_* defines.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* LDS slot belonging to this thread (used by the fallback path). */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Pick the mask selecting the "base" pixel of the difference:
	 * fine derivatives use the pixel's own row/column, coarse ones
	 * always start from the quad's top-left pixel. */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel (+1), for DDY the next Y pixel (+2). */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* Reuse the result of an earlier channel that reads the
		 * same source component. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* TID * 4: ds_bpermute addresses lanes in bytes. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Fallback: exchange the value through LDS. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4939
/*
 * This takes an I,J coordinate pair and works out the X and Y derivatives
 * by exchanging the values across the 2x2 pixel quad through LDS.
 * It returns a 4-element vector: DDX(I), DDX(J), DDY(I), DDY(J).
 */
static LLVMValueRef si_llvm_emit_ddxy_interp(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef interp_ij)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
	LLVMValueRef tl, tr, bl, result[4];
	unsigned c;

	/* LDS slot belonging to this thread. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* TID of the left pixel in this pixel's row (bit 0 cleared)... */
	temp = LLVMBuildAnd(gallivm->builder, indices[1],
			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");

	/* ...and of the top pixel in this pixel's column (bit 1 cleared). */
	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");

	indices[1] = temp;
	load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	indices[1] = temp2;
	load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	/* +1: the pixel to the right of the left pixel (for DDX)... */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
				  lp_build_const_int32(gallivm, 1), "");
	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
				    indices, 2, "");

	/* ...+2: the pixel below the top pixel (for DDY). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
				  lp_build_const_int32(gallivm, 2), "");
	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
				    indices, 2, "");

	/* c = 0 handles I, c = 1 handles J. */
	for (c = 0; c < 2; ++c) {
		LLVMValueRef store_val;
		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);

		/* Publish this thread's value, then read the neighbors'. */
		store_val = LLVMBuildExtractElement(gallivm->builder,
						    interp_ij, c_ll, "");
		LLVMBuildStore(gallivm->builder,
			       store_val,
			       store_ptr);

		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
		tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");

		/* DDX = right - left */
		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");

		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
		bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");

		/* DDY = bottom - top */
		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
	}

	return lp_build_gather_values(gallivm, result, 4);
}
5014
5015 static void interp_fetch_args(
5016 struct lp_build_tgsi_context *bld_base,
5017 struct lp_build_emit_data *emit_data)
5018 {
5019 struct si_shader_context *ctx = si_shader_context(bld_base);
5020 struct gallivm_state *gallivm = bld_base->base.gallivm;
5021 const struct tgsi_full_instruction *inst = emit_data->inst;
5022
5023 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5024 /* offset is in second src, first two channels */
5025 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5026 emit_data->inst, 1,
5027 TGSI_CHAN_X);
5028 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5029 emit_data->inst, 1,
5030 TGSI_CHAN_Y);
5031 emit_data->arg_count = 2;
5032 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5033 LLVMValueRef sample_position;
5034 LLVMValueRef sample_id;
5035 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5036
5037 /* fetch sample ID, then fetch its sample position,
5038 * and place into first two channels.
5039 */
5040 sample_id = lp_build_emit_fetch(bld_base,
5041 emit_data->inst, 1, TGSI_CHAN_X);
5042 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5043 ctx->i32, "");
5044 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5045
5046 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5047 sample_position,
5048 lp_build_const_int32(gallivm, 0), "");
5049
5050 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5051 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5052 sample_position,
5053 lp_build_const_int32(gallivm, 1), "");
5054 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5055 emit_data->arg_count = 2;
5056 }
5057 }
5058
/* Emit INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE.
 *
 * Selects the appropriate barycentric (I, J) input for the requested
 * location, optionally re-evaluates it at an offset/sample position
 * using the quad derivatives, then interpolates each destination
 * channel with llvm.SI.fs.interp (or fs.constant for flat inputs).
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* OFFSET/SAMPLE start from the center barycentrics and adjust
	 * them below; plain INTERP_CENTROID uses centroid directly. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* -1: unsupported mode, 0: flat (no interp param needed). */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = get_interp_param(ctx, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			/* fs.interp expects the IJ pair as i32 lanes. */
			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		/* fs.constant takes no barycentrics: only 3 args. */
		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute);
	}
}
5157
5158 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5159 struct lp_build_emit_data *emit_data)
5160 {
5161 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5162 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5163 unsigned stream;
5164
5165 assert(src0.File == TGSI_FILE_IMMEDIATE);
5166
5167 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5168 return stream;
5169 }
5170
/* Emit one vertex from the geometry shader: write all declared outputs
 * to the GSVS ring buffer of the vertex's stream, bump the per-stream
 * vertex counter, and send the EMIT message to the GS hardware. */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	/* AMDGPU.kill discards the thread if its argument is negative. */
	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Ring layout: each output component gets a
			 * gs_max_out_vertices-sized slice, indexed by the
			 * vertex number; the dword offset is scaled by 4
			 * to get bytes. */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission (the stream index rides in bits 9:8). */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, 0);
}
5248
5249 /* Cut one primitive from the geometry shader */
5250 static void si_llvm_emit_primitive(
5251 const struct lp_build_tgsi_action *action,
5252 struct lp_build_tgsi_context *bld_base,
5253 struct lp_build_emit_data *emit_data)
5254 {
5255 struct si_shader_context *ctx = si_shader_context(bld_base);
5256 struct gallivm_state *gallivm = bld_base->base.gallivm;
5257 LLVMValueRef args[2];
5258 unsigned stream;
5259
5260 /* Signal primitive cut */
5261 stream = si_llvm_get_stream(bld_base, emit_data);
5262 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5263 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5264 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5265 ctx->voidt, args, 2, 0);
5266 }
5267
/* Emit the TGSI BARRIER opcode as a workgroup barrier (or elide it
 * entirely for TCS, where it is unnecessary). */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;

	/* The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		/* Still emit an optimization barrier so LLVM does not
		 * reorder memory accesses across this point. */
		emit_optimization_barrier(ctx);
		return;
	}

	/* LLVM >= 3.9 exposes the amdgcn intrinsic; older LLVM only has
	 * the legacy AMDGPU one. */
	lp_build_intrinsic(gallivm->builder,
			   HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
					       : "llvm.AMDGPU.barrier.local",
			   ctx->voidt, NULL, 0, 0);
}
5288
/* Handler for TGSI texture opcodes: tex_fetch_args assembles the address
 * and resource arguments, build_tex_intrinsic emits the image call. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5293
/* Handler for the TGSI INTERP_* opcodes (pixel-shader interpolation at
 * centroid / offset / sample). */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5298
/**
 * Create the main LLVM function for the shader.
 *
 * \param returns            return value types (for multi-part shaders)
 * \param num_returns        number of entries in \p returns
 * \param params             input parameter types
 * \param num_params         number of entries in \p params
 * \param last_array_pointer index of the last descriptor-array pointer
 *                           parameter (gets byval)
 * \param last_sgpr          index of the last SGPR parameter; everything
 *                           up to it is marked inreg, the rest are VGPRs
 */
static void si_create_function(struct si_shader_context *ctx,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_array_pointer, int last_sgpr)
{
	int i;

	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
				params, num_params);
	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);

		/* We tell llvm that array inputs are passed by value to allow Sinking pass
		 * to move load. Inputs are constant so this is fine. */
		if (i <= last_array_pointer)
			LLVMAddAttribute(P, LLVMByValAttribute);
		else
			LLVMAddAttribute(P, LLVMInRegAttribute);
	}

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5338
5339 static void create_meta_data(struct si_shader_context *ctx)
5340 {
5341 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5342 LLVMValueRef tbaa_const[3];
5343
5344 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5345 "range", 5);
5346 ctx->tbaa_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5347 "tbaa", 4);
5348 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5349 "amdgpu.uniform", 14);
5350
5351 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5352
5353 tbaa_const[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
5354 tbaa_const[1] = 0;
5355 tbaa_const[2] = lp_build_const_int32(gallivm, 1);
5356 ctx->tbaa_const_md = LLVMMDNodeInContext(gallivm->context, tbaa_const, 3);
5357 }
5358
/**
 * Append the SGPR input parameters used for streamout (transform
 * feedback) to \p params and record their indices in \p ctx.
 *
 * \param so          the selector's stream-output info
 * \param params      parameter type array being built by the caller
 * \param i32         the i32 type to use for each SGPR
 * \param num_params  in/out: next free parameter slot
 */
static void declare_streamout_params(struct si_shader_context *ctx,
				     struct pipe_stream_output_info *so,
				     LLVMTypeRef *params, LLVMTypeRef i32,
				     unsigned *num_params)
{
	int i;

	/* Streamout SGPRs. */
	if (so->num_outputs) {
		/* TES shares the streamout-config SGPR slot with
		 * param_tess_offchip instead of adding a new one. */
		if (ctx->type != PIPE_SHADER_TESS_EVAL)
			params[ctx->param_streamout_config = (*num_params)++] = i32;
		else
			ctx->param_streamout_config = ctx->param_tess_offchip;

		params[ctx->param_streamout_write_index = (*num_params)++] = i32;
	}
	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (i = 0; i < 4; i++) {
		if (!so->stride[i])
			continue;

		params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
	}
}
5383
5384 static unsigned llvm_get_type_size(LLVMTypeRef type)
5385 {
5386 LLVMTypeKind kind = LLVMGetTypeKind(type);
5387
5388 switch (kind) {
5389 case LLVMIntegerTypeKind:
5390 return LLVMGetIntTypeWidth(type) / 8;
5391 case LLVMFloatTypeKind:
5392 return 4;
5393 case LLVMPointerTypeKind:
5394 return 8;
5395 case LLVMVectorTypeKind:
5396 return LLVMGetVectorSize(type) *
5397 llvm_get_type_size(LLVMGetElementType(type));
5398 default:
5399 assert(0);
5400 return 0;
5401 }
5402 }
5403
5404 static void declare_tess_lds(struct si_shader_context *ctx)
5405 {
5406 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5407 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5408 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5409
5410 /* The actual size is computed outside of the shader to reduce
5411 * the number of shader variants. */
5412 ctx->lds =
5413 LLVMAddGlobalInAddressSpace(gallivm->module,
5414 LLVMArrayType(i32, lds_size / 4),
5415 "tess_lds",
5416 LOCAL_ADDR_SPACE);
5417 }
5418
5419 static void create_function(struct si_shader_context *ctx)
5420 {
5421 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5422 struct gallivm_state *gallivm = bld_base->base.gallivm;
5423 struct si_shader *shader = ctx->shader;
5424 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5425 LLVMTypeRef returns[16+32*4];
5426 unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
5427 unsigned num_returns = 0;
5428
5429 v3i32 = LLVMVectorType(ctx->i32, 3);
5430
5431 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5432 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5433 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5434 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5435 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5436 last_array_pointer = SI_PARAM_SHADER_BUFFERS;
5437
5438 switch (ctx->type) {
5439 case PIPE_SHADER_VERTEX:
5440 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5441 last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
5442 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5443 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5444 num_params = SI_PARAM_START_INSTANCE+1;
5445
5446 if (shader->key.vs.as_es) {
5447 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5448 } else if (shader->key.vs.as_ls) {
5449 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5450 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5451 } else {
5452 if (ctx->is_gs_copy_shader) {
5453 last_array_pointer = SI_PARAM_RW_BUFFERS;
5454 num_params = SI_PARAM_RW_BUFFERS+1;
5455 } else {
5456 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5457 num_params = SI_PARAM_VS_STATE_BITS+1;
5458 }
5459
5460 /* The locations of the other parameters are assigned dynamically. */
5461 declare_streamout_params(ctx, &shader->selector->so,
5462 params, ctx->i32, &num_params);
5463 }
5464
5465 last_sgpr = num_params-1;
5466
5467 /* VGPRs */
5468 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5469 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5470 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5471 params[ctx->param_instance_id = num_params++] = ctx->i32;
5472
5473 if (!ctx->is_monolithic &&
5474 !ctx->is_gs_copy_shader) {
5475 /* Vertex load indices. */
5476 ctx->param_vertex_index0 = num_params;
5477
5478 for (i = 0; i < shader->selector->info.num_inputs; i++)
5479 params[num_params++] = ctx->i32;
5480
5481 /* PrimitiveID output. */
5482 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5483 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5484 returns[num_returns++] = ctx->f32;
5485 }
5486 break;
5487
5488 case PIPE_SHADER_TESS_CTRL:
5489 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5490 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5491 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5492 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5493 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5494 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5495 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5496
5497 /* VGPRs */
5498 params[SI_PARAM_PATCH_ID] = ctx->i32;
5499 params[SI_PARAM_REL_IDS] = ctx->i32;
5500 num_params = SI_PARAM_REL_IDS+1;
5501
5502 if (!ctx->is_monolithic) {
5503 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5504 * placed after the user SGPRs.
5505 */
5506 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5507 returns[num_returns++] = ctx->i32; /* SGPRs */
5508
5509 for (i = 0; i < 3; i++)
5510 returns[num_returns++] = ctx->f32; /* VGPRs */
5511 }
5512 break;
5513
5514 case PIPE_SHADER_TESS_EVAL:
5515 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5516 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5517
5518 if (shader->key.tes.as_es) {
5519 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5520 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5521 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5522 } else {
5523 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5524 declare_streamout_params(ctx, &shader->selector->so,
5525 params, ctx->i32, &num_params);
5526 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5527 }
5528 last_sgpr = num_params - 1;
5529
5530 /* VGPRs */
5531 params[ctx->param_tes_u = num_params++] = ctx->f32;
5532 params[ctx->param_tes_v = num_params++] = ctx->f32;
5533 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5534 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5535
5536 /* PrimitiveID output. */
5537 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5538 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5539 returns[num_returns++] = ctx->f32;
5540 break;
5541
5542 case PIPE_SHADER_GEOMETRY:
5543 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5544 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5545 last_sgpr = SI_PARAM_GS_WAVE_ID;
5546
5547 /* VGPRs */
5548 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5549 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5550 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5551 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5552 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5553 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5554 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5555 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5556 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5557 break;
5558
5559 case PIPE_SHADER_FRAGMENT:
5560 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5561 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5562 last_sgpr = SI_PARAM_PRIM_MASK;
5563 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5564 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5565 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5566 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5567 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5568 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5569 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5570 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5571 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5572 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5573 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5574 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5575 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5576 params[SI_PARAM_ANCILLARY] = ctx->i32;
5577 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5578 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5579 num_params = SI_PARAM_POS_FIXED_PT+1;
5580
5581 if (!ctx->is_monolithic) {
5582 /* Color inputs from the prolog. */
5583 if (shader->selector->info.colors_read) {
5584 unsigned num_color_elements =
5585 util_bitcount(shader->selector->info.colors_read);
5586
5587 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5588 for (i = 0; i < num_color_elements; i++)
5589 params[num_params++] = ctx->f32;
5590 }
5591
5592 /* Outputs for the epilog. */
5593 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5594 num_returns =
5595 num_return_sgprs +
5596 util_bitcount(shader->selector->info.colors_written) * 4 +
5597 shader->selector->info.writes_z +
5598 shader->selector->info.writes_stencil +
5599 shader->selector->info.writes_samplemask +
5600 1 /* SampleMaskIn */;
5601
5602 num_returns = MAX2(num_returns,
5603 num_return_sgprs +
5604 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5605
5606 for (i = 0; i < num_return_sgprs; i++)
5607 returns[i] = ctx->i32;
5608 for (; i < num_returns; i++)
5609 returns[i] = ctx->f32;
5610 }
5611 break;
5612
5613 case PIPE_SHADER_COMPUTE:
5614 params[SI_PARAM_GRID_SIZE] = v3i32;
5615 params[SI_PARAM_BLOCK_ID] = v3i32;
5616 last_sgpr = SI_PARAM_BLOCK_ID;
5617
5618 params[SI_PARAM_THREAD_ID] = v3i32;
5619 num_params = SI_PARAM_THREAD_ID + 1;
5620 break;
5621 default:
5622 assert(0 && "unimplemented shader");
5623 return;
5624 }
5625
5626 assert(num_params <= ARRAY_SIZE(params));
5627
5628 si_create_function(ctx, returns, num_returns, params,
5629 num_params, last_array_pointer, last_sgpr);
5630
5631 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5632 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5633 !ctx->is_monolithic) {
5634 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5635 "InitialPSInputAddr",
5636 S_0286D0_PERSP_SAMPLE_ENA(1) |
5637 S_0286D0_PERSP_CENTER_ENA(1) |
5638 S_0286D0_PERSP_CENTROID_ENA(1) |
5639 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5640 S_0286D0_LINEAR_CENTER_ENA(1) |
5641 S_0286D0_LINEAR_CENTROID_ENA(1) |
5642 S_0286D0_FRONT_FACE_ENA(1) |
5643 S_0286D0_POS_FIXED_PT_ENA(1));
5644 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5645 const unsigned *properties = shader->selector->info.properties;
5646 unsigned max_work_group_size =
5647 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5648 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5649 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5650
5651 assert(max_work_group_size);
5652
5653 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5654 "amdgpu-max-work-group-size",
5655 max_work_group_size);
5656 }
5657
5658 shader->info.num_input_sgprs = 0;
5659 shader->info.num_input_vgprs = 0;
5660
5661 for (i = 0; i <= last_sgpr; ++i)
5662 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5663
5664 /* Unused fragment shader inputs are eliminated by the compiler,
5665 * so we don't know yet how many there will be.
5666 */
5667 if (ctx->type != PIPE_SHADER_FRAGMENT)
5668 for (; i < num_params; ++i)
5669 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5670
5671 if (bld_base->info &&
5672 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5673 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5674 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5675 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5676 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5677 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5678 ctx->lds =
5679 LLVMAddGlobalInAddressSpace(gallivm->module,
5680 LLVMArrayType(ctx->i32, 64),
5681 "ddxy_lds",
5682 LOCAL_ADDR_SPACE);
5683
5684 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5685 ctx->type == PIPE_SHADER_TESS_CTRL ||
5686 ctx->type == PIPE_SHADER_TESS_EVAL)
5687 declare_tess_lds(ctx);
5688 }
5689
5690 static void preload_constants(struct si_shader_context *ctx)
5691 {
5692 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5693 struct gallivm_state *gallivm = bld_base->base.gallivm;
5694 const struct tgsi_shader_info *info = bld_base->info;
5695 unsigned buf;
5696 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5697
5698 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5699 unsigned i, num_const = info->const_file_max[buf] + 1;
5700
5701 if (num_const == 0)
5702 continue;
5703
5704 /* Allocate space for the constant values */
5705 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5706
5707 /* Load the resource descriptor */
5708 ctx->const_buffers[buf] =
5709 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5710
5711 /* Load the constants, we rely on the code sinking to do the rest */
5712 for (i = 0; i < num_const * 4; ++i) {
5713 ctx->constants[buf][i] =
5714 buffer_load_const(gallivm->builder,
5715 ctx->const_buffers[buf],
5716 lp_build_const_int32(gallivm, i * 4),
5717 ctx->f32);
5718 }
5719 }
5720 }
5721
5722 static void preload_shader_buffers(struct si_shader_context *ctx)
5723 {
5724 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5725 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5726 int buf, maxbuf;
5727
5728 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5729 SI_NUM_SHADER_BUFFERS - 1);
5730 for (buf = 0; buf <= maxbuf; ++buf) {
5731 ctx->shader_buffers[buf] =
5732 build_indexed_load_const(
5733 ctx, ptr, lp_build_const_int32(gallivm, buf));
5734 }
5735 }
5736
5737 static void preload_samplers(struct si_shader_context *ctx)
5738 {
5739 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5740 struct gallivm_state *gallivm = bld_base->base.gallivm;
5741 const struct tgsi_shader_info *info = bld_base->info;
5742 unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
5743 LLVMValueRef offset;
5744
5745 if (num_samplers == 0)
5746 return;
5747
5748 /* Load the resources and samplers, we rely on the code sinking to do the rest */
5749 for (i = 0; i < num_samplers; ++i) {
5750 /* Resource */
5751 offset = lp_build_const_int32(gallivm, i);
5752 ctx->sampler_views[i] =
5753 get_sampler_desc(ctx, offset, DESC_IMAGE);
5754
5755 /* FMASK resource */
5756 if (info->is_msaa_sampler[i])
5757 ctx->fmasks[i] =
5758 get_sampler_desc(ctx, offset, DESC_FMASK);
5759 else {
5760 ctx->sampler_states[i] =
5761 get_sampler_desc(ctx, offset, DESC_SAMPLER);
5762 ctx->sampler_states[i] =
5763 sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
5764 ctx->sampler_states[i]);
5765 }
5766 }
5767 }
5768
5769 static void preload_images(struct si_shader_context *ctx)
5770 {
5771 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5772 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5773 struct gallivm_state *gallivm = bld_base->base.gallivm;
5774 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5775 LLVMValueRef res_ptr;
5776 unsigned i;
5777
5778 if (num_images == 0)
5779 return;
5780
5781 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5782
5783 for (i = 0; i < num_images; ++i) {
5784 /* Rely on LLVM to shrink the load for buffer resources. */
5785 LLVMValueRef rsrc =
5786 build_indexed_load_const(ctx, res_ptr,
5787 lp_build_const_int32(gallivm, i));
5788
5789 if (info->images_writemask & (1 << i) &&
5790 !(info->images_buffers & (1 << i)))
5791 rsrc = force_dcc_off(ctx, rsrc);
5792
5793 ctx->images[i] = rsrc;
5794 }
5795 }
5796
5797 static void preload_streamout_buffers(struct si_shader_context *ctx)
5798 {
5799 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5800 struct gallivm_state *gallivm = bld_base->base.gallivm;
5801 unsigned i;
5802
5803 /* Streamout can only be used if the shader is compiled as VS. */
5804 if (!ctx->shader->selector->so.num_outputs ||
5805 (ctx->type == PIPE_SHADER_VERTEX &&
5806 (ctx->shader->key.vs.as_es ||
5807 ctx->shader->key.vs.as_ls)) ||
5808 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5809 ctx->shader->key.tes.as_es))
5810 return;
5811
5812 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5813 SI_PARAM_RW_BUFFERS);
5814
5815 /* Load the resources, we rely on the code sinking to do the rest */
5816 for (i = 0; i < 4; ++i) {
5817 if (ctx->shader->selector->so.stride[i]) {
5818 LLVMValueRef offset = lp_build_const_int32(gallivm,
5819 SI_VS_STREAMOUT_BUF0 + i);
5820
5821 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5822 }
5823 }
5824 }
5825
5826 /**
5827 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5828 * for later use.
5829 */
5830 static void preload_ring_buffers(struct si_shader_context *ctx)
5831 {
5832 struct gallivm_state *gallivm =
5833 ctx->radeon_bld.soa.bld_base.base.gallivm;
5834
5835 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5836 SI_PARAM_RW_BUFFERS);
5837
5838 if ((ctx->type == PIPE_SHADER_VERTEX &&
5839 ctx->shader->key.vs.as_es) ||
5840 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5841 ctx->shader->key.tes.as_es) ||
5842 ctx->type == PIPE_SHADER_GEOMETRY) {
5843 unsigned ring =
5844 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5845 : SI_ES_RING_ESGS;
5846 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5847
5848 ctx->esgs_ring =
5849 build_indexed_load_const(ctx, buf_ptr, offset);
5850 }
5851
5852 if (ctx->is_gs_copy_shader) {
5853 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5854
5855 ctx->gsvs_ring[0] =
5856 build_indexed_load_const(ctx, buf_ptr, offset);
5857 }
5858 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5859 int i;
5860 for (i = 0; i < 4; i++) {
5861 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5862
5863 ctx->gsvs_ring[i] =
5864 build_indexed_load_const(ctx, buf_ptr, offset);
5865 }
5866 }
5867 }
5868
/**
 * Emit IR that discards the fragment when the polygon-stipple pattern bit
 * for the current pixel is 0.
 *
 * \param param_rw_buffers    descriptor list holding the stipple buffer
 * \param param_pos_fixed_pt  index of the fixed-point position VGPR input
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
	desc = build_indexed_load_const(ctx, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(builder, desc, offset, ctx->i32);
	/* Select the bit for this pixel's X coordinate within the row. */
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
5902
5903 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5904 struct si_shader_config *conf,
5905 unsigned symbol_offset)
5906 {
5907 unsigned i;
5908 const unsigned char *config =
5909 radeon_shader_binary_config_start(binary, symbol_offset);
5910 bool really_needs_scratch = false;
5911
5912 /* LLVM adds SGPR spills to the scratch size.
5913 * Find out if we really need the scratch buffer.
5914 */
5915 for (i = 0; i < binary->reloc_count; i++) {
5916 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5917
5918 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5919 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5920 really_needs_scratch = true;
5921 break;
5922 }
5923 }
5924
5925 /* XXX: We may be able to emit some of these values directly rather than
5926 * extracting fields to be emitted later.
5927 */
5928
5929 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5930 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5931 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5932 switch (reg) {
5933 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5934 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5935 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5936 case R_00B848_COMPUTE_PGM_RSRC1:
5937 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5938 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5939 conf->float_mode = G_00B028_FLOAT_MODE(value);
5940 conf->rsrc1 = value;
5941 break;
5942 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5943 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5944 break;
5945 case R_00B84C_COMPUTE_PGM_RSRC2:
5946 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5947 conf->rsrc2 = value;
5948 break;
5949 case R_0286CC_SPI_PS_INPUT_ENA:
5950 conf->spi_ps_input_ena = value;
5951 break;
5952 case R_0286D0_SPI_PS_INPUT_ADDR:
5953 conf->spi_ps_input_addr = value;
5954 break;
5955 case R_0286E8_SPI_TMPRING_SIZE:
5956 case R_00B860_COMPUTE_TMPRING_SIZE:
5957 /* WAVESIZE is in units of 256 dwords. */
5958 if (really_needs_scratch)
5959 conf->scratch_bytes_per_wave =
5960 G_00B860_WAVESIZE(value) * 256 * 4;
5961 break;
5962 default:
5963 {
5964 static bool printed;
5965
5966 if (!printed) {
5967 fprintf(stderr, "Warning: LLVM emitted unknown "
5968 "config register: 0x%x\n", reg);
5969 printed = true;
5970 }
5971 }
5972 break;
5973 }
5974
5975 if (!conf->spi_ps_input_addr)
5976 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5977 }
5978 }
5979
5980 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5981 struct si_shader *shader,
5982 struct si_shader_config *config,
5983 uint64_t scratch_va)
5984 {
5985 unsigned i;
5986 uint32_t scratch_rsrc_dword0 = scratch_va;
5987 uint32_t scratch_rsrc_dword1 =
5988 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5989
5990 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
5991 * correctly.
5992 */
5993 if (HAVE_LLVM >= 0x0309)
5994 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5995 else
5996 scratch_rsrc_dword1 |=
5997 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5998
5999 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6000 const struct radeon_shader_reloc *reloc =
6001 &shader->binary.relocs[i];
6002 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6003 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6004 &scratch_rsrc_dword0, 4);
6005 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6006 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6007 &scratch_rsrc_dword1, 4);
6008 }
6009 }
6010 }
6011
6012 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6013 {
6014 unsigned size = shader->binary.code_size;
6015
6016 if (shader->prolog)
6017 size += shader->prolog->binary.code_size;
6018 if (shader->epilog)
6019 size += shader->epilog->binary.code_size;
6020 return size;
6021 }
6022
6023 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6024 {
6025 const struct radeon_shader_binary *prolog =
6026 shader->prolog ? &shader->prolog->binary : NULL;
6027 const struct radeon_shader_binary *epilog =
6028 shader->epilog ? &shader->epilog->binary : NULL;
6029 const struct radeon_shader_binary *mainb = &shader->binary;
6030 unsigned bo_size = si_get_shader_binary_size(shader) +
6031 (!epilog ? mainb->rodata_size : 0);
6032 unsigned char *ptr;
6033
6034 assert(!prolog || !prolog->rodata_size);
6035 assert((!prolog && !epilog) || !mainb->rodata_size);
6036 assert(!epilog || !epilog->rodata_size);
6037
6038 r600_resource_reference(&shader->bo, NULL);
6039 shader->bo = si_resource_create_custom(&sscreen->b.b,
6040 PIPE_USAGE_IMMUTABLE,
6041 bo_size);
6042 if (!shader->bo)
6043 return -ENOMEM;
6044
6045 /* Upload. */
6046 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6047 PIPE_TRANSFER_READ_WRITE);
6048
6049 if (prolog) {
6050 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6051 ptr += prolog->code_size;
6052 }
6053
6054 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6055 ptr += mainb->code_size;
6056
6057 if (epilog)
6058 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6059 else if (mainb->rodata_size > 0)
6060 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6061
6062 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6063 return 0;
6064 }
6065
6066 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
6067 struct pipe_debug_callback *debug,
6068 const char *name, FILE *file)
6069 {
6070 char *line, *p;
6071 unsigned i, count;
6072
6073 if (binary->disasm_string) {
6074 fprintf(file, "Shader %s disassembly:\n", name);
6075 fprintf(file, "%s", binary->disasm_string);
6076
6077 if (debug && debug->debug_message) {
6078 /* Very long debug messages are cut off, so send the
6079 * disassembly one line at a time. This causes more
6080 * overhead, but on the plus side it simplifies
6081 * parsing of resulting logs.
6082 */
6083 pipe_debug_message(debug, SHADER_INFO,
6084 "Shader Disassembly Begin");
6085
6086 line = binary->disasm_string;
6087 while (*line) {
6088 p = util_strchrnul(line, '\n');
6089 count = p - line;
6090
6091 if (count) {
6092 pipe_debug_message(debug, SHADER_INFO,
6093 "%.*s", count, line);
6094 }
6095
6096 if (!*p)
6097 break;
6098 line = p + 1;
6099 }
6100
6101 pipe_debug_message(debug, SHADER_INFO,
6102 "Shader Disassembly End");
6103 }
6104 } else {
6105 fprintf(file, "Shader %s binary:\n", name);
6106 for (i = 0; i < binary->code_size; i += 4) {
6107 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6108 binary->code[i + 3], binary->code[i + 2],
6109 binary->code[i + 1], binary->code[i]);
6110 }
6111 }
6112 }
6113
6114 static void si_shader_dump_stats(struct si_screen *sscreen,
6115 struct si_shader_config *conf,
6116 unsigned num_inputs,
6117 unsigned code_size,
6118 struct pipe_debug_callback *debug,
6119 unsigned processor,
6120 FILE *file)
6121 {
6122 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6123 unsigned lds_per_wave = 0;
6124 unsigned max_simd_waves = 10;
6125 /* Assuming SGPRs aren't spilled. */
6126 unsigned spilled_vgprs = conf->scratch_bytes_per_wave / 64 / 4;
6127
6128 /* Compute LDS usage for PS. */
6129 if (processor == PIPE_SHADER_FRAGMENT) {
6130 /* The minimum usage per wave is (num_inputs * 48). The maximum
6131 * usage is (num_inputs * 48 * 16).
6132 * We can get anything in between and it varies between waves.
6133 *
6134 * The 48 bytes per input for a single primitive is equal to
6135 * 4 bytes/component * 4 components/input * 3 points.
6136 *
6137 * Other stages don't know the size at compile time or don't
6138 * allocate LDS per wave, but instead they do it per thread group.
6139 */
6140 lds_per_wave = conf->lds_size * lds_increment +
6141 align(num_inputs * 48, lds_increment);
6142 }
6143
6144 /* Compute the per-SIMD wave counts. */
6145 if (conf->num_sgprs) {
6146 if (sscreen->b.chip_class >= VI)
6147 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6148 else
6149 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6150 }
6151
6152 if (conf->num_vgprs)
6153 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6154
6155 /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
6156 * that PS can use.
6157 */
6158 if (lds_per_wave)
6159 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6160
6161 if (file != stderr ||
6162 r600_can_dump_shader(&sscreen->b, processor)) {
6163 if (processor == PIPE_SHADER_FRAGMENT) {
6164 fprintf(file, "*** SHADER CONFIG ***\n"
6165 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6166 "SPI_PS_INPUT_ENA = 0x%04x\n",
6167 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6168 }
6169
6170 fprintf(file, "*** SHADER STATS ***\n"
6171 "SGPRS: %d\n"
6172 "VGPRS: %d\n"
6173 "Spilled VGPRs: %d\n"
6174 "Code Size: %d bytes\n"
6175 "LDS: %d blocks\n"
6176 "Scratch: %d bytes per wave\n"
6177 "Max Waves: %d\n"
6178 "********************\n",
6179 conf->num_sgprs, conf->num_vgprs, spilled_vgprs, code_size,
6180 conf->lds_size, conf->scratch_bytes_per_wave,
6181 max_simd_waves);
6182 }
6183
6184 pipe_debug_message(debug, SHADER_INFO,
6185 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6186 "LDS: %d Scratch: %d Max Waves: %d Spilled VGPRs: %d",
6187 conf->num_sgprs, conf->num_vgprs, code_size,
6188 conf->lds_size, conf->scratch_bytes_per_wave,
6189 max_simd_waves, spilled_vgprs);
6190 }
6191
6192 static const char *si_get_shader_name(struct si_shader *shader,
6193 unsigned processor)
6194 {
6195 switch (processor) {
6196 case PIPE_SHADER_VERTEX:
6197 if (shader->key.vs.as_es)
6198 return "Vertex Shader as ES";
6199 else if (shader->key.vs.as_ls)
6200 return "Vertex Shader as LS";
6201 else
6202 return "Vertex Shader as VS";
6203 case PIPE_SHADER_TESS_CTRL:
6204 return "Tessellation Control Shader";
6205 case PIPE_SHADER_TESS_EVAL:
6206 if (shader->key.tes.as_es)
6207 return "Tessellation Evaluation Shader as ES";
6208 else
6209 return "Tessellation Evaluation Shader as VS";
6210 case PIPE_SHADER_GEOMETRY:
6211 if (shader->gs_copy_shader == NULL)
6212 return "GS Copy Shader as VS";
6213 else
6214 return "Geometry Shader";
6215 case PIPE_SHADER_FRAGMENT:
6216 return "Pixel Shader";
6217 case PIPE_SHADER_COMPUTE:
6218 return "Compute Shader";
6219 default:
6220 return "Unknown Shader";
6221 }
6222 }
6223
6224 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6225 struct pipe_debug_callback *debug, unsigned processor,
6226 FILE *file)
6227 {
6228 if (file != stderr && shader->binary.llvm_ir_string) {
6229 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6230 si_get_shader_name(shader, processor));
6231 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6232 }
6233
6234 if (file != stderr ||
6235 (r600_can_dump_shader(&sscreen->b, processor) &&
6236 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6237 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6238
6239 if (shader->prolog)
6240 si_shader_dump_disassembly(&shader->prolog->binary,
6241 debug, "prolog", file);
6242
6243 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6244
6245 if (shader->epilog)
6246 si_shader_dump_disassembly(&shader->epilog->binary,
6247 debug, "epilog", file);
6248 fprintf(file, "\n");
6249 }
6250
6251 si_shader_dump_stats(sscreen, &shader->config,
6252 shader->selector ? shader->selector->info.num_inputs : 0,
6253 si_get_shader_binary_size(shader), debug, processor,
6254 file);
6255 }
6256
/**
 * Compile an LLVM module to machine code and parse the resulting
 * config registers into \p conf.
 *
 * Also records the module's IR into the binary when requested, applies the
 * FP denormal mode, and frees the raw config/symbol data that is no longer
 * needed after parsing.
 *
 * \param binary     receives the compiled code (and possibly IR string)
 * \param conf       receives the parsed register configuration
 * \param tm         LLVM target machine used for code generation
 * \param mod        the module to compile
 * \param debug      debug callback for diagnostics
 * \param processor  PIPE_SHADER_* stage, used for debug gating and the
 *                   rodata restriction below
 * \param name       human-readable shader name for debug output
 * \return 0 on success, negative value on failure
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	/* Keep a copy of the IR for later dumping via si_shader_dump. */
	if (sscreen->record_llvm_ir) {
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* si_replace_shader may substitute a binary from disk for debugging;
	 * only compile when no replacement was loaded. */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary, tm, debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The raw config data has been parsed into *conf; free it. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
6326
6327 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6328 {
6329 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6330 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6331 else
6332 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6333 }
6334
6335 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6336 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6337 struct si_shader_context *ctx,
6338 struct si_shader *gs,
6339 struct pipe_debug_callback *debug)
6340 {
6341 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6342 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6343 struct lp_build_context *uint = &bld_base->uint_bld;
6344 struct si_shader_output_values *outputs;
6345 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6346 LLVMValueRef args[9];
6347 int i, r;
6348
6349 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6350
6351 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6352 ctx->type = PIPE_SHADER_VERTEX;
6353 ctx->is_gs_copy_shader = true;
6354
6355 create_meta_data(ctx);
6356 create_function(ctx);
6357 preload_streamout_buffers(ctx);
6358 preload_ring_buffers(ctx);
6359
6360 args[0] = ctx->gsvs_ring[0];
6361 args[1] = lp_build_mul_imm(uint,
6362 LLVMGetParam(ctx->radeon_bld.main_fn,
6363 ctx->param_vertex_id),
6364 4);
6365 args[3] = uint->zero;
6366 args[4] = uint->one; /* OFFEN */
6367 args[5] = uint->zero; /* IDXEN */
6368 args[6] = uint->one; /* GLC */
6369 args[7] = uint->one; /* SLC */
6370 args[8] = uint->zero; /* TFE */
6371
6372 /* Fetch vertex data from GSVS ring */
6373 for (i = 0; i < gsinfo->num_outputs; ++i) {
6374 unsigned chan;
6375
6376 outputs[i].name = gsinfo->output_semantic_name[i];
6377 outputs[i].sid = gsinfo->output_semantic_index[i];
6378
6379 for (chan = 0; chan < 4; chan++) {
6380 args[2] = lp_build_const_int32(gallivm,
6381 (i * 4 + chan) *
6382 gs->selector->gs_max_out_vertices * 16 * 4);
6383
6384 outputs[i].values[chan] =
6385 LLVMBuildBitCast(gallivm->builder,
6386 lp_build_intrinsic(gallivm->builder,
6387 "llvm.SI.buffer.load.dword.i32.i32",
6388 ctx->i32, args, 9,
6389 LLVMReadOnlyAttribute),
6390 ctx->f32, "");
6391 }
6392 }
6393
6394 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6395
6396 LLVMBuildRetVoid(gallivm->builder);
6397
6398 /* Dump LLVM IR before any optimization passes */
6399 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6400 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6401 LLVMDumpModule(bld_base->base.gallivm->module);
6402
6403 radeon_llvm_finalize_module(&ctx->radeon_bld);
6404
6405 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6406 &ctx->shader->config, ctx->tm,
6407 bld_base->base.gallivm->module,
6408 debug, PIPE_SHADER_GEOMETRY,
6409 "GS Copy Shader");
6410 if (!r) {
6411 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6412 fprintf(stderr, "GS Copy Shader:\n");
6413 si_shader_dump(sscreen, ctx->shader, debug,
6414 PIPE_SHADER_GEOMETRY, stderr);
6415 r = si_shader_binary_upload(sscreen, ctx->shader);
6416 }
6417
6418 radeon_llvm_dispose(&ctx->radeon_bld);
6419
6420 FREE(outputs);
6421 return r;
6422 }
6423
/**
 * Print the contents of a shader key to \p f for debugging.
 *
 * \param shader  PIPE_SHADER_* stage that selects which key union member
 *                is valid
 * \param key     the shader key to dump
 * \param f       output stream
 */
void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		fprintf(f, "  instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		/* GS and CS keys have no fields worth printing. */
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
		fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
		fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
		fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		/* Unknown stage: programming error in the caller. */
		assert(0);
	}
}
6478
6479 static void si_init_shader_ctx(struct si_shader_context *ctx,
6480 struct si_screen *sscreen,
6481 struct si_shader *shader,
6482 LLVMTargetMachineRef tm)
6483 {
6484 struct lp_build_tgsi_context *bld_base;
6485 struct lp_build_tgsi_action tmpl = {};
6486
6487 memset(ctx, 0, sizeof(*ctx));
6488 radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
6489 ctx->tm = tm;
6490 ctx->screen = sscreen;
6491 if (shader && shader->selector)
6492 ctx->type = shader->selector->info.processor;
6493 else
6494 ctx->type = -1;
6495 ctx->shader = shader;
6496
6497 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6498 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6499 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6500 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6501 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6502 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6503 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6504 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6505 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6506 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6507 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6508 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6509
6510 bld_base = &ctx->radeon_bld.soa.bld_base;
6511 if (shader && shader->selector)
6512 bld_base->info = &shader->selector->info;
6513 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6514
6515 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6516 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6517 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6518
6519 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6520 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6521 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6522 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6523 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6524 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6525 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6526 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6527 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6528 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6529 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6530 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6531 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6532 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6533
6534 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6535 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6536 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6537 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6538 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6539 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6540
6541 tmpl.fetch_args = atomic_fetch_args;
6542 tmpl.emit = atomic_emit;
6543 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6544 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6545 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6546 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6547 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6548 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6549 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6550 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6551 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6552 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6553 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6554 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6555 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6556 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6557 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6558 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6559 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6560 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6561 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6562 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6563
6564 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6565
6566 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6567 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6568 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6569 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6570
6571 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6572 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6573 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6574
6575 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6576 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6577 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6578 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6579 }
6580
6581 int si_compile_tgsi_shader(struct si_screen *sscreen,
6582 LLVMTargetMachineRef tm,
6583 struct si_shader *shader,
6584 bool is_monolithic,
6585 struct pipe_debug_callback *debug)
6586 {
6587 struct si_shader_selector *sel = shader->selector;
6588 struct si_shader_context ctx;
6589 struct lp_build_tgsi_context *bld_base;
6590 LLVMModuleRef mod;
6591 int r = 0;
6592
6593 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6594 * conversion fails. */
6595 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6596 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6597 if (is_monolithic)
6598 si_dump_shader_key(sel->type, &shader->key, stderr);
6599 tgsi_dump(sel->tokens, 0);
6600 si_dump_streamout(&sel->so);
6601 }
6602
6603 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6604 ctx.is_monolithic = is_monolithic;
6605
6606 shader->info.uses_instanceid = sel->info.uses_instanceid;
6607
6608 bld_base = &ctx.radeon_bld.soa.bld_base;
6609 ctx.radeon_bld.load_system_value = declare_system_value;
6610
6611 switch (ctx.type) {
6612 case PIPE_SHADER_VERTEX:
6613 ctx.radeon_bld.load_input = declare_input_vs;
6614 if (shader->key.vs.as_ls)
6615 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6616 else if (shader->key.vs.as_es)
6617 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6618 else
6619 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6620 break;
6621 case PIPE_SHADER_TESS_CTRL:
6622 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6623 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6624 bld_base->emit_store = store_output_tcs;
6625 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6626 break;
6627 case PIPE_SHADER_TESS_EVAL:
6628 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6629 if (shader->key.tes.as_es)
6630 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6631 else
6632 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6633 break;
6634 case PIPE_SHADER_GEOMETRY:
6635 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6636 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6637 break;
6638 case PIPE_SHADER_FRAGMENT:
6639 ctx.radeon_bld.load_input = declare_input_fs;
6640 if (is_monolithic)
6641 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6642 else
6643 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6644 break;
6645 case PIPE_SHADER_COMPUTE:
6646 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6647 break;
6648 default:
6649 assert(!"Unsupported shader type");
6650 return -1;
6651 }
6652
6653 create_meta_data(&ctx);
6654 create_function(&ctx);
6655 preload_constants(&ctx);
6656 preload_shader_buffers(&ctx);
6657 preload_samplers(&ctx);
6658 preload_images(&ctx);
6659 preload_streamout_buffers(&ctx);
6660 preload_ring_buffers(&ctx);
6661
6662 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6663 shader->key.ps.prolog.poly_stipple) {
6664 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6665 SI_PARAM_RW_BUFFERS);
6666 si_llvm_emit_polygon_stipple(&ctx, list,
6667 SI_PARAM_POS_FIXED_PT);
6668 }
6669
6670 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6671 int i;
6672 for (i = 0; i < 4; i++) {
6673 ctx.gs_next_vertex[i] =
6674 lp_build_alloca(bld_base->base.gallivm,
6675 ctx.i32, "");
6676 }
6677 }
6678
6679 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6680 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6681 goto out;
6682 }
6683
6684 si_llvm_build_ret(&ctx, ctx.return_value);
6685 mod = bld_base->base.gallivm->module;
6686
6687 /* Dump LLVM IR before any optimization passes */
6688 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6689 r600_can_dump_shader(&sscreen->b, ctx.type))
6690 LLVMDumpModule(mod);
6691
6692 radeon_llvm_finalize_module(&ctx.radeon_bld);
6693
6694 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6695 mod, debug, ctx.type, "TGSI shader");
6696 if (r) {
6697 fprintf(stderr, "LLVM failed to compile shader\n");
6698 goto out;
6699 }
6700
6701 radeon_llvm_dispose(&ctx.radeon_bld);
6702
6703 /* Add the scratch offset to input SGPRs. */
6704 if (shader->config.scratch_bytes_per_wave)
6705 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6706
6707 /* Calculate the number of fragment input VGPRs. */
6708 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6709 shader->info.num_input_vgprs = 0;
6710 shader->info.face_vgpr_index = -1;
6711
6712 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6713 shader->info.num_input_vgprs += 2;
6714 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6715 shader->info.num_input_vgprs += 2;
6716 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6717 shader->info.num_input_vgprs += 2;
6718 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6719 shader->info.num_input_vgprs += 3;
6720 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6721 shader->info.num_input_vgprs += 2;
6722 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6723 shader->info.num_input_vgprs += 2;
6724 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6725 shader->info.num_input_vgprs += 2;
6726 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6727 shader->info.num_input_vgprs += 1;
6728 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6729 shader->info.num_input_vgprs += 1;
6730 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6731 shader->info.num_input_vgprs += 1;
6732 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6733 shader->info.num_input_vgprs += 1;
6734 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6735 shader->info.num_input_vgprs += 1;
6736 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6737 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6738 shader->info.num_input_vgprs += 1;
6739 }
6740 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6741 shader->info.num_input_vgprs += 1;
6742 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6743 shader->info.num_input_vgprs += 1;
6744 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6745 shader->info.num_input_vgprs += 1;
6746 }
6747
6748 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6749 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6750 shader->gs_copy_shader->selector = shader->selector;
6751 ctx.shader = shader->gs_copy_shader;
6752 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6753 shader, debug))) {
6754 free(shader->gs_copy_shader);
6755 shader->gs_copy_shader = NULL;
6756 goto out;
6757 }
6758 }
6759
6760 out:
6761 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6762 FREE(ctx.constants[i]);
6763 return r;
6764 }
6765
6766 /**
6767 * Create, compile and return a shader part (prolog or epilog).
6768 *
6769 * \param sscreen screen
6770 * \param list list of shader parts of the same category
6771 * \param key shader part key
6772 * \param tm LLVM target machine
6773 * \param debug debug callback
6774 * \param compile the callback responsible for compilation
6775 * \return non-NULL on success
6776 */
6777 static struct si_shader_part *
6778 si_get_shader_part(struct si_screen *sscreen,
6779 struct si_shader_part **list,
6780 union si_shader_part_key *key,
6781 LLVMTargetMachineRef tm,
6782 struct pipe_debug_callback *debug,
6783 bool (*compile)(struct si_screen *,
6784 LLVMTargetMachineRef,
6785 struct pipe_debug_callback *,
6786 struct si_shader_part *))
6787 {
6788 struct si_shader_part *result;
6789
6790 pipe_mutex_lock(sscreen->shader_parts_mutex);
6791
6792 /* Find existing. */
6793 for (result = *list; result; result = result->next) {
6794 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6795 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6796 return result;
6797 }
6798 }
6799
6800 /* Compile a new one. */
6801 result = CALLOC_STRUCT(si_shader_part);
6802 result->key = *key;
6803 if (!compile(sscreen, tm, debug, result)) {
6804 FREE(result);
6805 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6806 return NULL;
6807 }
6808
6809 result->next = *list;
6810 *list = result;
6811 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6812 return result;
6813 }
6814
6815 /**
6816 * Create a vertex shader prolog.
6817 *
6818 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6819 * All inputs are returned unmodified. The vertex load indices are
 * stored after them, which will be used by the API VS for fetching inputs.
6821 *
6822 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6823 * input_v0,
6824 * input_v1,
6825 * input_v2,
6826 * input_v3,
6827 * (VertexID + BaseVertex),
6828 * (InstanceID + StartInstance),
6829 * (InstanceID / 2 + StartInstance)
6830 */
static bool si_compile_vs_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	/* Dummy shader: the prolog is built directly, not from TGSI. */
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address is taken here; ctx is initialized just below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params, *returns;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_VERTEX;
	/* VertexID is the first VGPR after the input SGPRs,
	 * InstanceID the fourth. */
	ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
	ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	params = alloca((key->vs_prolog.num_input_sgprs + 4) *
			sizeof(LLVMTypeRef));
	returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
			  key->vs_prolog.last_input + 1) *
			 sizeof(LLVMTypeRef));
	num_params = 0;
	num_returns = 0;

	/* Declare input and output SGPRs. */
	num_params = 0; /* redundant (already zeroed above), kept as-is */
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		params[num_params++] = ctx.i32;
		returns[num_returns++] = ctx.i32;
	}
	last_sgpr = num_params - 1;

	/* 4 preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < 4; i++) {
		params[num_params++] = ctx.i32;
		returns[num_returns++] = ctx.f32;
	}

	/* Vertex load indices. */
	for (i = 0; i <= key->vs_prolog.last_input; i++)
		returns[num_returns++] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, returns, num_returns, params,
			   num_params, -1, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}
	/* VGPR outputs are declared as floats; bitcast before inserting. */
	for (i = num_params - 4; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		unsigned divisor = key->vs_prolog.states.instance_divisors[i];
		LLVMValueRef index;

		if (divisor) {
			/* InstanceID / Divisor + StartInstance */
			index = get_instance_index_for_fetch(&ctx.radeon_bld,
							     SI_SGPR_START_INSTANCE,
							     divisor);
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(gallivm->builder,
					     LLVMGetParam(func, ctx.param_vertex_id),
					     LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
		}

		/* Indices are appended after the preloaded VGPRs. */
		index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
					   num_params++, "");
	}

	/* Compile. */
	si_llvm_build_ret(&ctx, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	/* si_compile_llvm returns nonzero on failure. */
	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6930
6931 /**
6932 * Compile the vertex shader epilog. This is also used by the tessellation
6933 * evaluation shader compiled as VS.
6934 *
6935 * The input is PrimitiveID.
6936 *
6937 * If PrimitiveID is required by the pixel shader, export it.
6938 * Otherwise, do nothing.
6939 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Only addresses are taken here; ctx is initialized just below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. PrimitiveID sits at VS_EPILOG_PRIMID_LOC,
	 * so declare parameters up to and including that slot if needed. */
	num_params = key->vs_epilog.states.export_prim_id ?
		(VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   -1, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		LLVMValueRef args[9];

		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	/* si_compile_llvm returns nonzero on failure. */
	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7003
7004 /**
7005 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7006 */
7007 static bool si_get_vs_epilog(struct si_screen *sscreen,
7008 LLVMTargetMachineRef tm,
7009 struct si_shader *shader,
7010 struct pipe_debug_callback *debug,
7011 struct si_vs_epilog_bits *states)
7012 {
7013 union si_shader_part_key epilog_key;
7014
7015 memset(&epilog_key, 0, sizeof(epilog_key));
7016 epilog_key.vs_epilog.states = *states;
7017
7018 /* Set up the PrimitiveID output. */
7019 if (shader->key.vs.epilog.export_prim_id) {
7020 unsigned index = shader->selector->info.num_outputs;
7021 unsigned offset = shader->info.nr_param_exports++;
7022
7023 epilog_key.vs_epilog.prim_id_param_offset = offset;
7024 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7025 shader->info.vs_output_param_offset[index] = offset;
7026 }
7027
7028 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7029 &epilog_key, tm, debug,
7030 si_compile_vs_epilog);
7031 return shader->epilog != NULL;
7032 }
7033
7034 /**
7035 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7036 */
7037 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7038 LLVMTargetMachineRef tm,
7039 struct si_shader *shader,
7040 struct pipe_debug_callback *debug)
7041 {
7042 struct tgsi_shader_info *info = &shader->selector->info;
7043 union si_shader_part_key prolog_key;
7044 unsigned i;
7045
7046 /* Get the prolog. */
7047 memset(&prolog_key, 0, sizeof(prolog_key));
7048 prolog_key.vs_prolog.states = shader->key.vs.prolog;
7049 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7050 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7051
7052 /* The prolog is a no-op if there are no inputs. */
7053 if (info->num_inputs) {
7054 shader->prolog =
7055 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7056 &prolog_key, tm, debug,
7057 si_compile_vs_prolog);
7058 if (!shader->prolog)
7059 return false;
7060 }
7061
7062 /* Get the epilog. */
7063 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7064 !si_get_vs_epilog(sscreen, tm, shader, debug,
7065 &shader->key.vs.epilog))
7066 return false;
7067
7068 /* Set the instanceID flag. */
7069 for (i = 0; i < info->num_inputs; i++)
7070 if (prolog_key.vs_prolog.states.instance_divisors[i])
7071 shader->info.uses_instanceid = true;
7072
7073 return true;
7074 }
7075
7076 /**
7077 * Select and compile (or reuse) TES parts (epilog).
7078 */
7079 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7080 LLVMTargetMachineRef tm,
7081 struct si_shader *shader,
7082 struct pipe_debug_callback *debug)
7083 {
7084 if (shader->key.tes.as_es)
7085 return true;
7086
7087 /* TES compiled as VS. */
7088 return si_get_vs_epilog(sscreen, tm, shader, debug,
7089 &shader->key.tes.epilog);
7090 }
7091
7092 /**
 * Compile the TCS epilog. This writes tessellation factors to memory based on
 * the output primitive type of the tessellator (determined by TES).
7095 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	/* Dummy shader: the epilog is built directly, not from TGSI. */
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only addresses are taken here; ctx is initialized just below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_array_pointer, last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used;
	 * the rest mirror the TCS SGPR layout so parameter indices match. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	last_array_pointer = SI_PARAM_RW_BUFFERS;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors, reading them from the three VGPRs above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	/* si_compile_llvm returns nonzero on failure. */
	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7158
7159 /**
7160 * Select and compile (or reuse) TCS parts (epilog).
7161 */
7162 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7163 LLVMTargetMachineRef tm,
7164 struct si_shader *shader,
7165 struct pipe_debug_callback *debug)
7166 {
7167 union si_shader_part_key epilog_key;
7168
7169 /* Get the epilog. */
7170 memset(&epilog_key, 0, sizeof(epilog_key));
7171 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7172
7173 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7174 &epilog_key, tm, debug,
7175 si_compile_tcs_epilog);
7176 return shader->epilog != NULL;
7177 }
7178
7179 /**
7180 * Compile the pixel shader prolog. This handles:
7181 * - two-side color selection and interpolation
7182 * - overriding interpolation parameters for the API PS
7183 * - polygon stippling
7184 *
7185 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
7187 * Interpolated colors are stored after the preloaded VGPRs.
7188 */
7189 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7190 LLVMTargetMachineRef tm,
7191 struct pipe_debug_callback *debug,
7192 struct si_shader_part *out)
7193 {
7194 union si_shader_part_key *key = &out->key;
7195 struct si_shader shader = {};
7196 struct si_shader_context ctx;
7197 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7198 LLVMTypeRef *params;
7199 LLVMValueRef ret, func;
7200 int last_sgpr, num_params, num_returns, i, num_color_channels;
7201 bool status = true;
7202
7203 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7204 ctx.type = PIPE_SHADER_FRAGMENT;
7205 shader.key.ps.prolog = key->ps_prolog.states;
7206
7207 /* Number of inputs + 8 color elements. */
7208 params = alloca((key->ps_prolog.num_input_sgprs +
7209 key->ps_prolog.num_input_vgprs + 8) *
7210 sizeof(LLVMTypeRef));
7211
7212 /* Declare inputs. */
7213 num_params = 0;
7214 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7215 params[num_params++] = ctx.i32;
7216 last_sgpr = num_params - 1;
7217
7218 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7219 params[num_params++] = ctx.f32;
7220
7221 /* Declare outputs (same as inputs + add colors if needed) */
7222 num_returns = num_params;
7223 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7224 for (i = 0; i < num_color_channels; i++)
7225 params[num_returns++] = ctx.f32;
7226
7227 /* Create the function. */
7228 si_create_function(&ctx, params, num_returns, params,
7229 num_params, -1, last_sgpr);
7230 func = ctx.radeon_bld.main_fn;
7231
7232 /* Copy inputs to outputs. This should be no-op, as the registers match,
7233 * but it will prevent the compiler from overwriting them unintentionally.
7234 */
7235 ret = ctx.return_value;
7236 for (i = 0; i < num_params; i++) {
7237 LLVMValueRef p = LLVMGetParam(func, i);
7238 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7239 }
7240
7241 /* Polygon stippling. */
7242 if (key->ps_prolog.states.poly_stipple) {
7243 /* POS_FIXED_PT is always last. */
7244 unsigned pos = key->ps_prolog.num_input_sgprs +
7245 key->ps_prolog.num_input_vgprs - 1;
7246 LLVMValueRef ptr[2], list;
7247
7248 /* Get the pointer to rw buffers. */
7249 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7250 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7251 list = lp_build_gather_values(gallivm, ptr, 2);
7252 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7253 list = LLVMBuildIntToPtr(gallivm->builder, list,
7254 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7255
7256 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7257 }
7258
7259 if (key->ps_prolog.states.bc_optimize_for_persp ||
7260 key->ps_prolog.states.bc_optimize_for_linear) {
7261 unsigned i, base = key->ps_prolog.num_input_sgprs;
7262 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7263
7264 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7265 * The hw doesn't compute CENTROID if the whole wave only
7266 * contains fully-covered quads.
7267 *
7268 * PRIM_MASK is after user SGPRs.
7269 */
7270 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7271 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7272 LLVMConstInt(ctx.i32, 31, 0), "");
7273 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7274 ctx.i1, "");
7275
7276 if (key->ps_prolog.states.bc_optimize_for_persp) {
7277 /* Read PERSP_CENTER. */
7278 for (i = 0; i < 2; i++)
7279 center[i] = LLVMGetParam(func, base + 2 + i);
7280 /* Read PERSP_CENTROID. */
7281 for (i = 0; i < 2; i++)
7282 centroid[i] = LLVMGetParam(func, base + 4 + i);
7283 /* Select PERSP_CENTROID. */
7284 for (i = 0; i < 2; i++) {
7285 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7286 center[i], centroid[i], "");
7287 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7288 tmp, base + 4 + i, "");
7289 }
7290 }
7291 if (key->ps_prolog.states.bc_optimize_for_linear) {
7292 /* Read LINEAR_CENTER. */
7293 for (i = 0; i < 2; i++)
7294 center[i] = LLVMGetParam(func, base + 8 + i);
7295 /* Read LINEAR_CENTROID. */
7296 for (i = 0; i < 2; i++)
7297 centroid[i] = LLVMGetParam(func, base + 10 + i);
7298 /* Select LINEAR_CENTROID. */
7299 for (i = 0; i < 2; i++) {
7300 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7301 center[i], centroid[i], "");
7302 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7303 tmp, base + 10 + i, "");
7304 }
7305 }
7306 }
7307
7308 /* Interpolate colors. */
7309 for (i = 0; i < 2; i++) {
7310 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7311 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7312 key->ps_prolog.face_vgpr_index;
7313 LLVMValueRef interp[2], color[4];
7314 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7315
7316 if (!writemask)
7317 continue;
7318
7319 /* If the interpolation qualifier is not CONSTANT (-1). */
7320 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7321 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7322 key->ps_prolog.color_interp_vgpr_index[i];
7323
7324 /* Get the (i,j) updated by bc_optimize handling. */
7325 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7326 interp_vgpr, "");
7327 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7328 interp_vgpr + 1, "");
7329 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7330 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7331 ctx.v2i32, "");
7332 }
7333
7334 /* Use the absolute location of the input. */
7335 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7336
7337 if (key->ps_prolog.states.color_two_side) {
7338 face = LLVMGetParam(func, face_vgpr);
7339 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7340 }
7341
7342 interp_fs_input(&ctx,
7343 key->ps_prolog.color_attr_index[i],
7344 TGSI_SEMANTIC_COLOR, i,
7345 key->ps_prolog.num_interp_inputs,
7346 key->ps_prolog.colors_read, interp_ij,
7347 prim_mask, face, color);
7348
7349 while (writemask) {
7350 unsigned chan = u_bit_scan(&writemask);
7351 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7352 num_params++, "");
7353 }
7354 }
7355
7356 /* Force per-sample interpolation. */
7357 if (key->ps_prolog.states.force_persp_sample_interp) {
7358 unsigned i, base = key->ps_prolog.num_input_sgprs;
7359 LLVMValueRef persp_sample[2];
7360
7361 /* Read PERSP_SAMPLE. */
7362 for (i = 0; i < 2; i++)
7363 persp_sample[i] = LLVMGetParam(func, base + i);
7364 /* Overwrite PERSP_CENTER. */
7365 for (i = 0; i < 2; i++)
7366 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7367 persp_sample[i], base + 2 + i, "");
7368 /* Overwrite PERSP_CENTROID. */
7369 for (i = 0; i < 2; i++)
7370 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7371 persp_sample[i], base + 4 + i, "");
7372 }
7373 if (key->ps_prolog.states.force_linear_sample_interp) {
7374 unsigned i, base = key->ps_prolog.num_input_sgprs;
7375 LLVMValueRef linear_sample[2];
7376
7377 /* Read LINEAR_SAMPLE. */
7378 for (i = 0; i < 2; i++)
7379 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7380 /* Overwrite LINEAR_CENTER. */
7381 for (i = 0; i < 2; i++)
7382 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7383 linear_sample[i], base + 8 + i, "");
7384 /* Overwrite LINEAR_CENTROID. */
7385 for (i = 0; i < 2; i++)
7386 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7387 linear_sample[i], base + 10 + i, "");
7388 }
7389
7390 /* Force center interpolation. */
7391 if (key->ps_prolog.states.force_persp_center_interp) {
7392 unsigned i, base = key->ps_prolog.num_input_sgprs;
7393 LLVMValueRef persp_center[2];
7394
7395 /* Read PERSP_CENTER. */
7396 for (i = 0; i < 2; i++)
7397 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7398 /* Overwrite PERSP_SAMPLE. */
7399 for (i = 0; i < 2; i++)
7400 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7401 persp_center[i], base + i, "");
7402 /* Overwrite PERSP_CENTROID. */
7403 for (i = 0; i < 2; i++)
7404 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7405 persp_center[i], base + 4 + i, "");
7406 }
7407 if (key->ps_prolog.states.force_linear_center_interp) {
7408 unsigned i, base = key->ps_prolog.num_input_sgprs;
7409 LLVMValueRef linear_center[2];
7410
7411 /* Read LINEAR_CENTER. */
7412 for (i = 0; i < 2; i++)
7413 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7414 /* Overwrite LINEAR_SAMPLE. */
7415 for (i = 0; i < 2; i++)
7416 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7417 linear_center[i], base + 6 + i, "");
7418 /* Overwrite LINEAR_CENTROID. */
7419 for (i = 0; i < 2; i++)
7420 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7421 linear_center[i], base + 10 + i, "");
7422 }
7423
7424 /* Tell LLVM to insert WQM instruction sequence when needed. */
7425 if (key->ps_prolog.wqm) {
7426 LLVMAddTargetDependentFunctionAttr(func,
7427 "amdgpu-ps-wqm-outputs", "");
7428 }
7429
7430 /* Compile. */
7431 si_llvm_build_ret(&ctx, ret);
7432 radeon_llvm_finalize_module(&ctx.radeon_bld);
7433
7434 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7435 gallivm->module, debug, ctx.type,
7436 "Fragment Shader Prolog"))
7437 status = false;
7438
7439 radeon_llvm_dispose(&ctx.radeon_bld);
7440 return status;
7441 }
7442
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * \param sscreen  screen the part is compiled for
 * \param tm       LLVM target machine used for compilation
 * \param debug    debug callback for compiler diagnostics
 * \param out      shader part whose key (ps_epilog) selects the variant;
 *                 receives the compiled binary and config on success
 * \return true on success, false if LLVM compilation failed
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* Worst case: the SGPR params + 8 MRTs * 4 channels + Z/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_array_pointer, last_sgpr, num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_array_pointer = -1;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: one f32 per written color channel, plus one
	 * each for Z, stencil and samplemask if the shader writes them. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Make sure the VGPR list always reaches the sample-mask location. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, so that export can carry the "done" bit.
	 * Only applicable when no Z/stencil/samplemask export follows. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			/* 4 format bits per MRT; the highest enabled MRT with a
			 * non-zero export format is the last export. */
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		/* num_params - 1 is the index of the last input VGPR.
		 * NOTE(review): presumably the coverage/sample mask, cf. the
		 * PS_EPILOG_SAMPLEMASK_MIN_LOC clamp above — confirm against
		 * si_export_mrt_color's parameter list. */
		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	/* Every fragment shader must export something; emit a null export
	 * when there is neither a Z/stencil/mask export nor a color export. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7560
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Builds the prolog and epilog keys from the shader key and TGSI info,
 * fetches cached parts (or compiles new ones) via si_get_shader_part,
 * and patches shader->config.spi_ps_input_ena to match the selected parts.
 *
 * \return false if compiling a required part failed
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs WQM only when the main shader uses derivatives
	 * and the prolog does interpolation-related work. */
	prolog_key.ps_prolog.wqm = info->uses_derivatives &&
		(prolog_key.ps_prolog.colors_read ||
		 prolog_key.ps_prolog.states.force_persp_sample_interp ||
		 prolog_key.ps_prolog.states.force_linear_sample_interp ||
		 prolog_key.ps_prolog.states.force_persp_center_interp ||
		 prolog_key.ps_prolog.states.force_linear_center_interp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_linear);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Set up interpolation for each of the two color inputs.
		 * color_interp_vgpr_index is the offset of the (i,j) pair
		 * within the interpolation-input VGPRs: PERSP occupies
		 * VGPRs 0-5 (sample/center/centroid), LINEAR 6-11, and -1
		 * means constant (flat) interpolation. */
		for (i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			if (shader->key.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persp_sample_interp ||
	    prolog_key.ps_prolog.states.force_linear_sample_interp ||
	    prolog_key.ps_prolog.states.force_persp_center_interp ||
	    prolog_key.ps_prolog.states.force_linear_center_interp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_persp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_linear ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. The epilog is always required (it does the exports). */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 * Each fixup clears the superseded center/centroid (or sample/centroid)
	 * pair and enables the forced one instead. */
	if (shader->key.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7767
7768 static void si_fix_num_sgprs(struct si_shader *shader)
7769 {
7770 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7771
7772 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7773 }
7774
/**
 * Create (compile or assemble from parts) and upload a shader variant.
 *
 * Either compiles the shader monolithically, or reuses the selector's
 * precompiled main part and attaches prolog/epilog parts to it.
 *
 * \return 0 on success, a non-zero error code on failure
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over.  The binary is
		 * shared with the main part, so si_shader_destroy must not
		 * free it (see is_binary_shared). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts.  The final register allocation
		 * must cover the largest of the main part and its parts. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7880
7881 void si_shader_destroy(struct si_shader *shader)
7882 {
7883 if (shader->gs_copy_shader) {
7884 si_shader_destroy(shader->gs_copy_shader);
7885 FREE(shader->gs_copy_shader);
7886 }
7887
7888 if (shader->scratch_bo)
7889 r600_resource_reference(&shader->scratch_bo, NULL);
7890
7891 r600_resource_reference(&shader->bo, NULL);
7892
7893 if (!shader->is_binary_shared)
7894 radeon_shader_binary_clean(&shader->binary);
7895 }