radeonsi: workaround for tessellation on SI
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "radeon/r600_cs.h"
37 #include "radeon/radeon_llvm.h"
38 #include "radeon/radeon_elf_util.h"
39 #include "radeon/radeon_llvm_emit.h"
40 #include "util/u_memory.h"
41 #include "util/u_pstipple.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* Names of the ELF symbols that hold the first two dwords of the scratch
 * buffer resource descriptor.
 * NOTE(review): the actual patching of these symbols happens outside this
 * chunk — confirm against the shader binary upload code. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
59
/* One shader output: up to four per-channel LLVM values plus the TGSI
 * semantic (name, sid) identifying the output. */
struct si_shader_output_values
{
	LLVMValueRef values[4]; /* one value per component (x, y, z, w) */
	unsigned name;          /* TGSI_SEMANTIC_* */
	unsigned sid;           /* semantic index */
};
66
/* Per-compilation state for translating one TGSI shader to LLVM IR.
 * Embeds radeon_llvm_context first so that it can be recovered from a
 * lp_build_tgsi_context pointer via a simple cast (see si_shader_context()). */
struct si_shader_context
{
	struct radeon_llvm_context radeon_bld; /* must stay the first member */
	struct si_shader *shader;
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of the main function's input parameters; resolved with
	 * LLVMGetParam(). A value of -1 presumably means "not present" —
	 * NOTE(review): confirm against create_function code (not in view). */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;

	LLVMTargetMachineRef tm;

	/* Metadata used to annotate loads: "amdgpu.uniform" style kind id
	 * plus the nodes attached in build_indexed_load*(). */
	unsigned uniform_md_kind;
	LLVMValueRef const_md;
	LLVMValueRef empty_md;
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds; /* base pointer of the LDS array (tess I/O) */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types, cached once per compilation. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
129
/* Recover the radeonsi context from the generic TGSI translation context.
 * Safe because si_shader_context embeds radeon_llvm_context as its first
 * member. NOTE(review): assumes bld_base is at offset 0 within
 * radeon_llvm_context — confirm in radeon_llvm.h. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
135
136 static void si_init_shader_ctx(struct si_shader_context *ctx,
137 struct si_screen *sscreen,
138 struct si_shader *shader,
139 LLVMTargetMachineRef tm);
140
/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

/* Bases/offsets for interpolation parameters; presumably indexes into the
 * SPI (i,j) input sets — usage is outside this chunk, verify at call sites. */
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
/* NOTE(review): "OFSET" is a historical typo; the name is kept because
 * other parts of the file may reference it verbatim. */
#define CENTROID_OFSET 4

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* LLVM address-space numbers used when building pointer types. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* s_sendmsg message types for geometry shaders. */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

/* GS operation encoded in bits [5:4] of the sendmsg immediate. */
#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
171
172 /**
173 * Returns a unique index for a semantic name and index. The index must be
174 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
175 * calculated.
176 */
177 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
178 {
179 switch (semantic_name) {
180 case TGSI_SEMANTIC_POSITION:
181 return 0;
182 case TGSI_SEMANTIC_PSIZE:
183 return 1;
184 case TGSI_SEMANTIC_CLIPDIST:
185 assert(index <= 1);
186 return 2 + index;
187 case TGSI_SEMANTIC_GENERIC:
188 if (index <= 63-4)
189 return 4 + index;
190 else
191 /* same explanation as in the default statement,
192 * the only user hitting this is st/nine.
193 */
194 return 0;
195
196 /* patch indices are completely separate and thus start from 0 */
197 case TGSI_SEMANTIC_TESSOUTER:
198 return 0;
199 case TGSI_SEMANTIC_TESSINNER:
200 return 1;
201 case TGSI_SEMANTIC_PATCH:
202 return 2 + index;
203
204 default:
205 /* Don't fail here. The result of this function is only used
206 * for LS, TCS, TES, and GS, where legacy GL semantics can't
207 * occur, but this function is called for all vertex shaders
208 * before it's known whether LS will be compiled or not.
209 */
210 return 0;
211 }
212 }
213
214 /**
215 * Get the value of a shader input parameter and extract a bitfield.
216 */
/**
 * Get the value of a shader input parameter and extract a bitfield.
 *
 * \param param  index of the main function's input parameter
 * \param rshift right shift applied before masking (bit position)
 * \param bitwidth width of the extracted field; fields reaching bit 31
 *                 skip the mask entirely
 */
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
					  param);

	/* Parameters may arrive as floats; reinterpret as unsigned bits. */
	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
		value = bitcast(&ctx->radeon_bld.soa.bld_base,
				TGSI_TYPE_UNSIGNED, value);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      lp_build_const_int32(gallivm, rshift), "");

	/* Mask off high bits unless the field extends to bit 31. */
	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     lp_build_const_int32(gallivm, mask), "");
	}

	return value;
}
241
242 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
243 {
244 switch (ctx->type) {
245 case PIPE_SHADER_TESS_CTRL:
246 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
247
248 case PIPE_SHADER_TESS_EVAL:
249 return LLVMGetParam(ctx->radeon_bld.main_fn,
250 ctx->param_tes_rel_patch_id);
251
252 default:
253 assert(0);
254 return NULL;
255 }
256 }
257
258 /* Tessellation shaders pass outputs to the next shader using LDS.
259 *
260 * LS outputs = TCS inputs
261 * TCS outputs = TES inputs
262 *
263 * The LDS layout is:
264 * - TCS inputs for patch 0
265 * - TCS inputs for patch 1
266 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
267 * - ...
268 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
269 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
270 * - TCS outputs for patch 1
271 * - Per-patch TCS outputs for patch 1
272 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
273 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
274 * - ...
275 *
276 * All three shaders VS(LS), TCS, TES share the same LDS space.
277 */
278
279 static LLVMValueRef
280 get_tcs_in_patch_stride(struct si_shader_context *ctx)
281 {
282 if (ctx->type == PIPE_SHADER_VERTEX)
283 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
284 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
285 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
286 else {
287 assert(0);
288 return NULL;
289 }
290 }
291
/* Stride (in dwords) of one TCS output patch in LDS. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
297
/* LDS offset (in dwords) where TCS outputs for patch 0 start.  The SGPR
 * field stores the offset in 4-dword units, hence the multiply by 4. */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
				unpack_param(ctx,
					     SI_PARAM_TCS_OUT_OFFSETS,
					     0, 16),
				4);
}
307
/* LDS offset (in dwords) where per-patch TCS outputs for patch 0 start;
 * taken from the upper 16 bits of the same SGPR as patch0_offset, also
 * stored in 4-dword units. */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
				unpack_param(ctx,
					     SI_PARAM_TCS_OUT_OFFSETS,
					     16, 16),
				4);
}
317
318 static LLVMValueRef
319 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
320 {
321 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
322 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
323 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
324
325 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
326 }
327
328 static LLVMValueRef
329 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
330 {
331 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
332 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
333 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
334 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
335
336 return LLVMBuildAdd(gallivm->builder, patch0_offset,
337 LLVMBuildMul(gallivm->builder, patch_stride,
338 rel_patch_id, ""),
339 "");
340 }
341
342 static LLVMValueRef
343 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
344 {
345 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
346 LLVMValueRef patch0_patch_data_offset =
347 get_tcs_out_patch0_patch_data_offset(ctx);
348 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
349 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
350
351 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
352 LLVMBuildMul(gallivm->builder, patch_stride,
353 rel_patch_id, ""),
354 "");
355 }
356
357 static void build_indexed_store(struct si_shader_context *ctx,
358 LLVMValueRef base_ptr, LLVMValueRef index,
359 LLVMValueRef value)
360 {
361 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
362 struct gallivm_state *gallivm = bld_base->base.gallivm;
363 LLVMValueRef indices[2], pointer;
364
365 indices[0] = bld_base->uint_bld.zero;
366 indices[1] = index;
367
368 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
369 LLVMBuildStore(gallivm->builder, value, pointer);
370 }
371
372 /**
373 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
374 * It's equivalent to doing a load from &base_ptr[index].
375 *
376 * \param base_ptr Where the array starts.
377 * \param index The element index into the array.
378 * \param uniform Whether the base_ptr and index can be assumed to be
379 * dynamically uniform
380 */
381 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
382 LLVMValueRef base_ptr, LLVMValueRef index,
383 bool uniform)
384 {
385 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
386 struct gallivm_state *gallivm = bld_base->base.gallivm;
387 LLVMValueRef indices[2], pointer;
388
389 indices[0] = bld_base->uint_bld.zero;
390 indices[1] = index;
391
392 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
393 if (uniform)
394 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
395 return LLVMBuildLoad(gallivm->builder, pointer, "");
396 }
397
398 /**
399 * Do a load from &base_ptr[index], but also add a flag that it's loading
400 * a constant from a dynamically uniform index.
401 */
402 static LLVMValueRef build_indexed_load_const(
403 struct si_shader_context *ctx,
404 LLVMValueRef base_ptr, LLVMValueRef index)
405 {
406 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
407 LLVMSetMetadata(result, 1, ctx->const_md);
408 return result;
409 }
410
/* Compute the buffer index used for instanced vertex fetches:
 * (InstanceID / divisor) + StartInstance. */
static LLVMValueRef get_instance_index_for_fetch(
	struct radeon_llvm_context *radeon_bld,
	unsigned param_start_instance, unsigned divisor)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;

	LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				lp_build_const_int32(gallivm, divisor), "");

	return LLVMBuildAdd(gallivm->builder, result,
			LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
}
430
/* Declare one vertex shader input: fetch the vec4 attribute via
 * llvm.SI.vs.load.input and store its scalar components into the
 * per-register input slots.
 *
 * The buffer index comes from one of three places:
 *  - non-monolithic shaders: the prolog passes precomputed indices as
 *    VGPR parameters (param_vertex_index0 + input_index);
 *  - instanced attributes (divisor != 0): derived from InstanceID;
 *  - otherwise: BaseVertex + VertexID. */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list (vertex buffer resource descriptor). */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* The prolog computed the index; just read the parameter. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
		"llvm.SI.vs.load.input", ctx->v4f32, args, 3,
		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
				LLVMBuildExtractElement(gallivm->builder,
				input, llvm_chan, "");
	}
}
499
500 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
501 unsigned swizzle)
502 {
503 struct si_shader_context *ctx = si_shader_context(bld_base);
504
505 if (swizzle > 0)
506 return bld_base->uint_bld.zero;
507
508 switch (ctx->type) {
509 case PIPE_SHADER_VERTEX:
510 return LLVMGetParam(ctx->radeon_bld.main_fn,
511 ctx->param_vs_prim_id);
512 case PIPE_SHADER_TESS_CTRL:
513 return LLVMGetParam(ctx->radeon_bld.main_fn,
514 SI_PARAM_PATCH_ID);
515 case PIPE_SHADER_TESS_EVAL:
516 return LLVMGetParam(ctx->radeon_bld.main_fn,
517 ctx->param_tes_patch_id);
518 case PIPE_SHADER_GEOMETRY:
519 return LLVMGetParam(ctx->radeon_bld.main_fn,
520 SI_PARAM_PRIMITIVE_ID);
521 default:
522 assert(0);
523 return bld_base->uint_bld.zero;
524 }
525 }
526
527 /**
528 * Return the value of tgsi_ind_register for indexing.
529 * This is the indirect index with the constant offset added to it.
530 */
531 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
532 const struct tgsi_ind_register *ind,
533 int rel_index)
534 {
535 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
536 LLVMValueRef result;
537
538 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
539 result = LLVMBuildLoad(gallivm->builder, result, "");
540 result = LLVMBuildAdd(gallivm->builder, result,
541 lp_build_const_int32(gallivm, rel_index), "");
542 return result;
543 }
544
545 /**
546 * Like get_indirect_index, but restricts the return value to a (possibly
547 * undefined) value inside [0..num).
548 */
/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 *
 * \param num  upper bound (exclusive); must be >= 1
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
	LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
	LLVMValueRef cc;

	if (util_is_power_of_two(num)) {
		/* Power-of-two bound: a simple mask suffices. */
		result = LLVMBuildAnd(builder, result, c_max, "");
	} else {
		/* In theory, this MAX pattern should result in code that is
		 * as good as the bit-wise AND above.
		 *
		 * In practice, LLVM generates worse code (at the time of
		 * writing), because its value tracking is not strong enough.
		 */
		cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
		result = LLVMBuildSelect(builder, cc, result, c_max, "");
	}

	return result;
}
574
575
576 /**
577 * Calculate a dword address given an input or output register and a stride.
578 */
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst   destination register (used when src is NULL)
 * \param src   source register; takes precedence over dst
 * \param vertex_dw_stride stride between vertices in dwords (for
 *              2-dimensional registers); may be NULL for per-patch regs
 * \param base_addr starting dword address (patch base in LDS)
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		/* base += vertex_index * vertex_dw_stride */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* The indirect offset is relative to the first element of
		 * the array (if declared), else to the register itself. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each element occupies 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				    LLVMBuildMul(gallivm->builder, ind_index,
						 lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
660
661 /**
662 * Load from LDS.
663 *
664 * \param type output value type
665 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
666 * \param dw_addr address in dwords
667 */
/**
 * Load from LDS.
 *
 * \param type    output value type
 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	/* swizzle == ~0: recurse per channel and gather into a vec4. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			    lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	/* Doubles span two dwords: load the second half and combine. */
	if (type == TGSI_TYPE_DOUBLE) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
701
702 /**
703 * Store to LDS.
704 *
705 * \param swizzle offset (typically 0..3)
706 * \param dw_addr address in dwords
707 * \param value value to store
708 */
709 static void lds_store(struct lp_build_tgsi_context *bld_base,
710 unsigned swizzle, LLVMValueRef dw_addr,
711 LLVMValueRef value)
712 {
713 struct si_shader_context *ctx = si_shader_context(bld_base);
714 struct gallivm_state *gallivm = bld_base->base.gallivm;
715
716 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
717 lp_build_const_int32(gallivm, swizzle));
718
719 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
720 build_indexed_store(ctx, ctx->lds,
721 dw_addr, value);
722 }
723
724 static LLVMValueRef fetch_input_tcs(
725 struct lp_build_tgsi_context *bld_base,
726 const struct tgsi_full_src_register *reg,
727 enum tgsi_opcode_type type, unsigned swizzle)
728 {
729 struct si_shader_context *ctx = si_shader_context(bld_base);
730 LLVMValueRef dw_addr, stride;
731
732 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
733 dw_addr = get_tcs_in_current_patch_offset(ctx);
734 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
735
736 return lds_load(bld_base, type, swizzle, dw_addr);
737 }
738
/* Fetch a TCS output back from LDS.  2-dimensional registers are
 * per-vertex outputs (need the vertex stride); 1-dimensional ones are
 * per-patch outputs. */
static LLVMValueRef fetch_output_tcs(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_src_register *reg,
		enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}
758
/* Fetch a TES input from LDS.  TES inputs are the TCS outputs, so this is
 * currently identical to fetch_output_tcs (kept separate because the two
 * stages may diverge). */
static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}
778
/* Store a TCS output to LDS, with optional saturation.  Registers that are
 * not plain scalar outputs fall back to the generic store, which lowers
 * vectors to scalars and calls this function again per channel. */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* 2-dimensional = per-vertex output; otherwise per-patch. */
	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		lds_store(bld_base, chan_index, dw_addr, value);
	}
}
816
/* Fetch a GS input from the ESGS ring buffer.
 *
 * Inputs are addressed by (vertex offset parameter, semantic slot,
 * component).  PrimitiveID is special-cased, swizzle == ~0 gathers a
 * full vec4, and doubles are assembled from two consecutive dword loads. */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* PrimitiveID comes from an SGPR, not the ring. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	/* Gather all four components for a vec4 fetch. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter (vertices 0-1 and 2-5 live in
	 * different parameter ranges). */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	/* Doubles: load the second dword and combine. */
	if (type == TGSI_TYPE_DOUBLE) {
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
		return radeon_llvm_emit_fetch_double(bld_base,
						     value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
895
896 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
897 {
898 switch (interpolate) {
899 case TGSI_INTERPOLATE_CONSTANT:
900 return 0;
901
902 case TGSI_INTERPOLATE_LINEAR:
903 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
904 return SI_PARAM_LINEAR_SAMPLE;
905 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
906 return SI_PARAM_LINEAR_CENTROID;
907 else
908 return SI_PARAM_LINEAR_CENTER;
909 break;
910 case TGSI_INTERPOLATE_COLOR:
911 case TGSI_INTERPOLATE_PERSPECTIVE:
912 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
913 return SI_PARAM_PERSP_SAMPLE;
914 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
915 return SI_PARAM_PERSP_CENTROID;
916 else
917 return SI_PARAM_PERSP_CENTER;
918 break;
919 default:
920 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
921 return -1;
922 }
923 }
924
925 /* This shouldn't be used by explicit INTERP opcodes. */
926 static unsigned select_interp_param(struct si_shader_context *ctx,
927 unsigned param)
928 {
929 if (!ctx->shader->key.ps.prolog.force_persample_interp ||
930 !ctx->is_monolithic)
931 return param;
932
933 /* If the shader doesn't use center/centroid, just return the parameter.
934 *
935 * If the shader only uses one set of (i,j), "si_emit_spi_ps_input" can
936 * switch between center/centroid and sample without shader changes.
937 */
938 switch (param) {
939 case SI_PARAM_PERSP_CENTROID:
940 case SI_PARAM_PERSP_CENTER:
941 return SI_PARAM_PERSP_SAMPLE;
942
943 case SI_PARAM_LINEAR_CENTROID:
944 case SI_PARAM_LINEAR_CENTER:
945 return SI_PARAM_LINEAR_SAMPLE;
946
947 default:
948 return param;
949 }
950 }
951
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index	index of the input in hardware
 * @param semantic_name	TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param	interpolation weights (i,j); NULL selects fs.constant
 * @param prim_mask	SI_PARAM_PRIM_MASK
 * @param face		SI_PARAM_FRONT_FACE
 * @param result	the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	const char *intr_name;
	LLVMValueRef attr_number;

	unsigned chan;

	attr_number = lp_build_const_int32(gallivm, input_index);

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.ps.prolog.color_two_side) {
		/* Two-sided color: interpolate both the front and the back
		 * attribute and select per pixel using FRONT_FACE. */
		LLVMValueRef args[4];
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, uint->zero, "");

		/* Intrinsic args: (channel, attribute, prim_mask[, (i,j)]).
		 * args[3] is only passed for fs.interp. */
		args[2] = prim_mask;
		args[3] = interp_param;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
			LLVMValueRef front, back;

			args[0] = llvm_chan;
			args[1] = attr_number;
			front = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

			args[1] = back_attr_number;
			back = lp_build_intrinsic(gallivm->builder, intr_name,
					       ctx->f32, args, args[3] ? 4 : 3,
					       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						is_face_positive,
						front,
						back,
						"");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Fog: only X is interpolated; YZW are forced to (0, 0, 1). */
		LLVMValueRef args[4];

		args[0] = uint->zero;
		args[1] = attr_number;
		args[2] = prim_mask;
		args[3] = interp_param;
		result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
					ctx->f32, args, args[3] ? 4 : 3,
					LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		result[1] =
		result[2] = lp_build_const_float(gallivm, 0.0f);
		result[3] = lp_build_const_float(gallivm, 1.0f);
	} else {
		/* Common case: interpolate all four channels. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef args[4];
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);

			args[0] = llvm_chan;
			args[1] = attr_number;
			args[2] = prim_mask;
			args[3] = interp_param;
			result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		}
	}
}
1068
/**
 * Declare a fragment shader input.
 *
 * In non-monolithic shaders, color inputs arrive pre-interpolated in VGPRs
 * set up by the PS prolog and are read directly. All other inputs are
 * interpolated here with the (i,j) pair chosen from the declaration's
 * interpolation mode and location.
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		/* colors_read holds 4 bits per color; COLOR1's VGPRs follow
		 * immediately after however many COLOR0 components are read. */
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		/* Components that are never read stay undef. */
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	/* -1 = unhandled mode, 0 = no (i,j) needed (interp_param stays NULL,
	 * so interp_fs_input falls back to fs.constant). */
	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = LLVMGetParam(main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1119
1120 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1121 {
1122 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1123 SI_PARAM_ANCILLARY, 8, 4);
1124 }
1125
1126 /**
1127 * Set range metadata on an instruction. This can only be used on load and
1128 * call instructions. If you know an instruction can only produce the values
1129 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1130 * \p lo is the minimum value inclusive.
1131 * \p hi is the maximum value exclusive.
1132 */
1133 static void set_range_metadata(LLVMValueRef value, unsigned lo, unsigned hi)
1134 {
1135 const char *range_md_string = "range";
1136 LLVMValueRef range_md, md_args[2];
1137 LLVMTypeRef type = LLVMTypeOf(value);
1138 LLVMContextRef context = LLVMGetTypeContext(type);
1139 unsigned md_range_id = LLVMGetMDKindIDInContext(context,
1140 range_md_string, strlen(range_md_string));
1141
1142 md_args[0] = LLVMConstInt(type, lo, false);
1143 md_args[1] = LLVMConstInt(type, hi, false);
1144 range_md = LLVMMDNodeInContext(context, md_args, 2);
1145 LLVMSetMetadata(value, md_range_id, range_md);
1146 }
1147
/**
 * Return the thread (lane) ID within the wavefront, annotated with the
 * range [0, 64).
 */
static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef tid;

	if (HAVE_LLVM < 0x0308) {
		tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
				ctx->i32, NULL, 0, LLVMReadNoneAttribute);
	} else {
		/* Newer LLVM exposes the mbcnt pair instead of llvm.SI.tid:
		 * with an all-ones mask and a zero base, mbcnt.lo followed by
		 * mbcnt.hi yields the lane index (see the llvm.amdgcn.mbcnt.*
		 * intrinsic documentation). */
		LLVMValueRef tid_args[2];
		tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
		tid_args[1] = lp_build_const_int32(gallivm, 0);
		tid_args[1] = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.lo", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);

		tid = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.hi", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);
	}
	/* Tell the optimizer the thread ID is always < 64 (one wavefront). */
	set_range_metadata(tid, 0, 64);
	return tid;
}
1171
1172 /**
1173 * Load a dword from a constant buffer.
1174 */
1175 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1176 LLVMValueRef offset, LLVMTypeRef return_type)
1177 {
1178 LLVMValueRef args[2] = {resource, offset};
1179
1180 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1181 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1182 }
1183
/**
 * Load the (x, y) position of the given sample from the
 * SI_PS_CONST_SAMPLE_POSITIONS constant buffer and return it as a
 * (x, y, 0, 0) vector.
 */
static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
	LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(builder, resource, offset0, ctx->f32),
		buffer_load_const(builder, resource, offset1, ctx->f32),
		lp_build_const_float(gallivm, 0),
		lp_build_const_float(gallivm, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}
1208
/**
 * Declare a TGSI system value (SV) and store the corresponding LLVM value
 * in radeon_bld->system_values[index].
 *
 * Values are fetched from function parameters (SGPRs/VGPRs), unpacked from
 * packed parameters, loaded from LDS, or read from driver-internal constant
 * buffers, depending on the semantic and the shader stage.
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID includes the base vertex of indexed draws. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* TCS: unpacked from REL_IDS; GS: a dedicated parameter. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; W is delivered as 1/W, so take the
		 * reciprocal to recover W. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The fractional part of the pixel position is the sample
		 * position relative to the pixel's top-left corner. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count, packed in bits [31:26] of the TCS
		 * output layout SGPR. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are stored with the other per-patch outputs
		 * in LDS; compute the dword address and load from there. */
		LLVMValueRef dw_addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
				       lp_build_const_int32(gallivm, param * 4), "");

		value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
				 ~0, dw_addr);
		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer:
		 * outer levels at dwords 0..3, inner levels at dwords 4..7. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(gallivm->builder, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4),
						   ctx->f32);
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* The block size is a compile-time constant taken from the
		 * CS_FIXED_BLOCK_* shader properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* amdgcn.ps.live is true for non-helper lanes, so negate it
		 * and sign-extend i1 -> i32 (helper = ~0, non-helper = 0). */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1407
/**
 * Declare the compute shader's shared ("local") memory.
 *
 * Creates one i8 array of sel->local_size bytes in the LDS address space
 * and stashes an i8* to it in ctx->shared_memory.
 */
static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
                                   const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader_selector *sel = ctx->shader->selector;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;

	LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
	LLVMValueRef var;

	/* Only a single SHARED declaration covering everything is expected. */
	assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
	assert(decl->Range.First == decl->Range.Last);
	assert(!ctx->shared_memory);

	var = LLVMAddGlobalInAddressSpace(gallivm->module,
	                                  LLVMArrayType(ctx->i8, sel->local_size),
	                                  "compute_lds",
	                                  LOCAL_ADDR_SPACE);
	/* Dword alignment is sufficient for all LDS access emitted here. */
	LLVMSetAlignment(var, 4);

	ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
}
1431
1432 static LLVMValueRef fetch_constant(
1433 struct lp_build_tgsi_context *bld_base,
1434 const struct tgsi_full_src_register *reg,
1435 enum tgsi_opcode_type type,
1436 unsigned swizzle)
1437 {
1438 struct si_shader_context *ctx = si_shader_context(bld_base);
1439 struct lp_build_context *base = &bld_base->base;
1440 const struct tgsi_ind_register *ireg = &reg->Indirect;
1441 unsigned buf, idx;
1442
1443 LLVMValueRef addr, bufp;
1444 LLVMValueRef result;
1445
1446 if (swizzle == LP_CHAN_ALL) {
1447 unsigned chan;
1448 LLVMValueRef values[4];
1449 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1450 values[chan] = fetch_constant(bld_base, reg, type, chan);
1451
1452 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1453 }
1454
1455 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1456 idx = reg->Register.Index * 4 + swizzle;
1457
1458 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1459 if (type != TGSI_TYPE_DOUBLE)
1460 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1461 else {
1462 return radeon_llvm_emit_fetch_double(bld_base,
1463 ctx->constants[buf][idx],
1464 ctx->constants[buf][idx + 1]);
1465 }
1466 }
1467
1468 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1469 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1470 LLVMValueRef index;
1471 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1472 reg->Dimension.Index,
1473 SI_NUM_CONST_BUFFERS);
1474 bufp = build_indexed_load_const(ctx, ptr, index);
1475 } else
1476 bufp = ctx->const_buffers[buf];
1477
1478 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1479 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1480 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1481 addr = lp_build_add(&bld_base->uint_bld, addr,
1482 lp_build_const_int32(base->gallivm, idx * 4));
1483
1484 result = buffer_load_const(base->gallivm->builder, bufp,
1485 addr, ctx->f32);
1486
1487 if (type != TGSI_TYPE_DOUBLE)
1488 result = bitcast(bld_base, type, result);
1489 else {
1490 LLVMValueRef addr2, result2;
1491 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1492 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1493 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1494 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1495 lp_build_const_int32(base->gallivm, idx * 4));
1496
1497 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1498 addr2, ctx->f32);
1499
1500 result = radeon_llvm_emit_fetch_double(bld_base,
1501 result, result2);
1502 }
1503 return result;
1504 }
1505
1506 /* Upper 16 bits must be zero. */
1507 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1508 LLVMValueRef val[2])
1509 {
1510 return LLVMBuildOr(gallivm->builder, val[0],
1511 LLVMBuildShl(gallivm->builder, val[1],
1512 lp_build_const_int32(gallivm, 16),
1513 ""), "");
1514 }
1515
1516 /* Upper 16 bits are ignored and will be dropped. */
1517 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1518 LLVMValueRef val[2])
1519 {
1520 LLVMValueRef v[2] = {
1521 LLVMBuildAnd(gallivm->builder, val[0],
1522 lp_build_const_int32(gallivm, 0xffff), ""),
1523 val[1],
1524 };
1525 return si_llvm_pack_two_int16(gallivm, v);
1526 }
1527
1528 /* Initialize arguments for the shader export intrinsic */
1529 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1530 LLVMValueRef *values,
1531 unsigned target,
1532 LLVMValueRef *args)
1533 {
1534 struct si_shader_context *ctx = si_shader_context(bld_base);
1535 struct lp_build_context *uint =
1536 &ctx->radeon_bld.soa.bld_base.uint_bld;
1537 struct lp_build_context *base = &bld_base->base;
1538 struct gallivm_state *gallivm = base->gallivm;
1539 LLVMBuilderRef builder = base->gallivm->builder;
1540 LLVMValueRef val[4];
1541 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1542 unsigned chan;
1543 bool is_int8;
1544
1545 /* Default is 0xf. Adjusted below depending on the format. */
1546 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1547
1548 /* Specify whether the EXEC mask represents the valid mask */
1549 args[1] = uint->zero;
1550
1551 /* Specify whether this is the last export */
1552 args[2] = uint->zero;
1553
1554 /* Specify the target we are exporting */
1555 args[3] = lp_build_const_int32(base->gallivm, target);
1556
1557 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1558 const union si_shader_key *key = &ctx->shader->key;
1559 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1560 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1561
1562 assert(cbuf >= 0 && cbuf < 8);
1563 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1564 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1565 }
1566
1567 args[4] = uint->zero; /* COMPR flag */
1568 args[5] = base->undef;
1569 args[6] = base->undef;
1570 args[7] = base->undef;
1571 args[8] = base->undef;
1572
1573 switch (spi_shader_col_format) {
1574 case V_028714_SPI_SHADER_ZERO:
1575 args[0] = uint->zero; /* writemask */
1576 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
1577 break;
1578
1579 case V_028714_SPI_SHADER_32_R:
1580 args[0] = uint->one; /* writemask */
1581 args[5] = values[0];
1582 break;
1583
1584 case V_028714_SPI_SHADER_32_GR:
1585 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
1586 args[5] = values[0];
1587 args[6] = values[1];
1588 break;
1589
1590 case V_028714_SPI_SHADER_32_AR:
1591 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
1592 args[5] = values[0];
1593 args[8] = values[3];
1594 break;
1595
1596 case V_028714_SPI_SHADER_FP16_ABGR:
1597 args[4] = uint->one; /* COMPR flag */
1598
1599 for (chan = 0; chan < 2; chan++) {
1600 LLVMValueRef pack_args[2] = {
1601 values[2 * chan],
1602 values[2 * chan + 1]
1603 };
1604 LLVMValueRef packed;
1605
1606 packed = lp_build_intrinsic(base->gallivm->builder,
1607 "llvm.SI.packf16",
1608 ctx->i32, pack_args, 2,
1609 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1610 args[chan + 5] =
1611 LLVMBuildBitCast(base->gallivm->builder,
1612 packed, ctx->f32, "");
1613 }
1614 break;
1615
1616 case V_028714_SPI_SHADER_UNORM16_ABGR:
1617 for (chan = 0; chan < 4; chan++) {
1618 val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
1619 val[chan] = LLVMBuildFMul(builder, val[chan],
1620 lp_build_const_float(gallivm, 65535), "");
1621 val[chan] = LLVMBuildFAdd(builder, val[chan],
1622 lp_build_const_float(gallivm, 0.5), "");
1623 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1624 ctx->i32, "");
1625 }
1626
1627 args[4] = uint->one; /* COMPR flag */
1628 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1629 si_llvm_pack_two_int16(gallivm, val));
1630 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1631 si_llvm_pack_two_int16(gallivm, val+2));
1632 break;
1633
1634 case V_028714_SPI_SHADER_SNORM16_ABGR:
1635 for (chan = 0; chan < 4; chan++) {
1636 /* Clamp between [-1, 1]. */
1637 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1638 values[chan],
1639 lp_build_const_float(gallivm, 1));
1640 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1641 val[chan],
1642 lp_build_const_float(gallivm, -1));
1643 /* Convert to a signed integer in [-32767, 32767]. */
1644 val[chan] = LLVMBuildFMul(builder, val[chan],
1645 lp_build_const_float(gallivm, 32767), "");
1646 /* If positive, add 0.5, else add -0.5. */
1647 val[chan] = LLVMBuildFAdd(builder, val[chan],
1648 LLVMBuildSelect(builder,
1649 LLVMBuildFCmp(builder, LLVMRealOGE,
1650 val[chan], base->zero, ""),
1651 lp_build_const_float(gallivm, 0.5),
1652 lp_build_const_float(gallivm, -0.5), ""), "");
1653 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1654 }
1655
1656 args[4] = uint->one; /* COMPR flag */
1657 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1658 si_llvm_pack_two_int32_as_int16(gallivm, val));
1659 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1660 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
1661 break;
1662
1663 case V_028714_SPI_SHADER_UINT16_ABGR: {
1664 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1665 255 : 65535);
1666 /* Clamp. */
1667 for (chan = 0; chan < 4; chan++) {
1668 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1669 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1670 val[chan], max);
1671 }
1672
1673 args[4] = uint->one; /* COMPR flag */
1674 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1675 si_llvm_pack_two_int16(gallivm, val));
1676 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1677 si_llvm_pack_two_int16(gallivm, val+2));
1678 break;
1679 }
1680
1681 case V_028714_SPI_SHADER_SINT16_ABGR: {
1682 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1683 127 : 32767);
1684 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
1685 -128 : -32768);
1686 /* Clamp. */
1687 for (chan = 0; chan < 4; chan++) {
1688 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1689 val[chan] = lp_build_emit_llvm_binary(bld_base,
1690 TGSI_OPCODE_IMIN,
1691 val[chan], max);
1692 val[chan] = lp_build_emit_llvm_binary(bld_base,
1693 TGSI_OPCODE_IMAX,
1694 val[chan], min);
1695 }
1696
1697 args[4] = uint->one; /* COMPR flag */
1698 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1699 si_llvm_pack_two_int32_as_int16(gallivm, val));
1700 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1701 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
1702 break;
1703 }
1704
1705 case V_028714_SPI_SHADER_32_ABGR:
1706 memcpy(&args[5], values, sizeof(values[0]) * 4);
1707 break;
1708 }
1709 }
1710
1711 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1712 LLVMValueRef alpha)
1713 {
1714 struct si_shader_context *ctx = si_shader_context(bld_base);
1715 struct gallivm_state *gallivm = bld_base->base.gallivm;
1716
1717 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1718 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
1719 SI_PARAM_ALPHA_REF);
1720
1721 LLVMValueRef alpha_pass =
1722 lp_build_cmp(&bld_base->base,
1723 ctx->shader->key.ps.epilog.alpha_func,
1724 alpha, alpha_ref);
1725 LLVMValueRef arg =
1726 lp_build_select(&bld_base->base,
1727 alpha_pass,
1728 lp_build_const_float(gallivm, 1.0f),
1729 lp_build_const_float(gallivm, -1.0f));
1730
1731 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
1732 ctx->voidt, &arg, 1, 0);
1733 } else {
1734 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
1735 ctx->voidt, NULL, 0, 0);
1736 }
1737 }
1738
1739 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
1740 LLVMValueRef alpha,
1741 unsigned samplemask_param)
1742 {
1743 struct si_shader_context *ctx = si_shader_context(bld_base);
1744 struct gallivm_state *gallivm = bld_base->base.gallivm;
1745 LLVMValueRef coverage;
1746
1747 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1748 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
1749 samplemask_param);
1750 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
1751
1752 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
1753 ctx->i32,
1754 &coverage, 1, LLVMReadNoneAttribute);
1755
1756 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
1757 ctx->f32, "");
1758
1759 coverage = LLVMBuildFMul(gallivm->builder, coverage,
1760 lp_build_const_float(gallivm,
1761 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1762
1763 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
1764 }
1765
/**
 * Compute the user-clip-plane distance exports from a clip vertex.
 *
 * Reads the clip-plane vectors from the SI_VS_CONST_CLIP_PLANES constant
 * buffer, dots each one with the clip vertex in out_elts[0..3], and fills
 * in the export args for the two clip-distance position exports
 * (pos[2] and pos[3], targets V_008DFC_SQ_EXP_POS + 2/3).
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start with all four clip distances at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of plane (reg_index*4 + chan),
				 * component const_chan (16 bytes per plane). */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
						      args[1], ctx->f32);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Fill in the remaining export arguments: full writemask,
		 * normal EXEC semantics, not the last export, POS target. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
1812
1813 static void si_dump_streamout(struct pipe_stream_output_info *so)
1814 {
1815 unsigned i;
1816
1817 if (so->num_outputs)
1818 fprintf(stderr, "STREAMOUT\n");
1819
1820 for (i = 0; i < so->num_outputs; i++) {
1821 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
1822 so->output[i].start_component;
1823 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
1824 i, so->output[i].output_buffer,
1825 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
1826 so->output[i].register_index,
1827 mask & 1 ? "x" : "",
1828 mask & 2 ? "y" : "",
1829 mask & 4 ? "z" : "",
1830 mask & 8 ? "w" : "");
1831 }
1832 }
1833
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * rsrc/vaddr/soffset/inst_offset address the buffer; dfmt/nfmt are the
 * V_008F0C_BUF_{DATA,NUM}_FORMAT_* encodings; offen/idxen/glc/slc/tfe are
 * the usual MTBUF instruction modifier bits (0 or 1 each).
 */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Argument order is fixed by the llvm.SI.tbuffer.store intrinsic. */
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, Elements(args), 0);
}
1881
1882 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
1883 LLVMValueRef rsrc,
1884 LLVMValueRef vdata,
1885 unsigned num_channels,
1886 LLVMValueRef vaddr,
1887 LLVMValueRef soffset,
1888 unsigned inst_offset)
1889 {
1890 static unsigned dfmt[] = {
1891 V_008F0C_BUF_DATA_FORMAT_32,
1892 V_008F0C_BUF_DATA_FORMAT_32_32,
1893 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1894 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1895 };
1896 assert(num_channels >= 1 && num_channels <= 4);
1897
1898 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
1899 inst_offset, dfmt[num_channels-1],
1900 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
1901 }
1902
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Emits IR that stores up to 4 components of each streamout output into
 * the bound streamout buffers, guarded by a per-thread "can emit" test
 * and a per-output stream-ID test.
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Bits [25:24] of the streamout config: the currently selected stream. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			/* The streamout offset SGPR is in dwords; scale to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			/* Skip malformed component counts; the assert flags
			 * them in debug builds, the check keeps release
			 * builds safe. */
			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			/* A register the shader never wrote can't be streamed out. */
			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store when this output's stream matches the
			 * stream selected in the streamout config SGPR. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2019
2020
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if enabled), routes each TGSI output either to a
 * position export (POS0..POS3) or a parameter export, assembles the "misc"
 * vector (point size / edge flag / layer / viewport index) into POS1, and
 * finally emits all position exports with the DONE bit on the last one.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	/* Deferred position exports POS0..POS3, emitted after the loop so the
	 * DONE bit can be placed on the last one. */
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Captured for the misc vector (POS1) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Goes into the misc vector, but is also exported as a
			 * generic parameter for the fragment shader to read. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Defer position exports; they're emitted together below. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		/* Clip distances are exported twice: as POS (above) and as a
		 * generic parameter for shaders that read gl_ClipDistance. */
		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	/* Count position exports so the last one can carry the DONE bit. */
	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2195
2196 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2197 LLVMValueRef rel_patch_id,
2198 LLVMValueRef invocation_id,
2199 LLVMValueRef tcs_out_current_patch_data_offset)
2200 {
2201 struct si_shader_context *ctx = si_shader_context(bld_base);
2202 struct gallivm_state *gallivm = bld_base->base.gallivm;
2203 struct si_shader *shader = ctx->shader;
2204 unsigned tess_inner_index, tess_outer_index;
2205 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2206 LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
2207 unsigned stride, outer_comps, inner_comps, i;
2208 struct lp_build_if_state if_ctx;
2209
2210 /* Do this only for invocation 0, because the tess levels are per-patch,
2211 * not per-vertex.
2212 *
2213 * This can't jump, because invocation 0 executes this. It should
2214 * at least mask out the loads and stores for other invocations.
2215 */
2216 lp_build_if(&if_ctx, gallivm,
2217 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2218 invocation_id, bld_base->uint_bld.zero, ""));
2219
2220 /* Determine the layout of one tess factor element in the buffer. */
2221 switch (shader->key.tcs.epilog.prim_mode) {
2222 case PIPE_PRIM_LINES:
2223 stride = 2; /* 2 dwords, 1 vec2 store */
2224 outer_comps = 2;
2225 inner_comps = 0;
2226 break;
2227 case PIPE_PRIM_TRIANGLES:
2228 stride = 4; /* 4 dwords, 1 vec4 store */
2229 outer_comps = 3;
2230 inner_comps = 1;
2231 break;
2232 case PIPE_PRIM_QUADS:
2233 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2234 outer_comps = 4;
2235 inner_comps = 2;
2236 break;
2237 default:
2238 assert(0);
2239 return;
2240 }
2241
2242 /* Load tess_inner and tess_outer from LDS.
2243 * Any invocation can write them, so we can't get them from a temporary.
2244 */
2245 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2246 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2247
2248 lds_base = tcs_out_current_patch_data_offset;
2249 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2250 lp_build_const_int32(gallivm,
2251 tess_inner_index * 4), "");
2252 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2253 lp_build_const_int32(gallivm,
2254 tess_outer_index * 4), "");
2255
2256 for (i = 0; i < outer_comps; i++)
2257 out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2258 for (i = 0; i < inner_comps; i++)
2259 out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2260
2261 /* Convert the outputs to vectors for stores. */
2262 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2263 vec1 = NULL;
2264
2265 if (stride > 4)
2266 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2267
2268 /* Get the buffer. */
2269 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
2270 SI_PARAM_RW_BUFFERS);
2271 buffer = build_indexed_load_const(ctx, rw_buffers,
2272 lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));
2273
2274 /* Get the offset. */
2275 tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
2276 SI_PARAM_TESS_FACTOR_OFFSET);
2277 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2278 lp_build_const_int32(gallivm, 4 * stride), "");
2279
2280 /* Store the outputs. */
2281 build_tbuffer_store_dwords(ctx, buffer, vec0,
2282 MIN2(stride, 4), byteoffset, tf_base, 0);
2283 if (vec1)
2284 build_tbuffer_store_dwords(ctx, buffer, vec1,
2285 stride - 4, byteoffset, tf_base, 16);
2286 lp_build_endif(&if_ctx);
2287 }
2288
/* This only writes the tessellation factor levels.
 *
 * In the non-monolithic case it merely packs the values the separate TCS
 * epilog part needs (RW_BUFFERS pointer halves, tess factor soffset, and
 * the three VGPRs) into the function's return value; the actual tess factor
 * stores then happen in the epilog. In the monolithic case it writes the
 * factors directly via si_write_tess_factors().
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	/* Invocation ID: bits [12:8] of the REL_IDS input SGPR/VGPR. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer: split the 64-bit pointer into two i32s
		 * so it can be returned in two SGPR slots. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR, "");

		/* VGPRs: returned as floats because the return struct uses
		 * float slots for VGPR values; bitcast, don't convert. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 1;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2339
/* LS (local shader, i.e. VS-before-TCS) epilogue: copy all shader outputs
 * into LDS at this vertex's slot so the TCS can read them as inputs. */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride in dwords: bits [20:13] of LS_OUT_LAYOUT. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
	/* Base dword address of this vertex's output area in LDS. */
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];
		/* Each output occupies 4 dwords at a slot determined by its
		 * semantic, so LS and HS agree on the layout. */
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					lp_build_const_int32(gallivm, param * 4), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}
}
2370
/* ES (export shader, i.e. VS/TES-before-GS) epilogue: store all outputs to
 * the ES->GS ring buffer, one dword per component, for the GS to read. */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	/* Per-wave offset into the ES->GS ring (SGPR). */
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		/* Viewport index and layer are not passed through the ring. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		/* The ring slot is determined by the output semantic, so ES
		 * and GS agree on the layout. */
		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* One dword store per component; the instruction
			 * offset encodes slot * 16 + component * 4 bytes. */
			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2409
2410 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2411 {
2412 struct si_shader_context *ctx = si_shader_context(bld_base);
2413 struct gallivm_state *gallivm = bld_base->base.gallivm;
2414 LLVMValueRef args[2];
2415
2416 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2417 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2418 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2419 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
2420 }
2421
/* VS epilogue: optionally clamp vertex colors, gather all outputs into
 * si_shader_output_values, handle PrimitiveID (exported directly in the
 * monolithic case, returned to the epilog part otherwise), and emit the
 * hardware VS exports. */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->is_gs_copy_shader);

	/* +1 leaves room for the PrimitiveID output added below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
						    SI_PARAM_VS_STATE_BITS);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				/* Opened lazily on the first color; closed
				 * after the loop if it was opened. */
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all 4 components in place. */
			for (j = 0; j < 4; j++) {
				addr = ctx->radeon_bld.soa.outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = radeon_llvm_saturate(bld_base, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Gather all outputs into the flat array si_llvm_export_vs expects. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	if (ctx->is_monolithic) {
		/* Export PrimitiveID when PS needs it. */
		if (si_vs_exports_prim_id(ctx->shader)) {
			outputs[i].name = TGSI_SEMANTIC_PRIMID;
			outputs[i].sid = 0;
			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
						       get_primitive_id(bld_base, 0));
			outputs[i].values[1] = bld_base->base.undef;
			outputs[i].values[2] = bld_base->base.undef;
			outputs[i].values[3] = bld_base->base.undef;
			i++;
		}
	} else {
		/* Return the primitive ID from the LLVM function. */
		ctx->return_value =
			LLVMBuildInsertValue(gallivm->builder,
					     ctx->return_value,
					     bitcast(bld_base, TGSI_TYPE_FLOAT,
						     get_primitive_id(bld_base, 0)),
					     VS_EPILOG_PRIMID_LOC, "");
	}

	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
2508
/* Emit the MRTZ export carrying depth (R), stencil (G) and/or sample mask
 * (B). Any of the three values may be NULL; at least one must be set.
 * This is always the last export of a fragment shader (DONE bit set). */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMP flag */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	/* Fill in each channel that is actually present and build the
	 * corresponding write mask bit by bit. */
	if (depth) {
		args[5] = depth;
		mask |= 0x1;
	}

	if (stencil) {
		args[6] = stencil;
		mask |= 0x2;
	}

	if (samplemask) {
		args[7] = samplemask;
		mask |= 0x4;
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
			   ctx->voidt, args, 9, 0);
}
2560
/* Export one fragment shader color output to MRT \p index, applying the
 * epilog key's post-processing (clamp, alpha-to-one, alpha test, smoothing).
 * If FS_COLOR0_WRITES_ALL_CBUFS is active (last_cbuf > 0), color 0 is
 * broadcast to all enabled color buffers. \p is_last marks the shader's
 * final color export so the DONE bit can be set on it. */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only applied to color output 0. */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			/* A zero writemask marks a NULL export. */
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
					   ctx->voidt, args[c], 9, 0);
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, args, 9, 0);
	}
}
2629
2630 static void si_export_null(struct lp_build_tgsi_context *bld_base)
2631 {
2632 struct si_shader_context *ctx = si_shader_context(bld_base);
2633 struct lp_build_context *base = &bld_base->base;
2634 struct lp_build_context *uint = &bld_base->uint_bld;
2635 LLVMValueRef args[9];
2636
2637 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
2638 args[1] = uint->one; /* whether the EXEC mask is valid */
2639 args[2] = uint->one; /* DONE bit */
2640 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2641 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
2642 args[5] = uint->undef; /* R */
2643 args[6] = uint->undef; /* G */
2644 args[7] = uint->undef; /* B */
2645 args[8] = uint->undef; /* A */
2646
2647 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2648 ctx->voidt, args, 9, 0);
2649 }
2650
/* Monolithic fragment shader epilogue: load all outputs from their alloca
 * slots and emit the color and MRTZ exports, making sure exactly one export
 * carries the DONE bit. */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* 4 format bits per color buffer; nonzero = enabled. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z component of gl_FragDepth is used. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			/* Stencil reference is in the Y component. */
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ, when present, is the last export (DONE bit set inside). */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
}
2739
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 *
 * Used by the non-monolithic fragment shader: instead of exporting, the
 * main part packs its outputs into the function return value for the
 * separately-compiled PS epilog part to export.
 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z component of gl_FragDepth is used. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs: colors first (only slots that were written), then the
	 * optional depth/stencil/samplemask values. */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
2833
2834 /**
2835 * Given a v8i32 resource descriptor for a buffer, extract the size of the
2836 * buffer in number of elements and return it as an i32.
2837 */
2838 static LLVMValueRef get_buffer_size(
2839 struct lp_build_tgsi_context *bld_base,
2840 LLVMValueRef descriptor)
2841 {
2842 struct si_shader_context *ctx = si_shader_context(bld_base);
2843 struct gallivm_state *gallivm = bld_base->base.gallivm;
2844 LLVMBuilderRef builder = gallivm->builder;
2845 LLVMValueRef size =
2846 LLVMBuildExtractElement(builder, descriptor,
2847 lp_build_const_int32(gallivm, 6), "");
2848
2849 if (ctx->screen->b.chip_class >= VI) {
2850 /* On VI, the descriptor contains the size in bytes,
2851 * but TXQ must return the size in elements.
2852 * The stride is always non-zero for resources using TXQ.
2853 */
2854 LLVMValueRef stride =
2855 LLVMBuildExtractElement(builder, descriptor,
2856 lp_build_const_int32(gallivm, 5), "");
2857 stride = LLVMBuildLShr(builder, stride,
2858 lp_build_const_int32(gallivm, 16), "");
2859 stride = LLVMBuildAnd(builder, stride,
2860 lp_build_const_int32(gallivm, 0x3FFF), "");
2861
2862 size = LLVMBuildUDiv(builder, size, stride, "");
2863 }
2864
2865 return size;
2866 }
2867
2868 /**
2869 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
2870 * intrinsic names).
2871 */
2872 static void build_int_type_name(
2873 LLVMTypeRef type,
2874 char *buf, unsigned bufsize)
2875 {
2876 assert(bufsize >= 6);
2877
2878 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
2879 snprintf(buf, bufsize, "v%ui32",
2880 LLVMGetVectorSize(type));
2881 else
2882 strcpy(buf, "i32");
2883 }
2884
2885 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
2886 struct lp_build_tgsi_context *bld_base,
2887 struct lp_build_emit_data *emit_data);
2888
2889 /* Prevent optimizations (at least of memory accesses) across the current
2890 * point in the program by emitting empty inline assembly that is marked as
2891 * having side effects.
2892 */
2893 static void emit_optimization_barrier(struct si_shader_context *ctx)
2894 {
2895 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2896 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
2897 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
2898 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
2899 }
2900
/* Emit s_waitcnt to order this program point against outstanding memory
 * operations.
 *
 * 0xf70 is the raw s_waitcnt immediate. NOTE(review): it appears to wait
 * only on the vector-memory counter (vmcnt=0) while leaving the other
 * counters at their "don't wait" maxima — confirm against the GCN ISA
 * s_waitcnt bitfield layout.
 */
static void emit_waitcnt(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef args[1] = {
		lp_build_const_int32(gallivm, 0xf70)
	};
	lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
			   ctx->voidt, args, 1, LLVMNoUnwindAttribute);
}
2911
/* TGSI MEMBAR: make prior memory operations visible by waiting on them. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
2921
2922 static LLVMValueRef
2923 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
2924 const struct tgsi_full_src_register *reg)
2925 {
2926 LLVMValueRef ind_index;
2927 LLVMValueRef rsrc_ptr;
2928
2929 if (!reg->Register.Indirect)
2930 return ctx->shader_buffers[reg->Register.Index];
2931
2932 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
2933 reg->Register.Index,
2934 SI_NUM_SHADER_BUFFERS);
2935
2936 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
2937 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
2938 }
2939
2940 static bool tgsi_is_array_sampler(unsigned target)
2941 {
2942 return target == TGSI_TEXTURE_1D_ARRAY ||
2943 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
2944 target == TGSI_TEXTURE_2D_ARRAY ||
2945 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
2946 target == TGSI_TEXTURE_CUBE_ARRAY ||
2947 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
2948 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2949 }
2950
2951 static bool tgsi_is_array_image(unsigned target)
2952 {
2953 return target == TGSI_TEXTURE_3D ||
2954 target == TGSI_TEXTURE_CUBE ||
2955 target == TGSI_TEXTURE_1D_ARRAY ||
2956 target == TGSI_TEXTURE_2D_ARRAY ||
2957 target == TGSI_TEXTURE_CUBE_ARRAY ||
2958 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2959 }
2960
2961 /**
2962 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
2963 *
2964 * At least on Tonga, executing image stores on images with DCC enabled and
2965 * non-trivial can eventually lead to lockups. This can occur when an
2966 * application binds an image as read-only but then uses a shader that writes
2967 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
2968 * program termination) in this case, but it doesn't cost much to be a bit
2969 * nicer: disabling DCC in the shader still leads to undefined results but
2970 * avoids the lockup.
2971 */
2972 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
2973 LLVMValueRef rsrc)
2974 {
2975 if (ctx->screen->b.chip_class <= CIK) {
2976 return rsrc;
2977 } else {
2978 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2979 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
2980 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
2981 LLVMValueRef tmp;
2982
2983 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
2984 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
2985 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
2986 }
2987 }
2988
2989 /**
2990 * Load the resource descriptor for \p image.
2991 */
2992 static void
2993 image_fetch_rsrc(
2994 struct lp_build_tgsi_context *bld_base,
2995 const struct tgsi_full_src_register *image,
2996 bool dcc_off,
2997 LLVMValueRef *rsrc)
2998 {
2999 struct si_shader_context *ctx = si_shader_context(bld_base);
3000
3001 assert(image->Register.File == TGSI_FILE_IMAGE);
3002
3003 if (!image->Register.Indirect) {
3004 /* Fast path: use preloaded resources */
3005 *rsrc = ctx->images[image->Register.Index];
3006 } else {
3007 /* Indexing and manual load */
3008 LLVMValueRef ind_index;
3009 LLVMValueRef rsrc_ptr;
3010 LLVMValueRef tmp;
3011
3012 /* From the GL_ARB_shader_image_load_store extension spec:
3013 *
3014 * If a shader performs an image load, store, or atomic
3015 * operation using an image variable declared as an array,
3016 * and if the index used to select an individual element is
3017 * negative or greater than or equal to the size of the
3018 * array, the results of the operation are undefined but may
3019 * not lead to termination.
3020 */
3021 ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
3022 image->Register.Index,
3023 SI_NUM_IMAGES);
3024
3025 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
3026 tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3027 if (dcc_off)
3028 tmp = force_dcc_off(ctx, tmp);
3029 *rsrc = tmp;
3030 }
3031 }
3032
3033 static LLVMValueRef image_fetch_coords(
3034 struct lp_build_tgsi_context *bld_base,
3035 const struct tgsi_full_instruction *inst,
3036 unsigned src)
3037 {
3038 struct gallivm_state *gallivm = bld_base->base.gallivm;
3039 LLVMBuilderRef builder = gallivm->builder;
3040 unsigned target = inst->Memory.Texture;
3041 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3042 LLVMValueRef coords[4];
3043 LLVMValueRef tmp;
3044 int chan;
3045
3046 for (chan = 0; chan < num_coords; ++chan) {
3047 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3048 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3049 coords[chan] = tmp;
3050 }
3051
3052 if (num_coords == 1)
3053 return coords[0];
3054
3055 if (num_coords == 3) {
3056 /* LLVM has difficulties lowering 3-element vectors. */
3057 coords[3] = bld_base->uint_bld.undef;
3058 num_coords = 4;
3059 }
3060
3061 return lp_build_gather_values(gallivm, coords, num_coords);
3062 }
3063
3064 /**
3065 * Append the extra mode bits that are used by image load and store.
3066 */
3067 static void image_append_args(
3068 struct si_shader_context *ctx,
3069 struct lp_build_emit_data * emit_data,
3070 unsigned target,
3071 bool atomic)
3072 {
3073 const struct tgsi_full_instruction *inst = emit_data->inst;
3074 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3075 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3076
3077 emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3078 emit_data->args[emit_data->arg_count++] =
3079 tgsi_is_array_image(target) ? i1true : i1false; /* da */
3080 if (!atomic) {
3081 emit_data->args[emit_data->arg_count++] =
3082 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3083 i1true : i1false; /* glc */
3084 }
3085 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3086 }
3087
3088 /**
3089 * Given a 256 bit resource, extract the top half (which stores the buffer
3090 * resource in the case of textures and images).
3091 */
3092 static LLVMValueRef extract_rsrc_top_half(
3093 struct si_shader_context *ctx,
3094 LLVMValueRef rsrc)
3095 {
3096 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3097 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3098 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3099
3100 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3101 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3102 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3103
3104 return rsrc;
3105 }
3106
3107 /**
3108 * Append the resource and indexing arguments for buffer intrinsics.
3109 *
3110 * \param rsrc the v4i32 buffer resource
3111 * \param index index into the buffer (stride-based)
3112 * \param offset byte offset into the buffer
3113 */
3114 static void buffer_append_args(
3115 struct si_shader_context *ctx,
3116 struct lp_build_emit_data *emit_data,
3117 LLVMValueRef rsrc,
3118 LLVMValueRef index,
3119 LLVMValueRef offset,
3120 bool atomic)
3121 {
3122 const struct tgsi_full_instruction *inst = emit_data->inst;
3123 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3124 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3125
3126 emit_data->args[emit_data->arg_count++] = rsrc;
3127 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3128 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3129 if (!atomic) {
3130 emit_data->args[emit_data->arg_count++] =
3131 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3132 i1true : i1false; /* glc */
3133 }
3134 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3135 }
3136
/* Gather the intrinsic arguments for a TGSI LOAD from a shader buffer or
 * an image. Loads from the MEMORY (LDS) file are handled entirely in
 * load_emit_memory and need no args here. */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x holds the offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images go through the buffer intrinsics;
			 * the buffer descriptor is in the descriptor's top
			 * half and the coordinate serves as the index. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3181
3182 static void load_emit_buffer(struct si_shader_context *ctx,
3183 struct lp_build_emit_data *emit_data)
3184 {
3185 const struct tgsi_full_instruction *inst = emit_data->inst;
3186 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3187 LLVMBuilderRef builder = gallivm->builder;
3188 uint writemask = inst->Dst[0].Register.WriteMask;
3189 uint count = util_last_bit(writemask);
3190 const char *intrinsic_name;
3191 LLVMTypeRef dst_type;
3192
3193 switch (count) {
3194 case 1:
3195 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3196 dst_type = ctx->f32;
3197 break;
3198 case 2:
3199 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3200 dst_type = LLVMVectorType(ctx->f32, 2);
3201 break;
3202 default: // 3 & 4
3203 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3204 dst_type = ctx->v4f32;
3205 count = 4;
3206 }
3207
3208 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3209 builder, intrinsic_name, dst_type,
3210 emit_data->args, emit_data->arg_count,
3211 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3212 }
3213
3214 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3215 const struct tgsi_full_instruction *inst,
3216 LLVMTypeRef type, int arg)
3217 {
3218 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3219 LLVMBuilderRef builder = gallivm->builder;
3220 LLVMValueRef offset, ptr;
3221 int addr_space;
3222
3223 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3224 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3225
3226 ptr = ctx->shared_memory;
3227 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3228 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3229 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3230
3231 return ptr;
3232 }
3233
3234 static void load_emit_memory(
3235 struct si_shader_context *ctx,
3236 struct lp_build_emit_data *emit_data)
3237 {
3238 const struct tgsi_full_instruction *inst = emit_data->inst;
3239 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3240 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3241 LLVMBuilderRef builder = gallivm->builder;
3242 unsigned writemask = inst->Dst[0].Register.WriteMask;
3243 LLVMValueRef channels[4], ptr, derived_ptr, index;
3244 int chan;
3245
3246 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3247
3248 for (chan = 0; chan < 4; ++chan) {
3249 if (!(writemask & (1 << chan))) {
3250 channels[chan] = LLVMGetUndef(base->elem_type);
3251 continue;
3252 }
3253
3254 index = lp_build_const_int32(gallivm, chan);
3255 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3256 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3257 }
3258 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3259 }
3260
3261 static void load_emit(
3262 const struct lp_build_tgsi_action *action,
3263 struct lp_build_tgsi_context *bld_base,
3264 struct lp_build_emit_data *emit_data)
3265 {
3266 struct si_shader_context *ctx = si_shader_context(bld_base);
3267 struct gallivm_state *gallivm = bld_base->base.gallivm;
3268 LLVMBuilderRef builder = gallivm->builder;
3269 const struct tgsi_full_instruction * inst = emit_data->inst;
3270 char intrinsic_name[32];
3271 char coords_type[8];
3272
3273 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3274 load_emit_memory(ctx, emit_data);
3275 return;
3276 }
3277
3278 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3279 emit_waitcnt(ctx);
3280
3281 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3282 load_emit_buffer(ctx, emit_data);
3283 return;
3284 }
3285
3286 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3287 emit_data->output[emit_data->chan] =
3288 lp_build_intrinsic(
3289 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3290 emit_data->args, emit_data->arg_count,
3291 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3292 } else {
3293 build_int_type_name(LLVMTypeOf(emit_data->args[0]),
3294 coords_type, sizeof(coords_type));
3295
3296 snprintf(intrinsic_name, sizeof(intrinsic_name),
3297 "llvm.amdgcn.image.load.%s", coords_type);
3298
3299 emit_data->output[emit_data->chan] =
3300 lp_build_intrinsic(
3301 builder, intrinsic_name, emit_data->dst_type,
3302 emit_data->args, emit_data->arg_count,
3303 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3304 }
3305 }
3306
/* Gather the intrinsic arguments for a TGSI STORE to a shader buffer or
 * an image. Stores to the MEMORY (LDS) file are handled entirely in
 * store_emit_memory and need no args here. */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	/* Stores return nothing. */
	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* The value to store comes from Src[1], gathered into a 4-vector;
	 * it is always the first intrinsic argument. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register names the buffer/image resource. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Src[0].x holds the offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images go through the buffer intrinsics. */
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			/* Pass dcc_off=true: image stores with DCC enabled
			 * can hang (see force_dcc_off). */
			emit_data->args[1] = coords;
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3365
/* Emit a TGSI STORE to a shader buffer, splitting the store by the
 * destination writemask into runs of consecutive channels, each emitted
 * as one llvm.amdgcn.buffer.store of the matching width. */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	/* args[0] is the full v4f32 value, args[3] the base byte offset,
	 * as set up by store_fetch_args; both are patched per sub-store. */
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. The third
		 * channel is pushed back onto the writemask for the
		 * next loop iteration. */
		if (count == 3) {
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from channels start and start+1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first channel. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMNoUnwindAttribute);
	}
}
3438
3439 static void store_emit_memory(
3440 struct si_shader_context *ctx,
3441 struct lp_build_emit_data *emit_data)
3442 {
3443 const struct tgsi_full_instruction *inst = emit_data->inst;
3444 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3445 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3446 LLVMBuilderRef builder = gallivm->builder;
3447 unsigned writemask = inst->Dst[0].Register.WriteMask;
3448 LLVMValueRef ptr, derived_ptr, data, index;
3449 int chan;
3450
3451 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3452
3453 for (chan = 0; chan < 4; ++chan) {
3454 if (!(writemask & (1 << chan))) {
3455 continue;
3456 }
3457 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3458 index = lp_build_const_int32(gallivm, chan);
3459 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3460 LLVMBuildStore(builder, data, derived_ptr);
3461 }
3462 }
3463
3464 static void store_emit(
3465 const struct lp_build_tgsi_action *action,
3466 struct lp_build_tgsi_context *bld_base,
3467 struct lp_build_emit_data *emit_data)
3468 {
3469 struct si_shader_context *ctx = si_shader_context(bld_base);
3470 struct gallivm_state *gallivm = bld_base->base.gallivm;
3471 LLVMBuilderRef builder = gallivm->builder;
3472 const struct tgsi_full_instruction * inst = emit_data->inst;
3473 unsigned target = inst->Memory.Texture;
3474 char intrinsic_name[32];
3475 char coords_type[8];
3476
3477 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3478 store_emit_memory(ctx, emit_data);
3479 return;
3480 }
3481
3482 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3483 emit_waitcnt(ctx);
3484
3485 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3486 store_emit_buffer(ctx, emit_data);
3487 return;
3488 }
3489
3490 if (target == TGSI_TEXTURE_BUFFER) {
3491 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3492 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3493 emit_data->dst_type, emit_data->args, emit_data->arg_count,
3494 LLVMNoUnwindAttribute);
3495 } else {
3496 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3497 coords_type, sizeof(coords_type));
3498 snprintf(intrinsic_name, sizeof(intrinsic_name),
3499 "llvm.amdgcn.image.store.%s", coords_type);
3500
3501 emit_data->output[emit_data->chan] =
3502 lp_build_intrinsic(
3503 builder, intrinsic_name, emit_data->dst_type,
3504 emit_data->args, emit_data->arg_count,
3505 LLVMNoUnwindAttribute);
3506 }
3507 }
3508
/* Gather the intrinsic arguments for a TGSI atomic on a shader buffer or
 * an image. Atomics on the MEMORY (LDS) file are handled entirely in
 * atomic_emit_memory and need no args here. */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	/* Src[2].x is the operand; for ATOMCAS, Src[3].x is the value to
	 * swap in and Src[2].x the compare value. */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x holds the offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* Disable DCC for writable non-buffer images
		 * (see force_dcc_off). */
		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images go through the buffer intrinsics. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3568
3569 static void atomic_emit_memory(struct si_shader_context *ctx,
3570 struct lp_build_emit_data *emit_data) {
3571 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3572 LLVMBuilderRef builder = gallivm->builder;
3573 const struct tgsi_full_instruction * inst = emit_data->inst;
3574 LLVMValueRef ptr, result, arg;
3575
3576 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3577
3578 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3579 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3580
3581 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3582 LLVMValueRef new_data;
3583 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
3584 inst, 3, 0);
3585
3586 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3587
3588 #if HAVE_LLVM >= 0x309
3589 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3590 LLVMAtomicOrderingSequentiallyConsistent,
3591 LLVMAtomicOrderingSequentiallyConsistent,
3592 false);
3593 #endif
3594
3595 result = LLVMBuildExtractValue(builder, result, 0, "");
3596 } else {
3597 LLVMAtomicRMWBinOp op;
3598
3599 switch(inst->Instruction.Opcode) {
3600 case TGSI_OPCODE_ATOMUADD:
3601 op = LLVMAtomicRMWBinOpAdd;
3602 break;
3603 case TGSI_OPCODE_ATOMXCHG:
3604 op = LLVMAtomicRMWBinOpXchg;
3605 break;
3606 case TGSI_OPCODE_ATOMAND:
3607 op = LLVMAtomicRMWBinOpAnd;
3608 break;
3609 case TGSI_OPCODE_ATOMOR:
3610 op = LLVMAtomicRMWBinOpOr;
3611 break;
3612 case TGSI_OPCODE_ATOMXOR:
3613 op = LLVMAtomicRMWBinOpXor;
3614 break;
3615 case TGSI_OPCODE_ATOMUMIN:
3616 op = LLVMAtomicRMWBinOpUMin;
3617 break;
3618 case TGSI_OPCODE_ATOMUMAX:
3619 op = LLVMAtomicRMWBinOpUMax;
3620 break;
3621 case TGSI_OPCODE_ATOMIMIN:
3622 op = LLVMAtomicRMWBinOpMin;
3623 break;
3624 case TGSI_OPCODE_ATOMIMAX:
3625 op = LLVMAtomicRMWBinOpMax;
3626 break;
3627 default:
3628 unreachable("unknown atomic opcode");
3629 }
3630
3631 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
3632 LLVMAtomicOrderingSequentiallyConsistent,
3633 false);
3634 }
3635 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
3636 }
3637
3638 static void atomic_emit(
3639 const struct lp_build_tgsi_action *action,
3640 struct lp_build_tgsi_context *bld_base,
3641 struct lp_build_emit_data *emit_data)
3642 {
3643 struct si_shader_context *ctx = si_shader_context(bld_base);
3644 struct gallivm_state *gallivm = bld_base->base.gallivm;
3645 LLVMBuilderRef builder = gallivm->builder;
3646 const struct tgsi_full_instruction * inst = emit_data->inst;
3647 char intrinsic_name[40];
3648 LLVMValueRef tmp;
3649
3650 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3651 atomic_emit_memory(ctx, emit_data);
3652 return;
3653 }
3654
3655 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3656 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3657 snprintf(intrinsic_name, sizeof(intrinsic_name),
3658 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
3659 } else {
3660 char coords_type[8];
3661
3662 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3663 coords_type, sizeof(coords_type));
3664 snprintf(intrinsic_name, sizeof(intrinsic_name),
3665 "llvm.amdgcn.image.atomic.%s.%s",
3666 action->intr_name, coords_type);
3667 }
3668
3669 tmp = lp_build_intrinsic(
3670 builder, intrinsic_name, bld_base->uint_bld.elem_type,
3671 emit_data->args, emit_data->arg_count,
3672 LLVMNoUnwindAttribute);
3673 emit_data->output[emit_data->chan] =
3674 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
3675 }
3676
/* Gather the arguments for a TGSI RESQ (resource query): either the raw
 * buffer/image descriptor, or the full getresinfo argument list for
 * non-buffer images. */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		/* resq_emit reads the size field out of the descriptor. */
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* resq_emit computes the size from the descriptor. */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		/* Argument list for llvm.SI.getresinfo.i32. */
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
3709
/* Emit a TGSI RESQ: return the size (in elements) of a buffer, or the
 * dimensions of an image. */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Element 2 of a buffer descriptor is its size field. */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			/* The layer count is in the Z channel; do the
			 * division in integers and cast back. */
			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
3746
3747 static void set_tex_fetch_args(struct si_shader_context *ctx,
3748 struct lp_build_emit_data *emit_data,
3749 unsigned opcode, unsigned target,
3750 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
3751 LLVMValueRef *param, unsigned count,
3752 unsigned dmask)
3753 {
3754 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3755 unsigned num_args;
3756 unsigned is_rect = target == TGSI_TEXTURE_RECT;
3757
3758 /* Pad to power of two vector */
3759 while (count < util_next_power_of_two(count))
3760 param[count++] = LLVMGetUndef(ctx->i32);
3761
3762 /* Texture coordinates. */
3763 if (count > 1)
3764 emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
3765 else
3766 emit_data->args[0] = param[0];
3767
3768 /* Resource. */
3769 emit_data->args[1] = res_ptr;
3770 num_args = 2;
3771
3772 if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
3773 emit_data->dst_type = ctx->v4i32;
3774 else {
3775 emit_data->dst_type = ctx->v4f32;
3776
3777 emit_data->args[num_args++] = samp_ptr;
3778 }
3779
3780 emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
3781 emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
3782 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
3783 emit_data->args[num_args++] = lp_build_const_int32(gallivm,
3784 tgsi_is_array_sampler(target)); /* da */
3785 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
3786 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
3787 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
3788 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */
3789
3790 emit_data->arg_count = num_args;
3791 }
3792
3793 static const struct lp_build_tgsi_action tex_action;
3794
/* Which descriptor of a texture unit to load from the sampler list. */
enum desc_type {
	DESC_IMAGE,	/* image/texture resource descriptor */
	DESC_FMASK,	/* FMASK descriptor for MSAA resources */
	DESC_SAMPLER	/* sampler state */
};
3800
/* Return the LLVM type: pointer-to-array of \p num_elements elements of
 * \p elem_type, in the constant address space. */
static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
{
	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
			       CONST_ADDR_SPACE);
}
3806
/**
 * Load an image view, fmask view, or sampler state descriptor.
 *
 * \param list   pointer to the descriptor list
 * \param index  texture unit index; scaled below into a slot index
 *               within the list according to \p type
 * \param type   which descriptor of the unit to load
 */
static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
					    LLVMValueRef list, LLVMValueRef index,
					    enum desc_type type)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	switch (type) {
	case DESC_IMAGE:
		/* The image is at [0:7]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		break;
	case DESC_FMASK:
		/* The FMASK is at [8:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
		break;
	case DESC_SAMPLER:
		/* The sampler state is at [12:15]. Samplers are 4 dwords,
		 * so reinterpret the list as v4i32 slots and index the
		 * last quarter of each unit. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
		list = LLVMBuildPointerCast(builder, list,
					    const_array(ctx->v4i32, 0), "");
		break;
	}

	return build_indexed_load_const(ctx, list, index);
}
3838
3839 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
3840 LLVMValueRef index, enum desc_type type)
3841 {
3842 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
3843 SI_PARAM_SAMPLERS);
3844
3845 return get_sampler_desc_custom(ctx, list, index, type);
3846 }
3847
3848 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
3849 *
3850 * SI-CI:
3851 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
3852 * filtering manually. The driver sets img7 to a mask clearing
3853 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
3854 * s_and_b32 samp0, samp0, img7
3855 *
3856 * VI:
3857 * The ANISO_OVERRIDE sampler field enables this fix in TA.
3858 */
3859 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
3860 LLVMValueRef res, LLVMValueRef samp)
3861 {
3862 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3863 LLVMValueRef img7, samp0;
3864
3865 if (ctx->screen->b.chip_class >= VI)
3866 return samp;
3867
3868 img7 = LLVMBuildExtractElement(builder, res,
3869 LLVMConstInt(ctx->i32, 7, 0), "");
3870 samp0 = LLVMBuildExtractElement(builder, samp,
3871 LLVMConstInt(ctx->i32, 0, 0), "");
3872 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
3873 return LLVMBuildInsertElement(builder, samp, samp0,
3874 LLVMConstInt(ctx->i32, 0, 0), "");
3875 }
3876
/* Resolve the resource, sampler-state and FMASK descriptors for a
 * texture instruction. The sampler operand is always the instruction's
 * last source. Indirectly indexed samplers load the descriptors here;
 * directly indexed ones use the values preloaded into the context.
 *
 * MSAA targets get an FMASK descriptor and no sampler state;
 * all other targets get sampler state and no FMASK.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Clamp the dynamic index so it cannot address outside
		 * the descriptor array. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			*samp_ptr = NULL;
			*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
			/* SI-CI anisotropy workaround (no-op on VI+). */
			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			*fmask_ptr = NULL;
		}
	} else {
		/* Direct index: descriptors were preloaded at shader start. */
		*res_ptr = ctx->sampler_views[sampler_index];
		*samp_ptr = ctx->sampler_states[sampler_index];
		*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
3917
/* Build the argument list for a texture instruction.
 *
 * Packs the address components in the hardware-mandated order:
 * offsets, LOD bias, depth-compare value, user derivatives, coordinates,
 * then LOD/sample index. Also handles the special cases (TXQ, buffer
 * textures), the FMASK sample-index remap for MSAA fetches, TXF integer
 * offsets and the TG4 component-select dmask.
 */
static void tex_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef coords[5], derivs[6];
	LLVMValueRef address[16];
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
	unsigned count = 0;
	unsigned chan;
	unsigned num_deriv_channels = 0;
	bool has_offset = inst->Texture.NumOffsets > 0;
	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
	unsigned dmask = 0xf;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);

	if (opcode == TGSI_OPCODE_TXQ) {
		if (target == TGSI_TEXTURE_BUFFER) {
			/* Read the size from the buffer descriptor directly. */
			LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
			emit_data->args[0] = get_buffer_size(bld_base, res);
			return;
		}

		/* Textures - set the mip level. */
		address[count++] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

		set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
				   NULL, address, count, 0xf);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);

		/* Bitcast and truncate v8i32 to v16i8: buffer fetches only
		 * need the second half of the resource descriptor. */
		LLVMValueRef res = res_ptr;
		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
		res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");

		emit_data->dst_type = ctx->v4f32;
		emit_data->args[0] = res;
		emit_data->args[1] = bld_base->uint_bld.zero;
		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
		emit_data->arg_count = 3;
		return;
	}

	/* Fetch and project texture coordinates (TXP divides xyz by w). */
	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
	for (chan = 0; chan < 3; chan++ ) {
		coords[chan] = lp_build_emit_fetch(bld_base,
						   emit_data->inst, 0,
						   chan);
		if (opcode == TGSI_OPCODE_TXP)
			coords[chan] = lp_build_emit_llvm_binary(bld_base,
								 TGSI_OPCODE_DIV,
								 coords[chan],
								 coords[3]);
	}

	if (opcode == TGSI_OPCODE_TXP)
		coords[3] = bld_base->base.one;

	/* Pack offsets. */
	if (has_offset && opcode != TGSI_OPCODE_TXF) {
		/* The offsets are six-bit signed integers packed like this:
		 * X=[5:0], Y=[13:8], and Z=[21:16].
		 */
		LLVMValueRef offset[3], pack;

		assert(inst->Texture.NumOffsets == 1);

		for (chan = 0; chan < 3; chan++) {
			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
								     emit_data->inst, 0, chan);
			/* Keep only the low 6 bits, then shift into place. */
			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
						    lp_build_const_int32(gallivm, 0x3f), "");
			if (chan)
				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
							    lp_build_const_int32(gallivm, chan*8), "");
		}

		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
		address[count++] = pack;
	}

	/* Pack LOD bias value */
	if (opcode == TGSI_OPCODE_TXB)
		address[count++] = coords[3];
	if (opcode == TGSI_OPCODE_TXB2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	/* Pack depth comparison value */
	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			/* Shadow cube arrays carry the reference in src1.x. */
			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
		} else {
			assert(ref_pos >= 0);
			address[count++] = coords[ref_pos];
		}
	}

	/* Pack user derivatives */
	if (opcode == TGSI_OPCODE_TXD) {
		int param, num_src_deriv_channels;

		switch (target) {
		case TGSI_TEXTURE_3D:
			num_src_deriv_channels = 3;
			num_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_RECT:
		case TGSI_TEXTURE_SHADOWRECT:
		case TGSI_TEXTURE_2D_ARRAY:
		case TGSI_TEXTURE_SHADOW2D_ARRAY:
			num_src_deriv_channels = 2;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_CUBE:
		case TGSI_TEXTURE_SHADOWCUBE:
		case TGSI_TEXTURE_CUBE_ARRAY:
		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
			/* Cube derivatives will be converted to 2D. */
			num_src_deriv_channels = 3;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_1D:
		case TGSI_TEXTURE_SHADOW1D:
		case TGSI_TEXTURE_1D_ARRAY:
		case TGSI_TEXTURE_SHADOW1D_ARRAY:
			num_src_deriv_channels = 1;
			num_deriv_channels = 1;
			break;
		default:
			unreachable("invalid target");
		}

		/* src1 holds the X derivatives, src2 the Y derivatives. */
		for (param = 0; param < 2; param++)
			for (chan = 0; chan < num_src_deriv_channels; chan++)
				derivs[param * num_src_deriv_channels + chan] =
					lp_build_emit_fetch(bld_base, inst, param+1, chan);
	}

	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);

	if (opcode == TGSI_OPCODE_TXD)
		for (int i = 0; i < num_deriv_channels * 2; i++)
			address[count++] = derivs[i];

	/* Pack texture coordinates */
	address[count++] = coords[0];
	if (num_coords > 1)
		address[count++] = coords[1];
	if (num_coords > 2)
		address[count++] = coords[2];

	/* Pack LOD or sample index */
	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
		address[count++] = coords[3];
	else if (opcode == TGSI_OPCODE_TXL2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	if (count > 16) {
		assert(!"Cannot handle more than 16 texture address parameters");
		count = 16;
	}

	/* All address components are passed as i32 bit patterns. */
	for (chan = 0; chan < count; chan++ ) {
		address[chan] = LLVMBuildBitCast(gallivm->builder,
						 address[chan], ctx->i32, "");
	}

	/* Adjust the sample index according to FMASK.
	 *
	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * For example, 0x11111100 means there are only 2 samples stored and
	 * the second sample covers 3/4 of the pixel. When reading samples 0
	 * and 1, return physical sample 0 (determined by the first two 0s
	 * in FMASK), otherwise return physical sample 1.
	 *
	 * The sample index should be adjusted as follows:
	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
	 */
	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		struct lp_build_context *uint_bld = &bld_base->uint_bld;
		struct lp_build_emit_data txf_emit_data = *emit_data;
		LLVMValueRef txf_address[4];
		unsigned txf_count = count;
		struct tgsi_full_instruction inst = {};

		memcpy(txf_address, address, sizeof(txf_address));

		if (target == TGSI_TEXTURE_2D_MSAA) {
			txf_address[2] = bld_base->uint_bld.zero;
		}
		txf_address[3] = bld_base->uint_bld.zero;

		/* Read FMASK using TXF. */
		inst.Instruction.Opcode = TGSI_OPCODE_TXF;
		inst.Texture.Texture = target;
		txf_emit_data.inst = &inst;
		txf_emit_data.chan = 0;
		set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
				   target, fmask_ptr, NULL,
				   txf_address, txf_count, 0xf);
		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);

		/* Initialize some constants. */
		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);

		/* Apply the formula. */
		LLVMValueRef fmask =
			LLVMBuildExtractElement(gallivm->builder,
						txf_emit_data.output[0],
						uint_bld->zero, "");

		unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;

		LLVMValueRef sample_index4 =
			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");

		LLVMValueRef shifted_fmask =
			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");

		LLVMValueRef final_sample =
			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");

		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
		 * resource descriptor is 0 (invalid),
		 */
		LLVMValueRef fmask_desc =
			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
					 ctx->v8i32, "");

		LLVMValueRef fmask_word1 =
			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
						uint_bld->one, "");

		LLVMValueRef word1_is_nonzero =
			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
				      fmask_word1, uint_bld->zero, "");

		/* Replace the MSAA sample index. */
		address[sample_chan] =
			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
					final_sample, address[sample_chan], "");
	}

	if (opcode == TGSI_OPCODE_TXF) {
		/* add tex offsets */
		if (inst->Texture.NumOffsets) {
			struct lp_build_context *uint_bld = &bld_base->uint_bld;
			struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			assert(inst->Texture.NumOffsets == 1);

			/* TXF offsets are integer texel counts, so they are
			 * simply added to the coordinates; the cases fall
			 * through from higher to lower dimensionality. */
			switch (target) {
			case TGSI_TEXTURE_3D:
				address[2] = lp_build_add(uint_bld, address[2],
						bld->immediates[off->Index][off->SwizzleZ]);
				/* fall through */
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				address[1] =
					lp_build_add(uint_bld, address[1],
						bld->immediates[off->Index][off->SwizzleY]);
				/* fall through */
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				address[0] =
					lp_build_add(uint_bld, address[0],
						bld->immediates[off->Index][off->SwizzleX]);
				break;
				/* texture offsets do not apply to other texture targets */
			}
		}
	}

	if (opcode == TGSI_OPCODE_TG4) {
		unsigned gather_comp = 0;

		/* DMASK was repurposed for GATHER4. 4 components are always
		 * returned and DMASK works like a swizzle - it selects
		 * the component to fetch. The only valid DMASK values are
		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
		 * (red,red,red,red) etc.) The ISA document doesn't mention
		 * this.
		 */

		/* Get the component index from src1.x for Gather4. */
		if (!tgsi_is_shadow_target(target)) {
			LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
			LLVMValueRef comp_imm;
			struct tgsi_src_register src1 = inst->Src[1].Register;

			assert(src1.File == TGSI_FILE_IMMEDIATE);

			comp_imm = imms[src1.Index][src1.SwizzleX];
			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
			gather_comp = CLAMP(gather_comp, 0, 3);
		}

		dmask = 1 << gather_comp;
	}

	set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
			   samp_ptr, address, count, dmask);
}
4254
/* Emit the texture intrinsic selected by the opcode/target.
 *
 * Assembles the intrinsic name as base + ".c" (shadow) + bias/lod/deriv
 * infix + ".o" (offset) + coordinate-vector type suffix, then emits the
 * call. Buffer targets and TXQ-on-buffer are special-cased, and TXQ on
 * cube arrays converts the reported layer count to a cube count.
 */
static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct lp_build_context *base = &bld_base->base;
	unsigned opcode = emit_data->inst->Instruction.Opcode;
	unsigned target = emit_data->inst->Texture.Texture;
	char intr_name[127];
	bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
	bool is_shadow = tgsi_is_shadow_target(target);
	char type[64];
	const char *name = "llvm.SI.image.sample";
	const char *infix = "";

	if (opcode == TGSI_OPCODE_TXQ && target == TGSI_TEXTURE_BUFFER) {
		/* Just return the buffer size. */
		emit_data->output[emit_data->chan] = emit_data->args[0];
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Buffer fetches go through the typed buffer-load path. */
		emit_data->output[emit_data->chan] = lp_build_intrinsic(
			base->gallivm->builder,
			"llvm.SI.vs.load.input", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		return;
	}

	switch (opcode) {
	case TGSI_OPCODE_TXF:
		name = target == TGSI_TEXTURE_2D_MSAA ||
		       target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
			       "llvm.SI.image.load" :
			       "llvm.SI.image.load.mip";
		is_shadow = false;
		has_offset = false;
		break;
	case TGSI_OPCODE_TXQ:
		name = "llvm.SI.getresinfo";
		is_shadow = false;
		has_offset = false;
		break;
	case TGSI_OPCODE_LODQ:
		name = "llvm.SI.getlod";
		is_shadow = false;
		has_offset = false;
		break;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TEX2:
	case TGSI_OPCODE_TXP:
		break;
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
		infix = ".b";
		break;
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXL2:
		infix = ".l";
		break;
	case TGSI_OPCODE_TXD:
		infix = ".d";
		break;
	case TGSI_OPCODE_TG4:
		name = "llvm.SI.gather4";
		break;
	default:
		assert(0);
		return;
	}

	/* Add the type and suffixes .c, .o if needed. */
	build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
	sprintf(intr_name, "%s%s%s%s.%s",
		name, is_shadow ? ".c" : "", infix,
		has_offset ? ".o" : "", type);

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
		base->gallivm->builder, intr_name, emit_data->dst_type,
		emit_data->args, emit_data->arg_count,
		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Divide the number of layers by 6 to get the number of cubes. */
	if (opcode == TGSI_OPCODE_TXQ &&
	    (target == TGSI_TEXTURE_CUBE_ARRAY ||
	     target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)) {
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
		LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);

		LLVMValueRef v4 = emit_data->output[emit_data->chan];
		LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
		z = LLVMBuildSDiv(builder, z, six, "");

		emit_data->output[emit_data->chan] =
			LLVMBuildInsertElement(builder, v4, z, two, "");
	}
}
4353
4354 static void si_llvm_emit_txqs(
4355 const struct lp_build_tgsi_action *action,
4356 struct lp_build_tgsi_context *bld_base,
4357 struct lp_build_emit_data *emit_data)
4358 {
4359 struct si_shader_context *ctx = si_shader_context(bld_base);
4360 struct gallivm_state *gallivm = bld_base->base.gallivm;
4361 LLVMBuilderRef builder = gallivm->builder;
4362 LLVMValueRef res, samples;
4363 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4364
4365 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4366
4367
4368 /* Read the samples from the descriptor directly. */
4369 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4370 samples = LLVMBuildExtractElement(
4371 builder, res,
4372 lp_build_const_int32(gallivm, 3), "");
4373 samples = LLVMBuildLShr(builder, samples,
4374 lp_build_const_int32(gallivm, 16), "");
4375 samples = LLVMBuildAnd(builder, samples,
4376 lp_build_const_int32(gallivm, 0xf), "");
4377 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4378 samples, "");
4379
4380 emit_data->output[emit_data->chan] = samples;
4381 }
4382
4383 /*
4384 * SI implements derivatives using the local data store (LDS)
4385 * All writes to the LDS happen in all executing threads at
4386 * the same time. TID is the Thread ID for the current
4387 * thread and is a value between 0 and 63, representing
4388 * the thread's position in the wavefront.
4389 *
4390 * For the pixel shader threads are grouped into quads of four pixels.
4391 * The TIDs of the pixels of a quad are:
4392 *
4393 * +------+------+
4394 * |4n + 0|4n + 1|
4395 * +------+------+
4396 * |4n + 2|4n + 3|
4397 * +------+------+
4398 *
4399 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4400 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4401 * the current pixel's column, and masking with 0xfffffffe yields the TID
4402 * of the left pixel of the current pixel's row.
4403 *
4404 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4405 * adding 2 yields the TID of the pixel below the top pixel.
4406 */
4407 /* masks for thread ID. */
4408 #define TID_MASK_TOP_LEFT 0xfffffffc
4409 #define TID_MASK_TOP 0xfffffffd
4410 #define TID_MASK_LEFT 0xfffffffe
4411
/* Emit DDX/DDY (and their _FINE variants) for a fragment shader.
 *
 * Each lane writes its value, then reads the values of neighboring
 * quad lanes (selected by masking/offsetting the thread ID as described
 * in the comment above) and subtracts to form the derivative. On
 * LLVM >= 3.9 with VI+ the exchange uses llvm.amdgcn.ds.bpermute
 * instead of an LDS store/load round trip.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* LDS slot owned by this thread. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Coarse derivatives use the quad's top-left pixel as the base;
	 * fine derivatives use the pixel's own row/column. */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* for DDX we want the next X pixel, for DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* Reuse the result of an earlier channel with the same
		 * source swizzle. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* ds.bpermute takes a byte address — hence tid*4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Exchange through LDS: store own value, read the
			 * two neighbor slots. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4501
/*
 * This takes an I,J coordinate pair and works out the X and Y
 * derivatives via the same LDS quad-exchange used by si_llvm_emit_ddxy.
 * It returns a v4f32: DDX(I), DDX(J), DDY(I), DDY(J).
 */
static LLVMValueRef si_llvm_emit_ddxy_interp(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef interp_ij)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
	LLVMValueRef tl, tr, bl, result[4];
	unsigned c;

	/* LDS slot owned by this thread. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* temp = TID of the left pixel of this row,
	 * temp2 = TID of the top pixel of this column. */
	temp = LLVMBuildAnd(gallivm->builder, indices[1],
			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");

	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");

	indices[1] = temp;
	load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	indices[1] = temp2;
	load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	/* +1 = pixel to the right of the left pixel,
	 * +2 = pixel below the top pixel. */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
				  lp_build_const_int32(gallivm, 1), "");
	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
				  lp_build_const_int32(gallivm, 2), "");
	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	/* c = 0 handles I, c = 1 handles J. */
	for (c = 0; c < 2; ++c) {
		LLVMValueRef store_val;
		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);

		store_val = LLVMBuildExtractElement(gallivm->builder,
						    interp_ij, c_ll, "");
		LLVMBuildStore(gallivm->builder,
			       store_val,
			       store_ptr);

		/* DDX = right neighbor minus left base. */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
		tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");

		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");

		/* DDY = bottom neighbor minus top base. */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
		bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");

		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
	}

	return lp_build_gather_values(gallivm, result, 4);
}
4576
4577 static void interp_fetch_args(
4578 struct lp_build_tgsi_context *bld_base,
4579 struct lp_build_emit_data *emit_data)
4580 {
4581 struct si_shader_context *ctx = si_shader_context(bld_base);
4582 struct gallivm_state *gallivm = bld_base->base.gallivm;
4583 const struct tgsi_full_instruction *inst = emit_data->inst;
4584
4585 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
4586 /* offset is in second src, first two channels */
4587 emit_data->args[0] = lp_build_emit_fetch(bld_base,
4588 emit_data->inst, 1,
4589 TGSI_CHAN_X);
4590 emit_data->args[1] = lp_build_emit_fetch(bld_base,
4591 emit_data->inst, 1,
4592 TGSI_CHAN_Y);
4593 emit_data->arg_count = 2;
4594 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4595 LLVMValueRef sample_position;
4596 LLVMValueRef sample_id;
4597 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
4598
4599 /* fetch sample ID, then fetch its sample position,
4600 * and place into first two channels.
4601 */
4602 sample_id = lp_build_emit_fetch(bld_base,
4603 emit_data->inst, 1, TGSI_CHAN_X);
4604 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
4605 ctx->i32, "");
4606 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
4607
4608 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
4609 sample_position,
4610 lp_build_const_int32(gallivm, 0), "");
4611
4612 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
4613 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
4614 sample_position,
4615 lp_build_const_int32(gallivm, 1), "");
4616 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
4617 emit_data->arg_count = 2;
4618 }
4619 }
4620
/* Emit TGSI INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE.
 *
 * interp_fetch_args has already placed the offset (or the sample position
 * relative to the pixel center) into emit_data->args[0..1].  For
 * OFFSET/SAMPLE the center barycentrics (I, J) are adjusted here using
 * their screen-space derivatives, then the attribute is interpolated with
 * llvm.SI.fs.interp — or read with llvm.SI.fs.constant when the input is
 * flat-shaded and no barycentrics exist.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				   struct lp_build_tgsi_context *bld_base,
				   struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* OFFSET/SAMPLE start from the center barycentrics and adjust them
	 * below; INTERP_CENTROID uses the centroid barycentrics directly. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* -1: unsupported mode, 0: flat shading (no barycentrics),
	 * >0: index of the main function parameter holding (I, J). */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			/* The parameter vector elements are i32; do the math as f32. */
			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			/* Back to i32 for gathering into the intrinsic operand. */
			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	/* Flat inputs use fs.constant, which takes no barycentrics. */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;	/* SI_PARAM_PRIM_MASK */
		args[3] = interp_param;

		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
	}
}
4719
4720 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4721 struct lp_build_emit_data *emit_data)
4722 {
4723 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4724 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4725 unsigned stream;
4726
4727 assert(src0.File == TGSI_FILE_IMMEDIATE);
4728
4729 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
4730 return stream;
4731 }
4732
/* Emit one vertex from the geometry shader.
 *
 * Writes all declared outputs of the current vertex to the GSVS ring of
 * the selected stream, increments the per-stream vertex counter, and
 * sends the EMIT message.  Threads that would exceed the declared maximum
 * vertex count are killed instead, since excess emissions must have no
 * observable effect.
 */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	/* The kill intrinsic discards the thread when its argument is < 0,
	 * so select +1.0 (keep) or -1.0 (kill). */
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Ring layout: each output channel owns a run of
			 * gs_max_out_vertices dwords, indexed by the vertex. */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4); /* dword index -> bytes */

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, LLVMNoUnwindAttribute);
}
4810
4811 /* Cut one primitive from the geometry shader */
4812 static void si_llvm_emit_primitive(
4813 const struct lp_build_tgsi_action *action,
4814 struct lp_build_tgsi_context *bld_base,
4815 struct lp_build_emit_data *emit_data)
4816 {
4817 struct si_shader_context *ctx = si_shader_context(bld_base);
4818 struct gallivm_state *gallivm = bld_base->base.gallivm;
4819 LLVMValueRef args[2];
4820 unsigned stream;
4821
4822 /* Signal primitive cut */
4823 stream = si_llvm_get_stream(bld_base, emit_data);
4824 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
4825 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
4826 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
4827 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
4828 }
4829
4830 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4831 struct lp_build_tgsi_context *bld_base,
4832 struct lp_build_emit_data *emit_data)
4833 {
4834 struct si_shader_context *ctx = si_shader_context(bld_base);
4835 struct gallivm_state *gallivm = bld_base->base.gallivm;
4836
4837 /* The real barrier instruction isn’t needed, because an entire patch
4838 * always fits into a single wave.
4839 */
4840 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
4841 emit_optimization_barrier(ctx);
4842 return;
4843 }
4844
4845 lp_build_intrinsic(gallivm->builder,
4846 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
4847 : "llvm.AMDGPU.barrier.local",
4848 ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
4849 }
4850
/* Dispatch table entry for TGSI texture opcodes: fetch the operands,
 * then emit the image-sample intrinsic. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
4855
/* Dispatch table entry for the TGSI INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4860
4861 static void si_create_function(struct si_shader_context *ctx,
4862 LLVMTypeRef *returns, unsigned num_returns,
4863 LLVMTypeRef *params, unsigned num_params,
4864 int last_array_pointer, int last_sgpr)
4865 {
4866 int i;
4867
4868 radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
4869 params, num_params);
4870 radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
4871 ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
4872
4873 for (i = 0; i <= last_sgpr; ++i) {
4874 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
4875
4876 /* We tell llvm that array inputs are passed by value to allow Sinking pass
4877 * to move load. Inputs are constant so this is fine. */
4878 if (i <= last_array_pointer)
4879 LLVMAddAttribute(P, LLVMByValAttribute);
4880 else
4881 LLVMAddAttribute(P, LLVMInRegAttribute);
4882 }
4883 }
4884
/* Build the LLVM metadata handles kept in the context for tagging
 * instructions later: a "const" node, the amdgpu.uniform metadata kind,
 * and an empty node.
 */
static void create_meta_data(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef args[3];

	/* !{!"const", null, i32 1} */
	args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
	/* NULL operand — NOTE(review): presumably the TBAA parent/root slot;
	 * confirm against LLVM's invariant/TBAA metadata format. */
	args[1] = 0;
	args[2] = lp_build_const_int32(gallivm, 1);

	ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);

	ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
							"amdgpu.uniform", 14);

	ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
}
4901
4902 static void declare_streamout_params(struct si_shader_context *ctx,
4903 struct pipe_stream_output_info *so,
4904 LLVMTypeRef *params, LLVMTypeRef i32,
4905 unsigned *num_params)
4906 {
4907 int i;
4908
4909 /* Streamout SGPRs. */
4910 if (so->num_outputs) {
4911 params[ctx->param_streamout_config = (*num_params)++] = i32;
4912 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
4913 }
4914 /* A streamout buffer offset is loaded if the stride is non-zero. */
4915 for (i = 0; i < 4; i++) {
4916 if (!so->stride[i])
4917 continue;
4918
4919 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
4920 }
4921 }
4922
4923 static unsigned llvm_get_type_size(LLVMTypeRef type)
4924 {
4925 LLVMTypeKind kind = LLVMGetTypeKind(type);
4926
4927 switch (kind) {
4928 case LLVMIntegerTypeKind:
4929 return LLVMGetIntTypeWidth(type) / 8;
4930 case LLVMFloatTypeKind:
4931 return 4;
4932 case LLVMPointerTypeKind:
4933 return 8;
4934 case LLVMVectorTypeKind:
4935 return LLVMGetVectorSize(type) *
4936 llvm_get_type_size(LLVMGetElementType(type));
4937 default:
4938 assert(0);
4939 return 0;
4940 }
4941 }
4942
/* Declare the LDS array used to pass data between tessellation stages
 * within a workgroup.
 */
static void declare_tess_lds(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;

	/* This is the upper bound, maximum is 32 inputs times 32 vertices */
	unsigned vertex_data_dw_size = 32*32*4;
	unsigned patch_data_dw_size = 32*4;
	/* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
	unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
	unsigned lds_dwords = patch_dw_size;

	if (ctx->screen->b.chip_class <= SI) {
		/* This is a horrible temporary workaround to make tessellation
		 * not be completely broken on SI now that LLVM checks that
		 * the declared LDS size fits into the device maximum of 32KB.
		 * 8K dwords * 4 bytes = 32KB exactly.
		 */
		lds_dwords = 8 * 1024;
	}

	/* The actual size is computed outside of the shader to reduce
	 * the number of shader variants. */
	ctx->lds =
		LLVMAddGlobalInAddressSpace(gallivm->module,
					    LLVMArrayType(i32, lds_dwords),
					    "tess_lds",
					    LOCAL_ADDR_SPACE);
}
4971
/* Build the per-stage LLVM function signature and create the function.
 *
 * SGPR parameters come first (shared descriptor arrays, then per-stage
 * state up to last_sgpr), followed by the stage's VGPR system values.
 * Non-monolithic shader parts also declare return values, which are
 * handed to the epilog part.  Finally, any LDS the stage needs is
 * declared.
 */
static void create_function(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
	LLVMTypeRef returns[16+32*4];
	unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
	unsigned num_returns = 0;

	v3i32 = LLVMVectorType(ctx->i32, 3);

	/* Descriptor arrays shared by all stages. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
	params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
	last_array_pointer = SI_PARAM_SHADER_BUFFERS;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
		last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
		params[SI_PARAM_BASE_VERTEX] = ctx->i32;
		params[SI_PARAM_START_INSTANCE] = ctx->i32;
		num_params = SI_PARAM_START_INSTANCE+1;

		/* The trailing SGPRs depend on which hardware stage the VS
		 * runs as: ES (feeding a GS), LS (feeding tessellation), or
		 * the real VS stage. */
		if (shader->key.vs.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else if (shader->key.vs.as_ls) {
			params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
		} else {
			if (ctx->is_gs_copy_shader) {
				last_array_pointer = SI_PARAM_RW_BUFFERS;
				num_params = SI_PARAM_RW_BUFFERS+1;
			} else {
				params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
				num_params = SI_PARAM_VS_STATE_BITS+1;
			}

			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}

		last_sgpr = num_params-1;

		/* VGPRs */
		params[ctx->param_vertex_id = num_params++] = ctx->i32;
		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_instance_id = num_params++] = ctx->i32;

		if (!ctx->is_monolithic &&
		    !ctx->is_gs_copy_shader) {
			/* Vertex load indices. */
			ctx->param_vertex_index0 = num_params;

			for (i = 0; i < shader->selector->info.num_inputs; i++)
				params[num_params++] = ctx->i32;

			/* PrimitiveID output. */
			if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
					returns[num_returns++] = ctx->f32;
		}
		break;

	case PIPE_SHADER_TESS_CTRL:
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
		params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;

		/* VGPRs */
		params[SI_PARAM_PATCH_ID] = ctx->i32;
		params[SI_PARAM_REL_IDS] = ctx->i32;
		num_params = SI_PARAM_REL_IDS+1;

		if (!ctx->is_monolithic) {
			/* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
			for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */

			for (i = 0; i < 3; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		num_params = SI_PARAM_TCS_OUT_LAYOUT+1;

		/* TES can run as ES (feeding a GS) or as the real VS stage. */
		if (shader->key.tes.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else {
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_tes_u = num_params++] = ctx->f32;
		params[ctx->param_tes_v = num_params++] = ctx->f32;
		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;

		/* PrimitiveID output. */
		if (!ctx->is_monolithic && !shader->key.tes.as_es)
			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
				returns[num_returns++] = ctx->f32;
		break;

	case PIPE_SHADER_GEOMETRY:
		params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
		last_sgpr = SI_PARAM_GS_WAVE_ID;

		/* VGPRs */
		params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
		params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
		params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
		num_params = SI_PARAM_GS_INSTANCE_ID+1;
		break;

	case PIPE_SHADER_FRAGMENT:
		params[SI_PARAM_ALPHA_REF] = ctx->f32;
		params[SI_PARAM_PRIM_MASK] = ctx->i32;
		last_sgpr = SI_PARAM_PRIM_MASK;
		/* VGPRs: barycentrics, position, and misc system values. */
		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
		params[SI_PARAM_FRONT_FACE] = ctx->i32;
		params[SI_PARAM_ANCILLARY] = ctx->i32;
		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
		num_params = SI_PARAM_POS_FIXED_PT+1;

		if (!ctx->is_monolithic) {
			/* Color inputs from the prolog. */
			if (shader->selector->info.colors_read) {
				unsigned num_color_elements =
					util_bitcount(shader->selector->info.colors_read);

				assert(num_params + num_color_elements <= ARRAY_SIZE(params));
				for (i = 0; i < num_color_elements; i++)
					params[num_params++] = ctx->f32;
			}

			/* Outputs for the epilog. */
			num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
			num_returns =
				num_return_sgprs +
				util_bitcount(shader->selector->info.colors_written) * 4 +
				shader->selector->info.writes_z +
				shader->selector->info.writes_stencil +
				shader->selector->info.writes_samplemask +
				1 /* SampleMaskIn */;

			num_returns = MAX2(num_returns,
					   num_return_sgprs +
					   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

			for (i = 0; i < num_return_sgprs; i++)
				returns[i] = ctx->i32;
			for (; i < num_returns; i++)
				returns[i] = ctx->f32;
		}
		break;

	case PIPE_SHADER_COMPUTE:
		params[SI_PARAM_GRID_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_ID] = v3i32;
		last_sgpr = SI_PARAM_BLOCK_ID;

		params[SI_PARAM_THREAD_ID] = v3i32;
		num_params = SI_PARAM_THREAD_ID + 1;
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	assert(num_params <= Elements(params));

	si_create_function(ctx, returns, num_returns, params,
			   num_params, last_array_pointer, last_sgpr);

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT &&
	    !ctx->is_monolithic) {
		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "InitialPSInputAddr",
					  S_0286D0_PERSP_SAMPLE_ENA(1) |
					  S_0286D0_PERSP_CENTER_ENA(1) |
					  S_0286D0_PERSP_CENTROID_ENA(1) |
					  S_0286D0_LINEAR_SAMPLE_ENA(1) |
					  S_0286D0_LINEAR_CENTER_ENA(1) |
					  S_0286D0_LINEAR_CENTROID_ENA(1) |
					  S_0286D0_FRONT_FACE_ENA(1) |
					  S_0286D0_POS_FIXED_PT_ENA(1));
	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
		const unsigned *properties = shader->selector->info.properties;
		unsigned max_work_group_size =
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

		assert(max_work_group_size);

		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "amdgpu-max-work-group-size",
					  max_work_group_size);
	}

	shader->info.num_input_sgprs = 0;
	shader->info.num_input_vgprs = 0;

	for (i = 0; i <= last_sgpr; ++i)
		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;

	/* Unused fragment shader inputs are eliminated by the compiler,
	 * so we don't know yet how many there will be.
	 */
	if (ctx->type != PIPE_SHADER_FRAGMENT)
		for (; i < num_params; ++i)
			shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;

	/* Derivative-based opcodes need a small LDS scratch area. */
	if (bld_base->info &&
	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
		ctx->lds =
			LLVMAddGlobalInAddressSpace(gallivm->module,
						    LLVMArrayType(ctx->i32, 64),
						    "ddxy_lds",
						    LOCAL_ADDR_SPACE);

	if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
	    ctx->type == PIPE_SHADER_TESS_CTRL ||
	    ctx->type == PIPE_SHADER_TESS_EVAL)
		declare_tess_lds(ctx);
}
5235
5236 static void preload_constants(struct si_shader_context *ctx)
5237 {
5238 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5239 struct gallivm_state *gallivm = bld_base->base.gallivm;
5240 const struct tgsi_shader_info *info = bld_base->info;
5241 unsigned buf;
5242 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5243
5244 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5245 unsigned i, num_const = info->const_file_max[buf] + 1;
5246
5247 if (num_const == 0)
5248 continue;
5249
5250 /* Allocate space for the constant values */
5251 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5252
5253 /* Load the resource descriptor */
5254 ctx->const_buffers[buf] =
5255 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5256
5257 /* Load the constants, we rely on the code sinking to do the rest */
5258 for (i = 0; i < num_const * 4; ++i) {
5259 ctx->constants[buf][i] =
5260 buffer_load_const(gallivm->builder,
5261 ctx->const_buffers[buf],
5262 lp_build_const_int32(gallivm, i * 4),
5263 ctx->f32);
5264 }
5265 }
5266 }
5267
5268 static void preload_shader_buffers(struct si_shader_context *ctx)
5269 {
5270 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5271 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5272 int buf, maxbuf;
5273
5274 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5275 SI_NUM_SHADER_BUFFERS - 1);
5276 for (buf = 0; buf <= maxbuf; ++buf) {
5277 ctx->shader_buffers[buf] =
5278 build_indexed_load_const(
5279 ctx, ptr, lp_build_const_int32(gallivm, buf));
5280 }
5281 }
5282
/* Pre-load sampler view (and sampler state or FMASK) descriptors for all
 * samplers the shader uses; code sinking moves each load next to its use.
 */
static void preload_samplers(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
	LLVMValueRef offset;

	if (num_samplers == 0)
		return;

	/* Load the resources and samplers, we rely on the code sinking to do the rest */
	for (i = 0; i < num_samplers; ++i) {
		/* Resource */
		offset = lp_build_const_int32(gallivm, i);
		ctx->sampler_views[i] =
			get_sampler_desc(ctx, offset, DESC_IMAGE);

		/* FMASK resource */
		/* MSAA samplers get an FMASK descriptor and no sampler state
		 * here; non-MSAA samplers get the sampler state with the
		 * SI/CI anisotropy fixup applied. */
		if (info->is_msaa_sampler[i])
			ctx->fmasks[i] =
				get_sampler_desc(ctx, offset, DESC_FMASK);
		else {
			ctx->sampler_states[i] =
				get_sampler_desc(ctx, offset, DESC_SAMPLER);
			ctx->sampler_states[i] =
				sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
						       ctx->sampler_states[i]);
		}
	}
}
5314
5315 static void preload_images(struct si_shader_context *ctx)
5316 {
5317 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5318 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5319 struct gallivm_state *gallivm = bld_base->base.gallivm;
5320 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5321 LLVMValueRef res_ptr;
5322 unsigned i;
5323
5324 if (num_images == 0)
5325 return;
5326
5327 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5328
5329 for (i = 0; i < num_images; ++i) {
5330 /* Rely on LLVM to shrink the load for buffer resources. */
5331 LLVMValueRef rsrc =
5332 build_indexed_load_const(ctx, res_ptr,
5333 lp_build_const_int32(gallivm, i));
5334
5335 if (info->images_writemask & (1 << i) &&
5336 !(info->images_buffers & (1 << i)))
5337 rsrc = force_dcc_off(ctx, rsrc);
5338
5339 ctx->images[i] = rsrc;
5340 }
5341 }
5342
5343 static void preload_streamout_buffers(struct si_shader_context *ctx)
5344 {
5345 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5346 struct gallivm_state *gallivm = bld_base->base.gallivm;
5347 unsigned i;
5348
5349 /* Streamout can only be used if the shader is compiled as VS. */
5350 if (!ctx->shader->selector->so.num_outputs ||
5351 (ctx->type == PIPE_SHADER_VERTEX &&
5352 (ctx->shader->key.vs.as_es ||
5353 ctx->shader->key.vs.as_ls)) ||
5354 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5355 ctx->shader->key.tes.as_es))
5356 return;
5357
5358 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5359 SI_PARAM_RW_BUFFERS);
5360
5361 /* Load the resources, we rely on the code sinking to do the rest */
5362 for (i = 0; i < 4; ++i) {
5363 if (ctx->shader->selector->so.stride[i]) {
5364 LLVMValueRef offset = lp_build_const_int32(gallivm,
5365 SI_VS_STREAMOUT_BUF0 + i);
5366
5367 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5368 }
5369 }
5370 }
5371
5372 /**
5373 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5374 * for later use.
5375 */
5376 static void preload_ring_buffers(struct si_shader_context *ctx)
5377 {
5378 struct gallivm_state *gallivm =
5379 ctx->radeon_bld.soa.bld_base.base.gallivm;
5380
5381 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5382 SI_PARAM_RW_BUFFERS);
5383
5384 if ((ctx->type == PIPE_SHADER_VERTEX &&
5385 ctx->shader->key.vs.as_es) ||
5386 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5387 ctx->shader->key.tes.as_es) ||
5388 ctx->type == PIPE_SHADER_GEOMETRY) {
5389 unsigned ring =
5390 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5391 : SI_ES_RING_ESGS;
5392 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5393
5394 ctx->esgs_ring =
5395 build_indexed_load_const(ctx, buf_ptr, offset);
5396 }
5397
5398 if (ctx->is_gs_copy_shader) {
5399 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5400
5401 ctx->gsvs_ring[0] =
5402 build_indexed_load_const(ctx, buf_ptr, offset);
5403 }
5404 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5405 int i;
5406 for (i = 0; i < 4; i++) {
5407 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5408
5409 ctx->gsvs_ring[i] =
5410 build_indexed_load_const(ctx, buf_ptr, offset);
5411 }
5412 }
5413 }
5414
/* Emit code that kills the fragment when the 32x32 polygon-stipple
 * pattern bit for this fragment's window position is 0.
 *
 * param_rw_buffers:    descriptor array containing the stipple buffer
 * param_pos_fixed_pt:  parameter index of the fixed-point position VGPR
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
	desc = build_indexed_load_const(ctx, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(builder, desc, offset, ctx->i32);
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0. */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
5448
5449 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5450 struct si_shader_config *conf,
5451 unsigned symbol_offset)
5452 {
5453 unsigned i;
5454 const unsigned char *config =
5455 radeon_shader_binary_config_start(binary, symbol_offset);
5456
5457 /* XXX: We may be able to emit some of these values directly rather than
5458 * extracting fields to be emitted later.
5459 */
5460
5461 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5462 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5463 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5464 switch (reg) {
5465 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5466 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5467 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5468 case R_00B848_COMPUTE_PGM_RSRC1:
5469 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5470 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5471 conf->float_mode = G_00B028_FLOAT_MODE(value);
5472 conf->rsrc1 = value;
5473 break;
5474 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5475 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5476 break;
5477 case R_00B84C_COMPUTE_PGM_RSRC2:
5478 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5479 conf->rsrc2 = value;
5480 break;
5481 case R_0286CC_SPI_PS_INPUT_ENA:
5482 conf->spi_ps_input_ena = value;
5483 break;
5484 case R_0286D0_SPI_PS_INPUT_ADDR:
5485 conf->spi_ps_input_addr = value;
5486 break;
5487 case R_0286E8_SPI_TMPRING_SIZE:
5488 case R_00B860_COMPUTE_TMPRING_SIZE:
5489 /* WAVESIZE is in units of 256 dwords. */
5490 conf->scratch_bytes_per_wave =
5491 G_00B860_WAVESIZE(value) * 256 * 4 * 1;
5492 break;
5493 default:
5494 {
5495 static bool printed;
5496
5497 if (!printed) {
5498 fprintf(stderr, "Warning: LLVM emitted unknown "
5499 "config register: 0x%x\n", reg);
5500 printed = true;
5501 }
5502 }
5503 break;
5504 }
5505
5506 if (!conf->spi_ps_input_addr)
5507 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5508 }
5509 }
5510
5511 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5512 struct si_shader *shader,
5513 struct si_shader_config *config,
5514 uint64_t scratch_va)
5515 {
5516 unsigned i;
5517 uint32_t scratch_rsrc_dword0 = scratch_va;
5518 uint32_t scratch_rsrc_dword1 =
5519 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
5520 | S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5521
5522 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5523 const struct radeon_shader_reloc *reloc =
5524 &shader->binary.relocs[i];
5525 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5526 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5527 &scratch_rsrc_dword0, 4);
5528 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5529 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5530 &scratch_rsrc_dword1, 4);
5531 }
5532 }
5533 }
5534
5535 static unsigned si_get_shader_binary_size(struct si_shader *shader)
5536 {
5537 unsigned size = shader->binary.code_size;
5538
5539 if (shader->prolog)
5540 size += shader->prolog->binary.code_size;
5541 if (shader->epilog)
5542 size += shader->epilog->binary.code_size;
5543 return size;
5544 }
5545
5546 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5547 {
5548 const struct radeon_shader_binary *prolog =
5549 shader->prolog ? &shader->prolog->binary : NULL;
5550 const struct radeon_shader_binary *epilog =
5551 shader->epilog ? &shader->epilog->binary : NULL;
5552 const struct radeon_shader_binary *mainb = &shader->binary;
5553 unsigned bo_size = si_get_shader_binary_size(shader) +
5554 (!epilog ? mainb->rodata_size : 0);
5555 unsigned char *ptr;
5556
5557 assert(!prolog || !prolog->rodata_size);
5558 assert((!prolog && !epilog) || !mainb->rodata_size);
5559 assert(!epilog || !epilog->rodata_size);
5560
5561 r600_resource_reference(&shader->bo, NULL);
5562 shader->bo = si_resource_create_custom(&sscreen->b.b,
5563 PIPE_USAGE_IMMUTABLE,
5564 bo_size);
5565 if (!shader->bo)
5566 return -ENOMEM;
5567
5568 /* Upload. */
5569 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
5570 PIPE_TRANSFER_READ_WRITE);
5571
5572 if (prolog) {
5573 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
5574 ptr += prolog->code_size;
5575 }
5576
5577 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
5578 ptr += mainb->code_size;
5579
5580 if (epilog)
5581 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
5582 else if (mainb->rodata_size > 0)
5583 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
5584
5585 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5586 return 0;
5587 }
5588
5589 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
5590 struct pipe_debug_callback *debug,
5591 const char *name, FILE *file)
5592 {
5593 char *line, *p;
5594 unsigned i, count;
5595
5596 if (binary->disasm_string) {
5597 fprintf(file, "Shader %s disassembly:\n", name);
5598 fprintf(file, "%s", binary->disasm_string);
5599
5600 if (debug && debug->debug_message) {
5601 /* Very long debug messages are cut off, so send the
5602 * disassembly one line at a time. This causes more
5603 * overhead, but on the plus side it simplifies
5604 * parsing of resulting logs.
5605 */
5606 pipe_debug_message(debug, SHADER_INFO,
5607 "Shader Disassembly Begin");
5608
5609 line = binary->disasm_string;
5610 while (*line) {
5611 p = util_strchrnul(line, '\n');
5612 count = p - line;
5613
5614 if (count) {
5615 pipe_debug_message(debug, SHADER_INFO,
5616 "%.*s", count, line);
5617 }
5618
5619 if (!*p)
5620 break;
5621 line = p + 1;
5622 }
5623
5624 pipe_debug_message(debug, SHADER_INFO,
5625 "Shader Disassembly End");
5626 }
5627 } else {
5628 fprintf(file, "Shader %s binary:\n", name);
5629 for (i = 0; i < binary->code_size; i += 4) {
5630 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5631 binary->code[i + 3], binary->code[i + 2],
5632 binary->code[i + 1], binary->code[i]);
5633 }
5634 }
5635 }
5636
5637 static void si_shader_dump_stats(struct si_screen *sscreen,
5638 struct si_shader_config *conf,
5639 unsigned num_inputs,
5640 unsigned code_size,
5641 struct pipe_debug_callback *debug,
5642 unsigned processor,
5643 FILE *file)
5644 {
5645 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
5646 unsigned lds_per_wave = 0;
5647 unsigned max_simd_waves = 10;
5648
5649 /* Compute LDS usage for PS. */
5650 if (processor == PIPE_SHADER_FRAGMENT) {
5651 /* The minimum usage per wave is (num_inputs * 48). The maximum
5652 * usage is (num_inputs * 48 * 16).
5653 * We can get anything in between and it varies between waves.
5654 *
5655 * The 48 bytes per input for a single primitive is equal to
5656 * 4 bytes/component * 4 components/input * 3 points.
5657 *
5658 * Other stages don't know the size at compile time or don't
5659 * allocate LDS per wave, but instead they do it per thread group.
5660 */
5661 lds_per_wave = conf->lds_size * lds_increment +
5662 align(num_inputs * 48, lds_increment);
5663 }
5664
5665 /* Compute the per-SIMD wave counts. */
5666 if (conf->num_sgprs) {
5667 if (sscreen->b.chip_class >= VI)
5668 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
5669 else
5670 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
5671 }
5672
5673 if (conf->num_vgprs)
5674 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5675
5676 /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
5677 * that PS can use.
5678 */
5679 if (lds_per_wave)
5680 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5681
5682 if (file != stderr ||
5683 r600_can_dump_shader(&sscreen->b, processor)) {
5684 if (processor == PIPE_SHADER_FRAGMENT) {
5685 fprintf(file, "*** SHADER CONFIG ***\n"
5686 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5687 "SPI_PS_INPUT_ENA = 0x%04x\n",
5688 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5689 }
5690
5691 fprintf(file, "*** SHADER STATS ***\n"
5692 "SGPRS: %d\n"
5693 "VGPRS: %d\n"
5694 "Code Size: %d bytes\n"
5695 "LDS: %d blocks\n"
5696 "Scratch: %d bytes per wave\n"
5697 "Max Waves: %d\n"
5698 "********************\n",
5699 conf->num_sgprs, conf->num_vgprs, code_size,
5700 conf->lds_size, conf->scratch_bytes_per_wave,
5701 max_simd_waves);
5702 }
5703
5704 pipe_debug_message(debug, SHADER_INFO,
5705 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5706 "LDS: %d Scratch: %d Max Waves: %d",
5707 conf->num_sgprs, conf->num_vgprs, code_size,
5708 conf->lds_size, conf->scratch_bytes_per_wave,
5709 max_simd_waves);
5710 }
5711
5712 static const char *si_get_shader_name(struct si_shader *shader,
5713 unsigned processor)
5714 {
5715 switch (processor) {
5716 case PIPE_SHADER_VERTEX:
5717 if (shader->key.vs.as_es)
5718 return "Vertex Shader as ES";
5719 else if (shader->key.vs.as_ls)
5720 return "Vertex Shader as LS";
5721 else
5722 return "Vertex Shader as VS";
5723 case PIPE_SHADER_TESS_CTRL:
5724 return "Tessellation Control Shader";
5725 case PIPE_SHADER_TESS_EVAL:
5726 if (shader->key.tes.as_es)
5727 return "Tessellation Evaluation Shader as ES";
5728 else
5729 return "Tessellation Evaluation Shader as VS";
5730 case PIPE_SHADER_GEOMETRY:
5731 if (shader->gs_copy_shader == NULL)
5732 return "GS Copy Shader as VS";
5733 else
5734 return "Geometry Shader";
5735 case PIPE_SHADER_FRAGMENT:
5736 return "Pixel Shader";
5737 case PIPE_SHADER_COMPUTE:
5738 return "Compute Shader";
5739 default:
5740 return "Unknown Shader";
5741 }
5742 }
5743
5744 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
5745 struct pipe_debug_callback *debug, unsigned processor,
5746 FILE *file)
5747 {
5748 if (file != stderr ||
5749 (r600_can_dump_shader(&sscreen->b, processor) &&
5750 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5751 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5752
5753 if (shader->prolog)
5754 si_shader_dump_disassembly(&shader->prolog->binary,
5755 debug, "prolog", file);
5756
5757 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5758
5759 if (shader->epilog)
5760 si_shader_dump_disassembly(&shader->epilog->binary,
5761 debug, "epilog", file);
5762 fprintf(file, "\n");
5763 }
5764
5765 si_shader_dump_stats(sscreen, &shader->config,
5766 shader->selector ? shader->selector->info.num_inputs : 0,
5767 si_get_shader_binary_size(shader), debug, processor,
5768 file);
5769 }
5770
/* Compile an LLVM module to GCN machine code and parse the hardware
 * shader config (register counts, LDS, scratch) out of the binary.
 *
 * \param binary     receives the compiled shader code and metadata
 * \param conf       receives the parsed hardware shader config
 * \param tm         LLVM target machine to compile with
 * \param mod        LLVM module to compile
 * \param processor  PIPE_SHADER_* stage (debug dumps + rodata check)
 * \param name       human-readable shader name used in debug output
 * \return 0 on success, a negative value on failure
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	/* Global compilation counter; also identifies this shader for
	 * si_replace_shader below. */
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	/* Only compile if si_replace_shader didn't substitute a
	 * pre-built binary for this compilation number. */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary,
			r600_get_llvm_processor_name(sscreen->b.family), tm,
			debug);
		if (r)
			return r;
	}

	/* Parse the config registers embedded in the binary. */
	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The config data was consumed above; drop metadata that is no
	 * longer needed. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
5836
5837 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5838 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
5839 struct si_shader_context *ctx,
5840 struct si_shader *gs,
5841 struct pipe_debug_callback *debug)
5842 {
5843 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5844 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5845 struct lp_build_context *uint = &bld_base->uint_bld;
5846 struct si_shader_output_values *outputs;
5847 struct tgsi_shader_info *gsinfo = &gs->selector->info;
5848 LLVMValueRef args[9];
5849 int i, r;
5850
5851 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5852
5853 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
5854 ctx->type = PIPE_SHADER_VERTEX;
5855 ctx->is_gs_copy_shader = true;
5856
5857 create_meta_data(ctx);
5858 create_function(ctx);
5859 preload_streamout_buffers(ctx);
5860 preload_ring_buffers(ctx);
5861
5862 args[0] = ctx->gsvs_ring[0];
5863 args[1] = lp_build_mul_imm(uint,
5864 LLVMGetParam(ctx->radeon_bld.main_fn,
5865 ctx->param_vertex_id),
5866 4);
5867 args[3] = uint->zero;
5868 args[4] = uint->one; /* OFFEN */
5869 args[5] = uint->zero; /* IDXEN */
5870 args[6] = uint->one; /* GLC */
5871 args[7] = uint->one; /* SLC */
5872 args[8] = uint->zero; /* TFE */
5873
5874 /* Fetch vertex data from GSVS ring */
5875 for (i = 0; i < gsinfo->num_outputs; ++i) {
5876 unsigned chan;
5877
5878 outputs[i].name = gsinfo->output_semantic_name[i];
5879 outputs[i].sid = gsinfo->output_semantic_index[i];
5880
5881 for (chan = 0; chan < 4; chan++) {
5882 args[2] = lp_build_const_int32(gallivm,
5883 (i * 4 + chan) *
5884 gs->selector->gs_max_out_vertices * 16 * 4);
5885
5886 outputs[i].values[chan] =
5887 LLVMBuildBitCast(gallivm->builder,
5888 lp_build_intrinsic(gallivm->builder,
5889 "llvm.SI.buffer.load.dword.i32.i32",
5890 ctx->i32, args, 9,
5891 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
5892 ctx->f32, "");
5893 }
5894 }
5895
5896 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
5897
5898 LLVMBuildRet(gallivm->builder, ctx->return_value);
5899
5900 /* Dump LLVM IR before any optimization passes */
5901 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
5902 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
5903 LLVMDumpModule(bld_base->base.gallivm->module);
5904
5905 radeon_llvm_finalize_module(&ctx->radeon_bld);
5906
5907 r = si_compile_llvm(sscreen, &ctx->shader->binary,
5908 &ctx->shader->config, ctx->tm,
5909 bld_base->base.gallivm->module,
5910 debug, PIPE_SHADER_GEOMETRY,
5911 "GS Copy Shader");
5912 if (!r) {
5913 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
5914 fprintf(stderr, "GS Copy Shader:\n");
5915 si_shader_dump(sscreen, ctx->shader, debug,
5916 PIPE_SHADER_GEOMETRY, stderr);
5917 r = si_shader_binary_upload(sscreen, ctx->shader);
5918 }
5919
5920 radeon_llvm_dispose(&ctx->radeon_bld);
5921
5922 FREE(outputs);
5923 return r;
5924 }
5925
5926 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
5927 {
5928 int i;
5929
5930 fprintf(f, "SHADER KEY\n");
5931
5932 switch (shader) {
5933 case PIPE_SHADER_VERTEX:
5934 fprintf(f, " instance_divisors = {");
5935 for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
5936 fprintf(f, !i ? "%u" : ", %u",
5937 key->vs.prolog.instance_divisors[i]);
5938 fprintf(f, "}\n");
5939 fprintf(f, " as_es = %u\n", key->vs.as_es);
5940 fprintf(f, " as_ls = %u\n", key->vs.as_ls);
5941 fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
5942 break;
5943
5944 case PIPE_SHADER_TESS_CTRL:
5945 fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
5946 break;
5947
5948 case PIPE_SHADER_TESS_EVAL:
5949 fprintf(f, " as_es = %u\n", key->tes.as_es);
5950 fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
5951 break;
5952
5953 case PIPE_SHADER_GEOMETRY:
5954 case PIPE_SHADER_COMPUTE:
5955 break;
5956
5957 case PIPE_SHADER_FRAGMENT:
5958 fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
5959 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
5960 fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
5961 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
5962 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
5963 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
5964 fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
5965 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
5966 fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
5967 fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
5968 break;
5969
5970 default:
5971 assert(0);
5972 }
5973 }
5974
/* Initialize a si_shader_context for compiling one shader.
 *
 * Sets up the LLVM/gallivm context, caches commonly used LLVM types, and
 * installs the TGSI opcode actions shared by all shader stages. Stage-
 * specific callbacks are installed later by the callers.
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;
	struct lp_build_tgsi_action tmpl = {};

	memset(ctx, 0, sizeof(*ctx));
	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
	ctx->tm = tm;
	ctx->screen = sscreen;
	if (shader && shader->selector)
		ctx->type = shader->selector->info.processor;
	else
		ctx->type = -1; /* unknown; callers may set it explicitly */
	ctx->shader = shader;

	/* Cache frequently used LLVM types. */
	ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
	ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);

	bld_base = &ctx->radeon_bld.soa.bld_base;
	if (shader && shader->selector)
		bld_base->info = &shader->selector->info;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes. */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	/* Texture opcodes all share one action. */
	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;

	/* Image/buffer memory opcodes. */
	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;

	/* Atomics: copy the shared template first, then override the
	 * per-opcode intrinsic name (order of the two stores matters). */
	tmpl.fetch_args = atomic_fetch_args;
	tmpl.emit = atomic_emit;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	/* Derivatives. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Geometry shader vertex/primitive emission and barriers. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;

	/* MIN/MAX map to IEEE minnum/maxnum intrinsics. */
	bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
	bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
}
6075
/* Compile a TGSI shader of any stage to GCN machine code.
 *
 * \param tm             LLVM target machine
 * \param shader         shader to compile (binary/config filled on success)
 * \param is_monolithic  true when prolog/epilog are compiled into the main
 *                       part instead of being separate parts
 * \return 0 on success, negative value on failure
 */
int si_compile_tgsi_shader(struct si_screen *sscreen,
			   LLVMTargetMachineRef tm,
			   struct si_shader *shader,
			   bool is_monolithic,
			   struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader_context ctx;
	struct lp_build_tgsi_context *bld_base;
	LLVMModuleRef mod;
	int r = 0;

	/* Dump TGSI code before doing TGSI->LLVM conversion in case the
	 * conversion fails. */
	if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
	    !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
		si_dump_shader_key(sel->type, &shader->key, stderr);
		tgsi_dump(sel->tokens, 0);
		si_dump_streamout(&sel->so);
	}

	si_init_shader_ctx(&ctx, sscreen, shader, tm);
	ctx.is_monolithic = is_monolithic;

	shader->info.uses_instanceid = sel->info.uses_instanceid;

	bld_base = &ctx.radeon_bld.soa.bld_base;
	ctx.radeon_bld.load_system_value = declare_system_value;

	/* Install stage-specific input-fetch and epilogue callbacks. */
	switch (ctx.type) {
	case PIPE_SHADER_VERTEX:
		ctx.radeon_bld.load_input = declare_input_vs;
		/* The epilogue depends on which hardware stage the VS runs as. */
		if (shader->key.vs.as_ls)
			bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
		else if (shader->key.vs.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_TESS_CTRL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
		bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
		bld_base->emit_store = store_output_tcs;
		bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
		break;
	case PIPE_SHADER_TESS_EVAL:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
		if (shader->key.tes.as_es)
			bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
		break;
	case PIPE_SHADER_GEOMETRY:
		bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
		bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
		break;
	case PIPE_SHADER_FRAGMENT:
		ctx.radeon_bld.load_input = declare_input_fs;
		/* Monolithic PS exports colors directly; otherwise outputs
		 * are returned to a separately compiled epilog part. */
		if (is_monolithic)
			bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
		else
			bld_base->emit_epilogue = si_llvm_return_fs_outputs;
		break;
	case PIPE_SHADER_COMPUTE:
		ctx.radeon_bld.declare_memory_region = declare_compute_memory;
		break;
	default:
		assert(!"Unsupported shader type");
		return -1;
	}

	create_meta_data(&ctx);
	create_function(&ctx);
	preload_constants(&ctx);
	preload_shader_buffers(&ctx);
	preload_samplers(&ctx);
	preload_images(&ctx);
	preload_streamout_buffers(&ctx);
	preload_ring_buffers(&ctx);

	/* Monolithic PS with polygon stipple: emit the stipple test at the
	 * top of the shader (otherwise it lives in the prolog part). */
	if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
	    shader->key.ps.prolog.poly_stipple) {
		LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
						 SI_PARAM_RW_BUFFERS);
		si_llvm_emit_polygon_stipple(&ctx, list,
					     SI_PARAM_POS_FIXED_PT);
	}

	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		/* Per-stream counters of emitted vertices. */
		int i;
		for (i = 0; i < 4; i++) {
			ctx.gs_next_vertex[i] =
				lp_build_alloca(bld_base->base.gallivm,
						ctx.i32, "");
		}
	}

	if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
		fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
		goto out;
	}

	LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
	mod = bld_base->base.gallivm->module;

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, ctx.type))
		LLVMDumpModule(mod);

	radeon_llvm_finalize_module(&ctx.radeon_bld);

	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
			    mod, debug, ctx.type, "TGSI shader");
	if (r) {
		fprintf(stderr, "LLVM failed to compile shader\n");
		goto out;
	}

	radeon_llvm_dispose(&ctx.radeon_bld);

	/* Add the scratch offset to input SGPRs. */
	if (shader->config.scratch_bytes_per_wave)
		shader->info.num_input_sgprs += 1; /* scratch byte offset */

	/* Calculate the number of fragment input VGPRs by walking the
	 * SPI_PS_INPUT_ADDR bits in hardware order; each enabled input
	 * occupies a fixed number of VGPRs. */
	if (ctx.type == PIPE_SHADER_FRAGMENT) {
		shader->info.num_input_vgprs = 0;
		shader->info.face_vgpr_index = -1; /* -1 = no front-face input */

		if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 3;
		if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 2;
		if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
			shader->info.face_vgpr_index = shader->info.num_input_vgprs;
			shader->info.num_input_vgprs += 1;
		}
		if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
		if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
			shader->info.num_input_vgprs += 1;
	}

	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		/* A GS always needs a companion copy shader for the VS stage.
		 * NOTE(review): CALLOC_STRUCT result is not NULL-checked here;
		 * an OOM would crash on the next line — confirm policy. */
		shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
		shader->gs_copy_shader->selector = shader->selector;
		ctx.shader = shader->gs_copy_shader;
		if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
						    shader, debug))) {
			free(shader->gs_copy_shader);
			shader->gs_copy_shader = NULL;
			goto out;
		}
	}

out:
	for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
		FREE(ctx.constants[i]);
	return r;
}
6259
6260 /**
6261 * Create, compile and return a shader part (prolog or epilog).
6262 *
6263 * \param sscreen screen
6264 * \param list list of shader parts of the same category
6265 * \param key shader part key
6266 * \param tm LLVM target machine
6267 * \param debug debug callback
6268 * \param compile the callback responsible for compilation
6269 * \return non-NULL on success
6270 */
6271 static struct si_shader_part *
6272 si_get_shader_part(struct si_screen *sscreen,
6273 struct si_shader_part **list,
6274 union si_shader_part_key *key,
6275 LLVMTargetMachineRef tm,
6276 struct pipe_debug_callback *debug,
6277 bool (*compile)(struct si_screen *,
6278 LLVMTargetMachineRef,
6279 struct pipe_debug_callback *,
6280 struct si_shader_part *))
6281 {
6282 struct si_shader_part *result;
6283
6284 pipe_mutex_lock(sscreen->shader_parts_mutex);
6285
6286 /* Find existing. */
6287 for (result = *list; result; result = result->next) {
6288 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6289 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6290 return result;
6291 }
6292 }
6293
6294 /* Compile a new one. */
6295 result = CALLOC_STRUCT(si_shader_part);
6296 result->key = *key;
6297 if (!compile(sscreen, tm, debug, result)) {
6298 FREE(result);
6299 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6300 return NULL;
6301 }
6302
6303 result->next = *list;
6304 *list = result;
6305 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6306 return result;
6307 }
6308
6309 /**
6310 * Create a vertex shader prolog.
6311 *
6312 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6313 * All inputs are returned unmodified. The vertex load indices are
6314 * stored after them, which will used by the API VS for fetching inputs.
6315 *
6316 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6317 * input_v0,
6318 * input_v1,
6319 * input_v2,
6320 * input_v3,
6321 * (VertexID + BaseVertex),
6322 * (InstanceID + StartInstance),
6323 * (InstanceID / 2 + StartInstance)
6324 */
static bool si_compile_vs_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Taking the address of a ctx member before si_init_shader_ctx is
	 * safe; the pointer stays valid across the memset/init below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params, *returns;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_VERTEX;
	/* VGPR parameter indices of VertexID and InstanceID, which follow
	 * the input SGPRs (VertexID first, InstanceID 3 slots later). */
	ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
	ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;

	/* 4 preloaded VGPRs + vertex load indices as prolog outputs */
	params = alloca((key->vs_prolog.num_input_sgprs + 4) *
			sizeof(LLVMTypeRef));
	returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
			  key->vs_prolog.last_input + 1) *
			 sizeof(LLVMTypeRef));
	num_params = 0;
	num_returns = 0;

	/* Declare input and output SGPRs. */
	/* NOTE(review): this re-initialization of num_params is redundant
	 * (already set to 0 above) but harmless. */
	num_params = 0;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		params[num_params++] = ctx.i32;
		returns[num_returns++] = ctx.i32;
	}
	last_sgpr = num_params - 1;

	/* 4 preloaded VGPRs (outputs must be floats) */
	for (i = 0; i < 4; i++) {
		params[num_params++] = ctx.i32;
		returns[num_returns++] = ctx.f32;
	}

	/* Vertex load indices. */
	for (i = 0; i <= key->vs_prolog.last_input; i++)
		returns[num_returns++] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, returns, num_returns, params,
			   num_params, -1, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}
	for (i = num_params - 4; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		/* VGPR outputs are declared as floats; bitcast from i32. */
		p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Compute vertex load indices from instance divisors. */
	for (i = 0; i <= key->vs_prolog.last_input; i++) {
		unsigned divisor = key->vs_prolog.states.instance_divisors[i];
		LLVMValueRef index;

		if (divisor) {
			/* InstanceID / Divisor + StartInstance */
			index = get_instance_index_for_fetch(&ctx.radeon_bld,
							     SI_SGPR_START_INSTANCE,
							     divisor);
		} else {
			/* VertexID + BaseVertex */
			index = LLVMBuildAdd(gallivm->builder,
					     LLVMGetParam(func, ctx.param_vertex_id),
					     LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
		}

		index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
		ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
					   num_params++, "");
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6424
6425 /**
6426 * Compile the vertex shader epilog. This is also used by the tessellation
6427 * evaluation shader compiled as VS.
6428 *
6429 * The input is PrimitiveID.
6430 *
6431 * If PrimitiveID is required by the pixel shader, export it.
6432 * Otherwise, do nothing.
6433 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Only addresses are taken here; the members become valid to use
	 * once si_init_shader_ctx() has run below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. Everything up to and including the
	 * PrimitiveID slot is declared, but only when the export is
	 * actually requested; otherwise the epilog has no inputs. */
	num_params = key->vs_epilog.states.export_prim_id ?
			 (VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   -1, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		LLVMValueRef args[9];

		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		/* Target: the PARAM slot reserved for PrimitiveID by
		 * si_get_vs_epilog(). */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6497
6498 /**
6499 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
6500 */
6501 static bool si_get_vs_epilog(struct si_screen *sscreen,
6502 LLVMTargetMachineRef tm,
6503 struct si_shader *shader,
6504 struct pipe_debug_callback *debug,
6505 struct si_vs_epilog_bits *states)
6506 {
6507 union si_shader_part_key epilog_key;
6508
6509 memset(&epilog_key, 0, sizeof(epilog_key));
6510 epilog_key.vs_epilog.states = *states;
6511
6512 /* Set up the PrimitiveID output. */
6513 if (shader->key.vs.epilog.export_prim_id) {
6514 unsigned index = shader->selector->info.num_outputs;
6515 unsigned offset = shader->info.nr_param_exports++;
6516
6517 epilog_key.vs_epilog.prim_id_param_offset = offset;
6518 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6519 shader->info.vs_output_param_offset[index] = offset;
6520 }
6521
6522 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
6523 &epilog_key, tm, debug,
6524 si_compile_vs_epilog);
6525 return shader->epilog != NULL;
6526 }
6527
6528 /**
6529 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6530 */
6531 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6532 LLVMTargetMachineRef tm,
6533 struct si_shader *shader,
6534 struct pipe_debug_callback *debug)
6535 {
6536 struct tgsi_shader_info *info = &shader->selector->info;
6537 union si_shader_part_key prolog_key;
6538 unsigned i;
6539
6540 /* Get the prolog. */
6541 memset(&prolog_key, 0, sizeof(prolog_key));
6542 prolog_key.vs_prolog.states = shader->key.vs.prolog;
6543 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6544 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6545
6546 /* The prolog is a no-op if there are no inputs. */
6547 if (info->num_inputs) {
6548 shader->prolog =
6549 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6550 &prolog_key, tm, debug,
6551 si_compile_vs_prolog);
6552 if (!shader->prolog)
6553 return false;
6554 }
6555
6556 /* Get the epilog. */
6557 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
6558 !si_get_vs_epilog(sscreen, tm, shader, debug,
6559 &shader->key.vs.epilog))
6560 return false;
6561
6562 /* Set the instanceID flag. */
6563 for (i = 0; i < info->num_inputs; i++)
6564 if (prolog_key.vs_prolog.states.instance_divisors[i])
6565 shader->info.uses_instanceid = true;
6566
6567 return true;
6568 }
6569
6570 /**
6571 * Select and compile (or reuse) TES parts (epilog).
6572 */
6573 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
6574 LLVMTargetMachineRef tm,
6575 struct si_shader *shader,
6576 struct pipe_debug_callback *debug)
6577 {
6578 if (shader->key.tes.as_es)
6579 return true;
6580
6581 /* TES compiled as VS. */
6582 return si_get_vs_epilog(sscreen, tm, shader, debug,
6583 &shader->key.tes.epilog);
6584 }
6585
/**
 * Compile the TCS epilog. This writes tessellation factors to memory based on
 * the output primitive type of the tessellator (determined by TES).
 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only addresses are taken here; the members become valid to use
	 * once si_init_shader_ctx() has run below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_array_pointer, last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used.
	 * NOTE(review): the remaining SGPRs appear to be declared only so
	 * the register layout matches the main TCS part — confirm. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	last_array_pointer = SI_PARAM_RW_BUFFERS;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors out; the three VGPRs declared above are
	 * passed in declaration order. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6650
6651 /**
6652 * Select and compile (or reuse) TCS parts (epilog).
6653 */
6654 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6655 LLVMTargetMachineRef tm,
6656 struct si_shader *shader,
6657 struct pipe_debug_callback *debug)
6658 {
6659 union si_shader_part_key epilog_key;
6660
6661 /* Get the epilog. */
6662 memset(&epilog_key, 0, sizeof(epilog_key));
6663 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
6664
6665 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6666 &epilog_key, tm, debug,
6667 si_compile_tcs_epilog);
6668 return shader->epilog != NULL;
6669 }
6670
6671 /**
6672 * Compile the pixel shader prolog. This handles:
6673 * - two-side color selection and interpolation
6674 * - overriding interpolation parameters for the API PS
6675 * - polygon stippling
6676 *
6677 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
6679 * Interpolated colors are stored after the preloaded VGPRs.
6680 */
static bool si_compile_ps_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Address-of only; valid to dereference after si_init_shader_ctx(). */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.prolog = key->ps_prolog.states;

	/* Number of inputs + 8 color elements.  The same array doubles as
	 * the return-type list below, hence the extra room. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx.i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx.f32;

	/* Declare outputs (same as inputs + add colors if needed) */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx.f32;

	/* Create the function.  "params" is intentionally passed both as the
	 * return list and the parameter list: the returns are a superset of
	 * the inputs (inputs followed by interpolated color channels). */
	si_create_function(&ctx, params, num_returns, params,
			   num_params, -1, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], list;

		/* Get the pointer to rw buffers. */
		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
		list = lp_build_gather_values(gallivm, ptr, 2);
		list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
		list = LLVMBuildIntToPtr(gallivm->builder, list,
					 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");

		si_llvm_emit_polygon_stipple(&ctx, list, pos);
	}

	/* Interpolate colors: up to two color inputs (COLOR0/COLOR1),
	 * 4 channels each, selected by the colors_read mask. */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Gather the two interpolation coefficient VGPRs
			 * (I, J) into one v2i32 value. */
			interp[0] = LLVMGetParam(func, interp_vgpr);
			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
						     ctx.v2i32, "");
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
		}

		interp_fs_input(&ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append each interpolated channel after the pass-through
		 * inputs in the return value. */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Force per-sample interpolation by replacing the CENTER and
	 * CENTROID (I, J) pairs with the SAMPLE pair in the outputs.
	 * VGPR layout (relative to the first input VGPR): PERSP_SAMPLE at
	 * +0, PERSP_CENTER at +2, PERSP_CENTROID at +4, LINEAR_SAMPLE at
	 * +6, LINEAR_CENTER at +8, LINEAR_CENTROID at +10. */
	if (key->ps_prolog.states.force_persample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2], linear_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6837
6838 /**
6839 * Compile the pixel shader epilog. This handles everything that must be
6840 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
6841 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Address-of only; valid to dereference after si_init_shader_ctx(). */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* SGPRs + up to 8 MRTs * 4 channels + depth/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_array_pointer, last_sgpr, num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_array_pointer = -1;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Guarantee the minimum parameter count so the sample-mask VGPR
	 * slot always exists, even when few outputs are written. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, but only when no depth/stencil/
	 * samplemask export will provide the DONE marker instead. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		/* num_params - 1 is the last input VGPR's index.
		 * NOTE(review): presumably the sample-mask source parameter —
		 * confirm against si_export_mrt_color()'s signature. */
		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	/* Something must be exported; fall back to a null export when
	 * neither Z/stencil/mask nor any color export was emitted. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6955
6956 /**
6957 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
6958 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* For each color input, pick the interpolation VGPR pair the
		 * prolog must use and enable the matching SPI input.  The
		 * indices match si_compile_ps_prolog's layout: PERSP_SAMPLE
		 * at 0, PERSP_CENTER at 2, PERSP_CENTROID at 4, and the
		 * LINEAR equivalents at 6/8/10; -1 means no interpolation. */
		for (i = 0; i < 2; i++) {
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			/* Force per-sample interpolation for the colors here. */
			if (shader->key.ps.prolog.force_persample_interp)
				location = TGSI_INTERPOLATE_LOC_SAMPLE;

			switch (info->input_interpolate[color[i]]) {
			case TGSI_INTERPOLATE_CONSTANT:
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persample_interp ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed:
	 * replace any CENTER/CENTROID enables by SAMPLE. */
	if (shader->key.ps.prolog.force_persample_interp) {
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
		}
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
		}
	}

	/* POS_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7122
7123 static void si_fix_num_sgprs(struct si_shader *shader)
7124 {
7125 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7126
7127 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7128 }
7129
/**
 * Create a shader variant: either compile the TGSI monolithically, or
 * reuse the precompiled main part and attach prolog/epilog parts, then
 * upload the final binary.
 *
 * Returns 0 on success, a non-zero error code on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over.  The binary is
		 * shared with (and owned by) the main part, which is why
		 * is_binary_shared must be set — see si_shader_destroy(). */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs.  GS has no parts, hence no
		 * case for it here. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts to cover the parts too. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7233
7234 void si_shader_destroy(struct si_shader *shader)
7235 {
7236 if (shader->gs_copy_shader) {
7237 si_shader_destroy(shader->gs_copy_shader);
7238 FREE(shader->gs_copy_shader);
7239 }
7240
7241 if (shader->scratch_bo)
7242 r600_resource_reference(&shader->scratch_bo, NULL);
7243
7244 r600_resource_reference(&shader->bo, NULL);
7245
7246 if (!shader->is_binary_shared)
7247 radeon_shader_binary_clean(&shader->binary);
7248 }