radeonsi: Implement ddx/ddy on VI using ds_bpermute
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "radeon/r600_cs.h"
37 #include "radeon/radeon_llvm.h"
38 #include "radeon/radeon_elf_util.h"
39 #include "radeon/radeon_llvm_emit.h"
40 #include "util/u_memory.h"
41 #include "util/u_pstipple.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* ELF symbol names for the two dwords of the scratch buffer resource
 * descriptor.  NOTE(review): presumably these are patched with the real
 * scratch-buffer address when the binary is uploaded — confirm against
 * the ELF/upload code that consumes them. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
/* One shader output slot: its four channel values plus the TGSI semantic
 * identifying it. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one LLVM value per component (x, y, z, w) */
	unsigned name;		/* TGSI_SEMANTIC_* name */
	unsigned sid;		/* semantic index (e.g. which GENERIC) */
};
66
/* Per-compilation state for translating one TGSI shader to LLVM IR.
 * radeon_bld must remain the first member: si_shader_context() recovers
 * this struct by casting the embedded lp_build_tgsi_context pointer. */
struct si_shader_context
{
	struct radeon_llvm_context radeon_bld;	/* must be first (see cast above) */
	struct si_shader *shader;		/* the shader being compiled */
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of main-function parameters.
	 * NOTE(review): which of these are valid depends on ctx->type
	 * (e.g. param_tes_* only for tessellation evaluation). */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;

	LLVMTargetMachineRef tm;	/* target machine for code generation */

	LLVMValueRef const_md;		/* metadata attached to constant loads
					 * (see build_indexed_load_const) */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;		/* pointer used for LDS loads/stores */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];	/* streamout buffer descriptors */
	LLVMValueRef esgs_ring;		/* ES->GS ring descriptor (see fetch_input_gs) */
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types, cached once at context setup. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
127
/* Recover the enclosing si_shader_context from a lp_build_tgsi_context
 * pointer.  NOTE(review): relies on radeon_bld being the first member and
 * the bld_base living at offset 0 inside it — confirm in radeon_llvm.h. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	void *opaque = bld_base;

	return (struct si_shader_context *)opaque;
}
133
134 static void si_init_shader_ctx(struct si_shader_context *ctx,
135 struct si_screen *sscreen,
136 struct si_shader *shader,
137 LLVMTargetMachineRef tm);
138
/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

/* Base indices of the interpolation (i,j) inputs, plus the per-location
 * offsets added to them.  NOTE(review): presumably VGPR pair indices for
 * SAMPLE/CENTER/CENTROID — confirm against the SPI input mapping. */
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
#define CENTROID_OFSET 4	/* NOTE(review): misspelled "OFSET"; kept as-is
				 * because other code in this file uses it */

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* LLVM pointer address-space numbers used when declaring pointers. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* s_sendmsg message types and the GS opcode field (bits 5:4). */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
169
170 /**
171 * Returns a unique index for a semantic name and index. The index must be
172 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
173 * calculated.
174 */
175 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
176 {
177 switch (semantic_name) {
178 case TGSI_SEMANTIC_POSITION:
179 return 0;
180 case TGSI_SEMANTIC_PSIZE:
181 return 1;
182 case TGSI_SEMANTIC_CLIPDIST:
183 assert(index <= 1);
184 return 2 + index;
185 case TGSI_SEMANTIC_GENERIC:
186 if (index <= 63-4)
187 return 4 + index;
188 else
189 /* same explanation as in the default statement,
190 * the only user hitting this is st/nine.
191 */
192 return 0;
193
194 /* patch indices are completely separate and thus start from 0 */
195 case TGSI_SEMANTIC_TESSOUTER:
196 return 0;
197 case TGSI_SEMANTIC_TESSINNER:
198 return 1;
199 case TGSI_SEMANTIC_PATCH:
200 return 2 + index;
201
202 default:
203 /* Don't fail here. The result of this function is only used
204 * for LS, TCS, TES, and GS, where legacy GL semantics can't
205 * occur, but this function is called for all vertex shaders
206 * before it's known whether LS will be compiled or not.
207 */
208 return 0;
209 }
210 }
211
212 /**
213 * Get the value of a shader input parameter and extract a bitfield.
214 */
215 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
216 unsigned param, unsigned rshift,
217 unsigned bitwidth)
218 {
219 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
220 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
221 param);
222
223 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
224 value = bitcast(&ctx->radeon_bld.soa.bld_base,
225 TGSI_TYPE_UNSIGNED, value);
226
227 if (rshift)
228 value = LLVMBuildLShr(gallivm->builder, value,
229 lp_build_const_int32(gallivm, rshift), "");
230
231 if (rshift + bitwidth < 32) {
232 unsigned mask = (1 << bitwidth) - 1;
233 value = LLVMBuildAnd(gallivm->builder, value,
234 lp_build_const_int32(gallivm, mask), "");
235 }
236
237 return value;
238 }
239
240 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
241 {
242 switch (ctx->type) {
243 case PIPE_SHADER_TESS_CTRL:
244 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
245
246 case PIPE_SHADER_TESS_EVAL:
247 return LLVMGetParam(ctx->radeon_bld.main_fn,
248 ctx->param_tes_rel_patch_id);
249
250 default:
251 assert(0);
252 return NULL;
253 }
254 }
255
256 /* Tessellation shaders pass outputs to the next shader using LDS.
257 *
258 * LS outputs = TCS inputs
259 * TCS outputs = TES inputs
260 *
261 * The LDS layout is:
262 * - TCS inputs for patch 0
263 * - TCS inputs for patch 1
264 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
265 * - ...
266 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
267 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
268 * - TCS outputs for patch 1
269 * - Per-patch TCS outputs for patch 1
270 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
271 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
272 * - ...
273 *
274 * All three shaders VS(LS), TCS, TES share the same LDS space.
275 */
276
277 static LLVMValueRef
278 get_tcs_in_patch_stride(struct si_shader_context *ctx)
279 {
280 if (ctx->type == PIPE_SHADER_VERTEX)
281 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
282 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
283 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
284 else {
285 assert(0);
286 return NULL;
287 }
288 }
289
290 static LLVMValueRef
291 get_tcs_out_patch_stride(struct si_shader_context *ctx)
292 {
293 return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
294 }
295
296 static LLVMValueRef
297 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
298 {
299 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
300 unpack_param(ctx,
301 SI_PARAM_TCS_OUT_OFFSETS,
302 0, 16),
303 4);
304 }
305
306 static LLVMValueRef
307 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
308 {
309 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
310 unpack_param(ctx,
311 SI_PARAM_TCS_OUT_OFFSETS,
312 16, 16),
313 4);
314 }
315
316 static LLVMValueRef
317 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
318 {
319 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
320 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
321 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
322
323 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
324 }
325
326 static LLVMValueRef
327 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
328 {
329 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
330 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
331 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
332 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
333
334 return LLVMBuildAdd(gallivm->builder, patch0_offset,
335 LLVMBuildMul(gallivm->builder, patch_stride,
336 rel_patch_id, ""),
337 "");
338 }
339
340 static LLVMValueRef
341 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
342 {
343 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
344 LLVMValueRef patch0_patch_data_offset =
345 get_tcs_out_patch0_patch_data_offset(ctx);
346 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
347 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
348
349 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
350 LLVMBuildMul(gallivm->builder, patch_stride,
351 rel_patch_id, ""),
352 "");
353 }
354
355 static void build_indexed_store(struct si_shader_context *ctx,
356 LLVMValueRef base_ptr, LLVMValueRef index,
357 LLVMValueRef value)
358 {
359 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
360 struct gallivm_state *gallivm = bld_base->base.gallivm;
361 LLVMValueRef indices[2], pointer;
362
363 indices[0] = bld_base->uint_bld.zero;
364 indices[1] = index;
365
366 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
367 LLVMBuildStore(gallivm->builder, value, pointer);
368 }
369
370 /**
371 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
372 * It's equivalent to doing a load from &base_ptr[index].
373 *
374 * \param base_ptr Where the array starts.
375 * \param index The element index into the array.
376 */
377 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
378 LLVMValueRef base_ptr, LLVMValueRef index)
379 {
380 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
381 struct gallivm_state *gallivm = bld_base->base.gallivm;
382 LLVMValueRef indices[2], pointer;
383
384 indices[0] = bld_base->uint_bld.zero;
385 indices[1] = index;
386
387 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
388 return LLVMBuildLoad(gallivm->builder, pointer, "");
389 }
390
391 /**
392 * Do a load from &base_ptr[index], but also add a flag that it's loading
393 * a constant.
394 */
395 static LLVMValueRef build_indexed_load_const(
396 struct si_shader_context *ctx,
397 LLVMValueRef base_ptr, LLVMValueRef index)
398 {
399 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index);
400 LLVMSetMetadata(result, 1, ctx->const_md);
401 return result;
402 }
403
404 static LLVMValueRef get_instance_index_for_fetch(
405 struct radeon_llvm_context *radeon_bld,
406 unsigned param_start_instance, unsigned divisor)
407 {
408 struct si_shader_context *ctx =
409 si_shader_context(&radeon_bld->soa.bld_base);
410 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
411
412 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
413 ctx->param_instance_id);
414
415 /* The division must be done before START_INSTANCE is added. */
416 if (divisor > 1)
417 result = LLVMBuildUDiv(gallivm->builder, result,
418 lp_build_const_int32(gallivm, divisor), "");
419
420 return LLVMBuildAdd(gallivm->builder, result,
421 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
422 }
423
/* Declare/fetch one vertex shader input.
 *
 * Loads the vertex-buffer resource descriptor for this attribute, computes
 * the buffer index (from a prolog parameter, from the instance ID, or from
 * base vertex + vertex ID), fetches the attribute via llvm.SI.vs.load.input,
 * and scatters the four components into the SoA input slots.
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	/* Non-zero divisor means this attribute is instanced. */
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* The prolog computed the index; read it as a parameter. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	/* Intrinsic arguments: descriptor, attribute offset, buffer index. */
	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
		"llvm.SI.vs.load.input", ctx->v4f32, args, 3,
		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
492
493 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
494 unsigned swizzle)
495 {
496 struct si_shader_context *ctx = si_shader_context(bld_base);
497
498 if (swizzle > 0)
499 return bld_base->uint_bld.zero;
500
501 switch (ctx->type) {
502 case PIPE_SHADER_VERTEX:
503 return LLVMGetParam(ctx->radeon_bld.main_fn,
504 ctx->param_vs_prim_id);
505 case PIPE_SHADER_TESS_CTRL:
506 return LLVMGetParam(ctx->radeon_bld.main_fn,
507 SI_PARAM_PATCH_ID);
508 case PIPE_SHADER_TESS_EVAL:
509 return LLVMGetParam(ctx->radeon_bld.main_fn,
510 ctx->param_tes_patch_id);
511 case PIPE_SHADER_GEOMETRY:
512 return LLVMGetParam(ctx->radeon_bld.main_fn,
513 SI_PARAM_PRIMITIVE_ID);
514 default:
515 assert(0);
516 return bld_base->uint_bld.zero;
517 }
518 }
519
520 /**
521 * Return the value of tgsi_ind_register for indexing.
522 * This is the indirect index with the constant offset added to it.
523 */
524 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
525 const struct tgsi_ind_register *ind,
526 int rel_index)
527 {
528 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
529 LLVMValueRef result;
530
531 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
532 result = LLVMBuildLoad(gallivm->builder, result, "");
533 result = LLVMBuildAdd(gallivm->builder, result,
534 lp_build_const_int32(gallivm, rel_index), "");
535 return result;
536 }
537
538 /**
539 * Like get_indirect_index, but restricts the return value to a (possibly
540 * undefined) value inside [0..num).
541 */
542 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
543 const struct tgsi_ind_register *ind,
544 int rel_index, unsigned num)
545 {
546 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
547 LLVMBuilderRef builder = gallivm->builder;
548 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
549 LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
550 LLVMValueRef cc;
551
552 if (util_is_power_of_two(num)) {
553 result = LLVMBuildAnd(builder, result, c_max, "");
554 } else {
555 /* In theory, this MAX pattern should result in code that is
556 * as good as the bit-wise AND above.
557 *
558 * In practice, LLVM generates worse code (at the time of
559 * writing), because its value tracking is not strong enough.
560 */
561 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
562 result = LLVMBuildSelect(builder, cc, result, c_max, "");
563 }
564
565 return result;
566 }
567
568
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst              destination register (used when \p src is NULL)
 * \param src              source register (takes precedence over \p dst)
 * \param vertex_dw_stride dword stride between vertices of a 2-D register
 * \param base_addr        starting dword address to offset from
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* With an ArrayID, the address is relative to the array's
		 * first register; otherwise to the register itself. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each I/O slot occupies 4 dwords (one vec4). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
653
/**
 * Load from LDS.
 *
 * \param type	output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* Recurse once per channel and assemble the vec4. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr);
	if (type == TGSI_TYPE_DOUBLE) {
		/* Doubles occupy two dwords; fetch the second half and
		 * combine the pair into a 64-bit value.
		 * NOTE(review): dw_addr already includes +swizzle, so the
		 * second load lands at base + 2*swizzle + 1.  Correct for
		 * swizzle 0; verify against how double swizzles are encoded
		 * before relying on higher swizzles. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr);
		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
	}

	/* LDS words are loaded as i32; bitcast to the requested type. */
	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
694
695 /**
696 * Store to LDS.
697 *
698 * \param swizzle offset (typically 0..3)
699 * \param dw_addr address in dwords
700 * \param value value to store
701 */
702 static void lds_store(struct lp_build_tgsi_context *bld_base,
703 unsigned swizzle, LLVMValueRef dw_addr,
704 LLVMValueRef value)
705 {
706 struct si_shader_context *ctx = si_shader_context(bld_base);
707 struct gallivm_state *gallivm = bld_base->base.gallivm;
708
709 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
710 lp_build_const_int32(gallivm, swizzle));
711
712 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
713 build_indexed_store(ctx, ctx->lds,
714 dw_addr, value);
715 }
716
717 static LLVMValueRef fetch_input_tcs(
718 struct lp_build_tgsi_context *bld_base,
719 const struct tgsi_full_src_register *reg,
720 enum tgsi_opcode_type type, unsigned swizzle)
721 {
722 struct si_shader_context *ctx = si_shader_context(bld_base);
723 LLVMValueRef dw_addr, stride;
724
725 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
726 dw_addr = get_tcs_in_current_patch_offset(ctx);
727 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
728
729 return lds_load(bld_base, type, swizzle, dw_addr);
730 }
731
732 static LLVMValueRef fetch_output_tcs(
733 struct lp_build_tgsi_context *bld_base,
734 const struct tgsi_full_src_register *reg,
735 enum tgsi_opcode_type type, unsigned swizzle)
736 {
737 struct si_shader_context *ctx = si_shader_context(bld_base);
738 LLVMValueRef dw_addr, stride;
739
740 if (reg->Register.Dimension) {
741 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
742 dw_addr = get_tcs_out_current_patch_offset(ctx);
743 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
744 } else {
745 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
746 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
747 }
748
749 return lds_load(bld_base, type, swizzle, dw_addr);
750 }
751
752 static LLVMValueRef fetch_input_tes(
753 struct lp_build_tgsi_context *bld_base,
754 const struct tgsi_full_src_register *reg,
755 enum tgsi_opcode_type type, unsigned swizzle)
756 {
757 struct si_shader_context *ctx = si_shader_context(bld_base);
758 LLVMValueRef dw_addr, stride;
759
760 if (reg->Register.Dimension) {
761 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
762 dw_addr = get_tcs_out_current_patch_offset(ctx);
763 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
764 } else {
765 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
766 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
767 }
768
769 return lds_load(bld_base, type, swizzle, dw_addr);
770 }
771
/* Store a TCS output to LDS so TES (or later TCS reads) can fetch it.
 * Anything that is not a scalar per-patch/per-vertex OUTPUT falls back
 * to the generic TGSI store. */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	if (reg->Register.Dimension) {
		/* Per-vertex output: bits [20:13] of TCS_OUT_LAYOUT are the
		 * dword stride between vertices. */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Store each channel enabled by the write mask, applying saturate
	 * if the instruction requests it. */
	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		lds_store(bld_base, chan_index, dw_addr, value);
	}
}
809
/* Fetch a geometry shader input from the ES->GS ring buffer.
 *
 * The per-vertex offset comes from one of the VTX*_OFFSET parameters
 * selected by the register's Dimension index; the constant part of the
 * address selects the attribute slot and channel.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* PrimitiveID is not stored in the ring; read it directly. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	/* GS inputs must be per-vertex (2-dimensional). */
	if (!reg->Register.Dimension)
		return NULL;

	if (swizzle == ~0) {
		/* Recurse once per channel and assemble the vec4. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	/* NOTE(review): the *256 spreads each dword slot in the ring;
	 * presumably 64 lanes x 4 bytes per slot — confirm against the
	 * ESGS ring layout. */
	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one; /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one; /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	if (type == TGSI_TYPE_DOUBLE) {
		/* Doubles occupy two dword slots; fetch the second half
		 * and combine the pair into a 64-bit value. */
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
		return radeon_llvm_emit_fetch_double(bld_base,
						     value, value2);
	}
	/* Loaded as i32; bitcast to the requested type. */
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
888
889 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
890 {
891 switch (interpolate) {
892 case TGSI_INTERPOLATE_CONSTANT:
893 return 0;
894
895 case TGSI_INTERPOLATE_LINEAR:
896 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
897 return SI_PARAM_LINEAR_SAMPLE;
898 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
899 return SI_PARAM_LINEAR_CENTROID;
900 else
901 return SI_PARAM_LINEAR_CENTER;
902 break;
903 case TGSI_INTERPOLATE_COLOR:
904 case TGSI_INTERPOLATE_PERSPECTIVE:
905 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
906 return SI_PARAM_PERSP_SAMPLE;
907 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
908 return SI_PARAM_PERSP_CENTROID;
909 else
910 return SI_PARAM_PERSP_CENTER;
911 break;
912 default:
913 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
914 return -1;
915 }
916 }
917
918 /* This shouldn't be used by explicit INTERP opcodes. */
919 static unsigned select_interp_param(struct si_shader_context *ctx,
920 unsigned param)
921 {
922 if (!ctx->shader->key.ps.prolog.force_persample_interp ||
923 !ctx->is_monolithic)
924 return param;
925
926 /* If the shader doesn't use center/centroid, just return the parameter.
927 *
928 * If the shader only uses one set of (i,j), "si_emit_spi_ps_input" can
929 * switch between center/centroid and sample without shader changes.
930 */
931 switch (param) {
932 case SI_PARAM_PERSP_CENTROID:
933 case SI_PARAM_PERSP_CENTER:
934 return SI_PARAM_PERSP_SAMPLE;
935
936 case SI_PARAM_LINEAR_CENTROID:
937 case SI_PARAM_LINEAR_CENTER:
938 return SI_PARAM_LINEAR_SAMPLE;
939
940 default:
941 return param;
942 }
943 }
944
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j); NULL selects
 *				flat/constant interpolation via fs.constant
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	const char *intr_name;
	LLVMValueRef attr_number;

	unsigned chan;

	attr_number = lp_build_const_int32(gallivm, input_index);

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and the
		 * back color and select per-channel based on facedness.
		 */
		LLVMValueRef args[4];
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, uint->zero, "");

		/* args[3] (the (i,j) pair) is NULL for fs.constant, in which
		 * case only 3 arguments are passed below.
		 */
		args[2] = prim_mask;
		args[3] = interp_param;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
			LLVMValueRef front, back;

			args[0] = llvm_chan;
			args[1] = attr_number;
			front = lp_build_intrinsic(gallivm->builder, intr_name,
						   ctx->f32, args, args[3] ? 4 : 3,
						   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

			args[1] = back_attr_number;
			back = lp_build_intrinsic(gallivm->builder, intr_name,
						  ctx->f32, args, args[3] ? 4 : 3,
						  LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						       is_face_positive,
						       front,
						       back,
						       "");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Fog: only X is interpolated; YZW are constants (0, 0, 1). */
		LLVMValueRef args[4];

		args[0] = uint->zero;
		args[1] = attr_number;
		args[2] = prim_mask;
		args[3] = interp_param;
		result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
					       ctx->f32, args, args[3] ? 4 : 3,
					       LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		result[1] =
		result[2] = lp_build_const_float(gallivm, 0.0f);
		result[3] = lp_build_const_float(gallivm, 1.0f);
	} else {
		/* Common case: interpolate all four channels. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef args[4];
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);

			args[0] = llvm_chan;
			args[1] = attr_number;
			args[2] = prim_mask;
			args[3] = interp_param;
			result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
							  ctx->f32, args, args[3] ? 4 : 3,
							  LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		}
	}
}
1061
/**
 * Declare a fragment shader input: either forward the color VGPRs set up
 * by the PS prolog, or emit interpolation code for the attribute via
 * interp_fs_input().
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* COLOR1's VGPRs follow however many COLOR0 components
		 * were read, so offset past them for index 1.
		 */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		/* Only components actually read have VGPRs; others are undef. */
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		/* Index 0 means flat/constant interpolation: no (i,j) pair. */
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = LLVMGetParam(main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1112
1113 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1114 {
1115 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1116 SI_PARAM_ANCILLARY, 8, 4);
1117 }
1118
1119 /**
1120 * Set range metadata on an instruction. This can only be used on load and
1121 * call instructions. If you know an instruction can only produce the values
1122 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1123 * \p lo is the minimum value inclusive.
1124 * \p hi is the maximum value exclusive.
1125 */
1126 static void set_range_metadata(LLVMValueRef value, unsigned lo, unsigned hi)
1127 {
1128 const char *range_md_string = "range";
1129 LLVMValueRef range_md, md_args[2];
1130 LLVMTypeRef type = LLVMTypeOf(value);
1131 LLVMContextRef context = LLVMGetTypeContext(type);
1132 unsigned md_range_id = LLVMGetMDKindIDInContext(context,
1133 range_md_string, strlen(range_md_string));
1134
1135 md_args[0] = LLVMConstInt(type, lo, false);
1136 md_args[1] = LLVMConstInt(type, hi, false);
1137 range_md = LLVMMDNodeInContext(context, md_args, 2);
1138 LLVMSetMetadata(value, md_range_id, range_md);
1139 }
1140
1141 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1142 {
1143 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1144 LLVMValueRef tid;
1145
1146 if (HAVE_LLVM < 0x0308) {
1147 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1148 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1149 } else {
1150 LLVMValueRef tid_args[2];
1151 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1152 tid_args[1] = lp_build_const_int32(gallivm, 0);
1153 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1154 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1155 tid_args, 2, LLVMReadNoneAttribute);
1156
1157 tid = lp_build_intrinsic(gallivm->builder,
1158 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1159 tid_args, 2, LLVMReadNoneAttribute);
1160 }
1161 set_range_metadata(tid, 0, 64);
1162 return tid;
1163 }
1164
1165 /**
1166 * Load a dword from a constant buffer.
1167 */
1168 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1169 LLVMValueRef offset, LLVMTypeRef return_type)
1170 {
1171 LLVMValueRef args[2] = {resource, offset};
1172
1173 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1174 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1175 }
1176
1177 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1178 {
1179 struct si_shader_context *ctx =
1180 si_shader_context(&radeon_bld->soa.bld_base);
1181 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1182 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1183 LLVMBuilderRef builder = gallivm->builder;
1184 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1185 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1186 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1187
1188 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1189 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1190 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1191
1192 LLVMValueRef pos[4] = {
1193 buffer_load_const(builder, resource, offset0, ctx->f32),
1194 buffer_load_const(builder, resource, offset1, ctx->f32),
1195 lp_build_const_float(gallivm, 0),
1196 lp_build_const_float(gallivm, 0)
1197 };
1198
1199 return lp_build_gather_values(gallivm, pos, 4);
1200 }
1201
/**
 * Declare a TGSI system value: compute it from the shader's input
 * SGPRs/VGPRs (or LDS/constant buffers for tessellation) and store the
 * result in radeon_bld->system_values[index].
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* The hardware vertex ID doesn't include the base vertex,
		 * so add it here.
		 */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			/* TCS: invocation ID is packed in REL_IDS bits [12:8]. */
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; W arrives as 1/W, so reciprocate it. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The fractional part of the pixel position is the sample
		 * position within the pixel.
		 */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count is packed in TCS_OUT_LAYOUT bits [31:26]. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Read the tess factors back from LDS, where the TCS wrote
		 * them as regular patch outputs.
		 */
		LLVMValueRef dw_addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
				       lp_build_const_int32(gallivm, param * 4), "");

		value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
				 ~0, dw_addr);
		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels live in a driver constant buffer:
		 * 4 outer floats at dword 0, 4 inner floats at dword 4.
		 */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(gallivm->builder, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4),
						   ctx->f32);
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* The block size is compile-time constant (FIXED_BLOCK_*). */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* ps.live is true for live (non-helper) lanes; invert and
		 * sign-extend to get ~0 for helper invocations, 0 otherwise.
		 */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1400
1401 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1402 const struct tgsi_full_declaration *decl)
1403 {
1404 struct si_shader_context *ctx =
1405 si_shader_context(&radeon_bld->soa.bld_base);
1406 struct si_shader_selector *sel = ctx->shader->selector;
1407 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1408
1409 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1410 LLVMValueRef var;
1411
1412 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1413 assert(decl->Range.First == decl->Range.Last);
1414 assert(!ctx->shared_memory);
1415
1416 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1417 LLVMArrayType(ctx->i8, sel->local_size),
1418 "compute_lds",
1419 LOCAL_ADDR_SPACE);
1420 LLVMSetAlignment(var, 4);
1421
1422 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1423 }
1424
/**
 * Fetch one channel (or all four, for LP_CHAN_ALL) of a CONST-file source
 * register, handling 2D constant-buffer indexing, indirect addressing and
 * double-precision types.
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	/* LP_CHAN_ALL: recurse per channel and gather into a vec4. */
	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(bld_base->base.gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	/* Fast path: direct access uses the preloaded constant values.
	 * A double occupies two consecutive dwords (idx, idx + 1).
	 */
	if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
		if (type != TGSI_TYPE_DOUBLE)
			return bitcast(bld_base, type, ctx->constants[buf][idx]);
		else {
			return radeon_llvm_emit_fetch_double(bld_base,
							     ctx->constants[buf][idx],
							     ctx->constants[buf][idx + 1]);
		}
	}

	/* Indirect buffer index: load the buffer descriptor dynamically. */
	if (reg->Register.Dimension && reg->Dimension.Indirect) {
		LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
		LLVMValueRef index;
		index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
						   reg->Dimension.Index,
						   SI_NUM_CONST_BUFFERS);
		bufp = build_indexed_load_const(ctx, ptr, index);
	} else
		bufp = ctx->const_buffers[buf];

	/* Indirect register index: byte offset = addr * 16 + idx * 4. */
	addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
	addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
	addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
	addr = lp_build_add(&bld_base->uint_bld, addr,
			    lp_build_const_int32(base->gallivm, idx * 4));

	result = buffer_load_const(base->gallivm->builder, bufp,
				   addr, ctx->f32);

	if (type != TGSI_TYPE_DOUBLE)
		result = bitcast(bld_base, type, result);
	else {
		/* Doubles: load the second dword from the next swizzle's
		 * address register and combine the two halves.
		 */
		LLVMValueRef addr2, result2;
		addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
		addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
		addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
				     lp_build_const_int32(base->gallivm, idx * 4));

		result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
					    addr2, ctx->f32);

		result = radeon_llvm_emit_fetch_double(bld_base,
						       result, result2);
	}
	return result;
}
1498
1499 /* Upper 16 bits must be zero. */
1500 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1501 LLVMValueRef val[2])
1502 {
1503 return LLVMBuildOr(gallivm->builder, val[0],
1504 LLVMBuildShl(gallivm->builder, val[1],
1505 lp_build_const_int32(gallivm, 16),
1506 ""), "");
1507 }
1508
1509 /* Upper 16 bits are ignored and will be dropped. */
1510 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1511 LLVMValueRef val[2])
1512 {
1513 LLVMValueRef v[2] = {
1514 LLVMBuildAnd(gallivm->builder, val[0],
1515 lp_build_const_int32(gallivm, 0xffff), ""),
1516 val[1],
1517 };
1518 return si_llvm_pack_two_int16(gallivm, v);
1519 }
1520
/* Initialize arguments for the shader export intrinsic.
 *
 * Fills the 9-element args array for llvm.SI.export:
 *   args[0] = writemask, args[1] = valid-mask flag, args[2] = done flag,
 *   args[3] = export target, args[4] = COMPR (16-bit compressed) flag,
 *   args[5..8] = the four export values.
 *
 * For fragment shaders, the 4 channel values are converted/packed
 * according to the SPI_SHADER_COL_FORMAT of the target MRT.
 */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef val[4];
	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
	unsigned chan;
	/* NOTE: is_int8 is only initialized for fragment shaders below; the
	 * switch cases that read it are only reachable with PS color formats.
	 */
	bool is_int8;

	/* Default is 0xf. Adjusted below depending on the format. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	if (ctx->type == PIPE_SHADER_FRAGMENT) {
		const union si_shader_key *key = &ctx->shader->key;
		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		assert(cbuf >= 0 && cbuf < 8);
		/* 4 format bits per color buffer. */
		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
	}

	args[4] = uint->zero; /* COMPR flag */
	args[5] = base->undef;
	args[6] = base->undef;
	args[7] = base->undef;
	args[8] = base->undef;

	switch (spi_shader_col_format) {
	case V_028714_SPI_SHADER_ZERO:
		/* Nothing to export: write nothing to the NULL target. */
		args[0] = uint->zero; /* writemask */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
		break;

	case V_028714_SPI_SHADER_32_R:
		args[0] = uint->one; /* writemask */
		args[5] = values[0];
		break;

	case V_028714_SPI_SHADER_32_GR:
		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
		args[5] = values[0];
		args[6] = values[1];
		break;

	case V_028714_SPI_SHADER_32_AR:
		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
		args[5] = values[0];
		args[8] = values[3];
		break;

	case V_028714_SPI_SHADER_FP16_ABGR:
		/* Pack pairs of channels into f16x2 via llvm.SI.packf16. */
		args[4] = uint->one; /* COMPR flag */

		for (chan = 0; chan < 2; chan++) {
			LLVMValueRef pack_args[2] = {
				values[2 * chan],
				values[2 * chan + 1]
			};
			LLVMValueRef packed;

			packed = lp_build_intrinsic(base->gallivm->builder,
						    "llvm.SI.packf16",
						    ctx->i32, pack_args, 2,
						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
			args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 packed, ctx->f32, "");
		}
		break;

	case V_028714_SPI_SHADER_UNORM16_ABGR:
		/* Clamp to [0, 1], scale to 16-bit range, round to nearest. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 65535), "");
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  lp_build_const_float(gallivm, 0.5), "");
			val[chan] = LLVMBuildFPToUI(builder, val[chan],
						    ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_SNORM16_ABGR:
		for (chan = 0; chan < 4; chan++) {
			/* Clamp between [-1, 1]. */
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
							      values[chan],
							      lp_build_const_float(gallivm, 1));
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
							      val[chan],
							      lp_build_const_float(gallivm, -1));
			/* Convert to a signed integer in [-32767, 32767]. */
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 32767), "");
			/* If positive, add 0.5, else add -0.5. */
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  LLVMBuildSelect(builder,
								  LLVMBuildFCmp(builder, LLVMRealOGE,
										val[chan], base->zero, ""),
								  lp_build_const_float(gallivm, 0.5),
								  lp_build_const_float(gallivm, -0.5), ""), "");
			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_UINT16_ABGR: {
		/* Clamp to the largest value representable in the actual
		 * color buffer format (8-bit or 16-bit unsigned).
		 */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							255 : 65535);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
							      val[chan], max);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_SINT16_ABGR: {
		/* Clamp to the signed range of the actual color buffer
		 * format (8-bit or 16-bit signed).
		 */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							127 : 32767);
		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
							-128 : -32768);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMIN,
							      val[chan], max);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMAX,
							      val[chan], min);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_32_ABGR:
		/* Full 32-bit export: pass the values through unmodified. */
		memcpy(&args[5], values, sizeof(values[0]) * 4);
		break;
	}
}
1703
1704 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1705 LLVMValueRef alpha)
1706 {
1707 struct si_shader_context *ctx = si_shader_context(bld_base);
1708 struct gallivm_state *gallivm = bld_base->base.gallivm;
1709
1710 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1711 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
1712 SI_PARAM_ALPHA_REF);
1713
1714 LLVMValueRef alpha_pass =
1715 lp_build_cmp(&bld_base->base,
1716 ctx->shader->key.ps.epilog.alpha_func,
1717 alpha, alpha_ref);
1718 LLVMValueRef arg =
1719 lp_build_select(&bld_base->base,
1720 alpha_pass,
1721 lp_build_const_float(gallivm, 1.0f),
1722 lp_build_const_float(gallivm, -1.0f));
1723
1724 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
1725 ctx->voidt, &arg, 1, 0);
1726 } else {
1727 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
1728 ctx->voidt, NULL, 0, 0);
1729 }
1730 }
1731
1732 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
1733 LLVMValueRef alpha,
1734 unsigned samplemask_param)
1735 {
1736 struct si_shader_context *ctx = si_shader_context(bld_base);
1737 struct gallivm_state *gallivm = bld_base->base.gallivm;
1738 LLVMValueRef coverage;
1739
1740 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1741 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
1742 samplemask_param);
1743 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
1744
1745 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
1746 ctx->i32,
1747 &coverage, 1, LLVMReadNoneAttribute);
1748
1749 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
1750 ctx->f32, "");
1751
1752 coverage = LLVMBuildFMul(gallivm->builder, coverage,
1753 lp_build_const_float(gallivm,
1754 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1755
1756 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
1757 }
1758
/**
 * Compute user clip-plane distances from the clip-vertex position and fill
 * in the export-intrinsic arguments for the two clip-distance exports
 * (targets POS+2 and POS+3, 4 distances each).
 *
 * @param pos		export argument arrays; pos[2] and pos[3] are filled
 * @param out_elts	the 4 components of the clip vertex
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start all 4 distances (args[5..8]) at zero. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Clip planes are 4 floats each; the buffer
				 * offset is in bytes (4 per float). args[1]
				 * is reused as a scratch offset here and
				 * overwritten below.
				 */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
							     args[1], ctx->f32);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Fill in the export header: write all 4 components to the
		 * POS+2+reg_index target; not the last export.
		 */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
1805
1806 static void si_dump_streamout(struct pipe_stream_output_info *so)
1807 {
1808 unsigned i;
1809
1810 if (so->num_outputs)
1811 fprintf(stderr, "STREAMOUT\n");
1812
1813 for (i = 0; i < so->num_outputs; i++) {
1814 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
1815 so->output[i].start_component;
1816 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
1817 i, so->output[i].output_buffer,
1818 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
1819 so->output[i].register_index,
1820 mask & 1 ? "x" : "",
1821 mask & 2 ? "y" : "",
1822 mask & 4 ? "z" : "",
1823 mask & 8 ? "w" : "");
1824 }
1825 }
1826
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * @param rsrc		buffer resource descriptor
 * @param vdata		the value(s) to store
 * @param vaddr		per-element address (voffset/index depending on offen/idxen)
 * @param soffset	scalar byte offset (SGPR)
 * @param inst_offset	immediate byte offset (12 bits; see assert below)
 * @param dfmt/nfmt	buffer data/number format (V_008F0C_*)
 * @param offen/idxen/glc/slc/tfe	MTBUF instruction flag bits
 */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work.
	 * Note: num_channels 3 and 4 both map to the "v4i32" suffix.
	 */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, Elements(args), 0);
}
1874
1875 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
1876 LLVMValueRef rsrc,
1877 LLVMValueRef vdata,
1878 unsigned num_channels,
1879 LLVMValueRef vaddr,
1880 LLVMValueRef soffset,
1881 unsigned inst_offset)
1882 {
1883 static unsigned dfmt[] = {
1884 V_008F0C_BUF_DATA_FORMAT_32,
1885 V_008F0C_BUF_DATA_FORMAT_32_32,
1886 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1887 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1888 };
1889 assert(num_channels >= 1 && num_channels <= 4);
1890
1891 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
1892 inst_offset, dfmt[num_channels-1],
1893 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
1894 }
1895
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * \param outputs  the shader output values to stream out
 * \param noutput  number of entries in \p outputs
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* The selected vertex stream is a 2-bit field at bit 24 of the
	 * streamout config SGPR. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			/* stride == 0 means the buffer is unused. */
			if (!so->stride[i])
				continue;

			/* streamout_offset[i] is multiplied by 4 per the
			 * ByteOffset formula above. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			/* Defensive: skip malformed outputs in release builds
			 * (the assert catches them in debug builds). */
			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store if this output belongs to the currently
			 * selected vertex stream. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2012
2013
/* Generate export instructions for hardware VS shader stage.
 *
 * Position exports (POS0..POS3) are buffered in pos_args and emitted at
 * the end so that the last one can carry the DONE bit; everything else
 * is exported as a PARAM immediately.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0; /* running count of PARAM exports */
	unsigned pos_idx;
	int i;

	/* Emit streamout stores first if streamout is enabled. */
	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector (POS1) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the misc vector (POS1) below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Saved for the misc vector; also re-dispatched as a
			 * generic PARAM export. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			/* Same treatment as LAYER. */
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Buffer position exports; emitted at the end. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		/* CLIPDIST is exported both as a POS and as a PARAM. */
		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	/* Count the buffered position exports. */
	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2188
/* Load the tessellation factors (TESSINNER/TESSOUTER) from LDS and
 * store them to the tess factor ring buffer. Only invocation 0 of each
 * patch does the stores, since the factors are per-patch.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx;

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer factors first, then inner factors. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	/* Quads need a second (vec2) store. */
	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Store the outputs. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 0);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 16);
	lp_build_endif(&if_ctx);
}
2281
/* This only writes the tessellation factor levels.
 *
 * In the non-monolithic case, instead of storing the factors directly,
 * the function packs everything the separate epilog part needs into the
 * shader's return value: the RW_BUFFERS pointer split into two SGPRs,
 * the tess factor buffer soffset, and three VGPRs (rel_patch_id,
 * invocation_id, tf_lds_offset) bitcast to float.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	/* Invocation ID: bits [12:8] of SI_PARAM_REL_IDS. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer, split into two i32s so it fits in
		 * two SGPR return slots (0 and 1). */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR, "");

		/* VGPRs; the return struct is float-typed in these slots,
		 * so bitcast the i32 values. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 1;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2332
2333 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2334 {
2335 struct si_shader_context *ctx = si_shader_context(bld_base);
2336 struct si_shader *shader = ctx->shader;
2337 struct tgsi_shader_info *info = &shader->selector->info;
2338 struct gallivm_state *gallivm = bld_base->base.gallivm;
2339 unsigned i, chan;
2340 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2341 ctx->param_rel_auto_id);
2342 LLVMValueRef vertex_dw_stride =
2343 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2344 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2345 vertex_dw_stride, "");
2346
2347 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2348 * its inputs from it. */
2349 for (i = 0; i < info->num_outputs; i++) {
2350 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2351 unsigned name = info->output_semantic_name[i];
2352 unsigned index = info->output_semantic_index[i];
2353 int param = si_shader_io_get_unique_index(name, index);
2354 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2355 lp_build_const_int32(gallivm, param * 4), "");
2356
2357 for (chan = 0; chan < 4; chan++) {
2358 lds_store(bld_base, chan, dw_addr,
2359 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2360 }
2361 }
2362 }
2363
2364 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2365 {
2366 struct si_shader_context *ctx = si_shader_context(bld_base);
2367 struct gallivm_state *gallivm = bld_base->base.gallivm;
2368 struct si_shader *es = ctx->shader;
2369 struct tgsi_shader_info *info = &es->selector->info;
2370 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2371 ctx->param_es2gs_offset);
2372 unsigned chan;
2373 int i;
2374
2375 for (i = 0; i < info->num_outputs; i++) {
2376 LLVMValueRef *out_ptr =
2377 ctx->radeon_bld.soa.outputs[i];
2378 int param_index;
2379
2380 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2381 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2382 continue;
2383
2384 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2385 info->output_semantic_index[i]);
2386
2387 for (chan = 0; chan < 4; chan++) {
2388 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2389 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2390
2391 build_tbuffer_store(ctx,
2392 ctx->esgs_ring,
2393 out_val, 1,
2394 LLVMGetUndef(ctx->i32), soffset,
2395 (4 * param_index + chan) * 4,
2396 V_008F0C_BUF_DATA_FORMAT_32,
2397 V_008F0C_BUF_NUM_FORMAT_UINT,
2398 0, 0, 1, 1, 0);
2399 }
2400 }
2401 }
2402
2403 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2404 {
2405 struct si_shader_context *ctx = si_shader_context(bld_base);
2406 struct gallivm_state *gallivm = bld_base->base.gallivm;
2407 LLVMValueRef args[2];
2408
2409 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2410 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2411 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2412 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
2413 }
2414
/* Vertex shader epilog: optionally clamp colors, gather all output
 * values, handle PrimitiveID (exported directly when monolithic,
 * returned from the function otherwise), then emit the hardware VS
 * exports. */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->is_gs_copy_shader);

	/* +1 leaves room for a possible PrimitiveID entry appended below. */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR. */
				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
						    SI_PARAM_VS_STATE_BITS);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				/* The IF is opened once, on the first color
				 * found, and closed after the loop. */
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all 4 components in place. */
			for (j = 0; j < 4; j++) {
				addr = ctx->radeon_bld.soa.outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = radeon_llvm_saturate(bld_base, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Read all outputs into the outputs array. */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	if (ctx->is_monolithic) {
		/* Export PrimitiveID when PS needs it. */
		if (si_vs_exports_prim_id(ctx->shader)) {
			outputs[i].name = TGSI_SEMANTIC_PRIMID;
			outputs[i].sid = 0;
			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
						       get_primitive_id(bld_base, 0));
			outputs[i].values[1] = bld_base->base.undef;
			outputs[i].values[2] = bld_base->base.undef;
			outputs[i].values[3] = bld_base->base.undef;
			i++;
		}
	} else {
		/* Return the primitive ID from the LLVM function. */
		ctx->return_value =
			LLVMBuildInsertValue(gallivm->builder,
					     ctx->return_value,
					     bitcast(bld_base, TGSI_TYPE_FLOAT,
						     get_primitive_id(bld_base, 0)),
					     VS_EPILOG_PRIMID_LOC, "");
	}

	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
2501
2502 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2503 LLVMValueRef depth, LLVMValueRef stencil,
2504 LLVMValueRef samplemask)
2505 {
2506 struct si_shader_context *ctx = si_shader_context(bld_base);
2507 struct lp_build_context *base = &bld_base->base;
2508 struct lp_build_context *uint = &bld_base->uint_bld;
2509 LLVMValueRef args[9];
2510 unsigned mask = 0;
2511
2512 assert(depth || stencil || samplemask);
2513
2514 args[1] = uint->one; /* whether the EXEC mask is valid */
2515 args[2] = uint->one; /* DONE bit */
2516
2517 /* Specify the target we are exporting */
2518 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2519
2520 args[4] = uint->zero; /* COMP flag */
2521 args[5] = base->undef; /* R, depth */
2522 args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2523 args[7] = base->undef; /* B, sample mask */
2524 args[8] = base->undef; /* A, alpha to mask */
2525
2526 if (depth) {
2527 args[5] = depth;
2528 mask |= 0x1;
2529 }
2530
2531 if (stencil) {
2532 args[6] = stencil;
2533 mask |= 0x2;
2534 }
2535
2536 if (samplemask) {
2537 args[7] = samplemask;
2538 mask |= 0x4;
2539 }
2540
2541 /* SI (except OLAND) has a bug that it only looks
2542 * at the X writemask component. */
2543 if (ctx->screen->b.chip_class == SI &&
2544 ctx->screen->b.family != CHIP_OLAND)
2545 mask |= 0x1;
2546
2547 /* Specify which components to enable */
2548 args[0] = lp_build_const_int32(base->gallivm, mask);
2549
2550 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2551 ctx->voidt, args, 9, 0);
2552 }
2553
/* Export one pixel shader color output to its color buffer(s).
 *
 * \param color            the 4 output components; modified in place by
 *                         clamping, alpha-to-one and smoothing
 * \param index            MRT index to export to
 * \param samplemask_param function parameter index of the sample
 *                         coverage, used for line/polygon smoothing
 * \param is_last          whether this is the shader's last color
 *                         export (sets the DONE bit / valid-EXEC flag)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test (only applies to color output 0) */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is.
		 * An export with args[c][0] == 0 writes no channels. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
					   ctx->voidt, args[c], 9, 0);
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, args, 9, 0);
	}
}
2622
2623 static void si_export_null(struct lp_build_tgsi_context *bld_base)
2624 {
2625 struct si_shader_context *ctx = si_shader_context(bld_base);
2626 struct lp_build_context *base = &bld_base->base;
2627 struct lp_build_context *uint = &bld_base->uint_bld;
2628 LLVMValueRef args[9];
2629
2630 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
2631 args[1] = uint->one; /* whether the EXEC mask is valid */
2632 args[2] = uint->one; /* DONE bit */
2633 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2634 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
2635 args[5] = uint->undef; /* R */
2636 args[6] = uint->undef; /* G */
2637 args[7] = uint->undef; /* B */
2638 args[8] = uint->undef; /* A */
2639
2640 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2641 ctx->voidt, args, 9, 0);
2642 }
2643
/* Pixel shader epilog: read the output values from the TGSI output
 * slots and emit the color and MRTZ exports. */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* Each color buffer has a 4-bit format field in
			 * spi_format; nonzero means it's written. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	/* Read each output and dispatch the matching export. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Only Z (component 2) is used for depth. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ goes last (and therefore carries the DONE bit inside
	 * si_export_mrt_z). */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
}
2732
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only Z (component 2) is used for depth. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs; written colors are packed together without gaps. */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
2826
2827 /**
2828 * Given a v8i32 resource descriptor for a buffer, extract the size of the
2829 * buffer in number of elements and return it as an i32.
2830 */
2831 static LLVMValueRef get_buffer_size(
2832 struct lp_build_tgsi_context *bld_base,
2833 LLVMValueRef descriptor)
2834 {
2835 struct si_shader_context *ctx = si_shader_context(bld_base);
2836 struct gallivm_state *gallivm = bld_base->base.gallivm;
2837 LLVMBuilderRef builder = gallivm->builder;
2838 LLVMValueRef size =
2839 LLVMBuildExtractElement(builder, descriptor,
2840 lp_build_const_int32(gallivm, 6), "");
2841
2842 if (ctx->screen->b.chip_class >= VI) {
2843 /* On VI, the descriptor contains the size in bytes,
2844 * but TXQ must return the size in elements.
2845 * The stride is always non-zero for resources using TXQ.
2846 */
2847 LLVMValueRef stride =
2848 LLVMBuildExtractElement(builder, descriptor,
2849 lp_build_const_int32(gallivm, 5), "");
2850 stride = LLVMBuildLShr(builder, stride,
2851 lp_build_const_int32(gallivm, 16), "");
2852 stride = LLVMBuildAnd(builder, stride,
2853 lp_build_const_int32(gallivm, 0x3FFF), "");
2854
2855 size = LLVMBuildUDiv(builder, size, stride, "");
2856 }
2857
2858 return size;
2859 }
2860
2861 /**
2862 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
2863 * intrinsic names).
2864 */
2865 static void build_int_type_name(
2866 LLVMTypeRef type,
2867 char *buf, unsigned bufsize)
2868 {
2869 assert(bufsize >= 6);
2870
2871 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
2872 snprintf(buf, bufsize, "v%ui32",
2873 LLVMGetVectorSize(type));
2874 else
2875 strcpy(buf, "i32");
2876 }
2877
2878 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
2879 struct lp_build_tgsi_context *bld_base,
2880 struct lp_build_emit_data *emit_data);
2881
2882 /* Prevent optimizations (at least of memory accesses) across the current
2883 * point in the program by emitting empty inline assembly that is marked as
2884 * having side effects.
2885 */
2886 static void emit_optimization_barrier(struct si_shader_context *ctx)
2887 {
2888 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2889 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
2890 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
2891 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
2892 }
2893
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	/* memoryBarrier only makes guarantees about atomics and coherent
	 * image accesses (which bypass TC L1), so no special cache handling
	 * is required here. We only have to keep LLVM from re-ordering
	 * loads across the barrier point.
	 */
	emit_optimization_barrier(si_shader_context(bld_base));
}
2910
2911 static LLVMValueRef
2912 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
2913 const struct tgsi_full_src_register *reg)
2914 {
2915 LLVMValueRef ind_index;
2916 LLVMValueRef rsrc_ptr;
2917
2918 if (!reg->Register.Indirect)
2919 return ctx->shader_buffers[reg->Register.Index];
2920
2921 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
2922 reg->Register.Index,
2923 SI_NUM_SHADER_BUFFERS);
2924
2925 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
2926 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
2927 }
2928
2929 static bool tgsi_is_array_sampler(unsigned target)
2930 {
2931 return target == TGSI_TEXTURE_1D_ARRAY ||
2932 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
2933 target == TGSI_TEXTURE_2D_ARRAY ||
2934 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
2935 target == TGSI_TEXTURE_CUBE_ARRAY ||
2936 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
2937 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2938 }
2939
2940 static bool tgsi_is_array_image(unsigned target)
2941 {
2942 return target == TGSI_TEXTURE_3D ||
2943 target == TGSI_TEXTURE_CUBE ||
2944 target == TGSI_TEXTURE_1D_ARRAY ||
2945 target == TGSI_TEXTURE_2D_ARRAY ||
2946 target == TGSI_TEXTURE_CUBE_ARRAY ||
2947 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2948 }
2949
2950 /**
2951 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
2952 *
 * At least on Tonga, executing image stores on images with DCC enabled and
 * non-trivial DCC state can eventually lead to lockups. This can occur when
 * an application binds an image as read-only but then uses a shader that writes
2956 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
2957 * program termination) in this case, but it doesn't cost much to be a bit
2958 * nicer: disabling DCC in the shader still leads to undefined results but
2959 * avoids the lockup.
2960 */
2961 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
2962 LLVMValueRef rsrc)
2963 {
2964 if (ctx->screen->b.chip_class <= CIK) {
2965 return rsrc;
2966 } else {
2967 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2968 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
2969 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
2970 LLVMValueRef tmp;
2971
2972 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
2973 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
2974 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
2975 }
2976 }
2977
2978 /**
2979 * Load the resource descriptor for \p image.
2980 */
static void
image_fetch_rsrc(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *image,
	bool dcc_off,
	LLVMValueRef *rsrc)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	assert(image->Register.File == TGSI_FILE_IMAGE);

	if (!image->Register.Indirect) {
		/* Fast path: use preloaded resources */
		*rsrc = ctx->images[image->Register.Index];
	} else {
		/* Indexing and manual load */
		LLVMValueRef ind_index;
		LLVMValueRef rsrc_ptr;
		LLVMValueRef tmp;

		/* Clamp the index to the array size.
		 *
		 * From the GL_ARB_shader_image_load_store extension spec:
		 *
		 *    If a shader performs an image load, store, or atomic
		 *    operation using an image variable declared as an array,
		 *    and if the index used to select an individual element is
		 *    negative or greater than or equal to the size of the
		 *    array, the results of the operation are undefined but may
		 *    not lead to termination.
		 */
		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
						       image->Register.Index,
						       SI_NUM_IMAGES);

		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
		/* See force_dcc_off: stores to DCC-compressed images can
		 * lead to lockups. */
		if (dcc_off)
			tmp = force_dcc_off(ctx, tmp);
		*rsrc = tmp;
	}
}
3021
3022 static LLVMValueRef image_fetch_coords(
3023 struct lp_build_tgsi_context *bld_base,
3024 const struct tgsi_full_instruction *inst,
3025 unsigned src)
3026 {
3027 struct gallivm_state *gallivm = bld_base->base.gallivm;
3028 LLVMBuilderRef builder = gallivm->builder;
3029 unsigned target = inst->Memory.Texture;
3030 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3031 LLVMValueRef coords[4];
3032 LLVMValueRef tmp;
3033 int chan;
3034
3035 for (chan = 0; chan < num_coords; ++chan) {
3036 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3037 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3038 coords[chan] = tmp;
3039 }
3040
3041 if (num_coords == 1)
3042 return coords[0];
3043
3044 if (num_coords == 3) {
3045 /* LLVM has difficulties lowering 3-element vectors. */
3046 coords[3] = bld_base->uint_bld.undef;
3047 num_coords = 4;
3048 }
3049
3050 return lp_build_gather_values(gallivm, coords, num_coords);
3051 }
3052
/**
 * Append the extra mode bits that are used by image load and store.
 *
 * Appends r128, da, glc (loads/stores only) and slc, in the order the
 * image intrinsics expect them.
 */
static void image_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data * emit_data,
		unsigned target,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
	emit_data->args[emit_data->arg_count++] =
		tgsi_is_array_image(target) ? i1true : i1false; /* da */
	if (!atomic) {
		/* glc=1 for coherent/volatile accesses; atomics take no
		 * glc argument. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3076
3077 /**
3078 * Given a 256 bit resource, extract the top half (which stores the buffer
3079 * resource in the case of textures and images).
3080 */
3081 static LLVMValueRef extract_rsrc_top_half(
3082 struct si_shader_context *ctx,
3083 LLVMValueRef rsrc)
3084 {
3085 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3086 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3087 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3088
3089 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3090 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3091 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3092
3093 return rsrc;
3094 }
3095
3096 /**
3097 * Append the resource and indexing arguments for buffer intrinsics.
3098 *
3099 * \param rsrc the v4i32 buffer resource
3100 * \param index index into the buffer (stride-based)
3101 * \param offset byte offset into the buffer
3102 */
static void buffer_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		LLVMValueRef rsrc,
		LLVMValueRef index,
		LLVMValueRef offset,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	/* Argument order matches the llvm.amdgcn.buffer.* intrinsics. */
	emit_data->args[emit_data->arg_count++] = rsrc;
	emit_data->args[emit_data->arg_count++] = index; /* vindex */
	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
	if (!atomic) {
		/* glc=1 for coherent/volatile accesses; atomics take no
		 * glc argument. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3125
/* Gather the intrinsic arguments for a TGSI LOAD from an SSBO, an image
 * buffer or a regular image.
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* SSBO load: descriptor + byte offset (operand 1.x). */
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Image buffers: use the buffer part of the
			 * descriptor and address by element index. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			/* Regular image: coords, resource, dmask, then the
			 * shared image mode bits. */
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3170
/* Emit an SSBO load via llvm.amdgcn.buffer.load, choosing the vector
 * width from the destination writemask.
 */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	/* Loads always start at component x; the width is determined by
	 * the highest enabled component, so holes in the writemask just
	 * load unused data. */
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: // 3 & 4
		/* There is no 3-component variant; load a full vec4. */
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
}
3202
3203 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3204 const struct tgsi_full_instruction *inst,
3205 LLVMTypeRef type, int arg)
3206 {
3207 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3208 LLVMBuilderRef builder = gallivm->builder;
3209 LLVMValueRef offset, ptr;
3210 int addr_space;
3211
3212 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3213 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3214
3215 ptr = ctx->shared_memory;
3216 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3217 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3218 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3219
3220 return ptr;
3221 }
3222
3223 static void load_emit_memory(
3224 struct si_shader_context *ctx,
3225 struct lp_build_emit_data *emit_data)
3226 {
3227 const struct tgsi_full_instruction *inst = emit_data->inst;
3228 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3229 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3230 LLVMBuilderRef builder = gallivm->builder;
3231 unsigned writemask = inst->Dst[0].Register.WriteMask;
3232 LLVMValueRef channels[4], ptr, derived_ptr, index;
3233 int chan;
3234
3235 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3236
3237 for (chan = 0; chan < 4; ++chan) {
3238 if (!(writemask & (1 << chan))) {
3239 channels[chan] = LLVMGetUndef(base->elem_type);
3240 continue;
3241 }
3242
3243 index = lp_build_const_int32(gallivm, chan);
3244 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3245 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3246 }
3247 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3248 }
3249
/* Emit a TGSI LOAD, dispatching on the source file: shared memory,
 * SSBO, image buffer or regular image.
 */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Keep LLVM from moving other memory accesses across a volatile
	 * load. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_optimization_barrier(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	} else {
		/* The image.load intrinsic is overloaded on the coordinate
		 * type (i32 or vNi32). */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	}
}
3295
/* Gather the intrinsic arguments for a TGSI STORE to an SSBO, an image
 * buffer or a regular image. The data to store is always fetched as a
 * full vec4 first.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* Gather the value to store (source operand 1) into a vec4. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register names the buffer/image; view it as a
	 * source so the rsrc helpers can be reused. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		/* SSBO store: descriptor + byte offset (operand 0.x). */
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			/* Regular image store; dcc_off=true because stores
			 * to DCC-compressed images can lock up (see
			 * force_dcc_off). */
			emit_data->args[1] = coords;
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3354
/* Emit an SSBO store, splitting the writemask into runs of consecutive
 * enabled components and emitting one buffer.store per run (1-, 2- or
 * 4-wide; 3-wide runs are split into 2+1 due to an LLVM limitation).
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Put the third component back; it is picked up by
			 * a later loop iteration. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			/* Build a v2f32 from components start, start+1. */
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first component. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMNoUnwindAttribute);
	}
}
3427
3428 static void store_emit_memory(
3429 struct si_shader_context *ctx,
3430 struct lp_build_emit_data *emit_data)
3431 {
3432 const struct tgsi_full_instruction *inst = emit_data->inst;
3433 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3434 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3435 LLVMBuilderRef builder = gallivm->builder;
3436 unsigned writemask = inst->Dst[0].Register.WriteMask;
3437 LLVMValueRef ptr, derived_ptr, data, index;
3438 int chan;
3439
3440 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3441
3442 for (chan = 0; chan < 4; ++chan) {
3443 if (!(writemask & (1 << chan))) {
3444 continue;
3445 }
3446 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3447 index = lp_build_const_int32(gallivm, chan);
3448 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3449 LLVMBuildStore(builder, data, derived_ptr);
3450 }
3451 }
3452
/* Emit a TGSI STORE, dispatching on the destination file: SSBO, shared
 * memory, image buffer or regular image.
 */
static void store_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		store_emit_buffer(si_shader_context(bld_base), emit_data);
		return;
	} else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
		store_emit_memory(si_shader_context(bld_base), emit_data);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, "llvm.amdgcn.buffer.store.format.v4f32",
			emit_data->dst_type, emit_data->args, emit_data->arg_count,
			LLVMNoUnwindAttribute);
	} else {
		/* The image.store intrinsic is overloaded on the coordinate
		 * type (i32 or vNi32); args[1] holds the coordinates. */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.store.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMNoUnwindAttribute);
	}
}
3491
/* Gather the intrinsic arguments for a TGSI atomic on an SSBO, an image
 * buffer or a regular image. Shared-memory atomics are handled entirely
 * in atomic_emit_memory and don't use these args.
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	/* Operand 2 is the atomic operand (for ATOMCAS: the compare
	 * value); operand 3 (ATOMCAS only) is the swap value. */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* SSBO atomic: descriptor + byte offset (operand 1.x). */
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* Non-buffer image atomics disable DCC (see force_dcc_off:
		 * atomics are writes). */
		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3551
3552 static void atomic_emit_memory(struct si_shader_context *ctx,
3553 struct lp_build_emit_data *emit_data) {
3554 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3555 LLVMBuilderRef builder = gallivm->builder;
3556 const struct tgsi_full_instruction * inst = emit_data->inst;
3557 LLVMValueRef ptr, result, arg;
3558
3559 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3560
3561 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3562 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3563
3564 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3565 LLVMValueRef new_data;
3566 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
3567 inst, 3, 0);
3568
3569 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3570
3571 #if HAVE_LLVM >= 0x309
3572 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3573 LLVMAtomicOrderingSequentiallyConsistent,
3574 LLVMAtomicOrderingSequentiallyConsistent,
3575 false);
3576 #endif
3577
3578 result = LLVMBuildExtractValue(builder, result, 0, "");
3579 } else {
3580 LLVMAtomicRMWBinOp op;
3581
3582 switch(inst->Instruction.Opcode) {
3583 case TGSI_OPCODE_ATOMUADD:
3584 op = LLVMAtomicRMWBinOpAdd;
3585 break;
3586 case TGSI_OPCODE_ATOMXCHG:
3587 op = LLVMAtomicRMWBinOpXchg;
3588 break;
3589 case TGSI_OPCODE_ATOMAND:
3590 op = LLVMAtomicRMWBinOpAnd;
3591 break;
3592 case TGSI_OPCODE_ATOMOR:
3593 op = LLVMAtomicRMWBinOpOr;
3594 break;
3595 case TGSI_OPCODE_ATOMXOR:
3596 op = LLVMAtomicRMWBinOpXor;
3597 break;
3598 case TGSI_OPCODE_ATOMUMIN:
3599 op = LLVMAtomicRMWBinOpUMin;
3600 break;
3601 case TGSI_OPCODE_ATOMUMAX:
3602 op = LLVMAtomicRMWBinOpUMax;
3603 break;
3604 case TGSI_OPCODE_ATOMIMIN:
3605 op = LLVMAtomicRMWBinOpMin;
3606 break;
3607 case TGSI_OPCODE_ATOMIMAX:
3608 op = LLVMAtomicRMWBinOpMax;
3609 break;
3610 default:
3611 unreachable("unknown atomic opcode");
3612 }
3613
3614 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
3615 LLVMAtomicOrderingSequentiallyConsistent,
3616 false);
3617 }
3618 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
3619 }
3620
/* Emit a TGSI atomic, dispatching between shared memory (plain LLVM
 * atomics) and buffer/image atomic intrinsics.
 */
static void atomic_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[40];
	LLVMValueRef tmp;

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		atomic_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* action->intr_name selects the operation, e.g. "add",
		 * "cmpswap". */
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
	} else {
		char coords_type[8];

		/* The image.atomic intrinsics are overloaded on the
		 * coordinate type; args[1] holds the coordinates. */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.atomic.%s.%s",
			 action->intr_name, coords_type);
	}

	/* The intrinsics return i32; bitcast to the float-typed TGSI
	 * destination. */
	tmp = lp_build_intrinsic(
		builder, intrinsic_name, bld_base->uint_bld.elem_type,
		emit_data->args, emit_data->arg_count,
		LLVMNoUnwindAttribute);
	emit_data->output[emit_data->chan] =
		LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
}
3659
/* Gather the arguments for RESQ (resource size query). SSBOs and image
 * buffers only need the descriptor; other images build the full
 * llvm.SI.getresinfo argument list.
 */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		/* Argument layout for llvm.SI.getresinfo.i32. */
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
3692
/* Emit RESQ: the size of an SSBO comes straight from the descriptor,
 * image buffers use get_buffer_size, and other images query the
 * resource info intrinsic.
 */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Element 2 of the SSBO descriptor is the size in bytes. */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			/* The layer count is in component z (index 2). */
			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
3729
/*
 * Finalize the argument list for a texture intrinsic.
 *
 * Packs the address words in "param" into a single power-of-two-sized
 * vector, then appends the resource descriptor, the sampler state (for
 * sampling opcodes only) and the immediate control words (dmask, unorm,
 * r128, da, glc, slc, tfe, lwe) in the order the llvm.SI.* image
 * intrinsics expect.
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF/TXQ return raw integer data and take no sampler state;
	 * everything else samples and returns floats. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	/* RECT textures use unnormalized coordinates. */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
							   tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
3775
3776 static const struct lp_build_tgsi_action tex_action;
3777
/* Which descriptor to load out of a combined view/sampler list slot
 * (see get_sampler_desc_custom for the layout). */
enum desc_type {
	DESC_IMAGE,	/* image/texture resource descriptor */
	DESC_FMASK,	/* FMASK descriptor for MSAA resources */
	DESC_SAMPLER	/* sampler state words */
};
3783
3784 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3785 {
3786 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3787 CONST_ADDR_SPACE);
3788 }
3789
/**
 * Load an image view, fmask view, or sampler state descriptor from the
 * given descriptor list.
 *
 * Judging from the index arithmetic below, each view/sampler slot spans
 * two list elements: the image descriptor in the first, the FMASK in the
 * second, and the 4-dword sampler state overlapping the tail of the slot
 * (addressed via a v4i32 view of the list).
 */
static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
					    LLVMValueRef list, LLVMValueRef index,
					    enum desc_type type)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	switch (type) {
	case DESC_IMAGE:
		/* The image is at [0:7]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		break;
	case DESC_FMASK:
		/* The FMASK is at [8:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
		break;
	case DESC_SAMPLER:
		/* The sampler state is at [12:15]; reinterpret the list as
		 * v4i32 elements so the 4-dword state is addressable. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
		list = LLVMBuildPointerCast(builder, list,
					    const_array(ctx->v4i32, 0), "");
		break;
	}

	return build_indexed_load_const(ctx, list, index);
}
3821
3822 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
3823 LLVMValueRef index, enum desc_type type)
3824 {
3825 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
3826 SI_PARAM_SAMPLERS);
3827
3828 return get_sampler_desc_custom(ctx, list, index, type);
3829 }
3830
3831 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
3832 *
3833 * SI-CI:
3834 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
3835 * filtering manually. The driver sets img7 to a mask clearing
3836 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
3837 * s_and_b32 samp0, samp0, img7
3838 *
3839 * VI:
3840 * The ANISO_OVERRIDE sampler field enables this fix in TA.
3841 */
3842 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
3843 LLVMValueRef res, LLVMValueRef samp)
3844 {
3845 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3846 LLVMValueRef img7, samp0;
3847
3848 if (ctx->screen->b.chip_class >= VI)
3849 return samp;
3850
3851 img7 = LLVMBuildExtractElement(builder, res,
3852 LLVMConstInt(ctx->i32, 7, 0), "");
3853 samp0 = LLVMBuildExtractElement(builder, samp,
3854 LLVMConstInt(ctx->i32, 0, 0), "");
3855 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
3856 return LLVMBuildInsertElement(builder, samp, samp0,
3857 LLVMConstInt(ctx->i32, 0, 0), "");
3858 }
3859
/*
 * Fetch the resource, sampler-state and (for MSAA targets) FMASK
 * descriptors for the sampler referenced by a texture instruction,
 * handling both directly and indirectly indexed samplers.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	/* The sampler is always the last source operand. */
	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Clamp the computed index so it cannot address past the
		 * sampler descriptor array. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			/* MSAA targets use FMASK instead of a sampler state. */
			*samp_ptr = NULL;
			*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			*fmask_ptr = NULL;
		}
	} else {
		/* Direct index: use the descriptors preloaded into the
		 * context at shader entry. */
		*res_ptr = ctx->sampler_views[sampler_index];
		*samp_ptr = ctx->sampler_states[sampler_index];
		*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
3900
/*
 * Gather all address words for a texture opcode and hand them to
 * set_tex_fetch_args.
 *
 * Packing order of the address vector: packed offsets, LOD bias,
 * depth-compare reference, user derivatives (TXD), coordinates, then
 * LOD/sample index. MSAA targets additionally remap the sample index
 * through the FMASK. TXQ and buffer textures take early-return
 * shortcuts with their own argument layout.
 */
static void tex_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef coords[5], derivs[6];
	LLVMValueRef address[16];
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
	unsigned count = 0;
	unsigned chan;
	unsigned num_deriv_channels = 0;
	bool has_offset = inst->Texture.NumOffsets > 0;
	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
	unsigned dmask = 0xf;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);

	if (opcode == TGSI_OPCODE_TXQ) {
		if (target == TGSI_TEXTURE_BUFFER) {
			/* Read the size from the buffer descriptor directly. */
			LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
			emit_data->args[0] = get_buffer_size(bld_base, res);
			return;
		}

		/* Textures - set the mip level. */
		address[count++] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

		set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
				   NULL, address, count, 0xf);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);

		/* Bitcast and truncate v8i32 to v16i8. */
		LLVMValueRef res = res_ptr;
		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
		res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");

		emit_data->dst_type = ctx->v4f32;
		emit_data->args[0] = res;
		emit_data->args[1] = bld_base->uint_bld.zero;
		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
		emit_data->arg_count = 3;
		return;
	}

	/* Fetch and project texture coordinates */
	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
	for (chan = 0; chan < 3; chan++ ) {
		coords[chan] = lp_build_emit_fetch(bld_base,
						   emit_data->inst, 0,
						   chan);
		/* TXP divides each coordinate by W. */
		if (opcode == TGSI_OPCODE_TXP)
			coords[chan] = lp_build_emit_llvm_binary(bld_base,
								 TGSI_OPCODE_DIV,
								 coords[chan],
								 coords[3]);
	}

	if (opcode == TGSI_OPCODE_TXP)
		coords[3] = bld_base->base.one;

	/* Pack offsets. */
	if (has_offset && opcode != TGSI_OPCODE_TXF) {
		/* The offsets are six-bit signed integers packed like this:
		 * X=[5:0], Y=[13:8], and Z=[21:16].
		 */
		LLVMValueRef offset[3], pack;

		assert(inst->Texture.NumOffsets == 1);

		for (chan = 0; chan < 3; chan++) {
			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
								     emit_data->inst, 0, chan);
			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
						    lp_build_const_int32(gallivm, 0x3f), "");
			if (chan)
				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
							    lp_build_const_int32(gallivm, chan*8), "");
		}

		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
		address[count++] = pack;
	}

	/* Pack LOD bias value */
	if (opcode == TGSI_OPCODE_TXB)
		address[count++] = coords[3];
	if (opcode == TGSI_OPCODE_TXB2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	/* Pack depth comparison value */
	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			/* Shadow cube arrays carry the reference in src1.x
			 * because all four coordinate channels are taken. */
			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
		} else {
			assert(ref_pos >= 0);
			address[count++] = coords[ref_pos];
		}
	}

	/* Pack user derivatives */
	if (opcode == TGSI_OPCODE_TXD) {
		int param, num_src_deriv_channels;

		switch (target) {
		case TGSI_TEXTURE_3D:
			num_src_deriv_channels = 3;
			num_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_RECT:
		case TGSI_TEXTURE_SHADOWRECT:
		case TGSI_TEXTURE_2D_ARRAY:
		case TGSI_TEXTURE_SHADOW2D_ARRAY:
			num_src_deriv_channels = 2;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_CUBE:
		case TGSI_TEXTURE_SHADOWCUBE:
		case TGSI_TEXTURE_CUBE_ARRAY:
		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
			/* Cube derivatives will be converted to 2D. */
			num_src_deriv_channels = 3;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_1D:
		case TGSI_TEXTURE_SHADOW1D:
		case TGSI_TEXTURE_1D_ARRAY:
		case TGSI_TEXTURE_SHADOW1D_ARRAY:
			num_src_deriv_channels = 1;
			num_deriv_channels = 1;
			break;
		default:
			unreachable("invalid target");
		}

		/* Derivatives come in src1 (ddx) and src2 (ddy). */
		for (param = 0; param < 2; param++)
			for (chan = 0; chan < num_src_deriv_channels; chan++)
				derivs[param * num_src_deriv_channels + chan] =
					lp_build_emit_fetch(bld_base, inst, param+1, chan);
	}

	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);

	if (opcode == TGSI_OPCODE_TXD)
		for (int i = 0; i < num_deriv_channels * 2; i++)
			address[count++] = derivs[i];

	/* Pack texture coordinates */
	address[count++] = coords[0];
	if (num_coords > 1)
		address[count++] = coords[1];
	if (num_coords > 2)
		address[count++] = coords[2];

	/* Pack LOD or sample index */
	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
		address[count++] = coords[3];
	else if (opcode == TGSI_OPCODE_TXL2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	if (count > 16) {
		assert(!"Cannot handle more than 16 texture address parameters");
		count = 16;
	}

	/* All address words must be i32 for the intrinsic. */
	for (chan = 0; chan < count; chan++ ) {
		address[chan] = LLVMBuildBitCast(gallivm->builder,
						 address[chan], ctx->i32, "");
	}

	/* Adjust the sample index according to FMASK.
	 *
	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * For example, 0x11111100 means there are only 2 samples stored and
	 * the second sample covers 3/4 of the pixel. When reading samples 0
	 * and 1, return physical sample 0 (determined by the first two 0s
	 * in FMASK), otherwise return physical sample 1.
	 *
	 * The sample index should be adjusted as follows:
	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
	 */
	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		struct lp_build_context *uint_bld = &bld_base->uint_bld;
		struct lp_build_emit_data txf_emit_data = *emit_data;
		LLVMValueRef txf_address[4];
		unsigned txf_count = count;
		struct tgsi_full_instruction inst = {};

		memcpy(txf_address, address, sizeof(txf_address));

		if (target == TGSI_TEXTURE_2D_MSAA) {
			txf_address[2] = bld_base->uint_bld.zero;
		}
		txf_address[3] = bld_base->uint_bld.zero;

		/* Read FMASK using TXF. */
		inst.Instruction.Opcode = TGSI_OPCODE_TXF;
		inst.Texture.Texture = target;
		txf_emit_data.inst = &inst;
		txf_emit_data.chan = 0;
		set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
				   target, fmask_ptr, NULL,
				   txf_address, txf_count, 0xf);
		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);

		/* Initialize some constants. */
		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);

		/* Apply the formula. */
		LLVMValueRef fmask =
			LLVMBuildExtractElement(gallivm->builder,
						txf_emit_data.output[0],
						uint_bld->zero, "");

		unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;

		LLVMValueRef sample_index4 =
			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");

		LLVMValueRef shifted_fmask =
			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");

		LLVMValueRef final_sample =
			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");

		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
		 * resource descriptor is 0 (invalid),
		 */
		LLVMValueRef fmask_desc =
			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
					 ctx->v8i32, "");

		LLVMValueRef fmask_word1 =
			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
						uint_bld->one, "");

		LLVMValueRef word1_is_nonzero =
			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
				      fmask_word1, uint_bld->zero, "");

		/* Replace the MSAA sample index. */
		address[sample_chan] =
			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
					final_sample, address[sample_chan], "");
	}

	if (opcode == TGSI_OPCODE_TXF) {
		/* add tex offsets */
		if (inst->Texture.NumOffsets) {
			struct lp_build_context *uint_bld = &bld_base->uint_bld;
			struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			assert(inst->Texture.NumOffsets == 1);

			/* TXF offsets are immediates; add the applicable
			 * components, falling through from higher to lower
			 * dimensionality. */
			switch (target) {
			case TGSI_TEXTURE_3D:
				address[2] = lp_build_add(uint_bld, address[2],
						bld->immediates[off->Index][off->SwizzleZ]);
				/* fall through */
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				address[1] =
					lp_build_add(uint_bld, address[1],
						bld->immediates[off->Index][off->SwizzleY]);
				/* fall through */
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				address[0] =
					lp_build_add(uint_bld, address[0],
						bld->immediates[off->Index][off->SwizzleX]);
				break;
				/* texture offsets do not apply to other texture targets */
			}
		}
	}

	if (opcode == TGSI_OPCODE_TG4) {
		unsigned gather_comp = 0;

		/* DMASK was repurposed for GATHER4. 4 components are always
		 * returned and DMASK works like a swizzle - it selects
		 * the component to fetch. The only valid DMASK values are
		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
		 * (red,red,red,red) etc.) The ISA document doesn't mention
		 * this.
		 */

		/* Get the component index from src1.x for Gather4. */
		if (!tgsi_is_shadow_target(target)) {
			LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
			LLVMValueRef comp_imm;
			struct tgsi_src_register src1 = inst->Src[1].Register;

			assert(src1.File == TGSI_FILE_IMMEDIATE);

			comp_imm = imms[src1.Index][src1.SwizzleX];
			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
			gather_comp = CLAMP(gather_comp, 0, 3);
		}

		dmask = 1 << gather_comp;
	}

	set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
			   samp_ptr, address, count, dmask);
}
4237
4238 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4239 struct lp_build_tgsi_context *bld_base,
4240 struct lp_build_emit_data *emit_data)
4241 {
4242 struct lp_build_context *base = &bld_base->base;
4243 unsigned opcode = emit_data->inst->Instruction.Opcode;
4244 unsigned target = emit_data->inst->Texture.Texture;
4245 char intr_name[127];
4246 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4247 bool is_shadow = tgsi_is_shadow_target(target);
4248 char type[64];
4249 const char *name = "llvm.SI.image.sample";
4250 const char *infix = "";
4251
4252 if (opcode == TGSI_OPCODE_TXQ && target == TGSI_TEXTURE_BUFFER) {
4253 /* Just return the buffer size. */
4254 emit_data->output[emit_data->chan] = emit_data->args[0];
4255 return;
4256 }
4257
4258 if (target == TGSI_TEXTURE_BUFFER) {
4259 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4260 base->gallivm->builder,
4261 "llvm.SI.vs.load.input", emit_data->dst_type,
4262 emit_data->args, emit_data->arg_count,
4263 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4264 return;
4265 }
4266
4267 switch (opcode) {
4268 case TGSI_OPCODE_TXF:
4269 name = target == TGSI_TEXTURE_2D_MSAA ||
4270 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4271 "llvm.SI.image.load" :
4272 "llvm.SI.image.load.mip";
4273 is_shadow = false;
4274 has_offset = false;
4275 break;
4276 case TGSI_OPCODE_TXQ:
4277 name = "llvm.SI.getresinfo";
4278 is_shadow = false;
4279 has_offset = false;
4280 break;
4281 case TGSI_OPCODE_LODQ:
4282 name = "llvm.SI.getlod";
4283 is_shadow = false;
4284 has_offset = false;
4285 break;
4286 case TGSI_OPCODE_TEX:
4287 case TGSI_OPCODE_TEX2:
4288 case TGSI_OPCODE_TXP:
4289 break;
4290 case TGSI_OPCODE_TXB:
4291 case TGSI_OPCODE_TXB2:
4292 infix = ".b";
4293 break;
4294 case TGSI_OPCODE_TXL:
4295 case TGSI_OPCODE_TXL2:
4296 infix = ".l";
4297 break;
4298 case TGSI_OPCODE_TXD:
4299 infix = ".d";
4300 break;
4301 case TGSI_OPCODE_TG4:
4302 name = "llvm.SI.gather4";
4303 break;
4304 default:
4305 assert(0);
4306 return;
4307 }
4308
4309 /* Add the type and suffixes .c, .o if needed. */
4310 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4311 sprintf(intr_name, "%s%s%s%s.%s",
4312 name, is_shadow ? ".c" : "", infix,
4313 has_offset ? ".o" : "", type);
4314
4315 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4316 base->gallivm->builder, intr_name, emit_data->dst_type,
4317 emit_data->args, emit_data->arg_count,
4318 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4319
4320 /* Divide the number of layers by 6 to get the number of cubes. */
4321 if (opcode == TGSI_OPCODE_TXQ &&
4322 (target == TGSI_TEXTURE_CUBE_ARRAY ||
4323 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)) {
4324 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4325 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4326 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4327
4328 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4329 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4330 z = LLVMBuildSDiv(builder, z, six, "");
4331
4332 emit_data->output[emit_data->chan] =
4333 LLVMBuildInsertElement(builder, v4, z, two, "");
4334 }
4335 }
4336
4337 static void si_llvm_emit_txqs(
4338 const struct lp_build_tgsi_action *action,
4339 struct lp_build_tgsi_context *bld_base,
4340 struct lp_build_emit_data *emit_data)
4341 {
4342 struct si_shader_context *ctx = si_shader_context(bld_base);
4343 struct gallivm_state *gallivm = bld_base->base.gallivm;
4344 LLVMBuilderRef builder = gallivm->builder;
4345 LLVMValueRef res, samples;
4346 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4347
4348 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4349
4350
4351 /* Read the samples from the descriptor directly. */
4352 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4353 samples = LLVMBuildExtractElement(
4354 builder, res,
4355 lp_build_const_int32(gallivm, 3), "");
4356 samples = LLVMBuildLShr(builder, samples,
4357 lp_build_const_int32(gallivm, 16), "");
4358 samples = LLVMBuildAnd(builder, samples,
4359 lp_build_const_int32(gallivm, 0xf), "");
4360 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4361 samples, "");
4362
4363 emit_data->output[emit_data->chan] = samples;
4364 }
4365
4366 /*
4367 * SI implements derivatives using the local data store (LDS)
4368 * All writes to the LDS happen in all executing threads at
4369 * the same time. TID is the Thread ID for the current
4370 * thread and is a value between 0 and 63, representing
4371 * the thread's position in the wavefront.
4372 *
4373 * For the pixel shader threads are grouped into quads of four pixels.
4374 * The TIDs of the pixels of a quad are:
4375 *
4376 * +------+------+
4377 * |4n + 0|4n + 1|
4378 * +------+------+
4379 * |4n + 2|4n + 3|
4380 * +------+------+
4381 *
4382 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4383 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4384 * the current pixel's column, and masking with 0xfffffffe yields the TID
4385 * of the left pixel of the current pixel's row.
4386 *
4387 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4388 * adding 2 yields the TID of the pixel below the top pixel.
4389 */
4390 /* masks for thread ID. */
4391 #define TID_MASK_TOP_LEFT 0xfffffffc
4392 #define TID_MASK_TOP 0xfffffffd
4393 #define TID_MASK_LEFT 0xfffffffe
4394
/*
 * Emit DDX/DDY (coarse and fine variants).
 *
 * Each thread's value is exchanged with its quad neighbors either
 * through the LDS (store at own slot, load the neighbors' slots) or,
 * with LLVM >= 3.9 on CHIP_TONGA and later, via the ds_bpermute
 * intrinsic, which reads another lane's value without a store.
 * The derivative is (right - left) for DDX and (bottom - top) for DDY.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Pick the mask that selects the quad's reference pixel (see the
	 * TID_MASK_* comment above). */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel, for DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* If an earlier channel used the same source swizzle,
		 * reuse its result instead of recomputing. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* The lane index is scaled by 4 for ds_bpermute
			 * (byte addressing — TODO confirm against ISA doc). */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Fallback path: exchange values through the LDS. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4484
/*
 * Takes an I,J coordinate pair and computes the X and Y derivatives
 * via the LDS. Returns a 4-element vector: DDX(I), DDX(J), DDY(I), DDY(J).
 */
static LLVMValueRef si_llvm_emit_ddxy_interp(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef interp_ij)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
	LLVMValueRef tl, tr, bl, result[4];
	unsigned c;

	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* TID of the left pixel of this pixel's row. */
	temp = LLVMBuildAnd(gallivm->builder, indices[1],
			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");

	/* TID of the top pixel of this pixel's column. */
	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");

	indices[1] = temp;
	load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	indices[1] = temp2;
	load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	/* Pixel to the right of the left pixel (+1). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
				  lp_build_const_int32(gallivm, 1), "");
	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
				    indices, 2, "");

	/* Pixel below the top pixel (+2). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
				  lp_build_const_int32(gallivm, 2), "");
	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
				    indices, 2, "");

	for (c = 0; c < 2; ++c) {
		LLVMValueRef store_val;
		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);

		/* Publish this thread's I (c==0) or J (c==1) value. */
		store_val = LLVMBuildExtractElement(gallivm->builder,
						    interp_ij, c_ll, "");
		LLVMBuildStore(gallivm->builder,
			       store_val,
			       store_ptr);

		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
		tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");

		/* DDX = right - left. */
		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");

		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
		bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");

		/* DDY = bottom - top. */
		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
	}

	return lp_build_gather_values(gallivm, result, 4);
}
4559
/*
 * Fetch the extra sources of INTERP_OFFSET/INTERP_SAMPLE into
 * emit_data->args[0..1] as an (x, y) offset.
 *
 * INTERP_OFFSET passes the offset through directly; INTERP_SAMPLE looks
 * up the sample's position and re-centers it around the pixel center
 * by subtracting 0.5.
 */
static void interp_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
		/* offset is in second src, first two channels */
		emit_data->args[0] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_X);
		emit_data->args[1] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_Y);
		emit_data->arg_count = 2;
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef sample_position;
		LLVMValueRef sample_id;
		LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);

		/* fetch sample ID, then fetch its sample position,
		 * and place into first two channels.
		 */
		sample_id = lp_build_emit_fetch(bld_base,
						emit_data->inst, 1, TGSI_CHAN_X);
		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
					     ctx->i32, "");
		sample_position = load_sample_position(&ctx->radeon_bld, sample_id);

		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     lp_build_const_int32(gallivm, 0), "");

		/* Make the position relative to the pixel center. */
		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     lp_build_const_int32(gallivm, 1), "");
		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
		emit_data->arg_count = 2;
	}
}
4603
4604 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
4605 struct lp_build_tgsi_context *bld_base,
4606 struct lp_build_emit_data *emit_data)
4607 {
4608 struct si_shader_context *ctx = si_shader_context(bld_base);
4609 struct si_shader *shader = ctx->shader;
4610 struct gallivm_state *gallivm = bld_base->base.gallivm;
4611 LLVMValueRef interp_param;
4612 const struct tgsi_full_instruction *inst = emit_data->inst;
4613 const char *intr_name;
4614 int input_index = inst->Src[0].Register.Index;
4615 int chan;
4616 int i;
4617 LLVMValueRef attr_number;
4618 LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
4619 int interp_param_idx;
4620 unsigned interp = shader->selector->info.input_interpolate[input_index];
4621 unsigned location;
4622
4623 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
4624
4625 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4626 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
4627 location = TGSI_INTERPOLATE_LOC_CENTER;
4628 else
4629 location = TGSI_INTERPOLATE_LOC_CENTROID;
4630
4631 interp_param_idx = lookup_interp_param_index(interp, location);
4632 if (interp_param_idx == -1)
4633 return;
4634 else if (interp_param_idx)
4635 interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
4636 else
4637 interp_param = NULL;
4638
4639 attr_number = lp_build_const_int32(gallivm, input_index);
4640
4641 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4642 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4643 LLVMValueRef ij_out[2];
4644 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
4645
4646 /*
4647 * take the I then J parameters, and the DDX/Y for it, and
4648 * calculate the IJ inputs for the interpolator.
4649 * temp1 = ddx * offset/sample.x + I;
4650 * interp_param.I = ddy * offset/sample.y + temp1;
4651 * temp1 = ddx * offset/sample.x + J;
4652 * interp_param.J = ddy * offset/sample.y + temp1;
4653 */
4654 for (i = 0; i < 2; i++) {
4655 LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
4656 LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
4657 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
4658 ddxy_out, ix_ll, "");
4659 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
4660 ddxy_out, iy_ll, "");
4661 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
4662 interp_param, ix_ll, "");
4663 LLVMValueRef temp1, temp2;
4664
4665 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
4666 ctx->f32, "");
4667
4668 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
4669
4670 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
4671
4672 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
4673
4674 temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
4675
4676 ij_out[i] = LLVMBuildBitCast(gallivm->builder,
4677 temp2, ctx->i32, "");
4678 }
4679 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
4680 }
4681
4682 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
4683 for (chan = 0; chan < 2; chan++) {
4684 LLVMValueRef args[4];
4685 LLVMValueRef llvm_chan;
4686 unsigned schan;
4687
4688 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
4689 llvm_chan = lp_build_const_int32(gallivm, schan);
4690
4691 args[0] = llvm_chan;
4692 args[1] = attr_number;
4693 args[2] = params;
4694 args[3] = interp_param;
4695
4696 emit_data->output[chan] =
4697 lp_build_intrinsic(gallivm->builder, intr_name,
4698 ctx->f32, args, args[3] ? 4 : 3,
4699 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4700 }
4701 }
4702
4703 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4704 struct lp_build_emit_data *emit_data)
4705 {
4706 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4707 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4708 unsigned stream;
4709
4710 assert(src0.File == TGSI_FILE_IMMEDIATE);
4711
4712 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
4713 return stream;
4714 }
4715
4716 /* Emit one vertex from the geometry shader */
4717 static void si_llvm_emit_vertex(
4718 const struct lp_build_tgsi_action *action,
4719 struct lp_build_tgsi_context *bld_base,
4720 struct lp_build_emit_data *emit_data)
4721 {
4722 struct si_shader_context *ctx = si_shader_context(bld_base);
4723 struct lp_build_context *uint = &bld_base->uint_bld;
4724 struct si_shader *shader = ctx->shader;
4725 struct tgsi_shader_info *info = &shader->selector->info;
4726 struct gallivm_state *gallivm = bld_base->base.gallivm;
4727 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
4728 SI_PARAM_GS2VS_OFFSET);
4729 LLVMValueRef gs_next_vertex;
4730 LLVMValueRef can_emit, kill;
4731 LLVMValueRef args[2];
4732 unsigned chan;
4733 int i;
4734 unsigned stream;
4735
4736 stream = si_llvm_get_stream(bld_base, emit_data);
4737
4738 /* Write vertex attribute values to GSVS ring */
4739 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
4740 ctx->gs_next_vertex[stream],
4741 "");
4742
4743 /* If this thread has already emitted the declared maximum number of
4744 * vertices, kill it: excessive vertex emissions are not supposed to
4745 * have any effect, and GS threads have no externally observable
4746 * effects other than emitting vertices.
4747 */
4748 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
4749 lp_build_const_int32(gallivm,
4750 shader->selector->gs_max_out_vertices), "");
4751 kill = lp_build_select(&bld_base->base, can_emit,
4752 lp_build_const_float(gallivm, 1.0f),
4753 lp_build_const_float(gallivm, -1.0f));
4754
4755 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
4756 ctx->voidt, &kill, 1, 0);
4757
4758 for (i = 0; i < info->num_outputs; i++) {
4759 LLVMValueRef *out_ptr =
4760 ctx->radeon_bld.soa.outputs[i];
4761
4762 for (chan = 0; chan < 4; chan++) {
4763 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
4764 LLVMValueRef voffset =
4765 lp_build_const_int32(gallivm, (i * 4 + chan) *
4766 shader->selector->gs_max_out_vertices);
4767
4768 voffset = lp_build_add(uint, voffset, gs_next_vertex);
4769 voffset = lp_build_mul_imm(uint, voffset, 4);
4770
4771 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
4772
4773 build_tbuffer_store(ctx,
4774 ctx->gsvs_ring[stream],
4775 out_val, 1,
4776 voffset, soffset, 0,
4777 V_008F0C_BUF_DATA_FORMAT_32,
4778 V_008F0C_BUF_NUM_FORMAT_UINT,
4779 1, 0, 1, 1, 0);
4780 }
4781 }
4782 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
4783 lp_build_const_int32(gallivm, 1));
4784
4785 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4786
4787 /* Signal vertex emission */
4788 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
4789 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
4790 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
4791 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
4792 }
4793
4794 /* Cut one primitive from the geometry shader */
4795 static void si_llvm_emit_primitive(
4796 const struct lp_build_tgsi_action *action,
4797 struct lp_build_tgsi_context *bld_base,
4798 struct lp_build_emit_data *emit_data)
4799 {
4800 struct si_shader_context *ctx = si_shader_context(bld_base);
4801 struct gallivm_state *gallivm = bld_base->base.gallivm;
4802 LLVMValueRef args[2];
4803 unsigned stream;
4804
4805 /* Signal primitive cut */
4806 stream = si_llvm_get_stream(bld_base, emit_data);
4807 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
4808 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
4809 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
4810 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
4811 }
4812
4813 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4814 struct lp_build_tgsi_context *bld_base,
4815 struct lp_build_emit_data *emit_data)
4816 {
4817 struct si_shader_context *ctx = si_shader_context(bld_base);
4818 struct gallivm_state *gallivm = bld_base->base.gallivm;
4819
4820 /* The real barrier instruction isn’t needed, because an entire patch
4821 * always fits into a single wave.
4822 */
4823 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
4824 emit_optimization_barrier(ctx);
4825 return;
4826 }
4827
4828 lp_build_intrinsic(gallivm->builder,
4829 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
4830 : "llvm.AMDGPU.barrier.local",
4831 ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
4832 }
4833
/* Action table entry for texture opcodes: fetch the arguments, then
 * build the image-sample/fetch intrinsic.
 */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
4838
/* Action table entry for INTERP_CENTROID/SAMPLE/OFFSET opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4843
4844 static void si_create_function(struct si_shader_context *ctx,
4845 LLVMTypeRef *returns, unsigned num_returns,
4846 LLVMTypeRef *params, unsigned num_params,
4847 int last_array_pointer, int last_sgpr)
4848 {
4849 int i;
4850
4851 radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
4852 params, num_params);
4853 radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
4854 ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
4855
4856 for (i = 0; i <= last_sgpr; ++i) {
4857 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
4858
4859 /* We tell llvm that array inputs are passed by value to allow Sinking pass
4860 * to move load. Inputs are constant so this is fine. */
4861 if (i <= last_array_pointer)
4862 LLVMAddAttribute(P, LLVMByValAttribute);
4863 else
4864 LLVMAddAttribute(P, LLVMInRegAttribute);
4865 }
4866 }
4867
4868 static void create_meta_data(struct si_shader_context *ctx)
4869 {
4870 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
4871 LLVMValueRef args[3];
4872
4873 args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
4874 args[1] = 0;
4875 args[2] = lp_build_const_int32(gallivm, 1);
4876
4877 ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
4878 }
4879
4880 static void declare_streamout_params(struct si_shader_context *ctx,
4881 struct pipe_stream_output_info *so,
4882 LLVMTypeRef *params, LLVMTypeRef i32,
4883 unsigned *num_params)
4884 {
4885 int i;
4886
4887 /* Streamout SGPRs. */
4888 if (so->num_outputs) {
4889 params[ctx->param_streamout_config = (*num_params)++] = i32;
4890 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
4891 }
4892 /* A streamout buffer offset is loaded if the stride is non-zero. */
4893 for (i = 0; i < 4; i++) {
4894 if (!so->stride[i])
4895 continue;
4896
4897 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
4898 }
4899 }
4900
4901 static unsigned llvm_get_type_size(LLVMTypeRef type)
4902 {
4903 LLVMTypeKind kind = LLVMGetTypeKind(type);
4904
4905 switch (kind) {
4906 case LLVMIntegerTypeKind:
4907 return LLVMGetIntTypeWidth(type) / 8;
4908 case LLVMFloatTypeKind:
4909 return 4;
4910 case LLVMPointerTypeKind:
4911 return 8;
4912 case LLVMVectorTypeKind:
4913 return LLVMGetVectorSize(type) *
4914 llvm_get_type_size(LLVMGetElementType(type));
4915 default:
4916 assert(0);
4917 return 0;
4918 }
4919 }
4920
4921 static void declare_tess_lds(struct si_shader_context *ctx)
4922 {
4923 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4924 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
4925
4926 /* This is the upper bound, maximum is 32 inputs times 32 vertices */
4927 unsigned vertex_data_dw_size = 32*32*4;
4928 unsigned patch_data_dw_size = 32*4;
4929 /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
4930 unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
4931 unsigned lds_dwords = patch_dw_size;
4932
4933 /* The actual size is computed outside of the shader to reduce
4934 * the number of shader variants. */
4935 ctx->lds =
4936 LLVMAddGlobalInAddressSpace(gallivm->module,
4937 LLVMArrayType(i32, lds_dwords),
4938 "tess_lds",
4939 LOCAL_ADDR_SPACE);
4940 }
4941
/* Build the LLVM function signature for the current shader stage.
 *
 * SGPR inputs come first (the shared descriptor arrays, then per-stage
 * scalar SGPRs up to last_sgpr), followed by the stage's VGPR system
 * values. Non-monolithic shader parts additionally declare return
 * values that are handed to the epilog part. Also declares any LDS
 * the stage needs (ddxy scratch, tessellation data).
 */
static void create_function(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
	LLVMTypeRef returns[16+32*4];
	unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
	unsigned num_returns = 0;

	v3i32 = LLVMVectorType(ctx->i32, 3);

	/* Descriptor-array pointers common to all stages. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
	params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
	last_array_pointer = SI_PARAM_SHADER_BUFFERS;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
		last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
		params[SI_PARAM_BASE_VERTEX] = ctx->i32;
		params[SI_PARAM_START_INSTANCE] = ctx->i32;
		num_params = SI_PARAM_START_INSTANCE+1;

		if (shader->key.vs.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else if (shader->key.vs.as_ls) {
			params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
		} else {
			if (ctx->is_gs_copy_shader) {
				last_array_pointer = SI_PARAM_RW_BUFFERS;
				num_params = SI_PARAM_RW_BUFFERS+1;
			} else {
				params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
				num_params = SI_PARAM_VS_STATE_BITS+1;
			}

			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}

		last_sgpr = num_params-1;

		/* VGPRs */
		params[ctx->param_vertex_id = num_params++] = ctx->i32;
		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_instance_id = num_params++] = ctx->i32;

		if (!ctx->is_monolithic &&
		    !ctx->is_gs_copy_shader) {
			/* Vertex load indices. */
			ctx->param_vertex_index0 = num_params;

			for (i = 0; i < shader->selector->info.num_inputs; i++)
				params[num_params++] = ctx->i32;

			/* PrimitiveID output. */
			if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
					returns[num_returns++] = ctx->f32;
		}
		break;

	case PIPE_SHADER_TESS_CTRL:
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
		params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;

		/* VGPRs */
		params[SI_PARAM_PATCH_ID] = ctx->i32;
		params[SI_PARAM_REL_IDS] = ctx->i32;
		num_params = SI_PARAM_REL_IDS+1;

		if (!ctx->is_monolithic) {
			/* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
			for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */

			for (i = 0; i < 3; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		num_params = SI_PARAM_TCS_OUT_LAYOUT+1;

		if (shader->key.tes.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else {
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_tes_u = num_params++] = ctx->f32;
		params[ctx->param_tes_v = num_params++] = ctx->f32;
		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;

		/* PrimitiveID output. */
		if (!ctx->is_monolithic && !shader->key.tes.as_es)
			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
				returns[num_returns++] = ctx->f32;
		break;

	case PIPE_SHADER_GEOMETRY:
		params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
		last_sgpr = SI_PARAM_GS_WAVE_ID;

		/* VGPRs */
		params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
		params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
		params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
		num_params = SI_PARAM_GS_INSTANCE_ID+1;
		break;

	case PIPE_SHADER_FRAGMENT:
		params[SI_PARAM_ALPHA_REF] = ctx->f32;
		params[SI_PARAM_PRIM_MASK] = ctx->i32;
		last_sgpr = SI_PARAM_PRIM_MASK;
		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
		params[SI_PARAM_FRONT_FACE] = ctx->i32;
		params[SI_PARAM_ANCILLARY] = ctx->i32;
		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
		num_params = SI_PARAM_POS_FIXED_PT+1;

		if (!ctx->is_monolithic) {
			/* Color inputs from the prolog. */
			if (shader->selector->info.colors_read) {
				unsigned num_color_elements =
					util_bitcount(shader->selector->info.colors_read);

				assert(num_params + num_color_elements <= ARRAY_SIZE(params));
				for (i = 0; i < num_color_elements; i++)
					params[num_params++] = ctx->f32;
			}

			/* Outputs for the epilog. */
			num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
			num_returns =
				num_return_sgprs +
				util_bitcount(shader->selector->info.colors_written) * 4 +
				shader->selector->info.writes_z +
				shader->selector->info.writes_stencil +
				shader->selector->info.writes_samplemask +
				1 /* SampleMaskIn */;

			num_returns = MAX2(num_returns,
					   num_return_sgprs +
					   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

			for (i = 0; i < num_return_sgprs; i++)
				returns[i] = ctx->i32;
			for (; i < num_returns; i++)
				returns[i] = ctx->f32;
		}
		break;

	case PIPE_SHADER_COMPUTE:
		params[SI_PARAM_GRID_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_ID] = v3i32;
		last_sgpr = SI_PARAM_BLOCK_ID;

		params[SI_PARAM_THREAD_ID] = v3i32;
		num_params = SI_PARAM_THREAD_ID + 1;
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	assert(num_params <= Elements(params));

	si_create_function(ctx, returns, num_returns, params,
			   num_params, last_array_pointer, last_sgpr);

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT &&
	    !ctx->is_monolithic) {
		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "InitialPSInputAddr",
					  S_0286D0_PERSP_SAMPLE_ENA(1) |
					  S_0286D0_PERSP_CENTER_ENA(1) |
					  S_0286D0_PERSP_CENTROID_ENA(1) |
					  S_0286D0_LINEAR_SAMPLE_ENA(1) |
					  S_0286D0_LINEAR_CENTER_ENA(1) |
					  S_0286D0_LINEAR_CENTROID_ENA(1) |
					  S_0286D0_FRONT_FACE_ENA(1) |
					  S_0286D0_POS_FIXED_PT_ENA(1));
	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
		const unsigned *properties = shader->selector->info.properties;
		unsigned max_work_group_size =
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

		assert(max_work_group_size);

		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "amdgpu-max-work-group-size",
					  max_work_group_size);
	}

	/* Count the dwords of all declared inputs; each 4 bytes is one
	 * SGPR/VGPR.
	 */
	shader->info.num_input_sgprs = 0;
	shader->info.num_input_vgprs = 0;

	for (i = 0; i <= last_sgpr; ++i)
		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;

	/* Unused fragment shader inputs are eliminated by the compiler,
	 * so we don't know yet how many there will be.
	 */
	if (ctx->type != PIPE_SHADER_FRAGMENT)
		for (; i < num_params; ++i)
			shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;

	/* Scratch LDS used by the DDX/DDY and interp-derivative lowering. */
	if (bld_base->info &&
	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
		ctx->lds =
			LLVMAddGlobalInAddressSpace(gallivm->module,
						    LLVMArrayType(ctx->i32, 64),
						    "ddxy_lds",
						    LOCAL_ADDR_SPACE);

	if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
	    ctx->type == PIPE_SHADER_TESS_CTRL ||
	    ctx->type == PIPE_SHADER_TESS_EVAL)
		declare_tess_lds(ctx);
}
5205
5206 static void preload_constants(struct si_shader_context *ctx)
5207 {
5208 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5209 struct gallivm_state *gallivm = bld_base->base.gallivm;
5210 const struct tgsi_shader_info *info = bld_base->info;
5211 unsigned buf;
5212 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5213
5214 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5215 unsigned i, num_const = info->const_file_max[buf] + 1;
5216
5217 if (num_const == 0)
5218 continue;
5219
5220 /* Allocate space for the constant values */
5221 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5222
5223 /* Load the resource descriptor */
5224 ctx->const_buffers[buf] =
5225 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5226
5227 /* Load the constants, we rely on the code sinking to do the rest */
5228 for (i = 0; i < num_const * 4; ++i) {
5229 ctx->constants[buf][i] =
5230 buffer_load_const(gallivm->builder,
5231 ctx->const_buffers[buf],
5232 lp_build_const_int32(gallivm, i * 4),
5233 ctx->f32);
5234 }
5235 }
5236 }
5237
5238 static void preload_shader_buffers(struct si_shader_context *ctx)
5239 {
5240 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5241 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5242 int buf, maxbuf;
5243
5244 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5245 SI_NUM_SHADER_BUFFERS - 1);
5246 for (buf = 0; buf <= maxbuf; ++buf) {
5247 ctx->shader_buffers[buf] =
5248 build_indexed_load_const(
5249 ctx, ptr, lp_build_const_int32(gallivm, buf));
5250 }
5251 }
5252
5253 static void preload_samplers(struct si_shader_context *ctx)
5254 {
5255 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5256 struct gallivm_state *gallivm = bld_base->base.gallivm;
5257 const struct tgsi_shader_info *info = bld_base->info;
5258 unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
5259 LLVMValueRef offset;
5260
5261 if (num_samplers == 0)
5262 return;
5263
5264 /* Load the resources and samplers, we rely on the code sinking to do the rest */
5265 for (i = 0; i < num_samplers; ++i) {
5266 /* Resource */
5267 offset = lp_build_const_int32(gallivm, i);
5268 ctx->sampler_views[i] =
5269 get_sampler_desc(ctx, offset, DESC_IMAGE);
5270
5271 /* FMASK resource */
5272 if (info->is_msaa_sampler[i])
5273 ctx->fmasks[i] =
5274 get_sampler_desc(ctx, offset, DESC_FMASK);
5275 else {
5276 ctx->sampler_states[i] =
5277 get_sampler_desc(ctx, offset, DESC_SAMPLER);
5278 ctx->sampler_states[i] =
5279 sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
5280 ctx->sampler_states[i]);
5281 }
5282 }
5283 }
5284
5285 static void preload_images(struct si_shader_context *ctx)
5286 {
5287 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5288 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5289 struct gallivm_state *gallivm = bld_base->base.gallivm;
5290 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5291 LLVMValueRef res_ptr;
5292 unsigned i;
5293
5294 if (num_images == 0)
5295 return;
5296
5297 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5298
5299 for (i = 0; i < num_images; ++i) {
5300 /* Rely on LLVM to shrink the load for buffer resources. */
5301 LLVMValueRef rsrc =
5302 build_indexed_load_const(ctx, res_ptr,
5303 lp_build_const_int32(gallivm, i));
5304
5305 if (info->images_writemask & (1 << i) &&
5306 !(info->images_buffers & (1 << i)))
5307 rsrc = force_dcc_off(ctx, rsrc);
5308
5309 ctx->images[i] = rsrc;
5310 }
5311 }
5312
5313 static void preload_streamout_buffers(struct si_shader_context *ctx)
5314 {
5315 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5316 struct gallivm_state *gallivm = bld_base->base.gallivm;
5317 unsigned i;
5318
5319 /* Streamout can only be used if the shader is compiled as VS. */
5320 if (!ctx->shader->selector->so.num_outputs ||
5321 (ctx->type == PIPE_SHADER_VERTEX &&
5322 (ctx->shader->key.vs.as_es ||
5323 ctx->shader->key.vs.as_ls)) ||
5324 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5325 ctx->shader->key.tes.as_es))
5326 return;
5327
5328 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5329 SI_PARAM_RW_BUFFERS);
5330
5331 /* Load the resources, we rely on the code sinking to do the rest */
5332 for (i = 0; i < 4; ++i) {
5333 if (ctx->shader->selector->so.stride[i]) {
5334 LLVMValueRef offset = lp_build_const_int32(gallivm,
5335 SI_VS_STREAMOUT_BUF0 + i);
5336
5337 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5338 }
5339 }
5340 }
5341
5342 /**
5343 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5344 * for later use.
5345 */
5346 static void preload_ring_buffers(struct si_shader_context *ctx)
5347 {
5348 struct gallivm_state *gallivm =
5349 ctx->radeon_bld.soa.bld_base.base.gallivm;
5350
5351 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5352 SI_PARAM_RW_BUFFERS);
5353
5354 if ((ctx->type == PIPE_SHADER_VERTEX &&
5355 ctx->shader->key.vs.as_es) ||
5356 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5357 ctx->shader->key.tes.as_es) ||
5358 ctx->type == PIPE_SHADER_GEOMETRY) {
5359 unsigned ring =
5360 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5361 : SI_ES_RING_ESGS;
5362 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5363
5364 ctx->esgs_ring =
5365 build_indexed_load_const(ctx, buf_ptr, offset);
5366 }
5367
5368 if (ctx->is_gs_copy_shader) {
5369 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5370
5371 ctx->gsvs_ring[0] =
5372 build_indexed_load_const(ctx, buf_ptr, offset);
5373 }
5374 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5375 int i;
5376 for (i = 0; i < 4; i++) {
5377 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5378
5379 ctx->gsvs_ring[i] =
5380 build_indexed_load_const(ctx, buf_ptr, offset);
5381 }
5382 }
5383 }
5384
/* Emit code that kills the fragment when its bit in the 32x32 polygon
 * stipple pattern is zero.
 *
 * param_rw_buffers: LLVMValueRef of the RW-buffer descriptor array pointer
 * param_pos_fixed_pt: function parameter index of the packed fixed-point
 *                     position (X in bits 0-15, Y in bits 16-31)
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_rw_buffers,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef slot, desc, offset, row, bit, address[2];

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the buffer descriptor. */
	slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
	desc = build_indexed_load_const(ctx, param_rw_buffers, slot);

	/* The stipple pattern is 32x32, each row has 32 bits. */
	offset = LLVMBuildMul(builder, address[1],
			      LLVMConstInt(ctx->i32, 4, 0), "");
	row = buffer_load_const(builder, desc, offset, ctx->i32);
	bit = LLVMBuildLShr(builder, row, address[0], "");
	bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");

	/* The intrinsic kills the thread if arg < 0.
	 * bit set -> 0.0 (keep), bit clear -> -1.0 (kill).
	 */
	bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
			      LLVMConstReal(ctx->f32, -1), "");
	lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
}
5418
5419 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5420 struct si_shader_config *conf,
5421 unsigned symbol_offset)
5422 {
5423 unsigned i;
5424 const unsigned char *config =
5425 radeon_shader_binary_config_start(binary, symbol_offset);
5426
5427 /* XXX: We may be able to emit some of these values directly rather than
5428 * extracting fields to be emitted later.
5429 */
5430
5431 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5432 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5433 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5434 switch (reg) {
5435 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5436 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5437 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5438 case R_00B848_COMPUTE_PGM_RSRC1:
5439 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5440 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5441 conf->float_mode = G_00B028_FLOAT_MODE(value);
5442 conf->rsrc1 = value;
5443 break;
5444 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5445 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5446 break;
5447 case R_00B84C_COMPUTE_PGM_RSRC2:
5448 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5449 conf->rsrc2 = value;
5450 break;
5451 case R_0286CC_SPI_PS_INPUT_ENA:
5452 conf->spi_ps_input_ena = value;
5453 break;
5454 case R_0286D0_SPI_PS_INPUT_ADDR:
5455 conf->spi_ps_input_addr = value;
5456 break;
5457 case R_0286E8_SPI_TMPRING_SIZE:
5458 case R_00B860_COMPUTE_TMPRING_SIZE:
5459 /* WAVESIZE is in units of 256 dwords. */
5460 conf->scratch_bytes_per_wave =
5461 G_00B860_WAVESIZE(value) * 256 * 4 * 1;
5462 break;
5463 default:
5464 {
5465 static bool printed;
5466
5467 if (!printed) {
5468 fprintf(stderr, "Warning: LLVM emitted unknown "
5469 "config register: 0x%x\n", reg);
5470 printed = true;
5471 }
5472 }
5473 break;
5474 }
5475
5476 if (!conf->spi_ps_input_addr)
5477 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5478 }
5479 }
5480
5481 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5482 struct si_shader *shader,
5483 struct si_shader_config *config,
5484 uint64_t scratch_va)
5485 {
5486 unsigned i;
5487 uint32_t scratch_rsrc_dword0 = scratch_va;
5488 uint32_t scratch_rsrc_dword1 =
5489 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
5490 | S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5491
5492 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5493 const struct radeon_shader_reloc *reloc =
5494 &shader->binary.relocs[i];
5495 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5496 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5497 &scratch_rsrc_dword0, 4);
5498 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5499 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5500 &scratch_rsrc_dword1, 4);
5501 }
5502 }
5503 }
5504
5505 static unsigned si_get_shader_binary_size(struct si_shader *shader)
5506 {
5507 unsigned size = shader->binary.code_size;
5508
5509 if (shader->prolog)
5510 size += shader->prolog->binary.code_size;
5511 if (shader->epilog)
5512 size += shader->epilog->binary.code_size;
5513 return size;
5514 }
5515
5516 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5517 {
5518 const struct radeon_shader_binary *prolog =
5519 shader->prolog ? &shader->prolog->binary : NULL;
5520 const struct radeon_shader_binary *epilog =
5521 shader->epilog ? &shader->epilog->binary : NULL;
5522 const struct radeon_shader_binary *mainb = &shader->binary;
5523 unsigned bo_size = si_get_shader_binary_size(shader) +
5524 (!epilog ? mainb->rodata_size : 0);
5525 unsigned char *ptr;
5526
5527 assert(!prolog || !prolog->rodata_size);
5528 assert((!prolog && !epilog) || !mainb->rodata_size);
5529 assert(!epilog || !epilog->rodata_size);
5530
5531 r600_resource_reference(&shader->bo, NULL);
5532 shader->bo = si_resource_create_custom(&sscreen->b.b,
5533 PIPE_USAGE_IMMUTABLE,
5534 bo_size);
5535 if (!shader->bo)
5536 return -ENOMEM;
5537
5538 /* Upload. */
5539 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
5540 PIPE_TRANSFER_READ_WRITE);
5541
5542 if (prolog) {
5543 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
5544 ptr += prolog->code_size;
5545 }
5546
5547 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
5548 ptr += mainb->code_size;
5549
5550 if (epilog)
5551 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
5552 else if (mainb->rodata_size > 0)
5553 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
5554
5555 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5556 return 0;
5557 }
5558
5559 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
5560 struct pipe_debug_callback *debug,
5561 const char *name, FILE *file)
5562 {
5563 char *line, *p;
5564 unsigned i, count;
5565
5566 if (binary->disasm_string) {
5567 fprintf(file, "Shader %s disassembly:\n", name);
5568 fprintf(file, "%s", binary->disasm_string);
5569
5570 if (debug && debug->debug_message) {
5571 /* Very long debug messages are cut off, so send the
5572 * disassembly one line at a time. This causes more
5573 * overhead, but on the plus side it simplifies
5574 * parsing of resulting logs.
5575 */
5576 pipe_debug_message(debug, SHADER_INFO,
5577 "Shader Disassembly Begin");
5578
5579 line = binary->disasm_string;
5580 while (*line) {
5581 p = util_strchrnul(line, '\n');
5582 count = p - line;
5583
5584 if (count) {
5585 pipe_debug_message(debug, SHADER_INFO,
5586 "%.*s", count, line);
5587 }
5588
5589 if (!*p)
5590 break;
5591 line = p + 1;
5592 }
5593
5594 pipe_debug_message(debug, SHADER_INFO,
5595 "Shader Disassembly End");
5596 }
5597 } else {
5598 fprintf(file, "Shader %s binary:\n", name);
5599 for (i = 0; i < binary->code_size; i += 4) {
5600 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5601 binary->code[i + 3], binary->code[i + 2],
5602 binary->code[i + 1], binary->code[i]);
5603 }
5604 }
5605 }
5606
5607 static void si_shader_dump_stats(struct si_screen *sscreen,
5608 struct si_shader_config *conf,
5609 unsigned num_inputs,
5610 unsigned code_size,
5611 struct pipe_debug_callback *debug,
5612 unsigned processor,
5613 FILE *file)
5614 {
5615 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
5616 unsigned lds_per_wave = 0;
5617 unsigned max_simd_waves = 10;
5618
5619 /* Compute LDS usage for PS. */
5620 if (processor == PIPE_SHADER_FRAGMENT) {
5621 /* The minimum usage per wave is (num_inputs * 36). The maximum
5622 * usage is (num_inputs * 36 * 16).
5623 * We can get anything in between and it varies between waves.
5624 *
5625 * Other stages don't know the size at compile time or don't
5626 * allocate LDS per wave, but instead they do it per thread group.
5627 */
5628 lds_per_wave = conf->lds_size * lds_increment +
5629 align(num_inputs * 36, lds_increment);
5630 }
5631
5632 /* Compute the per-SIMD wave counts. */
5633 if (conf->num_sgprs) {
5634 if (sscreen->b.chip_class >= VI)
5635 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
5636 else
5637 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
5638 }
5639
5640 if (conf->num_vgprs)
5641 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5642
5643 /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
5644 * that PS can use.
5645 */
5646 if (lds_per_wave)
5647 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5648
5649 if (file != stderr ||
5650 r600_can_dump_shader(&sscreen->b, processor)) {
5651 if (processor == PIPE_SHADER_FRAGMENT) {
5652 fprintf(file, "*** SHADER CONFIG ***\n"
5653 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5654 "SPI_PS_INPUT_ENA = 0x%04x\n",
5655 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5656 }
5657
5658 fprintf(file, "*** SHADER STATS ***\n"
5659 "SGPRS: %d\n"
5660 "VGPRS: %d\n"
5661 "Code Size: %d bytes\n"
5662 "LDS: %d blocks\n"
5663 "Scratch: %d bytes per wave\n"
5664 "Max Waves: %d\n"
5665 "********************\n",
5666 conf->num_sgprs, conf->num_vgprs, code_size,
5667 conf->lds_size, conf->scratch_bytes_per_wave,
5668 max_simd_waves);
5669 }
5670
5671 pipe_debug_message(debug, SHADER_INFO,
5672 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5673 "LDS: %d Scratch: %d Max Waves: %d",
5674 conf->num_sgprs, conf->num_vgprs, code_size,
5675 conf->lds_size, conf->scratch_bytes_per_wave,
5676 max_simd_waves);
5677 }
5678
5679 static const char *si_get_shader_name(struct si_shader *shader,
5680 unsigned processor)
5681 {
5682 switch (processor) {
5683 case PIPE_SHADER_VERTEX:
5684 if (shader->key.vs.as_es)
5685 return "Vertex Shader as ES";
5686 else if (shader->key.vs.as_ls)
5687 return "Vertex Shader as LS";
5688 else
5689 return "Vertex Shader as VS";
5690 case PIPE_SHADER_TESS_CTRL:
5691 return "Tessellation Control Shader";
5692 case PIPE_SHADER_TESS_EVAL:
5693 if (shader->key.tes.as_es)
5694 return "Tessellation Evaluation Shader as ES";
5695 else
5696 return "Tessellation Evaluation Shader as VS";
5697 case PIPE_SHADER_GEOMETRY:
5698 if (shader->gs_copy_shader == NULL)
5699 return "GS Copy Shader as VS";
5700 else
5701 return "Geometry Shader";
5702 case PIPE_SHADER_FRAGMENT:
5703 return "Pixel Shader";
5704 case PIPE_SHADER_COMPUTE:
5705 return "Compute Shader";
5706 default:
5707 return "Unknown Shader";
5708 }
5709 }
5710
5711 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
5712 struct pipe_debug_callback *debug, unsigned processor,
5713 FILE *file)
5714 {
5715 if (file != stderr ||
5716 (r600_can_dump_shader(&sscreen->b, processor) &&
5717 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5718 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5719
5720 if (shader->prolog)
5721 si_shader_dump_disassembly(&shader->prolog->binary,
5722 debug, "prolog", file);
5723
5724 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5725
5726 if (shader->epilog)
5727 si_shader_dump_disassembly(&shader->epilog->binary,
5728 debug, "epilog", file);
5729 fprintf(file, "\n");
5730 }
5731
5732 si_shader_dump_stats(sscreen, &shader->config,
5733 shader->selector ? shader->selector->info.num_inputs : 0,
5734 si_get_shader_binary_size(shader), debug, processor,
5735 file);
5736 }
5737
/**
 * Compile an LLVM module to GCN machine code and parse the shader config
 * registers out of the resulting binary.
 *
 * \param sscreen    screen
 * \param binary     receives code, relocs and disassembly
 * \param conf       receives the config registers parsed from the binary
 * \param tm         LLVM target machine
 * \param mod        LLVM module to compile
 * \param debug      debug callback for compiler diagnostics
 * \param processor  PIPE_SHADER_* stage; used for debug-dump filtering
 * \param name       human-readable name used in debug output
 * \return 0 on success, negative on failure
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	/* Global compile counter; also the ID used by si_replace_shader to
	 * substitute a binary loaded from disk for debugging. */
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	/* Skip compilation when the user replaced this shader's binary. */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary,
			r600_get_llvm_processor_name(sscreen->b.family), tm,
			debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The raw config data has been consumed into "conf"; release it. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
5803
5804 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5805 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
5806 struct si_shader_context *ctx,
5807 struct si_shader *gs,
5808 struct pipe_debug_callback *debug)
5809 {
5810 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5811 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5812 struct lp_build_context *uint = &bld_base->uint_bld;
5813 struct si_shader_output_values *outputs;
5814 struct tgsi_shader_info *gsinfo = &gs->selector->info;
5815 LLVMValueRef args[9];
5816 int i, r;
5817
5818 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5819
5820 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
5821 ctx->type = PIPE_SHADER_VERTEX;
5822 ctx->is_gs_copy_shader = true;
5823
5824 create_meta_data(ctx);
5825 create_function(ctx);
5826 preload_streamout_buffers(ctx);
5827 preload_ring_buffers(ctx);
5828
5829 args[0] = ctx->gsvs_ring[0];
5830 args[1] = lp_build_mul_imm(uint,
5831 LLVMGetParam(ctx->radeon_bld.main_fn,
5832 ctx->param_vertex_id),
5833 4);
5834 args[3] = uint->zero;
5835 args[4] = uint->one; /* OFFEN */
5836 args[5] = uint->zero; /* IDXEN */
5837 args[6] = uint->one; /* GLC */
5838 args[7] = uint->one; /* SLC */
5839 args[8] = uint->zero; /* TFE */
5840
5841 /* Fetch vertex data from GSVS ring */
5842 for (i = 0; i < gsinfo->num_outputs; ++i) {
5843 unsigned chan;
5844
5845 outputs[i].name = gsinfo->output_semantic_name[i];
5846 outputs[i].sid = gsinfo->output_semantic_index[i];
5847
5848 for (chan = 0; chan < 4; chan++) {
5849 args[2] = lp_build_const_int32(gallivm,
5850 (i * 4 + chan) *
5851 gs->selector->gs_max_out_vertices * 16 * 4);
5852
5853 outputs[i].values[chan] =
5854 LLVMBuildBitCast(gallivm->builder,
5855 lp_build_intrinsic(gallivm->builder,
5856 "llvm.SI.buffer.load.dword.i32.i32",
5857 ctx->i32, args, 9,
5858 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
5859 ctx->f32, "");
5860 }
5861 }
5862
5863 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
5864
5865 LLVMBuildRet(gallivm->builder, ctx->return_value);
5866
5867 /* Dump LLVM IR before any optimization passes */
5868 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
5869 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
5870 LLVMDumpModule(bld_base->base.gallivm->module);
5871
5872 radeon_llvm_finalize_module(&ctx->radeon_bld);
5873
5874 r = si_compile_llvm(sscreen, &ctx->shader->binary,
5875 &ctx->shader->config, ctx->tm,
5876 bld_base->base.gallivm->module,
5877 debug, PIPE_SHADER_GEOMETRY,
5878 "GS Copy Shader");
5879 if (!r) {
5880 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
5881 fprintf(stderr, "GS Copy Shader:\n");
5882 si_shader_dump(sscreen, ctx->shader, debug,
5883 PIPE_SHADER_GEOMETRY, stderr);
5884 r = si_shader_binary_upload(sscreen, ctx->shader);
5885 }
5886
5887 radeon_llvm_dispose(&ctx->radeon_bld);
5888
5889 FREE(outputs);
5890 return r;
5891 }
5892
5893 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
5894 {
5895 int i;
5896
5897 fprintf(f, "SHADER KEY\n");
5898
5899 switch (shader) {
5900 case PIPE_SHADER_VERTEX:
5901 fprintf(f, " instance_divisors = {");
5902 for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
5903 fprintf(f, !i ? "%u" : ", %u",
5904 key->vs.prolog.instance_divisors[i]);
5905 fprintf(f, "}\n");
5906 fprintf(f, " as_es = %u\n", key->vs.as_es);
5907 fprintf(f, " as_ls = %u\n", key->vs.as_ls);
5908 fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
5909 break;
5910
5911 case PIPE_SHADER_TESS_CTRL:
5912 fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
5913 break;
5914
5915 case PIPE_SHADER_TESS_EVAL:
5916 fprintf(f, " as_es = %u\n", key->tes.as_es);
5917 fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
5918 break;
5919
5920 case PIPE_SHADER_GEOMETRY:
5921 case PIPE_SHADER_COMPUTE:
5922 break;
5923
5924 case PIPE_SHADER_FRAGMENT:
5925 fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
5926 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
5927 fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
5928 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
5929 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
5930 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
5931 fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
5932 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
5933 fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
5934 fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
5935 break;
5936
5937 default:
5938 assert(0);
5939 }
5940 }
5941
/**
 * Initialize a si_shader_context: set up the LLVM context, cache commonly
 * used LLVM types, and register the TGSI opcode handlers.
 *
 * \param ctx      context to initialize (fully overwritten)
 * \param sscreen  screen
 * \param shader   shader being compiled, may be NULL for shader parts
 * \param tm       LLVM target machine
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;
	struct lp_build_tgsi_action tmpl = {};

	memset(ctx, 0, sizeof(*ctx));
	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
	ctx->tm = tm;
	ctx->screen = sscreen;
	if (shader && shader->selector)
		ctx->type = shader->selector->info.processor;
	else
		ctx->type = -1;	/* no TGSI stage known yet (shader parts) */
	ctx->shader = shader;

	/* Cache the LLVM types used throughout code generation. */
	ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
	ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);

	bld_base = &ctx->radeon_bld.soa.bld_base;
	if (shader && shader->selector)
		bld_base->info = &shader->selector->info;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes (PS only). */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	/* Texture opcodes. */
	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;

	/* Shader buffer / image memory opcodes. */
	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;

	/* Atomics share fetch/emit callbacks; only the intrinsic name
	 * differs, so fill from a template then set the name. */
	tmpl.fetch_args = atomic_fetch_args;
	tmpl.emit = atomic_emit;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	/* Derivatives. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* GS vertex emission and synchronization. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;

	bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
	bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
}
6042
6043 int si_compile_tgsi_shader(struct si_screen *sscreen,
6044 LLVMTargetMachineRef tm,
6045 struct si_shader *shader,
6046 bool is_monolithic,
6047 struct pipe_debug_callback *debug)
6048 {
6049 struct si_shader_selector *sel = shader->selector;
6050 struct si_shader_context ctx;
6051 struct lp_build_tgsi_context *bld_base;
6052 LLVMModuleRef mod;
6053 int r = 0;
6054
6055 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6056 * conversion fails. */
6057 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6058 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6059 si_dump_shader_key(sel->type, &shader->key, stderr);
6060 tgsi_dump(sel->tokens, 0);
6061 si_dump_streamout(&sel->so);
6062 }
6063
6064 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6065 ctx.is_monolithic = is_monolithic;
6066
6067 shader->info.uses_instanceid = sel->info.uses_instanceid;
6068
6069 bld_base = &ctx.radeon_bld.soa.bld_base;
6070 ctx.radeon_bld.load_system_value = declare_system_value;
6071
6072 switch (ctx.type) {
6073 case PIPE_SHADER_VERTEX:
6074 ctx.radeon_bld.load_input = declare_input_vs;
6075 if (shader->key.vs.as_ls)
6076 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6077 else if (shader->key.vs.as_es)
6078 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6079 else
6080 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6081 break;
6082 case PIPE_SHADER_TESS_CTRL:
6083 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6084 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6085 bld_base->emit_store = store_output_tcs;
6086 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6087 break;
6088 case PIPE_SHADER_TESS_EVAL:
6089 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6090 if (shader->key.tes.as_es)
6091 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6092 else
6093 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6094 break;
6095 case PIPE_SHADER_GEOMETRY:
6096 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6097 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6098 break;
6099 case PIPE_SHADER_FRAGMENT:
6100 ctx.radeon_bld.load_input = declare_input_fs;
6101 if (is_monolithic)
6102 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6103 else
6104 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6105 break;
6106 case PIPE_SHADER_COMPUTE:
6107 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6108 break;
6109 default:
6110 assert(!"Unsupported shader type");
6111 return -1;
6112 }
6113
6114 create_meta_data(&ctx);
6115 create_function(&ctx);
6116 preload_constants(&ctx);
6117 preload_shader_buffers(&ctx);
6118 preload_samplers(&ctx);
6119 preload_images(&ctx);
6120 preload_streamout_buffers(&ctx);
6121 preload_ring_buffers(&ctx);
6122
6123 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6124 shader->key.ps.prolog.poly_stipple) {
6125 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6126 SI_PARAM_RW_BUFFERS);
6127 si_llvm_emit_polygon_stipple(&ctx, list,
6128 SI_PARAM_POS_FIXED_PT);
6129 }
6130
6131 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6132 int i;
6133 for (i = 0; i < 4; i++) {
6134 ctx.gs_next_vertex[i] =
6135 lp_build_alloca(bld_base->base.gallivm,
6136 ctx.i32, "");
6137 }
6138 }
6139
6140 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6141 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6142 goto out;
6143 }
6144
6145 LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
6146 mod = bld_base->base.gallivm->module;
6147
6148 /* Dump LLVM IR before any optimization passes */
6149 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6150 r600_can_dump_shader(&sscreen->b, ctx.type))
6151 LLVMDumpModule(mod);
6152
6153 radeon_llvm_finalize_module(&ctx.radeon_bld);
6154
6155 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6156 mod, debug, ctx.type, "TGSI shader");
6157 if (r) {
6158 fprintf(stderr, "LLVM failed to compile shader\n");
6159 goto out;
6160 }
6161
6162 radeon_llvm_dispose(&ctx.radeon_bld);
6163
6164 /* Add the scratch offset to input SGPRs. */
6165 if (shader->config.scratch_bytes_per_wave)
6166 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6167
6168 /* Calculate the number of fragment input VGPRs. */
6169 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6170 shader->info.num_input_vgprs = 0;
6171 shader->info.face_vgpr_index = -1;
6172
6173 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6174 shader->info.num_input_vgprs += 2;
6175 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6176 shader->info.num_input_vgprs += 2;
6177 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6178 shader->info.num_input_vgprs += 2;
6179 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6180 shader->info.num_input_vgprs += 3;
6181 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6182 shader->info.num_input_vgprs += 2;
6183 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6184 shader->info.num_input_vgprs += 2;
6185 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6186 shader->info.num_input_vgprs += 2;
6187 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6188 shader->info.num_input_vgprs += 1;
6189 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6190 shader->info.num_input_vgprs += 1;
6191 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6192 shader->info.num_input_vgprs += 1;
6193 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6194 shader->info.num_input_vgprs += 1;
6195 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6196 shader->info.num_input_vgprs += 1;
6197 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6198 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6199 shader->info.num_input_vgprs += 1;
6200 }
6201 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6202 shader->info.num_input_vgprs += 1;
6203 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6204 shader->info.num_input_vgprs += 1;
6205 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6206 shader->info.num_input_vgprs += 1;
6207 }
6208
6209 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6210 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6211 shader->gs_copy_shader->selector = shader->selector;
6212 ctx.shader = shader->gs_copy_shader;
6213 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6214 shader, debug))) {
6215 free(shader->gs_copy_shader);
6216 shader->gs_copy_shader = NULL;
6217 goto out;
6218 }
6219 }
6220
6221 out:
6222 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6223 FREE(ctx.constants[i]);
6224 return r;
6225 }
6226
6227 /**
6228 * Create, compile and return a shader part (prolog or epilog).
6229 *
6230 * \param sscreen screen
6231 * \param list list of shader parts of the same category
6232 * \param key shader part key
6233 * \param tm LLVM target machine
6234 * \param debug debug callback
6235 * \param compile the callback responsible for compilation
6236 * \return non-NULL on success
6237 */
6238 static struct si_shader_part *
6239 si_get_shader_part(struct si_screen *sscreen,
6240 struct si_shader_part **list,
6241 union si_shader_part_key *key,
6242 LLVMTargetMachineRef tm,
6243 struct pipe_debug_callback *debug,
6244 bool (*compile)(struct si_screen *,
6245 LLVMTargetMachineRef,
6246 struct pipe_debug_callback *,
6247 struct si_shader_part *))
6248 {
6249 struct si_shader_part *result;
6250
6251 pipe_mutex_lock(sscreen->shader_parts_mutex);
6252
6253 /* Find existing. */
6254 for (result = *list; result; result = result->next) {
6255 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6256 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6257 return result;
6258 }
6259 }
6260
6261 /* Compile a new one. */
6262 result = CALLOC_STRUCT(si_shader_part);
6263 result->key = *key;
6264 if (!compile(sscreen, tm, debug, result)) {
6265 FREE(result);
6266 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6267 return NULL;
6268 }
6269
6270 result->next = *list;
6271 *list = result;
6272 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6273 return result;
6274 }
6275
6276 /**
6277 * Create a vertex shader prolog.
6278 *
6279 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6280 * All inputs are returned unmodified. The vertex load indices are
6281 * stored after them, which will used by the API VS for fetching inputs.
6282 *
6283 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6284 * input_v0,
6285 * input_v1,
6286 * input_v2,
6287 * input_v3,
6288 * (VertexID + BaseVertex),
6289 * (InstanceID + StartInstance),
6290 * (InstanceID / 2 + StartInstance)
6291 */
6292 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6293 LLVMTargetMachineRef tm,
6294 struct pipe_debug_callback *debug,
6295 struct si_shader_part *out)
6296 {
6297 union si_shader_part_key *key = &out->key;
6298 struct si_shader shader = {};
6299 struct si_shader_context ctx;
6300 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6301 LLVMTypeRef *params, *returns;
6302 LLVMValueRef ret, func;
6303 int last_sgpr, num_params, num_returns, i;
6304 bool status = true;
6305
6306 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6307 ctx.type = PIPE_SHADER_VERTEX;
6308 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6309 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6310
6311 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6312 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6313 sizeof(LLVMTypeRef));
6314 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6315 key->vs_prolog.last_input + 1) *
6316 sizeof(LLVMTypeRef));
6317 num_params = 0;
6318 num_returns = 0;
6319
6320 /* Declare input and output SGPRs. */
6321 num_params = 0;
6322 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6323 params[num_params++] = ctx.i32;
6324 returns[num_returns++] = ctx.i32;
6325 }
6326 last_sgpr = num_params - 1;
6327
6328 /* 4 preloaded VGPRs (outputs must be floats) */
6329 for (i = 0; i < 4; i++) {
6330 params[num_params++] = ctx.i32;
6331 returns[num_returns++] = ctx.f32;
6332 }
6333
6334 /* Vertex load indices. */
6335 for (i = 0; i <= key->vs_prolog.last_input; i++)
6336 returns[num_returns++] = ctx.f32;
6337
6338 /* Create the function. */
6339 si_create_function(&ctx, returns, num_returns, params,
6340 num_params, -1, last_sgpr);
6341 func = ctx.radeon_bld.main_fn;
6342
6343 /* Copy inputs to outputs. This should be no-op, as the registers match,
6344 * but it will prevent the compiler from overwriting them unintentionally.
6345 */
6346 ret = ctx.return_value;
6347 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6348 LLVMValueRef p = LLVMGetParam(func, i);
6349 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6350 }
6351 for (i = num_params - 4; i < num_params; i++) {
6352 LLVMValueRef p = LLVMGetParam(func, i);
6353 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6354 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6355 }
6356
6357 /* Compute vertex load indices from instance divisors. */
6358 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6359 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6360 LLVMValueRef index;
6361
6362 if (divisor) {
6363 /* InstanceID / Divisor + StartInstance */
6364 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6365 SI_SGPR_START_INSTANCE,
6366 divisor);
6367 } else {
6368 /* VertexID + BaseVertex */
6369 index = LLVMBuildAdd(gallivm->builder,
6370 LLVMGetParam(func, ctx.param_vertex_id),
6371 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6372 }
6373
6374 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6375 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6376 num_params++, "");
6377 }
6378
6379 /* Compile. */
6380 LLVMBuildRet(gallivm->builder, ret);
6381 radeon_llvm_finalize_module(&ctx.radeon_bld);
6382
6383 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6384 gallivm->module, debug, ctx.type,
6385 "Vertex Shader Prolog"))
6386 status = false;
6387
6388 radeon_llvm_dispose(&ctx.radeon_bld);
6389 return status;
6390 }
6391
6392 /**
6393 * Compile the vertex shader epilog. This is also used by the tessellation
6394 * evaluation shader compiled as VS.
6395 *
6396 * The input is PrimitiveID.
6397 *
6398 * If PrimitiveID is required by the pixel shader, export it.
6399 * Otherwise, do nothing.
6400 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Only addresses are taken here; ctx is initialized just below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. PrimitiveID lives at VS_EPILOG_PRIMID_LOC;
	 * when it isn't exported, the epilog takes no parameters at all. */
	num_params = key->vs_epilog.states.export_prim_id ?
		     (VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   -1, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		LLVMValueRef args[9];

		/* llvm.SI.export arguments; only X carries data. */
		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6464
6465 /**
6466 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
6467 */
6468 static bool si_get_vs_epilog(struct si_screen *sscreen,
6469 LLVMTargetMachineRef tm,
6470 struct si_shader *shader,
6471 struct pipe_debug_callback *debug,
6472 struct si_vs_epilog_bits *states)
6473 {
6474 union si_shader_part_key epilog_key;
6475
6476 memset(&epilog_key, 0, sizeof(epilog_key));
6477 epilog_key.vs_epilog.states = *states;
6478
6479 /* Set up the PrimitiveID output. */
6480 if (shader->key.vs.epilog.export_prim_id) {
6481 unsigned index = shader->selector->info.num_outputs;
6482 unsigned offset = shader->info.nr_param_exports++;
6483
6484 epilog_key.vs_epilog.prim_id_param_offset = offset;
6485 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6486 shader->info.vs_output_param_offset[index] = offset;
6487 }
6488
6489 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
6490 &epilog_key, tm, debug,
6491 si_compile_vs_epilog);
6492 return shader->epilog != NULL;
6493 }
6494
6495 /**
6496 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6497 */
6498 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6499 LLVMTargetMachineRef tm,
6500 struct si_shader *shader,
6501 struct pipe_debug_callback *debug)
6502 {
6503 struct tgsi_shader_info *info = &shader->selector->info;
6504 union si_shader_part_key prolog_key;
6505 unsigned i;
6506
6507 /* Get the prolog. */
6508 memset(&prolog_key, 0, sizeof(prolog_key));
6509 prolog_key.vs_prolog.states = shader->key.vs.prolog;
6510 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6511 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6512
6513 /* The prolog is a no-op if there are no inputs. */
6514 if (info->num_inputs) {
6515 shader->prolog =
6516 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6517 &prolog_key, tm, debug,
6518 si_compile_vs_prolog);
6519 if (!shader->prolog)
6520 return false;
6521 }
6522
6523 /* Get the epilog. */
6524 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
6525 !si_get_vs_epilog(sscreen, tm, shader, debug,
6526 &shader->key.vs.epilog))
6527 return false;
6528
6529 /* Set the instanceID flag. */
6530 for (i = 0; i < info->num_inputs; i++)
6531 if (prolog_key.vs_prolog.states.instance_divisors[i])
6532 shader->info.uses_instanceid = true;
6533
6534 return true;
6535 }
6536
6537 /**
6538 * Select and compile (or reuse) TES parts (epilog).
6539 */
6540 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
6541 LLVMTargetMachineRef tm,
6542 struct si_shader *shader,
6543 struct pipe_debug_callback *debug)
6544 {
6545 if (shader->key.tes.as_es)
6546 return true;
6547
6548 /* TES compiled as VS. */
6549 return si_get_vs_epilog(sscreen, tm, shader, debug,
6550 &shader->key.tes.epilog);
6551 }
6552
/**
 * Compile the TCS epilog. This writes tesselation factors to memory based on
 * the output primitive type of the tesselator (determined by TES).
 *
 * \param sscreen  screen the part is compiled for
 * \param tm       LLVM target machine used for codegen
 * \param debug    callback for pipe debug messages
 * \param out      shader part; out->key selects the variant, the binary
 *                 and config are written here
 * \return true if the epilog compiled successfully
 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only addresses are taken here; ctx is initialized below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_array_pointer, last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used;
	 * the rest are declared to keep the SGPR layout identical to the
	 * main TCS part this epilog is appended to. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	last_array_pointer = SI_PARAM_RW_BUFFERS;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* Tess factors are read back from LDS. */
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Pass the three VGPRs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6617
6618 /**
6619 * Select and compile (or reuse) TCS parts (epilog).
6620 */
6621 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6622 LLVMTargetMachineRef tm,
6623 struct si_shader *shader,
6624 struct pipe_debug_callback *debug)
6625 {
6626 union si_shader_part_key epilog_key;
6627
6628 /* Get the epilog. */
6629 memset(&epilog_key, 0, sizeof(epilog_key));
6630 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
6631
6632 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6633 &epilog_key, tm, debug,
6634 si_compile_tcs_epilog);
6635 return shader->epilog != NULL;
6636 }
6637
/**
 * Compile the pixel shader prolog. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overriden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 *
 * \return true if the prolog compiled successfully
 */
static bool si_compile_ps_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address is taken here; ctx is initialized below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.prolog = key->ps_prolog.states;

	/* Number of inputs + 8 color elements. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs: all SGPRs first, then all VGPRs, matching the
	 * register layout of the main shader part. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx.i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx.f32;

	/* Declare outputs (same as inputs + add colors if needed) */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, params, num_returns, params,
			   num_params, -1, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], list;

		/* Get the pointer to rw buffers. The 64-bit address is split
		 * across two SGPRs; gather and cast it back to a pointer. */
		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
		list = lp_build_gather_values(gallivm, ptr, 2);
		list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
		list = LLVMBuildIntToPtr(gallivm->builder, list,
					 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");

		si_llvm_emit_polygon_stipple(&ctx, list, pos);
	}

	/* Interpolate colors. Up to two COLOR semantics (i = 0, 1). */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Gather the I,J barycentric pair into a v2i32. */
			interp[0] = LLVMGetParam(func, interp_vgpr);
			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
						     ctx.v2i32, "");
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
		}

		interp_fs_input(&ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append the interpolated channels after the pass-through
		 * VGPRs in the return value. */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Force per-sample interpolation: replace all CENTER/CENTROID
	 * barycentrics in the return value with the SAMPLE ones. The VGPR
	 * layout is PERSP_{SAMPLE,CENTER,CENTROID} then
	 * LINEAR_{SAMPLE,CENTER,CENTROID}, two VGPRs (I, J) each. */
	if (key->ps_prolog.states.force_persample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2], linear_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6804
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * \return true if the epilog compiled successfully
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only addresses are taken here; ctx is initialized below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* 16 SGPR slots + up to 8 MRTs * 4 channels + Z/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_array_pointer, last_sgpr, num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_array_pointer = -1;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: one per written color channel plus the
	 * optional Z/stencil/samplemask values. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Make sure the declared parameter count covers the sample-mask VGPR
	 * location that the main part always passes through. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, so the DONE bit can be set on it.
	 * If Z/stencil/samplemask are written, the MRTZ export is last
	 * instead and last_color_export stays -1. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Emit one export per written MRT, consuming 4 VGPRs each. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
	else if (last_color_export == -1)
		/* The hardware requires at least one export per PS. */
		si_export_null(bld_base);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6922
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Builds the prolog/epilog keys from the shader key and TGSI info, fetches
 * the parts from the screen caches, and fixes up spi_ps_input_ena to match
 * what the selected parts actually consume.
 *
 * \return true on success
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Record, per COLOR semantic, which barycentric VGPR pair the
		 * prolog should interpolate with (-1 = flat/constant). */
		for (i = 0; i < 2; i++) {
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			/* Force per-sample interpolation for the colors here. */
			if (shader->key.ps.prolog.force_persample_interp)
				location = TGSI_INTERPOLATE_LOC_SAMPLE;

			/* The VGPR index encodes which I,J pair to use:
			 * 0/2/4 = PERSP sample/center/centroid,
			 * 6/8/10 = LINEAR sample/center/centroid.
			 * Enable the matching SPI input so the pair exists. */
			switch (info->input_interpolate[color[i]]) {
			case TGSI_INTERPOLATE_CONSTANT:
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persample_interp ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed. */
	if (shader->key.ps.prolog.force_persample_interp) {
		/* Replace CENTER/CENTROID enables with SAMPLE, since the
		 * prolog overwrote those VGPRs with the SAMPLE values. */
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
		}
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
		}
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7089
7090 static void si_fix_num_sgprs(struct si_shader *shader)
7091 {
7092 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7093
7094 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7095 }
7096
/**
 * Create a shader variant: either compile it monolithically or assemble it
 * from the pre-compiled main part plus prolog/epilog parts, then upload it.
 *
 * \param sscreen  screen owning the shader caches
 * \param tm       LLVM target machine used for codegen
 * \param shader   variant to create; selector and key must be set
 * \param debug    callback for pipe debug messages
 * \return 0 on success, negative on compile failure, or the upload error code
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. */
		shader->is_binary_shared = true; /* mainp owns the binary storage */
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts. The final counts must cover
		 * all parts, since they execute as one program. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7200
7201 void si_shader_destroy(struct si_shader *shader)
7202 {
7203 if (shader->gs_copy_shader) {
7204 si_shader_destroy(shader->gs_copy_shader);
7205 FREE(shader->gs_copy_shader);
7206 }
7207
7208 if (shader->scratch_bo)
7209 r600_resource_reference(&shader->scratch_bo, NULL);
7210
7211 r600_resource_reference(&shader->bo, NULL);
7212
7213 if (!shader->is_binary_shared)
7214 radeon_shader_binary_clean(&shader->binary);
7215 }