radeonsi: export SampleMask from pixel shaders at full rate
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "gallivm/lp_bld_misc.h"
37 #include "radeon/r600_cs.h"
38 #include "radeon/radeon_llvm.h"
39 #include "radeon/radeon_elf_util.h"
40 #include "radeon/radeon_llvm_emit.h"
41 #include "util/u_memory.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* ELF symbol names for the two dwords of the scratch buffer resource
 * descriptor.  NOTE(review): these are presumably patched with the real
 * scratch address at binary-upload time elsewhere in this driver — confirm.
 */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
/* One shader output: up to four per-channel LLVM values plus the TGSI
 * semantic that identifies the output. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one value per component (x, y, z, w) */
	unsigned name;		/* TGSI_SEMANTIC_* */
	unsigned sid;		/* semantic index */
};
66
/* All state needed while translating one shader to LLVM IR.
 *
 * NOTE(review): si_shader_context() casts a lp_build_tgsi_context* to this
 * type, which relies on radeon_bld (and its embedded bld_base) being the
 * first member — keep it first.
 */
struct si_shader_context
{
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of the main LLVM function's input parameters.  Which of
	 * these are meaningful depends on the shader stage being compiled;
	 * they are presumably assigned when the function signature is
	 * declared (outside this chunk).
	 */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;

	/* LLVM metadata kind IDs used to tag loads (see build_indexed_load
	 * and build_indexed_load_const). */
	unsigned invariant_load_md_kind;
	unsigned range_md_kind;
	unsigned uniform_md_kind;
	LLVMValueRef empty_md;

	/* Cached descriptor / resource values, indexed by binding slot. */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
136
/* Recover the si_shader_context from the embedded TGSI build context.
 * Valid because radeon_bld (whose soa.bld_base this is) is the first
 * member of struct si_shader_context. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
142
/* Forward declarations for helpers defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);

static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
			       FILE *f);
154
/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

/* Interpolation parameter bases and per-mode offsets. */
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
#define CENTROID_OFSET 4	/* sic: historical misspelling — renaming would break other uses */

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* LLVM address-space numbers used when building pointer types. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* Message/opcode encodings, presumably for the GS s_sendmsg
 * instruction — confirm against the ISA docs. */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
185
186 /**
187 * Returns a unique index for a semantic name and index. The index must be
188 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
189 * calculated.
190 */
191 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
192 {
193 switch (semantic_name) {
194 case TGSI_SEMANTIC_POSITION:
195 return 0;
196 case TGSI_SEMANTIC_PSIZE:
197 return 1;
198 case TGSI_SEMANTIC_CLIPDIST:
199 assert(index <= 1);
200 return 2 + index;
201 case TGSI_SEMANTIC_GENERIC:
202 if (index <= 63-4)
203 return 4 + index;
204 else
205 /* same explanation as in the default statement,
206 * the only user hitting this is st/nine.
207 */
208 return 0;
209
210 /* patch indices are completely separate and thus start from 0 */
211 case TGSI_SEMANTIC_TESSOUTER:
212 return 0;
213 case TGSI_SEMANTIC_TESSINNER:
214 return 1;
215 case TGSI_SEMANTIC_PATCH:
216 return 2 + index;
217
218 default:
219 /* Don't fail here. The result of this function is only used
220 * for LS, TCS, TES, and GS, where legacy GL semantics can't
221 * occur, but this function is called for all vertex shaders
222 * before it's known whether LS will be compiled or not.
223 */
224 return 0;
225 }
226 }
227
228 /**
229 * Get the value of a shader input parameter and extract a bitfield.
230 */
231 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
232 unsigned param, unsigned rshift,
233 unsigned bitwidth)
234 {
235 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
236 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
237 param);
238
239 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
240 value = bitcast(&ctx->radeon_bld.soa.bld_base,
241 TGSI_TYPE_UNSIGNED, value);
242
243 if (rshift)
244 value = LLVMBuildLShr(gallivm->builder, value,
245 lp_build_const_int32(gallivm, rshift), "");
246
247 if (rshift + bitwidth < 32) {
248 unsigned mask = (1 << bitwidth) - 1;
249 value = LLVMBuildAnd(gallivm->builder, value,
250 lp_build_const_int32(gallivm, mask), "");
251 }
252
253 return value;
254 }
255
256 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
257 {
258 switch (ctx->type) {
259 case PIPE_SHADER_TESS_CTRL:
260 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
261
262 case PIPE_SHADER_TESS_EVAL:
263 return LLVMGetParam(ctx->radeon_bld.main_fn,
264 ctx->param_tes_rel_patch_id);
265
266 default:
267 assert(0);
268 return NULL;
269 }
270 }
271
272 /* Tessellation shaders pass outputs to the next shader using LDS.
273 *
274 * LS outputs = TCS inputs
275 * TCS outputs = TES inputs
276 *
277 * The LDS layout is:
278 * - TCS inputs for patch 0
279 * - TCS inputs for patch 1
280 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
281 * - ...
282 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
283 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
284 * - TCS outputs for patch 1
285 * - Per-patch TCS outputs for patch 1
286 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
287 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
288 * - ...
289 *
290 * All three shaders VS(LS), TCS, TES share the same LDS space.
291 */
292
293 static LLVMValueRef
294 get_tcs_in_patch_stride(struct si_shader_context *ctx)
295 {
296 if (ctx->type == PIPE_SHADER_VERTEX)
297 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
298 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
299 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
300 else {
301 assert(0);
302 return NULL;
303 }
304 }
305
306 static LLVMValueRef
307 get_tcs_out_patch_stride(struct si_shader_context *ctx)
308 {
309 return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
310 }
311
312 static LLVMValueRef
313 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
314 {
315 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
316 unpack_param(ctx,
317 SI_PARAM_TCS_OUT_OFFSETS,
318 0, 16),
319 4);
320 }
321
322 static LLVMValueRef
323 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
324 {
325 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
326 unpack_param(ctx,
327 SI_PARAM_TCS_OUT_OFFSETS,
328 16, 16),
329 4);
330 }
331
332 static LLVMValueRef
333 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
334 {
335 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
336 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
337 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
338
339 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
340 }
341
342 static LLVMValueRef
343 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
344 {
345 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
346 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
347 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
348 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
349
350 return LLVMBuildAdd(gallivm->builder, patch0_offset,
351 LLVMBuildMul(gallivm->builder, patch_stride,
352 rel_patch_id, ""),
353 "");
354 }
355
356 static LLVMValueRef
357 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
358 {
359 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
360 LLVMValueRef patch0_patch_data_offset =
361 get_tcs_out_patch0_patch_data_offset(ctx);
362 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
363 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
364
365 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
366 LLVMBuildMul(gallivm->builder, patch_stride,
367 rel_patch_id, ""),
368 "");
369 }
370
371 static void build_indexed_store(struct si_shader_context *ctx,
372 LLVMValueRef base_ptr, LLVMValueRef index,
373 LLVMValueRef value)
374 {
375 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
376 struct gallivm_state *gallivm = bld_base->base.gallivm;
377 LLVMValueRef indices[2], pointer;
378
379 indices[0] = bld_base->uint_bld.zero;
380 indices[1] = index;
381
382 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
383 LLVMBuildStore(gallivm->builder, value, pointer);
384 }
385
386 /**
387 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
388 * It's equivalent to doing a load from &base_ptr[index].
389 *
390 * \param base_ptr Where the array starts.
391 * \param index The element index into the array.
392 * \param uniform Whether the base_ptr and index can be assumed to be
393 * dynamically uniform
394 */
395 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
396 LLVMValueRef base_ptr, LLVMValueRef index,
397 bool uniform)
398 {
399 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
400 struct gallivm_state *gallivm = bld_base->base.gallivm;
401 LLVMValueRef indices[2], pointer;
402
403 indices[0] = bld_base->uint_bld.zero;
404 indices[1] = index;
405
406 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
407 if (uniform)
408 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
409 return LLVMBuildLoad(gallivm->builder, pointer, "");
410 }
411
412 /**
413 * Do a load from &base_ptr[index], but also add a flag that it's loading
414 * a constant from a dynamically uniform index.
415 */
416 static LLVMValueRef build_indexed_load_const(
417 struct si_shader_context *ctx,
418 LLVMValueRef base_ptr, LLVMValueRef index)
419 {
420 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
421 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
422 return result;
423 }
424
425 static LLVMValueRef get_instance_index_for_fetch(
426 struct radeon_llvm_context *radeon_bld,
427 unsigned param_start_instance, unsigned divisor)
428 {
429 struct si_shader_context *ctx =
430 si_shader_context(&radeon_bld->soa.bld_base);
431 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
432
433 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
434 ctx->param_instance_id);
435
436 /* The division must be done before START_INSTANCE is added. */
437 if (divisor > 1)
438 result = LLVMBuildUDiv(gallivm->builder, result,
439 lp_build_const_int32(gallivm, divisor), "");
440
441 return LLVMBuildAdd(gallivm->builder, result,
442 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
443 }
444
/* Declare one VS input: fetch the vec4 attribute via the
 * llvm.SI.vs.load.input intrinsic and split it into per-channel values
 * stored in radeon_bld.inputs[]. */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* Non-monolithic shaders: the prolog computed the index and
		 * passes it in as an input parameter per attribute. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
513
514 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
515 unsigned swizzle)
516 {
517 struct si_shader_context *ctx = si_shader_context(bld_base);
518
519 if (swizzle > 0)
520 return bld_base->uint_bld.zero;
521
522 switch (ctx->type) {
523 case PIPE_SHADER_VERTEX:
524 return LLVMGetParam(ctx->radeon_bld.main_fn,
525 ctx->param_vs_prim_id);
526 case PIPE_SHADER_TESS_CTRL:
527 return LLVMGetParam(ctx->radeon_bld.main_fn,
528 SI_PARAM_PATCH_ID);
529 case PIPE_SHADER_TESS_EVAL:
530 return LLVMGetParam(ctx->radeon_bld.main_fn,
531 ctx->param_tes_patch_id);
532 case PIPE_SHADER_GEOMETRY:
533 return LLVMGetParam(ctx->radeon_bld.main_fn,
534 SI_PARAM_PRIMITIVE_ID);
535 default:
536 assert(0);
537 return bld_base->uint_bld.zero;
538 }
539 }
540
541 /**
542 * Return the value of tgsi_ind_register for indexing.
543 * This is the indirect index with the constant offset added to it.
544 */
545 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
546 const struct tgsi_ind_register *ind,
547 int rel_index)
548 {
549 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
550 LLVMValueRef result;
551
552 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
553 result = LLVMBuildLoad(gallivm->builder, result, "");
554 result = LLVMBuildAdd(gallivm->builder, result,
555 lp_build_const_int32(gallivm, rel_index), "");
556 return result;
557 }
558
559 /**
560 * Like get_indirect_index, but restricts the return value to a (possibly
561 * undefined) value inside [0..num).
562 */
563 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
564 const struct tgsi_ind_register *ind,
565 int rel_index, unsigned num)
566 {
567 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
568
569 /* LLVM 3.8: If indirect resource indexing is used:
570 * - SI & CIK hang
571 * - VI crashes
572 */
573 if (HAVE_LLVM <= 0x0308)
574 return LLVMGetUndef(ctx->i32);
575
576 return radeon_llvm_bound_index(&ctx->radeon_bld, result, num);
577 }
578
579
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * \param dst	destination register; used only when \p src is NULL
 * \param src	source register; takes precedence over \p dst
 * \param vertex_dw_stride  dword stride between vertices (for 2D registers)
 * \param base_addr	base dword address to add the register offset to
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* With an ArrayID, indexing is relative to the array start. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register slot is one vec4 = 4 dwords. */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
664
665 /* The offchip buffer layout for TCS->TES is
666 *
667 * - attribute 0 of patch 0 vertex 0
668 * - attribute 0 of patch 0 vertex 1
669 * - attribute 0 of patch 0 vertex 2
670 * ...
671 * - attribute 0 of patch 1 vertex 0
672 * - attribute 0 of patch 1 vertex 1
673 * ...
674 * - attribute 1 of patch 0 vertex 0
675 * - attribute 1 of patch 0 vertex 1
676 * ...
677 * - per patch attribute 0 of patch 0
678 * - per patch attribute 0 of patch 1
679 * ...
680 *
681 * Note that every attribute has 4 components.
682 */
/* Compute the byte address of an attribute in the offchip TCS->TES buffer
 * (layout documented above).  With a non-NULL vertex_index the address
 * targets the per-vertex section; with vertex_index == NULL it targets
 * the per-patch section. */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* SI_PARAM_TCS_OFFCHIP_LAYOUT packs: num_patches in bits [8:0],
	 * vertices_per_patch in bits [14:9]. */
	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	/* 16 bytes = one vec4 attribute. */
	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex attribute: index by the global vertex number;
		 * successive attributes are strided by the total vertex count. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attribute: index by patch number; attributes
		 * are strided by the patch count. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* Scale from attribute slots to bytes. */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Skip past the per-vertex section; its size is stored in
		 * bits [31:16] of the layout SGPR. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
725
/* Like get_tcs_tes_buffer_address, but derives vertex_index and
 * param_index from a TGSI register description (src takes precedence
 * over dst when both are given). */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
	struct si_shader_context *ctx,
	const struct tgsi_full_dst_register *dst,
	const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2D registers index a vertex within the patch. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
							    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect addressing: offset relative to the array start
		 * (or the register itself when there is no ArrayID). */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	/* Translate the semantic into the attribute slot number, then add
	 * the (possibly dynamic) relative index on top. */
	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   lp_build_const_int32(gallivm, param_index_base),
				   "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
788
789 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
790 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
791 * or v4i32 (num_channels=3,4). */
/* Emit a typed buffer store via the llvm.SI.tbuffer.store.* intrinsic.
 *
 * \param rsrc		buffer resource descriptor
 * \param vdata		value to store; type must match num_channels (see
 *			the comment above this function)
 * \param vaddr		VGPR address (offset and/or index, per offen/idxen)
 * \param soffset	SGPR byte offset
 * \param inst_offset	immediate offset (12-bit field)
 * \param dfmt,nfmt	buffer data/number format (V_008F0C_*)
 * \param offen,idxen,glc,slc,tfe  MTBUF instruction modifier bits
 */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Operand order is fixed by the intrinsic definition. */
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, ARRAY_SIZE(args), 0);
}
836
837 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
838 LLVMValueRef rsrc,
839 LLVMValueRef vdata,
840 unsigned num_channels,
841 LLVMValueRef vaddr,
842 LLVMValueRef soffset,
843 unsigned inst_offset)
844 {
845 static unsigned dfmt[] = {
846 V_008F0C_BUF_DATA_FORMAT_32,
847 V_008F0C_BUF_DATA_FORMAT_32_32,
848 V_008F0C_BUF_DATA_FORMAT_32_32_32,
849 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
850 };
851 assert(num_channels >= 1 && num_channels <= 4);
852
853 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
854 inst_offset, dfmt[num_channels-1],
855 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
856 }
857
/* Emit an untyped buffer load of 1, 2, or 4 dwords (num_channels 3 is
 * rounded up to 4 by the CLAMP below).  Uses llvm.amdgcn.buffer.load.*
 * on LLVM >= 3.9 and the legacy llvm.SI.buffer.load.dword.* otherwise. */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Select the intrinsic's type suffix: 1 -> scalar, 2 -> v2, 3/4 -> v4. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		/* amdgcn path: the immediate, variable and scalar offsets
		 * are folded into one offset argument (args[2]). */
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	} else {
		/* Legacy path: separate offen/idxen flags; vindex+voffset
		 * are packed into one v2i32 address when both are present. */
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	}
}
933
/* Load a TGSI-typed value from a buffer.
 *
 * \param swizzle	component to load; ~0 loads the whole vec4
 *
 * 64-bit types are assembled from two adjacent dword loads. */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* Load the full vec4 and extract the requested component. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       lp_build_const_int32(gallivm, swizzle), "");
	}

	/* 64-bit: load the two dword halves at swizzle*4 and swizzle*4+4. */
	value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				  swizzle * 4, 1, 0);

	value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				   swizzle * 4 + 4, 1, 0);

	return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
969
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* vec4: recurse once per channel and gather the results. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit: combine this dword with the next one. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	/* LDS holds i32; reinterpret to the requested type. */
	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
1010
1011 /**
1012 * Store to LDS.
1013 *
1014 * \param swizzle offset (typically 0..3)
1015 * \param dw_addr address in dwords
1016 * \param value value to store
1017 */
1018 static void lds_store(struct lp_build_tgsi_context *bld_base,
1019 unsigned swizzle, LLVMValueRef dw_addr,
1020 LLVMValueRef value)
1021 {
1022 struct si_shader_context *ctx = si_shader_context(bld_base);
1023 struct gallivm_state *gallivm = bld_base->base.gallivm;
1024
1025 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1026 lp_build_const_int32(gallivm, swizzle));
1027
1028 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1029 build_indexed_store(ctx, ctx->lds,
1030 dw_addr, value);
1031 }
1032
1033 static LLVMValueRef fetch_input_tcs(
1034 struct lp_build_tgsi_context *bld_base,
1035 const struct tgsi_full_src_register *reg,
1036 enum tgsi_opcode_type type, unsigned swizzle)
1037 {
1038 struct si_shader_context *ctx = si_shader_context(bld_base);
1039 LLVMValueRef dw_addr, stride;
1040
1041 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1042 dw_addr = get_tcs_in_current_patch_offset(ctx);
1043 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1044
1045 return lds_load(bld_base, type, swizzle, dw_addr);
1046 }
1047
1048 static LLVMValueRef fetch_output_tcs(
1049 struct lp_build_tgsi_context *bld_base,
1050 const struct tgsi_full_src_register *reg,
1051 enum tgsi_opcode_type type, unsigned swizzle)
1052 {
1053 struct si_shader_context *ctx = si_shader_context(bld_base);
1054 LLVMValueRef dw_addr, stride;
1055
1056 if (reg->Register.Dimension) {
1057 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1058 dw_addr = get_tcs_out_current_patch_offset(ctx);
1059 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1060 } else {
1061 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1062 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1063 }
1064
1065 return lds_load(bld_base, type, swizzle, dw_addr);
1066 }
1067
1068 static LLVMValueRef fetch_input_tes(
1069 struct lp_build_tgsi_context *bld_base,
1070 const struct tgsi_full_src_register *reg,
1071 enum tgsi_opcode_type type, unsigned swizzle)
1072 {
1073 struct si_shader_context *ctx = si_shader_context(bld_base);
1074 struct gallivm_state *gallivm = bld_base->base.gallivm;
1075 LLVMValueRef rw_buffers, buffer, base, addr;
1076
1077 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1078 SI_PARAM_RW_BUFFERS);
1079 buffer = build_indexed_load_const(ctx, rw_buffers,
1080 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1081
1082 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1083 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1084
1085 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1086 }
1087
/* Store a TCS output both to LDS (for readback by the TCS itself) and to
 * the off-chip tessellation ring (for consumption by the TES).
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS address: per-vertex outputs (Dimension set) use
	 * the output vertex stride; per-patch outputs go to the patch data
	 * area. */
	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Get the TESS_OFFCHIP ring descriptor and buffer address for the
	 * mirrored store. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* Always write the channel to LDS. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: emit one dword store per enabled
		 * channel. */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: combine the 4 channels into one vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1153
/* Fetch a GS input from the ESGS ring buffer, addressed by the per-vertex
 * offsets passed in VGPRs. PRIMID is handled specially via get_primitive_id.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	/* swizzle == ~0 requests a full vec4: gather all 4 channels. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The offset parameter holds a dword index; convert to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute);
	/* 64-bit values occupy two consecutive dword slots; fetch the
	 * second half and combine. */
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1232
1233 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1234 {
1235 switch (interpolate) {
1236 case TGSI_INTERPOLATE_CONSTANT:
1237 return 0;
1238
1239 case TGSI_INTERPOLATE_LINEAR:
1240 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1241 return SI_PARAM_LINEAR_SAMPLE;
1242 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1243 return SI_PARAM_LINEAR_CENTROID;
1244 else
1245 return SI_PARAM_LINEAR_CENTER;
1246 break;
1247 case TGSI_INTERPOLATE_COLOR:
1248 case TGSI_INTERPOLATE_PERSPECTIVE:
1249 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1250 return SI_PARAM_PERSP_SAMPLE;
1251 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1252 return SI_PARAM_PERSP_CENTROID;
1253 else
1254 return SI_PARAM_PERSP_CENTER;
1255 break;
1256 default:
1257 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1258 return -1;
1259 }
1260 }
1261
1262 /* This shouldn't be used by explicit INTERP opcodes. */
1263 static unsigned select_interp_param(struct si_shader_context *ctx,
1264 unsigned param)
1265 {
1266 if (!ctx->is_monolithic)
1267 return param;
1268
1269 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1270 switch (param) {
1271 case SI_PARAM_PERSP_CENTROID:
1272 case SI_PARAM_PERSP_CENTER:
1273 return SI_PARAM_PERSP_SAMPLE;
1274 }
1275 }
1276 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1277 switch (param) {
1278 case SI_PARAM_LINEAR_CENTROID:
1279 case SI_PARAM_LINEAR_CENTER:
1280 return SI_PARAM_LINEAR_SAMPLE;
1281 }
1282 }
1283 if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1284 switch (param) {
1285 case SI_PARAM_PERSP_CENTROID:
1286 case SI_PARAM_PERSP_SAMPLE:
1287 return SI_PARAM_PERSP_CENTER;
1288 }
1289 }
1290 if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1291 switch (param) {
1292 case SI_PARAM_LINEAR_CENTROID:
1293 case SI_PARAM_LINEAR_SAMPLE:
1294 return SI_PARAM_LINEAR_CENTER;
1295 }
1296 }
1297
1298 return param;
1299 }
1300
/**
 * Interpolate a fragment shader input.
 *
 * @param ctx		context
 * @param input_index		index of the input in hardware
 * @param semantic_name		TGSI_SEMANTIC_*
 * @param semantic_index	semantic index
 * @param num_interp_inputs	number of all interpolated inputs (= BCOLOR offset)
 * @param colors_read_mask	color components read (4 bits for each color, 8 bits in total)
 * @param interp_param		interpolation weights (i,j); NULL selects
 *				fs.constant (flat) fetching
 * @param prim_mask		SI_PARAM_PRIM_MASK
 * @param face			SI_PARAM_FRONT_FACE
 * @param result		the return value (4 components)
 */
static void interp_fs_input(struct si_shader_context *ctx,
			    unsigned input_index,
			    unsigned semantic_name,
			    unsigned semantic_index,
			    unsigned num_interp_inputs,
			    unsigned colors_read_mask,
			    LLVMValueRef interp_param,
			    LLVMValueRef prim_mask,
			    LLVMValueRef face,
			    LLVMValueRef result[4])
{
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	const char *intr_name;
	LLVMValueRef attr_number;

	unsigned chan;

	attr_number = lp_build_const_int32(gallivm, input_index);

	/* fs.constant returns the param from the middle vertex, so it's not
	 * really useful for flat shading. It's meant to be used for custom
	 * interpolation (but the intrinsic can't fetch from the other two
	 * vertices).
	 *
	 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
	 * to do the right thing. The only reason we use fs.constant is that
	 * fs.interp cannot be used on integers, because they can be equal
	 * to NaN.
	 */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";

	if (semantic_name == TGSI_SEMANTIC_COLOR &&
	    ctx->shader->key.ps.prolog.color_two_side) {
		/* Two-sided lighting: interpolate both the front and the
		 * back color and select by the FRONT_FACE input. */
		LLVMValueRef args[4];
		LLVMValueRef is_face_positive;
		LLVMValueRef back_attr_number;

		/* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
		 * otherwise it's at offset "num_inputs".
		 */
		unsigned back_attr_offset = num_interp_inputs;
		if (semantic_index == 1 && colors_read_mask & 0xf)
			back_attr_offset += 1;

		back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);

		is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
						 face, uint->zero, "");

		args[2] = prim_mask;
		args[3] = interp_param;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
			LLVMValueRef front, back;

			args[0] = llvm_chan;
			args[1] = attr_number;
			front = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute);

			args[1] = back_attr_number;
			back = lp_build_intrinsic(gallivm->builder, intr_name,
					       ctx->f32, args, args[3] ? 4 : 3,
					       LLVMReadNoneAttribute);

			result[chan] = LLVMBuildSelect(gallivm->builder,
						is_face_positive,
						front,
						back,
						"");
		}
	} else if (semantic_name == TGSI_SEMANTIC_FOG) {
		/* Fog: only .x is interpolated; .yzw get fixed (0, 0, 1). */
		LLVMValueRef args[4];

		args[0] = uint->zero;
		args[1] = attr_number;
		args[2] = prim_mask;
		args[3] = interp_param;
		result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
					ctx->f32, args, args[3] ? 4 : 3,
					LLVMReadNoneAttribute);
		result[1] =
		result[2] = lp_build_const_float(gallivm, 0.0f);
		result[3] = lp_build_const_float(gallivm, 1.0f);
	} else {
		/* Default: interpolate all 4 channels independently. */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			LLVMValueRef args[4];
			LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);

			args[0] = llvm_chan;
			args[1] = attr_number;
			args[2] = prim_mask;
			args[3] = interp_param;
			result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
						ctx->f32, args, args[3] ? 4 : 3,
						LLVMReadNoneAttribute);
		}
	}
}
1417
/* LLVMGetParam with bc_optimize resolved. */
static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
				     int interp_param_idx)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
	LLVMValueRef param = NULL;

	/* Handle PRIM_MASK[31] (bc_optimize). */
	if (ctx->is_monolithic &&
	    ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
	      interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
	     (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
	      interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 */
		/* Extract bit 31 of PRIM_MASK as an i1 condition. */
		LLVMValueRef bc_optimize =
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
		bc_optimize = LLVMBuildLShr(builder,
					    bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");

		/* Select CENTER weights when the bit is set, CENTROID
		 * otherwise. */
		if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
		    interp_param_idx == SI_PARAM_PERSP_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTROID),
						"");
		}
		if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
		    interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTROID),
						"");
		}
	}

	/* No bc_optimize handling needed: return the parameter directly. */
	if (!param)
		param = LLVMGetParam(main_fn, interp_param_idx);
	return param;
}
1467
/* Declare (i.e. compute the values of) a fragment shader input.
 * Colors may come pre-interpolated from the prolog; everything else is
 * interpolated here via interp_fs_input.
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* The color VGPRs are packed: COLOR1's components start
		 * after however many COLOR0 components are read. */
		unsigned mask = colors_read >> (i * 4);
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = get_interp_param(ctx, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1523
1524 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1525 {
1526 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1527 SI_PARAM_ANCILLARY, 8, 4);
1528 }
1529
1530 /**
1531 * Set range metadata on an instruction. This can only be used on load and
1532 * call instructions. If you know an instruction can only produce the values
1533 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1534 * \p lo is the minimum value inclusive.
1535 * \p hi is the maximum value exclusive.
1536 */
1537 static void set_range_metadata(struct si_shader_context *ctx,
1538 LLVMValueRef value, unsigned lo, unsigned hi)
1539 {
1540 LLVMValueRef range_md, md_args[2];
1541 LLVMTypeRef type = LLVMTypeOf(value);
1542 LLVMContextRef context = LLVMGetTypeContext(type);
1543
1544 md_args[0] = LLVMConstInt(type, lo, false);
1545 md_args[1] = LLVMConstInt(type, hi, false);
1546 range_md = LLVMMDNodeInContext(context, md_args, 2);
1547 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1548 }
1549
/* Return the thread's index within the wavefront (0..63). */
static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef tid;

	if (HAVE_LLVM < 0x0308) {
		/* Old LLVM: use the legacy tid intrinsic. */
		tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
				ctx->i32,   NULL, 0, LLVMReadNoneAttribute);
	} else {
		LLVMValueRef tid_args[2];
		tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
		tid_args[1] = lp_build_const_int32(gallivm, 0);
		/* mbcnt.lo's result is fed to mbcnt.hi as the accumulator,
		 * which is why tid_args[1] is overwritten here. */
		tid_args[1] = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.lo", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);

		tid = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.hi", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);
	}
	/* The result is always in [0, 64). */
	set_range_metadata(ctx, tid, 0, 64);
	return tid;
}
1573
1574 /**
1575 * Load a dword from a constant buffer.
1576 */
1577 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1578 LLVMValueRef resource,
1579 LLVMValueRef offset)
1580 {
1581 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1582 LLVMValueRef args[2] = {resource, offset};
1583
1584 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1585 LLVMReadNoneAttribute);
1586 }
1587
1588 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1589 {
1590 struct si_shader_context *ctx =
1591 si_shader_context(&radeon_bld->soa.bld_base);
1592 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1593 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1594 LLVMBuilderRef builder = gallivm->builder;
1595 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1596 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1597 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1598
1599 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1600 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1601 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1602
1603 LLVMValueRef pos[4] = {
1604 buffer_load_const(ctx, resource, offset0),
1605 buffer_load_const(ctx, resource, offset1),
1606 lp_build_const_float(gallivm, 0),
1607 lp_build_const_float(gallivm, 0)
1608 };
1609
1610 return lp_build_gather_values(gallivm, pos, 4);
1611 }
1612
/* Compute the value of a TGSI system value and store it in
 * radeon_bld->system_values[index].
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID = vertex id VGPR + BASE_VERTEX SGPR. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_START_INSTANCE);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_DRAWID);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* Only TCS and GS have invocation IDs. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; w is reciprocated here so that users
		 * get 1/w directly. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position within the pixel = fractional part of the
		 * fragment position at sample-rate shading. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* The patch vertex count is packed in different SGPRs for
		 * TCS and TES. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip tess ring. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels (no TCS) come from a driver constant
		 * buffer; inner levels start at dword offset 4. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* Block size is a compile-time constant taken from the
		 * CS_FIXED_BLOCK_* properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* ps.live is true for real (non-helper) invocations;
		 * invert and sign-extend to get the TGSI boolean. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1831
1832 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1833 const struct tgsi_full_declaration *decl)
1834 {
1835 struct si_shader_context *ctx =
1836 si_shader_context(&radeon_bld->soa.bld_base);
1837 struct si_shader_selector *sel = ctx->shader->selector;
1838 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1839
1840 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1841 LLVMValueRef var;
1842
1843 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1844 assert(decl->Range.First == decl->Range.Last);
1845 assert(!ctx->shared_memory);
1846
1847 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1848 LLVMArrayType(ctx->i8, sel->local_size),
1849 "compute_lds",
1850 LOCAL_ADDR_SPACE);
1851 LLVMSetAlignment(var, 4);
1852
1853 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1854 }
1855
1856 static LLVMValueRef fetch_constant(
1857 struct lp_build_tgsi_context *bld_base,
1858 const struct tgsi_full_src_register *reg,
1859 enum tgsi_opcode_type type,
1860 unsigned swizzle)
1861 {
1862 struct si_shader_context *ctx = si_shader_context(bld_base);
1863 struct lp_build_context *base = &bld_base->base;
1864 const struct tgsi_ind_register *ireg = &reg->Indirect;
1865 unsigned buf, idx;
1866
1867 LLVMValueRef addr, bufp;
1868 LLVMValueRef result;
1869
1870 if (swizzle == LP_CHAN_ALL) {
1871 unsigned chan;
1872 LLVMValueRef values[4];
1873 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1874 values[chan] = fetch_constant(bld_base, reg, type, chan);
1875
1876 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1877 }
1878
1879 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1880 idx = reg->Register.Index * 4 + swizzle;
1881
1882 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1883 LLVMValueRef c0, c1;
1884
1885 c0 = buffer_load_const(ctx, ctx->const_buffers[buf],
1886 LLVMConstInt(ctx->i32, idx * 4, 0));
1887
1888 if (!tgsi_type_is_64bit(type))
1889 return bitcast(bld_base, type, c0);
1890 else {
1891 c1 = buffer_load_const(ctx, ctx->const_buffers[buf],
1892 LLVMConstInt(ctx->i32,
1893 (idx + 1) * 4, 0));
1894 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1895 c0, c1);
1896 }
1897 }
1898
1899 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1900 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1901 LLVMValueRef index;
1902 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1903 reg->Dimension.Index,
1904 SI_NUM_CONST_BUFFERS);
1905 bufp = build_indexed_load_const(ctx, ptr, index);
1906 } else
1907 bufp = ctx->const_buffers[buf];
1908
1909 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1910 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1911 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1912 addr = lp_build_add(&bld_base->uint_bld, addr,
1913 lp_build_const_int32(base->gallivm, idx * 4));
1914
1915 result = buffer_load_const(ctx, bufp, addr);
1916
1917 if (!tgsi_type_is_64bit(type))
1918 result = bitcast(bld_base, type, result);
1919 else {
1920 LLVMValueRef addr2, result2;
1921 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1922 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1923 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1924 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1925 lp_build_const_int32(base->gallivm, idx * 4));
1926
1927 result2 = buffer_load_const(ctx, ctx->const_buffers[buf],
1928 addr2);
1929
1930 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1931 result, result2);
1932 }
1933 return result;
1934 }
1935
1936 /* Upper 16 bits must be zero. */
1937 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1938 LLVMValueRef val[2])
1939 {
1940 return LLVMBuildOr(gallivm->builder, val[0],
1941 LLVMBuildShl(gallivm->builder, val[1],
1942 lp_build_const_int32(gallivm, 16),
1943 ""), "");
1944 }
1945
1946 /* Upper 16 bits are ignored and will be dropped. */
1947 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1948 LLVMValueRef val[2])
1949 {
1950 LLVMValueRef v[2] = {
1951 LLVMBuildAnd(gallivm->builder, val[0],
1952 lp_build_const_int32(gallivm, 0xffff), ""),
1953 val[1],
1954 };
1955 return si_llvm_pack_two_int16(gallivm, v);
1956 }
1957
1958 /* Initialize arguments for the shader export intrinsic */
1959 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1960 LLVMValueRef *values,
1961 unsigned target,
1962 LLVMValueRef *args)
1963 {
1964 struct si_shader_context *ctx = si_shader_context(bld_base);
1965 struct lp_build_context *uint =
1966 &ctx->radeon_bld.soa.bld_base.uint_bld;
1967 struct lp_build_context *base = &bld_base->base;
1968 struct gallivm_state *gallivm = base->gallivm;
1969 LLVMBuilderRef builder = base->gallivm->builder;
1970 LLVMValueRef val[4];
1971 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1972 unsigned chan;
1973 bool is_int8;
1974
1975 /* Default is 0xf. Adjusted below depending on the format. */
1976 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1977
1978 /* Specify whether the EXEC mask represents the valid mask */
1979 args[1] = uint->zero;
1980
1981 /* Specify whether this is the last export */
1982 args[2] = uint->zero;
1983
1984 /* Specify the target we are exporting */
1985 args[3] = lp_build_const_int32(base->gallivm, target);
1986
1987 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1988 const union si_shader_key *key = &ctx->shader->key;
1989 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1990 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1991
1992 assert(cbuf >= 0 && cbuf < 8);
1993 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1994 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1995 }
1996
1997 args[4] = uint->zero; /* COMPR flag */
1998 args[5] = base->undef;
1999 args[6] = base->undef;
2000 args[7] = base->undef;
2001 args[8] = base->undef;
2002
2003 switch (spi_shader_col_format) {
2004 case V_028714_SPI_SHADER_ZERO:
2005 args[0] = uint->zero; /* writemask */
2006 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2007 break;
2008
2009 case V_028714_SPI_SHADER_32_R:
2010 args[0] = uint->one; /* writemask */
2011 args[5] = values[0];
2012 break;
2013
2014 case V_028714_SPI_SHADER_32_GR:
2015 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
2016 args[5] = values[0];
2017 args[6] = values[1];
2018 break;
2019
2020 case V_028714_SPI_SHADER_32_AR:
2021 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
2022 args[5] = values[0];
2023 args[8] = values[3];
2024 break;
2025
2026 case V_028714_SPI_SHADER_FP16_ABGR:
2027 args[4] = uint->one; /* COMPR flag */
2028
2029 for (chan = 0; chan < 2; chan++) {
2030 LLVMValueRef pack_args[2] = {
2031 values[2 * chan],
2032 values[2 * chan + 1]
2033 };
2034 LLVMValueRef packed;
2035
2036 packed = lp_build_intrinsic(base->gallivm->builder,
2037 "llvm.SI.packf16",
2038 ctx->i32, pack_args, 2,
2039 LLVMReadNoneAttribute);
2040 args[chan + 5] =
2041 LLVMBuildBitCast(base->gallivm->builder,
2042 packed, ctx->f32, "");
2043 }
2044 break;
2045
2046 case V_028714_SPI_SHADER_UNORM16_ABGR:
2047 for (chan = 0; chan < 4; chan++) {
2048 val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
2049 val[chan] = LLVMBuildFMul(builder, val[chan],
2050 lp_build_const_float(gallivm, 65535), "");
2051 val[chan] = LLVMBuildFAdd(builder, val[chan],
2052 lp_build_const_float(gallivm, 0.5), "");
2053 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2054 ctx->i32, "");
2055 }
2056
2057 args[4] = uint->one; /* COMPR flag */
2058 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2059 si_llvm_pack_two_int16(gallivm, val));
2060 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2061 si_llvm_pack_two_int16(gallivm, val+2));
2062 break;
2063
2064 case V_028714_SPI_SHADER_SNORM16_ABGR:
2065 for (chan = 0; chan < 4; chan++) {
2066 /* Clamp between [-1, 1]. */
2067 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2068 values[chan],
2069 lp_build_const_float(gallivm, 1));
2070 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2071 val[chan],
2072 lp_build_const_float(gallivm, -1));
2073 /* Convert to a signed integer in [-32767, 32767]. */
2074 val[chan] = LLVMBuildFMul(builder, val[chan],
2075 lp_build_const_float(gallivm, 32767), "");
2076 /* If positive, add 0.5, else add -0.5. */
2077 val[chan] = LLVMBuildFAdd(builder, val[chan],
2078 LLVMBuildSelect(builder,
2079 LLVMBuildFCmp(builder, LLVMRealOGE,
2080 val[chan], base->zero, ""),
2081 lp_build_const_float(gallivm, 0.5),
2082 lp_build_const_float(gallivm, -0.5), ""), "");
2083 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2084 }
2085
2086 args[4] = uint->one; /* COMPR flag */
2087 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2088 si_llvm_pack_two_int32_as_int16(gallivm, val));
2089 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2090 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2091 break;
2092
2093 case V_028714_SPI_SHADER_UINT16_ABGR: {
2094 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2095 255 : 65535);
2096 /* Clamp. */
2097 for (chan = 0; chan < 4; chan++) {
2098 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2099 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2100 val[chan], max);
2101 }
2102
2103 args[4] = uint->one; /* COMPR flag */
2104 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2105 si_llvm_pack_two_int16(gallivm, val));
2106 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2107 si_llvm_pack_two_int16(gallivm, val+2));
2108 break;
2109 }
2110
2111 case V_028714_SPI_SHADER_SINT16_ABGR: {
2112 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2113 127 : 32767);
2114 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
2115 -128 : -32768);
2116 /* Clamp. */
2117 for (chan = 0; chan < 4; chan++) {
2118 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2119 val[chan] = lp_build_emit_llvm_binary(bld_base,
2120 TGSI_OPCODE_IMIN,
2121 val[chan], max);
2122 val[chan] = lp_build_emit_llvm_binary(bld_base,
2123 TGSI_OPCODE_IMAX,
2124 val[chan], min);
2125 }
2126
2127 args[4] = uint->one; /* COMPR flag */
2128 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2129 si_llvm_pack_two_int32_as_int16(gallivm, val));
2130 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2131 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2132 break;
2133 }
2134
2135 case V_028714_SPI_SHADER_32_ABGR:
2136 memcpy(&args[5], values, sizeof(values[0]) * 4);
2137 break;
2138 }
2139 }
2140
2141 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2142 LLVMValueRef alpha)
2143 {
2144 struct si_shader_context *ctx = si_shader_context(bld_base);
2145 struct gallivm_state *gallivm = bld_base->base.gallivm;
2146
2147 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2148 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2149 SI_PARAM_ALPHA_REF);
2150
2151 LLVMValueRef alpha_pass =
2152 lp_build_cmp(&bld_base->base,
2153 ctx->shader->key.ps.epilog.alpha_func,
2154 alpha, alpha_ref);
2155 LLVMValueRef arg =
2156 lp_build_select(&bld_base->base,
2157 alpha_pass,
2158 lp_build_const_float(gallivm, 1.0f),
2159 lp_build_const_float(gallivm, -1.0f));
2160
2161 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2162 ctx->voidt, &arg, 1, 0);
2163 } else {
2164 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2165 ctx->voidt, NULL, 0, 0);
2166 }
2167 }
2168
2169 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2170 LLVMValueRef alpha,
2171 unsigned samplemask_param)
2172 {
2173 struct si_shader_context *ctx = si_shader_context(bld_base);
2174 struct gallivm_state *gallivm = bld_base->base.gallivm;
2175 LLVMValueRef coverage;
2176
2177 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2178 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2179 samplemask_param);
2180 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2181
2182 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2183 ctx->i32,
2184 &coverage, 1, LLVMReadNoneAttribute);
2185
2186 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2187 ctx->f32, "");
2188
2189 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2190 lp_build_const_float(gallivm,
2191 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2192
2193 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2194 }
2195
/* Compute user clip distances from a CLIPVERTEX output.
 *
 * Loads the user clip planes from the SI_VS_CONST_CLIP_PLANES constant
 * buffer, dots each plane with the clip-vertex components in out_elts[0..3],
 * and fills in pos_args[2] and pos_args[3] (the two CLIPDIST position
 * exports, 4 distances each) including all export intrinsic arguments.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	/* Two exports cover 8 clip distances, 4 per export. */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start all four distances at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* args[1] is temporarily reused as the byte offset
				 * of component const_chan of clip plane
				 * (reg_index * 4 + chan); it is overwritten below. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(ctx, const_resource,
							     args[1]);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Fill in the remaining export arguments. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		args[1] = uint->zero; /* EXEC mask */
		args[2] = uint->zero; /* last export? */
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero; /* COMPR flag */
	}
}
2242
2243 static void si_dump_streamout(struct pipe_stream_output_info *so)
2244 {
2245 unsigned i;
2246
2247 if (so->num_outputs)
2248 fprintf(stderr, "STREAMOUT\n");
2249
2250 for (i = 0; i < so->num_outputs; i++) {
2251 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2252 so->output[i].start_component;
2253 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2254 i, so->output[i].output_buffer,
2255 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2256 so->output[i].register_index,
2257 mask & 1 ? "x" : "",
2258 mask & 2 ? "y" : "",
2259 mask & 4 ? "z" : "",
2260 mask & 8 ? "w" : "");
2261 }
2262 }
2263
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Emits IR that stores the selected outputs to the bound streamout
 * buffers via tbuffer stores. Stores are guarded so that only the
 * threads allowed by the hw (so_vtx_count) write, and each output is
 * further guarded by a compare against the currently selected stream.
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Currently selected stream, bits [25:24] of the streamout config. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 * ByteOffset = streamout_offset[buffer_id]*4 +
		 * (streamout_write_index + thread_id)*stride[buffer_id] +
		 * attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			/* Skip outputs the shader doesn't actually write. */
			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store if this output belongs to the stream
			 * currently selected by the hw. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2380
2381
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if any), then one export per output:
 * position-like outputs (POSITION, CLIPDIST, the misc vector) go to the
 * POS targets and are buffered in pos_args[] so the last one can be
 * flagged; everything else goes to PARAM targets immediately. Also
 * records nr_pos_exports/nr_param_exports and the per-output PARAM
 * offsets in shader->info.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
				&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector (POS + 1) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the misc vector (POS + 1) below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Saved for the misc vector, but also exported
			 * as a generic parameter for the next stage. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Converted into clip distances in pos_args[2..3]. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Buffer POS exports so the last one can be marked. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		/* Clip distances are exported both as POS and as PARAM. */
		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	/* Count the buffered position exports. */
	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	/* Emit the position exports with consecutive targets;
	 * the final one is marked as the last export. */
	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2556
/* Copy TCS inputs the shader key asks for from LDS to the offchip
 * tess ring buffer, so the TES can read them.
 *
 * Each bit in key.tcs.epilog.inputs_to_copy selects one input slot
 * (4 dwords) of the current invocation's vertex.
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	/* Get the offchip tess ring buffer descriptor. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
	                lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);

	/* LDS address of this invocation's vertex within the current patch. */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		/* Input slot i starts 4 dwords into the vertex. */
		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                            lp_build_const_int32(gallivm, 4 * i),
		                             "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
					      invocation_id,
					      lp_build_const_int32(gallivm, i));

		/* Load all 4 components from LDS and store them as dwords. */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
					      lds_ptr);

		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
					   buffer_offset, 0);
	}
}
2598
/* Read the tessellation levels from LDS and store them into the
 * tess factor ring buffer for the fixed-function tessellator.
 *
 * Only invocation 0 of the patch performs the stores. The layout
 * (number of outer/inner components and the per-patch stride) depends
 * on the primitive mode from the shader key.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have written their outputs to LDS. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer levels first, then inner levels, in one flat array. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only the first patch writes the control word at offset 0. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2704
/* This only writes the tessellation factor levels.
 *
 * Monolithic shaders copy the TCS inputs and write the factors
 * directly; non-monolithic shaders instead pass the values the
 * separately-compiled epilog part needs (RW_BUFFERS pointer, tess
 * factor soffset, and the three VGPRs) via the function return value.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer */
		/* The pointer is split into two i32s so it can be passed
		 * through the SGPR return slots. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs */
		/* VGPR return values must be floats for the calling convention. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2756
/* LS (vertex shader before tessellation) epilogue: write all vertex
 * outputs to LDS at this vertex's slot so the TCS can read them as
 * inputs. Each output occupies 4 dwords at its unique I/O index.
 */
static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned i, chan;
	LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
					      ctx->param_rel_auto_id);
	/* Per-vertex LDS stride in dwords, from the layout SGPR. */
	LLVMValueRef vertex_dw_stride =
		unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
	LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
						 vertex_dw_stride, "");

	/* Write outputs to LDS. The next shader (TCS aka HS) will read
	 * its inputs from it. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
		unsigned name = info->output_semantic_name[i];
		unsigned index = info->output_semantic_index[i];
		int param = si_shader_io_get_unique_index(name, index);
		LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
					lp_build_const_int32(gallivm, param * 4), "");

		for (chan = 0; chan < 4; chan++) {
			lds_store(bld_base, chan, dw_addr,
				  LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
		}
	}
}
2787
/* ES (vertex/tess-eval shader before GS) epilogue: store all outputs
 * to the ESGS ring buffer, component by component, so the geometry
 * shader can read them. VIEWPORT_INDEX and LAYER are skipped because
 * the GS doesn't consume them from the ring.
 */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* One dword per component at the output's slot offset. */
			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2826
2827 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2828 {
2829 struct si_shader_context *ctx = si_shader_context(bld_base);
2830 struct gallivm_state *gallivm = bld_base->base.gallivm;
2831 LLVMValueRef args[2];
2832
2833 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2834 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2835 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2836 ctx->voidt, args, 2, 0);
2837 }
2838
/* Epilogue for the hardware VS stage: optionally clamp vertex colors
 * (controlled at runtime by a state bit in an SGPR), gather all output
 * values, handle the PrimitiveID export, and emit the position and
 * parameter exports via si_llvm_export_vs().
 */
static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	struct si_shader_output_values *outputs = NULL;
	int i,j;

	assert(!ctx->is_gs_copy_shader);

	/* +1 reserves room for an extra PrimitiveID output (monolithic case). */
	outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));

	/* Vertex color clamping.
	 *
	 * This uses a state constant loaded in a user data SGPR and
	 * an IF statement is added that clamps all colors if the constant
	 * is true.
	 */
	if (ctx->type == PIPE_SHADER_VERTEX) {
		struct lp_build_if_state if_ctx;
		LLVMValueRef cond = NULL;
		LLVMValueRef addr, val;

		for (i = 0; i < info->num_outputs; i++) {
			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
				continue;

			/* We've found a color. */
			if (!cond) {
				/* The state is in the first bit of the user SGPR.
				 * The IF block is opened lazily, only once a
				 * color output actually exists. */
				cond = LLVMGetParam(ctx->radeon_bld.main_fn,
						    SI_PARAM_VS_STATE_BITS);
				cond = LLVMBuildTrunc(gallivm->builder, cond,
						      ctx->i1, "");
				lp_build_if(&if_ctx, gallivm, cond);
			}

			/* Clamp all four channels in place. */
			for (j = 0; j < 4; j++) {
				addr = ctx->radeon_bld.soa.outputs[i][j];
				val = LLVMBuildLoad(gallivm->builder, addr, "");
				val = radeon_llvm_saturate(bld_base, val);
				LLVMBuildStore(gallivm->builder, val, addr);
			}
		}

		if (cond)
			lp_build_endif(&if_ctx);
	}

	/* Read back all outputs (after any clamping above). */
	for (i = 0; i < info->num_outputs; i++) {
		outputs[i].name = info->output_semantic_name[i];
		outputs[i].sid = info->output_semantic_index[i];

		for (j = 0; j < 4; j++)
			outputs[i].values[j] =
				LLVMBuildLoad(gallivm->builder,
					      ctx->radeon_bld.soa.outputs[i][j],
					      "");
	}

	if (ctx->is_monolithic) {
		/* Export PrimitiveID when PS needs it. */
		if (si_vs_exports_prim_id(ctx->shader)) {
			outputs[i].name = TGSI_SEMANTIC_PRIMID;
			outputs[i].sid = 0;
			outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
						       get_primitive_id(bld_base, 0));
			outputs[i].values[1] = bld_base->base.undef;
			outputs[i].values[2] = bld_base->base.undef;
			outputs[i].values[3] = bld_base->base.undef;
			i++;
		}
	} else {
		/* Return the primitive ID from the LLVM function, so a
		 * separately-compiled epilog part can export it. */
		ctx->return_value =
			LLVMBuildInsertValue(gallivm->builder,
					     ctx->return_value,
					     bitcast(bld_base, TGSI_TYPE_FLOAT,
						     get_primitive_id(bld_base, 0)),
					     VS_EPILOG_PRIMID_LOC, "");
	}

	/* i now counts info->num_outputs plus the optional PrimitiveID. */
	si_llvm_export_vs(bld_base, outputs, i);
	FREE(outputs);
}
2925
/* Queue of pixel-shader export instructions.  Exports are collected
 * here and emitted together by si_emit_ps_exports(), each entry holding
 * the 9 arguments of one llvm.SI.export call. */
struct si_ps_exports {
	unsigned num;			/* number of queued exports */
	LLVMValueRef args[10][9];	/* argument lists, one row per export */
};
2930
2931 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
2932 bool writes_samplemask)
2933 {
2934 if (writes_z) {
2935 /* Z needs 32 bits. */
2936 if (writes_samplemask)
2937 return V_028710_SPI_SHADER_32_ABGR;
2938 else if (writes_stencil)
2939 return V_028710_SPI_SHADER_32_GR;
2940 else
2941 return V_028710_SPI_SHADER_32_R;
2942 } else if (writes_stencil || writes_samplemask) {
2943 /* Both stencil and sample mask need only 16 bits. */
2944 return V_028710_SPI_SHADER_UINT16_ABGR;
2945 } else {
2946 return V_028710_SPI_SHADER_ZERO;
2947 }
2948 }
2949
/* Build the MRTZ (depth/stencil/samplemask) export and queue it in
 * \p exp.  The channel layout must match the format chosen by
 * si_get_spi_shader_z_format() for the same combination of outputs.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMP flag */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		/* Compressed 16-bit export: only stencil and/or sample mask,
		 * never depth. */
		assert(!depth);
		args[4] = uint->one; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(base->gallivm->builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			/* Compressed exports enable channels in pairs. */
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args[6] = samplemask;
			mask |= 0xc;
		}
	} else {
		/* 32-bit export: one output per channel (R=depth, G=stencil,
		 * B=sample mask). */
		if (depth) {
			args[5] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args[6] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args[7] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	memcpy(exp->args[exp->num++], args, sizeof(args));
}
3020
/* Apply the PS epilog state transforms (color clamp, alpha-to-one,
 * alpha test, line/polygon smoothing) to \p color and queue the color
 * export(s) for MRT \p index in \p exp.  \p is_last marks the final
 * color export so the EXEC-valid and DONE bits can be set on it.
 * When FS_COLOR0_WRITES_ALL_CBUFS is enabled, color0 is broadcast to
 * every bound colorbuffer.
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only performed against the color-0 alpha. */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			/* A zero channel mask means a NULL export. */
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		memcpy(exp->args[exp->num++], args, sizeof(args));
	}
}
3087
3088 static void si_emit_ps_exports(struct si_shader_context *ctx,
3089 struct si_ps_exports *exp)
3090 {
3091 for (unsigned i = 0; i < exp->num; i++)
3092 lp_build_intrinsic(ctx->radeon_bld.gallivm.builder,
3093 "llvm.SI.export", ctx->voidt,
3094 exp->args[i], 9, 0);
3095 }
3096
3097 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3098 {
3099 struct si_shader_context *ctx = si_shader_context(bld_base);
3100 struct lp_build_context *base = &bld_base->base;
3101 struct lp_build_context *uint = &bld_base->uint_bld;
3102 LLVMValueRef args[9];
3103
3104 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3105 args[1] = uint->one; /* whether the EXEC mask is valid */
3106 args[2] = uint->one; /* DONE bit */
3107 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3108 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3109 args[5] = base->undef; /* R */
3110 args[6] = base->undef; /* G */
3111 args[7] = base->undef; /* B */
3112 args[8] = base->undef; /* A */
3113
3114 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3115 ctx->voidt, args, 9, 0);
3116 }
3117
/* Monolithic pixel-shader epilogue: read all PS outputs, queue the
 * color and MRTZ exports, and emit them.  (The non-monolithic path
 * instead returns outputs via si_llvm_return_fs_outputs().)
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;
	struct si_ps_exports exp = {};

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* 4 format bits per colorbuffer; nonzero = enabled. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of the position output is depth. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i, &exp);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ is always the last export when present. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);

	si_emit_ps_exports(ctx, &exp);
}
3209
3210 /**
3211 * Return PS outputs in this order:
3212 *
3213 * v[0:3] = color0.xyzw
3214 * v[4:7] = color1.xyzw
3215 * ...
3216 * vN+0 = Depth
3217 * vN+1 = Stencil
3218 * vN+2 = SampleMask
3219 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3220 *
3221 * The alpha-ref SGPR is returned via its original location.
3222 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of the position output is depth. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. The alpha-ref value is passed through unchanged so the
	 * epilog part can perform the alpha test. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs: colors first, in order, then depth/stencil/samplemask.
	 * Unwritten colors are skipped entirely (no gaps). */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end.  Pad up to the
	 * fixed location the epilog expects. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3303
3304 /**
3305 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3306 * buffer in number of elements and return it as an i32.
3307 */
3308 static LLVMValueRef get_buffer_size(
3309 struct lp_build_tgsi_context *bld_base,
3310 LLVMValueRef descriptor)
3311 {
3312 struct si_shader_context *ctx = si_shader_context(bld_base);
3313 struct gallivm_state *gallivm = bld_base->base.gallivm;
3314 LLVMBuilderRef builder = gallivm->builder;
3315 LLVMValueRef size =
3316 LLVMBuildExtractElement(builder, descriptor,
3317 lp_build_const_int32(gallivm, 6), "");
3318
3319 if (ctx->screen->b.chip_class >= VI) {
3320 /* On VI, the descriptor contains the size in bytes,
3321 * but TXQ must return the size in elements.
3322 * The stride is always non-zero for resources using TXQ.
3323 */
3324 LLVMValueRef stride =
3325 LLVMBuildExtractElement(builder, descriptor,
3326 lp_build_const_int32(gallivm, 5), "");
3327 stride = LLVMBuildLShr(builder, stride,
3328 lp_build_const_int32(gallivm, 16), "");
3329 stride = LLVMBuildAnd(builder, stride,
3330 lp_build_const_int32(gallivm, 0x3FFF), "");
3331
3332 size = LLVMBuildUDiv(builder, size, stride, "");
3333 }
3334
3335 return size;
3336 }
3337
3338 /**
3339 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3340 * intrinsic names).
3341 */
3342 static void build_int_type_name(
3343 LLVMTypeRef type,
3344 char *buf, unsigned bufsize)
3345 {
3346 assert(bufsize >= 6);
3347
3348 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
3349 snprintf(buf, bufsize, "v%ui32",
3350 LLVMGetVectorSize(type));
3351 else
3352 strcpy(buf, "i32");
3353 }
3354
3355 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3356 struct lp_build_tgsi_context *bld_base,
3357 struct lp_build_emit_data *emit_data);
3358
3359 /* Prevent optimizations (at least of memory accesses) across the current
3360 * point in the program by emitting empty inline assembly that is marked as
3361 * having side effects.
3362 */
3363 static void emit_optimization_barrier(struct si_shader_context *ctx)
3364 {
3365 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3366 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3367 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3368 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3369 }
3370
/* Emit s_waitcnt to wait for outstanding memory operations before
 * proceeding; used to implement memory barriers and volatile accesses. */
static void emit_waitcnt(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	/* NOTE(review): 0xf70 is the raw s_waitcnt immediate; presumably it
	 * waits for vector memory ops (vmcnt=0) while leaving the other
	 * counters unconstrained — confirm against the GCN ISA encoding. */
	LLVMValueRef args[1] = {
		lp_build_const_int32(gallivm, 0xf70)
	};
	lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
			   ctx->voidt, args, 1, 0);
}
3381
/* TGSI MEMBAR opcode: implemented as a full wait for outstanding
 * memory operations. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3391
3392 static LLVMValueRef
3393 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3394 const struct tgsi_full_src_register *reg)
3395 {
3396 LLVMValueRef ind_index;
3397 LLVMValueRef rsrc_ptr;
3398
3399 if (!reg->Register.Indirect)
3400 return ctx->shader_buffers[reg->Register.Index];
3401
3402 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
3403 reg->Register.Index,
3404 SI_NUM_SHADER_BUFFERS);
3405
3406 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
3407 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3408 }
3409
3410 static bool tgsi_is_array_sampler(unsigned target)
3411 {
3412 return target == TGSI_TEXTURE_1D_ARRAY ||
3413 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3414 target == TGSI_TEXTURE_2D_ARRAY ||
3415 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3416 target == TGSI_TEXTURE_CUBE_ARRAY ||
3417 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3418 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3419 }
3420
3421 static bool tgsi_is_array_image(unsigned target)
3422 {
3423 return target == TGSI_TEXTURE_3D ||
3424 target == TGSI_TEXTURE_CUBE ||
3425 target == TGSI_TEXTURE_1D_ARRAY ||
3426 target == TGSI_TEXTURE_2D_ARRAY ||
3427 target == TGSI_TEXTURE_CUBE_ARRAY ||
3428 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3429 }
3430
3431 /**
3432 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3433 *
3434 * At least on Tonga, executing image stores on images with DCC enabled and
3435 * non-trivial can eventually lead to lockups. This can occur when an
3436 * application binds an image as read-only but then uses a shader that writes
3437 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3438 * program termination) in this case, but it doesn't cost much to be a bit
3439 * nicer: disabling DCC in the shader still leads to undefined results but
3440 * avoids the lockup.
3441 */
3442 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3443 LLVMValueRef rsrc)
3444 {
3445 if (ctx->screen->b.chip_class <= CIK) {
3446 return rsrc;
3447 } else {
3448 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3449 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3450 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3451 LLVMValueRef tmp;
3452
3453 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3454 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3455 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3456 }
3457 }
3458
3459 /**
3460 * Load the resource descriptor for \p image.
3461 */
3462 static void
3463 image_fetch_rsrc(
3464 struct lp_build_tgsi_context *bld_base,
3465 const struct tgsi_full_src_register *image,
3466 bool dcc_off,
3467 LLVMValueRef *rsrc)
3468 {
3469 struct si_shader_context *ctx = si_shader_context(bld_base);
3470
3471 assert(image->Register.File == TGSI_FILE_IMAGE);
3472
3473 if (!image->Register.Indirect) {
3474 /* Fast path: use preloaded resources */
3475 *rsrc = ctx->images[image->Register.Index];
3476 } else {
3477 /* Indexing and manual load */
3478 LLVMValueRef ind_index;
3479 LLVMValueRef rsrc_ptr;
3480 LLVMValueRef tmp;
3481
3482 /* From the GL_ARB_shader_image_load_store extension spec:
3483 *
3484 * If a shader performs an image load, store, or atomic
3485 * operation using an image variable declared as an array,
3486 * and if the index used to select an individual element is
3487 * negative or greater than or equal to the size of the
3488 * array, the results of the operation are undefined but may
3489 * not lead to termination.
3490 */
3491 ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
3492 image->Register.Index,
3493 SI_NUM_IMAGES);
3494
3495 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
3496 tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3497 if (dcc_off)
3498 tmp = force_dcc_off(ctx, tmp);
3499 *rsrc = tmp;
3500 }
3501 }
3502
/* Fetch the coordinate operands of an image instruction from source
 * slot \p src and gather them into a single i32 scalar or vector,
 * ready to be passed to an image intrinsic. */
static LLVMValueRef image_fetch_coords(
		struct lp_build_tgsi_context *bld_base,
		const struct tgsi_full_instruction *inst,
		unsigned src)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned target = inst->Memory.Texture;
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	LLVMValueRef coords[4];
	LLVMValueRef tmp;
	int chan;

	for (chan = 0; chan < num_coords; ++chan) {
		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
		coords[chan] = tmp;
	}

	if (num_coords == 1)
		return coords[0];

	if (num_coords == 3) {
		/* LLVM has difficulties lowering 3-element vectors. */
		coords[3] = bld_base->uint_bld.undef;
		num_coords = 4;
	}

	return lp_build_gather_values(gallivm, coords, num_coords);
}
3533
3534 /**
3535 * Append the extra mode bits that are used by image load and store.
3536 */
3537 static void image_append_args(
3538 struct si_shader_context *ctx,
3539 struct lp_build_emit_data * emit_data,
3540 unsigned target,
3541 bool atomic)
3542 {
3543 const struct tgsi_full_instruction *inst = emit_data->inst;
3544 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3545 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3546
3547 emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3548 emit_data->args[emit_data->arg_count++] =
3549 tgsi_is_array_image(target) ? i1true : i1false; /* da */
3550 if (!atomic) {
3551 emit_data->args[emit_data->arg_count++] =
3552 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3553 i1true : i1false; /* glc */
3554 }
3555 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3556 }
3557
3558 /**
3559 * Given a 256 bit resource, extract the top half (which stores the buffer
3560 * resource in the case of textures and images).
3561 */
3562 static LLVMValueRef extract_rsrc_top_half(
3563 struct si_shader_context *ctx,
3564 LLVMValueRef rsrc)
3565 {
3566 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3567 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3568 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3569
3570 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3571 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3572 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3573
3574 return rsrc;
3575 }
3576
3577 /**
3578 * Append the resource and indexing arguments for buffer intrinsics.
3579 *
3580 * \param rsrc the v4i32 buffer resource
3581 * \param index index into the buffer (stride-based)
3582 * \param offset byte offset into the buffer
3583 */
3584 static void buffer_append_args(
3585 struct si_shader_context *ctx,
3586 struct lp_build_emit_data *emit_data,
3587 LLVMValueRef rsrc,
3588 LLVMValueRef index,
3589 LLVMValueRef offset,
3590 bool atomic)
3591 {
3592 const struct tgsi_full_instruction *inst = emit_data->inst;
3593 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3594 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3595
3596 emit_data->args[emit_data->arg_count++] = rsrc;
3597 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3598 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3599 if (!atomic) {
3600 emit_data->args[emit_data->arg_count++] =
3601 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3602 i1true : i1false; /* glc */
3603 }
3604 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3605 }
3606
/* Fetch-args hook for TGSI LOAD: assemble the intrinsic argument list
 * depending on the source kind (shader buffer, buffer image, or
 * regular image).  The actual intrinsic is chosen in load_emit(). */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images are addressed like buffers, using the
			 * buffer descriptor in the top half of the resource. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3651
/* Emit a buffer load sized by the destination writemask: one, two, or
 * four f32 components (three-component loads are widened to four). */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: // 3 & 4
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute);
}
3683
/* Compute a typed pointer into LDS (shared memory) for a TGSI_FILE_MEMORY
 * access: the dword offset comes from source operand \p arg, and the
 * resulting pointer is cast to \p type in the same address space. */
static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
				   const struct tgsi_full_instruction *inst,
				   LLVMTypeRef type, int arg)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef offset, ptr;
	int addr_space;

	offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
	offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");

	ptr = ctx->shared_memory;
	ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
	/* Preserve the LDS address space when retyping the pointer. */
	addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
	ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");

	return ptr;
}
3703
/* Emit a TGSI LOAD from shared memory (LDS): load each channel enabled
 * in the writemask and gather them into a 4-component vector, with
 * undef in the disabled channels. */
static void load_emit_memory(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	unsigned writemask = inst->Dst[0].Register.WriteMask;
	LLVMValueRef channels[4], ptr, derived_ptr, index;
	int chan;

	ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);

	for (chan = 0; chan < 4; ++chan) {
		if (!(writemask & (1 << chan))) {
			channels[chan] = LLVMGetUndef(base->elem_type);
			continue;
		}

		index = lp_build_const_int32(gallivm, chan);
		derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
		channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
	}
	emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
}
3730
/* Emit hook for TGSI LOAD: dispatch to the shared-memory, shader-buffer,
 * buffer-image or typed-image path, emitting a waitcnt first for
 * volatile accesses. */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Volatile loads must see the results of all prior memory ops. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	} else {
		/* Typed image load: the intrinsic name is overloaded on the
		 * coordinate type (i32 / v2i32 / v4i32). */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute);
	}
}
3776
/* Gather the arguments for a TGSI STORE instruction.
 *
 * The value to store always comes first in the argument list; the
 * remaining arguments (resource descriptor, offset/coordinates, cache
 * policy) depend on whether the destination is a shader buffer or an
 * image.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	/* Stores return nothing. */
	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* Fetch the full 4-channel value to store (source operand 1). */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination register, viewed as a source, for descriptor
	 * fetching helpers. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Byte offset comes from source operand 0. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			/* Buffer images use only the upper half of the
			 * descriptor as a buffer resource. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			/* image.store argument layout: data, coords, rsrc,
			 * dmask, then the flags appended below. */
			emit_data->args[1] = coords;
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3835
/* Emit a store to a shader buffer (SSBO).
 *
 * The source vector is split according to the destination writemask
 * into runs of consecutive channels, and one buffer.store intrinsic
 * (f32, v2f32 or v4f32) is emitted per run.
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Re-queue the third channel for the next pass. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from the two selected channels. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first channel. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count, 0);
	}
}
3907
3908 static void store_emit_memory(
3909 struct si_shader_context *ctx,
3910 struct lp_build_emit_data *emit_data)
3911 {
3912 const struct tgsi_full_instruction *inst = emit_data->inst;
3913 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3914 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3915 LLVMBuilderRef builder = gallivm->builder;
3916 unsigned writemask = inst->Dst[0].Register.WriteMask;
3917 LLVMValueRef ptr, derived_ptr, data, index;
3918 int chan;
3919
3920 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3921
3922 for (chan = 0; chan < 4; ++chan) {
3923 if (!(writemask & (1 << chan))) {
3924 continue;
3925 }
3926 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3927 index = lp_build_const_int32(gallivm, chan);
3928 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3929 LLVMBuildStore(builder, data, derived_ptr);
3930 }
3931 }
3932
/* Emit a TGSI STORE to shared memory, a shader buffer, or an image. */
static void store_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	char intrinsic_name[32];
	char coords_type[8];

	/* Shared (LDS) memory takes a separate path; note it is handled
	 * before the volatile check below, unlike buffers/images. */
	if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
		store_emit_memory(ctx, emit_data);
		return;
	}

	/* Wait for outstanding memory operations before a volatile store. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		store_emit_buffer(ctx, emit_data);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, "llvm.amdgcn.buffer.store.format.v4f32",
			emit_data->dst_type, emit_data->args,
			emit_data->arg_count, 0);
	} else {
		/* The image intrinsic name is suffixed with the coordinate
		 * vector type, e.g. "llvm.amdgcn.image.store.v4i32". */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.store.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count, 0);
	}
}
3976
/* Gather the arguments for a TGSI atomic instruction.
 *
 * Operand layout (TGSI): Src[0] = resource, Src[1] = offset/coords,
 * Src[2] = data, Src[3] = comparison value (ATOMCAS only).
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	/* data2 is only fetched (and only used below) for ATOMCAS. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* For buffer images only the descriptor (not the sampler
		 * view flag) is needed; hence the target check. */
		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
4036
4037 static void atomic_emit_memory(struct si_shader_context *ctx,
4038 struct lp_build_emit_data *emit_data) {
4039 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4040 LLVMBuilderRef builder = gallivm->builder;
4041 const struct tgsi_full_instruction * inst = emit_data->inst;
4042 LLVMValueRef ptr, result, arg;
4043
4044 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4045
4046 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
4047 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4048
4049 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4050 LLVMValueRef new_data;
4051 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4052 inst, 3, 0);
4053
4054 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4055
4056 #if HAVE_LLVM >= 0x309
4057 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4058 LLVMAtomicOrderingSequentiallyConsistent,
4059 LLVMAtomicOrderingSequentiallyConsistent,
4060 false);
4061 #endif
4062
4063 result = LLVMBuildExtractValue(builder, result, 0, "");
4064 } else {
4065 LLVMAtomicRMWBinOp op;
4066
4067 switch(inst->Instruction.Opcode) {
4068 case TGSI_OPCODE_ATOMUADD:
4069 op = LLVMAtomicRMWBinOpAdd;
4070 break;
4071 case TGSI_OPCODE_ATOMXCHG:
4072 op = LLVMAtomicRMWBinOpXchg;
4073 break;
4074 case TGSI_OPCODE_ATOMAND:
4075 op = LLVMAtomicRMWBinOpAnd;
4076 break;
4077 case TGSI_OPCODE_ATOMOR:
4078 op = LLVMAtomicRMWBinOpOr;
4079 break;
4080 case TGSI_OPCODE_ATOMXOR:
4081 op = LLVMAtomicRMWBinOpXor;
4082 break;
4083 case TGSI_OPCODE_ATOMUMIN:
4084 op = LLVMAtomicRMWBinOpUMin;
4085 break;
4086 case TGSI_OPCODE_ATOMUMAX:
4087 op = LLVMAtomicRMWBinOpUMax;
4088 break;
4089 case TGSI_OPCODE_ATOMIMIN:
4090 op = LLVMAtomicRMWBinOpMin;
4091 break;
4092 case TGSI_OPCODE_ATOMIMAX:
4093 op = LLVMAtomicRMWBinOpMax;
4094 break;
4095 default:
4096 unreachable("unknown atomic opcode");
4097 }
4098
4099 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4100 LLVMAtomicOrderingSequentiallyConsistent,
4101 false);
4102 }
4103 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4104 }
4105
/* Emit an atomic operation on a shader buffer or an image.
 *
 * Shared-memory atomics are dispatched to atomic_emit_memory(); the
 * buffer/image cases call llvm.amdgcn.{buffer,image}.atomic.* with the
 * op name taken from the action table.
 */
static void atomic_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[40];
	LLVMValueRef tmp;

	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		atomic_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
	} else {
		char coords_type[8];

		/* Image atomics are additionally suffixed with the
		 * coordinate vector type. */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.atomic.%s.%s",
			 action->intr_name, coords_type);
	}

	/* The intrinsic returns the previous value as i32; bitcast it to
	 * the float base type TGSI expects. */
	tmp = lp_build_intrinsic(
		builder, intrinsic_name, bld_base->uint_bld.elem_type,
		emit_data->args, emit_data->arg_count, 0);
	emit_data->output[emit_data->chan] =
		LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
}
4143
/* Gather the arguments for a RESQ (resource query) instruction.
 *
 * Shader buffers and buffer images only need the descriptor; other
 * image targets use the full llvm.SI.getresinfo.i32 argument list
 * (mip level, resource, dmask, unorm, r128, da, glc, slc, tfe, lwe).
 */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = ctx->v4i32;

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4176
4177 static void resq_emit(
4178 const struct lp_build_tgsi_action *action,
4179 struct lp_build_tgsi_context *bld_base,
4180 struct lp_build_emit_data *emit_data)
4181 {
4182 struct gallivm_state *gallivm = bld_base->base.gallivm;
4183 LLVMBuilderRef builder = gallivm->builder;
4184 const struct tgsi_full_instruction *inst = emit_data->inst;
4185 LLVMValueRef out;
4186
4187 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4188 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4189 lp_build_const_int32(gallivm, 2), "");
4190 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4191 out = get_buffer_size(bld_base, emit_data->args[0]);
4192 } else {
4193 out = lp_build_intrinsic(
4194 builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
4195 emit_data->args, emit_data->arg_count,
4196 LLVMReadNoneAttribute);
4197
4198 /* Divide the number of layers by 6 to get the number of cubes. */
4199 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
4200 LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
4201 LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
4202
4203 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4204 z = LLVMBuildSDiv(builder, z, imm6, "");
4205 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4206 }
4207 }
4208
4209 emit_data->output[emit_data->chan] = out;
4210 }
4211
/* Fill in the common argument list shared by the texture intrinsics.
 *
 * Packs the coordinate/address words in "param" into a power-of-two
 * vector, then appends resource, (optionally) sampler, and the fixed
 * flag arguments in positional order.
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF and TXQ return integer data and take no sampler. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
							   tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4257
4258 static const struct lp_build_tgsi_action tex_action;
4259
/* Kind of descriptor fetched from the sampler/view descriptor list;
 * selects which slice of a list entry is loaded (see
 * get_sampler_desc_custom). */
enum desc_type {
	DESC_IMAGE,	/* image view, dwords [0:7] of the entry */
	DESC_FMASK,	/* FMASK view, dwords [8:15] */
	DESC_SAMPLER	/* sampler state, dwords [12:15] */
};
4265
4266 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4267 {
4268 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4269 CONST_ADDR_SPACE);
4270 }
4271
/**
 * Load an image view, FMASK view, or sampler state descriptor from a
 * given descriptor list.
 *
 * Each list entry holds an image view at dwords [0:7], an FMASK view at
 * [8:15] and the sampler state at [12:15]; "index" is scaled and offset
 * accordingly before the load.
 */
static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
					    LLVMValueRef list, LLVMValueRef index,
					    enum desc_type type)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	switch (type) {
	case DESC_IMAGE:
		/* The image is at [0:7]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		break;
	case DESC_FMASK:
		/* The FMASK is at [8:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
		break;
	case DESC_SAMPLER:
		/* The sampler state is at [12:15]; re-view the list as an
		 * array of v4i32 so the smaller stride works out. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
		list = LLVMBuildPointerCast(builder, list,
					    const_array(ctx->v4i32, 0), "");
		break;
	}

	return build_indexed_load_const(ctx, list, index);
}
4303
4304 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4305 LLVMValueRef index, enum desc_type type)
4306 {
4307 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4308 SI_PARAM_SAMPLERS);
4309
4310 return get_sampler_desc_custom(ctx, list, index, type);
4311 }
4312
4313 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4314 *
4315 * SI-CI:
4316 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4317 * filtering manually. The driver sets img7 to a mask clearing
4318 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4319 * s_and_b32 samp0, samp0, img7
4320 *
4321 * VI:
4322 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4323 */
4324 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4325 LLVMValueRef res, LLVMValueRef samp)
4326 {
4327 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4328 LLVMValueRef img7, samp0;
4329
4330 if (ctx->screen->b.chip_class >= VI)
4331 return samp;
4332
4333 img7 = LLVMBuildExtractElement(builder, res,
4334 LLVMConstInt(ctx->i32, 7, 0), "");
4335 samp0 = LLVMBuildExtractElement(builder, samp,
4336 LLVMConstInt(ctx->i32, 0, 0), "");
4337 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4338 return LLVMBuildInsertElement(builder, samp, samp0,
4339 LLVMConstInt(ctx->i32, 0, 0), "");
4340 }
4341
/* Fetch the resource, sampler-state and FMASK descriptors for a texture
 * instruction.
 *
 * For directly-indexed samplers, cached values from the context are
 * used; for indirectly-indexed ones the descriptors are loaded through
 * get_sampler_desc(). MSAA targets get an FMASK pointer instead of a
 * sampler state. samp_ptr/fmask_ptr may be NULL if the caller does not
 * need them.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	/* The sampler is always the last source operand. */
	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Clamp the computed index to the valid sampler range. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		/* Direct index: use the preloaded descriptors. */
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4389
4390 static void txq_fetch_args(
4391 struct lp_build_tgsi_context *bld_base,
4392 struct lp_build_emit_data *emit_data)
4393 {
4394 struct si_shader_context *ctx = si_shader_context(bld_base);
4395 struct gallivm_state *gallivm = bld_base->base.gallivm;
4396 LLVMBuilderRef builder = gallivm->builder;
4397 const struct tgsi_full_instruction *inst = emit_data->inst;
4398 unsigned target = inst->Texture.Texture;
4399 LLVMValueRef res_ptr;
4400 LLVMValueRef address;
4401
4402 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4403
4404 if (target == TGSI_TEXTURE_BUFFER) {
4405 /* Read the size from the buffer descriptor directly. */
4406 LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4407 emit_data->args[0] = get_buffer_size(bld_base, res);
4408 return;
4409 }
4410
4411 /* Textures - set the mip level. */
4412 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4413
4414 set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
4415 NULL, &address, 1, 0xf);
4416 }
4417
/* Emit a TXQ (texture size query) instruction. */
static void txq_emit(const struct lp_build_tgsi_action *action,
		     struct lp_build_tgsi_context *bld_base,
		     struct lp_build_emit_data *emit_data)
{
	struct lp_build_context *base = &bld_base->base;
	unsigned target = emit_data->inst->Texture.Texture;

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Just return the buffer size. */
		emit_data->output[emit_data->chan] = emit_data->args[0];
		return;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
		base->gallivm->builder, "llvm.SI.getresinfo.i32",
		emit_data->dst_type, emit_data->args, emit_data->arg_count,
		LLVMReadNoneAttribute);

	/* Divide the number of layers by 6 to get the number of cubes. */
	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
		LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);

		/* The layer count is in the Z component (index 2). */
		LLVMValueRef v4 = emit_data->output[emit_data->chan];
		LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
		z = LLVMBuildSDiv(builder, z, six, "");

		emit_data->output[emit_data->chan] =
			LLVMBuildInsertElement(builder, v4, z, two, "");
	}
}
4451
4452 static void tex_fetch_args(
4453 struct lp_build_tgsi_context *bld_base,
4454 struct lp_build_emit_data *emit_data)
4455 {
4456 struct si_shader_context *ctx = si_shader_context(bld_base);
4457 struct gallivm_state *gallivm = bld_base->base.gallivm;
4458 const struct tgsi_full_instruction *inst = emit_data->inst;
4459 unsigned opcode = inst->Instruction.Opcode;
4460 unsigned target = inst->Texture.Texture;
4461 LLVMValueRef coords[5], derivs[6];
4462 LLVMValueRef address[16];
4463 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4464 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4465 unsigned count = 0;
4466 unsigned chan;
4467 unsigned num_deriv_channels = 0;
4468 bool has_offset = inst->Texture.NumOffsets > 0;
4469 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4470 unsigned dmask = 0xf;
4471
4472 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4473
4474 if (target == TGSI_TEXTURE_BUFFER) {
4475 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4476
4477 /* Bitcast and truncate v8i32 to v16i8. */
4478 LLVMValueRef res = res_ptr;
4479 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4480 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4481 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4482
4483 emit_data->dst_type = ctx->v4f32;
4484 emit_data->args[0] = res;
4485 emit_data->args[1] = bld_base->uint_bld.zero;
4486 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4487 emit_data->arg_count = 3;
4488 return;
4489 }
4490
4491 /* Fetch and project texture coordinates */
4492 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4493 for (chan = 0; chan < 3; chan++ ) {
4494 coords[chan] = lp_build_emit_fetch(bld_base,
4495 emit_data->inst, 0,
4496 chan);
4497 if (opcode == TGSI_OPCODE_TXP)
4498 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4499 TGSI_OPCODE_DIV,
4500 coords[chan],
4501 coords[3]);
4502 }
4503
4504 if (opcode == TGSI_OPCODE_TXP)
4505 coords[3] = bld_base->base.one;
4506
4507 /* Pack offsets. */
4508 if (has_offset && opcode != TGSI_OPCODE_TXF) {
4509 /* The offsets are six-bit signed integers packed like this:
4510 * X=[5:0], Y=[13:8], and Z=[21:16].
4511 */
4512 LLVMValueRef offset[3], pack;
4513
4514 assert(inst->Texture.NumOffsets == 1);
4515
4516 for (chan = 0; chan < 3; chan++) {
4517 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4518 emit_data->inst, 0, chan);
4519 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4520 lp_build_const_int32(gallivm, 0x3f), "");
4521 if (chan)
4522 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4523 lp_build_const_int32(gallivm, chan*8), "");
4524 }
4525
4526 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4527 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4528 address[count++] = pack;
4529 }
4530
4531 /* Pack LOD bias value */
4532 if (opcode == TGSI_OPCODE_TXB)
4533 address[count++] = coords[3];
4534 if (opcode == TGSI_OPCODE_TXB2)
4535 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4536
4537 /* Pack depth comparison value */
4538 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4539 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4540 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4541 } else {
4542 assert(ref_pos >= 0);
4543 address[count++] = coords[ref_pos];
4544 }
4545 }
4546
4547 /* Pack user derivatives */
4548 if (opcode == TGSI_OPCODE_TXD) {
4549 int param, num_src_deriv_channels;
4550
4551 switch (target) {
4552 case TGSI_TEXTURE_3D:
4553 num_src_deriv_channels = 3;
4554 num_deriv_channels = 3;
4555 break;
4556 case TGSI_TEXTURE_2D:
4557 case TGSI_TEXTURE_SHADOW2D:
4558 case TGSI_TEXTURE_RECT:
4559 case TGSI_TEXTURE_SHADOWRECT:
4560 case TGSI_TEXTURE_2D_ARRAY:
4561 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4562 num_src_deriv_channels = 2;
4563 num_deriv_channels = 2;
4564 break;
4565 case TGSI_TEXTURE_CUBE:
4566 case TGSI_TEXTURE_SHADOWCUBE:
4567 case TGSI_TEXTURE_CUBE_ARRAY:
4568 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4569 /* Cube derivatives will be converted to 2D. */
4570 num_src_deriv_channels = 3;
4571 num_deriv_channels = 2;
4572 break;
4573 case TGSI_TEXTURE_1D:
4574 case TGSI_TEXTURE_SHADOW1D:
4575 case TGSI_TEXTURE_1D_ARRAY:
4576 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4577 num_src_deriv_channels = 1;
4578 num_deriv_channels = 1;
4579 break;
4580 default:
4581 unreachable("invalid target");
4582 }
4583
4584 for (param = 0; param < 2; param++)
4585 for (chan = 0; chan < num_src_deriv_channels; chan++)
4586 derivs[param * num_src_deriv_channels + chan] =
4587 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4588 }
4589
4590 if (target == TGSI_TEXTURE_CUBE ||
4591 target == TGSI_TEXTURE_CUBE_ARRAY ||
4592 target == TGSI_TEXTURE_SHADOWCUBE ||
4593 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4594 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4595
4596 if (opcode == TGSI_OPCODE_TXD)
4597 for (int i = 0; i < num_deriv_channels * 2; i++)
4598 address[count++] = derivs[i];
4599
4600 /* Pack texture coordinates */
4601 address[count++] = coords[0];
4602 if (num_coords > 1)
4603 address[count++] = coords[1];
4604 if (num_coords > 2)
4605 address[count++] = coords[2];
4606
4607 /* Pack LOD or sample index */
4608 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4609 address[count++] = coords[3];
4610 else if (opcode == TGSI_OPCODE_TXL2)
4611 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4612
4613 if (count > 16) {
4614 assert(!"Cannot handle more than 16 texture address parameters");
4615 count = 16;
4616 }
4617
4618 for (chan = 0; chan < count; chan++ ) {
4619 address[chan] = LLVMBuildBitCast(gallivm->builder,
4620 address[chan], ctx->i32, "");
4621 }
4622
4623 /* Adjust the sample index according to FMASK.
4624 *
4625 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4626 * which is the identity mapping. Each nibble says which physical sample
4627 * should be fetched to get that sample.
4628 *
4629 * For example, 0x11111100 means there are only 2 samples stored and
4630 * the second sample covers 3/4 of the pixel. When reading samples 0
4631 * and 1, return physical sample 0 (determined by the first two 0s
4632 * in FMASK), otherwise return physical sample 1.
4633 *
4634 * The sample index should be adjusted as follows:
4635 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4636 */
4637 if (target == TGSI_TEXTURE_2D_MSAA ||
4638 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4639 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4640 struct lp_build_emit_data txf_emit_data = *emit_data;
4641 LLVMValueRef txf_address[4];
4642 unsigned txf_count = count;
4643 struct tgsi_full_instruction inst = {};
4644
4645 memcpy(txf_address, address, sizeof(txf_address));
4646
4647 if (target == TGSI_TEXTURE_2D_MSAA) {
4648 txf_address[2] = bld_base->uint_bld.zero;
4649 }
4650 txf_address[3] = bld_base->uint_bld.zero;
4651
4652 /* Read FMASK using TXF. */
4653 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4654 inst.Texture.Texture = target;
4655 txf_emit_data.inst = &inst;
4656 txf_emit_data.chan = 0;
4657 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4658 target, fmask_ptr, NULL,
4659 txf_address, txf_count, 0xf);
4660 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4661
4662 /* Initialize some constants. */
4663 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4664 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4665
4666 /* Apply the formula. */
4667 LLVMValueRef fmask =
4668 LLVMBuildExtractElement(gallivm->builder,
4669 txf_emit_data.output[0],
4670 uint_bld->zero, "");
4671
4672 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4673
4674 LLVMValueRef sample_index4 =
4675 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4676
4677 LLVMValueRef shifted_fmask =
4678 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4679
4680 LLVMValueRef final_sample =
4681 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4682
4683 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4684 * resource descriptor is 0 (invalid),
4685 */
4686 LLVMValueRef fmask_desc =
4687 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4688 ctx->v8i32, "");
4689
4690 LLVMValueRef fmask_word1 =
4691 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4692 uint_bld->one, "");
4693
4694 LLVMValueRef word1_is_nonzero =
4695 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4696 fmask_word1, uint_bld->zero, "");
4697
4698 /* Replace the MSAA sample index. */
4699 address[sample_chan] =
4700 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4701 final_sample, address[sample_chan], "");
4702 }
4703
4704 if (opcode == TGSI_OPCODE_TXF) {
4705 /* add tex offsets */
4706 if (inst->Texture.NumOffsets) {
4707 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4708 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4709 const struct tgsi_texture_offset *off = inst->TexOffsets;
4710
4711 assert(inst->Texture.NumOffsets == 1);
4712
4713 switch (target) {
4714 case TGSI_TEXTURE_3D:
4715 address[2] = lp_build_add(uint_bld, address[2],
4716 bld->immediates[off->Index][off->SwizzleZ]);
4717 /* fall through */
4718 case TGSI_TEXTURE_2D:
4719 case TGSI_TEXTURE_SHADOW2D:
4720 case TGSI_TEXTURE_RECT:
4721 case TGSI_TEXTURE_SHADOWRECT:
4722 case TGSI_TEXTURE_2D_ARRAY:
4723 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4724 address[1] =
4725 lp_build_add(uint_bld, address[1],
4726 bld->immediates[off->Index][off->SwizzleY]);
4727 /* fall through */
4728 case TGSI_TEXTURE_1D:
4729 case TGSI_TEXTURE_SHADOW1D:
4730 case TGSI_TEXTURE_1D_ARRAY:
4731 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4732 address[0] =
4733 lp_build_add(uint_bld, address[0],
4734 bld->immediates[off->Index][off->SwizzleX]);
4735 break;
4736 /* texture offsets do not apply to other texture targets */
4737 }
4738 }
4739 }
4740
4741 if (opcode == TGSI_OPCODE_TG4) {
4742 unsigned gather_comp = 0;
4743
4744 /* DMASK was repurposed for GATHER4. 4 components are always
4745 * returned and DMASK works like a swizzle - it selects
4746 * the component to fetch. The only valid DMASK values are
4747 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4748 * (red,red,red,red) etc.) The ISA document doesn't mention
4749 * this.
4750 */
4751
4752 /* Get the component index from src1.x for Gather4. */
4753 if (!tgsi_is_shadow_target(target)) {
4754 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4755 LLVMValueRef comp_imm;
4756 struct tgsi_src_register src1 = inst->Src[1].Register;
4757
4758 assert(src1.File == TGSI_FILE_IMMEDIATE);
4759
4760 comp_imm = imms[src1.Index][src1.SwizzleX];
4761 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4762 gather_comp = CLAMP(gather_comp, 0, 3);
4763 }
4764
4765 dmask = 1 << gather_comp;
4766 }
4767
4768 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4769 samp_ptr, address, count, dmask);
4770 }
4771
4772 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
4773 * incorrectly forces nearest filtering if the texture format is integer.
4774 * The only effect it has on Gather4, which always returns 4 texels for
4775 * bilinear filtering, is that the final coordinates are off by 0.5 of
4776 * the texel size.
4777 *
4778 * The workaround is to subtract 0.5 from the unnormalized coordinates,
4779 * or (0.5 / size) from the normalized coordinates.
4780 */
4781 static void si_lower_gather4_integer(struct si_shader_context *ctx,
4782 struct lp_build_emit_data *emit_data,
4783 const char *intr_name,
4784 unsigned coord_vgpr_index)
4785 {
4786 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4787 LLVMValueRef coord = emit_data->args[0];
4788 LLVMValueRef half_texel[2];
4789 int c;
4790
4791 if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_RECT ||
4792 emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
4793 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
4794 } else {
4795 struct tgsi_full_instruction txq_inst = {};
4796 struct lp_build_emit_data txq_emit_data = {};
4797
4798 /* Query the texture size. */
4799 txq_inst.Texture.Texture = emit_data->inst->Texture.Texture;
4800 txq_emit_data.inst = &txq_inst;
4801 txq_emit_data.dst_type = ctx->v4i32;
4802 set_tex_fetch_args(ctx, &txq_emit_data, TGSI_OPCODE_TXQ,
4803 txq_inst.Texture.Texture,
4804 emit_data->args[1], NULL,
4805 &ctx->radeon_bld.soa.bld_base.uint_bld.zero,
4806 1, 0xf);
4807 txq_emit(NULL, &ctx->radeon_bld.soa.bld_base, &txq_emit_data);
4808
4809 /* Compute -0.5 / size. */
4810 for (c = 0; c < 2; c++) {
4811 half_texel[c] =
4812 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
4813 LLVMConstInt(ctx->i32, c, 0), "");
4814 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
4815 half_texel[c] =
4816 lp_build_emit_llvm_unary(&ctx->radeon_bld.soa.bld_base,
4817 TGSI_OPCODE_RCP, half_texel[c]);
4818 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
4819 LLVMConstReal(ctx->f32, -0.5), "");
4820 }
4821 }
4822
4823 for (c = 0; c < 2; c++) {
4824 LLVMValueRef tmp;
4825 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
4826
4827 tmp = LLVMBuildExtractElement(builder, coord, index, "");
4828 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4829 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
4830 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4831 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
4832 }
4833
4834 emit_data->args[0] = coord;
4835 emit_data->output[emit_data->chan] =
4836 lp_build_intrinsic(builder, intr_name, emit_data->dst_type,
4837 emit_data->args, emit_data->arg_count,
4838 LLVMReadNoneAttribute);
4839 }
4840
4841 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4842 struct lp_build_tgsi_context *bld_base,
4843 struct lp_build_emit_data *emit_data)
4844 {
4845 struct si_shader_context *ctx = si_shader_context(bld_base);
4846 struct lp_build_context *base = &bld_base->base;
4847 const struct tgsi_full_instruction *inst = emit_data->inst;
4848 unsigned opcode = inst->Instruction.Opcode;
4849 unsigned target = inst->Texture.Texture;
4850 char intr_name[127];
4851 bool has_offset = inst->Texture.NumOffsets > 0;
4852 bool is_shadow = tgsi_is_shadow_target(target);
4853 char type[64];
4854 const char *name = "llvm.SI.image.sample";
4855 const char *infix = "";
4856
4857 if (target == TGSI_TEXTURE_BUFFER) {
4858 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4859 base->gallivm->builder,
4860 "llvm.SI.vs.load.input", emit_data->dst_type,
4861 emit_data->args, emit_data->arg_count,
4862 LLVMReadNoneAttribute);
4863 return;
4864 }
4865
4866 switch (opcode) {
4867 case TGSI_OPCODE_TXF:
4868 name = target == TGSI_TEXTURE_2D_MSAA ||
4869 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4870 "llvm.SI.image.load" :
4871 "llvm.SI.image.load.mip";
4872 is_shadow = false;
4873 has_offset = false;
4874 break;
4875 case TGSI_OPCODE_LODQ:
4876 name = "llvm.SI.getlod";
4877 is_shadow = false;
4878 has_offset = false;
4879 break;
4880 case TGSI_OPCODE_TEX:
4881 case TGSI_OPCODE_TEX2:
4882 case TGSI_OPCODE_TXP:
4883 if (ctx->type != PIPE_SHADER_FRAGMENT)
4884 infix = ".lz";
4885 break;
4886 case TGSI_OPCODE_TXB:
4887 case TGSI_OPCODE_TXB2:
4888 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4889 infix = ".b";
4890 break;
4891 case TGSI_OPCODE_TXL:
4892 case TGSI_OPCODE_TXL2:
4893 infix = ".l";
4894 break;
4895 case TGSI_OPCODE_TXD:
4896 infix = ".d";
4897 break;
4898 case TGSI_OPCODE_TG4:
4899 name = "llvm.SI.gather4";
4900 infix = ".lz";
4901 break;
4902 default:
4903 assert(0);
4904 return;
4905 }
4906
4907 /* Add the type and suffixes .c, .o if needed. */
4908 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4909 sprintf(intr_name, "%s%s%s%s.%s",
4910 name, is_shadow ? ".c" : "", infix,
4911 has_offset ? ".o" : "", type);
4912
4913 /* The hardware needs special lowering for Gather4 with integer formats. */
4914 if (opcode == TGSI_OPCODE_TG4) {
4915 struct tgsi_shader_info *info = &ctx->shader->selector->info;
4916 /* This will also work with non-constant indexing because of how
4917 * glsl_to_tgsi works and we intent to preserve that behavior.
4918 */
4919 const unsigned src_idx = 2;
4920 unsigned sampler = inst->Src[src_idx].Register.Index;
4921
4922 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
4923
4924 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
4925 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT) {
4926 /* Texture coordinates start after:
4927 * {offset, bias, z-compare, derivatives}
4928 * Only the offset and z-compare can occur here.
4929 */
4930 si_lower_gather4_integer(ctx, emit_data, intr_name,
4931 (int)has_offset + (int)is_shadow);
4932 return;
4933 }
4934 }
4935
4936 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4937 base->gallivm->builder, intr_name, emit_data->dst_type,
4938 emit_data->args, emit_data->arg_count,
4939 LLVMReadNoneAttribute);
4940 }
4941
4942 static void si_llvm_emit_txqs(
4943 const struct lp_build_tgsi_action *action,
4944 struct lp_build_tgsi_context *bld_base,
4945 struct lp_build_emit_data *emit_data)
4946 {
4947 struct si_shader_context *ctx = si_shader_context(bld_base);
4948 struct gallivm_state *gallivm = bld_base->base.gallivm;
4949 LLVMBuilderRef builder = gallivm->builder;
4950 LLVMValueRef res, samples;
4951 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4952
4953 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4954
4955
4956 /* Read the samples from the descriptor directly. */
4957 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4958 samples = LLVMBuildExtractElement(
4959 builder, res,
4960 lp_build_const_int32(gallivm, 3), "");
4961 samples = LLVMBuildLShr(builder, samples,
4962 lp_build_const_int32(gallivm, 16), "");
4963 samples = LLVMBuildAnd(builder, samples,
4964 lp_build_const_int32(gallivm, 0xf), "");
4965 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4966 samples, "");
4967
4968 emit_data->output[emit_data->chan] = samples;
4969 }
4970
4971 /*
4972 * SI implements derivatives using the local data store (LDS)
4973 * All writes to the LDS happen in all executing threads at
4974 * the same time. TID is the Thread ID for the current
4975 * thread and is a value between 0 and 63, representing
4976 * the thread's position in the wavefront.
4977 *
4978 * For the pixel shader threads are grouped into quads of four pixels.
4979 * The TIDs of the pixels of a quad are:
4980 *
4981 * +------+------+
4982 * |4n + 0|4n + 1|
4983 * +------+------+
4984 * |4n + 2|4n + 3|
4985 * +------+------+
4986 *
4987 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4988 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4989 * the current pixel's column, and masking with 0xfffffffe yields the TID
4990 * of the left pixel of the current pixel's row.
4991 *
4992 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4993 * adding 2 yields the TID of the pixel below the top pixel.
4994 */
4995 /* masks for thread ID. */
4996 #define TID_MASK_TOP_LEFT 0xfffffffc
4997 #define TID_MASK_TOP 0xfffffffd
4998 #define TID_MASK_LEFT 0xfffffffe
4999
/* Emit DDX/DDY/DDX_FINE/DDY_FINE.
 *
 * Each lane writes its value to LDS at its own TID and reads back the
 * values of its quad neighbors (see the TID layout comment above). On
 * Tonga+ with LLVM 3.9, ds_bpermute is used instead of an actual LDS
 * write + read.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* LDS slot for this thread's own value. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Coarse derivatives use the quad's top-left pixel as the base;
	 * the _FINE variants only mask off the axis being differenced. */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel, for DDY the next Y pixel
	 * (TIDs within a quad differ by 1 horizontally, 2 vertically). */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* If an earlier channel used the same source swizzle,
		 * reuse its result instead of recomputing. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* ds_bpermute takes a byte offset, hence TID * 4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Fallback path: exchange values through LDS. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		/* Derivative = neighbor - base. */
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
5089
5090 /*
5091 * this takes an I,J coordinate pair,
5092 * and works out the X and Y derivatives.
5093 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5094 */
5095 static LLVMValueRef si_llvm_emit_ddxy_interp(
5096 struct lp_build_tgsi_context *bld_base,
5097 LLVMValueRef interp_ij)
5098 {
5099 struct si_shader_context *ctx = si_shader_context(bld_base);
5100 struct gallivm_state *gallivm = bld_base->base.gallivm;
5101 LLVMValueRef indices[2];
5102 LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
5103 LLVMValueRef tl, tr, bl, result[4];
5104 unsigned c;
5105
5106 indices[0] = bld_base->uint_bld.zero;
5107 indices[1] = get_thread_id(ctx);
5108 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
5109 indices, 2, "");
5110
5111 temp = LLVMBuildAnd(gallivm->builder, indices[1],
5112 lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
5113
5114 temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
5115 lp_build_const_int32(gallivm, TID_MASK_TOP), "");
5116
5117 indices[1] = temp;
5118 load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
5119 indices, 2, "");
5120
5121 indices[1] = temp2;
5122 load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
5123 indices, 2, "");
5124
5125 indices[1] = LLVMBuildAdd(gallivm->builder, temp,
5126 lp_build_const_int32(gallivm, 1), "");
5127 load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
5128 indices, 2, "");
5129
5130 indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
5131 lp_build_const_int32(gallivm, 2), "");
5132 load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
5133 indices, 2, "");
5134
5135 for (c = 0; c < 2; ++c) {
5136 LLVMValueRef store_val;
5137 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
5138
5139 store_val = LLVMBuildExtractElement(gallivm->builder,
5140 interp_ij, c_ll, "");
5141 LLVMBuildStore(gallivm->builder,
5142 store_val,
5143 store_ptr);
5144
5145 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
5146 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5147
5148 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
5149 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
5150
5151 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
5152
5153 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
5154 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5155
5156 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
5157 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
5158
5159 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
5160 }
5161
5162 return lp_build_gather_values(gallivm, result, 4);
5163 }
5164
5165 static void interp_fetch_args(
5166 struct lp_build_tgsi_context *bld_base,
5167 struct lp_build_emit_data *emit_data)
5168 {
5169 struct si_shader_context *ctx = si_shader_context(bld_base);
5170 struct gallivm_state *gallivm = bld_base->base.gallivm;
5171 const struct tgsi_full_instruction *inst = emit_data->inst;
5172
5173 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5174 /* offset is in second src, first two channels */
5175 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5176 emit_data->inst, 1,
5177 TGSI_CHAN_X);
5178 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5179 emit_data->inst, 1,
5180 TGSI_CHAN_Y);
5181 emit_data->arg_count = 2;
5182 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5183 LLVMValueRef sample_position;
5184 LLVMValueRef sample_id;
5185 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5186
5187 /* fetch sample ID, then fetch its sample position,
5188 * and place into first two channels.
5189 */
5190 sample_id = lp_build_emit_fetch(bld_base,
5191 emit_data->inst, 1, TGSI_CHAN_X);
5192 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5193 ctx->i32, "");
5194 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5195
5196 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5197 sample_position,
5198 lp_build_const_int32(gallivm, 0), "");
5199
5200 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5201 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5202 sample_position,
5203 lp_build_const_int32(gallivm, 1), "");
5204 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5205 emit_data->arg_count = 2;
5206 }
5207 }
5208
/* Lower INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET to
 * llvm.SI.fs.interp (or llvm.SI.fs.constant for flat inputs).
 *
 * For OFFSET/SAMPLE the barycentric (I, J) pair is recomputed from the
 * center interpolants plus their screen-space derivatives and the
 * requested offset.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* OFFSET and SAMPLE start from the center interpolants; plain
	 * INTERP_CENTROID uses the centroid ones. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* idx == 0 means a flat/constant input (no interp param needed);
	 * -1 means the mode/location pair is unsupported. */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = get_interp_param(ctx, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	/* NOTE(review): only output channels 0-1 are produced here —
	 * confirm that INTERP_* destinations never write .z/.w. */
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		/* fs.constant takes only 3 args (no interp param). */
		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute);
	}
}
5307
5308 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5309 struct lp_build_emit_data *emit_data)
5310 {
5311 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5312 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5313 unsigned stream;
5314
5315 assert(src0.File == TGSI_FILE_IMMEDIATE);
5316
5317 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5318 return stream;
5319 }
5320
/* Emit one vertex from the geometry shader.
 *
 * Stores all declared outputs to the GSVS ring buffer for the
 * instruction's stream, bumps the per-stream vertex counter, and sends
 * the EMIT message to the SPI.
 */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	/* AMDGPU.kill discards the thread when its argument is negative. */
	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Ring layout: each output component occupies a
			 * gs_max_out_vertices-sized run, indexed by the
			 * current vertex; offsets are in dwords * 4. */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	/* Advance and persist the per-stream vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, 0);
}
5398
5399 /* Cut one primitive from the geometry shader */
5400 static void si_llvm_emit_primitive(
5401 const struct lp_build_tgsi_action *action,
5402 struct lp_build_tgsi_context *bld_base,
5403 struct lp_build_emit_data *emit_data)
5404 {
5405 struct si_shader_context *ctx = si_shader_context(bld_base);
5406 struct gallivm_state *gallivm = bld_base->base.gallivm;
5407 LLVMValueRef args[2];
5408 unsigned stream;
5409
5410 /* Signal primitive cut */
5411 stream = si_llvm_get_stream(bld_base, emit_data);
5412 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5413 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5414 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5415 ctx->voidt, args, 2, 0);
5416 }
5417
5418 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5419 struct lp_build_tgsi_context *bld_base,
5420 struct lp_build_emit_data *emit_data)
5421 {
5422 struct si_shader_context *ctx = si_shader_context(bld_base);
5423 struct gallivm_state *gallivm = bld_base->base.gallivm;
5424
5425 /* The real barrier instruction isn’t needed, because an entire patch
5426 * always fits into a single wave.
5427 */
5428 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5429 emit_optimization_barrier(ctx);
5430 return;
5431 }
5432
5433 lp_build_intrinsic(gallivm->builder,
5434 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5435 : "llvm.AMDGPU.barrier.local",
5436 ctx->voidt, NULL, 0, 0);
5437 }
5438
/* Dispatch entry for texture opcodes: arguments are gathered by
 * tex_fetch_args and lowered by build_tex_intrinsic. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5443
/* Dispatch entry for INTERP_* opcodes: arguments are gathered by
 * interp_fetch_args and lowered by build_interp_intrinsic. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5448
/* Create the shader's main LLVM function and annotate its parameters.
 *
 * Parameters [0, last_sgpr] are the scalar (SGPR) arguments: pointer
 * parameters get ByVal + dereferenceable attributes, everything else
 * gets InReg. The return value starts as undef; callers populate
 * ctx->return_value afterwards.
 */
static void si_create_function(struct si_shader_context *ctx,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_sgpr)
{
	int i;

	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
				params, num_params);
	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			LLVMAddAttribute(P, LLVMByValAttribute);
			lp_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			LLVMAddAttribute(P, LLVMInRegAttribute);
	}

	/* Opt-in fast-math function attributes for debugging. */
	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5494
5495 static void create_meta_data(struct si_shader_context *ctx)
5496 {
5497 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5498
5499 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5500 "invariant.load", 14);
5501 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5502 "range", 5);
5503 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5504 "amdgpu.uniform", 14);
5505
5506 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5507 }
5508
5509 static void declare_streamout_params(struct si_shader_context *ctx,
5510 struct pipe_stream_output_info *so,
5511 LLVMTypeRef *params, LLVMTypeRef i32,
5512 unsigned *num_params)
5513 {
5514 int i;
5515
5516 /* Streamout SGPRs. */
5517 if (so->num_outputs) {
5518 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5519 params[ctx->param_streamout_config = (*num_params)++] = i32;
5520 else
5521 ctx->param_streamout_config = ctx->param_tess_offchip;
5522
5523 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5524 }
5525 /* A streamout buffer offset is loaded if the stride is non-zero. */
5526 for (i = 0; i < 4; i++) {
5527 if (!so->stride[i])
5528 continue;
5529
5530 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5531 }
5532 }
5533
5534 static unsigned llvm_get_type_size(LLVMTypeRef type)
5535 {
5536 LLVMTypeKind kind = LLVMGetTypeKind(type);
5537
5538 switch (kind) {
5539 case LLVMIntegerTypeKind:
5540 return LLVMGetIntTypeWidth(type) / 8;
5541 case LLVMFloatTypeKind:
5542 return 4;
5543 case LLVMPointerTypeKind:
5544 return 8;
5545 case LLVMVectorTypeKind:
5546 return LLVMGetVectorSize(type) *
5547 llvm_get_type_size(LLVMGetElementType(type));
5548 default:
5549 assert(0);
5550 return 0;
5551 }
5552 }
5553
5554 static void declare_tess_lds(struct si_shader_context *ctx)
5555 {
5556 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5557 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5558 struct lp_build_context *uint = &bld_base->uint_bld;
5559
5560 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5561 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, uint->zero,
5562 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5563 "tess_lds");
5564 }
5565
/**
 * Declare the inputs (SGPRs first, then VGPRs) and — for non-monolithic
 * shader parts — the return values of the main LLVM function for the
 * current shader stage, then create the function.
 *
 * The slot order in params[] defines the hardware input register layout,
 * so statement order here is significant.  "last_sgpr" is the index of
 * the last scalar (uniform) argument; everything after it is a VGPR.
 */
static void create_function(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
	LLVMTypeRef returns[16+32*4];
	unsigned i, last_sgpr, num_params, num_return_sgprs;
	unsigned num_returns = 0;

	v3i32 = LLVMVectorType(ctx->i32, 3);

	/* Descriptor arrays common to all stages. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
	params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
		params[SI_PARAM_BASE_VERTEX] = ctx->i32;
		params[SI_PARAM_START_INSTANCE] = ctx->i32;
		params[SI_PARAM_DRAWID] = ctx->i32;
		num_params = SI_PARAM_DRAWID+1;

		/* The trailing SGPRs depend on which hardware stage the VS
		 * runs as (ES, LS, or a real VS). */
		if (shader->key.vs.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else if (shader->key.vs.as_ls) {
			params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
		} else {
			if (ctx->is_gs_copy_shader) {
				num_params = SI_PARAM_RW_BUFFERS+1;
			} else {
				params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
				num_params = SI_PARAM_VS_STATE_BITS+1;
			}

			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}

		last_sgpr = num_params-1;

		/* VGPRs */
		params[ctx->param_vertex_id = num_params++] = ctx->i32;
		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_instance_id = num_params++] = ctx->i32;

		if (!ctx->is_monolithic &&
		    !ctx->is_gs_copy_shader) {
			/* Vertex load indices (supplied by the prolog, one
			 * per shader input). */
			ctx->param_vertex_index0 = num_params;

			for (i = 0; i < shader->selector->info.num_inputs; i++)
				params[num_params++] = ctx->i32;

			/* PrimitiveID output. */
			if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
					returns[num_returns++] = ctx->f32;
		}
		break;

	case PIPE_SHADER_TESS_CTRL:
		params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
		params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
		params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;

		/* VGPRs */
		params[SI_PARAM_PATCH_ID] = ctx->i32;
		params[SI_PARAM_REL_IDS] = ctx->i32;
		num_params = SI_PARAM_REL_IDS+1;

		if (!ctx->is_monolithic) {
			/* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
			 * placed after the user SGPRs.
			 */
			for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */

			for (i = 0; i < 3; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
		num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;

		/* SGPR layout differs between TES-as-ES and TES-as-VS
		 * (as-VS interleaves the streamout SGPRs). */
		if (shader->key.tes.as_es) {
			params[ctx->param_oc_lds = num_params++] = ctx->i32;
			params[ctx->param_tess_offchip = num_params++] = ctx->i32;
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else {
			params[ctx->param_tess_offchip = num_params++] = ctx->i32;
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
			params[ctx->param_oc_lds = num_params++] = ctx->i32;
		}
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_tes_u = num_params++] = ctx->f32;
		params[ctx->param_tes_v = num_params++] = ctx->f32;
		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;

		/* PrimitiveID output. */
		if (!ctx->is_monolithic && !shader->key.tes.as_es)
			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
				returns[num_returns++] = ctx->f32;
		break;

	case PIPE_SHADER_GEOMETRY:
		params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
		last_sgpr = SI_PARAM_GS_WAVE_ID;

		/* VGPRs */
		params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
		params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
		params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
		num_params = SI_PARAM_GS_INSTANCE_ID+1;
		break;

	case PIPE_SHADER_FRAGMENT:
		params[SI_PARAM_ALPHA_REF] = ctx->f32;
		params[SI_PARAM_PRIM_MASK] = ctx->i32;
		last_sgpr = SI_PARAM_PRIM_MASK;
		/* Fixed hardware PS VGPR layout: barycentrics, position,
		 * front face, ancillary, coverage, fixed-point position. */
		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
		params[SI_PARAM_FRONT_FACE] = ctx->i32;
		params[SI_PARAM_ANCILLARY] = ctx->i32;
		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
		num_params = SI_PARAM_POS_FIXED_PT+1;

		if (!ctx->is_monolithic) {
			/* Color inputs from the prolog. */
			if (shader->selector->info.colors_read) {
				unsigned num_color_elements =
					util_bitcount(shader->selector->info.colors_read);

				assert(num_params + num_color_elements <= ARRAY_SIZE(params));
				for (i = 0; i < num_color_elements; i++)
					params[num_params++] = ctx->f32;
			}

			/* Outputs for the epilog. */
			num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
			num_returns =
				num_return_sgprs +
				util_bitcount(shader->selector->info.colors_written) * 4 +
				shader->selector->info.writes_z +
				shader->selector->info.writes_stencil +
				shader->selector->info.writes_samplemask +
				1 /* SampleMaskIn */;

			/* The epilog expects SampleMask at a fixed minimum
			 * location; make sure enough return slots exist. */
			num_returns = MAX2(num_returns,
					   num_return_sgprs +
					   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

			for (i = 0; i < num_return_sgprs; i++)
				returns[i] = ctx->i32;
			for (; i < num_returns; i++)
				returns[i] = ctx->f32;
		}
		break;

	case PIPE_SHADER_COMPUTE:
		params[SI_PARAM_GRID_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_ID] = v3i32;
		last_sgpr = SI_PARAM_BLOCK_ID;

		params[SI_PARAM_THREAD_ID] = v3i32;
		num_params = SI_PARAM_THREAD_ID + 1;
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	assert(num_params <= ARRAY_SIZE(params));

	si_create_function(ctx, returns, num_returns, params,
			   num_params, last_sgpr);

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT &&
	    !ctx->is_monolithic) {
		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "InitialPSInputAddr",
					  S_0286D0_PERSP_SAMPLE_ENA(1) |
					  S_0286D0_PERSP_CENTER_ENA(1) |
					  S_0286D0_PERSP_CENTROID_ENA(1) |
					  S_0286D0_LINEAR_SAMPLE_ENA(1) |
					  S_0286D0_LINEAR_CENTER_ENA(1) |
					  S_0286D0_LINEAR_CENTROID_ENA(1) |
					  S_0286D0_FRONT_FACE_ENA(1) |
					  S_0286D0_POS_FIXED_PT_ENA(1));
	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
		/* Tell LLVM the fixed thread-group size so it can size
		 * register allocation accordingly. */
		const unsigned *properties = shader->selector->info.properties;
		unsigned max_work_group_size =
		               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
		               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
		               properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

		assert(max_work_group_size);

		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
		                          "amdgpu-max-work-group-size",
		                          max_work_group_size);
	}

	/* Count input registers; one 32-bit register per 4 bytes of type. */
	shader->info.num_input_sgprs = 0;
	shader->info.num_input_vgprs = 0;

	for (i = 0; i <= last_sgpr; ++i)
		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;

	/* Unused fragment shader inputs are eliminated by the compiler,
	 * so we don't know yet how many there will be.
	 */
	if (ctx->type != PIPE_SHADER_FRAGMENT)
		for (; i < num_params; ++i)
			shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;

	/* Allocate the LDS scratch area used by the lowering of derivative
	 * and interpolate-at opcodes, if any are present. */
	if (bld_base->info &&
	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
		ctx->lds =
			LLVMAddGlobalInAddressSpace(gallivm->module,
						    LLVMArrayType(ctx->i32, 64),
						    "ddxy_lds",
						    LOCAL_ADDR_SPACE);

	if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
	    ctx->type == PIPE_SHADER_TESS_CTRL ||
	    ctx->type == PIPE_SHADER_TESS_EVAL)
		declare_tess_lds(ctx);
}
5834
5835 static void preload_constants(struct si_shader_context *ctx)
5836 {
5837 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5838 struct gallivm_state *gallivm = bld_base->base.gallivm;
5839 const struct tgsi_shader_info *info = bld_base->info;
5840 unsigned buf;
5841 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5842
5843 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5844 if (info->const_file_max[buf] == -1)
5845 continue;
5846
5847 /* Load the resource descriptor */
5848 ctx->const_buffers[buf] =
5849 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5850 }
5851 }
5852
5853 static void preload_shader_buffers(struct si_shader_context *ctx)
5854 {
5855 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5856 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5857 int buf, maxbuf;
5858
5859 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5860 SI_NUM_SHADER_BUFFERS - 1);
5861 for (buf = 0; buf <= maxbuf; ++buf) {
5862 ctx->shader_buffers[buf] =
5863 build_indexed_load_const(
5864 ctx, ptr, lp_build_const_int32(gallivm, buf));
5865 }
5866 }
5867
5868 static void preload_samplers(struct si_shader_context *ctx)
5869 {
5870 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5871 struct gallivm_state *gallivm = bld_base->base.gallivm;
5872 const struct tgsi_shader_info *info = bld_base->info;
5873 unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
5874 LLVMValueRef offset;
5875
5876 if (num_samplers == 0)
5877 return;
5878
5879 /* Load the resources and samplers, we rely on the code sinking to do the rest */
5880 for (i = 0; i < num_samplers; ++i) {
5881 /* Resource */
5882 offset = lp_build_const_int32(gallivm, i);
5883 ctx->sampler_views[i] =
5884 get_sampler_desc(ctx, offset, DESC_IMAGE);
5885
5886 /* FMASK resource */
5887 if (info->is_msaa_sampler[i])
5888 ctx->fmasks[i] =
5889 get_sampler_desc(ctx, offset, DESC_FMASK);
5890 else {
5891 ctx->sampler_states[i] =
5892 get_sampler_desc(ctx, offset, DESC_SAMPLER);
5893 ctx->sampler_states[i] =
5894 sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
5895 ctx->sampler_states[i]);
5896 }
5897 }
5898 }
5899
5900 static void preload_images(struct si_shader_context *ctx)
5901 {
5902 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5903 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5904 struct gallivm_state *gallivm = bld_base->base.gallivm;
5905 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5906 LLVMValueRef res_ptr;
5907 unsigned i;
5908
5909 if (num_images == 0)
5910 return;
5911
5912 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5913
5914 for (i = 0; i < num_images; ++i) {
5915 /* Rely on LLVM to shrink the load for buffer resources. */
5916 LLVMValueRef rsrc =
5917 build_indexed_load_const(ctx, res_ptr,
5918 lp_build_const_int32(gallivm, i));
5919
5920 if (info->images_writemask & (1 << i) &&
5921 !(info->images_buffers & (1 << i)))
5922 rsrc = force_dcc_off(ctx, rsrc);
5923
5924 ctx->images[i] = rsrc;
5925 }
5926 }
5927
5928 static void preload_streamout_buffers(struct si_shader_context *ctx)
5929 {
5930 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5931 struct gallivm_state *gallivm = bld_base->base.gallivm;
5932 unsigned i;
5933
5934 /* Streamout can only be used if the shader is compiled as VS. */
5935 if (!ctx->shader->selector->so.num_outputs ||
5936 (ctx->type == PIPE_SHADER_VERTEX &&
5937 (ctx->shader->key.vs.as_es ||
5938 ctx->shader->key.vs.as_ls)) ||
5939 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5940 ctx->shader->key.tes.as_es))
5941 return;
5942
5943 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5944 SI_PARAM_RW_BUFFERS);
5945
5946 /* Load the resources, we rely on the code sinking to do the rest */
5947 for (i = 0; i < 4; ++i) {
5948 if (ctx->shader->selector->so.stride[i]) {
5949 LLVMValueRef offset = lp_build_const_int32(gallivm,
5950 SI_VS_STREAMOUT_BUF0 + i);
5951
5952 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5953 }
5954 }
5955 }
5956
5957 /**
5958 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5959 * for later use.
5960 */
5961 static void preload_ring_buffers(struct si_shader_context *ctx)
5962 {
5963 struct gallivm_state *gallivm =
5964 ctx->radeon_bld.soa.bld_base.base.gallivm;
5965
5966 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5967 SI_PARAM_RW_BUFFERS);
5968
5969 if ((ctx->type == PIPE_SHADER_VERTEX &&
5970 ctx->shader->key.vs.as_es) ||
5971 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5972 ctx->shader->key.tes.as_es) ||
5973 ctx->type == PIPE_SHADER_GEOMETRY) {
5974 unsigned ring =
5975 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5976 : SI_ES_RING_ESGS;
5977 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5978
5979 ctx->esgs_ring =
5980 build_indexed_load_const(ctx, buf_ptr, offset);
5981 }
5982
5983 if (ctx->is_gs_copy_shader) {
5984 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5985
5986 ctx->gsvs_ring[0] =
5987 build_indexed_load_const(ctx, buf_ptr, offset);
5988 }
5989 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5990 int i;
5991 for (i = 0; i < 4; i++) {
5992 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5993
5994 ctx->gsvs_ring[i] =
5995 build_indexed_load_const(ctx, buf_ptr, offset);
5996 }
5997 }
5998 }
5999
6000 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
6001 LLVMValueRef param_rw_buffers,
6002 unsigned param_pos_fixed_pt)
6003 {
6004 struct lp_build_tgsi_context *bld_base =
6005 &ctx->radeon_bld.soa.bld_base;
6006 struct gallivm_state *gallivm = bld_base->base.gallivm;
6007 LLVMBuilderRef builder = gallivm->builder;
6008 LLVMValueRef slot, desc, offset, row, bit, address[2];
6009
6010 /* Use the fixed-point gl_FragCoord input.
6011 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
6012 * per coordinate to get the repeating effect.
6013 */
6014 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
6015 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
6016
6017 /* Load the buffer descriptor. */
6018 slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
6019 desc = build_indexed_load_const(ctx, param_rw_buffers, slot);
6020
6021 /* The stipple pattern is 32x32, each row has 32 bits. */
6022 offset = LLVMBuildMul(builder, address[1],
6023 LLVMConstInt(ctx->i32, 4, 0), "");
6024 row = buffer_load_const(ctx, desc, offset);
6025 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
6026 bit = LLVMBuildLShr(builder, row, address[0], "");
6027 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
6028
6029 /* The intrinsic kills the thread if arg < 0. */
6030 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
6031 LLVMConstReal(ctx->f32, -1), "");
6032 lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
6033 }
6034
/**
 * Parse the register/value pairs LLVM emits into the binary's config
 * section and fill \p conf (SGPR/VGPR counts, float mode, LDS and
 * scratch sizes, SPI PS input masks, spill statistics).
 *
 * \param symbol_offset  used to locate the per-symbol config data within
 *                       the binary.
 */
void si_shader_binary_read_config(struct radeon_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		radeon_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct radeon_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	/* Each config entry is an 8-byte (register, value) pair. */
	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* The G_00B028_* field layout is used for all four
			 * RSRC1 registers here. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode =  G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn once per process about registers this parser
			 * doesn't know, then ignore them. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
			break;
		}
	}

	/* Fall back to INPUT_ENA if LLVM didn't emit INPUT_ADDR. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
6117
6118 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6119 struct si_shader *shader,
6120 struct si_shader_config *config,
6121 uint64_t scratch_va)
6122 {
6123 unsigned i;
6124 uint32_t scratch_rsrc_dword0 = scratch_va;
6125 uint32_t scratch_rsrc_dword1 =
6126 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6127
6128 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6129 * correctly.
6130 */
6131 if (HAVE_LLVM >= 0x0309)
6132 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6133 else
6134 scratch_rsrc_dword1 |=
6135 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6136
6137 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6138 const struct radeon_shader_reloc *reloc =
6139 &shader->binary.relocs[i];
6140 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6141 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6142 &scratch_rsrc_dword0, 4);
6143 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6144 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6145 &scratch_rsrc_dword1, 4);
6146 }
6147 }
6148 }
6149
6150 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6151 {
6152 unsigned size = shader->binary.code_size;
6153
6154 if (shader->prolog)
6155 size += shader->prolog->binary.code_size;
6156 if (shader->epilog)
6157 size += shader->epilog->binary.code_size;
6158 return size;
6159 }
6160
6161 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6162 {
6163 const struct radeon_shader_binary *prolog =
6164 shader->prolog ? &shader->prolog->binary : NULL;
6165 const struct radeon_shader_binary *epilog =
6166 shader->epilog ? &shader->epilog->binary : NULL;
6167 const struct radeon_shader_binary *mainb = &shader->binary;
6168 unsigned bo_size = si_get_shader_binary_size(shader) +
6169 (!epilog ? mainb->rodata_size : 0);
6170 unsigned char *ptr;
6171
6172 assert(!prolog || !prolog->rodata_size);
6173 assert((!prolog && !epilog) || !mainb->rodata_size);
6174 assert(!epilog || !epilog->rodata_size);
6175
6176 r600_resource_reference(&shader->bo, NULL);
6177 shader->bo = si_resource_create_custom(&sscreen->b.b,
6178 PIPE_USAGE_IMMUTABLE,
6179 bo_size);
6180 if (!shader->bo)
6181 return -ENOMEM;
6182
6183 /* Upload. */
6184 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6185 PIPE_TRANSFER_READ_WRITE);
6186
6187 if (prolog) {
6188 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6189 ptr += prolog->code_size;
6190 }
6191
6192 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6193 ptr += mainb->code_size;
6194
6195 if (epilog)
6196 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6197 else if (mainb->rodata_size > 0)
6198 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6199
6200 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6201 return 0;
6202 }
6203
6204 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
6205 struct pipe_debug_callback *debug,
6206 const char *name, FILE *file)
6207 {
6208 char *line, *p;
6209 unsigned i, count;
6210
6211 if (binary->disasm_string) {
6212 fprintf(file, "Shader %s disassembly:\n", name);
6213 fprintf(file, "%s", binary->disasm_string);
6214
6215 if (debug && debug->debug_message) {
6216 /* Very long debug messages are cut off, so send the
6217 * disassembly one line at a time. This causes more
6218 * overhead, but on the plus side it simplifies
6219 * parsing of resulting logs.
6220 */
6221 pipe_debug_message(debug, SHADER_INFO,
6222 "Shader Disassembly Begin");
6223
6224 line = binary->disasm_string;
6225 while (*line) {
6226 p = util_strchrnul(line, '\n');
6227 count = p - line;
6228
6229 if (count) {
6230 pipe_debug_message(debug, SHADER_INFO,
6231 "%.*s", count, line);
6232 }
6233
6234 if (!*p)
6235 break;
6236 line = p + 1;
6237 }
6238
6239 pipe_debug_message(debug, SHADER_INFO,
6240 "Shader Disassembly End");
6241 }
6242 } else {
6243 fprintf(file, "Shader %s binary:\n", name);
6244 for (i = 0; i < binary->code_size; i += 4) {
6245 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6246 binary->code[i + 3], binary->code[i + 2],
6247 binary->code[i + 1], binary->code[i]);
6248 }
6249 }
6250 }
6251
/**
 * Print shader statistics (register usage, code size, LDS/scratch use,
 * and an estimated max waves per SIMD) to \p file and emit a one-line
 * summary via the debug callback.
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
			         struct si_shader_config *conf,
				 unsigned num_inputs,
				 unsigned code_size,
			         struct pipe_debug_callback *debug,
			         unsigned processor,
				 FILE *file)
{
	/* conf->lds_size is in units of this allocation granularity
	 * (bytes); larger on CIK+. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	/* 10 waves per SIMD is the hardware upper bound used here. */
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	if (processor == PIPE_SHADER_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	/* Human-readable dump, unless writing to stderr without the
	 * corresponding debug flag. */
	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
		        "Spilled SGPRs: %d\n"
			"Spilled VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n\n\n",
			conf->num_sgprs, conf->num_vgprs,
			conf->spilled_sgprs, conf->spilled_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	/* Machine-parseable summary for debug consumers. */
	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
			   "Spilled VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, conf->spilled_sgprs,
			   conf->spilled_vgprs);
}
6331
6332 static const char *si_get_shader_name(struct si_shader *shader,
6333 unsigned processor)
6334 {
6335 switch (processor) {
6336 case PIPE_SHADER_VERTEX:
6337 if (shader->key.vs.as_es)
6338 return "Vertex Shader as ES";
6339 else if (shader->key.vs.as_ls)
6340 return "Vertex Shader as LS";
6341 else
6342 return "Vertex Shader as VS";
6343 case PIPE_SHADER_TESS_CTRL:
6344 return "Tessellation Control Shader";
6345 case PIPE_SHADER_TESS_EVAL:
6346 if (shader->key.tes.as_es)
6347 return "Tessellation Evaluation Shader as ES";
6348 else
6349 return "Tessellation Evaluation Shader as VS";
6350 case PIPE_SHADER_GEOMETRY:
6351 if (shader->gs_copy_shader == NULL)
6352 return "GS Copy Shader as VS";
6353 else
6354 return "Geometry Shader";
6355 case PIPE_SHADER_FRAGMENT:
6356 return "Pixel Shader";
6357 case PIPE_SHADER_COMPUTE:
6358 return "Compute Shader";
6359 default:
6360 return "Unknown Shader";
6361 }
6362 }
6363
6364 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6365 struct pipe_debug_callback *debug, unsigned processor,
6366 FILE *file)
6367 {
6368 if (file != stderr ||
6369 r600_can_dump_shader(&sscreen->b, processor))
6370 si_dump_shader_key(processor, &shader->key, file);
6371
6372 if (file != stderr && shader->binary.llvm_ir_string) {
6373 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6374 si_get_shader_name(shader, processor));
6375 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6376 }
6377
6378 if (file != stderr ||
6379 (r600_can_dump_shader(&sscreen->b, processor) &&
6380 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6381 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6382
6383 if (shader->prolog)
6384 si_shader_dump_disassembly(&shader->prolog->binary,
6385 debug, "prolog", file);
6386
6387 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6388
6389 if (shader->epilog)
6390 si_shader_dump_disassembly(&shader->epilog->binary,
6391 debug, "epilog", file);
6392 fprintf(file, "\n");
6393 }
6394
6395 si_shader_dump_stats(sscreen, &shader->config,
6396 shader->selector ? shader->selector->info.num_inputs : 0,
6397 si_get_shader_binary_size(shader), debug, processor,
6398 file);
6399 }
6400
/**
 * Compile an LLVM module to machine code and read back the shader
 * configuration.
 *
 * Optionally dumps/records the IR, allows replacing the binary from a
 * file for debugging (si_replace_shader), and rejects rodata in shader
 * parts whose binaries may be concatenated.
 *
 * \return 0 on success, non-zero on compile failure or rodata misuse.
 */
int si_compile_llvm(struct si_screen *sscreen,
		    struct radeon_shader_binary *binary,
		    struct si_shader_config *conf,
		    LLVMTargetMachineRef tm,
		    LLVMModuleRef mod,
		    struct pipe_debug_callback *debug,
		    unsigned processor,
		    const char *name)
{
	int r = 0;
	unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);

	if (r600_can_dump_shader(&sscreen->b, processor)) {
		fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

		if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
			fprintf(stderr, "%s LLVM IR:\n\n", name);
			LLVMDumpModule(mod);
			fprintf(stderr, "\n");
		}
	}

	if (sscreen->record_llvm_ir) {
		/* Keep our own copy; the LLVM string is freed below. */
		char *ir = LLVMPrintModuleToString(mod);
		binary->llvm_ir_string = strdup(ir);
		LLVMDisposeMessage(ir);
	}

	/* si_replace_shader may substitute a binary loaded from disk;
	 * only compile when it didn't. */
	if (!si_replace_shader(count, binary)) {
		r = radeon_llvm_compile(mod, binary, tm, debug);
		if (r)
			return r;
	}

	si_shader_binary_read_config(binary, conf, 0);

	/* Enable 64-bit and 16-bit denormals, because there is no performance
	 * cost.
	 *
	 * If denormals are enabled, all floating-point output modifiers are
	 * ignored.
	 *
	 * Don't enable denormals for 32-bit floats, because:
	 * - Floating-point output modifiers would be ignored by the hw.
	 * - Some opcodes don't support denormals, such as v_mad_f32. We would
	 *   have to stop using those.
	 * - SI & CI would be very slow.
	 */
	conf->float_mode |= V_00B028_FP_64_DENORMS;

	/* The parsed config data is no longer needed. */
	FREE(binary->config);
	FREE(binary->global_symbol_offsets);
	binary->config = NULL;
	binary->global_symbol_offsets = NULL;

	/* Some shaders can't have rodata because their binaries can be
	 * concatenated.
	 */
	if (binary->rodata_size &&
	    (processor == PIPE_SHADER_VERTEX ||
	     processor == PIPE_SHADER_TESS_CTRL ||
	     processor == PIPE_SHADER_TESS_EVAL ||
	     processor == PIPE_SHADER_FRAGMENT)) {
		fprintf(stderr, "radeonsi: The shader can't have rodata.");
		return -EINVAL;
	}

	return r;
}
6470
6471 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6472 {
6473 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6474 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6475 else
6476 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6477 }
6478
6479 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6480 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6481 struct si_shader_context *ctx,
6482 struct si_shader *gs,
6483 struct pipe_debug_callback *debug)
6484 {
6485 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6486 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6487 struct lp_build_context *uint = &bld_base->uint_bld;
6488 struct si_shader_output_values *outputs;
6489 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6490 LLVMValueRef args[9];
6491 int i, r;
6492
6493 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6494
6495 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6496 ctx->type = PIPE_SHADER_VERTEX;
6497 ctx->is_gs_copy_shader = true;
6498
6499 create_meta_data(ctx);
6500 create_function(ctx);
6501 preload_streamout_buffers(ctx);
6502 preload_ring_buffers(ctx);
6503
6504 args[0] = ctx->gsvs_ring[0];
6505 args[1] = lp_build_mul_imm(uint,
6506 LLVMGetParam(ctx->radeon_bld.main_fn,
6507 ctx->param_vertex_id),
6508 4);
6509 args[3] = uint->zero;
6510 args[4] = uint->one; /* OFFEN */
6511 args[5] = uint->zero; /* IDXEN */
6512 args[6] = uint->one; /* GLC */
6513 args[7] = uint->one; /* SLC */
6514 args[8] = uint->zero; /* TFE */
6515
6516 /* Fetch vertex data from GSVS ring */
6517 for (i = 0; i < gsinfo->num_outputs; ++i) {
6518 unsigned chan;
6519
6520 outputs[i].name = gsinfo->output_semantic_name[i];
6521 outputs[i].sid = gsinfo->output_semantic_index[i];
6522
6523 for (chan = 0; chan < 4; chan++) {
6524 args[2] = lp_build_const_int32(gallivm,
6525 (i * 4 + chan) *
6526 gs->selector->gs_max_out_vertices * 16 * 4);
6527
6528 outputs[i].values[chan] =
6529 LLVMBuildBitCast(gallivm->builder,
6530 lp_build_intrinsic(gallivm->builder,
6531 "llvm.SI.buffer.load.dword.i32.i32",
6532 ctx->i32, args, 9,
6533 LLVMReadOnlyAttribute),
6534 ctx->f32, "");
6535 }
6536 }
6537
6538 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6539
6540 LLVMBuildRetVoid(gallivm->builder);
6541
6542 /* Dump LLVM IR before any optimization passes */
6543 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6544 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6545 LLVMDumpModule(bld_base->base.gallivm->module);
6546
6547 radeon_llvm_finalize_module(&ctx->radeon_bld);
6548
6549 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6550 &ctx->shader->config, ctx->tm,
6551 bld_base->base.gallivm->module,
6552 debug, PIPE_SHADER_GEOMETRY,
6553 "GS Copy Shader");
6554 if (!r) {
6555 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6556 fprintf(stderr, "GS Copy Shader:\n");
6557 si_shader_dump(sscreen, ctx->shader, debug,
6558 PIPE_SHADER_GEOMETRY, stderr);
6559 r = si_shader_binary_upload(sscreen, ctx->shader);
6560 }
6561
6562 radeon_llvm_dispose(&ctx->radeon_bld);
6563
6564 FREE(outputs);
6565 return r;
6566 }
6567
6568 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
6569 FILE *f)
6570 {
6571 int i;
6572
6573 fprintf(f, "SHADER KEY\n");
6574
6575 switch (shader) {
6576 case PIPE_SHADER_VERTEX:
6577 fprintf(f, " instance_divisors = {");
6578 for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
6579 fprintf(f, !i ? "%u" : ", %u",
6580 key->vs.prolog.instance_divisors[i]);
6581 fprintf(f, "}\n");
6582 fprintf(f, " as_es = %u\n", key->vs.as_es);
6583 fprintf(f, " as_ls = %u\n", key->vs.as_ls);
6584 fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
6585 break;
6586
6587 case PIPE_SHADER_TESS_CTRL:
6588 fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
6589 break;
6590
6591 case PIPE_SHADER_TESS_EVAL:
6592 fprintf(f, " as_es = %u\n", key->tes.as_es);
6593 fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
6594 break;
6595
6596 case PIPE_SHADER_GEOMETRY:
6597 case PIPE_SHADER_COMPUTE:
6598 break;
6599
6600 case PIPE_SHADER_FRAGMENT:
6601 fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
6602 fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
6603 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
6604 fprintf(f, " prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
6605 fprintf(f, " prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
6606 fprintf(f, " prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
6607 fprintf(f, " prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
6608 fprintf(f, " prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
6609 fprintf(f, " prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
6610 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
6611 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
6612 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
6613 fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
6614 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
6615 fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
6616 fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
6617 break;
6618
6619 default:
6620 assert(0);
6621 }
6622 }
6623
6624 static void si_init_shader_ctx(struct si_shader_context *ctx,
6625 struct si_screen *sscreen,
6626 struct si_shader *shader,
6627 LLVMTargetMachineRef tm)
6628 {
6629 struct lp_build_tgsi_context *bld_base;
6630 struct lp_build_tgsi_action tmpl = {};
6631
6632 memset(ctx, 0, sizeof(*ctx));
6633 radeon_llvm_context_init(
6634 &ctx->radeon_bld, "amdgcn--",
6635 (shader && shader->selector) ? &shader->selector->info : NULL,
6636 (shader && shader->selector) ? shader->selector->tokens : NULL);
6637 ctx->tm = tm;
6638 ctx->screen = sscreen;
6639 if (shader && shader->selector)
6640 ctx->type = shader->selector->info.processor;
6641 else
6642 ctx->type = -1;
6643 ctx->shader = shader;
6644
6645 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6646 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6647 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6648 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6649 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6650 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6651 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6652 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6653 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6654 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6655 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6656 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6657
6658 bld_base = &ctx->radeon_bld.soa.bld_base;
6659 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6660
6661 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6662 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6663 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6664
6665 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6666 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6667 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6668 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6669 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6670 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6671 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6672 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6673 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6674 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6675 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6676 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6677 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6678 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6679
6680 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6681 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6682 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6683 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6684 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6685 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6686
6687 tmpl.fetch_args = atomic_fetch_args;
6688 tmpl.emit = atomic_emit;
6689 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6690 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6691 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6692 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6693 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6694 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6695 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6696 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6697 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6698 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6699 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6700 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6701 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6702 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6703 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6704 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6705 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6706 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6707 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6708 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6709
6710 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6711
6712 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6713 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6714 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6715 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6716
6717 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6718 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6719 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6720
6721 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6722 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6723 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6724 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6725 }
6726
6727 int si_compile_tgsi_shader(struct si_screen *sscreen,
6728 LLVMTargetMachineRef tm,
6729 struct si_shader *shader,
6730 bool is_monolithic,
6731 struct pipe_debug_callback *debug)
6732 {
6733 struct si_shader_selector *sel = shader->selector;
6734 struct si_shader_context ctx;
6735 struct lp_build_tgsi_context *bld_base;
6736 LLVMModuleRef mod;
6737 int r = 0;
6738
6739 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6740 * conversion fails. */
6741 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6742 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6743 tgsi_dump(sel->tokens, 0);
6744 si_dump_streamout(&sel->so);
6745 }
6746
6747 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6748 ctx.is_monolithic = is_monolithic;
6749
6750 shader->info.uses_instanceid = sel->info.uses_instanceid;
6751
6752 bld_base = &ctx.radeon_bld.soa.bld_base;
6753 ctx.radeon_bld.load_system_value = declare_system_value;
6754
6755 switch (ctx.type) {
6756 case PIPE_SHADER_VERTEX:
6757 ctx.radeon_bld.load_input = declare_input_vs;
6758 if (shader->key.vs.as_ls)
6759 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6760 else if (shader->key.vs.as_es)
6761 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6762 else
6763 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6764 break;
6765 case PIPE_SHADER_TESS_CTRL:
6766 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6767 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6768 bld_base->emit_store = store_output_tcs;
6769 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6770 break;
6771 case PIPE_SHADER_TESS_EVAL:
6772 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6773 if (shader->key.tes.as_es)
6774 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6775 else
6776 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6777 break;
6778 case PIPE_SHADER_GEOMETRY:
6779 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6780 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6781 break;
6782 case PIPE_SHADER_FRAGMENT:
6783 ctx.radeon_bld.load_input = declare_input_fs;
6784 if (is_monolithic)
6785 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6786 else
6787 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6788 break;
6789 case PIPE_SHADER_COMPUTE:
6790 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6791 break;
6792 default:
6793 assert(!"Unsupported shader type");
6794 return -1;
6795 }
6796
6797 create_meta_data(&ctx);
6798 create_function(&ctx);
6799 preload_constants(&ctx);
6800 preload_shader_buffers(&ctx);
6801 preload_samplers(&ctx);
6802 preload_images(&ctx);
6803 preload_streamout_buffers(&ctx);
6804 preload_ring_buffers(&ctx);
6805
6806 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6807 shader->key.ps.prolog.poly_stipple) {
6808 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6809 SI_PARAM_RW_BUFFERS);
6810 si_llvm_emit_polygon_stipple(&ctx, list,
6811 SI_PARAM_POS_FIXED_PT);
6812 }
6813
6814 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6815 int i;
6816 for (i = 0; i < 4; i++) {
6817 ctx.gs_next_vertex[i] =
6818 lp_build_alloca(bld_base->base.gallivm,
6819 ctx.i32, "");
6820 }
6821 }
6822
6823 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6824 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6825 goto out;
6826 }
6827
6828 si_llvm_build_ret(&ctx, ctx.return_value);
6829 mod = bld_base->base.gallivm->module;
6830
6831 /* Dump LLVM IR before any optimization passes */
6832 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6833 r600_can_dump_shader(&sscreen->b, ctx.type))
6834 LLVMDumpModule(mod);
6835
6836 radeon_llvm_finalize_module(&ctx.radeon_bld);
6837
6838 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6839 mod, debug, ctx.type, "TGSI shader");
6840 if (r) {
6841 fprintf(stderr, "LLVM failed to compile shader\n");
6842 goto out;
6843 }
6844
6845 radeon_llvm_dispose(&ctx.radeon_bld);
6846
6847 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6848 * LLVM 3.9svn has this bug.
6849 */
6850 if (sel->type == PIPE_SHADER_COMPUTE) {
6851 unsigned *props = sel->info.properties;
6852 unsigned wave_size = 64;
6853 unsigned max_vgprs = 256;
6854 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6855 unsigned max_sgprs_per_wave = 128;
6856 unsigned min_waves_per_cu =
6857 DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
6858 props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
6859 props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
6860 wave_size);
6861 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6862
6863 max_vgprs = max_vgprs / min_waves_per_simd;
6864 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6865
6866 if (shader->config.num_sgprs > max_sgprs ||
6867 shader->config.num_vgprs > max_vgprs) {
6868 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6869 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6870 shader->config.num_sgprs, shader->config.num_vgprs,
6871 max_sgprs, max_vgprs);
6872
6873 /* Just terminate the process, because dependent
6874 * shaders can hang due to bad input data, but use
6875 * the env var to allow shader-db to work.
6876 */
6877 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6878 abort();
6879 }
6880 }
6881
6882 /* Add the scratch offset to input SGPRs. */
6883 if (shader->config.scratch_bytes_per_wave)
6884 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6885
6886 /* Calculate the number of fragment input VGPRs. */
6887 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6888 shader->info.num_input_vgprs = 0;
6889 shader->info.face_vgpr_index = -1;
6890
6891 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6892 shader->info.num_input_vgprs += 2;
6893 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6894 shader->info.num_input_vgprs += 2;
6895 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6896 shader->info.num_input_vgprs += 2;
6897 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6898 shader->info.num_input_vgprs += 3;
6899 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6900 shader->info.num_input_vgprs += 2;
6901 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6902 shader->info.num_input_vgprs += 2;
6903 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6904 shader->info.num_input_vgprs += 2;
6905 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6906 shader->info.num_input_vgprs += 1;
6907 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6908 shader->info.num_input_vgprs += 1;
6909 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6910 shader->info.num_input_vgprs += 1;
6911 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6912 shader->info.num_input_vgprs += 1;
6913 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6914 shader->info.num_input_vgprs += 1;
6915 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6916 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6917 shader->info.num_input_vgprs += 1;
6918 }
6919 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6920 shader->info.num_input_vgprs += 1;
6921 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6922 shader->info.num_input_vgprs += 1;
6923 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6924 shader->info.num_input_vgprs += 1;
6925 }
6926
6927 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6928 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6929 shader->gs_copy_shader->selector = shader->selector;
6930 ctx.shader = shader->gs_copy_shader;
6931 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6932 shader, debug))) {
6933 free(shader->gs_copy_shader);
6934 shader->gs_copy_shader = NULL;
6935 goto out;
6936 }
6937 }
6938
6939 out:
6940 return r;
6941 }
6942
6943 /**
6944 * Create, compile and return a shader part (prolog or epilog).
6945 *
6946 * \param sscreen screen
6947 * \param list list of shader parts of the same category
6948 * \param key shader part key
6949 * \param tm LLVM target machine
6950 * \param debug debug callback
6951 * \param compile the callback responsible for compilation
6952 * \return non-NULL on success
6953 */
6954 static struct si_shader_part *
6955 si_get_shader_part(struct si_screen *sscreen,
6956 struct si_shader_part **list,
6957 union si_shader_part_key *key,
6958 LLVMTargetMachineRef tm,
6959 struct pipe_debug_callback *debug,
6960 bool (*compile)(struct si_screen *,
6961 LLVMTargetMachineRef,
6962 struct pipe_debug_callback *,
6963 struct si_shader_part *))
6964 {
6965 struct si_shader_part *result;
6966
6967 pipe_mutex_lock(sscreen->shader_parts_mutex);
6968
6969 /* Find existing. */
6970 for (result = *list; result; result = result->next) {
6971 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6972 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6973 return result;
6974 }
6975 }
6976
6977 /* Compile a new one. */
6978 result = CALLOC_STRUCT(si_shader_part);
6979 result->key = *key;
6980 if (!compile(sscreen, tm, debug, result)) {
6981 FREE(result);
6982 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6983 return NULL;
6984 }
6985
6986 result->next = *list;
6987 *list = result;
6988 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6989 return result;
6990 }
6991
6992 /**
6993 * Create a vertex shader prolog.
6994 *
6995 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6996 * All inputs are returned unmodified. The vertex load indices are
6997 * stored after them, which will used by the API VS for fetching inputs.
6998 *
6999 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
7000 * input_v0,
7001 * input_v1,
7002 * input_v2,
7003 * input_v3,
7004 * (VertexID + BaseVertex),
7005 * (InstanceID + StartInstance),
7006 * (InstanceID / 2 + StartInstance)
7007 */
7008 static bool si_compile_vs_prolog(struct si_screen *sscreen,
7009 LLVMTargetMachineRef tm,
7010 struct pipe_debug_callback *debug,
7011 struct si_shader_part *out)
7012 {
7013 union si_shader_part_key *key = &out->key;
7014 struct si_shader shader = {};
7015 struct si_shader_context ctx;
7016 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7017 LLVMTypeRef *params, *returns;
7018 LLVMValueRef ret, func;
7019 int last_sgpr, num_params, num_returns, i;
7020 bool status = true;
7021
7022 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7023 ctx.type = PIPE_SHADER_VERTEX;
7024 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
7025 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
7026
7027 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
7028 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
7029 sizeof(LLVMTypeRef));
7030 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
7031 key->vs_prolog.last_input + 1) *
7032 sizeof(LLVMTypeRef));
7033 num_params = 0;
7034 num_returns = 0;
7035
7036 /* Declare input and output SGPRs. */
7037 num_params = 0;
7038 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7039 params[num_params++] = ctx.i32;
7040 returns[num_returns++] = ctx.i32;
7041 }
7042 last_sgpr = num_params - 1;
7043
7044 /* 4 preloaded VGPRs (outputs must be floats) */
7045 for (i = 0; i < 4; i++) {
7046 params[num_params++] = ctx.i32;
7047 returns[num_returns++] = ctx.f32;
7048 }
7049
7050 /* Vertex load indices. */
7051 for (i = 0; i <= key->vs_prolog.last_input; i++)
7052 returns[num_returns++] = ctx.f32;
7053
7054 /* Create the function. */
7055 si_create_function(&ctx, returns, num_returns, params,
7056 num_params, last_sgpr);
7057 func = ctx.radeon_bld.main_fn;
7058
7059 /* Copy inputs to outputs. This should be no-op, as the registers match,
7060 * but it will prevent the compiler from overwriting them unintentionally.
7061 */
7062 ret = ctx.return_value;
7063 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
7064 LLVMValueRef p = LLVMGetParam(func, i);
7065 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7066 }
7067 for (i = num_params - 4; i < num_params; i++) {
7068 LLVMValueRef p = LLVMGetParam(func, i);
7069 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
7070 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7071 }
7072
7073 /* Compute vertex load indices from instance divisors. */
7074 for (i = 0; i <= key->vs_prolog.last_input; i++) {
7075 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
7076 LLVMValueRef index;
7077
7078 if (divisor) {
7079 /* InstanceID / Divisor + StartInstance */
7080 index = get_instance_index_for_fetch(&ctx.radeon_bld,
7081 SI_SGPR_START_INSTANCE,
7082 divisor);
7083 } else {
7084 /* VertexID + BaseVertex */
7085 index = LLVMBuildAdd(gallivm->builder,
7086 LLVMGetParam(func, ctx.param_vertex_id),
7087 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
7088 }
7089
7090 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
7091 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
7092 num_params++, "");
7093 }
7094
7095 /* Compile. */
7096 si_llvm_build_ret(&ctx, ret);
7097 radeon_llvm_finalize_module(&ctx.radeon_bld);
7098
7099 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7100 gallivm->module, debug, ctx.type,
7101 "Vertex Shader Prolog"))
7102 status = false;
7103
7104 radeon_llvm_dispose(&ctx.radeon_bld);
7105 return status;
7106 }
7107
7108 /**
7109 * Compile the vertex shader epilog. This is also used by the tessellation
7110 * evaluation shader compiled as VS.
7111 *
7112 * The input is PrimitiveID.
7113 *
7114 * If PrimitiveID is required by the pixel shader, export it.
7115 * Otherwise, do nothing.
7116 */
7117 static bool si_compile_vs_epilog(struct si_screen *sscreen,
7118 LLVMTargetMachineRef tm,
7119 struct pipe_debug_callback *debug,
7120 struct si_shader_part *out)
7121 {
7122 union si_shader_part_key *key = &out->key;
7123 struct si_shader_context ctx;
7124 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7125 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7126 LLVMTypeRef params[5];
7127 int num_params, i;
7128 bool status = true;
7129
7130 si_init_shader_ctx(&ctx, sscreen, NULL, tm);
7131 ctx.type = PIPE_SHADER_VERTEX;
7132
7133 /* Declare input VGPRs. */
7134 num_params = key->vs_epilog.states.export_prim_id ?
7135 (VS_EPILOG_PRIMID_LOC + 1) : 0;
7136 assert(num_params <= ARRAY_SIZE(params));
7137
7138 for (i = 0; i < num_params; i++)
7139 params[i] = ctx.f32;
7140
7141 /* Create the function. */
7142 si_create_function(&ctx, NULL, 0, params, num_params, -1);
7143
7144 /* Emit exports. */
7145 if (key->vs_epilog.states.export_prim_id) {
7146 struct lp_build_context *base = &bld_base->base;
7147 struct lp_build_context *uint = &bld_base->uint_bld;
7148 LLVMValueRef args[9];
7149
7150 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
7151 args[1] = uint->zero; /* whether the EXEC mask is valid */
7152 args[2] = uint->zero; /* DONE bit */
7153 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
7154 key->vs_epilog.prim_id_param_offset);
7155 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
7156 args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
7157 VS_EPILOG_PRIMID_LOC); /* X */
7158 args[6] = uint->undef; /* Y */
7159 args[7] = uint->undef; /* Z */
7160 args[8] = uint->undef; /* W */
7161
7162 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
7163 LLVMVoidTypeInContext(base->gallivm->context),
7164 args, 9, 0);
7165 }
7166
7167 /* Compile. */
7168 LLVMBuildRetVoid(gallivm->builder);
7169 radeon_llvm_finalize_module(&ctx.radeon_bld);
7170
7171 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7172 gallivm->module, debug, ctx.type,
7173 "Vertex Shader Epilog"))
7174 status = false;
7175
7176 radeon_llvm_dispose(&ctx.radeon_bld);
7177 return status;
7178 }
7179
7180 /**
7181 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7182 */
7183 static bool si_get_vs_epilog(struct si_screen *sscreen,
7184 LLVMTargetMachineRef tm,
7185 struct si_shader *shader,
7186 struct pipe_debug_callback *debug,
7187 struct si_vs_epilog_bits *states)
7188 {
7189 union si_shader_part_key epilog_key;
7190
7191 memset(&epilog_key, 0, sizeof(epilog_key));
7192 epilog_key.vs_epilog.states = *states;
7193
7194 /* Set up the PrimitiveID output. */
7195 if (shader->key.vs.epilog.export_prim_id) {
7196 unsigned index = shader->selector->info.num_outputs;
7197 unsigned offset = shader->info.nr_param_exports++;
7198
7199 epilog_key.vs_epilog.prim_id_param_offset = offset;
7200 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7201 shader->info.vs_output_param_offset[index] = offset;
7202 }
7203
7204 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7205 &epilog_key, tm, debug,
7206 si_compile_vs_epilog);
7207 return shader->epilog != NULL;
7208 }
7209
7210 /**
7211 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7212 */
7213 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7214 LLVMTargetMachineRef tm,
7215 struct si_shader *shader,
7216 struct pipe_debug_callback *debug)
7217 {
7218 struct tgsi_shader_info *info = &shader->selector->info;
7219 union si_shader_part_key prolog_key;
7220 unsigned i;
7221
7222 /* Get the prolog. */
7223 memset(&prolog_key, 0, sizeof(prolog_key));
7224 prolog_key.vs_prolog.states = shader->key.vs.prolog;
7225 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7226 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7227
7228 /* The prolog is a no-op if there are no inputs. */
7229 if (info->num_inputs) {
7230 shader->prolog =
7231 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7232 &prolog_key, tm, debug,
7233 si_compile_vs_prolog);
7234 if (!shader->prolog)
7235 return false;
7236 }
7237
7238 /* Get the epilog. */
7239 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7240 !si_get_vs_epilog(sscreen, tm, shader, debug,
7241 &shader->key.vs.epilog))
7242 return false;
7243
7244 /* Set the instanceID flag. */
7245 for (i = 0; i < info->num_inputs; i++)
7246 if (prolog_key.vs_prolog.states.instance_divisors[i])
7247 shader->info.uses_instanceid = true;
7248
7249 return true;
7250 }
7251
7252 /**
7253 * Select and compile (or reuse) TES parts (epilog).
7254 */
7255 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7256 LLVMTargetMachineRef tm,
7257 struct si_shader *shader,
7258 struct pipe_debug_callback *debug)
7259 {
7260 if (shader->key.tes.as_es)
7261 return true;
7262
7263 /* TES compiled as VS. */
7264 return si_get_vs_epilog(sscreen, tm, shader, debug,
7265 &shader->key.tes.epilog);
7266 }
7267
/**
 * Compile the TCS epilog. This writes tesselation factors to memory based on
 * the output primitive type of the tesselator (determined by TES).
 *
 * \param sscreen	screen
 * \param tm		LLVM target machine
 * \param debug		debug callback
 * \param out		shader part; key is read, binary/config are written
 * \return		true on success
 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	/* A dummy shader struct carrying only the epilog key state. */
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
	/* NOTE: the SI_PARAM_* slots must match the API TCS parameter layout
	 * exactly so the epilog can be appended to any TCS main part. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors using the three VGPR inputs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7332
7333 /**
7334 * Select and compile (or reuse) TCS parts (epilog).
7335 */
7336 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7337 LLVMTargetMachineRef tm,
7338 struct si_shader *shader,
7339 struct pipe_debug_callback *debug)
7340 {
7341 union si_shader_part_key epilog_key;
7342
7343 /* Get the epilog. */
7344 memset(&epilog_key, 0, sizeof(epilog_key));
7345 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7346
7347 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7348 &epilog_key, tm, debug,
7349 si_compile_tcs_epilog);
7350 return shader->epilog != NULL;
7351 }
7352
7353 /**
7354 * Compile the pixel shader prolog. This handles:
7355 * - two-side color selection and interpolation
7356 * - overriding interpolation parameters for the API PS
7357 * - polygon stippling
7358 *
7359 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7360 * overridden by other states. (e.g. per-sample interpolation)
7361 * Interpolated colors are stored after the preloaded VGPRs.
7362 */
static bool si_compile_ps_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;
	bool status = true; /* set to false only if LLVM compilation fails */

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.prolog = key->ps_prolog.states;

	/* Number of inputs + 8 color elements (2 colors * 4 channels max
	 * can be appended as extra return values).
	 */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs: all SGPRs first, then all VGPRs, mirroring the
	 * register layout of the API shader this prolog is prepended to.
	 */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx.i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx.f32;

	/* Declare outputs (same as inputs + add colors if needed) */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, params, num_returns, params,
			   num_params, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling: kill pixels based on the stipple pattern looked
	 * up from an RW buffer using the fixed-point position.
	 */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], list;

		/* Get the pointer to rw buffers (64-bit address split across
		 * two 32-bit SGPRs, reassembled here).
		 */
		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
		list = lp_build_gather_values(gallivm, ptr, 2);
		list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
		list = LLVMBuildIntToPtr(gallivm->builder, list,
					 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");

		si_llvm_emit_polygon_stipple(&ctx, list, pos);
	}

	/* Barycentric optimization: replace CENTROID (i,j) with CENTER (i,j)
	 * when the wave contains only fully-covered quads.
	 *
	 * Per the existing input-VGPR comments below, the interpolation (i,j)
	 * pairs live at fixed offsets from "base" (= num_input_sgprs):
	 *   base+0:  PERSP_SAMPLE     base+6:  LINEAR_SAMPLE
	 *   base+2:  PERSP_CENTER     base+8:  LINEAR_CENTER
	 *   base+4:  PERSP_CENTROID   base+10: LINEAR_CENTROID
	 */
	if (key->ps_prolog.states.bc_optimize_for_persp ||
	    key->ps_prolog.states.bc_optimize_for_linear) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;

		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 *
		 * PRIM_MASK is after user SGPRs.
		 */
		bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
		bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
					    LLVMConstInt(ctx.i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
					     ctx.i1, "");

		if (key->ps_prolog.states.bc_optimize_for_persp) {
			/* Read PERSP_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 2 + i);
			/* Read PERSP_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 4 + i);
			/* Select PERSP_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 4 + i, "");
			}
		}
		if (key->ps_prolog.states.bc_optimize_for_linear) {
			/* Read LINEAR_CENTER. */
			for (i = 0; i < 2; i++)
				center[i] = LLVMGetParam(func, base + 8 + i);
			/* Read LINEAR_CENTROID. */
			for (i = 0; i < 2; i++)
				centroid[i] = LLVMGetParam(func, base + 10 + i);
			/* Select LINEAR_CENTROID. */
			for (i = 0; i < 2; i++) {
				tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
						      center[i], centroid[i], "");
				ret = LLVMBuildInsertValue(gallivm->builder, ret,
							   tmp, base + 10 + i, "");
			}
		}
	}

	/* Interpolate colors (COLOR0 and COLOR1); the interpolated channels
	 * are appended as extra return values after the pass-through inputs.
	 */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Get the (i,j) updated by bc_optimize handling. */
			interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr, "");
			interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
							  interp_vgpr + 1, "");
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
						     ctx.v2i32, "");
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		/* Two-sided lighting needs the front-face VGPR to pick
		 * between front and back colors.
		 */
		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
		}

		interp_fs_input(&ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append only the channels the API shader actually reads. */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Force per-sample interpolation: overwrite the CENTER/CENTROID
	 * (i,j) outputs with the SAMPLE (i,j) values.
	 */
	if (key->ps_prolog.states.force_persp_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_sample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_sample[2];

		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Force center interpolation: overwrite the SAMPLE/CENTROID
	 * (i,j) outputs with the CENTER (i,j) values.
	 */
	if (key->ps_prolog.states.force_persp_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_center[2];

		/* Read PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			persp_center[i] = LLVMGetParam(func, base + 2 + i);
		/* Overwrite PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_center[i], base + 4 + i, "");
	}
	if (key->ps_prolog.states.force_linear_center_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef linear_center[2];

		/* Read LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			linear_center[i] = LLVMGetParam(func, base + 8 + i);
		/* Overwrite LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 6 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_center[i], base + 10 + i, "");
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	/* Compile. */
	si_llvm_build_ret(&ctx, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7616
7617 /**
7618 * Compile the pixel shader epilog. This handles everything that must be
7619 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7620 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* 16 SGPR slots + 8 MRTs * 4 channels + Z/stencil/samplemask VGPRs */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params, i;
	bool status = true; /* set to false only if LLVM compilation fails */
	struct si_ps_exports exp = {};

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs (must match the API shader's SGPR layout). */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: 4 channels per written color, then
	 * optional Z, stencil, and sample mask.
	 */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Always declare VGPRs at least up to the sample-coverage slot;
	 * the last VGPR (num_params - 1) is passed to si_export_mrt_color
	 * below as the coverage source.
	 */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, so that export can carry the "done"
	 * flag. Only applicable when there is no Z/stencil/samplemask export.
	 */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Export each written color; colors occupy consecutive VGPR
	 * quadruples in MRT order.
	 */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	/* A PS must export something; emit a null export if neither a
	 * color nor a Z/stencil/samplemask export was produced.
	 */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(&ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7736
7737 /**
7738 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7739 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs WQM only if it produces interpolated values
	 * that derivatives in the main shader will consume.
	 */
	prolog_key.ps_prolog.wqm = info->uses_derivatives &&
		(prolog_key.ps_prolog.colors_read ||
		 prolog_key.ps_prolog.states.force_persp_sample_interp ||
		 prolog_key.ps_prolog.states.force_linear_sample_interp ||
		 prolog_key.ps_prolog.states.force_persp_center_interp ||
		 prolog_key.ps_prolog.states.force_linear_center_interp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
		 prolog_key.ps_prolog.states.bc_optimize_for_linear);

	/* Describe how COLOR0/COLOR1 should be interpolated by the prolog:
	 * which attribute, which (i,j) VGPR pair, and which SPI input
	 * enable bits that implies.
	 */
	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		for (i = 0; i < 2; i++) {
			unsigned interp = info->input_interpolate[color[i]];
			unsigned location = info->input_interpolate_loc[color[i]];

			/* Skip colors whose channels aren't read at all. */
			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			/* Flat shading turns COLOR interpolation into CONSTANT. */
			if (shader->key.ps.prolog.flatshade_colors &&
			    interp == TGSI_INTERPOLATE_COLOR)
				interp = TGSI_INTERPOLATE_CONSTANT;

			/* color_interp_vgpr_index selects the (i,j) pair:
			 * -1 = constant, 0/2/4 = persp sample/center/centroid,
			 * 6/8/10 = linear sample/center/centroid.
			 */
			switch (interp) {
			case TGSI_INTERPOLATE_CONSTANT:
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_persp_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_persp_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				/* Force the interpolation location for colors here. */
				if (shader->key.ps.prolog.force_linear_sample_interp)
					location = TGSI_INTERPOLATE_LOC_SAMPLE;
				if (shader->key.ps.prolog.force_linear_center_interp)
					location = TGSI_INTERPOLATE_LOC_CENTER;

				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persp_sample_interp ||
	    prolog_key.ps_prolog.states.force_linear_sample_interp ||
	    prolog_key.ps_prolog.states.force_persp_center_interp ||
	    prolog_key.ps_prolog.states.force_linear_center_interp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_persp ||
	    prolog_key.ps_prolog.states.bc_optimize_for_linear ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. Unlike the prolog, the epilog is always needed. */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed:
	 * when interpolation is forced to SAMPLE (or CENTER), request only
	 * the forced (i,j) pair and drop the ones the prolog overwrites.
	 */
	if (shader->key.ps.prolog.force_persp_sample_interp &&
	    (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_sample_interp &&
	    (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
	}
	if (shader->key.ps.prolog.force_persp_center_interp &&
	    (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
	}
	if (shader->key.ps.prolog.force_linear_center_interp &&
	    (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
	     G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
		shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
	}

	/* POS_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7943
7944 static void si_fix_num_sgprs(struct si_shader *shader)
7945 {
7946 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7947
7948 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7949 }
7950
/* Create a ready-to-use shader variant: compile it (monolithically, or by
 * combining the precompiled main part with prolog/epilog parts), dump it
 * for debugging, and upload the binary. Returns 0 on success, non-zero on
 * failure (the compile/upload error code).
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over.
		 * Note: the binary is shared, not duplicated (see
		 * is_binary_shared and si_shader_destroy).
		 */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the combined shader must fit
		 * the register demands of all of its parts.
		 */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
8056
/* Release all resources owned by a shader variant (but not the si_shader
 * struct itself, which the caller owns).
 */
void si_shader_destroy(struct si_shader *shader)
{
	/* The GS copy shader is a nested si_shader; free it recursively. */
	if (shader->gs_copy_shader) {
		si_shader_destroy(shader->gs_copy_shader);
		FREE(shader->gs_copy_shader);
	}

	if (shader->scratch_bo)
		r600_resource_reference(&shader->scratch_bo, NULL);

	r600_resource_reference(&shader->bo, NULL);

	/* When the shader was assembled from parts, the binary is shared
	 * with the selector's main shader part (see si_shader_create);
	 * only clean it when this variant owns it.
	 */
	if (!shader->is_binary_shared)
		radeon_shader_binary_clean(&shader->binary);

	free(shader->shader_log);
}