radeonsi: get rid of img/buf/sampler descriptor preloading (v2)
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "gallivm/lp_bld_misc.h"
37 #include "radeon/r600_cs.h"
38 #include "radeon/radeon_llvm.h"
39 #include "radeon/radeon_elf_util.h"
40 #include "radeon/radeon_llvm_emit.h"
41 #include "util/u_memory.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
54 static const char *scratch_rsrc_dword0_symbol =
55 "SCRATCH_RSRC_DWORD0";
56
57 static const char *scratch_rsrc_dword1_symbol =
58 "SCRATCH_RSRC_DWORD1";
59
/* One shader output: up to four per-channel LLVM values plus two
 * identifying integers.
 * NOTE(review): `name` and `sid` presumably hold the TGSI semantic name
 * and semantic index of the output; the users are outside this excerpt,
 * so confirm before relying on it.
 */
60 struct si_shader_output_values
61 {
62 LLVMValueRef values[4];
63 unsigned name;
64 unsigned sid;
65 };
66
/* Per-compilation state of the radeonsi TGSI -> LLVM translator.
 *
 * si_shader_context() obtains this struct by casting a
 * struct lp_build_tgsi_context pointer, so `radeon_bld` has to remain the
 * first member (NOTE(review): this also assumes the TGSI build context sits
 * at offset 0 inside struct radeon_llvm_context; confirm in radeon_llvm.h).
 */
67 struct si_shader_context
68 {
69 struct radeon_llvm_context radeon_bld;
70 struct si_shader *shader;
71 struct si_screen *screen;
72
73 unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
74 bool is_gs_copy_shader;
75
76 /* Whether to generate the optimized shader variant compiled as a whole
77 * (without a prolog and epilog)
78 */
79 bool is_monolithic;
80
/* Indices of main-function parameters; the helpers in this file pass them
 * to LLVMGetParam(main_fn, ...). */
81 int param_streamout_config;
82 int param_streamout_write_index;
83 int param_streamout_offset[4];
84 int param_vertex_id;
85 int param_rel_auto_id;
86 int param_vs_prim_id;
87 int param_instance_id;
88 int param_vertex_index0;
89 int param_tes_u;
90 int param_tes_v;
91 int param_tes_rel_patch_id;
92 int param_tes_patch_id;
93 int param_es2gs_offset;
94 int param_oc_lds;
95
96 /* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
97 * 0x800000 for VS, 0x1 for ES.
98 */
99 int param_tess_offchip;
100
101 LLVMTargetMachineRef tm;
102
/* Metadata kind IDs and the empty metadata node that build_indexed_load()
 * and build_indexed_load_const() attach via LLVMSetMetadata(). */
103 unsigned invariant_load_md_kind;
104 unsigned range_md_kind;
105 unsigned uniform_md_kind;
106 LLVMValueRef empty_md;
107
108 /* Preloaded descriptors. */
109 LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
110 LLVMValueRef esgs_ring;
111 LLVMValueRef gsvs_ring[4];
112
/* `lds` is the base pointer that lds_load()/lds_store() index in dwords. */
113 LLVMValueRef lds;
114 LLVMValueRef gs_next_vertex[4];
115 LLVMValueRef return_value;
116
/* Cached LLVM types, named after the type they hold. */
117 LLVMTypeRef voidt;
118 LLVMTypeRef i1;
119 LLVMTypeRef i8;
120 LLVMTypeRef i32;
121 LLVMTypeRef i64;
122 LLVMTypeRef i128;
123 LLVMTypeRef f32;
124 LLVMTypeRef v16i8;
125 LLVMTypeRef v2i32;
126 LLVMTypeRef v4i32;
127 LLVMTypeRef v4f32;
128 LLVMTypeRef v8i32;
129
130 LLVMValueRef shared_memory;
131 };
132
/**
 * Recover the driver's shader context from a generic TGSI build context.
 *
 * This is a plain pointer cast, so it is only valid when \p bld_base is the
 * build context embedded at the start of a struct si_shader_context.
 */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = (struct si_shader_context *)bld_base;

	return ctx;
}
138
/* Prototypes of file-local functions that are used before their definition.
 * The definitions are not part of the code above and must appear later in
 * this translation unit. */
139 static void si_init_shader_ctx(struct si_shader_context *ctx,
140 struct si_screen *sscreen,
141 struct si_shader *shader,
142 LLVMTargetMachineRef tm);
143
144 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
145 struct lp_build_tgsi_context *bld_base,
146 struct lp_build_emit_data *emit_data);
147
148 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
149 FILE *f);
150
151 /* Ideally pass the sample mask input to the PS epilog as v13, which
152 * is its usual location, so that the shader doesn't have to add v_mov.
153 */
154 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
155
156 /* The VS location of the PrimitiveID input is the same in the epilog,
157 * so that the main shader part doesn't have to move it.
158 */
159 #define VS_EPILOG_PRIMID_LOC 2
160
/* NOTE(review): the *_BASE and *_OFFSET values below are presumably combined
 * to locate interpolation inputs; their users are outside this excerpt. */
161 #define PERSPECTIVE_BASE 0
162 #define LINEAR_BASE 9
163
164 #define SAMPLE_OFFSET 0
165 #define CENTER_OFFSET 2
/* NOTE(review): spelled "OFSET" with a single F; every user has to use this
 * exact spelling, so do not correct it here alone. */
166 #define CENTROID_OFSET 4
167
168 #define USE_SGPR_MAX_SUFFIX_LEN 5
/* NOTE(review): presumably LLVM address-space numbers of the AMDGPU backend;
 * confirm against the LLVM version in use. */
169 #define CONST_ADDR_SPACE 2
170 #define LOCAL_ADDR_SPACE 3
171 #define USER_SGPR_ADDR_SPACE 8
172
173
/* NOTE(review): presumably message IDs and operation fields for the sendmsg
 * instruction used by geometry shaders; the OP values occupy bits 4 and up. */
174 #define SENDMSG_GS 2
175 #define SENDMSG_GS_DONE 3
176
177 #define SENDMSG_GS_OP_NOP (0 << 4)
178 #define SENDMSG_GS_OP_CUT (1 << 4)
179 #define SENDMSG_GS_OP_EMIT (2 << 4)
180 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
181
182 /**
183 * Returns a unique index for a semantic name and index. The index must be
184 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
185 * calculated.
186 */
187 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
188 {
189 switch (semantic_name) {
190 case TGSI_SEMANTIC_POSITION:
191 return 0;
192 case TGSI_SEMANTIC_PSIZE:
193 return 1;
194 case TGSI_SEMANTIC_CLIPDIST:
195 assert(index <= 1);
196 return 2 + index;
197 case TGSI_SEMANTIC_GENERIC:
198 if (index <= 63-4)
199 return 4 + index;
200 else
201 /* same explanation as in the default statement,
202 * the only user hitting this is st/nine.
203 */
204 return 0;
205
206 /* patch indices are completely separate and thus start from 0 */
207 case TGSI_SEMANTIC_TESSOUTER:
208 return 0;
209 case TGSI_SEMANTIC_TESSINNER:
210 return 1;
211 case TGSI_SEMANTIC_PATCH:
212 return 2 + index;
213
214 default:
215 /* Don't fail here. The result of this function is only used
216 * for LS, TCS, TES, and GS, where legacy GL semantics can't
217 * occur, but this function is called for all vertex shaders
218 * before it's known whether LS will be compiled or not.
219 */
220 return 0;
221 }
222 }
223
224 /**
225 * Get the value of a shader input parameter and extract a bitfield.
226 */
227 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
228 unsigned param, unsigned rshift,
229 unsigned bitwidth)
230 {
231 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
232 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
233 param);
234
235 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
236 value = bitcast(&ctx->radeon_bld.soa.bld_base,
237 TGSI_TYPE_UNSIGNED, value);
238
239 if (rshift)
240 value = LLVMBuildLShr(gallivm->builder, value,
241 lp_build_const_int32(gallivm, rshift), "");
242
243 if (rshift + bitwidth < 32) {
244 unsigned mask = (1 << bitwidth) - 1;
245 value = LLVMBuildAnd(gallivm->builder, value,
246 lp_build_const_int32(gallivm, mask), "");
247 }
248
249 return value;
250 }
251
252 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
253 {
254 switch (ctx->type) {
255 case PIPE_SHADER_TESS_CTRL:
256 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
257
258 case PIPE_SHADER_TESS_EVAL:
259 return LLVMGetParam(ctx->radeon_bld.main_fn,
260 ctx->param_tes_rel_patch_id);
261
262 default:
263 assert(0);
264 return NULL;
265 }
266 }
267
268 /* Tessellation shaders pass outputs to the next shader using LDS.
269 *
270 * LS outputs = TCS inputs
271 * TCS outputs = TES inputs
272 *
273 * The LDS layout is:
274 * - TCS inputs for patch 0
275 * - TCS inputs for patch 1
276 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
277 * - ...
278 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
279 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
280 * - TCS outputs for patch 1
281 * - Per-patch TCS outputs for patch 1
282 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
283 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
284 * - ...
285 *
286 * All three shaders VS(LS), TCS, TES share the same LDS space.
287 */
288
289 static LLVMValueRef
290 get_tcs_in_patch_stride(struct si_shader_context *ctx)
291 {
292 if (ctx->type == PIPE_SHADER_VERTEX)
293 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
294 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
295 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
296 else {
297 assert(0);
298 return NULL;
299 }
300 }
301
302 static LLVMValueRef
303 get_tcs_out_patch_stride(struct si_shader_context *ctx)
304 {
305 return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
306 }
307
308 static LLVMValueRef
309 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
310 {
311 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
312 unpack_param(ctx,
313 SI_PARAM_TCS_OUT_OFFSETS,
314 0, 16),
315 4);
316 }
317
318 static LLVMValueRef
319 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
320 {
321 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
322 unpack_param(ctx,
323 SI_PARAM_TCS_OUT_OFFSETS,
324 16, 16),
325 4);
326 }
327
328 static LLVMValueRef
329 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
330 {
331 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
332 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
333 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
334
335 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
336 }
337
338 static LLVMValueRef
339 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
340 {
341 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
342 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
343 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
344 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
345
346 return LLVMBuildAdd(gallivm->builder, patch0_offset,
347 LLVMBuildMul(gallivm->builder, patch_stride,
348 rel_patch_id, ""),
349 "");
350 }
351
352 static LLVMValueRef
353 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
354 {
355 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
356 LLVMValueRef patch0_patch_data_offset =
357 get_tcs_out_patch0_patch_data_offset(ctx);
358 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
359 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
360
361 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
362 LLVMBuildMul(gallivm->builder, patch_stride,
363 rel_patch_id, ""),
364 "");
365 }
366
367 static void build_indexed_store(struct si_shader_context *ctx,
368 LLVMValueRef base_ptr, LLVMValueRef index,
369 LLVMValueRef value)
370 {
371 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
372 struct gallivm_state *gallivm = bld_base->base.gallivm;
373 LLVMValueRef indices[2], pointer;
374
375 indices[0] = bld_base->uint_bld.zero;
376 indices[1] = index;
377
378 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
379 LLVMBuildStore(gallivm->builder, value, pointer);
380 }
381
382 /**
383 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
384 * It's equivalent to doing a load from &base_ptr[index].
385 *
386 * \param base_ptr Where the array starts.
387 * \param index The element index into the array.
388 * \param uniform Whether the base_ptr and index can be assumed to be
389 * dynamically uniform
390 */
391 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
392 LLVMValueRef base_ptr, LLVMValueRef index,
393 bool uniform)
394 {
395 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
396 struct gallivm_state *gallivm = bld_base->base.gallivm;
397 LLVMValueRef indices[2], pointer;
398
399 indices[0] = bld_base->uint_bld.zero;
400 indices[1] = index;
401
402 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
403 if (uniform)
404 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
405 return LLVMBuildLoad(gallivm->builder, pointer, "");
406 }
407
408 /**
409 * Do a load from &base_ptr[index], but also add a flag that it's loading
410 * a constant from a dynamically uniform index.
411 */
412 static LLVMValueRef build_indexed_load_const(
413 struct si_shader_context *ctx,
414 LLVMValueRef base_ptr, LLVMValueRef index)
415 {
416 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
417 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
418 return result;
419 }
420
421 static LLVMValueRef get_instance_index_for_fetch(
422 struct radeon_llvm_context *radeon_bld,
423 unsigned param_start_instance, unsigned divisor)
424 {
425 struct si_shader_context *ctx =
426 si_shader_context(&radeon_bld->soa.bld_base);
427 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
428
429 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
430 ctx->param_instance_id);
431
432 /* The division must be done before START_INSTANCE is added. */
433 if (divisor > 1)
434 result = LLVMBuildUDiv(gallivm->builder, result,
435 lp_build_const_int32(gallivm, divisor), "");
436
437 return LLVMBuildAdd(gallivm->builder, result,
438 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
439 }
440
441 static void declare_input_vs(
442 struct radeon_llvm_context *radeon_bld,
443 unsigned input_index,
444 const struct tgsi_full_declaration *decl)
445 {
446 struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
447 struct gallivm_state *gallivm = base->gallivm;
448 struct si_shader_context *ctx =
449 si_shader_context(&radeon_bld->soa.bld_base);
450 unsigned divisor =
451 ctx->shader->key.vs.prolog.instance_divisors[input_index];
452
453 unsigned chan;
454
455 LLVMValueRef t_list_ptr;
456 LLVMValueRef t_offset;
457 LLVMValueRef t_list;
458 LLVMValueRef attribute_offset;
459 LLVMValueRef buffer_index;
460 LLVMValueRef args[3];
461 LLVMValueRef input;
462
463 /* Load the T list */
464 t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
465
466 t_offset = lp_build_const_int32(gallivm, input_index);
467
468 t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);
469
470 /* Build the attribute offset */
471 attribute_offset = lp_build_const_int32(gallivm, 0);
472
473 if (!ctx->is_monolithic) {
474 buffer_index = LLVMGetParam(radeon_bld->main_fn,
475 ctx->param_vertex_index0 +
476 input_index);
477 } else if (divisor) {
478 /* Build index from instance ID, start instance and divisor */
479 ctx->shader->info.uses_instanceid = true;
480 buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
481 SI_PARAM_START_INSTANCE,
482 divisor);
483 } else {
484 /* Load the buffer index for vertices. */
485 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
486 ctx->param_vertex_id);
487 LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
488 SI_PARAM_BASE_VERTEX);
489 buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
490 }
491
492 args[0] = t_list;
493 args[1] = attribute_offset;
494 args[2] = buffer_index;
495 input = lp_build_intrinsic(gallivm->builder,
496 "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
497 LLVMReadNoneAttribute);
498
499 /* Break up the vec4 into individual components */
500 for (chan = 0; chan < 4; chan++) {
501 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
502 /* XXX: Use a helper function for this. There is one in
503 * tgsi_llvm.c. */
504 ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
505 LLVMBuildExtractElement(gallivm->builder,
506 input, llvm_chan, "");
507 }
508 }
509
510 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
511 unsigned swizzle)
512 {
513 struct si_shader_context *ctx = si_shader_context(bld_base);
514
515 if (swizzle > 0)
516 return bld_base->uint_bld.zero;
517
518 switch (ctx->type) {
519 case PIPE_SHADER_VERTEX:
520 return LLVMGetParam(ctx->radeon_bld.main_fn,
521 ctx->param_vs_prim_id);
522 case PIPE_SHADER_TESS_CTRL:
523 return LLVMGetParam(ctx->radeon_bld.main_fn,
524 SI_PARAM_PATCH_ID);
525 case PIPE_SHADER_TESS_EVAL:
526 return LLVMGetParam(ctx->radeon_bld.main_fn,
527 ctx->param_tes_patch_id);
528 case PIPE_SHADER_GEOMETRY:
529 return LLVMGetParam(ctx->radeon_bld.main_fn,
530 SI_PARAM_PRIMITIVE_ID);
531 default:
532 assert(0);
533 return bld_base->uint_bld.zero;
534 }
535 }
536
537 /**
538 * Return the value of tgsi_ind_register for indexing.
539 * This is the indirect index with the constant offset added to it.
540 */
541 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
542 const struct tgsi_ind_register *ind,
543 int rel_index)
544 {
545 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
546 LLVMValueRef result;
547
548 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
549 result = LLVMBuildLoad(gallivm->builder, result, "");
550 result = LLVMBuildAdd(gallivm->builder, result,
551 lp_build_const_int32(gallivm, rel_index), "");
552 return result;
553 }
554
555 /**
556 * Like get_indirect_index, but restricts the return value to a (possibly
557 * undefined) value inside [0..num).
558 */
559 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
560 const struct tgsi_ind_register *ind,
561 int rel_index, unsigned num)
562 {
563 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
564
565 /* LLVM 3.8: If indirect resource indexing is used:
566 * - SI & CIK hang
567 * - VI crashes
568 */
569 if (HAVE_LLVM <= 0x0308)
570 return LLVMGetUndef(ctx->i32);
571
572 return radeon_llvm_bound_index(&ctx->radeon_bld, result, num);
573 }
574
575
576 /**
577 * Calculate a dword address given an input or output register and a stride.
578 */
579 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
580 const struct tgsi_full_dst_register *dst,
581 const struct tgsi_full_src_register *src,
582 LLVMValueRef vertex_dw_stride,
583 LLVMValueRef base_addr)
584 {
585 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
586 struct tgsi_shader_info *info = &ctx->shader->selector->info;
587 ubyte *name, *index, *array_first;
588 int first, param;
589 struct tgsi_full_dst_register reg;
590
591 /* Set the register description. The address computation is the same
592 * for sources and destinations. */
593 if (src) {
594 reg.Register.File = src->Register.File;
595 reg.Register.Index = src->Register.Index;
596 reg.Register.Indirect = src->Register.Indirect;
597 reg.Register.Dimension = src->Register.Dimension;
598 reg.Indirect = src->Indirect;
599 reg.Dimension = src->Dimension;
600 reg.DimIndirect = src->DimIndirect;
601 } else
602 reg = *dst;
603
604 /* If the register is 2-dimensional (e.g. an array of vertices
605 * in a primitive), calculate the base address of the vertex. */
606 if (reg.Register.Dimension) {
607 LLVMValueRef index;
608
609 if (reg.Dimension.Indirect)
610 index = get_indirect_index(ctx, &reg.DimIndirect,
611 reg.Dimension.Index);
612 else
613 index = lp_build_const_int32(gallivm, reg.Dimension.Index);
614
615 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
616 LLVMBuildMul(gallivm->builder, index,
617 vertex_dw_stride, ""), "");
618 }
619
620 /* Get information about the register. */
621 if (reg.Register.File == TGSI_FILE_INPUT) {
622 name = info->input_semantic_name;
623 index = info->input_semantic_index;
624 array_first = info->input_array_first;
625 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
626 name = info->output_semantic_name;
627 index = info->output_semantic_index;
628 array_first = info->output_array_first;
629 } else {
630 assert(0);
631 return NULL;
632 }
633
634 if (reg.Register.Indirect) {
635 /* Add the relative address of the element. */
636 LLVMValueRef ind_index;
637
638 if (reg.Indirect.ArrayID)
639 first = array_first[reg.Indirect.ArrayID];
640 else
641 first = reg.Register.Index;
642
643 ind_index = get_indirect_index(ctx, &reg.Indirect,
644 reg.Register.Index - first);
645
646 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
647 LLVMBuildMul(gallivm->builder, ind_index,
648 lp_build_const_int32(gallivm, 4), ""), "");
649
650 param = si_shader_io_get_unique_index(name[first], index[first]);
651 } else {
652 param = si_shader_io_get_unique_index(name[reg.Register.Index],
653 index[reg.Register.Index]);
654 }
655
656 /* Add the base address of the element. */
657 return LLVMBuildAdd(gallivm->builder, base_addr,
658 lp_build_const_int32(gallivm, param * 4), "");
659 }
660
661 /* The offchip buffer layout for TCS->TES is
662 *
663 * - attribute 0 of patch 0 vertex 0
664 * - attribute 0 of patch 0 vertex 1
665 * - attribute 0 of patch 0 vertex 2
666 * ...
667 * - attribute 0 of patch 1 vertex 0
668 * - attribute 0 of patch 1 vertex 1
669 * ...
670 * - attribute 1 of patch 0 vertex 0
671 * - attribute 1 of patch 0 vertex 1
672 * ...
673 * - per patch attribute 0 of patch 0
674 * - per patch attribute 0 of patch 1
675 * ...
676 *
677 * Note that every attribute has 4 components.
678 */
679 static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
680 LLVMValueRef vertex_index,
681 LLVMValueRef param_index)
682 {
683 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
684 LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
685 LLVMValueRef param_stride, constant16;
686
687 vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
688 num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
689 total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
690 num_patches, "");
691
692 constant16 = lp_build_const_int32(gallivm, 16);
693 if (vertex_index) {
694 base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
695 vertices_per_patch, "");
696
697 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
698 vertex_index, "");
699
700 param_stride = total_vertices;
701 } else {
702 base_addr = get_rel_patch_id(ctx);
703 param_stride = num_patches;
704 }
705
706 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
707 LLVMBuildMul(gallivm->builder, param_index,
708 param_stride, ""), "");
709
710 base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");
711
712 if (!vertex_index) {
713 LLVMValueRef patch_data_offset =
714 unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);
715
716 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
717 patch_data_offset, "");
718 }
719 return base_addr;
720 }
721
722 static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
723 struct si_shader_context *ctx,
724 const struct tgsi_full_dst_register *dst,
725 const struct tgsi_full_src_register *src)
726 {
727 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
728 struct tgsi_shader_info *info = &ctx->shader->selector->info;
729 ubyte *name, *index, *array_first;
730 struct tgsi_full_src_register reg;
731 LLVMValueRef vertex_index = NULL;
732 LLVMValueRef param_index = NULL;
733 unsigned param_index_base, param_base;
734
735 reg = src ? *src : tgsi_full_src_register_from_dst(dst);
736
737 if (reg.Register.Dimension) {
738
739 if (reg.Dimension.Indirect)
740 vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
741 reg.Dimension.Index);
742 else
743 vertex_index = lp_build_const_int32(gallivm,
744 reg.Dimension.Index);
745 }
746
747 /* Get information about the register. */
748 if (reg.Register.File == TGSI_FILE_INPUT) {
749 name = info->input_semantic_name;
750 index = info->input_semantic_index;
751 array_first = info->input_array_first;
752 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
753 name = info->output_semantic_name;
754 index = info->output_semantic_index;
755 array_first = info->output_array_first;
756 } else {
757 assert(0);
758 return NULL;
759 }
760
761 if (reg.Register.Indirect) {
762 if (reg.Indirect.ArrayID)
763 param_base = array_first[reg.Indirect.ArrayID];
764 else
765 param_base = reg.Register.Index;
766
767 param_index = get_indirect_index(ctx, &reg.Indirect,
768 reg.Register.Index - param_base);
769
770 } else {
771 param_base = reg.Register.Index;
772 param_index = lp_build_const_int32(gallivm, 0);
773 }
774
775 param_index_base = si_shader_io_get_unique_index(name[param_base],
776 index[param_base]);
777
778 param_index = LLVMBuildAdd(gallivm->builder, param_index,
779 lp_build_const_int32(gallivm, param_index_base),
780 "");
781
782 return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
783 }
784
785 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
786 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
787 * or v4i32 (num_channels=3,4). */
788 static void build_tbuffer_store(struct si_shader_context *ctx,
789 LLVMValueRef rsrc,
790 LLVMValueRef vdata,
791 unsigned num_channels,
792 LLVMValueRef vaddr,
793 LLVMValueRef soffset,
794 unsigned inst_offset,
795 unsigned dfmt,
796 unsigned nfmt,
797 unsigned offen,
798 unsigned idxen,
799 unsigned glc,
800 unsigned slc,
801 unsigned tfe)
802 {
803 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
804 LLVMValueRef args[] = {
805 rsrc,
806 vdata,
807 LLVMConstInt(ctx->i32, num_channels, 0),
808 vaddr,
809 soffset,
810 LLVMConstInt(ctx->i32, inst_offset, 0),
811 LLVMConstInt(ctx->i32, dfmt, 0),
812 LLVMConstInt(ctx->i32, nfmt, 0),
813 LLVMConstInt(ctx->i32, offen, 0),
814 LLVMConstInt(ctx->i32, idxen, 0),
815 LLVMConstInt(ctx->i32, glc, 0),
816 LLVMConstInt(ctx->i32, slc, 0),
817 LLVMConstInt(ctx->i32, tfe, 0)
818 };
819
820 /* The instruction offset field has 12 bits */
821 assert(offen || inst_offset < (1 << 12));
822
823 /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
824 unsigned func = CLAMP(num_channels, 1, 3) - 1;
825 const char *types[] = {"i32", "v2i32", "v4i32"};
826 char name[256];
827 snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
828
829 lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
830 args, ARRAY_SIZE(args), 0);
831 }
832
833 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
834 LLVMValueRef rsrc,
835 LLVMValueRef vdata,
836 unsigned num_channels,
837 LLVMValueRef vaddr,
838 LLVMValueRef soffset,
839 unsigned inst_offset)
840 {
841 static unsigned dfmt[] = {
842 V_008F0C_BUF_DATA_FORMAT_32,
843 V_008F0C_BUF_DATA_FORMAT_32_32,
844 V_008F0C_BUF_DATA_FORMAT_32_32_32,
845 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
846 };
847 assert(num_channels >= 1 && num_channels <= 4);
848
849 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
850 inst_offset, dfmt[num_channels-1],
851 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
852 }
853
854 static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
855 LLVMValueRef rsrc,
856 int num_channels,
857 LLVMValueRef vindex,
858 LLVMValueRef voffset,
859 LLVMValueRef soffset,
860 unsigned inst_offset,
861 unsigned glc,
862 unsigned slc)
863 {
864 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
865 unsigned func = CLAMP(num_channels, 1, 3) - 1;
866
867 if (HAVE_LLVM >= 0x309) {
868 LLVMValueRef args[] = {
869 LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
870 vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
871 LLVMConstInt(ctx->i32, inst_offset, 0),
872 LLVMConstInt(ctx->i1, glc, 0),
873 LLVMConstInt(ctx->i1, slc, 0)
874 };
875
876 LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
877 ctx->v4f32};
878 const char *type_names[] = {"f32", "v2f32", "v4f32"};
879 char name[256];
880
881 if (voffset) {
882 args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
883 "");
884 }
885
886 if (soffset) {
887 args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
888 "");
889 }
890
891 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
892 type_names[func]);
893
894 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
895 ARRAY_SIZE(args), LLVMReadOnlyAttribute);
896 } else {
897 LLVMValueRef args[] = {
898 LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
899 voffset ? voffset : vindex,
900 soffset,
901 LLVMConstInt(ctx->i32, inst_offset, 0),
902 LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
903 LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
904 LLVMConstInt(ctx->i32, glc, 0),
905 LLVMConstInt(ctx->i32, slc, 0),
906 LLVMConstInt(ctx->i32, 0, 0), // TFE
907 };
908
909 LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
910 ctx->v4i32};
911 const char *type_names[] = {"i32", "v2i32", "v4i32"};
912 const char *arg_type = "i32";
913 char name[256];
914
915 if (voffset && vindex) {
916 LLVMValueRef vaddr[] = {vindex, voffset};
917
918 arg_type = "v2i32";
919 args[1] = lp_build_gather_values(gallivm, vaddr, 2);
920 }
921
922 snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
923 type_names[func], arg_type);
924
925 return lp_build_intrinsic(gallivm->builder, name, types[func], args,
926 ARRAY_SIZE(args), LLVMReadOnlyAttribute);
927 }
928 }
929
930 static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
931 enum tgsi_opcode_type type, unsigned swizzle,
932 LLVMValueRef buffer, LLVMValueRef offset,
933 LLVMValueRef base)
934 {
935 struct si_shader_context *ctx = si_shader_context(bld_base);
936 struct gallivm_state *gallivm = bld_base->base.gallivm;
937 LLVMValueRef value, value2;
938 LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
939 LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);
940
941 if (swizzle == ~0) {
942 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
943 0, 1, 0);
944
945 return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
946 }
947
948 if (!tgsi_type_is_64bit(type)) {
949 value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
950 0, 1, 0);
951
952 value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
953 return LLVMBuildExtractElement(gallivm->builder, value,
954 lp_build_const_int32(gallivm, swizzle), "");
955 }
956
957 value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
958 swizzle * 4, 1, 0);
959
960 value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
961 swizzle * 4 + 4, 1, 0);
962
963 return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
964 }
965
966 /**
967 * Load from LDS.
968 *
969 * \param type output value type
970 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
971 * \param dw_addr address in dwords
972 */
973 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
974 enum tgsi_opcode_type type, unsigned swizzle,
975 LLVMValueRef dw_addr)
976 {
977 struct si_shader_context *ctx = si_shader_context(bld_base);
978 struct gallivm_state *gallivm = bld_base->base.gallivm;
979 LLVMValueRef value;
980
981 if (swizzle == ~0) {
982 LLVMValueRef values[TGSI_NUM_CHANNELS];
983
984 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
985 values[chan] = lds_load(bld_base, type, chan, dw_addr);
986
987 return lp_build_gather_values(bld_base->base.gallivm, values,
988 TGSI_NUM_CHANNELS);
989 }
990
991 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
992 lp_build_const_int32(gallivm, swizzle));
993
994 value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
995 if (tgsi_type_is_64bit(type)) {
996 LLVMValueRef value2;
997 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
998 lp_build_const_int32(gallivm, swizzle + 1));
999 value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
1000 return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
1001 }
1002
1003 return LLVMBuildBitCast(gallivm->builder, value,
1004 tgsi2llvmtype(bld_base, type), "");
1005 }
1006
1007 /**
1008 * Store to LDS.
1009 *
1010 * \param swizzle offset (typically 0..3)
1011 * \param dw_addr address in dwords
1012 * \param value value to store
1013 */
1014 static void lds_store(struct lp_build_tgsi_context *bld_base,
1015 unsigned swizzle, LLVMValueRef dw_addr,
1016 LLVMValueRef value)
1017 {
1018 struct si_shader_context *ctx = si_shader_context(bld_base);
1019 struct gallivm_state *gallivm = bld_base->base.gallivm;
1020
1021 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1022 lp_build_const_int32(gallivm, swizzle));
1023
1024 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1025 build_indexed_store(ctx, ctx->lds,
1026 dw_addr, value);
1027 }
1028
1029 static LLVMValueRef fetch_input_tcs(
1030 struct lp_build_tgsi_context *bld_base,
1031 const struct tgsi_full_src_register *reg,
1032 enum tgsi_opcode_type type, unsigned swizzle)
1033 {
1034 struct si_shader_context *ctx = si_shader_context(bld_base);
1035 LLVMValueRef dw_addr, stride;
1036
1037 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1038 dw_addr = get_tcs_in_current_patch_offset(ctx);
1039 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1040
1041 return lds_load(bld_base, type, swizzle, dw_addr);
1042 }
1043
1044 static LLVMValueRef fetch_output_tcs(
1045 struct lp_build_tgsi_context *bld_base,
1046 const struct tgsi_full_src_register *reg,
1047 enum tgsi_opcode_type type, unsigned swizzle)
1048 {
1049 struct si_shader_context *ctx = si_shader_context(bld_base);
1050 LLVMValueRef dw_addr, stride;
1051
1052 if (reg->Register.Dimension) {
1053 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1054 dw_addr = get_tcs_out_current_patch_offset(ctx);
1055 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1056 } else {
1057 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1058 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1059 }
1060
1061 return lds_load(bld_base, type, swizzle, dw_addr);
1062 }
1063
1064 static LLVMValueRef fetch_input_tes(
1065 struct lp_build_tgsi_context *bld_base,
1066 const struct tgsi_full_src_register *reg,
1067 enum tgsi_opcode_type type, unsigned swizzle)
1068 {
1069 struct si_shader_context *ctx = si_shader_context(bld_base);
1070 struct gallivm_state *gallivm = bld_base->base.gallivm;
1071 LLVMValueRef rw_buffers, buffer, base, addr;
1072
1073 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1074 SI_PARAM_RW_BUFFERS);
1075 buffer = build_indexed_load_const(ctx, rw_buffers,
1076 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1077
1078 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1079 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1080
1081 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1082 }
1083
1084 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
1085 const struct tgsi_full_instruction *inst,
1086 const struct tgsi_opcode_info *info,
1087 LLVMValueRef dst[4])
1088 {
1089 struct si_shader_context *ctx = si_shader_context(bld_base);
1090 struct gallivm_state *gallivm = bld_base->base.gallivm;
1091 const struct tgsi_full_dst_register *reg = &inst->Dst[0];
1092 unsigned chan_index;
1093 LLVMValueRef dw_addr, stride;
1094 LLVMValueRef rw_buffers, buffer, base, buf_addr;
1095 LLVMValueRef values[4];
1096
1097 /* Only handle per-patch and per-vertex outputs here.
1098 * Vectors will be lowered to scalars and this function will be called again.
1099 */
1100 if (reg->Register.File != TGSI_FILE_OUTPUT ||
1101 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
1102 radeon_llvm_emit_store(bld_base, inst, info, dst);
1103 return;
1104 }
1105
1106 if (reg->Register.Dimension) {
1107 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1108 dw_addr = get_tcs_out_current_patch_offset(ctx);
1109 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
1110 } else {
1111 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1112 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
1113 }
1114
1115 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1116 SI_PARAM_RW_BUFFERS);
1117 buffer = build_indexed_load_const(ctx, rw_buffers,
1118 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1119
1120 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1121 buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);
1122
1123
1124 TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
1125 LLVMValueRef value = dst[chan_index];
1126
1127 if (inst->Instruction.Saturate)
1128 value = radeon_llvm_saturate(bld_base, value);
1129
1130 lds_store(bld_base, chan_index, dw_addr, value);
1131
1132 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1133 values[chan_index] = value;
1134
1135 if (inst->Dst[0].Register.WriteMask != 0xF) {
1136 build_tbuffer_store_dwords(ctx, buffer, value, 1,
1137 buf_addr, base,
1138 4 * chan_index);
1139 }
1140 }
1141
1142 if (inst->Dst[0].Register.WriteMask == 0xF) {
1143 LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
1144 values, 4);
1145 build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
1146 base, 0);
1147 }
1148 }
1149
1150 static LLVMValueRef fetch_input_gs(
1151 struct lp_build_tgsi_context *bld_base,
1152 const struct tgsi_full_src_register *reg,
1153 enum tgsi_opcode_type type,
1154 unsigned swizzle)
1155 {
1156 struct lp_build_context *base = &bld_base->base;
1157 struct si_shader_context *ctx = si_shader_context(bld_base);
1158 struct si_shader *shader = ctx->shader;
1159 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1160 struct gallivm_state *gallivm = base->gallivm;
1161 LLVMValueRef vtx_offset;
1162 LLVMValueRef args[9];
1163 unsigned vtx_offset_param;
1164 struct tgsi_shader_info *info = &shader->selector->info;
1165 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
1166 unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
1167 unsigned param;
1168 LLVMValueRef value;
1169
1170 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
1171 return get_primitive_id(bld_base, swizzle);
1172
1173 if (!reg->Register.Dimension)
1174 return NULL;
1175
1176 if (swizzle == ~0) {
1177 LLVMValueRef values[TGSI_NUM_CHANNELS];
1178 unsigned chan;
1179 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1180 values[chan] = fetch_input_gs(bld_base, reg, type, chan);
1181 }
1182 return lp_build_gather_values(bld_base->base.gallivm, values,
1183 TGSI_NUM_CHANNELS);
1184 }
1185
1186 /* Get the vertex offset parameter */
1187 vtx_offset_param = reg->Dimension.Index;
1188 if (vtx_offset_param < 2) {
1189 vtx_offset_param += SI_PARAM_VTX0_OFFSET;
1190 } else {
1191 assert(vtx_offset_param < 6);
1192 vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
1193 }
1194 vtx_offset = lp_build_mul_imm(uint,
1195 LLVMGetParam(ctx->radeon_bld.main_fn,
1196 vtx_offset_param),
1197 4);
1198
1199 param = si_shader_io_get_unique_index(semantic_name, semantic_index);
1200 args[0] = ctx->esgs_ring;
1201 args[1] = vtx_offset;
1202 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
1203 args[3] = uint->zero;
1204 args[4] = uint->one; /* OFFEN */
1205 args[5] = uint->zero; /* IDXEN */
1206 args[6] = uint->one; /* GLC */
1207 args[7] = uint->zero; /* SLC */
1208 args[8] = uint->zero; /* TFE */
1209
1210 value = lp_build_intrinsic(gallivm->builder,
1211 "llvm.SI.buffer.load.dword.i32.i32",
1212 ctx->i32, args, 9,
1213 LLVMReadOnlyAttribute);
1214 if (tgsi_type_is_64bit(type)) {
1215 LLVMValueRef value2;
1216 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
1217 value2 = lp_build_intrinsic(gallivm->builder,
1218 "llvm.SI.buffer.load.dword.i32.i32",
1219 ctx->i32, args, 9,
1220 LLVMReadOnlyAttribute);
1221 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1222 value, value2);
1223 }
1224 return LLVMBuildBitCast(gallivm->builder,
1225 value,
1226 tgsi2llvmtype(bld_base, type), "");
1227 }
1228
1229 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1230 {
1231 switch (interpolate) {
1232 case TGSI_INTERPOLATE_CONSTANT:
1233 return 0;
1234
1235 case TGSI_INTERPOLATE_LINEAR:
1236 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1237 return SI_PARAM_LINEAR_SAMPLE;
1238 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1239 return SI_PARAM_LINEAR_CENTROID;
1240 else
1241 return SI_PARAM_LINEAR_CENTER;
1242 break;
1243 case TGSI_INTERPOLATE_COLOR:
1244 case TGSI_INTERPOLATE_PERSPECTIVE:
1245 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1246 return SI_PARAM_PERSP_SAMPLE;
1247 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1248 return SI_PARAM_PERSP_CENTROID;
1249 else
1250 return SI_PARAM_PERSP_CENTER;
1251 break;
1252 default:
1253 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1254 return -1;
1255 }
1256 }
1257
1258 /* This shouldn't be used by explicit INTERP opcodes. */
1259 static unsigned select_interp_param(struct si_shader_context *ctx,
1260 unsigned param)
1261 {
1262 if (!ctx->is_monolithic)
1263 return param;
1264
1265 if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
1266 switch (param) {
1267 case SI_PARAM_PERSP_CENTROID:
1268 case SI_PARAM_PERSP_CENTER:
1269 return SI_PARAM_PERSP_SAMPLE;
1270 }
1271 }
1272 if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
1273 switch (param) {
1274 case SI_PARAM_LINEAR_CENTROID:
1275 case SI_PARAM_LINEAR_CENTER:
1276 return SI_PARAM_LINEAR_SAMPLE;
1277 }
1278 }
1279 if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
1280 switch (param) {
1281 case SI_PARAM_PERSP_CENTROID:
1282 case SI_PARAM_PERSP_SAMPLE:
1283 return SI_PARAM_PERSP_CENTER;
1284 }
1285 }
1286 if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
1287 switch (param) {
1288 case SI_PARAM_LINEAR_CENTROID:
1289 case SI_PARAM_LINEAR_SAMPLE:
1290 return SI_PARAM_LINEAR_CENTER;
1291 }
1292 }
1293
1294 return param;
1295 }
1296
1297 /**
1298 * Interpolate a fragment shader input.
1299 *
1300 * @param ctx context
1301 * @param input_index index of the input in hardware
1302 * @param semantic_name TGSI_SEMANTIC_*
1303 * @param semantic_index semantic index
1304 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1305 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1306 * @param interp_param interpolation weights (i,j)
1307 * @param prim_mask SI_PARAM_PRIM_MASK
1308 * @param face SI_PARAM_FRONT_FACE
1309 * @param result the return value (4 components)
1310 */
1311 static void interp_fs_input(struct si_shader_context *ctx,
1312 unsigned input_index,
1313 unsigned semantic_name,
1314 unsigned semantic_index,
1315 unsigned num_interp_inputs,
1316 unsigned colors_read_mask,
1317 LLVMValueRef interp_param,
1318 LLVMValueRef prim_mask,
1319 LLVMValueRef face,
1320 LLVMValueRef result[4])
1321 {
1322 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1323 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1324 struct gallivm_state *gallivm = base->gallivm;
1325 const char *intr_name;
1326 LLVMValueRef attr_number;
1327
1328 unsigned chan;
1329
1330 attr_number = lp_build_const_int32(gallivm, input_index);
1331
1332 /* fs.constant returns the param from the middle vertex, so it's not
1333 * really useful for flat shading. It's meant to be used for custom
1334 * interpolation (but the intrinsic can't fetch from the other two
1335 * vertices).
1336 *
1337 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1338 * to do the right thing. The only reason we use fs.constant is that
1339 * fs.interp cannot be used on integers, because they can be equal
1340 * to NaN.
1341 */
1342 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1343
1344 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1345 ctx->shader->key.ps.prolog.color_two_side) {
1346 LLVMValueRef args[4];
1347 LLVMValueRef is_face_positive;
1348 LLVMValueRef back_attr_number;
1349
1350 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1351 * otherwise it's at offset "num_inputs".
1352 */
1353 unsigned back_attr_offset = num_interp_inputs;
1354 if (semantic_index == 1 && colors_read_mask & 0xf)
1355 back_attr_offset += 1;
1356
1357 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1358
1359 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1360 face, uint->zero, "");
1361
1362 args[2] = prim_mask;
1363 args[3] = interp_param;
1364 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1365 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1366 LLVMValueRef front, back;
1367
1368 args[0] = llvm_chan;
1369 args[1] = attr_number;
1370 front = lp_build_intrinsic(gallivm->builder, intr_name,
1371 ctx->f32, args, args[3] ? 4 : 3,
1372 LLVMReadNoneAttribute);
1373
1374 args[1] = back_attr_number;
1375 back = lp_build_intrinsic(gallivm->builder, intr_name,
1376 ctx->f32, args, args[3] ? 4 : 3,
1377 LLVMReadNoneAttribute);
1378
1379 result[chan] = LLVMBuildSelect(gallivm->builder,
1380 is_face_positive,
1381 front,
1382 back,
1383 "");
1384 }
1385 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1386 LLVMValueRef args[4];
1387
1388 args[0] = uint->zero;
1389 args[1] = attr_number;
1390 args[2] = prim_mask;
1391 args[3] = interp_param;
1392 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1393 ctx->f32, args, args[3] ? 4 : 3,
1394 LLVMReadNoneAttribute);
1395 result[1] =
1396 result[2] = lp_build_const_float(gallivm, 0.0f);
1397 result[3] = lp_build_const_float(gallivm, 1.0f);
1398 } else {
1399 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1400 LLVMValueRef args[4];
1401 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1402
1403 args[0] = llvm_chan;
1404 args[1] = attr_number;
1405 args[2] = prim_mask;
1406 args[3] = interp_param;
1407 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1408 ctx->f32, args, args[3] ? 4 : 3,
1409 LLVMReadNoneAttribute);
1410 }
1411 }
1412 }
1413
1414 /* LLVMGetParam with bc_optimize resolved. */
1415 static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
1416 int interp_param_idx)
1417 {
1418 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1419 LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
1420 LLVMValueRef param = NULL;
1421
1422 /* Handle PRIM_MASK[31] (bc_optimize). */
1423 if (ctx->is_monolithic &&
1424 ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1425 interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
1426 (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1427 interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
1428 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
1429 * The hw doesn't compute CENTROID if the whole wave only
1430 * contains fully-covered quads.
1431 */
1432 LLVMValueRef bc_optimize =
1433 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
1434 bc_optimize = LLVMBuildLShr(builder,
1435 bc_optimize,
1436 LLVMConstInt(ctx->i32, 31, 0), "");
1437 bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");
1438
1439 if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
1440 interp_param_idx == SI_PARAM_PERSP_CENTROID) {
1441 param = LLVMBuildSelect(builder, bc_optimize,
1442 LLVMGetParam(main_fn,
1443 SI_PARAM_PERSP_CENTER),
1444 LLVMGetParam(main_fn,
1445 SI_PARAM_PERSP_CENTROID),
1446 "");
1447 }
1448 if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
1449 interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
1450 param = LLVMBuildSelect(builder, bc_optimize,
1451 LLVMGetParam(main_fn,
1452 SI_PARAM_LINEAR_CENTER),
1453 LLVMGetParam(main_fn,
1454 SI_PARAM_LINEAR_CENTROID),
1455 "");
1456 }
1457 }
1458
1459 if (!param)
1460 param = LLVMGetParam(main_fn, interp_param_idx);
1461 return param;
1462 }
1463
1464 static void declare_input_fs(
1465 struct radeon_llvm_context *radeon_bld,
1466 unsigned input_index,
1467 const struct tgsi_full_declaration *decl)
1468 {
1469 struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
1470 struct si_shader_context *ctx =
1471 si_shader_context(&radeon_bld->soa.bld_base);
1472 struct si_shader *shader = ctx->shader;
1473 LLVMValueRef main_fn = radeon_bld->main_fn;
1474 LLVMValueRef interp_param = NULL;
1475 int interp_param_idx;
1476
1477 /* Get colors from input VGPRs (set by the prolog). */
1478 if (!ctx->is_monolithic &&
1479 decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1480 unsigned i = decl->Semantic.Index;
1481 unsigned colors_read = shader->selector->info.colors_read;
1482 unsigned mask = colors_read >> (i * 4);
1483 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1484 (i ? util_bitcount(colors_read & 0xf) : 0);
1485
1486 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
1487 mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1488 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
1489 mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1490 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
1491 mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1492 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
1493 mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1494 return;
1495 }
1496
1497 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1498 decl->Interp.Location);
1499 if (interp_param_idx == -1)
1500 return;
1501 else if (interp_param_idx) {
1502 interp_param_idx = select_interp_param(ctx,
1503 interp_param_idx);
1504 interp_param = get_interp_param(ctx, interp_param_idx);
1505 }
1506
1507 if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
1508 decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
1509 ctx->shader->key.ps.prolog.flatshade_colors)
1510 interp_param = NULL; /* load the constant color */
1511
1512 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1513 decl->Semantic.Index, shader->selector->info.num_inputs,
1514 shader->selector->info.colors_read, interp_param,
1515 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1516 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1517 &radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
1518 }
1519
1520 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1521 {
1522 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1523 SI_PARAM_ANCILLARY, 8, 4);
1524 }
1525
1526 /**
1527 * Set range metadata on an instruction. This can only be used on load and
1528 * call instructions. If you know an instruction can only produce the values
1529 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1530 * \p lo is the minimum value inclusive.
1531 * \p hi is the maximum value exclusive.
1532 */
1533 static void set_range_metadata(struct si_shader_context *ctx,
1534 LLVMValueRef value, unsigned lo, unsigned hi)
1535 {
1536 LLVMValueRef range_md, md_args[2];
1537 LLVMTypeRef type = LLVMTypeOf(value);
1538 LLVMContextRef context = LLVMGetTypeContext(type);
1539
1540 md_args[0] = LLVMConstInt(type, lo, false);
1541 md_args[1] = LLVMConstInt(type, hi, false);
1542 range_md = LLVMMDNodeInContext(context, md_args, 2);
1543 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1544 }
1545
1546 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1547 {
1548 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1549 LLVMValueRef tid;
1550
1551 if (HAVE_LLVM < 0x0308) {
1552 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1553 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1554 } else {
1555 LLVMValueRef tid_args[2];
1556 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1557 tid_args[1] = lp_build_const_int32(gallivm, 0);
1558 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1559 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1560 tid_args, 2, LLVMReadNoneAttribute);
1561
1562 tid = lp_build_intrinsic(gallivm->builder,
1563 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1564 tid_args, 2, LLVMReadNoneAttribute);
1565 }
1566 set_range_metadata(ctx, tid, 0, 64);
1567 return tid;
1568 }
1569
1570 /**
1571 * Load a dword from a constant buffer.
1572 */
1573 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1574 LLVMValueRef resource,
1575 LLVMValueRef offset)
1576 {
1577 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1578 LLVMValueRef args[2] = {resource, offset};
1579
1580 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1581 LLVMReadNoneAttribute);
1582 }
1583
1584 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1585 {
1586 struct si_shader_context *ctx =
1587 si_shader_context(&radeon_bld->soa.bld_base);
1588 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1589 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1590 LLVMBuilderRef builder = gallivm->builder;
1591 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1592 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1593 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1594
1595 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1596 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1597 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1598
1599 LLVMValueRef pos[4] = {
1600 buffer_load_const(ctx, resource, offset0),
1601 buffer_load_const(ctx, resource, offset1),
1602 lp_build_const_float(gallivm, 0),
1603 lp_build_const_float(gallivm, 0)
1604 };
1605
1606 return lp_build_gather_values(gallivm, pos, 4);
1607 }
1608
1609 static void declare_system_value(
1610 struct radeon_llvm_context *radeon_bld,
1611 unsigned index,
1612 const struct tgsi_full_declaration *decl)
1613 {
1614 struct si_shader_context *ctx =
1615 si_shader_context(&radeon_bld->soa.bld_base);
1616 struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
1617 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1618 LLVMValueRef value = 0;
1619
1620 switch (decl->Semantic.Name) {
1621 case TGSI_SEMANTIC_INSTANCEID:
1622 value = LLVMGetParam(radeon_bld->main_fn,
1623 ctx->param_instance_id);
1624 break;
1625
1626 case TGSI_SEMANTIC_VERTEXID:
1627 value = LLVMBuildAdd(gallivm->builder,
1628 LLVMGetParam(radeon_bld->main_fn,
1629 ctx->param_vertex_id),
1630 LLVMGetParam(radeon_bld->main_fn,
1631 SI_PARAM_BASE_VERTEX), "");
1632 break;
1633
1634 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1635 value = LLVMGetParam(radeon_bld->main_fn,
1636 ctx->param_vertex_id);
1637 break;
1638
1639 case TGSI_SEMANTIC_BASEVERTEX:
1640 value = LLVMGetParam(radeon_bld->main_fn,
1641 SI_PARAM_BASE_VERTEX);
1642 break;
1643
1644 case TGSI_SEMANTIC_BASEINSTANCE:
1645 value = LLVMGetParam(radeon_bld->main_fn,
1646 SI_PARAM_START_INSTANCE);
1647 break;
1648
1649 case TGSI_SEMANTIC_DRAWID:
1650 value = LLVMGetParam(radeon_bld->main_fn,
1651 SI_PARAM_DRAWID);
1652 break;
1653
1654 case TGSI_SEMANTIC_INVOCATIONID:
1655 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1656 value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
1657 else if (ctx->type == PIPE_SHADER_GEOMETRY)
1658 value = LLVMGetParam(radeon_bld->main_fn,
1659 SI_PARAM_GS_INSTANCE_ID);
1660 else
1661 assert(!"INVOCATIONID not implemented");
1662 break;
1663
1664 case TGSI_SEMANTIC_POSITION:
1665 {
1666 LLVMValueRef pos[4] = {
1667 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1668 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1669 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
1670 lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
1671 LLVMGetParam(radeon_bld->main_fn,
1672 SI_PARAM_POS_W_FLOAT)),
1673 };
1674 value = lp_build_gather_values(gallivm, pos, 4);
1675 break;
1676 }
1677
1678 case TGSI_SEMANTIC_FACE:
1679 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
1680 break;
1681
1682 case TGSI_SEMANTIC_SAMPLEID:
1683 value = get_sample_id(radeon_bld);
1684 break;
1685
1686 case TGSI_SEMANTIC_SAMPLEPOS: {
1687 LLVMValueRef pos[4] = {
1688 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1689 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1690 lp_build_const_float(gallivm, 0),
1691 lp_build_const_float(gallivm, 0)
1692 };
1693 pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
1694 TGSI_OPCODE_FRC, pos[0]);
1695 pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
1696 TGSI_OPCODE_FRC, pos[1]);
1697 value = lp_build_gather_values(gallivm, pos, 4);
1698 break;
1699 }
1700
1701 case TGSI_SEMANTIC_SAMPLEMASK:
1702 /* This can only occur with the OpenGL Core profile, which
1703 * doesn't support smoothing.
1704 */
1705 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1706 break;
1707
1708 case TGSI_SEMANTIC_TESSCOORD:
1709 {
1710 LLVMValueRef coord[4] = {
1711 LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
1712 LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
1713 bld->zero,
1714 bld->zero
1715 };
1716
1717 /* For triangles, the vector should be (u, v, 1-u-v). */
1718 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1719 PIPE_PRIM_TRIANGLES)
1720 coord[2] = lp_build_sub(bld, bld->one,
1721 lp_build_add(bld, coord[0], coord[1]));
1722
1723 value = lp_build_gather_values(gallivm, coord, 4);
1724 break;
1725 }
1726
1727 case TGSI_SEMANTIC_VERTICESIN:
1728 if (ctx->type == PIPE_SHADER_TESS_CTRL)
1729 value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
1730 else if (ctx->type == PIPE_SHADER_TESS_EVAL)
1731 value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7);
1732 else
1733 assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
1734 break;
1735
1736 case TGSI_SEMANTIC_TESSINNER:
1737 case TGSI_SEMANTIC_TESSOUTER:
1738 {
1739 LLVMValueRef rw_buffers, buffer, base, addr;
1740 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1741
1742 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1743 SI_PARAM_RW_BUFFERS);
1744 buffer = build_indexed_load_const(ctx, rw_buffers,
1745 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1746
1747 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1748 addr = get_tcs_tes_buffer_address(ctx, NULL,
1749 lp_build_const_int32(gallivm, param));
1750
1751 value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
1752 ~0, buffer, base, addr);
1753
1754 break;
1755 }
1756
1757 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1758 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1759 {
1760 LLVMValueRef buf, slot, val[4];
1761 int i, offset;
1762
1763 slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
1764 buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1765 buf = build_indexed_load_const(ctx, buf, slot);
1766 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1767
1768 for (i = 0; i < 4; i++)
1769 val[i] = buffer_load_const(ctx, buf,
1770 lp_build_const_int32(gallivm, (offset + i) * 4));
1771 value = lp_build_gather_values(gallivm, val, 4);
1772 break;
1773 }
1774
1775 case TGSI_SEMANTIC_PRIMID:
1776 value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
1777 break;
1778
1779 case TGSI_SEMANTIC_GRID_SIZE:
1780 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
1781 break;
1782
1783 case TGSI_SEMANTIC_BLOCK_SIZE:
1784 {
1785 LLVMValueRef values[3];
1786 unsigned i;
1787 unsigned *properties = ctx->shader->selector->info.properties;
1788 unsigned sizes[3] = {
1789 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1790 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1791 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1792 };
1793
1794 for (i = 0; i < 3; ++i)
1795 values[i] = lp_build_const_int32(gallivm, sizes[i]);
1796
1797 value = lp_build_gather_values(gallivm, values, 3);
1798 break;
1799 }
1800
1801 case TGSI_SEMANTIC_BLOCK_ID:
1802 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
1803 break;
1804
1805 case TGSI_SEMANTIC_THREAD_ID:
1806 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
1807 break;
1808
1809 #if HAVE_LLVM >= 0x0309
1810 case TGSI_SEMANTIC_HELPER_INVOCATION:
1811 value = lp_build_intrinsic(gallivm->builder,
1812 "llvm.amdgcn.ps.live",
1813 ctx->i1, NULL, 0,
1814 LLVMReadNoneAttribute);
1815 value = LLVMBuildNot(gallivm->builder, value, "");
1816 value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
1817 break;
1818 #endif
1819
1820 default:
1821 assert(!"unknown system value");
1822 return;
1823 }
1824
1825 radeon_bld->system_values[index] = value;
1826 }
1827
1828 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1829 const struct tgsi_full_declaration *decl)
1830 {
1831 struct si_shader_context *ctx =
1832 si_shader_context(&radeon_bld->soa.bld_base);
1833 struct si_shader_selector *sel = ctx->shader->selector;
1834 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1835
1836 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1837 LLVMValueRef var;
1838
1839 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1840 assert(decl->Range.First == decl->Range.Last);
1841 assert(!ctx->shared_memory);
1842
1843 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1844 LLVMArrayType(ctx->i8, sel->local_size),
1845 "compute_lds",
1846 LOCAL_ADDR_SPACE);
1847 LLVMSetAlignment(var, 4);
1848
1849 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1850 }
1851
1852 static LLVMValueRef fetch_constant(
1853 struct lp_build_tgsi_context *bld_base,
1854 const struct tgsi_full_src_register *reg,
1855 enum tgsi_opcode_type type,
1856 unsigned swizzle)
1857 {
1858 struct si_shader_context *ctx = si_shader_context(bld_base);
1859 struct lp_build_context *base = &bld_base->base;
1860 const struct tgsi_ind_register *ireg = &reg->Indirect;
1861 unsigned buf, idx;
1862
1863 LLVMValueRef addr, bufp;
1864 LLVMValueRef result;
1865
1866 if (swizzle == LP_CHAN_ALL) {
1867 unsigned chan;
1868 LLVMValueRef values[4];
1869 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1870 values[chan] = fetch_constant(bld_base, reg, type, chan);
1871
1872 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1873 }
1874
1875 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1876 idx = reg->Register.Index * 4 + swizzle;
1877
1878 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1879 LLVMValueRef c0, c1;
1880
1881 c0 = buffer_load_const(ctx, ctx->const_buffers[buf],
1882 LLVMConstInt(ctx->i32, idx * 4, 0));
1883
1884 if (!tgsi_type_is_64bit(type))
1885 return bitcast(bld_base, type, c0);
1886 else {
1887 c1 = buffer_load_const(ctx, ctx->const_buffers[buf],
1888 LLVMConstInt(ctx->i32,
1889 (idx + 1) * 4, 0));
1890 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1891 c0, c1);
1892 }
1893 }
1894
1895 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1896 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1897 LLVMValueRef index;
1898 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1899 reg->Dimension.Index,
1900 SI_NUM_CONST_BUFFERS);
1901 bufp = build_indexed_load_const(ctx, ptr, index);
1902 } else
1903 bufp = ctx->const_buffers[buf];
1904
1905 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1906 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1907 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1908 addr = lp_build_add(&bld_base->uint_bld, addr,
1909 lp_build_const_int32(base->gallivm, idx * 4));
1910
1911 result = buffer_load_const(ctx, bufp, addr);
1912
1913 if (!tgsi_type_is_64bit(type))
1914 result = bitcast(bld_base, type, result);
1915 else {
1916 LLVMValueRef addr2, result2;
1917 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1918 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1919 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1920 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1921 lp_build_const_int32(base->gallivm, idx * 4));
1922
1923 result2 = buffer_load_const(ctx, bufp, addr2);
1924
1925 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1926 result, result2);
1927 }
1928 return result;
1929 }
1930
1931 /* Upper 16 bits must be zero. */
1932 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1933 LLVMValueRef val[2])
1934 {
1935 return LLVMBuildOr(gallivm->builder, val[0],
1936 LLVMBuildShl(gallivm->builder, val[1],
1937 lp_build_const_int32(gallivm, 16),
1938 ""), "");
1939 }
1940
1941 /* Upper 16 bits are ignored and will be dropped. */
1942 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1943 LLVMValueRef val[2])
1944 {
1945 LLVMValueRef v[2] = {
1946 LLVMBuildAnd(gallivm->builder, val[0],
1947 lp_build_const_int32(gallivm, 0xffff), ""),
1948 val[1],
1949 };
1950 return si_llvm_pack_two_int16(gallivm, v);
1951 }
1952
1953 /* Initialize arguments for the shader export intrinsic */
1954 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1955 LLVMValueRef *values,
1956 unsigned target,
1957 LLVMValueRef *args)
1958 {
1959 struct si_shader_context *ctx = si_shader_context(bld_base);
1960 struct lp_build_context *uint =
1961 &ctx->radeon_bld.soa.bld_base.uint_bld;
1962 struct lp_build_context *base = &bld_base->base;
1963 struct gallivm_state *gallivm = base->gallivm;
1964 LLVMBuilderRef builder = base->gallivm->builder;
1965 LLVMValueRef val[4];
1966 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1967 unsigned chan;
1968 bool is_int8;
1969
1970 /* Default is 0xf. Adjusted below depending on the format. */
1971 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1972
1973 /* Specify whether the EXEC mask represents the valid mask */
1974 args[1] = uint->zero;
1975
1976 /* Specify whether this is the last export */
1977 args[2] = uint->zero;
1978
1979 /* Specify the target we are exporting */
1980 args[3] = lp_build_const_int32(base->gallivm, target);
1981
1982 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1983 const union si_shader_key *key = &ctx->shader->key;
1984 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1985 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1986
1987 assert(cbuf >= 0 && cbuf < 8);
1988 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1989 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1990 }
1991
1992 args[4] = uint->zero; /* COMPR flag */
1993 args[5] = base->undef;
1994 args[6] = base->undef;
1995 args[7] = base->undef;
1996 args[8] = base->undef;
1997
1998 switch (spi_shader_col_format) {
1999 case V_028714_SPI_SHADER_ZERO:
2000 args[0] = uint->zero; /* writemask */
2001 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2002 break;
2003
2004 case V_028714_SPI_SHADER_32_R:
2005 args[0] = uint->one; /* writemask */
2006 args[5] = values[0];
2007 break;
2008
2009 case V_028714_SPI_SHADER_32_GR:
2010 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
2011 args[5] = values[0];
2012 args[6] = values[1];
2013 break;
2014
2015 case V_028714_SPI_SHADER_32_AR:
2016 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
2017 args[5] = values[0];
2018 args[8] = values[3];
2019 break;
2020
2021 case V_028714_SPI_SHADER_FP16_ABGR:
2022 args[4] = uint->one; /* COMPR flag */
2023
2024 for (chan = 0; chan < 2; chan++) {
2025 LLVMValueRef pack_args[2] = {
2026 values[2 * chan],
2027 values[2 * chan + 1]
2028 };
2029 LLVMValueRef packed;
2030
2031 packed = lp_build_intrinsic(base->gallivm->builder,
2032 "llvm.SI.packf16",
2033 ctx->i32, pack_args, 2,
2034 LLVMReadNoneAttribute);
2035 args[chan + 5] =
2036 LLVMBuildBitCast(base->gallivm->builder,
2037 packed, ctx->f32, "");
2038 }
2039 break;
2040
2041 case V_028714_SPI_SHADER_UNORM16_ABGR:
2042 for (chan = 0; chan < 4; chan++) {
2043 val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
2044 val[chan] = LLVMBuildFMul(builder, val[chan],
2045 lp_build_const_float(gallivm, 65535), "");
2046 val[chan] = LLVMBuildFAdd(builder, val[chan],
2047 lp_build_const_float(gallivm, 0.5), "");
2048 val[chan] = LLVMBuildFPToUI(builder, val[chan],
2049 ctx->i32, "");
2050 }
2051
2052 args[4] = uint->one; /* COMPR flag */
2053 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2054 si_llvm_pack_two_int16(gallivm, val));
2055 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2056 si_llvm_pack_two_int16(gallivm, val+2));
2057 break;
2058
2059 case V_028714_SPI_SHADER_SNORM16_ABGR:
2060 for (chan = 0; chan < 4; chan++) {
2061 /* Clamp between [-1, 1]. */
2062 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
2063 values[chan],
2064 lp_build_const_float(gallivm, 1));
2065 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
2066 val[chan],
2067 lp_build_const_float(gallivm, -1));
2068 /* Convert to a signed integer in [-32767, 32767]. */
2069 val[chan] = LLVMBuildFMul(builder, val[chan],
2070 lp_build_const_float(gallivm, 32767), "");
2071 /* If positive, add 0.5, else add -0.5. */
2072 val[chan] = LLVMBuildFAdd(builder, val[chan],
2073 LLVMBuildSelect(builder,
2074 LLVMBuildFCmp(builder, LLVMRealOGE,
2075 val[chan], base->zero, ""),
2076 lp_build_const_float(gallivm, 0.5),
2077 lp_build_const_float(gallivm, -0.5), ""), "");
2078 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
2079 }
2080
2081 args[4] = uint->one; /* COMPR flag */
2082 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2083 si_llvm_pack_two_int32_as_int16(gallivm, val));
2084 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2085 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2086 break;
2087
2088 case V_028714_SPI_SHADER_UINT16_ABGR: {
2089 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2090 255 : 65535);
2091 /* Clamp. */
2092 for (chan = 0; chan < 4; chan++) {
2093 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2094 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
2095 val[chan], max);
2096 }
2097
2098 args[4] = uint->one; /* COMPR flag */
2099 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2100 si_llvm_pack_two_int16(gallivm, val));
2101 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2102 si_llvm_pack_two_int16(gallivm, val+2));
2103 break;
2104 }
2105
2106 case V_028714_SPI_SHADER_SINT16_ABGR: {
2107 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
2108 127 : 32767);
2109 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
2110 -128 : -32768);
2111 /* Clamp. */
2112 for (chan = 0; chan < 4; chan++) {
2113 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
2114 val[chan] = lp_build_emit_llvm_binary(bld_base,
2115 TGSI_OPCODE_IMIN,
2116 val[chan], max);
2117 val[chan] = lp_build_emit_llvm_binary(bld_base,
2118 TGSI_OPCODE_IMAX,
2119 val[chan], min);
2120 }
2121
2122 args[4] = uint->one; /* COMPR flag */
2123 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2124 si_llvm_pack_two_int32_as_int16(gallivm, val));
2125 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2126 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
2127 break;
2128 }
2129
2130 case V_028714_SPI_SHADER_32_ABGR:
2131 memcpy(&args[5], values, sizeof(values[0]) * 4);
2132 break;
2133 }
2134 }
2135
2136 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2137 LLVMValueRef alpha)
2138 {
2139 struct si_shader_context *ctx = si_shader_context(bld_base);
2140 struct gallivm_state *gallivm = bld_base->base.gallivm;
2141
2142 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2143 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2144 SI_PARAM_ALPHA_REF);
2145
2146 LLVMValueRef alpha_pass =
2147 lp_build_cmp(&bld_base->base,
2148 ctx->shader->key.ps.epilog.alpha_func,
2149 alpha, alpha_ref);
2150 LLVMValueRef arg =
2151 lp_build_select(&bld_base->base,
2152 alpha_pass,
2153 lp_build_const_float(gallivm, 1.0f),
2154 lp_build_const_float(gallivm, -1.0f));
2155
2156 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2157 ctx->voidt, &arg, 1, 0);
2158 } else {
2159 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2160 ctx->voidt, NULL, 0, 0);
2161 }
2162 }
2163
2164 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2165 LLVMValueRef alpha,
2166 unsigned samplemask_param)
2167 {
2168 struct si_shader_context *ctx = si_shader_context(bld_base);
2169 struct gallivm_state *gallivm = bld_base->base.gallivm;
2170 LLVMValueRef coverage;
2171
2172 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2173 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2174 samplemask_param);
2175 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2176
2177 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2178 ctx->i32,
2179 &coverage, 1, LLVMReadNoneAttribute);
2180
2181 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2182 ctx->f32, "");
2183
2184 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2185 lp_build_const_float(gallivm,
2186 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2187
2188 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2189 }
2190
2191 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
2192 LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
2193 {
2194 struct si_shader_context *ctx = si_shader_context(bld_base);
2195 struct lp_build_context *base = &bld_base->base;
2196 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
2197 unsigned reg_index;
2198 unsigned chan;
2199 unsigned const_chan;
2200 LLVMValueRef base_elt;
2201 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
2202 LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
2203 SI_VS_CONST_CLIP_PLANES);
2204 LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);
2205
2206 for (reg_index = 0; reg_index < 2; reg_index ++) {
2207 LLVMValueRef *args = pos[2 + reg_index];
2208
2209 args[5] =
2210 args[6] =
2211 args[7] =
2212 args[8] = lp_build_const_float(base->gallivm, 0.0f);
2213
2214 /* Compute dot products of position and user clip plane vectors */
2215 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2216 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
2217 args[1] = lp_build_const_int32(base->gallivm,
2218 ((reg_index * 4 + chan) * 4 +
2219 const_chan) * 4);
2220 base_elt = buffer_load_const(ctx, const_resource,
2221 args[1]);
2222 args[5 + chan] =
2223 lp_build_add(base, args[5 + chan],
2224 lp_build_mul(base, base_elt,
2225 out_elts[const_chan]));
2226 }
2227 }
2228
2229 args[0] = lp_build_const_int32(base->gallivm, 0xf);
2230 args[1] = uint->zero;
2231 args[2] = uint->zero;
2232 args[3] = lp_build_const_int32(base->gallivm,
2233 V_008DFC_SQ_EXP_POS + 2 + reg_index);
2234 args[4] = uint->zero;
2235 }
2236 }
2237
2238 static void si_dump_streamout(struct pipe_stream_output_info *so)
2239 {
2240 unsigned i;
2241
2242 if (so->num_outputs)
2243 fprintf(stderr, "STREAMOUT\n");
2244
2245 for (i = 0; i < so->num_outputs; i++) {
2246 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2247 so->output[i].start_component;
2248 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2249 i, so->output[i].output_buffer,
2250 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2251 so->output[i].register_index,
2252 mask & 1 ? "x" : "",
2253 mask & 2 ? "y" : "",
2254 mask & 4 ? "z" : "",
2255 mask & 8 ? "w" : "");
2256 }
2257 }
2258
2259 /* On SI, the vertex shader is responsible for writing streamout data
2260 * to buffers. */
2261 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
2262 struct si_shader_output_values *outputs,
2263 unsigned noutput)
2264 {
2265 struct pipe_stream_output_info *so = &ctx->shader->selector->so;
2266 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
2267 LLVMBuilderRef builder = gallivm->builder;
2268 int i, j;
2269 struct lp_build_if_state if_ctx;
2270 LLVMValueRef so_buffers[4];
2271 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
2272 SI_PARAM_RW_BUFFERS);
2273
2274 /* Load the descriptors. */
2275 for (i = 0; i < 4; ++i) {
2276 if (ctx->shader->selector->so.stride[i]) {
2277 LLVMValueRef offset = lp_build_const_int32(gallivm,
2278 SI_VS_STREAMOUT_BUF0 + i);
2279
2280 so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
2281 }
2282 }
2283
2284 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
2285 LLVMValueRef so_vtx_count =
2286 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
2287
2288 LLVMValueRef tid = get_thread_id(ctx);
2289
2290 /* can_emit = tid < so_vtx_count; */
2291 LLVMValueRef can_emit =
2292 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
2293
2294 LLVMValueRef stream_id =
2295 unpack_param(ctx, ctx->param_streamout_config, 24, 2);
2296
2297 /* Emit the streamout code conditionally. This actually avoids
2298 * out-of-bounds buffer access. The hw tells us via the SGPR
2299 * (so_vtx_count) which threads are allowed to emit streamout data. */
2300 lp_build_if(&if_ctx, gallivm, can_emit);
2301 {
2302 /* The buffer offset is computed as follows:
2303 * ByteOffset = streamout_offset[buffer_id]*4 +
2304 * (streamout_write_index + thread_id)*stride[buffer_id] +
2305 * attrib_offset
2306 */
2307
2308 LLVMValueRef so_write_index =
2309 LLVMGetParam(ctx->radeon_bld.main_fn,
2310 ctx->param_streamout_write_index);
2311
2312 /* Compute (streamout_write_index + thread_id). */
2313 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
2314
2315 /* Compute the write offset for each enabled buffer. */
2316 LLVMValueRef so_write_offset[4] = {};
2317 for (i = 0; i < 4; i++) {
2318 if (!so->stride[i])
2319 continue;
2320
2321 LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
2322 ctx->param_streamout_offset[i]);
2323 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
2324
2325 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
2326 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
2327 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
2328 }
2329
2330 /* Write streamout data. */
2331 for (i = 0; i < so->num_outputs; i++) {
2332 unsigned buf_idx = so->output[i].output_buffer;
2333 unsigned reg = so->output[i].register_index;
2334 unsigned start = so->output[i].start_component;
2335 unsigned num_comps = so->output[i].num_components;
2336 unsigned stream = so->output[i].stream;
2337 LLVMValueRef out[4];
2338 struct lp_build_if_state if_ctx_stream;
2339
2340 assert(num_comps && num_comps <= 4);
2341 if (!num_comps || num_comps > 4)
2342 continue;
2343
2344 if (reg >= noutput)
2345 continue;
2346
2347 /* Load the output as int. */
2348 for (j = 0; j < num_comps; j++) {
2349 out[j] = LLVMBuildBitCast(builder,
2350 outputs[reg].values[start+j],
2351 ctx->i32, "");
2352 }
2353
2354 /* Pack the output. */
2355 LLVMValueRef vdata = NULL;
2356
2357 switch (num_comps) {
2358 case 1: /* as i32 */
2359 vdata = out[0];
2360 break;
2361 case 2: /* as v2i32 */
2362 case 3: /* as v4i32 (aligned to 4) */
2363 case 4: /* as v4i32 */
2364 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
2365 for (j = 0; j < num_comps; j++) {
2366 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
2367 LLVMConstInt(ctx->i32, j, 0), "");
2368 }
2369 break;
2370 }
2371
2372 LLVMValueRef can_emit_stream =
2373 LLVMBuildICmp(builder, LLVMIntEQ,
2374 stream_id,
2375 lp_build_const_int32(gallivm, stream), "");
2376
2377 lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
2378 build_tbuffer_store_dwords(ctx, so_buffers[buf_idx],
2379 vdata, num_comps,
2380 so_write_offset[buf_idx],
2381 LLVMConstInt(ctx->i32, 0, 0),
2382 so->output[i].dst_offset*4);
2383 lp_build_endif(&if_ctx_stream);
2384 }
2385 }
2386 lp_build_endif(&if_ctx);
2387 }
2388
2389
2390 /* Generate export instructions for hardware VS shader stage */
2391 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
2392 struct si_shader_output_values *outputs,
2393 unsigned noutput)
2394 {
2395 struct si_shader_context *ctx = si_shader_context(bld_base);
2396 struct si_shader *shader = ctx->shader;
2397 struct lp_build_context *base = &bld_base->base;
2398 struct lp_build_context *uint =
2399 &ctx->radeon_bld.soa.bld_base.uint_bld;
2400 LLVMValueRef args[9];
2401 LLVMValueRef pos_args[4][9] = { { 0 } };
2402 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
2403 unsigned semantic_name, semantic_index;
2404 unsigned target;
2405 unsigned param_count = 0;
2406 unsigned pos_idx;
2407 int i;
2408
2409 if (outputs && ctx->shader->selector->so.num_outputs) {
2410 si_llvm_emit_streamout(ctx, outputs, noutput);
2411 }
2412
2413 for (i = 0; i < noutput; i++) {
2414 semantic_name = outputs[i].name;
2415 semantic_index = outputs[i].sid;
2416
2417 handle_semantic:
2418 /* Select the correct target */
2419 switch(semantic_name) {
2420 case TGSI_SEMANTIC_PSIZE:
2421 psize_value = outputs[i].values[0];
2422 continue;
2423 case TGSI_SEMANTIC_EDGEFLAG:
2424 edgeflag_value = outputs[i].values[0];
2425 continue;
2426 case TGSI_SEMANTIC_LAYER:
2427 layer_value = outputs[i].values[0];
2428 semantic_name = TGSI_SEMANTIC_GENERIC;
2429 goto handle_semantic;
2430 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2431 viewport_index_value = outputs[i].values[0];
2432 semantic_name = TGSI_SEMANTIC_GENERIC;
2433 goto handle_semantic;
2434 case TGSI_SEMANTIC_POSITION:
2435 target = V_008DFC_SQ_EXP_POS;
2436 break;
2437 case TGSI_SEMANTIC_COLOR:
2438 case TGSI_SEMANTIC_BCOLOR:
2439 target = V_008DFC_SQ_EXP_PARAM + param_count;
2440 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2441 shader->info.vs_output_param_offset[i] = param_count;
2442 param_count++;
2443 break;
2444 case TGSI_SEMANTIC_CLIPDIST:
2445 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2446 break;
2447 case TGSI_SEMANTIC_CLIPVERTEX:
2448 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2449 continue;
2450 case TGSI_SEMANTIC_PRIMID:
2451 case TGSI_SEMANTIC_FOG:
2452 case TGSI_SEMANTIC_TEXCOORD:
2453 case TGSI_SEMANTIC_GENERIC:
2454 target = V_008DFC_SQ_EXP_PARAM + param_count;
2455 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2456 shader->info.vs_output_param_offset[i] = param_count;
2457 param_count++;
2458 break;
2459 default:
2460 target = 0;
2461 fprintf(stderr,
2462 "Warning: SI unhandled vs output type:%d\n",
2463 semantic_name);
2464 }
2465
2466 si_llvm_init_export_args(bld_base, outputs[i].values, target, args);
2467
2468 if (target >= V_008DFC_SQ_EXP_POS &&
2469 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2470 memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
2471 args, sizeof(args));
2472 } else {
2473 lp_build_intrinsic(base->gallivm->builder,
2474 "llvm.SI.export", ctx->voidt,
2475 args, 9, 0);
2476 }
2477
2478 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2479 semantic_name = TGSI_SEMANTIC_GENERIC;
2480 goto handle_semantic;
2481 }
2482 }
2483
2484 shader->info.nr_param_exports = param_count;
2485
2486 /* We need to add the position output manually if it's missing. */
2487 if (!pos_args[0][0]) {
2488 pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
2489 pos_args[0][1] = uint->zero; /* EXEC mask */
2490 pos_args[0][2] = uint->zero; /* last export? */
2491 pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
2492 pos_args[0][4] = uint->zero; /* COMPR flag */
2493 pos_args[0][5] = base->zero; /* X */
2494 pos_args[0][6] = base->zero; /* Y */
2495 pos_args[0][7] = base->zero; /* Z */
2496 pos_args[0][8] = base->one; /* W */
2497 }
2498
2499 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2500 if (shader->selector->info.writes_psize ||
2501 shader->selector->info.writes_edgeflag ||
2502 shader->selector->info.writes_viewport_index ||
2503 shader->selector->info.writes_layer) {
2504 pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
2505 shader->selector->info.writes_psize |
2506 (shader->selector->info.writes_edgeflag << 1) |
2507 (shader->selector->info.writes_layer << 2) |
2508 (shader->selector->info.writes_viewport_index << 3));
2509 pos_args[1][1] = uint->zero; /* EXEC mask */
2510 pos_args[1][2] = uint->zero; /* last export? */
2511 pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
2512 pos_args[1][4] = uint->zero; /* COMPR flag */
2513 pos_args[1][5] = base->zero; /* X */
2514 pos_args[1][6] = base->zero; /* Y */
2515 pos_args[1][7] = base->zero; /* Z */
2516 pos_args[1][8] = base->zero; /* W */
2517
2518 if (shader->selector->info.writes_psize)
2519 pos_args[1][5] = psize_value;
2520
2521 if (shader->selector->info.writes_edgeflag) {
2522 /* The output is a float, but the hw expects an integer
2523 * with the first bit containing the edge flag. */
2524 edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
2525 edgeflag_value,
2526 ctx->i32, "");
2527 edgeflag_value = lp_build_min(&bld_base->int_bld,
2528 edgeflag_value,
2529 bld_base->int_bld.one);
2530
2531 /* The LLVM intrinsic expects a float. */
2532 pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
2533 edgeflag_value,
2534 ctx->f32, "");
2535 }
2536
2537 if (shader->selector->info.writes_layer)
2538 pos_args[1][7] = layer_value;
2539
2540 if (shader->selector->info.writes_viewport_index)
2541 pos_args[1][8] = viewport_index_value;
2542 }
2543
2544 for (i = 0; i < 4; i++)
2545 if (pos_args[i][0])
2546 shader->info.nr_pos_exports++;
2547
2548 pos_idx = 0;
2549 for (i = 0; i < 4; i++) {
2550 if (!pos_args[i][0])
2551 continue;
2552
2553 /* Specify the target we are exporting */
2554 pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
2555
2556 if (pos_idx == shader->info.nr_pos_exports)
2557 /* Specify that this is the last export */
2558 pos_args[i][2] = uint->one;
2559
2560 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2561 ctx->voidt, pos_args[i], 9, 0);
2562 }
2563 }
2564
2565 static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
2566 {
2567 struct si_shader_context *ctx = si_shader_context(bld_base);
2568 struct gallivm_state *gallivm = bld_base->base.gallivm;
2569 LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
2570 LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
2571 uint64_t inputs;
2572
2573 invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2574
2575 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
2576 buffer = build_indexed_load_const(ctx, rw_buffers,
2577 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
2578
2579 buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
2580
2581 lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
2582 lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
2583 lds_vertex_stride, "");
2584 lds_base = get_tcs_in_current_patch_offset(ctx);
2585 lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");
2586
2587 inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
2588 while (inputs) {
2589 unsigned i = u_bit_scan64(&inputs);
2590
2591 LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
2592 lp_build_const_int32(gallivm, 4 * i),
2593 "");
2594
2595 LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
2596 invocation_id,
2597 lp_build_const_int32(gallivm, i));
2598
2599 LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
2600 lds_ptr);
2601
2602 build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
2603 buffer_offset, 0);
2604 }
2605 }
2606
2607 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2608 LLVMValueRef rel_patch_id,
2609 LLVMValueRef invocation_id,
2610 LLVMValueRef tcs_out_current_patch_data_offset)
2611 {
2612 struct si_shader_context *ctx = si_shader_context(bld_base);
2613 struct gallivm_state *gallivm = bld_base->base.gallivm;
2614 struct si_shader *shader = ctx->shader;
2615 unsigned tess_inner_index, tess_outer_index;
2616 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2617 LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
2618 unsigned stride, outer_comps, inner_comps, i;
2619 struct lp_build_if_state if_ctx, inner_if_ctx;
2620
2621 si_llvm_emit_barrier(NULL, bld_base, NULL);
2622
2623 /* Do this only for invocation 0, because the tess levels are per-patch,
2624 * not per-vertex.
2625 *
2626 * This can't jump, because invocation 0 executes this. It should
2627 * at least mask out the loads and stores for other invocations.
2628 */
2629 lp_build_if(&if_ctx, gallivm,
2630 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2631 invocation_id, bld_base->uint_bld.zero, ""));
2632
2633 /* Determine the layout of one tess factor element in the buffer. */
2634 switch (shader->key.tcs.epilog.prim_mode) {
2635 case PIPE_PRIM_LINES:
2636 stride = 2; /* 2 dwords, 1 vec2 store */
2637 outer_comps = 2;
2638 inner_comps = 0;
2639 break;
2640 case PIPE_PRIM_TRIANGLES:
2641 stride = 4; /* 4 dwords, 1 vec4 store */
2642 outer_comps = 3;
2643 inner_comps = 1;
2644 break;
2645 case PIPE_PRIM_QUADS:
2646 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2647 outer_comps = 4;
2648 inner_comps = 2;
2649 break;
2650 default:
2651 assert(0);
2652 return;
2653 }
2654
2655 /* Load tess_inner and tess_outer from LDS.
2656 * Any invocation can write them, so we can't get them from a temporary.
2657 */
2658 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2659 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2660
2661 lds_base = tcs_out_current_patch_data_offset;
2662 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2663 lp_build_const_int32(gallivm,
2664 tess_inner_index * 4), "");
2665 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2666 lp_build_const_int32(gallivm,
2667 tess_outer_index * 4), "");
2668
2669 for (i = 0; i < outer_comps; i++)
2670 out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2671 for (i = 0; i < inner_comps; i++)
2672 out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2673
2674 /* Convert the outputs to vectors for stores. */
2675 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2676 vec1 = NULL;
2677
2678 if (stride > 4)
2679 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2680
2681 /* Get the buffer. */
2682 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
2683 SI_PARAM_RW_BUFFERS);
2684 buffer = build_indexed_load_const(ctx, rw_buffers,
2685 lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));
2686
2687 /* Get the offset. */
2688 tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
2689 SI_PARAM_TESS_FACTOR_OFFSET);
2690 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2691 lp_build_const_int32(gallivm, 4 * stride), "");
2692
2693 lp_build_if(&inner_if_ctx, gallivm,
2694 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2695 rel_patch_id, bld_base->uint_bld.zero, ""));
2696
2697 /* Store the dynamic HS control word. */
2698 build_tbuffer_store_dwords(ctx, buffer,
2699 lp_build_const_int32(gallivm, 0x80000000),
2700 1, lp_build_const_int32(gallivm, 0), tf_base, 0);
2701
2702 lp_build_endif(&inner_if_ctx);
2703
2704 /* Store the tessellation factors. */
2705 build_tbuffer_store_dwords(ctx, buffer, vec0,
2706 MIN2(stride, 4), byteoffset, tf_base, 4);
2707 if (vec1)
2708 build_tbuffer_store_dwords(ctx, buffer, vec1,
2709 stride - 4, byteoffset, tf_base, 20);
2710 lp_build_endif(&if_ctx);
2711 }
2712
2713 /* This only writes the tessellation factor levels. */
2714 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2715 {
2716 struct si_shader_context *ctx = si_shader_context(bld_base);
2717 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2718
2719 rel_patch_id = get_rel_patch_id(ctx);
2720 invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2721 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2722
2723 if (!ctx->is_monolithic) {
2724 /* Return epilog parameters from this function. */
2725 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
2726 LLVMValueRef ret = ctx->return_value;
2727 LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
2728 unsigned vgpr;
2729
2730 /* RW_BUFFERS pointer */
2731 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
2732 SI_PARAM_RW_BUFFERS);
2733 rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
2734 rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
2735 rw0 = LLVMBuildExtractElement(builder, rw_buffers,
2736 bld_base->uint_bld.zero, "");
2737 rw1 = LLVMBuildExtractElement(builder, rw_buffers,
2738 bld_base->uint_bld.one, "");
2739 ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
2740 ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
2741
2742 /* Tess factor buffer soffset is after user SGPRs. */
2743 tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2744 SI_PARAM_TESS_FACTOR_OFFSET);
2745 ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2746 SI_TCS_NUM_USER_SGPR + 1, "");
2747
2748 /* VGPRs */
2749 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2750 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2751 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2752
2753 vgpr = SI_TCS_NUM_USER_SGPR + 2;
2754 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2755 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2756 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2757 ctx->return_value = ret;
2758 return;
2759 }
2760
2761 si_copy_tcs_inputs(bld_base);
2762 si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
2763 }
2764
2765 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2766 {
2767 struct si_shader_context *ctx = si_shader_context(bld_base);
2768 struct si_shader *shader = ctx->shader;
2769 struct tgsi_shader_info *info = &shader->selector->info;
2770 struct gallivm_state *gallivm = bld_base->base.gallivm;
2771 unsigned i, chan;
2772 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2773 ctx->param_rel_auto_id);
2774 LLVMValueRef vertex_dw_stride =
2775 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2776 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2777 vertex_dw_stride, "");
2778
2779 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2780 * its inputs from it. */
2781 for (i = 0; i < info->num_outputs; i++) {
2782 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2783 unsigned name = info->output_semantic_name[i];
2784 unsigned index = info->output_semantic_index[i];
2785 int param = si_shader_io_get_unique_index(name, index);
2786 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2787 lp_build_const_int32(gallivm, param * 4), "");
2788
2789 for (chan = 0; chan < 4; chan++) {
2790 lds_store(bld_base, chan, dw_addr,
2791 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2792 }
2793 }
2794 }
2795
2796 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2797 {
2798 struct si_shader_context *ctx = si_shader_context(bld_base);
2799 struct gallivm_state *gallivm = bld_base->base.gallivm;
2800 struct si_shader *es = ctx->shader;
2801 struct tgsi_shader_info *info = &es->selector->info;
2802 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2803 ctx->param_es2gs_offset);
2804 unsigned chan;
2805 int i;
2806
2807 for (i = 0; i < info->num_outputs; i++) {
2808 LLVMValueRef *out_ptr =
2809 ctx->radeon_bld.soa.outputs[i];
2810 int param_index;
2811
2812 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2813 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2814 continue;
2815
2816 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2817 info->output_semantic_index[i]);
2818
2819 for (chan = 0; chan < 4; chan++) {
2820 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2821 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2822
2823 build_tbuffer_store(ctx,
2824 ctx->esgs_ring,
2825 out_val, 1,
2826 LLVMGetUndef(ctx->i32), soffset,
2827 (4 * param_index + chan) * 4,
2828 V_008F0C_BUF_DATA_FORMAT_32,
2829 V_008F0C_BUF_NUM_FORMAT_UINT,
2830 0, 0, 1, 1, 0);
2831 }
2832 }
2833 }
2834
2835 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2836 {
2837 struct si_shader_context *ctx = si_shader_context(bld_base);
2838 struct gallivm_state *gallivm = bld_base->base.gallivm;
2839 LLVMValueRef args[2];
2840
2841 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2842 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2843 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2844 ctx->voidt, args, 2, 0);
2845 }
2846
2847 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2848 {
2849 struct si_shader_context *ctx = si_shader_context(bld_base);
2850 struct gallivm_state *gallivm = bld_base->base.gallivm;
2851 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2852 struct si_shader_output_values *outputs = NULL;
2853 int i,j;
2854
2855 assert(!ctx->is_gs_copy_shader);
2856
2857 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2858
2859 /* Vertex color clamping.
2860 *
2861 * This uses a state constant loaded in a user data SGPR and
2862 * an IF statement is added that clamps all colors if the constant
2863 * is true.
2864 */
2865 if (ctx->type == PIPE_SHADER_VERTEX) {
2866 struct lp_build_if_state if_ctx;
2867 LLVMValueRef cond = NULL;
2868 LLVMValueRef addr, val;
2869
2870 for (i = 0; i < info->num_outputs; i++) {
2871 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2872 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2873 continue;
2874
2875 /* We've found a color. */
2876 if (!cond) {
2877 /* The state is in the first bit of the user SGPR. */
2878 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2879 SI_PARAM_VS_STATE_BITS);
2880 cond = LLVMBuildTrunc(gallivm->builder, cond,
2881 ctx->i1, "");
2882 lp_build_if(&if_ctx, gallivm, cond);
2883 }
2884
2885 for (j = 0; j < 4; j++) {
2886 addr = ctx->radeon_bld.soa.outputs[i][j];
2887 val = LLVMBuildLoad(gallivm->builder, addr, "");
2888 val = radeon_llvm_saturate(bld_base, val);
2889 LLVMBuildStore(gallivm->builder, val, addr);
2890 }
2891 }
2892
2893 if (cond)
2894 lp_build_endif(&if_ctx);
2895 }
2896
2897 for (i = 0; i < info->num_outputs; i++) {
2898 outputs[i].name = info->output_semantic_name[i];
2899 outputs[i].sid = info->output_semantic_index[i];
2900
2901 for (j = 0; j < 4; j++)
2902 outputs[i].values[j] =
2903 LLVMBuildLoad(gallivm->builder,
2904 ctx->radeon_bld.soa.outputs[i][j],
2905 "");
2906 }
2907
2908 if (ctx->is_monolithic) {
2909 /* Export PrimitiveID when PS needs it. */
2910 if (si_vs_exports_prim_id(ctx->shader)) {
2911 outputs[i].name = TGSI_SEMANTIC_PRIMID;
2912 outputs[i].sid = 0;
2913 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2914 get_primitive_id(bld_base, 0));
2915 outputs[i].values[1] = bld_base->base.undef;
2916 outputs[i].values[2] = bld_base->base.undef;
2917 outputs[i].values[3] = bld_base->base.undef;
2918 i++;
2919 }
2920 } else {
2921 /* Return the primitive ID from the LLVM function. */
2922 ctx->return_value =
2923 LLVMBuildInsertValue(gallivm->builder,
2924 ctx->return_value,
2925 bitcast(bld_base, TGSI_TYPE_FLOAT,
2926 get_primitive_id(bld_base, 0)),
2927 VS_EPILOG_PRIMID_LOC, "");
2928 }
2929
2930 si_llvm_export_vs(bld_base, outputs, i);
2931 FREE(outputs);
2932 }
2933
2934 struct si_ps_exports {
2935 unsigned num;
2936 LLVMValueRef args[10][9];
2937 };
2938
2939 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
2940 bool writes_samplemask)
2941 {
2942 if (writes_z) {
2943 /* Z needs 32 bits. */
2944 if (writes_samplemask)
2945 return V_028710_SPI_SHADER_32_ABGR;
2946 else if (writes_stencil)
2947 return V_028710_SPI_SHADER_32_GR;
2948 else
2949 return V_028710_SPI_SHADER_32_R;
2950 } else if (writes_stencil || writes_samplemask) {
2951 /* Both stencil and sample mask need only 16 bits. */
2952 return V_028710_SPI_SHADER_UINT16_ABGR;
2953 } else {
2954 return V_028710_SPI_SHADER_ZERO;
2955 }
2956 }
2957
2958 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2959 LLVMValueRef depth, LLVMValueRef stencil,
2960 LLVMValueRef samplemask, struct si_ps_exports *exp)
2961 {
2962 struct si_shader_context *ctx = si_shader_context(bld_base);
2963 struct lp_build_context *base = &bld_base->base;
2964 struct lp_build_context *uint = &bld_base->uint_bld;
2965 LLVMValueRef args[9];
2966 unsigned mask = 0;
2967 unsigned format = si_get_spi_shader_z_format(depth != NULL,
2968 stencil != NULL,
2969 samplemask != NULL);
2970
2971 assert(depth || stencil || samplemask);
2972
2973 args[1] = uint->one; /* whether the EXEC mask is valid */
2974 args[2] = uint->one; /* DONE bit */
2975
2976 /* Specify the target we are exporting */
2977 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2978
2979 args[4] = uint->zero; /* COMP flag */
2980 args[5] = base->undef; /* R, depth */
2981 args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2982 args[7] = base->undef; /* B, sample mask */
2983 args[8] = base->undef; /* A, alpha to mask */
2984
2985 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
2986 assert(!depth);
2987 args[4] = uint->one; /* COMPR flag */
2988
2989 if (stencil) {
2990 /* Stencil should be in X[23:16]. */
2991 stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
2992 stencil = LLVMBuildShl(base->gallivm->builder, stencil,
2993 LLVMConstInt(ctx->i32, 16, 0), "");
2994 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
2995 mask |= 0x3;
2996 }
2997 if (samplemask) {
2998 /* SampleMask should be in Y[15:0]. */
2999 args[6] = samplemask;
3000 mask |= 0xc;
3001 }
3002 } else {
3003 if (depth) {
3004 args[5] = depth;
3005 mask |= 0x1;
3006 }
3007 if (stencil) {
3008 args[6] = stencil;
3009 mask |= 0x2;
3010 }
3011 if (samplemask) {
3012 args[7] = samplemask;
3013 mask |= 0x4;
3014 }
3015 }
3016
3017 /* SI (except OLAND) has a bug that it only looks
3018 * at the X writemask component. */
3019 if (ctx->screen->b.chip_class == SI &&
3020 ctx->screen->b.family != CHIP_OLAND)
3021 mask |= 0x1;
3022
3023 /* Specify which components to enable */
3024 args[0] = lp_build_const_int32(base->gallivm, mask);
3025
3026 memcpy(exp->args[exp->num++], args, sizeof(args));
3027 }
3028
3029 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
3030 LLVMValueRef *color, unsigned index,
3031 unsigned samplemask_param,
3032 bool is_last, struct si_ps_exports *exp)
3033 {
3034 struct si_shader_context *ctx = si_shader_context(bld_base);
3035 struct lp_build_context *base = &bld_base->base;
3036 int i;
3037
3038 /* Clamp color */
3039 if (ctx->shader->key.ps.epilog.clamp_color)
3040 for (i = 0; i < 4; i++)
3041 color[i] = radeon_llvm_saturate(bld_base, color[i]);
3042
3043 /* Alpha to one */
3044 if (ctx->shader->key.ps.epilog.alpha_to_one)
3045 color[3] = base->one;
3046
3047 /* Alpha test */
3048 if (index == 0 &&
3049 ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
3050 si_alpha_test(bld_base, color[3]);
3051
3052 /* Line & polygon smoothing */
3053 if (ctx->shader->key.ps.epilog.poly_line_smoothing)
3054 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
3055 samplemask_param);
3056
3057 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3058 if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
3059 LLVMValueRef args[8][9];
3060 int c, last = -1;
3061
3062 /* Get the export arguments, also find out what the last one is. */
3063 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
3064 si_llvm_init_export_args(bld_base, color,
3065 V_008DFC_SQ_EXP_MRT + c, args[c]);
3066 if (args[c][0] != bld_base->uint_bld.zero)
3067 last = c;
3068 }
3069
3070 /* Emit all exports. */
3071 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
3072 if (is_last && last == c) {
3073 args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
3074 args[c][2] = bld_base->uint_bld.one; /* DONE bit */
3075 } else if (args[c][0] == bld_base->uint_bld.zero)
3076 continue; /* unnecessary NULL export */
3077
3078 memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
3079 }
3080 } else {
3081 LLVMValueRef args[9];
3082
3083 /* Export */
3084 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
3085 args);
3086 if (is_last) {
3087 args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
3088 args[2] = bld_base->uint_bld.one; /* DONE bit */
3089 } else if (args[0] == bld_base->uint_bld.zero)
3090 return; /* unnecessary NULL export */
3091
3092 memcpy(exp->args[exp->num++], args, sizeof(args));
3093 }
3094 }
3095
3096 static void si_emit_ps_exports(struct si_shader_context *ctx,
3097 struct si_ps_exports *exp)
3098 {
3099 for (unsigned i = 0; i < exp->num; i++)
3100 lp_build_intrinsic(ctx->radeon_bld.gallivm.builder,
3101 "llvm.SI.export", ctx->voidt,
3102 exp->args[i], 9, 0);
3103 }
3104
3105 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3106 {
3107 struct si_shader_context *ctx = si_shader_context(bld_base);
3108 struct lp_build_context *base = &bld_base->base;
3109 struct lp_build_context *uint = &bld_base->uint_bld;
3110 LLVMValueRef args[9];
3111
3112 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3113 args[1] = uint->one; /* whether the EXEC mask is valid */
3114 args[2] = uint->one; /* DONE bit */
3115 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3116 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3117 args[5] = base->undef; /* R */
3118 args[6] = base->undef; /* G */
3119 args[7] = base->undef; /* B */
3120 args[8] = base->undef; /* A */
3121
3122 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3123 ctx->voidt, args, 9, 0);
3124 }
3125
3126 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
3127 {
3128 struct si_shader_context *ctx = si_shader_context(bld_base);
3129 struct si_shader *shader = ctx->shader;
3130 struct lp_build_context *base = &bld_base->base;
3131 struct tgsi_shader_info *info = &shader->selector->info;
3132 LLVMBuilderRef builder = base->gallivm->builder;
3133 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3134 int last_color_export = -1;
3135 int i;
3136 struct si_ps_exports exp = {};
3137
3138 /* Determine the last export. If MRTZ is present, it's always last.
3139 * Otherwise, find the last color export.
3140 */
3141 if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
3142 unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
3143
3144 /* Don't export NULL and return if alpha-test is enabled. */
3145 if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
3146 shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
3147 (spi_format & 0xf) == 0)
3148 spi_format |= V_028714_SPI_SHADER_32_AR;
3149
3150 for (i = 0; i < info->num_outputs; i++) {
3151 unsigned index = info->output_semantic_index[i];
3152
3153 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
3154 continue;
3155
3156 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
3157 if (shader->key.ps.epilog.last_cbuf > 0) {
3158 /* Just set this if any of the colorbuffers are enabled. */
3159 if (spi_format &
3160 ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
3161 last_color_export = i;
3162 continue;
3163 }
3164
3165 if ((spi_format >> (index * 4)) & 0xf)
3166 last_color_export = i;
3167 }
3168
3169 /* If there are no outputs, export NULL. */
3170 if (last_color_export == -1) {
3171 si_export_null(bld_base);
3172 return;
3173 }
3174 }
3175
3176 for (i = 0; i < info->num_outputs; i++) {
3177 unsigned semantic_name = info->output_semantic_name[i];
3178 unsigned semantic_index = info->output_semantic_index[i];
3179 unsigned j;
3180 LLVMValueRef color[4] = {};
3181
3182 /* Select the correct target */
3183 switch (semantic_name) {
3184 case TGSI_SEMANTIC_POSITION:
3185 depth = LLVMBuildLoad(builder,
3186 ctx->radeon_bld.soa.outputs[i][2], "");
3187 break;
3188 case TGSI_SEMANTIC_STENCIL:
3189 stencil = LLVMBuildLoad(builder,
3190 ctx->radeon_bld.soa.outputs[i][1], "");
3191 break;
3192 case TGSI_SEMANTIC_SAMPLEMASK:
3193 samplemask = LLVMBuildLoad(builder,
3194 ctx->radeon_bld.soa.outputs[i][0], "");
3195 break;
3196 case TGSI_SEMANTIC_COLOR:
3197 for (j = 0; j < 4; j++)
3198 color[j] = LLVMBuildLoad(builder,
3199 ctx->radeon_bld.soa.outputs[i][j], "");
3200
3201 si_export_mrt_color(bld_base, color, semantic_index,
3202 SI_PARAM_SAMPLE_COVERAGE,
3203 last_color_export == i, &exp);
3204 break;
3205 default:
3206 fprintf(stderr,
3207 "Warning: SI unhandled fs output type:%d\n",
3208 semantic_name);
3209 }
3210 }
3211
3212 if (depth || stencil || samplemask)
3213 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
3214
3215 si_emit_ps_exports(ctx, &exp);
3216 }
3217
3218 /**
3219 * Return PS outputs in this order:
3220 *
3221 * v[0:3] = color0.xyzw
3222 * v[4:7] = color1.xyzw
3223 * ...
3224 * vN+0 = Depth
3225 * vN+1 = Stencil
3226 * vN+2 = SampleMask
3227 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3228 *
3229 * The alpha-ref SGPR is returned via its original location.
3230 */
3231 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3232 {
3233 struct si_shader_context *ctx = si_shader_context(bld_base);
3234 struct si_shader *shader = ctx->shader;
3235 struct lp_build_context *base = &bld_base->base;
3236 struct tgsi_shader_info *info = &shader->selector->info;
3237 LLVMBuilderRef builder = base->gallivm->builder;
3238 unsigned i, j, first_vgpr, vgpr;
3239
3240 LLVMValueRef color[8][4] = {};
3241 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3242 LLVMValueRef ret;
3243
3244 /* Read the output values. */
3245 for (i = 0; i < info->num_outputs; i++) {
3246 unsigned semantic_name = info->output_semantic_name[i];
3247 unsigned semantic_index = info->output_semantic_index[i];
3248
3249 switch (semantic_name) {
3250 case TGSI_SEMANTIC_COLOR:
3251 assert(semantic_index < 8);
3252 for (j = 0; j < 4; j++) {
3253 LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
3254 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3255 color[semantic_index][j] = result;
3256 }
3257 break;
3258 case TGSI_SEMANTIC_POSITION:
3259 depth = LLVMBuildLoad(builder,
3260 ctx->radeon_bld.soa.outputs[i][2], "");
3261 break;
3262 case TGSI_SEMANTIC_STENCIL:
3263 stencil = LLVMBuildLoad(builder,
3264 ctx->radeon_bld.soa.outputs[i][1], "");
3265 break;
3266 case TGSI_SEMANTIC_SAMPLEMASK:
3267 samplemask = LLVMBuildLoad(builder,
3268 ctx->radeon_bld.soa.outputs[i][0], "");
3269 break;
3270 default:
3271 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3272 semantic_name);
3273 }
3274 }
3275
3276 /* Fill the return structure. */
3277 ret = ctx->return_value;
3278
3279 /* Set SGPRs. */
3280 ret = LLVMBuildInsertValue(builder, ret,
3281 bitcast(bld_base, TGSI_TYPE_SIGNED,
3282 LLVMGetParam(ctx->radeon_bld.main_fn,
3283 SI_PARAM_ALPHA_REF)),
3284 SI_SGPR_ALPHA_REF, "");
3285
3286 /* Set VGPRs */
3287 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3288 for (i = 0; i < ARRAY_SIZE(color); i++) {
3289 if (!color[i][0])
3290 continue;
3291
3292 for (j = 0; j < 4; j++)
3293 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3294 }
3295 if (depth)
3296 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3297 if (stencil)
3298 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3299 if (samplemask)
3300 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3301
3302 /* Add the input sample mask for smoothing at the end. */
3303 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3304 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3305 ret = LLVMBuildInsertValue(builder, ret,
3306 LLVMGetParam(ctx->radeon_bld.main_fn,
3307 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3308
3309 ctx->return_value = ret;
3310 }
3311
3312 /**
3313 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3314 * buffer in number of elements and return it as an i32.
3315 */
3316 static LLVMValueRef get_buffer_size(
3317 struct lp_build_tgsi_context *bld_base,
3318 LLVMValueRef descriptor)
3319 {
3320 struct si_shader_context *ctx = si_shader_context(bld_base);
3321 struct gallivm_state *gallivm = bld_base->base.gallivm;
3322 LLVMBuilderRef builder = gallivm->builder;
3323 LLVMValueRef size =
3324 LLVMBuildExtractElement(builder, descriptor,
3325 lp_build_const_int32(gallivm, 6), "");
3326
3327 if (ctx->screen->b.chip_class >= VI) {
3328 /* On VI, the descriptor contains the size in bytes,
3329 * but TXQ must return the size in elements.
3330 * The stride is always non-zero for resources using TXQ.
3331 */
3332 LLVMValueRef stride =
3333 LLVMBuildExtractElement(builder, descriptor,
3334 lp_build_const_int32(gallivm, 5), "");
3335 stride = LLVMBuildLShr(builder, stride,
3336 lp_build_const_int32(gallivm, 16), "");
3337 stride = LLVMBuildAnd(builder, stride,
3338 lp_build_const_int32(gallivm, 0x3FFF), "");
3339
3340 size = LLVMBuildUDiv(builder, size, stride, "");
3341 }
3342
3343 return size;
3344 }
3345
3346 /**
3347 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3348 * intrinsic names).
3349 */
3350 static void build_int_type_name(
3351 LLVMTypeRef type,
3352 char *buf, unsigned bufsize)
3353 {
3354 assert(bufsize >= 6);
3355
3356 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
3357 snprintf(buf, bufsize, "v%ui32",
3358 LLVMGetVectorSize(type));
3359 else
3360 strcpy(buf, "i32");
3361 }
3362
3363 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3364 struct lp_build_tgsi_context *bld_base,
3365 struct lp_build_emit_data *emit_data);
3366
3367 /* Prevent optimizations (at least of memory accesses) across the current
3368 * point in the program by emitting empty inline assembly that is marked as
3369 * having side effects.
3370 */
3371 static void emit_optimization_barrier(struct si_shader_context *ctx)
3372 {
3373 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3374 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3375 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3376 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3377 }
3378
3379 static void emit_waitcnt(struct si_shader_context *ctx)
3380 {
3381 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3382 LLVMBuilderRef builder = gallivm->builder;
3383 LLVMValueRef args[1] = {
3384 lp_build_const_int32(gallivm, 0xf70)
3385 };
3386 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3387 ctx->voidt, args, 1, 0);
3388 }
3389
/**
 * TGSI action callback for memory barriers: emits a waitcnt.
 * \p action and \p emit_data are not used.
 */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3399
3400 static LLVMValueRef
3401 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3402 const struct tgsi_full_src_register *reg)
3403 {
3404 LLVMValueRef index;
3405 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
3406 SI_PARAM_SHADER_BUFFERS);
3407
3408 if (!reg->Register.Indirect)
3409 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3410 else
3411 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3412 reg->Register.Index,
3413 SI_NUM_SHADER_BUFFERS);
3414
3415 return build_indexed_load_const(ctx, rsrc_ptr, index);
3416 }
3417
3418 static bool tgsi_is_array_sampler(unsigned target)
3419 {
3420 return target == TGSI_TEXTURE_1D_ARRAY ||
3421 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3422 target == TGSI_TEXTURE_2D_ARRAY ||
3423 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3424 target == TGSI_TEXTURE_CUBE_ARRAY ||
3425 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3426 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3427 }
3428
3429 static bool tgsi_is_array_image(unsigned target)
3430 {
3431 return target == TGSI_TEXTURE_3D ||
3432 target == TGSI_TEXTURE_CUBE ||
3433 target == TGSI_TEXTURE_1D_ARRAY ||
3434 target == TGSI_TEXTURE_2D_ARRAY ||
3435 target == TGSI_TEXTURE_CUBE_ARRAY ||
3436 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3437 }
3438
3439 /**
3440 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3441 *
3442 * At least on Tonga, executing image stores on images with DCC enabled and
3443 * non-trivial can eventually lead to lockups. This can occur when an
3444 * application binds an image as read-only but then uses a shader that writes
3445 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3446 * program termination) in this case, but it doesn't cost much to be a bit
3447 * nicer: disabling DCC in the shader still leads to undefined results but
3448 * avoids the lockup.
3449 */
3450 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3451 LLVMValueRef rsrc)
3452 {
3453 if (ctx->screen->b.chip_class <= CIK) {
3454 return rsrc;
3455 } else {
3456 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3457 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3458 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3459 LLVMValueRef tmp;
3460
3461 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3462 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3463 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3464 }
3465 }
3466
3467 /**
3468 * Load the resource descriptor for \p image.
3469 */
3470 static void
3471 image_fetch_rsrc(
3472 struct lp_build_tgsi_context *bld_base,
3473 const struct tgsi_full_src_register *image,
3474 bool dcc_off,
3475 LLVMValueRef *rsrc)
3476 {
3477 struct si_shader_context *ctx = si_shader_context(bld_base);
3478 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
3479 SI_PARAM_IMAGES);
3480 LLVMValueRef index, tmp;
3481
3482 assert(image->Register.File == TGSI_FILE_IMAGE);
3483
3484 if (!image->Register.Indirect) {
3485 const struct tgsi_shader_info *info = bld_base->info;
3486
3487 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3488
3489 if (info->images_writemask & (1 << image->Register.Index) &&
3490 !(info->images_buffers & (1 << image->Register.Index)))
3491 dcc_off = true;
3492 } else {
3493 /* From the GL_ARB_shader_image_load_store extension spec:
3494 *
3495 * If a shader performs an image load, store, or atomic
3496 * operation using an image variable declared as an array,
3497 * and if the index used to select an individual element is
3498 * negative or greater than or equal to the size of the
3499 * array, the results of the operation are undefined but may
3500 * not lead to termination.
3501 */
3502 index = get_bounded_indirect_index(ctx, &image->Indirect,
3503 image->Register.Index,
3504 SI_NUM_IMAGES);
3505 }
3506
3507 tmp = build_indexed_load_const(ctx, rsrc_ptr, index);
3508 if (dcc_off)
3509 tmp = force_dcc_off(ctx, tmp);
3510 *rsrc = tmp;
3511 }
3512
3513 static LLVMValueRef image_fetch_coords(
3514 struct lp_build_tgsi_context *bld_base,
3515 const struct tgsi_full_instruction *inst,
3516 unsigned src)
3517 {
3518 struct gallivm_state *gallivm = bld_base->base.gallivm;
3519 LLVMBuilderRef builder = gallivm->builder;
3520 unsigned target = inst->Memory.Texture;
3521 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3522 LLVMValueRef coords[4];
3523 LLVMValueRef tmp;
3524 int chan;
3525
3526 for (chan = 0; chan < num_coords; ++chan) {
3527 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3528 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3529 coords[chan] = tmp;
3530 }
3531
3532 if (num_coords == 1)
3533 return coords[0];
3534
3535 if (num_coords == 3) {
3536 /* LLVM has difficulties lowering 3-element vectors. */
3537 coords[3] = bld_base->uint_bld.undef;
3538 num_coords = 4;
3539 }
3540
3541 return lp_build_gather_values(gallivm, coords, num_coords);
3542 }
3543
3544 /**
3545 * Append the extra mode bits that are used by image load and store.
3546 */
3547 static void image_append_args(
3548 struct si_shader_context *ctx,
3549 struct lp_build_emit_data * emit_data,
3550 unsigned target,
3551 bool atomic)
3552 {
3553 const struct tgsi_full_instruction *inst = emit_data->inst;
3554 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3555 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3556
3557 emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3558 emit_data->args[emit_data->arg_count++] =
3559 tgsi_is_array_image(target) ? i1true : i1false; /* da */
3560 if (!atomic) {
3561 emit_data->args[emit_data->arg_count++] =
3562 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3563 i1true : i1false; /* glc */
3564 }
3565 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3566 }
3567
3568 /**
3569 * Given a 256 bit resource, extract the top half (which stores the buffer
3570 * resource in the case of textures and images).
3571 */
3572 static LLVMValueRef extract_rsrc_top_half(
3573 struct si_shader_context *ctx,
3574 LLVMValueRef rsrc)
3575 {
3576 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3577 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3578 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3579
3580 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3581 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3582 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3583
3584 return rsrc;
3585 }
3586
3587 /**
3588 * Append the resource and indexing arguments for buffer intrinsics.
3589 *
3590 * \param rsrc the v4i32 buffer resource
3591 * \param index index into the buffer (stride-based)
3592 * \param offset byte offset into the buffer
3593 */
3594 static void buffer_append_args(
3595 struct si_shader_context *ctx,
3596 struct lp_build_emit_data *emit_data,
3597 LLVMValueRef rsrc,
3598 LLVMValueRef index,
3599 LLVMValueRef offset,
3600 bool atomic)
3601 {
3602 const struct tgsi_full_instruction *inst = emit_data->inst;
3603 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3604 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3605
3606 emit_data->args[emit_data->arg_count++] = rsrc;
3607 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3608 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3609 if (!atomic) {
3610 emit_data->args[emit_data->arg_count++] =
3611 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3612 i1true : i1false; /* glc */
3613 }
3614 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3615 }
3616
3617 static void load_fetch_args(
3618 struct lp_build_tgsi_context * bld_base,
3619 struct lp_build_emit_data * emit_data)
3620 {
3621 struct si_shader_context *ctx = si_shader_context(bld_base);
3622 struct gallivm_state *gallivm = bld_base->base.gallivm;
3623 const struct tgsi_full_instruction * inst = emit_data->inst;
3624 unsigned target = inst->Memory.Texture;
3625 LLVMValueRef rsrc;
3626
3627 emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
3628
3629 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3630 LLVMBuilderRef builder = gallivm->builder;
3631 LLVMValueRef offset;
3632 LLVMValueRef tmp;
3633
3634 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3635
3636 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3637 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3638
3639 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3640 offset, false);
3641 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3642 LLVMValueRef coords;
3643
3644 image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
3645 coords = image_fetch_coords(bld_base, inst, 1);
3646
3647 if (target == TGSI_TEXTURE_BUFFER) {
3648 rsrc = extract_rsrc_top_half(ctx, rsrc);
3649 buffer_append_args(ctx, emit_data, rsrc, coords,
3650 bld_base->uint_bld.zero, false);
3651 } else {
3652 emit_data->args[0] = coords;
3653 emit_data->args[1] = rsrc;
3654 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
3655 emit_data->arg_count = 3;
3656
3657 image_append_args(ctx, emit_data, target, false);
3658 }
3659 }
3660 }
3661
3662 static void load_emit_buffer(struct si_shader_context *ctx,
3663 struct lp_build_emit_data *emit_data)
3664 {
3665 const struct tgsi_full_instruction *inst = emit_data->inst;
3666 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3667 LLVMBuilderRef builder = gallivm->builder;
3668 uint writemask = inst->Dst[0].Register.WriteMask;
3669 uint count = util_last_bit(writemask);
3670 const char *intrinsic_name;
3671 LLVMTypeRef dst_type;
3672
3673 switch (count) {
3674 case 1:
3675 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3676 dst_type = ctx->f32;
3677 break;
3678 case 2:
3679 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3680 dst_type = LLVMVectorType(ctx->f32, 2);
3681 break;
3682 default: // 3 & 4
3683 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3684 dst_type = ctx->v4f32;
3685 count = 4;
3686 }
3687
3688 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3689 builder, intrinsic_name, dst_type,
3690 emit_data->args, emit_data->arg_count,
3691 LLVMReadOnlyAttribute);
3692 }
3693
3694 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3695 const struct tgsi_full_instruction *inst,
3696 LLVMTypeRef type, int arg)
3697 {
3698 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3699 LLVMBuilderRef builder = gallivm->builder;
3700 LLVMValueRef offset, ptr;
3701 int addr_space;
3702
3703 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3704 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3705
3706 ptr = ctx->shared_memory;
3707 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3708 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3709 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3710
3711 return ptr;
3712 }
3713
3714 static void load_emit_memory(
3715 struct si_shader_context *ctx,
3716 struct lp_build_emit_data *emit_data)
3717 {
3718 const struct tgsi_full_instruction *inst = emit_data->inst;
3719 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3720 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3721 LLVMBuilderRef builder = gallivm->builder;
3722 unsigned writemask = inst->Dst[0].Register.WriteMask;
3723 LLVMValueRef channels[4], ptr, derived_ptr, index;
3724 int chan;
3725
3726 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3727
3728 for (chan = 0; chan < 4; ++chan) {
3729 if (!(writemask & (1 << chan))) {
3730 channels[chan] = LLVMGetUndef(base->elem_type);
3731 continue;
3732 }
3733
3734 index = lp_build_const_int32(gallivm, chan);
3735 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3736 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3737 }
3738 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3739 }
3740
3741 static void load_emit(
3742 const struct lp_build_tgsi_action *action,
3743 struct lp_build_tgsi_context *bld_base,
3744 struct lp_build_emit_data *emit_data)
3745 {
3746 struct si_shader_context *ctx = si_shader_context(bld_base);
3747 struct gallivm_state *gallivm = bld_base->base.gallivm;
3748 LLVMBuilderRef builder = gallivm->builder;
3749 const struct tgsi_full_instruction * inst = emit_data->inst;
3750 char intrinsic_name[32];
3751 char coords_type[8];
3752
3753 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3754 load_emit_memory(ctx, emit_data);
3755 return;
3756 }
3757
3758 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3759 emit_waitcnt(ctx);
3760
3761 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3762 load_emit_buffer(ctx, emit_data);
3763 return;
3764 }
3765
3766 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3767 emit_data->output[emit_data->chan] =
3768 lp_build_intrinsic(
3769 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3770 emit_data->args, emit_data->arg_count,
3771 LLVMReadOnlyAttribute);
3772 } else {
3773 build_int_type_name(LLVMTypeOf(emit_data->args[0]),
3774 coords_type, sizeof(coords_type));
3775
3776 snprintf(intrinsic_name, sizeof(intrinsic_name),
3777 "llvm.amdgcn.image.load.%s", coords_type);
3778
3779 emit_data->output[emit_data->chan] =
3780 lp_build_intrinsic(
3781 builder, intrinsic_name, emit_data->dst_type,
3782 emit_data->args, emit_data->arg_count,
3783 LLVMReadOnlyAttribute);
3784 }
3785 }
3786
3787 static void store_fetch_args(
3788 struct lp_build_tgsi_context * bld_base,
3789 struct lp_build_emit_data * emit_data)
3790 {
3791 struct si_shader_context *ctx = si_shader_context(bld_base);
3792 struct gallivm_state *gallivm = bld_base->base.gallivm;
3793 LLVMBuilderRef builder = gallivm->builder;
3794 const struct tgsi_full_instruction * inst = emit_data->inst;
3795 struct tgsi_full_src_register memory;
3796 LLVMValueRef chans[4];
3797 LLVMValueRef data;
3798 LLVMValueRef rsrc;
3799 unsigned chan;
3800
3801 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3802
3803 for (chan = 0; chan < 4; ++chan) {
3804 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3805 }
3806 data = lp_build_gather_values(gallivm, chans, 4);
3807
3808 emit_data->args[emit_data->arg_count++] = data;
3809
3810 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3811
3812 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3813 LLVMValueRef offset;
3814 LLVMValueRef tmp;
3815
3816 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3817
3818 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3819 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3820
3821 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3822 offset, false);
3823 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3824 unsigned target = inst->Memory.Texture;
3825 LLVMValueRef coords;
3826
3827 coords = image_fetch_coords(bld_base, inst, 0);
3828
3829 if (target == TGSI_TEXTURE_BUFFER) {
3830 image_fetch_rsrc(bld_base, &memory, false, &rsrc);
3831
3832 rsrc = extract_rsrc_top_half(ctx, rsrc);
3833 buffer_append_args(ctx, emit_data, rsrc, coords,
3834 bld_base->uint_bld.zero, false);
3835 } else {
3836 emit_data->args[1] = coords;
3837 image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
3838 emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
3839 emit_data->arg_count = 4;
3840
3841 image_append_args(ctx, emit_data, target, false);
3842 }
3843 }
3844 }
3845
3846 static void store_emit_buffer(
3847 struct si_shader_context *ctx,
3848 struct lp_build_emit_data *emit_data)
3849 {
3850 const struct tgsi_full_instruction *inst = emit_data->inst;
3851 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3852 LLVMBuilderRef builder = gallivm->builder;
3853 struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
3854 LLVMValueRef base_data = emit_data->args[0];
3855 LLVMValueRef base_offset = emit_data->args[3];
3856 unsigned writemask = inst->Dst[0].Register.WriteMask;
3857
3858 while (writemask) {
3859 int start, count;
3860 const char *intrinsic_name;
3861 LLVMValueRef data;
3862 LLVMValueRef offset;
3863 LLVMValueRef tmp;
3864
3865 u_bit_scan_consecutive_range(&writemask, &start, &count);
3866
3867 /* Due to an LLVM limitation, split 3-element writes
3868 * into a 2-element and a 1-element write. */
3869 if (count == 3) {
3870 writemask |= 1 << (start + 2);
3871 count = 2;
3872 }
3873
3874 if (count == 4) {
3875 data = base_data;
3876 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
3877 } else if (count == 2) {
3878 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
3879
3880 tmp = LLVMBuildExtractElement(
3881 builder, base_data,
3882 lp_build_const_int32(gallivm, start), "");
3883 data = LLVMBuildInsertElement(
3884 builder, LLVMGetUndef(v2f32), tmp,
3885 uint_bld->zero, "");
3886
3887 tmp = LLVMBuildExtractElement(
3888 builder, base_data,
3889 lp_build_const_int32(gallivm, start + 1), "");
3890 data = LLVMBuildInsertElement(
3891 builder, data, tmp, uint_bld->one, "");
3892
3893 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
3894 } else {
3895 assert(count == 1);
3896 data = LLVMBuildExtractElement(
3897 builder, base_data,
3898 lp_build_const_int32(gallivm, start), "");
3899 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
3900 }
3901
3902 offset = base_offset;
3903 if (start != 0) {
3904 offset = LLVMBuildAdd(
3905 builder, offset,
3906 lp_build_const_int32(gallivm, start * 4), "");
3907 }
3908
3909 emit_data->args[0] = data;
3910 emit_data->args[3] = offset;
3911
3912 lp_build_intrinsic(
3913 builder, intrinsic_name, emit_data->dst_type,
3914 emit_data->args, emit_data->arg_count, 0);
3915 }
3916 }
3917
3918 static void store_emit_memory(
3919 struct si_shader_context *ctx,
3920 struct lp_build_emit_data *emit_data)
3921 {
3922 const struct tgsi_full_instruction *inst = emit_data->inst;
3923 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3924 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3925 LLVMBuilderRef builder = gallivm->builder;
3926 unsigned writemask = inst->Dst[0].Register.WriteMask;
3927 LLVMValueRef ptr, derived_ptr, data, index;
3928 int chan;
3929
3930 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3931
3932 for (chan = 0; chan < 4; ++chan) {
3933 if (!(writemask & (1 << chan))) {
3934 continue;
3935 }
3936 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3937 index = lp_build_const_int32(gallivm, chan);
3938 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3939 LLVMBuildStore(builder, data, derived_ptr);
3940 }
3941 }
3942
3943 static void store_emit(
3944 const struct lp_build_tgsi_action *action,
3945 struct lp_build_tgsi_context *bld_base,
3946 struct lp_build_emit_data *emit_data)
3947 {
3948 struct si_shader_context *ctx = si_shader_context(bld_base);
3949 struct gallivm_state *gallivm = bld_base->base.gallivm;
3950 LLVMBuilderRef builder = gallivm->builder;
3951 const struct tgsi_full_instruction * inst = emit_data->inst;
3952 unsigned target = inst->Memory.Texture;
3953 char intrinsic_name[32];
3954 char coords_type[8];
3955
3956 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3957 store_emit_memory(ctx, emit_data);
3958 return;
3959 }
3960
3961 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3962 emit_waitcnt(ctx);
3963
3964 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3965 store_emit_buffer(ctx, emit_data);
3966 return;
3967 }
3968
3969 if (target == TGSI_TEXTURE_BUFFER) {
3970 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3971 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3972 emit_data->dst_type, emit_data->args,
3973 emit_data->arg_count, 0);
3974 } else {
3975 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3976 coords_type, sizeof(coords_type));
3977 snprintf(intrinsic_name, sizeof(intrinsic_name),
3978 "llvm.amdgcn.image.store.%s", coords_type);
3979
3980 emit_data->output[emit_data->chan] =
3981 lp_build_intrinsic(
3982 builder, intrinsic_name, emit_data->dst_type,
3983 emit_data->args, emit_data->arg_count, 0);
3984 }
3985 }
3986
3987 static void atomic_fetch_args(
3988 struct lp_build_tgsi_context * bld_base,
3989 struct lp_build_emit_data * emit_data)
3990 {
3991 struct si_shader_context *ctx = si_shader_context(bld_base);
3992 struct gallivm_state *gallivm = bld_base->base.gallivm;
3993 LLVMBuilderRef builder = gallivm->builder;
3994 const struct tgsi_full_instruction * inst = emit_data->inst;
3995 LLVMValueRef data1, data2;
3996 LLVMValueRef rsrc;
3997 LLVMValueRef tmp;
3998
3999 emit_data->dst_type = bld_base->base.elem_type;
4000
4001 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
4002 data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
4003
4004 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4005 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
4006 data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
4007 }
4008
4009 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
4010 * of arguments, which is reversed relative to TGSI (and GLSL)
4011 */
4012 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4013 emit_data->args[emit_data->arg_count++] = data2;
4014 emit_data->args[emit_data->arg_count++] = data1;
4015
4016 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4017 LLVMValueRef offset;
4018
4019 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
4020
4021 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
4022 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
4023
4024 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
4025 offset, true);
4026 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
4027 unsigned target = inst->Memory.Texture;
4028 LLVMValueRef coords;
4029
4030 image_fetch_rsrc(bld_base, &inst->Src[0],
4031 target != TGSI_TEXTURE_BUFFER, &rsrc);
4032 coords = image_fetch_coords(bld_base, inst, 1);
4033
4034 if (target == TGSI_TEXTURE_BUFFER) {
4035 rsrc = extract_rsrc_top_half(ctx, rsrc);
4036 buffer_append_args(ctx, emit_data, rsrc, coords,
4037 bld_base->uint_bld.zero, true);
4038 } else {
4039 emit_data->args[emit_data->arg_count++] = coords;
4040 emit_data->args[emit_data->arg_count++] = rsrc;
4041
4042 image_append_args(ctx, emit_data, target, true);
4043 }
4044 }
4045 }
4046
4047 static void atomic_emit_memory(struct si_shader_context *ctx,
4048 struct lp_build_emit_data *emit_data) {
4049 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4050 LLVMBuilderRef builder = gallivm->builder;
4051 const struct tgsi_full_instruction * inst = emit_data->inst;
4052 LLVMValueRef ptr, result, arg;
4053
4054 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4055
4056 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
4057 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4058
4059 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4060 LLVMValueRef new_data;
4061 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4062 inst, 3, 0);
4063
4064 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4065
4066 #if HAVE_LLVM >= 0x309
4067 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4068 LLVMAtomicOrderingSequentiallyConsistent,
4069 LLVMAtomicOrderingSequentiallyConsistent,
4070 false);
4071 #endif
4072
4073 result = LLVMBuildExtractValue(builder, result, 0, "");
4074 } else {
4075 LLVMAtomicRMWBinOp op;
4076
4077 switch(inst->Instruction.Opcode) {
4078 case TGSI_OPCODE_ATOMUADD:
4079 op = LLVMAtomicRMWBinOpAdd;
4080 break;
4081 case TGSI_OPCODE_ATOMXCHG:
4082 op = LLVMAtomicRMWBinOpXchg;
4083 break;
4084 case TGSI_OPCODE_ATOMAND:
4085 op = LLVMAtomicRMWBinOpAnd;
4086 break;
4087 case TGSI_OPCODE_ATOMOR:
4088 op = LLVMAtomicRMWBinOpOr;
4089 break;
4090 case TGSI_OPCODE_ATOMXOR:
4091 op = LLVMAtomicRMWBinOpXor;
4092 break;
4093 case TGSI_OPCODE_ATOMUMIN:
4094 op = LLVMAtomicRMWBinOpUMin;
4095 break;
4096 case TGSI_OPCODE_ATOMUMAX:
4097 op = LLVMAtomicRMWBinOpUMax;
4098 break;
4099 case TGSI_OPCODE_ATOMIMIN:
4100 op = LLVMAtomicRMWBinOpMin;
4101 break;
4102 case TGSI_OPCODE_ATOMIMAX:
4103 op = LLVMAtomicRMWBinOpMax;
4104 break;
4105 default:
4106 unreachable("unknown atomic opcode");
4107 }
4108
4109 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4110 LLVMAtomicOrderingSequentiallyConsistent,
4111 false);
4112 }
4113 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4114 }
4115
4116 static void atomic_emit(
4117 const struct lp_build_tgsi_action *action,
4118 struct lp_build_tgsi_context *bld_base,
4119 struct lp_build_emit_data *emit_data)
4120 {
4121 struct si_shader_context *ctx = si_shader_context(bld_base);
4122 struct gallivm_state *gallivm = bld_base->base.gallivm;
4123 LLVMBuilderRef builder = gallivm->builder;
4124 const struct tgsi_full_instruction * inst = emit_data->inst;
4125 char intrinsic_name[40];
4126 LLVMValueRef tmp;
4127
4128 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4129 atomic_emit_memory(ctx, emit_data);
4130 return;
4131 }
4132
4133 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4134 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4135 snprintf(intrinsic_name, sizeof(intrinsic_name),
4136 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4137 } else {
4138 char coords_type[8];
4139
4140 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4141 coords_type, sizeof(coords_type));
4142 snprintf(intrinsic_name, sizeof(intrinsic_name),
4143 "llvm.amdgcn.image.atomic.%s.%s",
4144 action->intr_name, coords_type);
4145 }
4146
4147 tmp = lp_build_intrinsic(
4148 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4149 emit_data->args, emit_data->arg_count, 0);
4150 emit_data->output[emit_data->chan] =
4151 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4152 }
4153
4154 static void resq_fetch_args(
4155 struct lp_build_tgsi_context * bld_base,
4156 struct lp_build_emit_data * emit_data)
4157 {
4158 struct si_shader_context *ctx = si_shader_context(bld_base);
4159 struct gallivm_state *gallivm = bld_base->base.gallivm;
4160 const struct tgsi_full_instruction *inst = emit_data->inst;
4161 const struct tgsi_full_src_register *reg = &inst->Src[0];
4162
4163 emit_data->dst_type = ctx->v4i32;
4164
4165 if (reg->Register.File == TGSI_FILE_BUFFER) {
4166 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4167 emit_data->arg_count = 1;
4168 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4169 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
4170 emit_data->arg_count = 1;
4171 } else {
4172 emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
4173 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
4174 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
4175 emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
4176 emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
4177 emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
4178 bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
4179 emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
4180 emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
4181 emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
4182 emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
4183 emit_data->arg_count = 10;
4184 }
4185 }
4186
4187 static void resq_emit(
4188 const struct lp_build_tgsi_action *action,
4189 struct lp_build_tgsi_context *bld_base,
4190 struct lp_build_emit_data *emit_data)
4191 {
4192 struct gallivm_state *gallivm = bld_base->base.gallivm;
4193 LLVMBuilderRef builder = gallivm->builder;
4194 const struct tgsi_full_instruction *inst = emit_data->inst;
4195 LLVMValueRef out;
4196
4197 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4198 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4199 lp_build_const_int32(gallivm, 2), "");
4200 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4201 out = get_buffer_size(bld_base, emit_data->args[0]);
4202 } else {
4203 out = lp_build_intrinsic(
4204 builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
4205 emit_data->args, emit_data->arg_count,
4206 LLVMReadNoneAttribute);
4207
4208 /* Divide the number of layers by 6 to get the number of cubes. */
4209 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
4210 LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
4211 LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
4212
4213 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4214 z = LLVMBuildSDiv(builder, z, imm6, "");
4215 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4216 }
4217 }
4218
4219 emit_data->output[emit_data->chan] = out;
4220 }
4221
4222 static void set_tex_fetch_args(struct si_shader_context *ctx,
4223 struct lp_build_emit_data *emit_data,
4224 unsigned opcode, unsigned target,
4225 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4226 LLVMValueRef *param, unsigned count,
4227 unsigned dmask)
4228 {
4229 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4230 unsigned num_args;
4231 unsigned is_rect = target == TGSI_TEXTURE_RECT;
4232
4233 /* Pad to power of two vector */
4234 while (count < util_next_power_of_two(count))
4235 param[count++] = LLVMGetUndef(ctx->i32);
4236
4237 /* Texture coordinates. */
4238 if (count > 1)
4239 emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
4240 else
4241 emit_data->args[0] = param[0];
4242
4243 /* Resource. */
4244 emit_data->args[1] = res_ptr;
4245 num_args = 2;
4246
4247 if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
4248 emit_data->dst_type = ctx->v4i32;
4249 else {
4250 emit_data->dst_type = ctx->v4f32;
4251
4252 emit_data->args[num_args++] = samp_ptr;
4253 }
4254
4255 emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
4256 emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
4257 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
4258 emit_data->args[num_args++] = lp_build_const_int32(gallivm,
4259 tgsi_is_array_sampler(target)); /* da */
4260 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
4261 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
4262 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
4263 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */
4264
4265 emit_data->arg_count = num_args;
4266 }
4267
4268 static const struct lp_build_tgsi_action tex_action;
4269
4270 enum desc_type {
4271 DESC_IMAGE,
4272 DESC_FMASK,
4273 DESC_SAMPLER
4274 };
4275
4276 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4277 {
4278 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4279 CONST_ADDR_SPACE);
4280 }
4281
4282 /**
4283 * Load an image view, fmask view. or sampler state descriptor.
4284 */
4285 static LLVMValueRef load_sampler_desc_custom(struct si_shader_context *ctx,
4286 LLVMValueRef list, LLVMValueRef index,
4287 enum desc_type type)
4288 {
4289 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4290 LLVMBuilderRef builder = gallivm->builder;
4291
4292 switch (type) {
4293 case DESC_IMAGE:
4294 /* The image is at [0:7]. */
4295 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4296 break;
4297 case DESC_FMASK:
4298 /* The FMASK is at [8:15]. */
4299 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4300 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4301 break;
4302 case DESC_SAMPLER:
4303 /* The sampler state is at [12:15]. */
4304 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4305 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4306 list = LLVMBuildPointerCast(builder, list,
4307 const_array(ctx->v4i32, 0), "");
4308 break;
4309 }
4310
4311 return build_indexed_load_const(ctx, list, index);
4312 }
4313
4314 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4315 LLVMValueRef index, enum desc_type type)
4316 {
4317 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4318 SI_PARAM_SAMPLERS);
4319
4320 return load_sampler_desc_custom(ctx, list, index, type);
4321 }
4322
4323 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4324 *
4325 * SI-CI:
4326 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4327 * filtering manually. The driver sets img7 to a mask clearing
4328 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4329 * s_and_b32 samp0, samp0, img7
4330 *
4331 * VI:
4332 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4333 */
4334 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4335 LLVMValueRef res, LLVMValueRef samp)
4336 {
4337 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4338 LLVMValueRef img7, samp0;
4339
4340 if (ctx->screen->b.chip_class >= VI)
4341 return samp;
4342
4343 img7 = LLVMBuildExtractElement(builder, res,
4344 LLVMConstInt(ctx->i32, 7, 0), "");
4345 samp0 = LLVMBuildExtractElement(builder, samp,
4346 LLVMConstInt(ctx->i32, 0, 0), "");
4347 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4348 return LLVMBuildInsertElement(builder, samp, samp0,
4349 LLVMConstInt(ctx->i32, 0, 0), "");
4350 }
4351
4352 static void tex_fetch_ptrs(
4353 struct lp_build_tgsi_context *bld_base,
4354 struct lp_build_emit_data *emit_data,
4355 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4356 {
4357 struct si_shader_context *ctx = si_shader_context(bld_base);
4358 const struct tgsi_full_instruction *inst = emit_data->inst;
4359 unsigned target = inst->Texture.Texture;
4360 unsigned sampler_src;
4361 unsigned sampler_index;
4362 LLVMValueRef index;
4363
4364 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
4365 sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
4366
4367 if (emit_data->inst->Src[sampler_src].Register.Indirect) {
4368 const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
4369
4370 index = get_bounded_indirect_index(ctx,
4371 &reg->Indirect,
4372 reg->Register.Index,
4373 SI_NUM_SAMPLERS);
4374 } else {
4375 index = LLVMConstInt(ctx->i32, sampler_index, 0);
4376 }
4377
4378 *res_ptr = load_sampler_desc(ctx, index, DESC_IMAGE);
4379
4380 if (target == TGSI_TEXTURE_2D_MSAA ||
4381 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4382 if (samp_ptr)
4383 *samp_ptr = NULL;
4384 if (fmask_ptr)
4385 *fmask_ptr = load_sampler_desc(ctx, index, DESC_FMASK);
4386 } else {
4387 if (samp_ptr) {
4388 *samp_ptr = load_sampler_desc(ctx, index, DESC_SAMPLER);
4389 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4390 }
4391 if (fmask_ptr)
4392 *fmask_ptr = NULL;
4393 }
4394 }
4395
4396 static void txq_fetch_args(
4397 struct lp_build_tgsi_context *bld_base,
4398 struct lp_build_emit_data *emit_data)
4399 {
4400 struct si_shader_context *ctx = si_shader_context(bld_base);
4401 struct gallivm_state *gallivm = bld_base->base.gallivm;
4402 LLVMBuilderRef builder = gallivm->builder;
4403 const struct tgsi_full_instruction *inst = emit_data->inst;
4404 unsigned target = inst->Texture.Texture;
4405 LLVMValueRef res_ptr;
4406 LLVMValueRef address;
4407
4408 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4409
4410 if (target == TGSI_TEXTURE_BUFFER) {
4411 /* Read the size from the buffer descriptor directly. */
4412 LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4413 emit_data->args[0] = get_buffer_size(bld_base, res);
4414 return;
4415 }
4416
4417 /* Textures - set the mip level. */
4418 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4419
4420 set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
4421 NULL, &address, 1, 0xf);
4422 }
4423
4424 static void txq_emit(const struct lp_build_tgsi_action *action,
4425 struct lp_build_tgsi_context *bld_base,
4426 struct lp_build_emit_data *emit_data)
4427 {
4428 struct lp_build_context *base = &bld_base->base;
4429 unsigned target = emit_data->inst->Texture.Texture;
4430
4431 if (target == TGSI_TEXTURE_BUFFER) {
4432 /* Just return the buffer size. */
4433 emit_data->output[emit_data->chan] = emit_data->args[0];
4434 return;
4435 }
4436
4437 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4438 base->gallivm->builder, "llvm.SI.getresinfo.i32",
4439 emit_data->dst_type, emit_data->args, emit_data->arg_count,
4440 LLVMReadNoneAttribute);
4441
4442 /* Divide the number of layers by 6 to get the number of cubes. */
4443 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4444 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4445 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4446 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4447 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4448
4449 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4450 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4451 z = LLVMBuildSDiv(builder, z, six, "");
4452
4453 emit_data->output[emit_data->chan] =
4454 LLVMBuildInsertElement(builder, v4, z, two, "");
4455 }
4456 }
4457
4458 static void tex_fetch_args(
4459 struct lp_build_tgsi_context *bld_base,
4460 struct lp_build_emit_data *emit_data)
4461 {
4462 struct si_shader_context *ctx = si_shader_context(bld_base);
4463 struct gallivm_state *gallivm = bld_base->base.gallivm;
4464 const struct tgsi_full_instruction *inst = emit_data->inst;
4465 unsigned opcode = inst->Instruction.Opcode;
4466 unsigned target = inst->Texture.Texture;
4467 LLVMValueRef coords[5], derivs[6];
4468 LLVMValueRef address[16];
4469 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4470 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4471 unsigned count = 0;
4472 unsigned chan;
4473 unsigned num_deriv_channels = 0;
4474 bool has_offset = inst->Texture.NumOffsets > 0;
4475 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4476 unsigned dmask = 0xf;
4477
4478 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4479
4480 if (target == TGSI_TEXTURE_BUFFER) {
4481 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4482
4483 /* Bitcast and truncate v8i32 to v16i8. */
4484 LLVMValueRef res = res_ptr;
4485 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4486 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4487 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4488
4489 emit_data->dst_type = ctx->v4f32;
4490 emit_data->args[0] = res;
4491 emit_data->args[1] = bld_base->uint_bld.zero;
4492 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4493 emit_data->arg_count = 3;
4494 return;
4495 }
4496
4497 /* Fetch and project texture coordinates */
4498 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4499 for (chan = 0; chan < 3; chan++ ) {
4500 coords[chan] = lp_build_emit_fetch(bld_base,
4501 emit_data->inst, 0,
4502 chan);
4503 if (opcode == TGSI_OPCODE_TXP)
4504 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4505 TGSI_OPCODE_DIV,
4506 coords[chan],
4507 coords[3]);
4508 }
4509
4510 if (opcode == TGSI_OPCODE_TXP)
4511 coords[3] = bld_base->base.one;
4512
4513 /* Pack offsets. */
4514 if (has_offset && opcode != TGSI_OPCODE_TXF) {
4515 /* The offsets are six-bit signed integers packed like this:
4516 * X=[5:0], Y=[13:8], and Z=[21:16].
4517 */
4518 LLVMValueRef offset[3], pack;
4519
4520 assert(inst->Texture.NumOffsets == 1);
4521
4522 for (chan = 0; chan < 3; chan++) {
4523 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4524 emit_data->inst, 0, chan);
4525 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4526 lp_build_const_int32(gallivm, 0x3f), "");
4527 if (chan)
4528 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4529 lp_build_const_int32(gallivm, chan*8), "");
4530 }
4531
4532 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4533 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4534 address[count++] = pack;
4535 }
4536
4537 /* Pack LOD bias value */
4538 if (opcode == TGSI_OPCODE_TXB)
4539 address[count++] = coords[3];
4540 if (opcode == TGSI_OPCODE_TXB2)
4541 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4542
4543 /* Pack depth comparison value */
4544 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4545 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4546 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4547 } else {
4548 assert(ref_pos >= 0);
4549 address[count++] = coords[ref_pos];
4550 }
4551 }
4552
4553 /* Pack user derivatives */
4554 if (opcode == TGSI_OPCODE_TXD) {
4555 int param, num_src_deriv_channels;
4556
4557 switch (target) {
4558 case TGSI_TEXTURE_3D:
4559 num_src_deriv_channels = 3;
4560 num_deriv_channels = 3;
4561 break;
4562 case TGSI_TEXTURE_2D:
4563 case TGSI_TEXTURE_SHADOW2D:
4564 case TGSI_TEXTURE_RECT:
4565 case TGSI_TEXTURE_SHADOWRECT:
4566 case TGSI_TEXTURE_2D_ARRAY:
4567 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4568 num_src_deriv_channels = 2;
4569 num_deriv_channels = 2;
4570 break;
4571 case TGSI_TEXTURE_CUBE:
4572 case TGSI_TEXTURE_SHADOWCUBE:
4573 case TGSI_TEXTURE_CUBE_ARRAY:
4574 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4575 /* Cube derivatives will be converted to 2D. */
4576 num_src_deriv_channels = 3;
4577 num_deriv_channels = 2;
4578 break;
4579 case TGSI_TEXTURE_1D:
4580 case TGSI_TEXTURE_SHADOW1D:
4581 case TGSI_TEXTURE_1D_ARRAY:
4582 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4583 num_src_deriv_channels = 1;
4584 num_deriv_channels = 1;
4585 break;
4586 default:
4587 unreachable("invalid target");
4588 }
4589
4590 for (param = 0; param < 2; param++)
4591 for (chan = 0; chan < num_src_deriv_channels; chan++)
4592 derivs[param * num_src_deriv_channels + chan] =
4593 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4594 }
4595
4596 if (target == TGSI_TEXTURE_CUBE ||
4597 target == TGSI_TEXTURE_CUBE_ARRAY ||
4598 target == TGSI_TEXTURE_SHADOWCUBE ||
4599 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4600 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4601
4602 if (opcode == TGSI_OPCODE_TXD)
4603 for (int i = 0; i < num_deriv_channels * 2; i++)
4604 address[count++] = derivs[i];
4605
4606 /* Pack texture coordinates */
4607 address[count++] = coords[0];
4608 if (num_coords > 1)
4609 address[count++] = coords[1];
4610 if (num_coords > 2)
4611 address[count++] = coords[2];
4612
4613 /* Pack LOD or sample index */
4614 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4615 address[count++] = coords[3];
4616 else if (opcode == TGSI_OPCODE_TXL2)
4617 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4618
4619 if (count > 16) {
4620 assert(!"Cannot handle more than 16 texture address parameters");
4621 count = 16;
4622 }
4623
4624 for (chan = 0; chan < count; chan++ ) {
4625 address[chan] = LLVMBuildBitCast(gallivm->builder,
4626 address[chan], ctx->i32, "");
4627 }
4628
4629 /* Adjust the sample index according to FMASK.
4630 *
4631 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4632 * which is the identity mapping. Each nibble says which physical sample
4633 * should be fetched to get that sample.
4634 *
4635 * For example, 0x11111100 means there are only 2 samples stored and
4636 * the second sample covers 3/4 of the pixel. When reading samples 0
4637 * and 1, return physical sample 0 (determined by the first two 0s
4638 * in FMASK), otherwise return physical sample 1.
4639 *
4640 * The sample index should be adjusted as follows:
4641 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4642 */
4643 if (target == TGSI_TEXTURE_2D_MSAA ||
4644 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4645 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4646 struct lp_build_emit_data txf_emit_data = *emit_data;
4647 LLVMValueRef txf_address[4];
4648 unsigned txf_count = count;
4649 struct tgsi_full_instruction inst = {};
4650
4651 memcpy(txf_address, address, sizeof(txf_address));
4652
4653 if (target == TGSI_TEXTURE_2D_MSAA) {
4654 txf_address[2] = bld_base->uint_bld.zero;
4655 }
4656 txf_address[3] = bld_base->uint_bld.zero;
4657
4658 /* Read FMASK using TXF. */
4659 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4660 inst.Texture.Texture = target;
4661 txf_emit_data.inst = &inst;
4662 txf_emit_data.chan = 0;
4663 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4664 target, fmask_ptr, NULL,
4665 txf_address, txf_count, 0xf);
4666 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4667
4668 /* Initialize some constants. */
4669 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4670 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4671
4672 /* Apply the formula. */
4673 LLVMValueRef fmask =
4674 LLVMBuildExtractElement(gallivm->builder,
4675 txf_emit_data.output[0],
4676 uint_bld->zero, "");
4677
4678 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4679
4680 LLVMValueRef sample_index4 =
4681 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4682
4683 LLVMValueRef shifted_fmask =
4684 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4685
4686 LLVMValueRef final_sample =
4687 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4688
4689 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4690 * resource descriptor is 0 (invalid),
4691 */
4692 LLVMValueRef fmask_desc =
4693 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4694 ctx->v8i32, "");
4695
4696 LLVMValueRef fmask_word1 =
4697 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4698 uint_bld->one, "");
4699
4700 LLVMValueRef word1_is_nonzero =
4701 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4702 fmask_word1, uint_bld->zero, "");
4703
4704 /* Replace the MSAA sample index. */
4705 address[sample_chan] =
4706 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4707 final_sample, address[sample_chan], "");
4708 }
4709
4710 if (opcode == TGSI_OPCODE_TXF) {
4711 /* add tex offsets */
4712 if (inst->Texture.NumOffsets) {
4713 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4714 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4715 const struct tgsi_texture_offset *off = inst->TexOffsets;
4716
4717 assert(inst->Texture.NumOffsets == 1);
4718
4719 switch (target) {
4720 case TGSI_TEXTURE_3D:
4721 address[2] = lp_build_add(uint_bld, address[2],
4722 bld->immediates[off->Index][off->SwizzleZ]);
4723 /* fall through */
4724 case TGSI_TEXTURE_2D:
4725 case TGSI_TEXTURE_SHADOW2D:
4726 case TGSI_TEXTURE_RECT:
4727 case TGSI_TEXTURE_SHADOWRECT:
4728 case TGSI_TEXTURE_2D_ARRAY:
4729 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4730 address[1] =
4731 lp_build_add(uint_bld, address[1],
4732 bld->immediates[off->Index][off->SwizzleY]);
4733 /* fall through */
4734 case TGSI_TEXTURE_1D:
4735 case TGSI_TEXTURE_SHADOW1D:
4736 case TGSI_TEXTURE_1D_ARRAY:
4737 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4738 address[0] =
4739 lp_build_add(uint_bld, address[0],
4740 bld->immediates[off->Index][off->SwizzleX]);
4741 break;
4742 /* texture offsets do not apply to other texture targets */
4743 }
4744 }
4745 }
4746
4747 if (opcode == TGSI_OPCODE_TG4) {
4748 unsigned gather_comp = 0;
4749
4750 /* DMASK was repurposed for GATHER4. 4 components are always
4751 * returned and DMASK works like a swizzle - it selects
4752 * the component to fetch. The only valid DMASK values are
4753 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4754 * (red,red,red,red) etc.) The ISA document doesn't mention
4755 * this.
4756 */
4757
4758 /* Get the component index from src1.x for Gather4. */
4759 if (!tgsi_is_shadow_target(target)) {
4760 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4761 LLVMValueRef comp_imm;
4762 struct tgsi_src_register src1 = inst->Src[1].Register;
4763
4764 assert(src1.File == TGSI_FILE_IMMEDIATE);
4765
4766 comp_imm = imms[src1.Index][src1.SwizzleX];
4767 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4768 gather_comp = CLAMP(gather_comp, 0, 3);
4769 }
4770
4771 dmask = 1 << gather_comp;
4772 }
4773
4774 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4775 samp_ptr, address, count, dmask);
4776 }
4777
4778 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
4779 * incorrectly forces nearest filtering if the texture format is integer.
4780 * The only effect it has on Gather4, which always returns 4 texels for
4781 * bilinear filtering, is that the final coordinates are off by 0.5 of
4782 * the texel size.
4783 *
4784 * The workaround is to subtract 0.5 from the unnormalized coordinates,
4785 * or (0.5 / size) from the normalized coordinates.
4786 */
4787 static void si_lower_gather4_integer(struct si_shader_context *ctx,
4788 struct lp_build_emit_data *emit_data,
4789 const char *intr_name,
4790 unsigned coord_vgpr_index)
4791 {
4792 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4793 LLVMValueRef coord = emit_data->args[0];
4794 LLVMValueRef half_texel[2];
4795 int c;
4796
4797 if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_RECT ||
4798 emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
4799 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
4800 } else {
4801 struct tgsi_full_instruction txq_inst = {};
4802 struct lp_build_emit_data txq_emit_data = {};
4803
4804 /* Query the texture size. */
4805 txq_inst.Texture.Texture = emit_data->inst->Texture.Texture;
4806 txq_emit_data.inst = &txq_inst;
4807 txq_emit_data.dst_type = ctx->v4i32;
4808 set_tex_fetch_args(ctx, &txq_emit_data, TGSI_OPCODE_TXQ,
4809 txq_inst.Texture.Texture,
4810 emit_data->args[1], NULL,
4811 &ctx->radeon_bld.soa.bld_base.uint_bld.zero,
4812 1, 0xf);
4813 txq_emit(NULL, &ctx->radeon_bld.soa.bld_base, &txq_emit_data);
4814
4815 /* Compute -0.5 / size. */
4816 for (c = 0; c < 2; c++) {
4817 half_texel[c] =
4818 LLVMBuildExtractElement(builder, txq_emit_data.output[0],
4819 LLVMConstInt(ctx->i32, c, 0), "");
4820 half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
4821 half_texel[c] =
4822 lp_build_emit_llvm_unary(&ctx->radeon_bld.soa.bld_base,
4823 TGSI_OPCODE_RCP, half_texel[c]);
4824 half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
4825 LLVMConstReal(ctx->f32, -0.5), "");
4826 }
4827 }
4828
4829 for (c = 0; c < 2; c++) {
4830 LLVMValueRef tmp;
4831 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
4832
4833 tmp = LLVMBuildExtractElement(builder, coord, index, "");
4834 tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
4835 tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
4836 tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
4837 coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
4838 }
4839
4840 emit_data->args[0] = coord;
4841 emit_data->output[emit_data->chan] =
4842 lp_build_intrinsic(builder, intr_name, emit_data->dst_type,
4843 emit_data->args, emit_data->arg_count,
4844 LLVMReadNoneAttribute);
4845 }
4846
4847 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4848 struct lp_build_tgsi_context *bld_base,
4849 struct lp_build_emit_data *emit_data)
4850 {
4851 struct si_shader_context *ctx = si_shader_context(bld_base);
4852 struct lp_build_context *base = &bld_base->base;
4853 const struct tgsi_full_instruction *inst = emit_data->inst;
4854 unsigned opcode = inst->Instruction.Opcode;
4855 unsigned target = inst->Texture.Texture;
4856 char intr_name[127];
4857 bool has_offset = inst->Texture.NumOffsets > 0;
4858 bool is_shadow = tgsi_is_shadow_target(target);
4859 char type[64];
4860 const char *name = "llvm.SI.image.sample";
4861 const char *infix = "";
4862
4863 if (target == TGSI_TEXTURE_BUFFER) {
4864 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4865 base->gallivm->builder,
4866 "llvm.SI.vs.load.input", emit_data->dst_type,
4867 emit_data->args, emit_data->arg_count,
4868 LLVMReadNoneAttribute);
4869 return;
4870 }
4871
4872 switch (opcode) {
4873 case TGSI_OPCODE_TXF:
4874 name = target == TGSI_TEXTURE_2D_MSAA ||
4875 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4876 "llvm.SI.image.load" :
4877 "llvm.SI.image.load.mip";
4878 is_shadow = false;
4879 has_offset = false;
4880 break;
4881 case TGSI_OPCODE_LODQ:
4882 name = "llvm.SI.getlod";
4883 is_shadow = false;
4884 has_offset = false;
4885 break;
4886 case TGSI_OPCODE_TEX:
4887 case TGSI_OPCODE_TEX2:
4888 case TGSI_OPCODE_TXP:
4889 if (ctx->type != PIPE_SHADER_FRAGMENT)
4890 infix = ".lz";
4891 break;
4892 case TGSI_OPCODE_TXB:
4893 case TGSI_OPCODE_TXB2:
4894 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4895 infix = ".b";
4896 break;
4897 case TGSI_OPCODE_TXL:
4898 case TGSI_OPCODE_TXL2:
4899 infix = ".l";
4900 break;
4901 case TGSI_OPCODE_TXD:
4902 infix = ".d";
4903 break;
4904 case TGSI_OPCODE_TG4:
4905 name = "llvm.SI.gather4";
4906 infix = ".lz";
4907 break;
4908 default:
4909 assert(0);
4910 return;
4911 }
4912
4913 /* Add the type and suffixes .c, .o if needed. */
4914 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4915 sprintf(intr_name, "%s%s%s%s.%s",
4916 name, is_shadow ? ".c" : "", infix,
4917 has_offset ? ".o" : "", type);
4918
4919 /* The hardware needs special lowering for Gather4 with integer formats. */
4920 if (opcode == TGSI_OPCODE_TG4) {
4921 struct tgsi_shader_info *info = &ctx->shader->selector->info;
4922 /* This will also work with non-constant indexing because of how
4923 * glsl_to_tgsi works and we intent to preserve that behavior.
4924 */
4925 const unsigned src_idx = 2;
4926 unsigned sampler = inst->Src[src_idx].Register.Index;
4927
4928 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
4929
4930 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
4931 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT) {
4932 /* Texture coordinates start after:
4933 * {offset, bias, z-compare, derivatives}
4934 * Only the offset and z-compare can occur here.
4935 */
4936 si_lower_gather4_integer(ctx, emit_data, intr_name,
4937 (int)has_offset + (int)is_shadow);
4938 return;
4939 }
4940 }
4941
4942 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4943 base->gallivm->builder, intr_name, emit_data->dst_type,
4944 emit_data->args, emit_data->arg_count,
4945 LLVMReadNoneAttribute);
4946 }
4947
4948 static void si_llvm_emit_txqs(
4949 const struct lp_build_tgsi_action *action,
4950 struct lp_build_tgsi_context *bld_base,
4951 struct lp_build_emit_data *emit_data)
4952 {
4953 struct si_shader_context *ctx = si_shader_context(bld_base);
4954 struct gallivm_state *gallivm = bld_base->base.gallivm;
4955 LLVMBuilderRef builder = gallivm->builder;
4956 LLVMValueRef res, samples;
4957 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4958
4959 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4960
4961
4962 /* Read the samples from the descriptor directly. */
4963 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4964 samples = LLVMBuildExtractElement(
4965 builder, res,
4966 lp_build_const_int32(gallivm, 3), "");
4967 samples = LLVMBuildLShr(builder, samples,
4968 lp_build_const_int32(gallivm, 16), "");
4969 samples = LLVMBuildAnd(builder, samples,
4970 lp_build_const_int32(gallivm, 0xf), "");
4971 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4972 samples, "");
4973
4974 emit_data->output[emit_data->chan] = samples;
4975 }
4976
/*
 * SI implements derivatives using the local data store (LDS).
 * All writes to the LDS happen in all executing threads at the same time.
 * TID is the thread ID of the current thread: a value between 0 and 63
 * giving the thread's position in the wavefront.
 *
 * In the pixel shader, threads are grouped into quads of four pixels.
 * The TIDs of the pixels of a quad are:
 *
 *  +------+------+
 *  |4n + 0|4n + 1|
 *  +------+------+
 *  |4n + 2|4n + 3|
 *  +------+------+
 *
 * Therefore:
 *  - TID & 0xfffffffc is the TID of the top-left pixel of the quad,
 *  - TID & 0xfffffffd is the TID of the top pixel of the current pixel's
 *    column,
 *  - TID & 0xfffffffe is the TID of the left pixel of the current pixel's
 *    row.
 *
 * Adding 1 to a "left" TID yields the pixel to its right, and adding 2 to
 * a "top" TID yields the pixel below it.
 */
/* Masks applied to the thread ID. */
#define TID_MASK_TOP_LEFT 0xfffffffc
#define TID_MASK_TOP      0xfffffffd
#define TID_MASK_LEFT     0xfffffffe
5005
5006 static void si_llvm_emit_ddxy(
5007 const struct lp_build_tgsi_action *action,
5008 struct lp_build_tgsi_context *bld_base,
5009 struct lp_build_emit_data *emit_data)
5010 {
5011 struct si_shader_context *ctx = si_shader_context(bld_base);
5012 struct gallivm_state *gallivm = bld_base->base.gallivm;
5013 const struct tgsi_full_instruction *inst = emit_data->inst;
5014 unsigned opcode = inst->Instruction.Opcode;
5015 LLVMValueRef indices[2];
5016 LLVMValueRef store_ptr, load_ptr0, load_ptr1;
5017 LLVMValueRef tl, trbl, result[4];
5018 LLVMValueRef tl_tid, trbl_tid;
5019 unsigned swizzle[4];
5020 unsigned c;
5021 int idx;
5022 unsigned mask;
5023
5024 indices[0] = bld_base->uint_bld.zero;
5025 indices[1] = get_thread_id(ctx);
5026 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
5027 indices, 2, "");
5028
5029 if (opcode == TGSI_OPCODE_DDX_FINE)
5030 mask = TID_MASK_LEFT;
5031 else if (opcode == TGSI_OPCODE_DDY_FINE)
5032 mask = TID_MASK_TOP;
5033 else
5034 mask = TID_MASK_TOP_LEFT;
5035
5036 tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
5037 lp_build_const_int32(gallivm, mask), "");
5038 indices[1] = tl_tid;
5039 load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
5040 indices, 2, "");
5041
5042 /* for DDX we want to next X pixel, DDY next Y pixel. */
5043 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
5044 trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
5045 lp_build_const_int32(gallivm, idx), "");
5046 indices[1] = trbl_tid;
5047 load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
5048 indices, 2, "");
5049
5050 for (c = 0; c < 4; ++c) {
5051 unsigned i;
5052 LLVMValueRef val;
5053 LLVMValueRef args[2];
5054
5055 swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
5056 for (i = 0; i < c; ++i) {
5057 if (swizzle[i] == swizzle[c]) {
5058 result[c] = result[i];
5059 break;
5060 }
5061 }
5062 if (i != c)
5063 continue;
5064
5065 val = LLVMBuildBitCast(gallivm->builder,
5066 lp_build_emit_fetch(bld_base, inst, 0, c),
5067 ctx->i32, "");
5068
5069 if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {
5070
5071 args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
5072 lp_build_const_int32(gallivm, 4), "");
5073 args[1] = val;
5074 tl = lp_build_intrinsic(gallivm->builder,
5075 "llvm.amdgcn.ds.bpermute", ctx->i32,
5076 args, 2, LLVMReadNoneAttribute);
5077
5078 args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
5079 lp_build_const_int32(gallivm, 4), "");
5080 trbl = lp_build_intrinsic(gallivm->builder,
5081 "llvm.amdgcn.ds.bpermute", ctx->i32,
5082 args, 2, LLVMReadNoneAttribute);
5083 } else {
5084 LLVMBuildStore(gallivm->builder, val, store_ptr);
5085 tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
5086 trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
5087 }
5088 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5089 trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
5090 result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
5091 }
5092
5093 emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
5094 }
5095
5096 /*
5097 * this takes an I,J coordinate pair,
5098 * and works out the X and Y derivatives.
5099 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5100 */
5101 static LLVMValueRef si_llvm_emit_ddxy_interp(
5102 struct lp_build_tgsi_context *bld_base,
5103 LLVMValueRef interp_ij)
5104 {
5105 struct si_shader_context *ctx = si_shader_context(bld_base);
5106 struct gallivm_state *gallivm = bld_base->base.gallivm;
5107 LLVMValueRef indices[2];
5108 LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
5109 LLVMValueRef tl, tr, bl, result[4];
5110 unsigned c;
5111
5112 indices[0] = bld_base->uint_bld.zero;
5113 indices[1] = get_thread_id(ctx);
5114 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
5115 indices, 2, "");
5116
5117 temp = LLVMBuildAnd(gallivm->builder, indices[1],
5118 lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
5119
5120 temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
5121 lp_build_const_int32(gallivm, TID_MASK_TOP), "");
5122
5123 indices[1] = temp;
5124 load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
5125 indices, 2, "");
5126
5127 indices[1] = temp2;
5128 load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
5129 indices, 2, "");
5130
5131 indices[1] = LLVMBuildAdd(gallivm->builder, temp,
5132 lp_build_const_int32(gallivm, 1), "");
5133 load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
5134 indices, 2, "");
5135
5136 indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
5137 lp_build_const_int32(gallivm, 2), "");
5138 load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
5139 indices, 2, "");
5140
5141 for (c = 0; c < 2; ++c) {
5142 LLVMValueRef store_val;
5143 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
5144
5145 store_val = LLVMBuildExtractElement(gallivm->builder,
5146 interp_ij, c_ll, "");
5147 LLVMBuildStore(gallivm->builder,
5148 store_val,
5149 store_ptr);
5150
5151 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
5152 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5153
5154 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
5155 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
5156
5157 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
5158
5159 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
5160 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
5161
5162 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
5163 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
5164
5165 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
5166 }
5167
5168 return lp_build_gather_values(gallivm, result, 4);
5169 }
5170
5171 static void interp_fetch_args(
5172 struct lp_build_tgsi_context *bld_base,
5173 struct lp_build_emit_data *emit_data)
5174 {
5175 struct si_shader_context *ctx = si_shader_context(bld_base);
5176 struct gallivm_state *gallivm = bld_base->base.gallivm;
5177 const struct tgsi_full_instruction *inst = emit_data->inst;
5178
5179 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
5180 /* offset is in second src, first two channels */
5181 emit_data->args[0] = lp_build_emit_fetch(bld_base,
5182 emit_data->inst, 1,
5183 TGSI_CHAN_X);
5184 emit_data->args[1] = lp_build_emit_fetch(bld_base,
5185 emit_data->inst, 1,
5186 TGSI_CHAN_Y);
5187 emit_data->arg_count = 2;
5188 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5189 LLVMValueRef sample_position;
5190 LLVMValueRef sample_id;
5191 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
5192
5193 /* fetch sample ID, then fetch its sample position,
5194 * and place into first two channels.
5195 */
5196 sample_id = lp_build_emit_fetch(bld_base,
5197 emit_data->inst, 1, TGSI_CHAN_X);
5198 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
5199 ctx->i32, "");
5200 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
5201
5202 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
5203 sample_position,
5204 lp_build_const_int32(gallivm, 0), "");
5205
5206 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
5207 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
5208 sample_position,
5209 lp_build_const_int32(gallivm, 1), "");
5210 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
5211 emit_data->arg_count = 2;
5212 }
5213 }
5214
5215 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
5216 struct lp_build_tgsi_context *bld_base,
5217 struct lp_build_emit_data *emit_data)
5218 {
5219 struct si_shader_context *ctx = si_shader_context(bld_base);
5220 struct si_shader *shader = ctx->shader;
5221 struct gallivm_state *gallivm = bld_base->base.gallivm;
5222 LLVMValueRef interp_param;
5223 const struct tgsi_full_instruction *inst = emit_data->inst;
5224 const char *intr_name;
5225 int input_index = inst->Src[0].Register.Index;
5226 int chan;
5227 int i;
5228 LLVMValueRef attr_number;
5229 LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
5230 int interp_param_idx;
5231 unsigned interp = shader->selector->info.input_interpolate[input_index];
5232 unsigned location;
5233
5234 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
5235
5236 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5237 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
5238 location = TGSI_INTERPOLATE_LOC_CENTER;
5239 else
5240 location = TGSI_INTERPOLATE_LOC_CENTROID;
5241
5242 interp_param_idx = lookup_interp_param_index(interp, location);
5243 if (interp_param_idx == -1)
5244 return;
5245 else if (interp_param_idx)
5246 interp_param = get_interp_param(ctx, interp_param_idx);
5247 else
5248 interp_param = NULL;
5249
5250 attr_number = lp_build_const_int32(gallivm, input_index);
5251
5252 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
5253 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
5254 LLVMValueRef ij_out[2];
5255 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
5256
5257 /*
5258 * take the I then J parameters, and the DDX/Y for it, and
5259 * calculate the IJ inputs for the interpolator.
5260 * temp1 = ddx * offset/sample.x + I;
5261 * interp_param.I = ddy * offset/sample.y + temp1;
5262 * temp1 = ddx * offset/sample.x + J;
5263 * interp_param.J = ddy * offset/sample.y + temp1;
5264 */
5265 for (i = 0; i < 2; i++) {
5266 LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
5267 LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
5268 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
5269 ddxy_out, ix_ll, "");
5270 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
5271 ddxy_out, iy_ll, "");
5272 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
5273 interp_param, ix_ll, "");
5274 LLVMValueRef temp1, temp2;
5275
5276 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
5277 ctx->f32, "");
5278
5279 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
5280
5281 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
5282
5283 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
5284
5285 temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
5286
5287 ij_out[i] = LLVMBuildBitCast(gallivm->builder,
5288 temp2, ctx->i32, "");
5289 }
5290 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
5291 }
5292
5293 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
5294 for (chan = 0; chan < 2; chan++) {
5295 LLVMValueRef args[4];
5296 LLVMValueRef llvm_chan;
5297 unsigned schan;
5298
5299 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
5300 llvm_chan = lp_build_const_int32(gallivm, schan);
5301
5302 args[0] = llvm_chan;
5303 args[1] = attr_number;
5304 args[2] = params;
5305 args[3] = interp_param;
5306
5307 emit_data->output[chan] =
5308 lp_build_intrinsic(gallivm->builder, intr_name,
5309 ctx->f32, args, args[3] ? 4 : 3,
5310 LLVMReadNoneAttribute);
5311 }
5312 }
5313
5314 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5315 struct lp_build_emit_data *emit_data)
5316 {
5317 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5318 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5319 unsigned stream;
5320
5321 assert(src0.File == TGSI_FILE_IMMEDIATE);
5322
5323 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5324 return stream;
5325 }
5326
5327 /* Emit one vertex from the geometry shader */
5328 static void si_llvm_emit_vertex(
5329 const struct lp_build_tgsi_action *action,
5330 struct lp_build_tgsi_context *bld_base,
5331 struct lp_build_emit_data *emit_data)
5332 {
5333 struct si_shader_context *ctx = si_shader_context(bld_base);
5334 struct lp_build_context *uint = &bld_base->uint_bld;
5335 struct si_shader *shader = ctx->shader;
5336 struct tgsi_shader_info *info = &shader->selector->info;
5337 struct gallivm_state *gallivm = bld_base->base.gallivm;
5338 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
5339 SI_PARAM_GS2VS_OFFSET);
5340 LLVMValueRef gs_next_vertex;
5341 LLVMValueRef can_emit, kill;
5342 LLVMValueRef args[2];
5343 unsigned chan;
5344 int i;
5345 unsigned stream;
5346
5347 stream = si_llvm_get_stream(bld_base, emit_data);
5348
5349 /* Write vertex attribute values to GSVS ring */
5350 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
5351 ctx->gs_next_vertex[stream],
5352 "");
5353
5354 /* If this thread has already emitted the declared maximum number of
5355 * vertices, kill it: excessive vertex emissions are not supposed to
5356 * have any effect, and GS threads have no externally observable
5357 * effects other than emitting vertices.
5358 */
5359 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
5360 lp_build_const_int32(gallivm,
5361 shader->selector->gs_max_out_vertices), "");
5362 kill = lp_build_select(&bld_base->base, can_emit,
5363 lp_build_const_float(gallivm, 1.0f),
5364 lp_build_const_float(gallivm, -1.0f));
5365
5366 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
5367 ctx->voidt, &kill, 1, 0);
5368
5369 for (i = 0; i < info->num_outputs; i++) {
5370 LLVMValueRef *out_ptr =
5371 ctx->radeon_bld.soa.outputs[i];
5372
5373 for (chan = 0; chan < 4; chan++) {
5374 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
5375 LLVMValueRef voffset =
5376 lp_build_const_int32(gallivm, (i * 4 + chan) *
5377 shader->selector->gs_max_out_vertices);
5378
5379 voffset = lp_build_add(uint, voffset, gs_next_vertex);
5380 voffset = lp_build_mul_imm(uint, voffset, 4);
5381
5382 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
5383
5384 build_tbuffer_store(ctx,
5385 ctx->gsvs_ring[stream],
5386 out_val, 1,
5387 voffset, soffset, 0,
5388 V_008F0C_BUF_DATA_FORMAT_32,
5389 V_008F0C_BUF_NUM_FORMAT_UINT,
5390 1, 0, 1, 1, 0);
5391 }
5392 }
5393 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
5394 lp_build_const_int32(gallivm, 1));
5395
5396 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
5397
5398 /* Signal vertex emission */
5399 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
5400 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5401 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5402 ctx->voidt, args, 2, 0);
5403 }
5404
5405 /* Cut one primitive from the geometry shader */
5406 static void si_llvm_emit_primitive(
5407 const struct lp_build_tgsi_action *action,
5408 struct lp_build_tgsi_context *bld_base,
5409 struct lp_build_emit_data *emit_data)
5410 {
5411 struct si_shader_context *ctx = si_shader_context(bld_base);
5412 struct gallivm_state *gallivm = bld_base->base.gallivm;
5413 LLVMValueRef args[2];
5414 unsigned stream;
5415
5416 /* Signal primitive cut */
5417 stream = si_llvm_get_stream(bld_base, emit_data);
5418 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5419 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5420 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5421 ctx->voidt, args, 2, 0);
5422 }
5423
5424 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5425 struct lp_build_tgsi_context *bld_base,
5426 struct lp_build_emit_data *emit_data)
5427 {
5428 struct si_shader_context *ctx = si_shader_context(bld_base);
5429 struct gallivm_state *gallivm = bld_base->base.gallivm;
5430
5431 /* The real barrier instruction isn’t needed, because an entire patch
5432 * always fits into a single wave.
5433 */
5434 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5435 emit_optimization_barrier(ctx);
5436 return;
5437 }
5438
5439 lp_build_intrinsic(gallivm->builder,
5440 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5441 : "llvm.AMDGPU.barrier.local",
5442 ctx->voidt, NULL, 0, 0);
5443 }
5444
5445 static const struct lp_build_tgsi_action tex_action = {
5446 .fetch_args = tex_fetch_args,
5447 .emit = build_tex_intrinsic,
5448 };
5449
5450 static const struct lp_build_tgsi_action interp_action = {
5451 .fetch_args = interp_fetch_args,
5452 .emit = build_interp_intrinsic,
5453 };
5454
5455 static void si_create_function(struct si_shader_context *ctx,
5456 LLVMTypeRef *returns, unsigned num_returns,
5457 LLVMTypeRef *params, unsigned num_params,
5458 int last_sgpr)
5459 {
5460 int i;
5461
5462 radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
5463 params, num_params);
5464 radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
5465 ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
5466
5467 for (i = 0; i <= last_sgpr; ++i) {
5468 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
5469
5470 /* The combination of:
5471 * - ByVal
5472 * - dereferenceable
5473 * - invariant.load
5474 * allows the optimization passes to move loads and reduces
5475 * SGPR spilling significantly.
5476 */
5477 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
5478 LLVMAddAttribute(P, LLVMByValAttribute);
5479 lp_add_attr_dereferenceable(P, UINT64_MAX);
5480 } else
5481 LLVMAddAttribute(P, LLVMInRegAttribute);
5482 }
5483
5484 if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
5485 /* These were copied from some LLVM test. */
5486 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5487 "less-precise-fpmad",
5488 "true");
5489 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5490 "no-infs-fp-math",
5491 "true");
5492 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5493 "no-nans-fp-math",
5494 "true");
5495 LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
5496 "unsafe-fp-math",
5497 "true");
5498 }
5499 }
5500
5501 static void create_meta_data(struct si_shader_context *ctx)
5502 {
5503 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5504
5505 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5506 "invariant.load", 14);
5507 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5508 "range", 5);
5509 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5510 "amdgpu.uniform", 14);
5511
5512 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5513 }
5514
5515 static void declare_streamout_params(struct si_shader_context *ctx,
5516 struct pipe_stream_output_info *so,
5517 LLVMTypeRef *params, LLVMTypeRef i32,
5518 unsigned *num_params)
5519 {
5520 int i;
5521
5522 /* Streamout SGPRs. */
5523 if (so->num_outputs) {
5524 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5525 params[ctx->param_streamout_config = (*num_params)++] = i32;
5526 else
5527 ctx->param_streamout_config = ctx->param_tess_offchip;
5528
5529 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5530 }
5531 /* A streamout buffer offset is loaded if the stride is non-zero. */
5532 for (i = 0; i < 4; i++) {
5533 if (!so->stride[i])
5534 continue;
5535
5536 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5537 }
5538 }
5539
5540 static unsigned llvm_get_type_size(LLVMTypeRef type)
5541 {
5542 LLVMTypeKind kind = LLVMGetTypeKind(type);
5543
5544 switch (kind) {
5545 case LLVMIntegerTypeKind:
5546 return LLVMGetIntTypeWidth(type) / 8;
5547 case LLVMFloatTypeKind:
5548 return 4;
5549 case LLVMPointerTypeKind:
5550 return 8;
5551 case LLVMVectorTypeKind:
5552 return LLVMGetVectorSize(type) *
5553 llvm_get_type_size(LLVMGetElementType(type));
5554 default:
5555 assert(0);
5556 return 0;
5557 }
5558 }
5559
5560 static void declare_tess_lds(struct si_shader_context *ctx)
5561 {
5562 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5563 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5564 struct lp_build_context *uint = &bld_base->uint_bld;
5565
5566 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5567 ctx->lds = LLVMBuildIntToPtr(gallivm->builder, uint->zero,
5568 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
5569 "tess_lds");
5570 }
5571
5572 static void create_function(struct si_shader_context *ctx)
5573 {
5574 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5575 struct gallivm_state *gallivm = bld_base->base.gallivm;
5576 struct si_shader *shader = ctx->shader;
5577 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5578 LLVMTypeRef returns[16+32*4];
5579 unsigned i, last_sgpr, num_params, num_return_sgprs;
5580 unsigned num_returns = 0;
5581
5582 v3i32 = LLVMVectorType(ctx->i32, 3);
5583
5584 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5585 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5586 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5587 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5588 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5589
5590 switch (ctx->type) {
5591 case PIPE_SHADER_VERTEX:
5592 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5593 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5594 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5595 params[SI_PARAM_DRAWID] = ctx->i32;
5596 num_params = SI_PARAM_DRAWID+1;
5597
5598 if (shader->key.vs.as_es) {
5599 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5600 } else if (shader->key.vs.as_ls) {
5601 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5602 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5603 } else {
5604 if (ctx->is_gs_copy_shader) {
5605 num_params = SI_PARAM_RW_BUFFERS+1;
5606 } else {
5607 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5608 num_params = SI_PARAM_VS_STATE_BITS+1;
5609 }
5610
5611 /* The locations of the other parameters are assigned dynamically. */
5612 declare_streamout_params(ctx, &shader->selector->so,
5613 params, ctx->i32, &num_params);
5614 }
5615
5616 last_sgpr = num_params-1;
5617
5618 /* VGPRs */
5619 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5620 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5621 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5622 params[ctx->param_instance_id = num_params++] = ctx->i32;
5623
5624 if (!ctx->is_monolithic &&
5625 !ctx->is_gs_copy_shader) {
5626 /* Vertex load indices. */
5627 ctx->param_vertex_index0 = num_params;
5628
5629 for (i = 0; i < shader->selector->info.num_inputs; i++)
5630 params[num_params++] = ctx->i32;
5631
5632 /* PrimitiveID output. */
5633 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5634 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5635 returns[num_returns++] = ctx->f32;
5636 }
5637 break;
5638
5639 case PIPE_SHADER_TESS_CTRL:
5640 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5641 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5642 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5643 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5644 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5645 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5646 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5647
5648 /* VGPRs */
5649 params[SI_PARAM_PATCH_ID] = ctx->i32;
5650 params[SI_PARAM_REL_IDS] = ctx->i32;
5651 num_params = SI_PARAM_REL_IDS+1;
5652
5653 if (!ctx->is_monolithic) {
5654 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5655 * placed after the user SGPRs.
5656 */
5657 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5658 returns[num_returns++] = ctx->i32; /* SGPRs */
5659
5660 for (i = 0; i < 3; i++)
5661 returns[num_returns++] = ctx->f32; /* VGPRs */
5662 }
5663 break;
5664
5665 case PIPE_SHADER_TESS_EVAL:
5666 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5667 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5668
5669 if (shader->key.tes.as_es) {
5670 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5671 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5672 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5673 } else {
5674 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5675 declare_streamout_params(ctx, &shader->selector->so,
5676 params, ctx->i32, &num_params);
5677 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5678 }
5679 last_sgpr = num_params - 1;
5680
5681 /* VGPRs */
5682 params[ctx->param_tes_u = num_params++] = ctx->f32;
5683 params[ctx->param_tes_v = num_params++] = ctx->f32;
5684 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5685 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5686
5687 /* PrimitiveID output. */
5688 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5689 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5690 returns[num_returns++] = ctx->f32;
5691 break;
5692
5693 case PIPE_SHADER_GEOMETRY:
5694 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5695 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5696 last_sgpr = SI_PARAM_GS_WAVE_ID;
5697
5698 /* VGPRs */
5699 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5700 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5701 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5702 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5703 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5704 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5705 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5706 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5707 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5708 break;
5709
5710 case PIPE_SHADER_FRAGMENT:
5711 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5712 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5713 last_sgpr = SI_PARAM_PRIM_MASK;
5714 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5715 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5716 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5717 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5718 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5719 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5720 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5721 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5722 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5723 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5724 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5725 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5726 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5727 params[SI_PARAM_ANCILLARY] = ctx->i32;
5728 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5729 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5730 num_params = SI_PARAM_POS_FIXED_PT+1;
5731
5732 if (!ctx->is_monolithic) {
5733 /* Color inputs from the prolog. */
5734 if (shader->selector->info.colors_read) {
5735 unsigned num_color_elements =
5736 util_bitcount(shader->selector->info.colors_read);
5737
5738 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5739 for (i = 0; i < num_color_elements; i++)
5740 params[num_params++] = ctx->f32;
5741 }
5742
5743 /* Outputs for the epilog. */
5744 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5745 num_returns =
5746 num_return_sgprs +
5747 util_bitcount(shader->selector->info.colors_written) * 4 +
5748 shader->selector->info.writes_z +
5749 shader->selector->info.writes_stencil +
5750 shader->selector->info.writes_samplemask +
5751 1 /* SampleMaskIn */;
5752
5753 num_returns = MAX2(num_returns,
5754 num_return_sgprs +
5755 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5756
5757 for (i = 0; i < num_return_sgprs; i++)
5758 returns[i] = ctx->i32;
5759 for (; i < num_returns; i++)
5760 returns[i] = ctx->f32;
5761 }
5762 break;
5763
5764 case PIPE_SHADER_COMPUTE:
5765 params[SI_PARAM_GRID_SIZE] = v3i32;
5766 params[SI_PARAM_BLOCK_ID] = v3i32;
5767 last_sgpr = SI_PARAM_BLOCK_ID;
5768
5769 params[SI_PARAM_THREAD_ID] = v3i32;
5770 num_params = SI_PARAM_THREAD_ID + 1;
5771 break;
5772 default:
5773 assert(0 && "unimplemented shader");
5774 return;
5775 }
5776
5777 assert(num_params <= ARRAY_SIZE(params));
5778
5779 si_create_function(ctx, returns, num_returns, params,
5780 num_params, last_sgpr);
5781
5782 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5783 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5784 !ctx->is_monolithic) {
5785 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5786 "InitialPSInputAddr",
5787 S_0286D0_PERSP_SAMPLE_ENA(1) |
5788 S_0286D0_PERSP_CENTER_ENA(1) |
5789 S_0286D0_PERSP_CENTROID_ENA(1) |
5790 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5791 S_0286D0_LINEAR_CENTER_ENA(1) |
5792 S_0286D0_LINEAR_CENTROID_ENA(1) |
5793 S_0286D0_FRONT_FACE_ENA(1) |
5794 S_0286D0_POS_FIXED_PT_ENA(1));
5795 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5796 const unsigned *properties = shader->selector->info.properties;
5797 unsigned max_work_group_size =
5798 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5799 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5800 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5801
5802 assert(max_work_group_size);
5803
5804 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5805 "amdgpu-max-work-group-size",
5806 max_work_group_size);
5807 }
5808
5809 shader->info.num_input_sgprs = 0;
5810 shader->info.num_input_vgprs = 0;
5811
5812 for (i = 0; i <= last_sgpr; ++i)
5813 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5814
5815 /* Unused fragment shader inputs are eliminated by the compiler,
5816 * so we don't know yet how many there will be.
5817 */
5818 if (ctx->type != PIPE_SHADER_FRAGMENT)
5819 for (; i < num_params; ++i)
5820 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5821
5822 if (bld_base->info &&
5823 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5824 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5825 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5826 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5827 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5828 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5829 ctx->lds =
5830 LLVMAddGlobalInAddressSpace(gallivm->module,
5831 LLVMArrayType(ctx->i32, 64),
5832 "ddxy_lds",
5833 LOCAL_ADDR_SPACE);
5834
5835 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5836 ctx->type == PIPE_SHADER_TESS_CTRL ||
5837 ctx->type == PIPE_SHADER_TESS_EVAL)
5838 declare_tess_lds(ctx);
5839 }
5840
5841 static void preload_constant_buffers(struct si_shader_context *ctx)
5842 {
5843 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5844 struct gallivm_state *gallivm = bld_base->base.gallivm;
5845 const struct tgsi_shader_info *info = bld_base->info;
5846 unsigned buf;
5847 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5848
5849 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5850 if (info->const_file_max[buf] == -1)
5851 continue;
5852
5853 /* Load the resource descriptor */
5854 ctx->const_buffers[buf] =
5855 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5856 }
5857 }
5858
5859 /**
5860 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5861 * for later use.
5862 */
5863 static void preload_ring_buffers(struct si_shader_context *ctx)
5864 {
5865 struct gallivm_state *gallivm =
5866 ctx->radeon_bld.soa.bld_base.base.gallivm;
5867
5868 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5869 SI_PARAM_RW_BUFFERS);
5870
5871 if ((ctx->type == PIPE_SHADER_VERTEX &&
5872 ctx->shader->key.vs.as_es) ||
5873 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5874 ctx->shader->key.tes.as_es) ||
5875 ctx->type == PIPE_SHADER_GEOMETRY) {
5876 unsigned ring =
5877 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5878 : SI_ES_RING_ESGS;
5879 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5880
5881 ctx->esgs_ring =
5882 build_indexed_load_const(ctx, buf_ptr, offset);
5883 }
5884
5885 if (ctx->is_gs_copy_shader) {
5886 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5887
5888 ctx->gsvs_ring[0] =
5889 build_indexed_load_const(ctx, buf_ptr, offset);
5890 }
5891 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5892 int i;
5893 for (i = 0; i < 4; i++) {
5894 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5895
5896 ctx->gsvs_ring[i] =
5897 build_indexed_load_const(ctx, buf_ptr, offset);
5898 }
5899 }
5900 }
5901
5902 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5903 LLVMValueRef param_rw_buffers,
5904 unsigned param_pos_fixed_pt)
5905 {
5906 struct lp_build_tgsi_context *bld_base =
5907 &ctx->radeon_bld.soa.bld_base;
5908 struct gallivm_state *gallivm = bld_base->base.gallivm;
5909 LLVMBuilderRef builder = gallivm->builder;
5910 LLVMValueRef slot, desc, offset, row, bit, address[2];
5911
5912 /* Use the fixed-point gl_FragCoord input.
5913 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5914 * per coordinate to get the repeating effect.
5915 */
5916 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5917 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5918
5919 /* Load the buffer descriptor. */
5920 slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
5921 desc = build_indexed_load_const(ctx, param_rw_buffers, slot);
5922
5923 /* The stipple pattern is 32x32, each row has 32 bits. */
5924 offset = LLVMBuildMul(builder, address[1],
5925 LLVMConstInt(ctx->i32, 4, 0), "");
5926 row = buffer_load_const(ctx, desc, offset);
5927 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
5928 bit = LLVMBuildLShr(builder, row, address[0], "");
5929 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5930
5931 /* The intrinsic kills the thread if arg < 0. */
5932 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
5933 LLVMConstReal(ctx->f32, -1), "");
5934 lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
5935 }
5936
5937 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5938 struct si_shader_config *conf,
5939 unsigned symbol_offset)
5940 {
5941 unsigned i;
5942 const unsigned char *config =
5943 radeon_shader_binary_config_start(binary, symbol_offset);
5944 bool really_needs_scratch = false;
5945
5946 /* LLVM adds SGPR spills to the scratch size.
5947 * Find out if we really need the scratch buffer.
5948 */
5949 for (i = 0; i < binary->reloc_count; i++) {
5950 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5951
5952 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5953 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5954 really_needs_scratch = true;
5955 break;
5956 }
5957 }
5958
5959 /* XXX: We may be able to emit some of these values directly rather than
5960 * extracting fields to be emitted later.
5961 */
5962
5963 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5964 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5965 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5966 switch (reg) {
5967 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5968 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5969 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5970 case R_00B848_COMPUTE_PGM_RSRC1:
5971 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5972 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5973 conf->float_mode = G_00B028_FLOAT_MODE(value);
5974 conf->rsrc1 = value;
5975 break;
5976 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5977 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5978 break;
5979 case R_00B84C_COMPUTE_PGM_RSRC2:
5980 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5981 conf->rsrc2 = value;
5982 break;
5983 case R_0286CC_SPI_PS_INPUT_ENA:
5984 conf->spi_ps_input_ena = value;
5985 break;
5986 case R_0286D0_SPI_PS_INPUT_ADDR:
5987 conf->spi_ps_input_addr = value;
5988 break;
5989 case R_0286E8_SPI_TMPRING_SIZE:
5990 case R_00B860_COMPUTE_TMPRING_SIZE:
5991 /* WAVESIZE is in units of 256 dwords. */
5992 if (really_needs_scratch)
5993 conf->scratch_bytes_per_wave =
5994 G_00B860_WAVESIZE(value) * 256 * 4;
5995 break;
5996 case 0x4: /* SPILLED_SGPRS */
5997 conf->spilled_sgprs = value;
5998 break;
5999 case 0x8: /* SPILLED_VGPRS */
6000 conf->spilled_vgprs = value;
6001 break;
6002 default:
6003 {
6004 static bool printed;
6005
6006 if (!printed) {
6007 fprintf(stderr, "Warning: LLVM emitted unknown "
6008 "config register: 0x%x\n", reg);
6009 printed = true;
6010 }
6011 }
6012 break;
6013 }
6014 }
6015
6016 if (!conf->spi_ps_input_addr)
6017 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
6018 }
6019
6020 void si_shader_apply_scratch_relocs(struct si_context *sctx,
6021 struct si_shader *shader,
6022 struct si_shader_config *config,
6023 uint64_t scratch_va)
6024 {
6025 unsigned i;
6026 uint32_t scratch_rsrc_dword0 = scratch_va;
6027 uint32_t scratch_rsrc_dword1 =
6028 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6029
6030 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6031 * correctly.
6032 */
6033 if (HAVE_LLVM >= 0x0309)
6034 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6035 else
6036 scratch_rsrc_dword1 |=
6037 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6038
6039 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6040 const struct radeon_shader_reloc *reloc =
6041 &shader->binary.relocs[i];
6042 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6043 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6044 &scratch_rsrc_dword0, 4);
6045 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6046 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6047 &scratch_rsrc_dword1, 4);
6048 }
6049 }
6050 }
6051
6052 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6053 {
6054 unsigned size = shader->binary.code_size;
6055
6056 if (shader->prolog)
6057 size += shader->prolog->binary.code_size;
6058 if (shader->epilog)
6059 size += shader->epilog->binary.code_size;
6060 return size;
6061 }
6062
6063 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6064 {
6065 const struct radeon_shader_binary *prolog =
6066 shader->prolog ? &shader->prolog->binary : NULL;
6067 const struct radeon_shader_binary *epilog =
6068 shader->epilog ? &shader->epilog->binary : NULL;
6069 const struct radeon_shader_binary *mainb = &shader->binary;
6070 unsigned bo_size = si_get_shader_binary_size(shader) +
6071 (!epilog ? mainb->rodata_size : 0);
6072 unsigned char *ptr;
6073
6074 assert(!prolog || !prolog->rodata_size);
6075 assert((!prolog && !epilog) || !mainb->rodata_size);
6076 assert(!epilog || !epilog->rodata_size);
6077
6078 r600_resource_reference(&shader->bo, NULL);
6079 shader->bo = si_resource_create_custom(&sscreen->b.b,
6080 PIPE_USAGE_IMMUTABLE,
6081 bo_size);
6082 if (!shader->bo)
6083 return -ENOMEM;
6084
6085 /* Upload. */
6086 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6087 PIPE_TRANSFER_READ_WRITE);
6088
6089 if (prolog) {
6090 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6091 ptr += prolog->code_size;
6092 }
6093
6094 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6095 ptr += mainb->code_size;
6096
6097 if (epilog)
6098 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6099 else if (mainb->rodata_size > 0)
6100 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6101
6102 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6103 return 0;
6104 }
6105
6106 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
6107 struct pipe_debug_callback *debug,
6108 const char *name, FILE *file)
6109 {
6110 char *line, *p;
6111 unsigned i, count;
6112
6113 if (binary->disasm_string) {
6114 fprintf(file, "Shader %s disassembly:\n", name);
6115 fprintf(file, "%s", binary->disasm_string);
6116
6117 if (debug && debug->debug_message) {
6118 /* Very long debug messages are cut off, so send the
6119 * disassembly one line at a time. This causes more
6120 * overhead, but on the plus side it simplifies
6121 * parsing of resulting logs.
6122 */
6123 pipe_debug_message(debug, SHADER_INFO,
6124 "Shader Disassembly Begin");
6125
6126 line = binary->disasm_string;
6127 while (*line) {
6128 p = util_strchrnul(line, '\n');
6129 count = p - line;
6130
6131 if (count) {
6132 pipe_debug_message(debug, SHADER_INFO,
6133 "%.*s", count, line);
6134 }
6135
6136 if (!*p)
6137 break;
6138 line = p + 1;
6139 }
6140
6141 pipe_debug_message(debug, SHADER_INFO,
6142 "Shader Disassembly End");
6143 }
6144 } else {
6145 fprintf(file, "Shader %s binary:\n", name);
6146 for (i = 0; i < binary->code_size; i += 4) {
6147 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6148 binary->code[i + 3], binary->code[i + 2],
6149 binary->code[i + 1], binary->code[i]);
6150 }
6151 }
6152 }
6153
6154 static void si_shader_dump_stats(struct si_screen *sscreen,
6155 struct si_shader_config *conf,
6156 unsigned num_inputs,
6157 unsigned code_size,
6158 struct pipe_debug_callback *debug,
6159 unsigned processor,
6160 FILE *file)
6161 {
6162 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6163 unsigned lds_per_wave = 0;
6164 unsigned max_simd_waves = 10;
6165
6166 /* Compute LDS usage for PS. */
6167 if (processor == PIPE_SHADER_FRAGMENT) {
6168 /* The minimum usage per wave is (num_inputs * 48). The maximum
6169 * usage is (num_inputs * 48 * 16).
6170 * We can get anything in between and it varies between waves.
6171 *
6172 * The 48 bytes per input for a single primitive is equal to
6173 * 4 bytes/component * 4 components/input * 3 points.
6174 *
6175 * Other stages don't know the size at compile time or don't
6176 * allocate LDS per wave, but instead they do it per thread group.
6177 */
6178 lds_per_wave = conf->lds_size * lds_increment +
6179 align(num_inputs * 48, lds_increment);
6180 }
6181
6182 /* Compute the per-SIMD wave counts. */
6183 if (conf->num_sgprs) {
6184 if (sscreen->b.chip_class >= VI)
6185 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6186 else
6187 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6188 }
6189
6190 if (conf->num_vgprs)
6191 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6192
6193 /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
6194 * that PS can use.
6195 */
6196 if (lds_per_wave)
6197 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6198
6199 if (file != stderr ||
6200 r600_can_dump_shader(&sscreen->b, processor)) {
6201 if (processor == PIPE_SHADER_FRAGMENT) {
6202 fprintf(file, "*** SHADER CONFIG ***\n"
6203 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6204 "SPI_PS_INPUT_ENA = 0x%04x\n",
6205 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6206 }
6207
6208 fprintf(file, "*** SHADER STATS ***\n"
6209 "SGPRS: %d\n"
6210 "VGPRS: %d\n"
6211 "Spilled SGPRs: %d\n"
6212 "Spilled VGPRs: %d\n"
6213 "Code Size: %d bytes\n"
6214 "LDS: %d blocks\n"
6215 "Scratch: %d bytes per wave\n"
6216 "Max Waves: %d\n"
6217 "********************\n\n\n",
6218 conf->num_sgprs, conf->num_vgprs,
6219 conf->spilled_sgprs, conf->spilled_vgprs, code_size,
6220 conf->lds_size, conf->scratch_bytes_per_wave,
6221 max_simd_waves);
6222 }
6223
6224 pipe_debug_message(debug, SHADER_INFO,
6225 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6226 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6227 "Spilled VGPRs: %d",
6228 conf->num_sgprs, conf->num_vgprs, code_size,
6229 conf->lds_size, conf->scratch_bytes_per_wave,
6230 max_simd_waves, conf->spilled_sgprs,
6231 conf->spilled_vgprs);
6232 }
6233
6234 static const char *si_get_shader_name(struct si_shader *shader,
6235 unsigned processor)
6236 {
6237 switch (processor) {
6238 case PIPE_SHADER_VERTEX:
6239 if (shader->key.vs.as_es)
6240 return "Vertex Shader as ES";
6241 else if (shader->key.vs.as_ls)
6242 return "Vertex Shader as LS";
6243 else
6244 return "Vertex Shader as VS";
6245 case PIPE_SHADER_TESS_CTRL:
6246 return "Tessellation Control Shader";
6247 case PIPE_SHADER_TESS_EVAL:
6248 if (shader->key.tes.as_es)
6249 return "Tessellation Evaluation Shader as ES";
6250 else
6251 return "Tessellation Evaluation Shader as VS";
6252 case PIPE_SHADER_GEOMETRY:
6253 if (shader->gs_copy_shader == NULL)
6254 return "GS Copy Shader as VS";
6255 else
6256 return "Geometry Shader";
6257 case PIPE_SHADER_FRAGMENT:
6258 return "Pixel Shader";
6259 case PIPE_SHADER_COMPUTE:
6260 return "Compute Shader";
6261 default:
6262 return "Unknown Shader";
6263 }
6264 }
6265
6266 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6267 struct pipe_debug_callback *debug, unsigned processor,
6268 FILE *file)
6269 {
6270 if (file != stderr ||
6271 r600_can_dump_shader(&sscreen->b, processor))
6272 si_dump_shader_key(processor, &shader->key, file);
6273
6274 if (file != stderr && shader->binary.llvm_ir_string) {
6275 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6276 si_get_shader_name(shader, processor));
6277 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6278 }
6279
6280 if (file != stderr ||
6281 (r600_can_dump_shader(&sscreen->b, processor) &&
6282 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6283 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6284
6285 if (shader->prolog)
6286 si_shader_dump_disassembly(&shader->prolog->binary,
6287 debug, "prolog", file);
6288
6289 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6290
6291 if (shader->epilog)
6292 si_shader_dump_disassembly(&shader->epilog->binary,
6293 debug, "epilog", file);
6294 fprintf(file, "\n");
6295 }
6296
6297 si_shader_dump_stats(sscreen, &shader->config,
6298 shader->selector ? shader->selector->info.num_inputs : 0,
6299 si_get_shader_binary_size(shader), debug, processor,
6300 file);
6301 }
6302
6303 int si_compile_llvm(struct si_screen *sscreen,
6304 struct radeon_shader_binary *binary,
6305 struct si_shader_config *conf,
6306 LLVMTargetMachineRef tm,
6307 LLVMModuleRef mod,
6308 struct pipe_debug_callback *debug,
6309 unsigned processor,
6310 const char *name)
6311 {
6312 int r = 0;
6313 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6314
6315 if (r600_can_dump_shader(&sscreen->b, processor)) {
6316 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6317
6318 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6319 fprintf(stderr, "%s LLVM IR:\n\n", name);
6320 LLVMDumpModule(mod);
6321 fprintf(stderr, "\n");
6322 }
6323 }
6324
6325 if (sscreen->record_llvm_ir) {
6326 char *ir = LLVMPrintModuleToString(mod);
6327 binary->llvm_ir_string = strdup(ir);
6328 LLVMDisposeMessage(ir);
6329 }
6330
6331 if (!si_replace_shader(count, binary)) {
6332 r = radeon_llvm_compile(mod, binary, tm, debug);
6333 if (r)
6334 return r;
6335 }
6336
6337 si_shader_binary_read_config(binary, conf, 0);
6338
6339 /* Enable 64-bit and 16-bit denormals, because there is no performance
6340 * cost.
6341 *
6342 * If denormals are enabled, all floating-point output modifiers are
6343 * ignored.
6344 *
6345 * Don't enable denormals for 32-bit floats, because:
6346 * - Floating-point output modifiers would be ignored by the hw.
6347 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6348 * have to stop using those.
6349 * - SI & CI would be very slow.
6350 */
6351 conf->float_mode |= V_00B028_FP_64_DENORMS;
6352
6353 FREE(binary->config);
6354 FREE(binary->global_symbol_offsets);
6355 binary->config = NULL;
6356 binary->global_symbol_offsets = NULL;
6357
6358 /* Some shaders can't have rodata because their binaries can be
6359 * concatenated.
6360 */
6361 if (binary->rodata_size &&
6362 (processor == PIPE_SHADER_VERTEX ||
6363 processor == PIPE_SHADER_TESS_CTRL ||
6364 processor == PIPE_SHADER_TESS_EVAL ||
6365 processor == PIPE_SHADER_FRAGMENT)) {
6366 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6367 return -EINVAL;
6368 }
6369
6370 return r;
6371 }
6372
6373 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6374 {
6375 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6376 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6377 else
6378 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6379 }
6380
6381 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6382 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6383 struct si_shader_context *ctx,
6384 struct si_shader *gs,
6385 struct pipe_debug_callback *debug)
6386 {
6387 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6388 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6389 struct lp_build_context *uint = &bld_base->uint_bld;
6390 struct si_shader_output_values *outputs;
6391 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6392 LLVMValueRef args[9];
6393 int i, r;
6394
6395 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6396
6397 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6398 ctx->type = PIPE_SHADER_VERTEX;
6399 ctx->is_gs_copy_shader = true;
6400
6401 create_meta_data(ctx);
6402 create_function(ctx);
6403 preload_ring_buffers(ctx);
6404
6405 args[0] = ctx->gsvs_ring[0];
6406 args[1] = lp_build_mul_imm(uint,
6407 LLVMGetParam(ctx->radeon_bld.main_fn,
6408 ctx->param_vertex_id),
6409 4);
6410 args[3] = uint->zero;
6411 args[4] = uint->one; /* OFFEN */
6412 args[5] = uint->zero; /* IDXEN */
6413 args[6] = uint->one; /* GLC */
6414 args[7] = uint->one; /* SLC */
6415 args[8] = uint->zero; /* TFE */
6416
6417 /* Fetch vertex data from GSVS ring */
6418 for (i = 0; i < gsinfo->num_outputs; ++i) {
6419 unsigned chan;
6420
6421 outputs[i].name = gsinfo->output_semantic_name[i];
6422 outputs[i].sid = gsinfo->output_semantic_index[i];
6423
6424 for (chan = 0; chan < 4; chan++) {
6425 args[2] = lp_build_const_int32(gallivm,
6426 (i * 4 + chan) *
6427 gs->selector->gs_max_out_vertices * 16 * 4);
6428
6429 outputs[i].values[chan] =
6430 LLVMBuildBitCast(gallivm->builder,
6431 lp_build_intrinsic(gallivm->builder,
6432 "llvm.SI.buffer.load.dword.i32.i32",
6433 ctx->i32, args, 9,
6434 LLVMReadOnlyAttribute),
6435 ctx->f32, "");
6436 }
6437 }
6438
6439 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6440
6441 LLVMBuildRetVoid(gallivm->builder);
6442
6443 /* Dump LLVM IR before any optimization passes */
6444 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6445 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6446 LLVMDumpModule(bld_base->base.gallivm->module);
6447
6448 radeon_llvm_finalize_module(&ctx->radeon_bld);
6449
6450 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6451 &ctx->shader->config, ctx->tm,
6452 bld_base->base.gallivm->module,
6453 debug, PIPE_SHADER_GEOMETRY,
6454 "GS Copy Shader");
6455 if (!r) {
6456 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6457 fprintf(stderr, "GS Copy Shader:\n");
6458 si_shader_dump(sscreen, ctx->shader, debug,
6459 PIPE_SHADER_GEOMETRY, stderr);
6460 r = si_shader_binary_upload(sscreen, ctx->shader);
6461 }
6462
6463 radeon_llvm_dispose(&ctx->radeon_bld);
6464
6465 FREE(outputs);
6466 return r;
6467 }
6468
6469 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
6470 FILE *f)
6471 {
6472 int i;
6473
6474 fprintf(f, "SHADER KEY\n");
6475
6476 switch (shader) {
6477 case PIPE_SHADER_VERTEX:
6478 fprintf(f, " instance_divisors = {");
6479 for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
6480 fprintf(f, !i ? "%u" : ", %u",
6481 key->vs.prolog.instance_divisors[i]);
6482 fprintf(f, "}\n");
6483 fprintf(f, " as_es = %u\n", key->vs.as_es);
6484 fprintf(f, " as_ls = %u\n", key->vs.as_ls);
6485 fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
6486 break;
6487
6488 case PIPE_SHADER_TESS_CTRL:
6489 fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
6490 break;
6491
6492 case PIPE_SHADER_TESS_EVAL:
6493 fprintf(f, " as_es = %u\n", key->tes.as_es);
6494 fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
6495 break;
6496
6497 case PIPE_SHADER_GEOMETRY:
6498 case PIPE_SHADER_COMPUTE:
6499 break;
6500
6501 case PIPE_SHADER_FRAGMENT:
6502 fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
6503 fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
6504 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
6505 fprintf(f, " prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
6506 fprintf(f, " prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
6507 fprintf(f, " prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
6508 fprintf(f, " prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
6509 fprintf(f, " prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
6510 fprintf(f, " prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
6511 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
6512 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
6513 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
6514 fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
6515 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
6516 fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
6517 fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
6518 break;
6519
6520 default:
6521 assert(0);
6522 }
6523 }
6524
6525 static void si_init_shader_ctx(struct si_shader_context *ctx,
6526 struct si_screen *sscreen,
6527 struct si_shader *shader,
6528 LLVMTargetMachineRef tm)
6529 {
6530 struct lp_build_tgsi_context *bld_base;
6531 struct lp_build_tgsi_action tmpl = {};
6532
6533 memset(ctx, 0, sizeof(*ctx));
6534 radeon_llvm_context_init(
6535 &ctx->radeon_bld, "amdgcn--",
6536 (shader && shader->selector) ? &shader->selector->info : NULL,
6537 (shader && shader->selector) ? shader->selector->tokens : NULL);
6538 ctx->tm = tm;
6539 ctx->screen = sscreen;
6540 if (shader && shader->selector)
6541 ctx->type = shader->selector->info.processor;
6542 else
6543 ctx->type = -1;
6544 ctx->shader = shader;
6545
6546 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6547 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6548 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6549 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6550 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6551 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6552 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6553 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6554 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6555 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6556 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6557 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6558
6559 bld_base = &ctx->radeon_bld.soa.bld_base;
6560 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6561
6562 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6563 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6564 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6565
6566 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6567 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6568 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6569 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6570 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6571 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6572 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6573 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6574 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6575 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6576 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6577 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6578 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6579 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6580
6581 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6582 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6583 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6584 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6585 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6586 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6587
6588 tmpl.fetch_args = atomic_fetch_args;
6589 tmpl.emit = atomic_emit;
6590 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6591 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6592 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6593 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6594 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6595 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6596 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6597 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6598 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6599 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6600 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6601 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6602 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6603 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6604 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6605 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6606 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6607 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6608 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6609 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6610
6611 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6612
6613 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6614 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6615 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6616 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6617
6618 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6619 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6620 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6621
6622 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
6623 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
6624 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
6625 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
6626 }
6627
6628 int si_compile_tgsi_shader(struct si_screen *sscreen,
6629 LLVMTargetMachineRef tm,
6630 struct si_shader *shader,
6631 bool is_monolithic,
6632 struct pipe_debug_callback *debug)
6633 {
6634 struct si_shader_selector *sel = shader->selector;
6635 struct si_shader_context ctx;
6636 struct lp_build_tgsi_context *bld_base;
6637 LLVMModuleRef mod;
6638 int r = 0;
6639
6640 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6641 * conversion fails. */
6642 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6643 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6644 tgsi_dump(sel->tokens, 0);
6645 si_dump_streamout(&sel->so);
6646 }
6647
6648 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6649 ctx.is_monolithic = is_monolithic;
6650
6651 shader->info.uses_instanceid = sel->info.uses_instanceid;
6652
6653 bld_base = &ctx.radeon_bld.soa.bld_base;
6654 ctx.radeon_bld.load_system_value = declare_system_value;
6655
6656 switch (ctx.type) {
6657 case PIPE_SHADER_VERTEX:
6658 ctx.radeon_bld.load_input = declare_input_vs;
6659 if (shader->key.vs.as_ls)
6660 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6661 else if (shader->key.vs.as_es)
6662 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6663 else
6664 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6665 break;
6666 case PIPE_SHADER_TESS_CTRL:
6667 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6668 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6669 bld_base->emit_store = store_output_tcs;
6670 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6671 break;
6672 case PIPE_SHADER_TESS_EVAL:
6673 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6674 if (shader->key.tes.as_es)
6675 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6676 else
6677 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6678 break;
6679 case PIPE_SHADER_GEOMETRY:
6680 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6681 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6682 break;
6683 case PIPE_SHADER_FRAGMENT:
6684 ctx.radeon_bld.load_input = declare_input_fs;
6685 if (is_monolithic)
6686 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6687 else
6688 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6689 break;
6690 case PIPE_SHADER_COMPUTE:
6691 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6692 break;
6693 default:
6694 assert(!"Unsupported shader type");
6695 return -1;
6696 }
6697
6698 create_meta_data(&ctx);
6699 create_function(&ctx);
6700 preload_constant_buffers(&ctx);
6701 preload_ring_buffers(&ctx);
6702
6703 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6704 shader->key.ps.prolog.poly_stipple) {
6705 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6706 SI_PARAM_RW_BUFFERS);
6707 si_llvm_emit_polygon_stipple(&ctx, list,
6708 SI_PARAM_POS_FIXED_PT);
6709 }
6710
6711 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6712 int i;
6713 for (i = 0; i < 4; i++) {
6714 ctx.gs_next_vertex[i] =
6715 lp_build_alloca(bld_base->base.gallivm,
6716 ctx.i32, "");
6717 }
6718 }
6719
6720 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6721 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6722 goto out;
6723 }
6724
6725 si_llvm_build_ret(&ctx, ctx.return_value);
6726 mod = bld_base->base.gallivm->module;
6727
6728 /* Dump LLVM IR before any optimization passes */
6729 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6730 r600_can_dump_shader(&sscreen->b, ctx.type))
6731 LLVMDumpModule(mod);
6732
6733 radeon_llvm_finalize_module(&ctx.radeon_bld);
6734
6735 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6736 mod, debug, ctx.type, "TGSI shader");
6737 if (r) {
6738 fprintf(stderr, "LLVM failed to compile shader\n");
6739 goto out;
6740 }
6741
6742 radeon_llvm_dispose(&ctx.radeon_bld);
6743
6744 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6745 * LLVM 3.9svn has this bug.
6746 */
6747 if (sel->type == PIPE_SHADER_COMPUTE) {
6748 unsigned *props = sel->info.properties;
6749 unsigned wave_size = 64;
6750 unsigned max_vgprs = 256;
6751 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6752 unsigned max_sgprs_per_wave = 128;
6753 unsigned min_waves_per_cu =
6754 DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
6755 props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
6756 props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
6757 wave_size);
6758 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6759
6760 max_vgprs = max_vgprs / min_waves_per_simd;
6761 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6762
6763 if (shader->config.num_sgprs > max_sgprs ||
6764 shader->config.num_vgprs > max_vgprs) {
6765 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6766 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6767 shader->config.num_sgprs, shader->config.num_vgprs,
6768 max_sgprs, max_vgprs);
6769
6770 /* Just terminate the process, because dependent
6771 * shaders can hang due to bad input data, but use
6772 * the env var to allow shader-db to work.
6773 */
6774 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6775 abort();
6776 }
6777 }
6778
6779 /* Add the scratch offset to input SGPRs. */
6780 if (shader->config.scratch_bytes_per_wave)
6781 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6782
6783 /* Calculate the number of fragment input VGPRs. */
6784 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6785 shader->info.num_input_vgprs = 0;
6786 shader->info.face_vgpr_index = -1;
6787
6788 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6789 shader->info.num_input_vgprs += 2;
6790 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6791 shader->info.num_input_vgprs += 2;
6792 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6793 shader->info.num_input_vgprs += 2;
6794 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6795 shader->info.num_input_vgprs += 3;
6796 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6797 shader->info.num_input_vgprs += 2;
6798 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6799 shader->info.num_input_vgprs += 2;
6800 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6801 shader->info.num_input_vgprs += 2;
6802 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6803 shader->info.num_input_vgprs += 1;
6804 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6805 shader->info.num_input_vgprs += 1;
6806 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6807 shader->info.num_input_vgprs += 1;
6808 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6809 shader->info.num_input_vgprs += 1;
6810 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6811 shader->info.num_input_vgprs += 1;
6812 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6813 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6814 shader->info.num_input_vgprs += 1;
6815 }
6816 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6817 shader->info.num_input_vgprs += 1;
6818 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6819 shader->info.num_input_vgprs += 1;
6820 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6821 shader->info.num_input_vgprs += 1;
6822 }
6823
6824 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6825 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6826 shader->gs_copy_shader->selector = shader->selector;
6827 ctx.shader = shader->gs_copy_shader;
6828 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6829 shader, debug))) {
6830 free(shader->gs_copy_shader);
6831 shader->gs_copy_shader = NULL;
6832 goto out;
6833 }
6834 }
6835
6836 out:
6837 return r;
6838 }
6839
6840 /**
6841 * Create, compile and return a shader part (prolog or epilog).
6842 *
6843 * \param sscreen screen
6844 * \param list list of shader parts of the same category
6845 * \param key shader part key
6846 * \param tm LLVM target machine
6847 * \param debug debug callback
6848 * \param compile the callback responsible for compilation
6849 * \return non-NULL on success
6850 */
6851 static struct si_shader_part *
6852 si_get_shader_part(struct si_screen *sscreen,
6853 struct si_shader_part **list,
6854 union si_shader_part_key *key,
6855 LLVMTargetMachineRef tm,
6856 struct pipe_debug_callback *debug,
6857 bool (*compile)(struct si_screen *,
6858 LLVMTargetMachineRef,
6859 struct pipe_debug_callback *,
6860 struct si_shader_part *))
6861 {
6862 struct si_shader_part *result;
6863
6864 pipe_mutex_lock(sscreen->shader_parts_mutex);
6865
6866 /* Find existing. */
6867 for (result = *list; result; result = result->next) {
6868 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6869 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6870 return result;
6871 }
6872 }
6873
6874 /* Compile a new one. */
6875 result = CALLOC_STRUCT(si_shader_part);
6876 result->key = *key;
6877 if (!compile(sscreen, tm, debug, result)) {
6878 FREE(result);
6879 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6880 return NULL;
6881 }
6882
6883 result->next = *list;
6884 *list = result;
6885 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6886 return result;
6887 }
6888
6889 /**
6890 * Create a vertex shader prolog.
6891 *
6892 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6893 * All inputs are returned unmodified. The vertex load indices are
6894 * stored after them, which will used by the API VS for fetching inputs.
6895 *
6896 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6897 * input_v0,
6898 * input_v1,
6899 * input_v2,
6900 * input_v3,
6901 * (VertexID + BaseVertex),
6902 * (InstanceID + StartInstance),
6903 * (InstanceID / 2 + StartInstance)
6904 */
6905 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6906 LLVMTargetMachineRef tm,
6907 struct pipe_debug_callback *debug,
6908 struct si_shader_part *out)
6909 {
6910 union si_shader_part_key *key = &out->key;
6911 struct si_shader shader = {};
6912 struct si_shader_context ctx;
6913 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6914 LLVMTypeRef *params, *returns;
6915 LLVMValueRef ret, func;
6916 int last_sgpr, num_params, num_returns, i;
6917 bool status = true;
6918
6919 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6920 ctx.type = PIPE_SHADER_VERTEX;
6921 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6922 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6923
6924 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6925 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6926 sizeof(LLVMTypeRef));
6927 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6928 key->vs_prolog.last_input + 1) *
6929 sizeof(LLVMTypeRef));
6930 num_params = 0;
6931 num_returns = 0;
6932
6933 /* Declare input and output SGPRs. */
6934 num_params = 0;
6935 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6936 params[num_params++] = ctx.i32;
6937 returns[num_returns++] = ctx.i32;
6938 }
6939 last_sgpr = num_params - 1;
6940
6941 /* 4 preloaded VGPRs (outputs must be floats) */
6942 for (i = 0; i < 4; i++) {
6943 params[num_params++] = ctx.i32;
6944 returns[num_returns++] = ctx.f32;
6945 }
6946
6947 /* Vertex load indices. */
6948 for (i = 0; i <= key->vs_prolog.last_input; i++)
6949 returns[num_returns++] = ctx.f32;
6950
6951 /* Create the function. */
6952 si_create_function(&ctx, returns, num_returns, params,
6953 num_params, last_sgpr);
6954 func = ctx.radeon_bld.main_fn;
6955
6956 /* Copy inputs to outputs. This should be no-op, as the registers match,
6957 * but it will prevent the compiler from overwriting them unintentionally.
6958 */
6959 ret = ctx.return_value;
6960 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6961 LLVMValueRef p = LLVMGetParam(func, i);
6962 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6963 }
6964 for (i = num_params - 4; i < num_params; i++) {
6965 LLVMValueRef p = LLVMGetParam(func, i);
6966 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6967 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6968 }
6969
6970 /* Compute vertex load indices from instance divisors. */
6971 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6972 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6973 LLVMValueRef index;
6974
6975 if (divisor) {
6976 /* InstanceID / Divisor + StartInstance */
6977 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6978 SI_SGPR_START_INSTANCE,
6979 divisor);
6980 } else {
6981 /* VertexID + BaseVertex */
6982 index = LLVMBuildAdd(gallivm->builder,
6983 LLVMGetParam(func, ctx.param_vertex_id),
6984 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6985 }
6986
6987 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6988 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6989 num_params++, "");
6990 }
6991
6992 /* Compile. */
6993 si_llvm_build_ret(&ctx, ret);
6994 radeon_llvm_finalize_module(&ctx.radeon_bld);
6995
6996 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6997 gallivm->module, debug, ctx.type,
6998 "Vertex Shader Prolog"))
6999 status = false;
7000
7001 radeon_llvm_dispose(&ctx.radeon_bld);
7002 return status;
7003 }
7004
7005 /**
7006 * Compile the vertex shader epilog. This is also used by the tessellation
7007 * evaluation shader compiled as VS.
7008 *
7009 * The input is PrimitiveID.
7010 *
7011 * If PrimitiveID is required by the pixel shader, export it.
7012 * Otherwise, do nothing.
7013 */
7014 static bool si_compile_vs_epilog(struct si_screen *sscreen,
7015 LLVMTargetMachineRef tm,
7016 struct pipe_debug_callback *debug,
7017 struct si_shader_part *out)
7018 {
7019 union si_shader_part_key *key = &out->key;
7020 struct si_shader_context ctx;
7021 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7022 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7023 LLVMTypeRef params[5];
7024 int num_params, i;
7025 bool status = true;
7026
7027 si_init_shader_ctx(&ctx, sscreen, NULL, tm);
7028 ctx.type = PIPE_SHADER_VERTEX;
7029
7030 /* Declare input VGPRs. */
7031 num_params = key->vs_epilog.states.export_prim_id ?
7032 (VS_EPILOG_PRIMID_LOC + 1) : 0;
7033 assert(num_params <= ARRAY_SIZE(params));
7034
7035 for (i = 0; i < num_params; i++)
7036 params[i] = ctx.f32;
7037
7038 /* Create the function. */
7039 si_create_function(&ctx, NULL, 0, params, num_params, -1);
7040
7041 /* Emit exports. */
7042 if (key->vs_epilog.states.export_prim_id) {
7043 struct lp_build_context *base = &bld_base->base;
7044 struct lp_build_context *uint = &bld_base->uint_bld;
7045 LLVMValueRef args[9];
7046
7047 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
7048 args[1] = uint->zero; /* whether the EXEC mask is valid */
7049 args[2] = uint->zero; /* DONE bit */
7050 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
7051 key->vs_epilog.prim_id_param_offset);
7052 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
7053 args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
7054 VS_EPILOG_PRIMID_LOC); /* X */
7055 args[6] = uint->undef; /* Y */
7056 args[7] = uint->undef; /* Z */
7057 args[8] = uint->undef; /* W */
7058
7059 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
7060 LLVMVoidTypeInContext(base->gallivm->context),
7061 args, 9, 0);
7062 }
7063
7064 /* Compile. */
7065 LLVMBuildRetVoid(gallivm->builder);
7066 radeon_llvm_finalize_module(&ctx.radeon_bld);
7067
7068 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7069 gallivm->module, debug, ctx.type,
7070 "Vertex Shader Epilog"))
7071 status = false;
7072
7073 radeon_llvm_dispose(&ctx.radeon_bld);
7074 return status;
7075 }
7076
7077 /**
7078 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7079 */
7080 static bool si_get_vs_epilog(struct si_screen *sscreen,
7081 LLVMTargetMachineRef tm,
7082 struct si_shader *shader,
7083 struct pipe_debug_callback *debug,
7084 struct si_vs_epilog_bits *states)
7085 {
7086 union si_shader_part_key epilog_key;
7087
7088 memset(&epilog_key, 0, sizeof(epilog_key));
7089 epilog_key.vs_epilog.states = *states;
7090
7091 /* Set up the PrimitiveID output. */
7092 if (shader->key.vs.epilog.export_prim_id) {
7093 unsigned index = shader->selector->info.num_outputs;
7094 unsigned offset = shader->info.nr_param_exports++;
7095
7096 epilog_key.vs_epilog.prim_id_param_offset = offset;
7097 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7098 shader->info.vs_output_param_offset[index] = offset;
7099 }
7100
7101 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7102 &epilog_key, tm, debug,
7103 si_compile_vs_epilog);
7104 return shader->epilog != NULL;
7105 }
7106
7107 /**
7108 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7109 */
7110 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7111 LLVMTargetMachineRef tm,
7112 struct si_shader *shader,
7113 struct pipe_debug_callback *debug)
7114 {
7115 struct tgsi_shader_info *info = &shader->selector->info;
7116 union si_shader_part_key prolog_key;
7117 unsigned i;
7118
7119 /* Get the prolog. */
7120 memset(&prolog_key, 0, sizeof(prolog_key));
7121 prolog_key.vs_prolog.states = shader->key.vs.prolog;
7122 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7123 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7124
7125 /* The prolog is a no-op if there are no inputs. */
7126 if (info->num_inputs) {
7127 shader->prolog =
7128 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7129 &prolog_key, tm, debug,
7130 si_compile_vs_prolog);
7131 if (!shader->prolog)
7132 return false;
7133 }
7134
7135 /* Get the epilog. */
7136 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7137 !si_get_vs_epilog(sscreen, tm, shader, debug,
7138 &shader->key.vs.epilog))
7139 return false;
7140
7141 /* Set the instanceID flag. */
7142 for (i = 0; i < info->num_inputs; i++)
7143 if (prolog_key.vs_prolog.states.instance_divisors[i])
7144 shader->info.uses_instanceid = true;
7145
7146 return true;
7147 }
7148
7149 /**
7150 * Select and compile (or reuse) TES parts (epilog).
7151 */
7152 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7153 LLVMTargetMachineRef tm,
7154 struct si_shader *shader,
7155 struct pipe_debug_callback *debug)
7156 {
7157 if (shader->key.tes.as_es)
7158 return true;
7159
7160 /* TES compiled as VS. */
7161 return si_get_vs_epilog(sscreen, tm, shader, debug,
7162 &shader->key.tes.epilog);
7163 }
7164
7165 /**
7166 * Compile the TCS epilog. This writes tesselation factors to memory based on
7167 * the output primitive type of the tesselator (determined by TES).
7168 */
7169 static bool si_compile_tcs_epilog(struct si_screen *sscreen,
7170 LLVMTargetMachineRef tm,
7171 struct pipe_debug_callback *debug,
7172 struct si_shader_part *out)
7173 {
7174 union si_shader_part_key *key = &out->key;
7175 struct si_shader shader = {};
7176 struct si_shader_context ctx;
7177 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7178 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7179 LLVMTypeRef params[16];
7180 LLVMValueRef func;
7181 int last_sgpr, num_params;
7182 bool status = true;
7183
7184 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7185 ctx.type = PIPE_SHADER_TESS_CTRL;
7186 shader.key.tcs.epilog = key->tcs_epilog.states;
7187
7188 /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
7189 params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
7190 params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
7191 params[SI_PARAM_SAMPLERS] = ctx.i64;
7192 params[SI_PARAM_IMAGES] = ctx.i64;
7193 params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
7194 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
7195 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
7196 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
7197 params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
7198 params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
7199 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
7200 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
7201 num_params = last_sgpr + 1;
7202
7203 params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
7204 params[num_params++] = ctx.i32; /* invocation ID within the patch */
7205 params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
7206
7207 /* Create the function. */
7208 si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
7209 declare_tess_lds(&ctx);
7210 func = ctx.radeon_bld.main_fn;
7211
7212 si_write_tess_factors(bld_base,
7213 LLVMGetParam(func, last_sgpr + 1),
7214 LLVMGetParam(func, last_sgpr + 2),
7215 LLVMGetParam(func, last_sgpr + 3));
7216
7217 /* Compile. */
7218 LLVMBuildRetVoid(gallivm->builder);
7219 radeon_llvm_finalize_module(&ctx.radeon_bld);
7220
7221 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7222 gallivm->module, debug, ctx.type,
7223 "Tessellation Control Shader Epilog"))
7224 status = false;
7225
7226 radeon_llvm_dispose(&ctx.radeon_bld);
7227 return status;
7228 }
7229
7230 /**
7231 * Select and compile (or reuse) TCS parts (epilog).
7232 */
7233 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7234 LLVMTargetMachineRef tm,
7235 struct si_shader *shader,
7236 struct pipe_debug_callback *debug)
7237 {
7238 union si_shader_part_key epilog_key;
7239
7240 /* Get the epilog. */
7241 memset(&epilog_key, 0, sizeof(epilog_key));
7242 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7243
7244 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7245 &epilog_key, tm, debug,
7246 si_compile_tcs_epilog);
7247 return shader->epilog != NULL;
7248 }
7249
7250 /**
7251 * Compile the pixel shader prolog. This handles:
7252 * - two-side color selection and interpolation
7253 * - overriding interpolation parameters for the API PS
7254 * - polygon stippling
7255 *
7256 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7257 * overridden by other states. (e.g. per-sample interpolation)
7258 * Interpolated colors are stored after the preloaded VGPRs.
7259 */
7260 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7261 LLVMTargetMachineRef tm,
7262 struct pipe_debug_callback *debug,
7263 struct si_shader_part *out)
7264 {
7265 union si_shader_part_key *key = &out->key;
7266 struct si_shader shader = {};
7267 struct si_shader_context ctx;
7268 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7269 LLVMTypeRef *params;
7270 LLVMValueRef ret, func;
7271 int last_sgpr, num_params, num_returns, i, num_color_channels;
7272 bool status = true;
7273
7274 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7275 ctx.type = PIPE_SHADER_FRAGMENT;
7276 shader.key.ps.prolog = key->ps_prolog.states;
7277
7278 /* Number of inputs + 8 color elements. */
7279 params = alloca((key->ps_prolog.num_input_sgprs +
7280 key->ps_prolog.num_input_vgprs + 8) *
7281 sizeof(LLVMTypeRef));
7282
7283 /* Declare inputs. */
7284 num_params = 0;
7285 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7286 params[num_params++] = ctx.i32;
7287 last_sgpr = num_params - 1;
7288
7289 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7290 params[num_params++] = ctx.f32;
7291
7292 /* Declare outputs (same as inputs + add colors if needed) */
7293 num_returns = num_params;
7294 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7295 for (i = 0; i < num_color_channels; i++)
7296 params[num_returns++] = ctx.f32;
7297
7298 /* Create the function. */
7299 si_create_function(&ctx, params, num_returns, params,
7300 num_params, last_sgpr);
7301 func = ctx.radeon_bld.main_fn;
7302
7303 /* Copy inputs to outputs. This should be no-op, as the registers match,
7304 * but it will prevent the compiler from overwriting them unintentionally.
7305 */
7306 ret = ctx.return_value;
7307 for (i = 0; i < num_params; i++) {
7308 LLVMValueRef p = LLVMGetParam(func, i);
7309 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7310 }
7311
7312 /* Polygon stippling. */
7313 if (key->ps_prolog.states.poly_stipple) {
7314 /* POS_FIXED_PT is always last. */
7315 unsigned pos = key->ps_prolog.num_input_sgprs +
7316 key->ps_prolog.num_input_vgprs - 1;
7317 LLVMValueRef ptr[2], list;
7318
7319 /* Get the pointer to rw buffers. */
7320 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7321 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7322 list = lp_build_gather_values(gallivm, ptr, 2);
7323 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7324 list = LLVMBuildIntToPtr(gallivm->builder, list,
7325 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7326
7327 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7328 }
7329
7330 if (key->ps_prolog.states.bc_optimize_for_persp ||
7331 key->ps_prolog.states.bc_optimize_for_linear) {
7332 unsigned i, base = key->ps_prolog.num_input_sgprs;
7333 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7334
7335 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7336 * The hw doesn't compute CENTROID if the whole wave only
7337 * contains fully-covered quads.
7338 *
7339 * PRIM_MASK is after user SGPRs.
7340 */
7341 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7342 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7343 LLVMConstInt(ctx.i32, 31, 0), "");
7344 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7345 ctx.i1, "");
7346
7347 if (key->ps_prolog.states.bc_optimize_for_persp) {
7348 /* Read PERSP_CENTER. */
7349 for (i = 0; i < 2; i++)
7350 center[i] = LLVMGetParam(func, base + 2 + i);
7351 /* Read PERSP_CENTROID. */
7352 for (i = 0; i < 2; i++)
7353 centroid[i] = LLVMGetParam(func, base + 4 + i);
7354 /* Select PERSP_CENTROID. */
7355 for (i = 0; i < 2; i++) {
7356 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7357 center[i], centroid[i], "");
7358 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7359 tmp, base + 4 + i, "");
7360 }
7361 }
7362 if (key->ps_prolog.states.bc_optimize_for_linear) {
7363 /* Read LINEAR_CENTER. */
7364 for (i = 0; i < 2; i++)
7365 center[i] = LLVMGetParam(func, base + 8 + i);
7366 /* Read LINEAR_CENTROID. */
7367 for (i = 0; i < 2; i++)
7368 centroid[i] = LLVMGetParam(func, base + 10 + i);
7369 /* Select LINEAR_CENTROID. */
7370 for (i = 0; i < 2; i++) {
7371 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7372 center[i], centroid[i], "");
7373 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7374 tmp, base + 10 + i, "");
7375 }
7376 }
7377 }
7378
7379 /* Interpolate colors. */
7380 for (i = 0; i < 2; i++) {
7381 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7382 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7383 key->ps_prolog.face_vgpr_index;
7384 LLVMValueRef interp[2], color[4];
7385 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7386
7387 if (!writemask)
7388 continue;
7389
7390 /* If the interpolation qualifier is not CONSTANT (-1). */
7391 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7392 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7393 key->ps_prolog.color_interp_vgpr_index[i];
7394
7395 /* Get the (i,j) updated by bc_optimize handling. */
7396 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7397 interp_vgpr, "");
7398 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7399 interp_vgpr + 1, "");
7400 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7401 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7402 ctx.v2i32, "");
7403 }
7404
7405 /* Use the absolute location of the input. */
7406 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7407
7408 if (key->ps_prolog.states.color_two_side) {
7409 face = LLVMGetParam(func, face_vgpr);
7410 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7411 }
7412
7413 interp_fs_input(&ctx,
7414 key->ps_prolog.color_attr_index[i],
7415 TGSI_SEMANTIC_COLOR, i,
7416 key->ps_prolog.num_interp_inputs,
7417 key->ps_prolog.colors_read, interp_ij,
7418 prim_mask, face, color);
7419
7420 while (writemask) {
7421 unsigned chan = u_bit_scan(&writemask);
7422 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7423 num_params++, "");
7424 }
7425 }
7426
7427 /* Force per-sample interpolation. */
7428 if (key->ps_prolog.states.force_persp_sample_interp) {
7429 unsigned i, base = key->ps_prolog.num_input_sgprs;
7430 LLVMValueRef persp_sample[2];
7431
7432 /* Read PERSP_SAMPLE. */
7433 for (i = 0; i < 2; i++)
7434 persp_sample[i] = LLVMGetParam(func, base + i);
7435 /* Overwrite PERSP_CENTER. */
7436 for (i = 0; i < 2; i++)
7437 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7438 persp_sample[i], base + 2 + i, "");
7439 /* Overwrite PERSP_CENTROID. */
7440 for (i = 0; i < 2; i++)
7441 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7442 persp_sample[i], base + 4 + i, "");
7443 }
7444 if (key->ps_prolog.states.force_linear_sample_interp) {
7445 unsigned i, base = key->ps_prolog.num_input_sgprs;
7446 LLVMValueRef linear_sample[2];
7447
7448 /* Read LINEAR_SAMPLE. */
7449 for (i = 0; i < 2; i++)
7450 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7451 /* Overwrite LINEAR_CENTER. */
7452 for (i = 0; i < 2; i++)
7453 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7454 linear_sample[i], base + 8 + i, "");
7455 /* Overwrite LINEAR_CENTROID. */
7456 for (i = 0; i < 2; i++)
7457 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7458 linear_sample[i], base + 10 + i, "");
7459 }
7460
7461 /* Force center interpolation. */
7462 if (key->ps_prolog.states.force_persp_center_interp) {
7463 unsigned i, base = key->ps_prolog.num_input_sgprs;
7464 LLVMValueRef persp_center[2];
7465
7466 /* Read PERSP_CENTER. */
7467 for (i = 0; i < 2; i++)
7468 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7469 /* Overwrite PERSP_SAMPLE. */
7470 for (i = 0; i < 2; i++)
7471 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7472 persp_center[i], base + i, "");
7473 /* Overwrite PERSP_CENTROID. */
7474 for (i = 0; i < 2; i++)
7475 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7476 persp_center[i], base + 4 + i, "");
7477 }
7478 if (key->ps_prolog.states.force_linear_center_interp) {
7479 unsigned i, base = key->ps_prolog.num_input_sgprs;
7480 LLVMValueRef linear_center[2];
7481
7482 /* Read LINEAR_CENTER. */
7483 for (i = 0; i < 2; i++)
7484 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7485 /* Overwrite LINEAR_SAMPLE. */
7486 for (i = 0; i < 2; i++)
7487 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7488 linear_center[i], base + 6 + i, "");
7489 /* Overwrite LINEAR_CENTROID. */
7490 for (i = 0; i < 2; i++)
7491 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7492 linear_center[i], base + 10 + i, "");
7493 }
7494
7495 /* Tell LLVM to insert WQM instruction sequence when needed. */
7496 if (key->ps_prolog.wqm) {
7497 LLVMAddTargetDependentFunctionAttr(func,
7498 "amdgpu-ps-wqm-outputs", "");
7499 }
7500
7501 /* Compile. */
7502 si_llvm_build_ret(&ctx, ret);
7503 radeon_llvm_finalize_module(&ctx.radeon_bld);
7504
7505 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7506 gallivm->module, debug, ctx.type,
7507 "Fragment Shader Prolog"))
7508 status = false;
7509
7510 radeon_llvm_dispose(&ctx.radeon_bld);
7511 return status;
7512 }
7513
7514 /**
7515 * Compile the pixel shader epilog. This handles everything that must be
7516 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7517 */
7518 static bool si_compile_ps_epilog(struct si_screen *sscreen,
7519 LLVMTargetMachineRef tm,
7520 struct pipe_debug_callback *debug,
7521 struct si_shader_part *out)
7522 {
7523 union si_shader_part_key *key = &out->key;
7524 struct si_shader shader = {};
7525 struct si_shader_context ctx;
7526 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7527 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
7528 LLVMTypeRef params[16+8*4+3];
7529 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
7530 int last_sgpr, num_params, i;
7531 bool status = true;
7532 struct si_ps_exports exp = {};
7533
7534 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7535 ctx.type = PIPE_SHADER_FRAGMENT;
7536 shader.key.ps.epilog = key->ps_epilog.states;
7537
7538 /* Declare input SGPRs. */
7539 params[SI_PARAM_RW_BUFFERS] = ctx.i64;
7540 params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
7541 params[SI_PARAM_SAMPLERS] = ctx.i64;
7542 params[SI_PARAM_IMAGES] = ctx.i64;
7543 params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
7544 params[SI_PARAM_ALPHA_REF] = ctx.f32;
7545 last_sgpr = SI_PARAM_ALPHA_REF;
7546
7547 /* Declare input VGPRs. */
7548 num_params = (last_sgpr + 1) +
7549 util_bitcount(key->ps_epilog.colors_written) * 4 +
7550 key->ps_epilog.writes_z +
7551 key->ps_epilog.writes_stencil +
7552 key->ps_epilog.writes_samplemask;
7553
7554 num_params = MAX2(num_params,
7555 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
7556
7557 assert(num_params <= ARRAY_SIZE(params));
7558
7559 for (i = last_sgpr + 1; i < num_params; i++)
7560 params[i] = ctx.f32;
7561
7562 /* Create the function. */
7563 si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
7564 /* Disable elimination of unused inputs. */
7565 radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
7566 "InitialPSInputAddr", 0xffffff);
7567
7568 /* Process colors. */
7569 unsigned vgpr = last_sgpr + 1;
7570 unsigned colors_written = key->ps_epilog.colors_written;
7571 int last_color_export = -1;
7572
7573 /* Find the last color export. */
7574 if (!key->ps_epilog.writes_z &&
7575 !key->ps_epilog.writes_stencil &&
7576 !key->ps_epilog.writes_samplemask) {
7577 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
7578
7579 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
7580 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
7581 /* Just set this if any of the colorbuffers are enabled. */
7582 if (spi_format &
7583 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
7584 last_color_export = 0;
7585 } else {
7586 for (i = 0; i < 8; i++)
7587 if (colors_written & (1 << i) &&
7588 (spi_format >> (i * 4)) & 0xf)
7589 last_color_export = i;
7590 }
7591 }
7592
7593 while (colors_written) {
7594 LLVMValueRef color[4];
7595 int mrt = u_bit_scan(&colors_written);
7596
7597 for (i = 0; i < 4; i++)
7598 color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7599
7600 si_export_mrt_color(bld_base, color, mrt,
7601 num_params - 1,
7602 mrt == last_color_export, &exp);
7603 }
7604
7605 /* Process depth, stencil, samplemask. */
7606 if (key->ps_epilog.writes_z)
7607 depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7608 if (key->ps_epilog.writes_stencil)
7609 stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7610 if (key->ps_epilog.writes_samplemask)
7611 samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
7612
7613 if (depth || stencil || samplemask)
7614 si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
7615 else if (last_color_export == -1)
7616 si_export_null(bld_base);
7617
7618 if (exp.num)
7619 si_emit_ps_exports(&ctx, &exp);
7620
7621 /* Compile. */
7622 LLVMBuildRetVoid(gallivm->builder);
7623 radeon_llvm_finalize_module(&ctx.radeon_bld);
7624
7625 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7626 gallivm->module, debug, ctx.type,
7627 "Fragment Shader Epilog"))
7628 status = false;
7629
7630 radeon_llvm_dispose(&ctx.radeon_bld);
7631 return status;
7632 }
7633
7634 /**
7635 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7636 */
7637 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7638 LLVMTargetMachineRef tm,
7639 struct si_shader *shader,
7640 struct pipe_debug_callback *debug)
7641 {
7642 struct tgsi_shader_info *info = &shader->selector->info;
7643 union si_shader_part_key prolog_key;
7644 union si_shader_part_key epilog_key;
7645 unsigned i;
7646
7647 /* Get the prolog. */
7648 memset(&prolog_key, 0, sizeof(prolog_key));
7649 prolog_key.ps_prolog.states = shader->key.ps.prolog;
7650 prolog_key.ps_prolog.colors_read = info->colors_read;
7651 prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7652 prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7653 prolog_key.ps_prolog.wqm = info->uses_derivatives &&
7654 (prolog_key.ps_prolog.colors_read ||
7655 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7656 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7657 prolog_key.ps_prolog.states.force_persp_center_interp ||
7658 prolog_key.ps_prolog.states.force_linear_center_interp ||
7659 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7660 prolog_key.ps_prolog.states.bc_optimize_for_linear);
7661
7662 if (info->colors_read) {
7663 unsigned *color = shader->selector->color_attr_index;
7664
7665 if (shader->key.ps.prolog.color_two_side) {
7666 /* BCOLORs are stored after the last input. */
7667 prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
7668 prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7669 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7670 }
7671
7672 for (i = 0; i < 2; i++) {
7673 unsigned interp = info->input_interpolate[color[i]];
7674 unsigned location = info->input_interpolate_loc[color[i]];
7675
7676 if (!(info->colors_read & (0xf << i*4)))
7677 continue;
7678
7679 prolog_key.ps_prolog.color_attr_index[i] = color[i];
7680
7681 if (shader->key.ps.prolog.flatshade_colors &&
7682 interp == TGSI_INTERPOLATE_COLOR)
7683 interp = TGSI_INTERPOLATE_CONSTANT;
7684
7685 switch (interp) {
7686 case TGSI_INTERPOLATE_CONSTANT:
7687 prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
7688 break;
7689 case TGSI_INTERPOLATE_PERSPECTIVE:
7690 case TGSI_INTERPOLATE_COLOR:
7691 /* Force the interpolation location for colors here. */
7692 if (shader->key.ps.prolog.force_persp_sample_interp)
7693 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7694 if (shader->key.ps.prolog.force_persp_center_interp)
7695 location = TGSI_INTERPOLATE_LOC_CENTER;
7696
7697 switch (location) {
7698 case TGSI_INTERPOLATE_LOC_SAMPLE:
7699 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
7700 shader->config.spi_ps_input_ena |=
7701 S_0286CC_PERSP_SAMPLE_ENA(1);
7702 break;
7703 case TGSI_INTERPOLATE_LOC_CENTER:
7704 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
7705 shader->config.spi_ps_input_ena |=
7706 S_0286CC_PERSP_CENTER_ENA(1);
7707 break;
7708 case TGSI_INTERPOLATE_LOC_CENTROID:
7709 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
7710 shader->config.spi_ps_input_ena |=
7711 S_0286CC_PERSP_CENTROID_ENA(1);
7712 break;
7713 default:
7714 assert(0);
7715 }
7716 break;
7717 case TGSI_INTERPOLATE_LINEAR:
7718 /* Force the interpolation location for colors here. */
7719 if (shader->key.ps.prolog.force_linear_sample_interp)
7720 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7721 if (shader->key.ps.prolog.force_linear_center_interp)
7722 location = TGSI_INTERPOLATE_LOC_CENTER;
7723
7724 switch (location) {
7725 case TGSI_INTERPOLATE_LOC_SAMPLE:
7726 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
7727 shader->config.spi_ps_input_ena |=
7728 S_0286CC_LINEAR_SAMPLE_ENA(1);
7729 break;
7730 case TGSI_INTERPOLATE_LOC_CENTER:
7731 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
7732 shader->config.spi_ps_input_ena |=
7733 S_0286CC_LINEAR_CENTER_ENA(1);
7734 break;
7735 case TGSI_INTERPOLATE_LOC_CENTROID:
7736 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
7737 shader->config.spi_ps_input_ena |=
7738 S_0286CC_LINEAR_CENTROID_ENA(1);
7739 break;
7740 default:
7741 assert(0);
7742 }
7743 break;
7744 default:
7745 assert(0);
7746 }
7747 }
7748 }
7749
7750 /* The prolog is a no-op if these aren't set. */
7751 if (prolog_key.ps_prolog.colors_read ||
7752 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7753 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7754 prolog_key.ps_prolog.states.force_persp_center_interp ||
7755 prolog_key.ps_prolog.states.force_linear_center_interp ||
7756 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7757 prolog_key.ps_prolog.states.bc_optimize_for_linear ||
7758 prolog_key.ps_prolog.states.poly_stipple) {
7759 shader->prolog =
7760 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7761 &prolog_key, tm, debug,
7762 si_compile_ps_prolog);
7763 if (!shader->prolog)
7764 return false;
7765 }
7766
7767 /* Get the epilog. */
7768 memset(&epilog_key, 0, sizeof(epilog_key));
7769 epilog_key.ps_epilog.colors_written = info->colors_written;
7770 epilog_key.ps_epilog.writes_z = info->writes_z;
7771 epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
7772 epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
7773 epilog_key.ps_epilog.states = shader->key.ps.epilog;
7774
7775 shader->epilog =
7776 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7777 &epilog_key, tm, debug,
7778 si_compile_ps_epilog);
7779 if (!shader->epilog)
7780 return false;
7781
7782 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7783 if (shader->key.ps.prolog.poly_stipple) {
7784 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7785 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7786 }
7787
7788 /* Set up the enable bits for per-sample shading if needed. */
7789 if (shader->key.ps.prolog.force_persp_sample_interp &&
7790 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7791 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7792 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7793 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7794 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7795 }
7796 if (shader->key.ps.prolog.force_linear_sample_interp &&
7797 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7798 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7799 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7800 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7801 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7802 }
7803 if (shader->key.ps.prolog.force_persp_center_interp &&
7804 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7805 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7806 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7807 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7808 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7809 }
7810 if (shader->key.ps.prolog.force_linear_center_interp &&
7811 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7812 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7813 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7814 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7815 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7816 }
7817
7818 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7819 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7820 !(shader->config.spi_ps_input_ena & 0xf)) {
7821 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7822 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7823 }
7824
7825 /* At least one pair of interpolation weights must be enabled. */
7826 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7827 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7828 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7829 }
7830
7831 /* The sample mask input is always enabled, because the API shader always
7832 * passes it through to the epilog. Disable it here if it's unused.
7833 */
7834 if (!shader->key.ps.epilog.poly_line_smoothing &&
7835 !shader->selector->info.reads_samplemask)
7836 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7837
7838 return true;
7839 }
7840
7841 static void si_fix_num_sgprs(struct si_shader *shader)
7842 {
7843 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7844
7845 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7846 }
7847
7848 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7849 struct si_shader *shader,
7850 struct pipe_debug_callback *debug)
7851 {
7852 struct si_shader *mainp = shader->selector->main_shader_part;
7853 int r;
7854
7855 /* LS, ES, VS are compiled on demand if the main part hasn't been
7856 * compiled for that stage.
7857 */
7858 if (!mainp ||
7859 (shader->selector->type == PIPE_SHADER_VERTEX &&
7860 (shader->key.vs.as_es != mainp->key.vs.as_es ||
7861 shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
7862 (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
7863 shader->key.tes.as_es != mainp->key.tes.as_es) ||
7864 (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
7865 shader->key.tcs.epilog.inputs_to_copy) ||
7866 shader->selector->type == PIPE_SHADER_COMPUTE) {
7867 /* Monolithic shader (compiled as a whole, has many variants,
7868 * may take a long time to compile).
7869 */
7870 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7871 if (r)
7872 return r;
7873 } else {
7874 /* The shader consists of 2-3 parts:
7875 *
7876 * - the middle part is the user shader, it has 1 variant only
7877 * and it was compiled during the creation of the shader
7878 * selector
7879 * - the prolog part is inserted at the beginning
7880 * - the epilog part is inserted at the end
7881 *
7882 * The prolog and epilog have many (but simple) variants.
7883 */
7884
7885 /* Copy the compiled TGSI shader data over. */
7886 shader->is_binary_shared = true;
7887 shader->binary = mainp->binary;
7888 shader->config = mainp->config;
7889 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7890 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7891 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7892 memcpy(shader->info.vs_output_param_offset,
7893 mainp->info.vs_output_param_offset,
7894 sizeof(mainp->info.vs_output_param_offset));
7895 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7896 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7897 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7898
7899 /* Select prologs and/or epilogs. */
7900 switch (shader->selector->type) {
7901 case PIPE_SHADER_VERTEX:
7902 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7903 return -1;
7904 break;
7905 case PIPE_SHADER_TESS_CTRL:
7906 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7907 return -1;
7908 break;
7909 case PIPE_SHADER_TESS_EVAL:
7910 if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
7911 return -1;
7912 break;
7913 case PIPE_SHADER_FRAGMENT:
7914 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7915 return -1;
7916
7917 /* Make sure we have at least as many VGPRs as there
7918 * are allocated inputs.
7919 */
7920 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7921 shader->info.num_input_vgprs);
7922 break;
7923 }
7924
7925 /* Update SGPR and VGPR counts. */
7926 if (shader->prolog) {
7927 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7928 shader->prolog->config.num_sgprs);
7929 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7930 shader->prolog->config.num_vgprs);
7931 }
7932 if (shader->epilog) {
7933 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7934 shader->epilog->config.num_sgprs);
7935 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7936 shader->epilog->config.num_vgprs);
7937 }
7938 }
7939
7940 si_fix_num_sgprs(shader);
7941 si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
7942 stderr);
7943
7944 /* Upload. */
7945 r = si_shader_binary_upload(sscreen, shader);
7946 if (r) {
7947 fprintf(stderr, "LLVM failed to upload shader\n");
7948 return r;
7949 }
7950
7951 return 0;
7952 }
7953
7954 void si_shader_destroy(struct si_shader *shader)
7955 {
7956 if (shader->gs_copy_shader) {
7957 si_shader_destroy(shader->gs_copy_shader);
7958 FREE(shader->gs_copy_shader);
7959 }
7960
7961 if (shader->scratch_bo)
7962 r600_resource_reference(&shader->scratch_bo, NULL);
7963
7964 r600_resource_reference(&shader->bo, NULL);
7965
7966 if (!shader->is_binary_shared)
7967 radeon_shader_binary_clean(&shader->binary);
7968
7969 free(shader->shader_log);
7970 }