radeonsi: enable scratch coalescing
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "radeon/r600_cs.h"
37 #include "radeon/radeon_llvm.h"
38 #include "radeon/radeon_elf_util.h"
39 #include "radeon/radeon_llvm_emit.h"
40 #include "util/u_memory.h"
41 #include "util/u_pstipple.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* ELF symbol names for the two dwords of the scratch buffer resource
 * descriptor; presumably patched at binary-upload time once the scratch
 * buffer address is known — TODO confirm against the upload code. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
/* One shader output slot: the four channel values plus its TGSI semantic. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one LLVM value per component (x, y, z, w) */
	unsigned name;		/* TGSI_SEMANTIC_* name */
	unsigned sid;		/* semantic index */
};
66
/* Per-compilation state for translating one TGSI shader to LLVM IR. */
struct si_shader_context
{
	/* Must remain the first member: si_shader_context() casts an embedded
	 * lp_build_tgsi_context pointer back to this struct. */
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;	/* the shader being compiled */
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of main-function parameters, used with LLVMGetParam. */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;

	/* Metadata used to tag loads: uniform_md_kind/empty_md mark
	 * dynamically-uniform pointers, const_md marks constant loads
	 * (see build_indexed_load / build_indexed_load_const). */
	unsigned uniform_md_kind;
	LLVMValueRef const_md;
	LLVMValueRef empty_md;
	/* Cached resource descriptors and ring/LDS values. */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;		/* LDS base used by lds_load/lds_store */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];	/* streamout buffers */
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types, cached once per compilation. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	/* NOTE(review): set/used outside this chunk — presumably the compute
	 * shared-memory (LDS) declaration; confirm before documenting further. */
	LLVMValueRef shared_memory;
};
135
/* Recover the si_shader_context from a lp_build_tgsi_context pointer.
 *
 * NOTE(review): this relies on radeon_bld being the first member of
 * si_shader_context and the bld_base living at offset 0 inside it
 * (layout defined in radeon_llvm.h) — confirm if either struct changes. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
141
/* Forward declarations; both are defined later in this file. */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm);

static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data);
150
/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

/* Base indices of the perspective/linear barycentric inputs, plus the
 * per-mode offsets (sample/center/centroid) added to them. */
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
/* NOTE(review): "OFSET" is a long-standing typo; kept as-is because uses
 * elsewhere in the file would break on rename. */
#define CENTROID_OFSET 4

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* Presumably the AMDGPU backend's LLVM address-space numbers for constant,
 * local (LDS) and user-SGPR memory — TODO confirm against the backend. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* s_sendmsg message types and GS operation encodings (op goes in bits 4+). */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
181
182 /**
183 * Returns a unique index for a semantic name and index. The index must be
184 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
185 * calculated.
186 */
187 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
188 {
189 switch (semantic_name) {
190 case TGSI_SEMANTIC_POSITION:
191 return 0;
192 case TGSI_SEMANTIC_PSIZE:
193 return 1;
194 case TGSI_SEMANTIC_CLIPDIST:
195 assert(index <= 1);
196 return 2 + index;
197 case TGSI_SEMANTIC_GENERIC:
198 if (index <= 63-4)
199 return 4 + index;
200 else
201 /* same explanation as in the default statement,
202 * the only user hitting this is st/nine.
203 */
204 return 0;
205
206 /* patch indices are completely separate and thus start from 0 */
207 case TGSI_SEMANTIC_TESSOUTER:
208 return 0;
209 case TGSI_SEMANTIC_TESSINNER:
210 return 1;
211 case TGSI_SEMANTIC_PATCH:
212 return 2 + index;
213
214 default:
215 /* Don't fail here. The result of this function is only used
216 * for LS, TCS, TES, and GS, where legacy GL semantics can't
217 * occur, but this function is called for all vertex shaders
218 * before it's known whether LS will be compiled or not.
219 */
220 return 0;
221 }
222 }
223
224 /**
225 * Get the value of a shader input parameter and extract a bitfield.
226 */
227 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
228 unsigned param, unsigned rshift,
229 unsigned bitwidth)
230 {
231 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
232 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
233 param);
234
235 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
236 value = bitcast(&ctx->radeon_bld.soa.bld_base,
237 TGSI_TYPE_UNSIGNED, value);
238
239 if (rshift)
240 value = LLVMBuildLShr(gallivm->builder, value,
241 lp_build_const_int32(gallivm, rshift), "");
242
243 if (rshift + bitwidth < 32) {
244 unsigned mask = (1 << bitwidth) - 1;
245 value = LLVMBuildAnd(gallivm->builder, value,
246 lp_build_const_int32(gallivm, mask), "");
247 }
248
249 return value;
250 }
251
252 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
253 {
254 switch (ctx->type) {
255 case PIPE_SHADER_TESS_CTRL:
256 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
257
258 case PIPE_SHADER_TESS_EVAL:
259 return LLVMGetParam(ctx->radeon_bld.main_fn,
260 ctx->param_tes_rel_patch_id);
261
262 default:
263 assert(0);
264 return NULL;
265 }
266 }
267
268 /* Tessellation shaders pass outputs to the next shader using LDS.
269 *
270 * LS outputs = TCS inputs
271 * TCS outputs = TES inputs
272 *
273 * The LDS layout is:
274 * - TCS inputs for patch 0
275 * - TCS inputs for patch 1
276 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
277 * - ...
278 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
279 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
280 * - TCS outputs for patch 1
281 * - Per-patch TCS outputs for patch 1
282 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
283 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
284 * - ...
285 *
286 * All three shaders VS(LS), TCS, TES share the same LDS space.
287 */
288
289 static LLVMValueRef
290 get_tcs_in_patch_stride(struct si_shader_context *ctx)
291 {
292 if (ctx->type == PIPE_SHADER_VERTEX)
293 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
294 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
295 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
296 else {
297 assert(0);
298 return NULL;
299 }
300 }
301
/* LDS stride (in dwords) of one patch worth of TCS outputs. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
307
/* LDS dword offset where TCS outputs for patch 0 begin.
 * The SGPR field is in units of 4 dwords, hence the *4. */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
				unpack_param(ctx,
					     SI_PARAM_TCS_OUT_OFFSETS,
					     0, 16),
				4);
}
317
/* LDS dword offset where per-patch TCS outputs for patch 0 begin
 * (upper 16 bits of the same SGPR, also in units of 4 dwords). */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
				unpack_param(ctx,
					     SI_PARAM_TCS_OUT_OFFSETS,
					     16, 16),
				4);
}
327
328 static LLVMValueRef
329 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
330 {
331 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
332 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
333 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
334
335 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
336 }
337
338 static LLVMValueRef
339 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
340 {
341 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
342 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
343 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
344 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
345
346 return LLVMBuildAdd(gallivm->builder, patch0_offset,
347 LLVMBuildMul(gallivm->builder, patch_stride,
348 rel_patch_id, ""),
349 "");
350 }
351
/* LDS dword offset of the current patch's per-patch TCS outputs:
 * per-patch patch0 offset + RelPatchID * output-patch stride. */
static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}
366
367 static void build_indexed_store(struct si_shader_context *ctx,
368 LLVMValueRef base_ptr, LLVMValueRef index,
369 LLVMValueRef value)
370 {
371 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
372 struct gallivm_state *gallivm = bld_base->base.gallivm;
373 LLVMValueRef indices[2], pointer;
374
375 indices[0] = bld_base->uint_bld.zero;
376 indices[1] = index;
377
378 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
379 LLVMBuildStore(gallivm->builder, value, pointer);
380 }
381
/**
 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform
 */
static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
				       LLVMValueRef base_ptr, LLVMValueRef index,
				       bool uniform)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2], pointer;

	/* First GEP index steps through the pointer (always 0), the second
	 * selects the element. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = index;

	pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
	if (uniform)
		LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
	return LLVMBuildLoad(gallivm->builder, pointer, "");
}
407
/**
 * Do a load from &base_ptr[index], but also add a flag that it's loading
 * a constant from a dynamically uniform index.
 */
static LLVMValueRef build_indexed_load_const(
	struct si_shader_context *ctx,
	LLVMValueRef base_ptr, LLVMValueRef index)
{
	LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
	/* Metadata kind 1 with const_md marks the load as constant —
	 * NOTE(review): kind 1 is presumably "invariant.load"; confirm
	 * where const_md is created. */
	LLVMSetMetadata(result, 1, ctx->const_md);
	return result;
}
420
/* Compute the vertex-buffer fetch index for instanced attributes:
 * (InstanceID / divisor) + StartInstance. */
static LLVMValueRef get_instance_index_for_fetch(
	struct radeon_llvm_context *radeon_bld,
	unsigned param_start_instance, unsigned divisor)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;

	LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				lp_build_const_int32(gallivm, divisor), "");

	return LLVMBuildAdd(gallivm->builder, result,
			LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
}
440
/**
 * Declare one vertex shader input attribute: compute the vertex-buffer
 * index (prolog parameter, instanced index, or base_vertex + vertex_id),
 * fetch the attribute with llvm.SI.vs.load.input, and store its four
 * components into the TGSI input slots.
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* Non-monolithic: the prolog already computed the index and
		 * passes it in as an extra parameter. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
		"llvm.SI.vs.load.input", ctx->v4f32, args, 3,
		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
				LLVMBuildExtractElement(gallivm->builder,
				input, llvm_chan, "");
	}
}
509
/* Return the PrimitiveID system value for the current shader stage;
 * only the x component (swizzle 0) is meaningful, others read as 0. */
static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
				     unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	if (swizzle > 0)
		return bld_base->uint_bld.zero;

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    SI_PARAM_PATCH_ID);
	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    ctx->param_tes_patch_id);
	case PIPE_SHADER_GEOMETRY:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    SI_PARAM_PRIMITIVE_ID);
	default:
		assert(0);
		return bld_base->uint_bld.zero;
	}
}
536
/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 */
static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
				       const struct tgsi_ind_register *ind,
				       int rel_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef result;

	/* Load the current value of the address register, then add the
	 * constant relative offset. */
	result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	result = LLVMBuildAdd(gallivm->builder, result,
			      lp_build_const_int32(gallivm, rel_index), "");
	return result;
}
554
/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
	LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
	LLVMValueRef cc;

	/* LLVM 3.8: If indirect resource indexing is used:
	 * - SI & CIK hang
	 * - VI crashes
	 */
	if (HAVE_LLVM <= 0x0308)
		return LLVMGetUndef(ctx->i32);

	if (util_is_power_of_two(num)) {
		/* Power-of-two bound: a bit-wise AND clamps in one op. */
		result = LLVMBuildAnd(builder, result, c_max, "");
	} else {
		/* In theory, this MAX pattern should result in code that is
		 * as good as the bit-wise AND above.
		 *
		 * In practice, LLVM generates worse code (at the time of
		 * writing), because its value tracking is not strong enough.
		 */
		cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
		result = LLVMBuildSelect(builder, cc, result, c_max, "");
	}

	return result;
}
591
592
/**
 * Calculate a dword address given an input or output register and a stride.
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* For a declared array, the semantic is taken from the
		 * array's first element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register occupies 4 dwords (one per component). */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				    LLVMBuildMul(gallivm->builder, ind_index,
						 lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
677
678 /* The offchip buffer layout for TCS->TES is
679 *
680 * - attribute 0 of patch 0 vertex 0
681 * - attribute 0 of patch 0 vertex 1
682 * - attribute 0 of patch 0 vertex 2
683 * ...
684 * - attribute 0 of patch 1 vertex 0
685 * - attribute 0 of patch 1 vertex 1
686 * ...
687 * - attribute 1 of patch 0 vertex 0
688 * - attribute 1 of patch 0 vertex 1
689 * ...
690 * - per patch attribute 0 of patch 0
691 * - per patch attribute 0 of patch 1
692 * ...
693 *
694 * Note that every attribute has 4 components.
695 */
/* Compute a byte address into the off-chip TCS->TES buffer (layout described
 * in the comment above). With a vertex_index, addresses a per-vertex
 * attribute; with vertex_index == NULL, a per-patch attribute. */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
	                              num_patches, "");

	/* Every attribute is 4 dwords = 16 bytes. */
	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex: index = RelPatchID * verts_per_patch + vertex;
		 * attributes are strided by the total vertex count. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
		                         vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
		                         vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch: indexed by patch, strided by patch count. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
	                         LLVMBuildMul(gallivm->builder, param_index,
	                                      param_stride, ""), "");

	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch attributes live after all per-vertex data. */
		LLVMValueRef patch_data_offset =
		           unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
		                         patch_data_offset, "");
	}
	return base_addr;
}
738
/* Like get_tcs_tes_buffer_address, but derives vertex and parameter indices
 * from a TGSI source or destination register (exactly one of dst/src is
 * used; src takes precedence). */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
                        struct si_shader_context *ctx,
                        const struct tgsi_full_dst_register *dst,
                        const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2-dimensional register: first dimension selects the vertex. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
			                                  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
			                                    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* For a declared array, the semantic comes from the array's
		 * first element. */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
		                                 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	param_index_base = si_shader_io_get_unique_index(name[param_base],
	                                                 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
	                           lp_build_const_int32(gallivm, param_index_base),
	                           "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
801
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4). */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Argument order matches the llvm.SI.tbuffer.store.* intrinsic. */
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, ARRAY_SIZE(args), 0);
}
849
850 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
851 LLVMValueRef rsrc,
852 LLVMValueRef vdata,
853 unsigned num_channels,
854 LLVMValueRef vaddr,
855 LLVMValueRef soffset,
856 unsigned inst_offset)
857 {
858 static unsigned dfmt[] = {
859 V_008F0C_BUF_DATA_FORMAT_32,
860 V_008F0C_BUF_DATA_FORMAT_32_32,
861 V_008F0C_BUF_DATA_FORMAT_32_32_32,
862 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
863 };
864 assert(num_channels >= 1 && num_channels <= 4);
865
866 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
867 inst_offset, dfmt[num_channels-1],
868 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
869 }
870
/**
 * Emit a buffer load of 1, 2, or 4 dwords.
 *
 * On LLVM 3.9+ this uses llvm.amdgcn.buffer.load.{f32,v2f32,v4f32}; the
 * voffset/soffset values are folded into the single offset argument there.
 * Older LLVM uses llvm.SI.buffer.load.dword.* with explicit offen/idxen.
 */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* num_channels 3 and 4 share the vec4 variant. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		/* Fold the variable offsets into the single offset arg. */
		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute |
					  LLVMNoUnwindAttribute);
	} else {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		/* With both an index and an offset, the address argument
		 * becomes a <2 x i32> of {vindex, voffset}. */
		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute |
					  LLVMNoUnwindAttribute);
	}
}
948
/**
 * Load a value of the given TGSI type from a buffer resource.
 *
 * \param swizzle  component to load (0..3), or ~0 to load the whole vec4
 * \param buffer   buffer resource descriptor
 * \param offset   offset passed through as soffset to build_buffer_load
 * \param base     offset passed through as voffset to build_buffer_load
 *
 * 64-bit types are assembled from two adjacent dword loads.
 */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* Load the whole vec4 and extract the requested component. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
				    lp_build_const_int32(gallivm, swizzle), "");
	}

	/* 64-bit: load the two dwords at swizzle*4 and swizzle*4+4. */
	value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
	                          swizzle * 4, 1, 0);

	value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
	                           swizzle * 4 + 4, 1, 0);

	return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
984
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* vec4: recurse once per channel and gather the results. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			    lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit: also load the next dword and combine. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
1025
1026 /**
1027 * Store to LDS.
1028 *
1029 * \param swizzle offset (typically 0..3)
1030 * \param dw_addr address in dwords
1031 * \param value value to store
1032 */
1033 static void lds_store(struct lp_build_tgsi_context *bld_base,
1034 unsigned swizzle, LLVMValueRef dw_addr,
1035 LLVMValueRef value)
1036 {
1037 struct si_shader_context *ctx = si_shader_context(bld_base);
1038 struct gallivm_state *gallivm = bld_base->base.gallivm;
1039
1040 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1041 lp_build_const_int32(gallivm, swizzle));
1042
1043 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1044 build_indexed_store(ctx, ctx->lds,
1045 dw_addr, value);
1046 }
1047
1048 static LLVMValueRef fetch_input_tcs(
1049 struct lp_build_tgsi_context *bld_base,
1050 const struct tgsi_full_src_register *reg,
1051 enum tgsi_opcode_type type, unsigned swizzle)
1052 {
1053 struct si_shader_context *ctx = si_shader_context(bld_base);
1054 LLVMValueRef dw_addr, stride;
1055
1056 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1057 dw_addr = get_tcs_in_current_patch_offset(ctx);
1058 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1059
1060 return lds_load(bld_base, type, swizzle, dw_addr);
1061 }
1062
1063 static LLVMValueRef fetch_output_tcs(
1064 struct lp_build_tgsi_context *bld_base,
1065 const struct tgsi_full_src_register *reg,
1066 enum tgsi_opcode_type type, unsigned swizzle)
1067 {
1068 struct si_shader_context *ctx = si_shader_context(bld_base);
1069 LLVMValueRef dw_addr, stride;
1070
1071 if (reg->Register.Dimension) {
1072 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1073 dw_addr = get_tcs_out_current_patch_offset(ctx);
1074 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1075 } else {
1076 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1077 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1078 }
1079
1080 return lds_load(bld_base, type, swizzle, dw_addr);
1081 }
1082
1083 static LLVMValueRef fetch_input_tes(
1084 struct lp_build_tgsi_context *bld_base,
1085 const struct tgsi_full_src_register *reg,
1086 enum tgsi_opcode_type type, unsigned swizzle)
1087 {
1088 struct si_shader_context *ctx = si_shader_context(bld_base);
1089 struct gallivm_state *gallivm = bld_base->base.gallivm;
1090 LLVMValueRef rw_buffers, buffer, base, addr;
1091
1092 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1093 SI_PARAM_RW_BUFFERS);
1094 buffer = build_indexed_load_const(ctx, rw_buffers,
1095 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1096
1097 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1098 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1099
1100 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1101 }
1102
/* Store a TCS output both to LDS (so the TCS itself can read it back via
 * fetch_output_tcs) and to the off-chip TESS ring buffer (read by the TES).
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS address of the output. */
	if (reg->Register.Dimension) {
		/* Per-vertex output: stride from TCS_OUT_LAYOUT bits [13..20]. */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Get the off-chip ring descriptor and the buffer address. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* Always store the channel to LDS for readback. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each enabled dword on its own. */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: coalesce into one 4-dword buffer store. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1168
/* Fetch a GS input: read vertex data that the ES stage wrote to the
 * ESGS ring buffer.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* PRIMID is not in the ring; it's computed separately. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	/* Vec4 fetch: recurse per channel and gather. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The SGPR holds a dword offset; convert to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	/* 64-bit inputs need a second dword loaded from the next slot. */
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1247
1248 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1249 {
1250 switch (interpolate) {
1251 case TGSI_INTERPOLATE_CONSTANT:
1252 return 0;
1253
1254 case TGSI_INTERPOLATE_LINEAR:
1255 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1256 return SI_PARAM_LINEAR_SAMPLE;
1257 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1258 return SI_PARAM_LINEAR_CENTROID;
1259 else
1260 return SI_PARAM_LINEAR_CENTER;
1261 break;
1262 case TGSI_INTERPOLATE_COLOR:
1263 case TGSI_INTERPOLATE_PERSPECTIVE:
1264 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1265 return SI_PARAM_PERSP_SAMPLE;
1266 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1267 return SI_PARAM_PERSP_CENTROID;
1268 else
1269 return SI_PARAM_PERSP_CENTER;
1270 break;
1271 default:
1272 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1273 return -1;
1274 }
1275 }
1276
1277 /* This shouldn't be used by explicit INTERP opcodes. */
1278 static unsigned select_interp_param(struct si_shader_context *ctx,
1279 unsigned param)
1280 {
1281 if (!ctx->shader->key.ps.prolog.force_persample_interp ||
1282 !ctx->is_monolithic)
1283 return param;
1284
1285 /* If the shader doesn't use center/centroid, just return the parameter.
1286 *
1287 * If the shader only uses one set of (i,j), "si_emit_spi_ps_input" can
1288 * switch between center/centroid and sample without shader changes.
1289 */
1290 switch (param) {
1291 case SI_PARAM_PERSP_CENTROID:
1292 case SI_PARAM_PERSP_CENTER:
1293 return SI_PARAM_PERSP_SAMPLE;
1294
1295 case SI_PARAM_LINEAR_CENTROID:
1296 case SI_PARAM_LINEAR_CENTER:
1297 return SI_PARAM_LINEAR_SAMPLE;
1298
1299 default:
1300 return param;
1301 }
1302 }
1303
1304 /**
1305 * Interpolate a fragment shader input.
1306 *
1307 * @param ctx context
1308 * @param input_index index of the input in hardware
1309 * @param semantic_name TGSI_SEMANTIC_*
1310 * @param semantic_index semantic index
1311 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1312 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1313 * @param interp_param interpolation weights (i,j)
1314 * @param prim_mask SI_PARAM_PRIM_MASK
1315 * @param face SI_PARAM_FRONT_FACE
1316 * @param result the return value (4 components)
1317 */
1318 static void interp_fs_input(struct si_shader_context *ctx,
1319 unsigned input_index,
1320 unsigned semantic_name,
1321 unsigned semantic_index,
1322 unsigned num_interp_inputs,
1323 unsigned colors_read_mask,
1324 LLVMValueRef interp_param,
1325 LLVMValueRef prim_mask,
1326 LLVMValueRef face,
1327 LLVMValueRef result[4])
1328 {
1329 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1330 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1331 struct gallivm_state *gallivm = base->gallivm;
1332 const char *intr_name;
1333 LLVMValueRef attr_number;
1334
1335 unsigned chan;
1336
1337 attr_number = lp_build_const_int32(gallivm, input_index);
1338
1339 /* fs.constant returns the param from the middle vertex, so it's not
1340 * really useful for flat shading. It's meant to be used for custom
1341 * interpolation (but the intrinsic can't fetch from the other two
1342 * vertices).
1343 *
1344 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1345 * to do the right thing. The only reason we use fs.constant is that
1346 * fs.interp cannot be used on integers, because they can be equal
1347 * to NaN.
1348 */
1349 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1350
1351 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1352 ctx->shader->key.ps.prolog.color_two_side) {
1353 LLVMValueRef args[4];
1354 LLVMValueRef is_face_positive;
1355 LLVMValueRef back_attr_number;
1356
1357 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1358 * otherwise it's at offset "num_inputs".
1359 */
1360 unsigned back_attr_offset = num_interp_inputs;
1361 if (semantic_index == 1 && colors_read_mask & 0xf)
1362 back_attr_offset += 1;
1363
1364 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1365
1366 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1367 face, uint->zero, "");
1368
1369 args[2] = prim_mask;
1370 args[3] = interp_param;
1371 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1372 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1373 LLVMValueRef front, back;
1374
1375 args[0] = llvm_chan;
1376 args[1] = attr_number;
1377 front = lp_build_intrinsic(gallivm->builder, intr_name,
1378 ctx->f32, args, args[3] ? 4 : 3,
1379 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1380
1381 args[1] = back_attr_number;
1382 back = lp_build_intrinsic(gallivm->builder, intr_name,
1383 ctx->f32, args, args[3] ? 4 : 3,
1384 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1385
1386 result[chan] = LLVMBuildSelect(gallivm->builder,
1387 is_face_positive,
1388 front,
1389 back,
1390 "");
1391 }
1392 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1393 LLVMValueRef args[4];
1394
1395 args[0] = uint->zero;
1396 args[1] = attr_number;
1397 args[2] = prim_mask;
1398 args[3] = interp_param;
1399 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1400 ctx->f32, args, args[3] ? 4 : 3,
1401 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1402 result[1] =
1403 result[2] = lp_build_const_float(gallivm, 0.0f);
1404 result[3] = lp_build_const_float(gallivm, 1.0f);
1405 } else {
1406 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1407 LLVMValueRef args[4];
1408 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1409
1410 args[0] = llvm_chan;
1411 args[1] = attr_number;
1412 args[2] = prim_mask;
1413 args[3] = interp_param;
1414 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1415 ctx->f32, args, args[3] ? 4 : 3,
1416 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1417 }
1418 }
1419 }
1420
/* Declare a fragment shader input: either take the already-interpolated
 * color from VGPRs set up by the PS prolog, or emit the interpolation
 * intrinsics via interp_fs_input().
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		unsigned mask = colors_read >> (i * 4);
		/* Color VGPRs follow POS_FIXED_PT; color 1 starts after
		 * however many components of color 0 are read. */
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	/* interp_param_idx == 0 means flat shading (no weights needed). */
	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = LLVMGetParam(main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1471
1472 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1473 {
1474 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1475 SI_PARAM_ANCILLARY, 8, 4);
1476 }
1477
1478 /**
1479 * Set range metadata on an instruction. This can only be used on load and
1480 * call instructions. If you know an instruction can only produce the values
1481 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1482 * \p lo is the minimum value inclusive.
1483 * \p hi is the maximum value exclusive.
1484 */
1485 static void set_range_metadata(LLVMValueRef value, unsigned lo, unsigned hi)
1486 {
1487 const char *range_md_string = "range";
1488 LLVMValueRef range_md, md_args[2];
1489 LLVMTypeRef type = LLVMTypeOf(value);
1490 LLVMContextRef context = LLVMGetTypeContext(type);
1491 unsigned md_range_id = LLVMGetMDKindIDInContext(context,
1492 range_md_string, strlen(range_md_string));
1493
1494 md_args[0] = LLVMConstInt(type, lo, false);
1495 md_args[1] = LLVMConstInt(type, hi, false);
1496 range_md = LLVMMDNodeInContext(context, md_args, 2);
1497 LLVMSetMetadata(value, md_range_id, range_md);
1498 }
1499
1500 static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
1501 {
1502 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1503 LLVMValueRef tid;
1504
1505 if (HAVE_LLVM < 0x0308) {
1506 tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
1507 ctx->i32, NULL, 0, LLVMReadNoneAttribute);
1508 } else {
1509 LLVMValueRef tid_args[2];
1510 tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
1511 tid_args[1] = lp_build_const_int32(gallivm, 0);
1512 tid_args[1] = lp_build_intrinsic(gallivm->builder,
1513 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1514 tid_args, 2, LLVMReadNoneAttribute);
1515
1516 tid = lp_build_intrinsic(gallivm->builder,
1517 "llvm.amdgcn.mbcnt.hi", ctx->i32,
1518 tid_args, 2, LLVMReadNoneAttribute);
1519 }
1520 set_range_metadata(tid, 0, 64);
1521 return tid;
1522 }
1523
1524 /**
1525 * Load a dword from a constant buffer.
1526 */
1527 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1528 LLVMValueRef offset, LLVMTypeRef return_type)
1529 {
1530 LLVMValueRef args[2] = {resource, offset};
1531
1532 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1533 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1534 }
1535
1536 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1537 {
1538 struct si_shader_context *ctx =
1539 si_shader_context(&radeon_bld->soa.bld_base);
1540 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1541 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1542 LLVMBuilderRef builder = gallivm->builder;
1543 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1544 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1545 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1546
1547 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1548 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1549 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1550
1551 LLVMValueRef pos[4] = {
1552 buffer_load_const(builder, resource, offset0, ctx->f32),
1553 buffer_load_const(builder, resource, offset1, ctx->f32),
1554 lp_build_const_float(gallivm, 0),
1555 lp_build_const_float(gallivm, 0)
1556 };
1557
1558 return lp_build_gather_values(gallivm, pos, 4);
1559 }
1560
/* Build the LLVM value for one TGSI system-value declaration and store
 * it in radeon_bld->system_values[index].
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VertexID = VS input vertex id + base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* Fragment position; W is reciprocated. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position = fractional part of the fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count from bits [26..31] of TCS_OUT_LAYOUT. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels are read back from the off-chip TESS buffer. */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels come from a driver constant buffer;
		 * inner levels start at dword offset 4. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(gallivm->builder, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4),
						   ctx->f32);
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* Block size is a compile-time constant from the CS
		 * FIXED_BLOCK properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* Helper invocation = NOT ps.live, sign-extended to i32. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1765
1766 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1767 const struct tgsi_full_declaration *decl)
1768 {
1769 struct si_shader_context *ctx =
1770 si_shader_context(&radeon_bld->soa.bld_base);
1771 struct si_shader_selector *sel = ctx->shader->selector;
1772 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1773
1774 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1775 LLVMValueRef var;
1776
1777 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1778 assert(decl->Range.First == decl->Range.Last);
1779 assert(!ctx->shared_memory);
1780
1781 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1782 LLVMArrayType(ctx->i8, sel->local_size),
1783 "compute_lds",
1784 LOCAL_ADDR_SPACE);
1785 LLVMSetAlignment(var, 4);
1786
1787 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1788 }
1789
1790 static LLVMValueRef fetch_constant(
1791 struct lp_build_tgsi_context *bld_base,
1792 const struct tgsi_full_src_register *reg,
1793 enum tgsi_opcode_type type,
1794 unsigned swizzle)
1795 {
1796 struct si_shader_context *ctx = si_shader_context(bld_base);
1797 struct lp_build_context *base = &bld_base->base;
1798 const struct tgsi_ind_register *ireg = &reg->Indirect;
1799 unsigned buf, idx;
1800
1801 LLVMValueRef addr, bufp;
1802 LLVMValueRef result;
1803
1804 if (swizzle == LP_CHAN_ALL) {
1805 unsigned chan;
1806 LLVMValueRef values[4];
1807 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1808 values[chan] = fetch_constant(bld_base, reg, type, chan);
1809
1810 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1811 }
1812
1813 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1814 idx = reg->Register.Index * 4 + swizzle;
1815
1816 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1817 if (!tgsi_type_is_64bit(type))
1818 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1819 else {
1820 return radeon_llvm_emit_fetch_64bit(bld_base, type,
1821 ctx->constants[buf][idx],
1822 ctx->constants[buf][idx + 1]);
1823 }
1824 }
1825
1826 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1827 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1828 LLVMValueRef index;
1829 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1830 reg->Dimension.Index,
1831 SI_NUM_CONST_BUFFERS);
1832 bufp = build_indexed_load_const(ctx, ptr, index);
1833 } else
1834 bufp = ctx->const_buffers[buf];
1835
1836 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1837 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1838 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1839 addr = lp_build_add(&bld_base->uint_bld, addr,
1840 lp_build_const_int32(base->gallivm, idx * 4));
1841
1842 result = buffer_load_const(base->gallivm->builder, bufp,
1843 addr, ctx->f32);
1844
1845 if (!tgsi_type_is_64bit(type))
1846 result = bitcast(bld_base, type, result);
1847 else {
1848 LLVMValueRef addr2, result2;
1849 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1850 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1851 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1852 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1853 lp_build_const_int32(base->gallivm, idx * 4));
1854
1855 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1856 addr2, ctx->f32);
1857
1858 result = radeon_llvm_emit_fetch_64bit(bld_base, type,
1859 result, result2);
1860 }
1861 return result;
1862 }
1863
1864 /* Upper 16 bits must be zero. */
1865 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1866 LLVMValueRef val[2])
1867 {
1868 return LLVMBuildOr(gallivm->builder, val[0],
1869 LLVMBuildShl(gallivm->builder, val[1],
1870 lp_build_const_int32(gallivm, 16),
1871 ""), "");
1872 }
1873
1874 /* Upper 16 bits are ignored and will be dropped. */
1875 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1876 LLVMValueRef val[2])
1877 {
1878 LLVMValueRef v[2] = {
1879 LLVMBuildAnd(gallivm->builder, val[0],
1880 lp_build_const_int32(gallivm, 0xffff), ""),
1881 val[1],
1882 };
1883 return si_llvm_pack_two_int16(gallivm, v);
1884 }
1885
/* Initialize arguments for the shader export intrinsic.
 *
 * Fills args[0..8] for llvm.SI.export:
 *   args[0] = writemask, args[1] = valid-mask-is-EXEC flag,
 *   args[2] = last-export flag, args[3] = export target,
 *   args[4] = COMPR (compressed/16-bit) flag, args[5..8] = the 4 channels.
 *
 * For fragment shaders the values are converted/packed according to the
 * per-MRT SPI_SHADER_COL_FORMAT from the shader key; other shader types
 * always use the 32_ABGR passthrough format.
 */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef val[4];
	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
	unsigned chan;
	bool is_int8;

	/* Default is 0xf. Adjusted below depending on the format. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	if (ctx->type == PIPE_SHADER_FRAGMENT) {
		const union si_shader_key *key = &ctx->shader->key;
		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		/* Each MRT has a 4-bit format field in spi_shader_col_format. */
		assert(cbuf >= 0 && cbuf < 8);
		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
	}
	/* NOTE: is_int8 stays uninitialized for non-fragment shaders; it is
	 * only read by the UINT16/SINT16 cases below, which presumably occur
	 * only for fragment shaders (default format is 32_ABGR otherwise). */

	args[4] = uint->zero; /* COMPR flag */
	args[5] = base->undef;
	args[6] = base->undef;
	args[7] = base->undef;
	args[8] = base->undef;

	switch (spi_shader_col_format) {
	case V_028714_SPI_SHADER_ZERO:
		/* Nothing is written; redirect to the null export target. */
		args[0] = uint->zero; /* writemask */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
		break;

	case V_028714_SPI_SHADER_32_R:
		args[0] = uint->one; /* writemask */
		args[5] = values[0];
		break;

	case V_028714_SPI_SHADER_32_GR:
		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
		args[5] = values[0];
		args[6] = values[1];
		break;

	case V_028714_SPI_SHADER_32_AR:
		/* Red in X, alpha in W. */
		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
		args[5] = values[0];
		args[8] = values[3];
		break;

	case V_028714_SPI_SHADER_FP16_ABGR:
		args[4] = uint->one; /* COMPR flag */

		/* Pack pairs of channels into f16x2 via llvm.SI.packf16. */
		for (chan = 0; chan < 2; chan++) {
			LLVMValueRef pack_args[2] = {
				values[2 * chan],
				values[2 * chan + 1]
			};
			LLVMValueRef packed;

			packed = lp_build_intrinsic(base->gallivm->builder,
						    "llvm.SI.packf16",
						    ctx->i32, pack_args, 2,
						    LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
			args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 packed, ctx->f32, "");
		}
		break;

	case V_028714_SPI_SHADER_UNORM16_ABGR:
		/* Convert [0,1] floats to 16-bit unorm with round-to-nearest
		 * (scale by 65535, add 0.5, truncate). */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 65535), "");
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  lp_build_const_float(gallivm, 0.5), "");
			val[chan] = LLVMBuildFPToUI(builder, val[chan],
						    ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_SNORM16_ABGR:
		for (chan = 0; chan < 4; chan++) {
			/* Clamp between [-1, 1]. */
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
							      values[chan],
							      lp_build_const_float(gallivm, 1));
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
							      val[chan],
							      lp_build_const_float(gallivm, -1));
			/* Convert to a signed integer in [-32767, 32767]. */
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 32767), "");
			/* If positive, add 0.5, else add -0.5. */
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  LLVMBuildSelect(builder,
								  LLVMBuildFCmp(builder, LLVMRealOGE,
										val[chan], base->zero, ""),
								  lp_build_const_float(gallivm, 0.5),
								  lp_build_const_float(gallivm, -0.5), ""), "");
			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_UINT16_ABGR: {
		/* 8-bit int MRTs clamp to 255, 16-bit ones to 65535. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							255 : 65535);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
							      val[chan], max);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_SINT16_ABGR: {
		/* 8-bit int MRTs clamp to [-128,127], 16-bit to [-32768,32767]. */
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							127 : 32767);
		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
							-128 : -32768);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMIN,
							      val[chan], max);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMAX,
							      val[chan], min);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_32_ABGR:
		/* Raw passthrough of all four 32-bit channels. */
		memcpy(&args[5], values, sizeof(values[0]) * 4);
		break;
	}
}
2068
2069 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2070 LLVMValueRef alpha)
2071 {
2072 struct si_shader_context *ctx = si_shader_context(bld_base);
2073 struct gallivm_state *gallivm = bld_base->base.gallivm;
2074
2075 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2076 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2077 SI_PARAM_ALPHA_REF);
2078
2079 LLVMValueRef alpha_pass =
2080 lp_build_cmp(&bld_base->base,
2081 ctx->shader->key.ps.epilog.alpha_func,
2082 alpha, alpha_ref);
2083 LLVMValueRef arg =
2084 lp_build_select(&bld_base->base,
2085 alpha_pass,
2086 lp_build_const_float(gallivm, 1.0f),
2087 lp_build_const_float(gallivm, -1.0f));
2088
2089 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2090 ctx->voidt, &arg, 1, 0);
2091 } else {
2092 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2093 ctx->voidt, NULL, 0, 0);
2094 }
2095 }
2096
2097 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2098 LLVMValueRef alpha,
2099 unsigned samplemask_param)
2100 {
2101 struct si_shader_context *ctx = si_shader_context(bld_base);
2102 struct gallivm_state *gallivm = bld_base->base.gallivm;
2103 LLVMValueRef coverage;
2104
2105 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2106 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2107 samplemask_param);
2108 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2109
2110 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2111 ctx->i32,
2112 &coverage, 1, LLVMReadNoneAttribute);
2113
2114 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2115 ctx->f32, "");
2116
2117 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2118 lp_build_const_float(gallivm,
2119 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2120
2121 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2122 }
2123
/* Compute the signed distances from the clip vertex to each user clip
 * plane and fill in the export arguments for the two clip-distance
 * position exports (pos[2] and pos[3], 4 distances each).
 *
 * The plane equations are read from the SI_VS_CONST_CLIP_PLANES constant
 * buffer; out_elts holds the 4 components of the clip vertex.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start all four distance accumulators at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* args[1] is reused as scratch here: the byte offset
				 * of plane[reg_index*4+chan] component const_chan
				 * (each float is 4 bytes); it is overwritten with its
				 * real export value after the loops. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
							     args[1], ctx->f32);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Fill in the remaining llvm.SI.export arguments. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		args[1] = uint->zero; /* EXEC-is-valid-mask flag */
		args[2] = uint->zero; /* not the last export */
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero; /* COMPR flag */
	}
}
2170
2171 static void si_dump_streamout(struct pipe_stream_output_info *so)
2172 {
2173 unsigned i;
2174
2175 if (so->num_outputs)
2176 fprintf(stderr, "STREAMOUT\n");
2177
2178 for (i = 0; i < so->num_outputs; i++) {
2179 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2180 so->output[i].start_component;
2181 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2182 i, so->output[i].output_buffer,
2183 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2184 so->output[i].register_index,
2185 mask & 1 ? "x" : "",
2186 mask & 2 ? "y" : "",
2187 mask & 4 ? "z" : "",
2188 mask & 8 ? "w" : "");
2189 }
2190 }
2191
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Emits guarded tbuffer stores that write the selected shader outputs to
 * the bound streamout buffers. Stores are predicated on the thread's
 * vertex being within so_vtx_count and on the output's stream matching
 * the currently selected stream_id (both decoded from the streamout
 * config SGPR). */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Currently selected stream, bits [25:24] of the config SGPR. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			/* The per-buffer offset SGPR is in dwords; scale to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			/* Defensive: skip malformed entries instead of
			 * indexing out of bounds. */
			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store when this output's stream is the one
			 * currently being captured. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2308
2309
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if enabled), one llvm.SI.export per generic
 * parameter, and the position exports (position, misc vector with
 * psize/edgeflag/layer/viewport, clip distances). Position exports are
 * buffered in pos_args and emitted last so the final one can carry the
 * "last export" bit. Also records nr_param_exports / nr_pos_exports and
 * the per-output parameter offsets in shader->info. */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector (pos_args[1]) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the misc vector (pos_args[1]) below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Save for the misc vector, then also export it as a
			 * generic parameter so the FS can read it. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			/* Same dual handling as LAYER. */
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Expanded into clip-distance exports (pos_args[2..3]). */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Buffer position exports; they are emitted last,
			 * after the "last export" flag can be determined. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			/* Clip distances are additionally exported as a
			 * generic parameter for FS consumption. */
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2484
/* Copy the TCS inputs selected by the shader key (inputs_to_copy) from
 * LDS to the off-chip tessellation ring buffer, so the TES can read them.
 * Each selected input is one vec4 (4 dwords). */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	/* Invocation id from bits [12:8] of the REL_IDS SGPR. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	/* Descriptor of the off-chip tess ring buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);

	/* LDS address of this invocation's vertex inputs:
	 * current patch base + invocation_id * per-vertex stride. */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Iterate over the set bits, one per input to copy. */
	inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                            lp_build_const_int32(gallivm, 4 * i),
		                             "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
					              invocation_id,
					              lp_build_const_int32(gallivm, i));

		/* Load the vec4 from LDS... */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
					      lds_ptr);

		/* ...and store it to the off-chip ring. */
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
					   buffer_offset, 0);
	}
}
2526
/* Write the tessellation factors for the current patch to the tess factor
 * ring buffer.
 *
 * The inner/outer factors are read back from LDS (any invocation may have
 * written them), gathered into vectors, and stored via tbuffer writes.
 * Only invocation 0 performs the stores; patch 0 additionally writes the
 * dynamic HS control word at offset 0. */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all invocations have written their outputs to LDS
	 * before invocation 0 reads the tess levels back. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer factors first, then inner, matching the buffer layout. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only patch 0 writes the control word at the start of the ring. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. The +4 byte offset skips the
	 * control word; +20 is the tail after the first vec4 store. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2632
/* This only writes the tessellation factor levels.
 *
 * In the non-monolithic case the epilog is a separate function part, so
 * this instead packs the values the epilog needs (RW_BUFFERS pointer,
 * tess factor soffset, and the three VGPRs) into the function's return
 * value. In the monolithic case it copies the TCS inputs and writes the
 * tess factors directly. */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer: split the 64-bit pointer into two
		 * i32s so it can be returned in two SGPR slots. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs: the return slots are f32, so bitcast the i32s. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2684
2685 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2686 {
2687 struct si_shader_context *ctx = si_shader_context(bld_base);
2688 struct si_shader *shader = ctx->shader;
2689 struct tgsi_shader_info *info = &shader->selector->info;
2690 struct gallivm_state *gallivm = bld_base->base.gallivm;
2691 unsigned i, chan;
2692 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2693 ctx->param_rel_auto_id);
2694 LLVMValueRef vertex_dw_stride =
2695 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2696 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2697 vertex_dw_stride, "");
2698
2699 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2700 * its inputs from it. */
2701 for (i = 0; i < info->num_outputs; i++) {
2702 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2703 unsigned name = info->output_semantic_name[i];
2704 unsigned index = info->output_semantic_index[i];
2705 int param = si_shader_io_get_unique_index(name, index);
2706 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2707 lp_build_const_int32(gallivm, param * 4), "");
2708
2709 for (chan = 0; chan < 4; chan++) {
2710 lds_store(bld_base, chan, dw_addr,
2711 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2712 }
2713 }
2714 }
2715
/* ES epilog: store all outputs to the ES->GS ring buffer so the geometry
 * shader can read them. VIEWPORT_INDEX and LAYER are skipped; each stored
 * component is one dword at (4 * param_index + chan) * 4 bytes. */
static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *es = ctx->shader;
	struct tgsi_shader_info *info = &es->selector->info;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    ctx->param_es2gs_offset);
	unsigned chan;
	int i;

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];
		int param_index;

		/* These are consumed by the VS export path, not the GS. */
		if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;

		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
							    info->output_semantic_index[i]);

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			/* One dword per component, 32-bit uint format,
			 * offset enable + index disable (trailing 0,0,1,1,0). */
			build_tbuffer_store(ctx,
					    ctx->esgs_ring,
					    out_val, 1,
					    LLVMGetUndef(ctx->i32), soffset,
					    (4 * param_index + chan) * 4,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    0, 0, 1, 1, 0);
		}
	}
}
2754
2755 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2756 {
2757 struct si_shader_context *ctx = si_shader_context(bld_base);
2758 struct gallivm_state *gallivm = bld_base->base.gallivm;
2759 LLVMValueRef args[2];
2760
2761 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2762 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2763 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2764 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
2765 }
2766
2767 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2768 {
2769 struct si_shader_context *ctx = si_shader_context(bld_base);
2770 struct gallivm_state *gallivm = bld_base->base.gallivm;
2771 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2772 struct si_shader_output_values *outputs = NULL;
2773 int i,j;
2774
2775 assert(!ctx->is_gs_copy_shader);
2776
2777 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2778
2779 /* Vertex color clamping.
2780 *
2781 * This uses a state constant loaded in a user data SGPR and
2782 * an IF statement is added that clamps all colors if the constant
2783 * is true.
2784 */
2785 if (ctx->type == PIPE_SHADER_VERTEX) {
2786 struct lp_build_if_state if_ctx;
2787 LLVMValueRef cond = NULL;
2788 LLVMValueRef addr, val;
2789
2790 for (i = 0; i < info->num_outputs; i++) {
2791 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2792 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2793 continue;
2794
2795 /* We've found a color. */
2796 if (!cond) {
2797 /* The state is in the first bit of the user SGPR. */
2798 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2799 SI_PARAM_VS_STATE_BITS);
2800 cond = LLVMBuildTrunc(gallivm->builder, cond,
2801 ctx->i1, "");
2802 lp_build_if(&if_ctx, gallivm, cond);
2803 }
2804
2805 for (j = 0; j < 4; j++) {
2806 addr = ctx->radeon_bld.soa.outputs[i][j];
2807 val = LLVMBuildLoad(gallivm->builder, addr, "");
2808 val = radeon_llvm_saturate(bld_base, val);
2809 LLVMBuildStore(gallivm->builder, val, addr);
2810 }
2811 }
2812
2813 if (cond)
2814 lp_build_endif(&if_ctx);
2815 }
2816
2817 for (i = 0; i < info->num_outputs; i++) {
2818 outputs[i].name = info->output_semantic_name[i];
2819 outputs[i].sid = info->output_semantic_index[i];
2820
2821 for (j = 0; j < 4; j++)
2822 outputs[i].values[j] =
2823 LLVMBuildLoad(gallivm->builder,
2824 ctx->radeon_bld.soa.outputs[i][j],
2825 "");
2826 }
2827
2828 if (ctx->is_monolithic) {
2829 /* Export PrimitiveID when PS needs it. */
2830 if (si_vs_exports_prim_id(ctx->shader)) {
2831 outputs[i].name = TGSI_SEMANTIC_PRIMID;
2832 outputs[i].sid = 0;
2833 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2834 get_primitive_id(bld_base, 0));
2835 outputs[i].values[1] = bld_base->base.undef;
2836 outputs[i].values[2] = bld_base->base.undef;
2837 outputs[i].values[3] = bld_base->base.undef;
2838 i++;
2839 }
2840 } else {
2841 /* Return the primitive ID from the LLVM function. */
2842 ctx->return_value =
2843 LLVMBuildInsertValue(gallivm->builder,
2844 ctx->return_value,
2845 bitcast(bld_base, TGSI_TYPE_FLOAT,
2846 get_primitive_id(bld_base, 0)),
2847 VS_EPILOG_PRIMID_LOC, "");
2848 }
2849
2850 si_llvm_export_vs(bld_base, outputs, i);
2851 FREE(outputs);
2852 }
2853
2854 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2855 LLVMValueRef depth, LLVMValueRef stencil,
2856 LLVMValueRef samplemask)
2857 {
2858 struct si_shader_context *ctx = si_shader_context(bld_base);
2859 struct lp_build_context *base = &bld_base->base;
2860 struct lp_build_context *uint = &bld_base->uint_bld;
2861 LLVMValueRef args[9];
2862 unsigned mask = 0;
2863
2864 assert(depth || stencil || samplemask);
2865
2866 args[1] = uint->one; /* whether the EXEC mask is valid */
2867 args[2] = uint->one; /* DONE bit */
2868
2869 /* Specify the target we are exporting */
2870 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2871
2872 args[4] = uint->zero; /* COMP flag */
2873 args[5] = base->undef; /* R, depth */
2874 args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2875 args[7] = base->undef; /* B, sample mask */
2876 args[8] = base->undef; /* A, alpha to mask */
2877
2878 if (depth) {
2879 args[5] = depth;
2880 mask |= 0x1;
2881 }
2882
2883 if (stencil) {
2884 args[6] = stencil;
2885 mask |= 0x2;
2886 }
2887
2888 if (samplemask) {
2889 args[7] = samplemask;
2890 mask |= 0x4;
2891 }
2892
2893 /* SI (except OLAND) has a bug that it only looks
2894 * at the X writemask component. */
2895 if (ctx->screen->b.chip_class == SI &&
2896 ctx->screen->b.family != CHIP_OLAND)
2897 mask |= 0x1;
2898
2899 /* Specify which components to enable */
2900 args[0] = lp_build_const_int32(base->gallivm, mask);
2901
2902 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2903 ctx->voidt, args, 9, 0);
2904 }
2905
/* Export one color output to MRT \p index, after applying the epilog key's
 * color post-processing in this fixed order: clamping, alpha-to-one,
 * alpha test, and line/polygon smoothing.
 *
 * \param color            the 4 color channels (may be modified in place)
 * \param index            which color output / MRT this is
 * \param samplemask_param function parameter index of SampleMaskIn,
 *                         used only for smoothing
 * \param is_last          whether this is the shader's final export; the
 *                         last export must set the DONE bit
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test (only applied to color output 0) */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			/* args[c][0] == zero means a disabled (NULL) export. */
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
					   ctx->voidt, args[c], 9, 0);
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, args, 9, 0);
	}
}
2974
2975 static void si_export_null(struct lp_build_tgsi_context *bld_base)
2976 {
2977 struct si_shader_context *ctx = si_shader_context(bld_base);
2978 struct lp_build_context *base = &bld_base->base;
2979 struct lp_build_context *uint = &bld_base->uint_bld;
2980 LLVMValueRef args[9];
2981
2982 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
2983 args[1] = uint->one; /* whether the EXEC mask is valid */
2984 args[2] = uint->one; /* DONE bit */
2985 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2986 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
2987 args[5] = uint->undef; /* R */
2988 args[6] = uint->undef; /* G */
2989 args[7] = uint->undef; /* B */
2990 args[8] = uint->undef; /* A */
2991
2992 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2993 ctx->voidt, args, 9, 0);
2994 }
2995
/* Pixel shader epilogue (monolithic path): read all PS outputs from their
 * alloca'd slots and emit the color and MRTZ exports directly, making sure
 * exactly one export carries the DONE bit.
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* spi_format has 4 bits per color buffer;
			 * a non-zero nibble means that buffer is enabled. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of position is exported. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* MRTZ, when present, is always the last export (see above). */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
}
3084
3085 /**
3086 * Return PS outputs in this order:
3087 *
3088 * v[0:3] = color0.xyzw
3089 * v[4:7] = color1.xyzw
3090 * ...
3091 * vN+0 = Depth
3092 * vN+1 = Stencil
3093 * vN+2 = SampleMask
3094 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3095 *
3096 * The alpha-ref SGPR is returned via its original location.
3097 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			/* Up to 8 color outputs, keyed by semantic index. */
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z channel of position is used. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure.
	 * The slot order must match the layout documented above:
	 * colors first, then depth/stencil/samplemask, then SampleMaskIn. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		/* Skip color outputs the shader never wrote. */
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
3178
3179 /**
3180 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3181 * buffer in number of elements and return it as an i32.
3182 */
3183 static LLVMValueRef get_buffer_size(
3184 struct lp_build_tgsi_context *bld_base,
3185 LLVMValueRef descriptor)
3186 {
3187 struct si_shader_context *ctx = si_shader_context(bld_base);
3188 struct gallivm_state *gallivm = bld_base->base.gallivm;
3189 LLVMBuilderRef builder = gallivm->builder;
3190 LLVMValueRef size =
3191 LLVMBuildExtractElement(builder, descriptor,
3192 lp_build_const_int32(gallivm, 6), "");
3193
3194 if (ctx->screen->b.chip_class >= VI) {
3195 /* On VI, the descriptor contains the size in bytes,
3196 * but TXQ must return the size in elements.
3197 * The stride is always non-zero for resources using TXQ.
3198 */
3199 LLVMValueRef stride =
3200 LLVMBuildExtractElement(builder, descriptor,
3201 lp_build_const_int32(gallivm, 5), "");
3202 stride = LLVMBuildLShr(builder, stride,
3203 lp_build_const_int32(gallivm, 16), "");
3204 stride = LLVMBuildAnd(builder, stride,
3205 lp_build_const_int32(gallivm, 0x3FFF), "");
3206
3207 size = LLVMBuildUDiv(builder, size, stride, "");
3208 }
3209
3210 return size;
3211 }
3212
3213 /**
3214 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3215 * intrinsic names).
3216 */
3217 static void build_int_type_name(
3218 LLVMTypeRef type,
3219 char *buf, unsigned bufsize)
3220 {
3221 assert(bufsize >= 6);
3222
3223 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
3224 snprintf(buf, bufsize, "v%ui32",
3225 LLVMGetVectorSize(type));
3226 else
3227 strcpy(buf, "i32");
3228 }
3229
3230 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3231 struct lp_build_tgsi_context *bld_base,
3232 struct lp_build_emit_data *emit_data);
3233
3234 /* Prevent optimizations (at least of memory accesses) across the current
3235 * point in the program by emitting empty inline assembly that is marked as
3236 * having side effects.
3237 */
3238 static void emit_optimization_barrier(struct si_shader_context *ctx)
3239 {
3240 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3241 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3242 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3243 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3244 }
3245
/* Emit s_waitcnt to wait for outstanding memory operations.
 *
 * 0xf70 is the raw simm16 operand of s_waitcnt. NOTE(review): assumed to
 * wait on the vector-memory and LGKM counters while leaving the export
 * counter alone — confirm the field decoding against the GCN ISA manual.
 */
static void emit_waitcnt(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef args[1] = {
		lp_build_const_int32(gallivm, 0xf70)
	};
	lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
			   ctx->voidt, args, 1, LLVMNoUnwindAttribute);
}
3256
/* TGSI memory barrier: implemented by waiting on outstanding memory
 * operations. The action/emit_data parameters are unused; the signature
 * is fixed by the lp_build_tgsi_action callback interface.
 */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	emit_waitcnt(ctx);
}
3266
3267 static LLVMValueRef
3268 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3269 const struct tgsi_full_src_register *reg)
3270 {
3271 LLVMValueRef ind_index;
3272 LLVMValueRef rsrc_ptr;
3273
3274 if (!reg->Register.Indirect)
3275 return ctx->shader_buffers[reg->Register.Index];
3276
3277 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
3278 reg->Register.Index,
3279 SI_NUM_SHADER_BUFFERS);
3280
3281 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
3282 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3283 }
3284
3285 static bool tgsi_is_array_sampler(unsigned target)
3286 {
3287 return target == TGSI_TEXTURE_1D_ARRAY ||
3288 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3289 target == TGSI_TEXTURE_2D_ARRAY ||
3290 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3291 target == TGSI_TEXTURE_CUBE_ARRAY ||
3292 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3293 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3294 }
3295
3296 static bool tgsi_is_array_image(unsigned target)
3297 {
3298 return target == TGSI_TEXTURE_3D ||
3299 target == TGSI_TEXTURE_CUBE ||
3300 target == TGSI_TEXTURE_1D_ARRAY ||
3301 target == TGSI_TEXTURE_2D_ARRAY ||
3302 target == TGSI_TEXTURE_CUBE_ARRAY ||
3303 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3304 }
3305
3306 /**
3307 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3308 *
3309 * At least on Tonga, executing image stores on images with DCC enabled and
3310 * non-trivial can eventually lead to lockups. This can occur when an
3311 * application binds an image as read-only but then uses a shader that writes
3312 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3313 * program termination) in this case, but it doesn't cost much to be a bit
3314 * nicer: disabling DCC in the shader still leads to undefined results but
3315 * avoids the lockup.
3316 */
3317 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3318 LLVMValueRef rsrc)
3319 {
3320 if (ctx->screen->b.chip_class <= CIK) {
3321 return rsrc;
3322 } else {
3323 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3324 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3325 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3326 LLVMValueRef tmp;
3327
3328 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3329 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3330 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3331 }
3332 }
3333
3334 /**
3335 * Load the resource descriptor for \p image.
3336 */
3337 static void
3338 image_fetch_rsrc(
3339 struct lp_build_tgsi_context *bld_base,
3340 const struct tgsi_full_src_register *image,
3341 bool dcc_off,
3342 LLVMValueRef *rsrc)
3343 {
3344 struct si_shader_context *ctx = si_shader_context(bld_base);
3345
3346 assert(image->Register.File == TGSI_FILE_IMAGE);
3347
3348 if (!image->Register.Indirect) {
3349 /* Fast path: use preloaded resources */
3350 *rsrc = ctx->images[image->Register.Index];
3351 } else {
3352 /* Indexing and manual load */
3353 LLVMValueRef ind_index;
3354 LLVMValueRef rsrc_ptr;
3355 LLVMValueRef tmp;
3356
3357 /* From the GL_ARB_shader_image_load_store extension spec:
3358 *
3359 * If a shader performs an image load, store, or atomic
3360 * operation using an image variable declared as an array,
3361 * and if the index used to select an individual element is
3362 * negative or greater than or equal to the size of the
3363 * array, the results of the operation are undefined but may
3364 * not lead to termination.
3365 */
3366 ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
3367 image->Register.Index,
3368 SI_NUM_IMAGES);
3369
3370 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
3371 tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
3372 if (dcc_off)
3373 tmp = force_dcc_off(ctx, tmp);
3374 *rsrc = tmp;
3375 }
3376 }
3377
3378 static LLVMValueRef image_fetch_coords(
3379 struct lp_build_tgsi_context *bld_base,
3380 const struct tgsi_full_instruction *inst,
3381 unsigned src)
3382 {
3383 struct gallivm_state *gallivm = bld_base->base.gallivm;
3384 LLVMBuilderRef builder = gallivm->builder;
3385 unsigned target = inst->Memory.Texture;
3386 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3387 LLVMValueRef coords[4];
3388 LLVMValueRef tmp;
3389 int chan;
3390
3391 for (chan = 0; chan < num_coords; ++chan) {
3392 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3393 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3394 coords[chan] = tmp;
3395 }
3396
3397 if (num_coords == 1)
3398 return coords[0];
3399
3400 if (num_coords == 3) {
3401 /* LLVM has difficulties lowering 3-element vectors. */
3402 coords[3] = bld_base->uint_bld.undef;
3403 num_coords = 4;
3404 }
3405
3406 return lp_build_gather_values(gallivm, coords, num_coords);
3407 }
3408
3409 /**
3410 * Append the extra mode bits that are used by image load and store.
3411 */
3412 static void image_append_args(
3413 struct si_shader_context *ctx,
3414 struct lp_build_emit_data * emit_data,
3415 unsigned target,
3416 bool atomic)
3417 {
3418 const struct tgsi_full_instruction *inst = emit_data->inst;
3419 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3420 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3421
3422 emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3423 emit_data->args[emit_data->arg_count++] =
3424 tgsi_is_array_image(target) ? i1true : i1false; /* da */
3425 if (!atomic) {
3426 emit_data->args[emit_data->arg_count++] =
3427 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3428 i1true : i1false; /* glc */
3429 }
3430 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3431 }
3432
3433 /**
3434 * Given a 256 bit resource, extract the top half (which stores the buffer
3435 * resource in the case of textures and images).
3436 */
3437 static LLVMValueRef extract_rsrc_top_half(
3438 struct si_shader_context *ctx,
3439 LLVMValueRef rsrc)
3440 {
3441 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3442 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3443 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3444
3445 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3446 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3447 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3448
3449 return rsrc;
3450 }
3451
3452 /**
3453 * Append the resource and indexing arguments for buffer intrinsics.
3454 *
3455 * \param rsrc the v4i32 buffer resource
3456 * \param index index into the buffer (stride-based)
3457 * \param offset byte offset into the buffer
3458 */
3459 static void buffer_append_args(
3460 struct si_shader_context *ctx,
3461 struct lp_build_emit_data *emit_data,
3462 LLVMValueRef rsrc,
3463 LLVMValueRef index,
3464 LLVMValueRef offset,
3465 bool atomic)
3466 {
3467 const struct tgsi_full_instruction *inst = emit_data->inst;
3468 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3469 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3470
3471 emit_data->args[emit_data->arg_count++] = rsrc;
3472 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3473 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3474 if (!atomic) {
3475 emit_data->args[emit_data->arg_count++] =
3476 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3477 i1true : i1false; /* glc */
3478 }
3479 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3480 }
3481
/* Build the intrinsic argument list for TGSI LOAD from a shader buffer
 * or an image (shared memory is handled separately in load_emit_memory).
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	/* Loads always produce a 4-component result. */
	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use the buffer resource from the
			 * descriptor's top half and a buffer intrinsic. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3526
3527 static void load_emit_buffer(struct si_shader_context *ctx,
3528 struct lp_build_emit_data *emit_data)
3529 {
3530 const struct tgsi_full_instruction *inst = emit_data->inst;
3531 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3532 LLVMBuilderRef builder = gallivm->builder;
3533 uint writemask = inst->Dst[0].Register.WriteMask;
3534 uint count = util_last_bit(writemask);
3535 const char *intrinsic_name;
3536 LLVMTypeRef dst_type;
3537
3538 switch (count) {
3539 case 1:
3540 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3541 dst_type = ctx->f32;
3542 break;
3543 case 2:
3544 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3545 dst_type = LLVMVectorType(ctx->f32, 2);
3546 break;
3547 default: // 3 & 4
3548 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3549 dst_type = ctx->v4f32;
3550 count = 4;
3551 }
3552
3553 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3554 builder, intrinsic_name, dst_type,
3555 emit_data->args, emit_data->arg_count,
3556 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3557 }
3558
3559 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3560 const struct tgsi_full_instruction *inst,
3561 LLVMTypeRef type, int arg)
3562 {
3563 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3564 LLVMBuilderRef builder = gallivm->builder;
3565 LLVMValueRef offset, ptr;
3566 int addr_space;
3567
3568 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3569 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3570
3571 ptr = ctx->shared_memory;
3572 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3573 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3574 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3575
3576 return ptr;
3577 }
3578
3579 static void load_emit_memory(
3580 struct si_shader_context *ctx,
3581 struct lp_build_emit_data *emit_data)
3582 {
3583 const struct tgsi_full_instruction *inst = emit_data->inst;
3584 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3585 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3586 LLVMBuilderRef builder = gallivm->builder;
3587 unsigned writemask = inst->Dst[0].Register.WriteMask;
3588 LLVMValueRef channels[4], ptr, derived_ptr, index;
3589 int chan;
3590
3591 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3592
3593 for (chan = 0; chan < 4; ++chan) {
3594 if (!(writemask & (1 << chan))) {
3595 channels[chan] = LLVMGetUndef(base->elem_type);
3596 continue;
3597 }
3598
3599 index = lp_build_const_int32(gallivm, chan);
3600 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3601 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3602 }
3603 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3604 }
3605
/* Emit TGSI LOAD, dispatching on the source: shared memory, shader
 * buffer, buffer image, or regular image.
 */
static void load_emit(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	/* Shared-memory loads do not use the buffer/image argument list. */
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* A volatile load waits for all prior memory operations first. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	} else {
		/* The image.load intrinsic is overloaded on the coordinate
		 * type (i32 or vNi32); build the matching name suffix. */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	}
}
3651
/* Build the intrinsic argument list for TGSI STORE to a shader buffer
 * or an image. Src[1] holds the value to store; Dst[0] names the memory
 * resource being written.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	/* Stores return nothing. */
	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* Gather the 4 channels of the value to store into one vector. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* The destination names a resource; view it as a source register so
	 * the rsrc/coord fetch helpers can be reused. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Src[0].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[1] = coords;
			/* dcc_off = true: image stores with DCC enabled can
			 * lock up the GPU (see force_dcc_off). */
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3710
/* Emit a shader-buffer store, splitting the writemask into consecutive
 * runs so each run maps to one buffer.store.{f32,v2f32,v4f32} intrinsic.
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	/* args[0] (the vec4 data) and args[3] (the byte offset) are
	 * rewritten per run below, so remember the originals. */
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Put the third channel back for a later iteration. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from channels start and start+1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first channel. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMNoUnwindAttribute);
	}
}
3783
3784 static void store_emit_memory(
3785 struct si_shader_context *ctx,
3786 struct lp_build_emit_data *emit_data)
3787 {
3788 const struct tgsi_full_instruction *inst = emit_data->inst;
3789 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3790 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3791 LLVMBuilderRef builder = gallivm->builder;
3792 unsigned writemask = inst->Dst[0].Register.WriteMask;
3793 LLVMValueRef ptr, derived_ptr, data, index;
3794 int chan;
3795
3796 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3797
3798 for (chan = 0; chan < 4; ++chan) {
3799 if (!(writemask & (1 << chan))) {
3800 continue;
3801 }
3802 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3803 index = lp_build_const_int32(gallivm, chan);
3804 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3805 LLVMBuildStore(builder, data, derived_ptr);
3806 }
3807 }
3808
/**
 * Emit a TGSI STORE instruction.
 *
 * Dispatches on the destination register file: shared memory (LDS),
 * SSBO, buffer image, or typed image. For volatile buffer/image stores,
 * a waitcnt is emitted first to order the store against outstanding
 * memory operations.
 */
static void store_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	char intrinsic_name[32];
	char coords_type[8];

	/* Shared-memory stores use plain LLVM stores; no waitcnt needed. */
	if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
		store_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_waitcnt(ctx);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		store_emit_buffer(ctx, emit_data);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, "llvm.amdgcn.buffer.store.format.v4f32",
			emit_data->dst_type, emit_data->args, emit_data->arg_count,
			LLVMNoUnwindAttribute);
	} else {
		/* The image intrinsic name is suffixed with the type of the
		 * coordinate argument (args[1]). */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.store.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMNoUnwindAttribute);
	}
}
3853
/**
 * Fetch the operands for a TGSI atomic opcode on an SSBO or image.
 *
 * Loads the data operand (src2, and additionally src3 for ATOMCAS),
 * then appends the resource/coordinate arguments expected by the
 * llvm.amdgcn.{buffer,image}.atomic.* intrinsics. Shared-memory
 * atomics are handled entirely in atomic_emit_memory() instead.
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	/* data2 is only assigned (and only used) for ATOMCAS. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* src1 is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3913
3914 static void atomic_emit_memory(struct si_shader_context *ctx,
3915 struct lp_build_emit_data *emit_data) {
3916 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3917 LLVMBuilderRef builder = gallivm->builder;
3918 const struct tgsi_full_instruction * inst = emit_data->inst;
3919 LLVMValueRef ptr, result, arg;
3920
3921 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3922
3923 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3924 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3925
3926 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3927 LLVMValueRef new_data;
3928 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
3929 inst, 3, 0);
3930
3931 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3932
3933 #if HAVE_LLVM >= 0x309
3934 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3935 LLVMAtomicOrderingSequentiallyConsistent,
3936 LLVMAtomicOrderingSequentiallyConsistent,
3937 false);
3938 #endif
3939
3940 result = LLVMBuildExtractValue(builder, result, 0, "");
3941 } else {
3942 LLVMAtomicRMWBinOp op;
3943
3944 switch(inst->Instruction.Opcode) {
3945 case TGSI_OPCODE_ATOMUADD:
3946 op = LLVMAtomicRMWBinOpAdd;
3947 break;
3948 case TGSI_OPCODE_ATOMXCHG:
3949 op = LLVMAtomicRMWBinOpXchg;
3950 break;
3951 case TGSI_OPCODE_ATOMAND:
3952 op = LLVMAtomicRMWBinOpAnd;
3953 break;
3954 case TGSI_OPCODE_ATOMOR:
3955 op = LLVMAtomicRMWBinOpOr;
3956 break;
3957 case TGSI_OPCODE_ATOMXOR:
3958 op = LLVMAtomicRMWBinOpXor;
3959 break;
3960 case TGSI_OPCODE_ATOMUMIN:
3961 op = LLVMAtomicRMWBinOpUMin;
3962 break;
3963 case TGSI_OPCODE_ATOMUMAX:
3964 op = LLVMAtomicRMWBinOpUMax;
3965 break;
3966 case TGSI_OPCODE_ATOMIMIN:
3967 op = LLVMAtomicRMWBinOpMin;
3968 break;
3969 case TGSI_OPCODE_ATOMIMAX:
3970 op = LLVMAtomicRMWBinOpMax;
3971 break;
3972 default:
3973 unreachable("unknown atomic opcode");
3974 }
3975
3976 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
3977 LLVMAtomicOrderingSequentiallyConsistent,
3978 false);
3979 }
3980 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
3981 }
3982
3983 static void atomic_emit(
3984 const struct lp_build_tgsi_action *action,
3985 struct lp_build_tgsi_context *bld_base,
3986 struct lp_build_emit_data *emit_data)
3987 {
3988 struct si_shader_context *ctx = si_shader_context(bld_base);
3989 struct gallivm_state *gallivm = bld_base->base.gallivm;
3990 LLVMBuilderRef builder = gallivm->builder;
3991 const struct tgsi_full_instruction * inst = emit_data->inst;
3992 char intrinsic_name[40];
3993 LLVMValueRef tmp;
3994
3995 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3996 atomic_emit_memory(ctx, emit_data);
3997 return;
3998 }
3999
4000 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4001 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4002 snprintf(intrinsic_name, sizeof(intrinsic_name),
4003 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4004 } else {
4005 char coords_type[8];
4006
4007 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
4008 coords_type, sizeof(coords_type));
4009 snprintf(intrinsic_name, sizeof(intrinsic_name),
4010 "llvm.amdgcn.image.atomic.%s.%s",
4011 action->intr_name, coords_type);
4012 }
4013
4014 tmp = lp_build_intrinsic(
4015 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4016 emit_data->args, emit_data->arg_count,
4017 LLVMNoUnwindAttribute);
4018 emit_data->output[emit_data->chan] =
4019 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4020 }
4021
/**
 * Fetch the arguments for RESQ (resource size query).
 *
 * - SSBO: only the buffer resource descriptor is needed; resq_emit reads
 *   the size out of it directly.
 * - Buffer image: only the image descriptor; resq_emit computes the size
 *   via get_buffer_size().
 * - Other images: full llvm.SI.getresinfo.i32 argument list, querying
 *   mip level 0.
 */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
4054
/**
 * Emit RESQ: return the size of a buffer or image resource.
 *
 * - SSBO: the size is taken from dword 2 of the buffer descriptor
 *   fetched in resq_fetch_args.
 * - Buffer image: size computed from the descriptor by get_buffer_size().
 * - Other images: queried via llvm.SI.getresinfo.i32; for cube arrays
 *   the layer count (component z) is divided by 6 to yield the number
 *   of cubes.
 */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			/* The result is float-typed here, so bitcast to int
			 * for the division and back afterwards. */
			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
4091
/**
 * Fill emit_data with the standard SI texture-intrinsic argument list:
 * the coordinate vector (padded to a power-of-two length with undefs),
 * the resource descriptor, the sampler state (except for TXF/TXQ), and
 * the dmask/unorm/r128/da/glc/slc/tfe/lwe immediates.
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF and TXQ return integer data and take no sampler state. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
							   tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4137
4138 static const struct lp_build_tgsi_action tex_action;
4139
/* Kinds of descriptors loadable from a sampler descriptor list; see
 * get_sampler_desc_custom() for the slot layout within a list entry. */
enum desc_type {
	DESC_IMAGE,	/* image/texture resource descriptor */
	DESC_FMASK,	/* MSAA FMASK descriptor */
	DESC_SAMPLER	/* sampler state descriptor */
};
4145
4146 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4147 {
4148 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4149 CONST_ADDR_SPACE);
4150 }
4151
/**
 * Load an image view, fmask view, or sampler state descriptor from a
 * descriptor list.
 *
 * \param list   pointer to the descriptor list
 * \param index  descriptor slot index; rescaled below according to the
 *               per-slot layout noted in each case
 * \param type   which descriptor within the slot to load
 */
static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
					    LLVMValueRef list, LLVMValueRef index,
					    enum desc_type type)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	switch (type) {
	case DESC_IMAGE:
		/* The image is at [0:7]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		break;
	case DESC_FMASK:
		/* The FMASK is at [8:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
		break;
	case DESC_SAMPLER:
		/* The sampler state is at [12:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
		/* Reinterpret the list as an array of v4i32 so the scaled
		 * index addresses 4-dword sampler states directly. */
		list = LLVMBuildPointerCast(builder, list,
					    const_array(ctx->v4i32, 0), "");
		break;
	}

	return build_indexed_load_const(ctx, list, index);
}
4183
4184 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
4185 LLVMValueRef index, enum desc_type type)
4186 {
4187 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4188 SI_PARAM_SAMPLERS);
4189
4190 return get_sampler_desc_custom(ctx, list, index, type);
4191 }
4192
4193 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4194 *
4195 * SI-CI:
4196 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4197 * filtering manually. The driver sets img7 to a mask clearing
4198 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4199 * s_and_b32 samp0, samp0, img7
4200 *
4201 * VI:
4202 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4203 */
4204 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4205 LLVMValueRef res, LLVMValueRef samp)
4206 {
4207 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4208 LLVMValueRef img7, samp0;
4209
4210 if (ctx->screen->b.chip_class >= VI)
4211 return samp;
4212
4213 img7 = LLVMBuildExtractElement(builder, res,
4214 LLVMConstInt(ctx->i32, 7, 0), "");
4215 samp0 = LLVMBuildExtractElement(builder, samp,
4216 LLVMConstInt(ctx->i32, 0, 0), "");
4217 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4218 return LLVMBuildInsertElement(builder, samp, samp0,
4219 LLVMConstInt(ctx->i32, 0, 0), "");
4220 }
4221
/**
 * Resolve the resource, sampler-state, and FMASK descriptors for a
 * texture instruction.
 *
 * The sampler operand is always the last source register. With indirect
 * indexing, the descriptors are loaded dynamically from the sampler
 * descriptor list; otherwise the preloaded per-index values from ctx
 * are used. samp_ptr and fmask_ptr are mutually exclusive: MSAA targets
 * get an FMASK and no sampler, all others the reverse.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;

	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
		LLVMValueRef ind_index;

		/* Clamp the computed index to the valid sampler range. */
		ind_index = get_bounded_indirect_index(ctx,
						       &reg->Indirect,
						       reg->Register.Index,
						       SI_NUM_SAMPLERS);

		*res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);

		if (target == TGSI_TEXTURE_2D_MSAA ||
		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
			if (samp_ptr)
				*samp_ptr = NULL;
			if (fmask_ptr)
				*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
		} else {
			if (samp_ptr) {
				*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
				*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
			}
			if (fmask_ptr)
				*fmask_ptr = NULL;
		}
	} else {
		*res_ptr = ctx->sampler_views[sampler_index];
		if (samp_ptr)
			*samp_ptr = ctx->sampler_states[sampler_index];
		if (fmask_ptr)
			*fmask_ptr = ctx->fmasks[sampler_index];
	}
}
4269
/**
 * Fetch TXQ (texture size query) arguments.
 *
 * Buffer textures read the size straight from the buffer descriptor;
 * all other targets set up a getresinfo call with the mip level taken
 * from src0.x.
 */
static void txq_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef res_ptr;
	LLVMValueRef address;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Read the size from the buffer descriptor directly. */
		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
		emit_data->args[0] = get_buffer_size(bld_base, res);
		return;
	}

	/* Textures - set the mip level. */
	address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

	set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
			   NULL, &address, 1, 0xf);
}
4297
/**
 * Emit TXQ. Buffer-texture sizes were already computed in
 * txq_fetch_args; other targets call llvm.SI.getresinfo.i32, and cube
 * arrays additionally divide the layer count by 6 to report the number
 * of cubes.
 */
static void txq_emit(const struct lp_build_tgsi_action *action,
		     struct lp_build_tgsi_context *bld_base,
		     struct lp_build_emit_data *emit_data)
{
	struct lp_build_context *base = &bld_base->base;
	unsigned target = emit_data->inst->Texture.Texture;

	if (target == TGSI_TEXTURE_BUFFER) {
		/* Just return the buffer size. */
		emit_data->output[emit_data->chan] = emit_data->args[0];
		return;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
		base->gallivm->builder, "llvm.SI.getresinfo.i32",
		emit_data->dst_type, emit_data->args, emit_data->arg_count,
		LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Divide the number of layers by 6 to get the number of cubes. */
	if (target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
		LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);

		LLVMValueRef v4 = emit_data->output[emit_data->chan];
		LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
		z = LLVMBuildSDiv(builder, z, six, "");

		emit_data->output[emit_data->chan] =
			LLVMBuildInsertElement(builder, v4, z, two, "");
	}
}
4331
/**
 * Fetch and pack the arguments for texture opcodes (TEX, TXB, TXL, TXD,
 * TXF, TXP, TG4, LODQ, ...) ahead of build_tex_intrinsic().
 *
 * The address vector is assembled in this order: packed texel offsets,
 * LOD bias, depth comparison value, user derivatives, coordinates, and
 * finally the LOD or sample index. For MSAA targets the sample index is
 * remapped through the FMASK. Buffer textures take an early exit and
 * are set up for llvm.SI.vs.load.input instead.
 */
static void tex_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef coords[5], derivs[6];
	LLVMValueRef address[16];
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
	unsigned count = 0;
	unsigned chan;
	unsigned num_deriv_channels = 0;
	bool has_offset = inst->Texture.NumOffsets > 0;
	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
	unsigned dmask = 0xf;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);

	if (target == TGSI_TEXTURE_BUFFER) {
		LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);

		/* Bitcast and truncate v8i32 to v16i8. */
		LLVMValueRef res = res_ptr;
		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
		res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");

		emit_data->dst_type = ctx->v4f32;
		emit_data->args[0] = res;
		emit_data->args[1] = bld_base->uint_bld.zero;
		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
		emit_data->arg_count = 3;
		return;
	}

	/* Fetch and project texture coordinates */
	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
	for (chan = 0; chan < 3; chan++ ) {
		coords[chan] = lp_build_emit_fetch(bld_base,
						   emit_data->inst, 0,
						   chan);
		if (opcode == TGSI_OPCODE_TXP)
			coords[chan] = lp_build_emit_llvm_binary(bld_base,
								 TGSI_OPCODE_DIV,
								 coords[chan],
								 coords[3]);
	}

	if (opcode == TGSI_OPCODE_TXP)
		coords[3] = bld_base->base.one;

	/* Pack offsets. */
	if (has_offset && opcode != TGSI_OPCODE_TXF) {
		/* The offsets are six-bit signed integers packed like this:
		 * X=[5:0], Y=[13:8], and Z=[21:16].
		 */
		LLVMValueRef offset[3], pack;

		assert(inst->Texture.NumOffsets == 1);

		for (chan = 0; chan < 3; chan++) {
			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
								     emit_data->inst, 0, chan);
			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
						    lp_build_const_int32(gallivm, 0x3f), "");
			if (chan)
				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
							    lp_build_const_int32(gallivm, chan*8), "");
		}

		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
		address[count++] = pack;
	}

	/* Pack LOD bias value */
	if (opcode == TGSI_OPCODE_TXB)
		address[count++] = coords[3];
	if (opcode == TGSI_OPCODE_TXB2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	/* Pack depth comparison value */
	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			/* Shadow cube arrays carry the reference value in
			 * src1.x because src0 is fully used by coordinates. */
			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
		} else {
			assert(ref_pos >= 0);
			address[count++] = coords[ref_pos];
		}
	}

	/* Pack user derivatives */
	if (opcode == TGSI_OPCODE_TXD) {
		int param, num_src_deriv_channels;

		switch (target) {
		case TGSI_TEXTURE_3D:
			num_src_deriv_channels = 3;
			num_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_RECT:
		case TGSI_TEXTURE_SHADOWRECT:
		case TGSI_TEXTURE_2D_ARRAY:
		case TGSI_TEXTURE_SHADOW2D_ARRAY:
			num_src_deriv_channels = 2;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_CUBE:
		case TGSI_TEXTURE_SHADOWCUBE:
		case TGSI_TEXTURE_CUBE_ARRAY:
		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
			/* Cube derivatives will be converted to 2D. */
			num_src_deriv_channels = 3;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_1D:
		case TGSI_TEXTURE_SHADOW1D:
		case TGSI_TEXTURE_1D_ARRAY:
		case TGSI_TEXTURE_SHADOW1D_ARRAY:
			num_src_deriv_channels = 1;
			num_deriv_channels = 1;
			break;
		default:
			unreachable("invalid target");
		}

		/* src1 holds the X derivatives, src2 the Y derivatives. */
		for (param = 0; param < 2; param++)
			for (chan = 0; chan < num_src_deriv_channels; chan++)
				derivs[param * num_src_deriv_channels + chan] =
					lp_build_emit_fetch(bld_base, inst, param+1, chan);
	}

	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);

	if (opcode == TGSI_OPCODE_TXD)
		for (int i = 0; i < num_deriv_channels * 2; i++)
			address[count++] = derivs[i];

	/* Pack texture coordinates */
	address[count++] = coords[0];
	if (num_coords > 1)
		address[count++] = coords[1];
	if (num_coords > 2)
		address[count++] = coords[2];

	/* Pack LOD or sample index */
	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
		address[count++] = coords[3];
	else if (opcode == TGSI_OPCODE_TXL2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	if (count > 16) {
		assert(!"Cannot handle more than 16 texture address parameters");
		count = 16;
	}

	/* The intrinsics take the address as i32 components. */
	for (chan = 0; chan < count; chan++ ) {
		address[chan] = LLVMBuildBitCast(gallivm->builder,
						 address[chan], ctx->i32, "");
	}

	/* Adjust the sample index according to FMASK.
	 *
	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * For example, 0x11111100 means there are only 2 samples stored and
	 * the second sample covers 3/4 of the pixel. When reading samples 0
	 * and 1, return physical sample 0 (determined by the first two 0s
	 * in FMASK), otherwise return physical sample 1.
	 *
	 * The sample index should be adjusted as follows:
	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
	 */
	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		struct lp_build_context *uint_bld = &bld_base->uint_bld;
		struct lp_build_emit_data txf_emit_data = *emit_data;
		LLVMValueRef txf_address[4];
		unsigned txf_count = count;
		struct tgsi_full_instruction inst = {};

		memcpy(txf_address, address, sizeof(txf_address));

		if (target == TGSI_TEXTURE_2D_MSAA) {
			txf_address[2] = bld_base->uint_bld.zero;
		}
		txf_address[3] = bld_base->uint_bld.zero;

		/* Read FMASK using TXF. */
		inst.Instruction.Opcode = TGSI_OPCODE_TXF;
		inst.Texture.Texture = target;
		txf_emit_data.inst = &inst;
		txf_emit_data.chan = 0;
		set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
				   target, fmask_ptr, NULL,
				   txf_address, txf_count, 0xf);
		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);

		/* Initialize some constants. */
		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);

		/* Apply the formula. */
		LLVMValueRef fmask =
			LLVMBuildExtractElement(gallivm->builder,
						txf_emit_data.output[0],
						uint_bld->zero, "");

		unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;

		LLVMValueRef sample_index4 =
			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");

		LLVMValueRef shifted_fmask =
			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");

		LLVMValueRef final_sample =
			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");

		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
		 * resource descriptor is 0 (invalid),
		 */
		LLVMValueRef fmask_desc =
			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
					 ctx->v8i32, "");

		LLVMValueRef fmask_word1 =
			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
						uint_bld->one, "");

		LLVMValueRef word1_is_nonzero =
			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
				      fmask_word1, uint_bld->zero, "");

		/* Replace the MSAA sample index. */
		address[sample_chan] =
			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
					final_sample, address[sample_chan], "");
	}

	if (opcode == TGSI_OPCODE_TXF) {
		/* add tex offsets */
		if (inst->Texture.NumOffsets) {
			struct lp_build_context *uint_bld = &bld_base->uint_bld;
			struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			assert(inst->Texture.NumOffsets == 1);

			/* TXF offsets are added to the integer coordinates
			 * directly; fall-throughs add lower dimensions. */
			switch (target) {
			case TGSI_TEXTURE_3D:
				address[2] = lp_build_add(uint_bld, address[2],
						bld->immediates[off->Index][off->SwizzleZ]);
				/* fall through */
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				address[1] =
					lp_build_add(uint_bld, address[1],
						bld->immediates[off->Index][off->SwizzleY]);
				/* fall through */
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				address[0] =
					lp_build_add(uint_bld, address[0],
						bld->immediates[off->Index][off->SwizzleX]);
				break;
				/* texture offsets do not apply to other texture targets */
			}
		}
	}

	if (opcode == TGSI_OPCODE_TG4) {
		unsigned gather_comp = 0;

		/* DMASK was repurposed for GATHER4. 4 components are always
		 * returned and DMASK works like a swizzle - it selects
		 * the component to fetch. The only valid DMASK values are
		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
		 * (red,red,red,red) etc.) The ISA document doesn't mention
		 * this.
		 */

		/* Get the component index from src1.x for Gather4. */
		if (!tgsi_is_shadow_target(target)) {
			LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
			LLVMValueRef comp_imm;
			struct tgsi_src_register src1 = inst->Src[1].Register;

			assert(src1.File == TGSI_FILE_IMMEDIATE);

			comp_imm = imms[src1.Index][src1.SwizzleX];
			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
			gather_comp = CLAMP(gather_comp, 0, 3);
		}

		dmask = 1 << gather_comp;
	}

	set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
			   samp_ptr, address, count, dmask);
}
4651
4652 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4653 struct lp_build_tgsi_context *bld_base,
4654 struct lp_build_emit_data *emit_data)
4655 {
4656 struct si_shader_context *ctx = si_shader_context(bld_base);
4657 struct lp_build_context *base = &bld_base->base;
4658 unsigned opcode = emit_data->inst->Instruction.Opcode;
4659 unsigned target = emit_data->inst->Texture.Texture;
4660 char intr_name[127];
4661 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4662 bool is_shadow = tgsi_is_shadow_target(target);
4663 char type[64];
4664 const char *name = "llvm.SI.image.sample";
4665 const char *infix = "";
4666
4667 if (target == TGSI_TEXTURE_BUFFER) {
4668 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4669 base->gallivm->builder,
4670 "llvm.SI.vs.load.input", emit_data->dst_type,
4671 emit_data->args, emit_data->arg_count,
4672 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4673 return;
4674 }
4675
4676 switch (opcode) {
4677 case TGSI_OPCODE_TXF:
4678 name = target == TGSI_TEXTURE_2D_MSAA ||
4679 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4680 "llvm.SI.image.load" :
4681 "llvm.SI.image.load.mip";
4682 is_shadow = false;
4683 has_offset = false;
4684 break;
4685 case TGSI_OPCODE_LODQ:
4686 name = "llvm.SI.getlod";
4687 is_shadow = false;
4688 has_offset = false;
4689 break;
4690 case TGSI_OPCODE_TEX:
4691 case TGSI_OPCODE_TEX2:
4692 case TGSI_OPCODE_TXP:
4693 if (ctx->type != PIPE_SHADER_FRAGMENT)
4694 infix = ".lz";
4695 break;
4696 case TGSI_OPCODE_TXB:
4697 case TGSI_OPCODE_TXB2:
4698 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4699 infix = ".b";
4700 break;
4701 case TGSI_OPCODE_TXL:
4702 case TGSI_OPCODE_TXL2:
4703 infix = ".l";
4704 break;
4705 case TGSI_OPCODE_TXD:
4706 infix = ".d";
4707 break;
4708 case TGSI_OPCODE_TG4:
4709 name = "llvm.SI.gather4";
4710 infix = ".lz";
4711 break;
4712 default:
4713 assert(0);
4714 return;
4715 }
4716
4717 /* Add the type and suffixes .c, .o if needed. */
4718 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4719 sprintf(intr_name, "%s%s%s%s.%s",
4720 name, is_shadow ? ".c" : "", infix,
4721 has_offset ? ".o" : "", type);
4722
4723 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4724 base->gallivm->builder, intr_name, emit_data->dst_type,
4725 emit_data->args, emit_data->arg_count,
4726 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4727 }
4728
4729 static void si_llvm_emit_txqs(
4730 const struct lp_build_tgsi_action *action,
4731 struct lp_build_tgsi_context *bld_base,
4732 struct lp_build_emit_data *emit_data)
4733 {
4734 struct si_shader_context *ctx = si_shader_context(bld_base);
4735 struct gallivm_state *gallivm = bld_base->base.gallivm;
4736 LLVMBuilderRef builder = gallivm->builder;
4737 LLVMValueRef res, samples;
4738 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4739
4740 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4741
4742
4743 /* Read the samples from the descriptor directly. */
4744 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4745 samples = LLVMBuildExtractElement(
4746 builder, res,
4747 lp_build_const_int32(gallivm, 3), "");
4748 samples = LLVMBuildLShr(builder, samples,
4749 lp_build_const_int32(gallivm, 16), "");
4750 samples = LLVMBuildAnd(builder, samples,
4751 lp_build_const_int32(gallivm, 0xf), "");
4752 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4753 samples, "");
4754
4755 emit_data->output[emit_data->chan] = samples;
4756 }
4757
4758 /*
4759 * SI implements derivatives using the local data store (LDS)
4760 * All writes to the LDS happen in all executing threads at
4761 * the same time. TID is the Thread ID for the current
4762 * thread and is a value between 0 and 63, representing
4763 * the thread's position in the wavefront.
4764 *
 * For the pixel shader, threads are grouped into quads of four pixels.
4766 * The TIDs of the pixels of a quad are:
4767 *
4768 * +------+------+
4769 * |4n + 0|4n + 1|
4770 * +------+------+
4771 * |4n + 2|4n + 3|
4772 * +------+------+
4773 *
4774 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4775 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4776 * the current pixel's column, and masking with 0xfffffffe yields the TID
4777 * of the left pixel of the current pixel's row.
4778 *
4779 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4780 * adding 2 yields the TID of the pixel below the top pixel.
4781 */
4782 /* masks for thread ID. */
4783 #define TID_MASK_TOP_LEFT 0xfffffffc
4784 #define TID_MASK_TOP 0xfffffffd
4785 #define TID_MASK_LEFT 0xfffffffe
4786
/* Emit DDX/DDY (and their _FINE variants) for one TGSI instruction.
 *
 * Each lane swaps its value with the appropriate neighbor in its 2x2 quad
 * and subtracts. On Tonga+ with a new enough LLVM this uses the
 * llvm.amdgcn.ds.bpermute intrinsic; otherwise values take a round trip
 * through LDS (store at own TID, load from the neighbor TIDs computed
 * with the TID_MASK_* masks above).
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	LLVMValueRef tl_tid, trbl_tid;
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* LDS slot owned by this thread: lds[0][tid]. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = get_thread_id(ctx);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Coarse derivatives use the top-left pixel of the quad for both
	 * operands' base; fine derivatives only mask off the axis being
	 * differentiated. */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, indices[1],
			      lp_build_const_int32(gallivm, mask), "");
	indices[1] = tl_tid;
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* for DDX we want the next X pixel, for DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, indices[1],
				lp_build_const_int32(gallivm, idx), "");
	indices[1] = trbl_tid;
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;
		LLVMValueRef val;
		LLVMValueRef args[2];

		/* Reuse the result of an earlier channel with the same
		 * source swizzle instead of recomputing it. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)
			continue;

		val = LLVMBuildBitCast(gallivm->builder,
				       lp_build_emit_fetch(bld_base, inst, 0, c),
				       ctx->i32, "");

		if ((HAVE_LLVM >= 0x0309) && ctx->screen->b.family >= CHIP_TONGA) {

			/* ds.bpermute takes a byte offset, hence TID * 4. */
			args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			args[1] = val;
			tl = lp_build_intrinsic(gallivm->builder,
						"llvm.amdgcn.ds.bpermute", ctx->i32,
						args, 2, LLVMReadNoneAttribute);

			args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
					       lp_build_const_int32(gallivm, 4), "");
			trbl = lp_build_intrinsic(gallivm->builder,
						  "llvm.amdgcn.ds.bpermute", ctx->i32,
						  args, 2, LLVMReadNoneAttribute);
		} else {
			/* Fallback: exchange values through LDS. */
			LLVMBuildStore(gallivm->builder, val, store_ptr);
			tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
			trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		}
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4876
4877 /*
4878 * this takes an I,J coordinate pair,
4879 * and works out the X and Y derivatives.
4880 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4881 */
4882 static LLVMValueRef si_llvm_emit_ddxy_interp(
4883 struct lp_build_tgsi_context *bld_base,
4884 LLVMValueRef interp_ij)
4885 {
4886 struct si_shader_context *ctx = si_shader_context(bld_base);
4887 struct gallivm_state *gallivm = bld_base->base.gallivm;
4888 LLVMValueRef indices[2];
4889 LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
4890 LLVMValueRef tl, tr, bl, result[4];
4891 unsigned c;
4892
4893 indices[0] = bld_base->uint_bld.zero;
4894 indices[1] = get_thread_id(ctx);
4895 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4896 indices, 2, "");
4897
4898 temp = LLVMBuildAnd(gallivm->builder, indices[1],
4899 lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
4900
4901 temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
4902 lp_build_const_int32(gallivm, TID_MASK_TOP), "");
4903
4904 indices[1] = temp;
4905 load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
4906 indices, 2, "");
4907
4908 indices[1] = temp2;
4909 load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
4910 indices, 2, "");
4911
4912 indices[1] = LLVMBuildAdd(gallivm->builder, temp,
4913 lp_build_const_int32(gallivm, 1), "");
4914 load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
4915 indices, 2, "");
4916
4917 indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
4918 lp_build_const_int32(gallivm, 2), "");
4919 load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
4920 indices, 2, "");
4921
4922 for (c = 0; c < 2; ++c) {
4923 LLVMValueRef store_val;
4924 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
4925
4926 store_val = LLVMBuildExtractElement(gallivm->builder,
4927 interp_ij, c_ll, "");
4928 LLVMBuildStore(gallivm->builder,
4929 store_val,
4930 store_ptr);
4931
4932 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
4933 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
4934
4935 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
4936 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
4937
4938 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
4939
4940 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
4941 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
4942
4943 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
4944 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
4945
4946 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
4947 }
4948
4949 return lp_build_gather_values(gallivm, result, 4);
4950 }
4951
4952 static void interp_fetch_args(
4953 struct lp_build_tgsi_context *bld_base,
4954 struct lp_build_emit_data *emit_data)
4955 {
4956 struct si_shader_context *ctx = si_shader_context(bld_base);
4957 struct gallivm_state *gallivm = bld_base->base.gallivm;
4958 const struct tgsi_full_instruction *inst = emit_data->inst;
4959
4960 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
4961 /* offset is in second src, first two channels */
4962 emit_data->args[0] = lp_build_emit_fetch(bld_base,
4963 emit_data->inst, 1,
4964 TGSI_CHAN_X);
4965 emit_data->args[1] = lp_build_emit_fetch(bld_base,
4966 emit_data->inst, 1,
4967 TGSI_CHAN_Y);
4968 emit_data->arg_count = 2;
4969 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4970 LLVMValueRef sample_position;
4971 LLVMValueRef sample_id;
4972 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
4973
4974 /* fetch sample ID, then fetch its sample position,
4975 * and place into first two channels.
4976 */
4977 sample_id = lp_build_emit_fetch(bld_base,
4978 emit_data->inst, 1, TGSI_CHAN_X);
4979 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
4980 ctx->i32, "");
4981 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
4982
4983 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
4984 sample_position,
4985 lp_build_const_int32(gallivm, 0), "");
4986
4987 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
4988 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
4989 sample_position,
4990 lp_build_const_int32(gallivm, 1), "");
4991 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
4992 emit_data->arg_count = 2;
4993 }
4994 }
4995
/* Emit INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE.
 *
 * Picks the right interpolation (I,J) parameter pair for the input's
 * interpolation mode; for the offset/sample variants it adjusts the
 * (I,J) pair with screen-space derivatives before calling the
 * llvm.SI.fs.interp intrinsic (or fs.constant for flat inputs).
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Offset/sample interpolation starts from the center (I,J) pair
	 * and perturbs it; centroid uses the centroid pair directly. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* -1: error; 0: flat (no interp param); >0: function parameter index. */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	/* NOTE(review): only the first two output channels are written here —
	 * confirm that callers never consume .z/.w of these opcodes. */
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		/* fs.constant takes only 3 args (no interp param). */
		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
	}
}
5094
5095 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5096 struct lp_build_emit_data *emit_data)
5097 {
5098 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5099 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5100 unsigned stream;
5101
5102 assert(src0.File == TGSI_FILE_IMMEDIATE);
5103
5104 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5105 return stream;
5106 }
5107
/* Emit one vertex from the geometry shader.
 *
 * Writes all declared outputs of the current vertex into the GSVS ring
 * buffer of the instruction's stream, bumps the per-stream vertex
 * counter, and signals the emission with a sendmsg.
 */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Ring layout: each output channel has a run of
			 * gs_max_out_vertices dwords; index into it with
			 * the current vertex count, in bytes (*4). */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	/* Advance the per-stream vertex counter. */
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, LLVMNoUnwindAttribute);
}
5185
5186 /* Cut one primitive from the geometry shader */
5187 static void si_llvm_emit_primitive(
5188 const struct lp_build_tgsi_action *action,
5189 struct lp_build_tgsi_context *bld_base,
5190 struct lp_build_emit_data *emit_data)
5191 {
5192 struct si_shader_context *ctx = si_shader_context(bld_base);
5193 struct gallivm_state *gallivm = bld_base->base.gallivm;
5194 LLVMValueRef args[2];
5195 unsigned stream;
5196
5197 /* Signal primitive cut */
5198 stream = si_llvm_get_stream(bld_base, emit_data);
5199 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5200 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5201 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5202 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
5203 }
5204
5205 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
5206 struct lp_build_tgsi_context *bld_base,
5207 struct lp_build_emit_data *emit_data)
5208 {
5209 struct si_shader_context *ctx = si_shader_context(bld_base);
5210 struct gallivm_state *gallivm = bld_base->base.gallivm;
5211
5212 /* The real barrier instruction isn’t needed, because an entire patch
5213 * always fits into a single wave.
5214 */
5215 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
5216 emit_optimization_barrier(ctx);
5217 return;
5218 }
5219
5220 lp_build_intrinsic(gallivm->builder,
5221 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
5222 : "llvm.AMDGPU.barrier.local",
5223 ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
5224 }
5225
/* Action table entry for TGSI texturing opcodes. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
5230
/* Action table entry for the TGSI INTERP_* opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5235
5236 static void si_create_function(struct si_shader_context *ctx,
5237 LLVMTypeRef *returns, unsigned num_returns,
5238 LLVMTypeRef *params, unsigned num_params,
5239 int last_array_pointer, int last_sgpr)
5240 {
5241 int i;
5242
5243 radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
5244 params, num_params);
5245 radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
5246 ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
5247
5248 for (i = 0; i <= last_sgpr; ++i) {
5249 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
5250
5251 /* We tell llvm that array inputs are passed by value to allow Sinking pass
5252 * to move load. Inputs are constant so this is fine. */
5253 if (i <= last_array_pointer)
5254 LLVMAddAttribute(P, LLVMByValAttribute);
5255 else
5256 LLVMAddAttribute(P, LLVMInRegAttribute);
5257 }
5258 }
5259
5260 static void create_meta_data(struct si_shader_context *ctx)
5261 {
5262 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5263 LLVMValueRef args[3];
5264
5265 args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
5266 args[1] = 0;
5267 args[2] = lp_build_const_int32(gallivm, 1);
5268
5269 ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
5270
5271 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5272 "amdgpu.uniform", 14);
5273
5274 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5275 }
5276
5277 static void declare_streamout_params(struct si_shader_context *ctx,
5278 struct pipe_stream_output_info *so,
5279 LLVMTypeRef *params, LLVMTypeRef i32,
5280 unsigned *num_params)
5281 {
5282 int i;
5283
5284 /* Streamout SGPRs. */
5285 if (so->num_outputs) {
5286 if (ctx->type != PIPE_SHADER_TESS_EVAL)
5287 params[ctx->param_streamout_config = (*num_params)++] = i32;
5288 else
5289 ctx->param_streamout_config = ctx->param_tess_offchip;
5290
5291 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
5292 }
5293 /* A streamout buffer offset is loaded if the stride is non-zero. */
5294 for (i = 0; i < 4; i++) {
5295 if (!so->stride[i])
5296 continue;
5297
5298 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
5299 }
5300 }
5301
5302 static unsigned llvm_get_type_size(LLVMTypeRef type)
5303 {
5304 LLVMTypeKind kind = LLVMGetTypeKind(type);
5305
5306 switch (kind) {
5307 case LLVMIntegerTypeKind:
5308 return LLVMGetIntTypeWidth(type) / 8;
5309 case LLVMFloatTypeKind:
5310 return 4;
5311 case LLVMPointerTypeKind:
5312 return 8;
5313 case LLVMVectorTypeKind:
5314 return LLVMGetVectorSize(type) *
5315 llvm_get_type_size(LLVMGetElementType(type));
5316 default:
5317 assert(0);
5318 return 0;
5319 }
5320 }
5321
5322 static void declare_tess_lds(struct si_shader_context *ctx)
5323 {
5324 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5325 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
5326 unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
5327
5328 /* The actual size is computed outside of the shader to reduce
5329 * the number of shader variants. */
5330 ctx->lds =
5331 LLVMAddGlobalInAddressSpace(gallivm->module,
5332 LLVMArrayType(i32, lds_size / 4),
5333 "tess_lds",
5334 LOCAL_ADDR_SPACE);
5335 }
5336
5337 static void create_function(struct si_shader_context *ctx)
5338 {
5339 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5340 struct gallivm_state *gallivm = bld_base->base.gallivm;
5341 struct si_shader *shader = ctx->shader;
5342 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
5343 LLVMTypeRef returns[16+32*4];
5344 unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
5345 unsigned num_returns = 0;
5346
5347 v3i32 = LLVMVectorType(ctx->i32, 3);
5348
5349 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
5350 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
5351 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
5352 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
5353 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
5354 last_array_pointer = SI_PARAM_SHADER_BUFFERS;
5355
5356 switch (ctx->type) {
5357 case PIPE_SHADER_VERTEX:
5358 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
5359 last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
5360 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
5361 params[SI_PARAM_START_INSTANCE] = ctx->i32;
5362 num_params = SI_PARAM_START_INSTANCE+1;
5363
5364 if (shader->key.vs.as_es) {
5365 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5366 } else if (shader->key.vs.as_ls) {
5367 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
5368 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
5369 } else {
5370 if (ctx->is_gs_copy_shader) {
5371 last_array_pointer = SI_PARAM_RW_BUFFERS;
5372 num_params = SI_PARAM_RW_BUFFERS+1;
5373 } else {
5374 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
5375 num_params = SI_PARAM_VS_STATE_BITS+1;
5376 }
5377
5378 /* The locations of the other parameters are assigned dynamically. */
5379 declare_streamout_params(ctx, &shader->selector->so,
5380 params, ctx->i32, &num_params);
5381 }
5382
5383 last_sgpr = num_params-1;
5384
5385 /* VGPRs */
5386 params[ctx->param_vertex_id = num_params++] = ctx->i32;
5387 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
5388 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
5389 params[ctx->param_instance_id = num_params++] = ctx->i32;
5390
5391 if (!ctx->is_monolithic &&
5392 !ctx->is_gs_copy_shader) {
5393 /* Vertex load indices. */
5394 ctx->param_vertex_index0 = num_params;
5395
5396 for (i = 0; i < shader->selector->info.num_inputs; i++)
5397 params[num_params++] = ctx->i32;
5398
5399 /* PrimitiveID output. */
5400 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
5401 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5402 returns[num_returns++] = ctx->f32;
5403 }
5404 break;
5405
5406 case PIPE_SHADER_TESS_CTRL:
5407 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5408 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
5409 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
5410 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
5411 params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
5412 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
5413 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
5414
5415 /* VGPRs */
5416 params[SI_PARAM_PATCH_ID] = ctx->i32;
5417 params[SI_PARAM_REL_IDS] = ctx->i32;
5418 num_params = SI_PARAM_REL_IDS+1;
5419
5420 if (!ctx->is_monolithic) {
5421 /* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
5422 * placed after the user SGPRs.
5423 */
5424 for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
5425 returns[num_returns++] = ctx->i32; /* SGPRs */
5426
5427 for (i = 0; i < 3; i++)
5428 returns[num_returns++] = ctx->f32; /* VGPRs */
5429 }
5430 break;
5431
5432 case PIPE_SHADER_TESS_EVAL:
5433 params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
5434 num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;
5435
5436 if (shader->key.tes.as_es) {
5437 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5438 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5439 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
5440 } else {
5441 params[ctx->param_tess_offchip = num_params++] = ctx->i32;
5442 declare_streamout_params(ctx, &shader->selector->so,
5443 params, ctx->i32, &num_params);
5444 params[ctx->param_oc_lds = num_params++] = ctx->i32;
5445 }
5446 last_sgpr = num_params - 1;
5447
5448 /* VGPRs */
5449 params[ctx->param_tes_u = num_params++] = ctx->f32;
5450 params[ctx->param_tes_v = num_params++] = ctx->f32;
5451 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
5452 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
5453
5454 /* PrimitiveID output. */
5455 if (!ctx->is_monolithic && !shader->key.tes.as_es)
5456 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
5457 returns[num_returns++] = ctx->f32;
5458 break;
5459
5460 case PIPE_SHADER_GEOMETRY:
5461 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
5462 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
5463 last_sgpr = SI_PARAM_GS_WAVE_ID;
5464
5465 /* VGPRs */
5466 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
5467 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
5468 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
5469 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
5470 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
5471 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
5472 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
5473 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5474 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5475 break;
5476
5477 case PIPE_SHADER_FRAGMENT:
5478 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5479 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5480 last_sgpr = SI_PARAM_PRIM_MASK;
5481 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5482 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5483 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5484 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5485 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5486 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5487 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5488 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5489 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5490 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5491 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5492 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5493 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5494 params[SI_PARAM_ANCILLARY] = ctx->i32;
5495 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5496 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5497 num_params = SI_PARAM_POS_FIXED_PT+1;
5498
5499 if (!ctx->is_monolithic) {
5500 /* Color inputs from the prolog. */
5501 if (shader->selector->info.colors_read) {
5502 unsigned num_color_elements =
5503 util_bitcount(shader->selector->info.colors_read);
5504
5505 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5506 for (i = 0; i < num_color_elements; i++)
5507 params[num_params++] = ctx->f32;
5508 }
5509
5510 /* Outputs for the epilog. */
5511 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5512 num_returns =
5513 num_return_sgprs +
5514 util_bitcount(shader->selector->info.colors_written) * 4 +
5515 shader->selector->info.writes_z +
5516 shader->selector->info.writes_stencil +
5517 shader->selector->info.writes_samplemask +
5518 1 /* SampleMaskIn */;
5519
5520 num_returns = MAX2(num_returns,
5521 num_return_sgprs +
5522 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5523
5524 for (i = 0; i < num_return_sgprs; i++)
5525 returns[i] = ctx->i32;
5526 for (; i < num_returns; i++)
5527 returns[i] = ctx->f32;
5528 }
5529 break;
5530
5531 case PIPE_SHADER_COMPUTE:
5532 params[SI_PARAM_GRID_SIZE] = v3i32;
5533 params[SI_PARAM_BLOCK_ID] = v3i32;
5534 last_sgpr = SI_PARAM_BLOCK_ID;
5535
5536 params[SI_PARAM_THREAD_ID] = v3i32;
5537 num_params = SI_PARAM_THREAD_ID + 1;
5538 break;
5539 default:
5540 assert(0 && "unimplemented shader");
5541 return;
5542 }
5543
5544 assert(num_params <= ARRAY_SIZE(params));
5545
5546 si_create_function(ctx, returns, num_returns, params,
5547 num_params, last_array_pointer, last_sgpr);
5548
5549 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5550 if (ctx->type == PIPE_SHADER_FRAGMENT &&
5551 !ctx->is_monolithic) {
5552 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5553 "InitialPSInputAddr",
5554 S_0286D0_PERSP_SAMPLE_ENA(1) |
5555 S_0286D0_PERSP_CENTER_ENA(1) |
5556 S_0286D0_PERSP_CENTROID_ENA(1) |
5557 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5558 S_0286D0_LINEAR_CENTER_ENA(1) |
5559 S_0286D0_LINEAR_CENTROID_ENA(1) |
5560 S_0286D0_FRONT_FACE_ENA(1) |
5561 S_0286D0_POS_FIXED_PT_ENA(1));
5562 } else if (ctx->type == PIPE_SHADER_COMPUTE) {
5563 const unsigned *properties = shader->selector->info.properties;
5564 unsigned max_work_group_size =
5565 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5566 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5567 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5568
5569 assert(max_work_group_size);
5570
5571 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5572 "amdgpu-max-work-group-size",
5573 max_work_group_size);
5574 }
5575
5576 shader->info.num_input_sgprs = 0;
5577 shader->info.num_input_vgprs = 0;
5578
5579 for (i = 0; i <= last_sgpr; ++i)
5580 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5581
5582 /* Unused fragment shader inputs are eliminated by the compiler,
5583 * so we don't know yet how many there will be.
5584 */
5585 if (ctx->type != PIPE_SHADER_FRAGMENT)
5586 for (; i < num_params; ++i)
5587 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5588
5589 if (bld_base->info &&
5590 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5591 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5592 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5593 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5594 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5595 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5596 ctx->lds =
5597 LLVMAddGlobalInAddressSpace(gallivm->module,
5598 LLVMArrayType(ctx->i32, 64),
5599 "ddxy_lds",
5600 LOCAL_ADDR_SPACE);
5601
5602 if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
5603 ctx->type == PIPE_SHADER_TESS_CTRL ||
5604 ctx->type == PIPE_SHADER_TESS_EVAL)
5605 declare_tess_lds(ctx);
5606 }
5607
/* Preload all constant-buffer descriptors and constant values declared by
 * the shader, relying on LLVM code sinking to move each load next to its
 * uses (and to drop unused ones).
 */
static void preload_constants(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned buf;
	/* Pointer to the array of constant-buffer descriptors (SGPR input). */
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);

	for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
		/* const_file_max is the highest declared index, -1 if unused. */
		unsigned i, num_const = info->const_file_max[buf] + 1;

		if (num_const == 0)
			continue;

		/* Allocate space for the constant values (4 dwords each).
		 * NOTE(review): CALLOC result is not checked; on OOM the
		 * loop below dereferences NULL. */
		ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));

		/* Load the resource descriptor */
		ctx->const_buffers[buf] =
			build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));

		/* Load the constants, we rely on the code sinking to do the rest */
		for (i = 0; i < num_const * 4; ++i) {
			ctx->constants[buf][i] =
				buffer_load_const(gallivm->builder,
					ctx->const_buffers[buf],
					lp_build_const_int32(gallivm, i * 4),
					ctx->f32);
		}
	}
}
5639
5640 static void preload_shader_buffers(struct si_shader_context *ctx)
5641 {
5642 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5643 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5644 int buf, maxbuf;
5645
5646 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5647 SI_NUM_SHADER_BUFFERS - 1);
5648 for (buf = 0; buf <= maxbuf; ++buf) {
5649 ctx->shader_buffers[buf] =
5650 build_indexed_load_const(
5651 ctx, ptr, lp_build_const_int32(gallivm, buf));
5652 }
5653 }
5654
/* Preload sampler view descriptors, FMASK descriptors (for MSAA samplers),
 * and sampler states for every sampler the shader declares.
 */
static void preload_samplers(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_shader_info *info = bld_base->info;
	unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
	LLVMValueRef offset;

	if (num_samplers == 0)
		return;

	/* Load the resources and samplers, we rely on the code sinking to do the rest */
	for (i = 0; i < num_samplers; ++i) {
		/* Resource */
		offset = lp_build_const_int32(gallivm, i);
		ctx->sampler_views[i] =
			get_sampler_desc(ctx, offset, DESC_IMAGE);

		/* FMASK resource */
		if (info->is_msaa_sampler[i])
			ctx->fmasks[i] =
				get_sampler_desc(ctx, offset, DESC_FMASK);
		else {
			/* Non-MSAA: also load the sampler state and apply
			 * the SI/CI anisotropic-filtering fixup.
			 * NOTE(review): MSAA samplers get no sampler state
			 * here - presumably FMASK fetches don't use one;
			 * confirm against the texture fetch path. */
			ctx->sampler_states[i] =
				get_sampler_desc(ctx, offset, DESC_SAMPLER);
			ctx->sampler_states[i] =
				sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
						       ctx->sampler_states[i]);
		}
	}
}
5686
5687 static void preload_images(struct si_shader_context *ctx)
5688 {
5689 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5690 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5691 struct gallivm_state *gallivm = bld_base->base.gallivm;
5692 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5693 LLVMValueRef res_ptr;
5694 unsigned i;
5695
5696 if (num_images == 0)
5697 return;
5698
5699 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5700
5701 for (i = 0; i < num_images; ++i) {
5702 /* Rely on LLVM to shrink the load for buffer resources. */
5703 LLVMValueRef rsrc =
5704 build_indexed_load_const(ctx, res_ptr,
5705 lp_build_const_int32(gallivm, i));
5706
5707 if (info->images_writemask & (1 << i) &&
5708 !(info->images_buffers & (1 << i)))
5709 rsrc = force_dcc_off(ctx, rsrc);
5710
5711 ctx->images[i] = rsrc;
5712 }
5713 }
5714
/* Preload the streamout (transform feedback) buffer descriptors.
 *
 * Streamout is only written by the shader that runs as the hardware VS,
 * so variants compiled as ES or LS skip it.
 */
static void preload_streamout_buffers(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned i;

	/* Streamout can only be used if the shader is compiled as VS. */
	if (!ctx->shader->selector->so.num_outputs ||
	    (ctx->type == PIPE_SHADER_VERTEX &&
	     (ctx->shader->key.vs.as_es ||
	      ctx->shader->key.vs.as_ls)) ||
	    (ctx->type == PIPE_SHADER_TESS_EVAL &&
	     ctx->shader->key.tes.as_es))
		return;

	LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_RW_BUFFERS);

	/* Load the resources, we rely on the code sinking to do the rest */
	for (i = 0; i < 4; ++i) {
		/* A zero stride means the buffer slot is unused. */
		if (ctx->shader->selector->so.stride[i]) {
			LLVMValueRef offset = lp_build_const_int32(gallivm,
								   SI_VS_STREAMOUT_BUF0 + i);

			ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
		}
	}
}
5743
/**
 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
 * for later use.
 */
static void preload_ring_buffers(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm =
		ctx->radeon_bld.soa.bld_base.base.gallivm;

	LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_RW_BUFFERS);

	/* The ESGS ring is used by the GS stage and by shaders compiled as
	 * ES (VS-as-ES, TES-as-ES); each side has its own descriptor slot. */
	if ((ctx->type == PIPE_SHADER_VERTEX &&
	     ctx->shader->key.vs.as_es) ||
	    (ctx->type == PIPE_SHADER_TESS_EVAL &&
	     ctx->shader->key.tes.as_es) ||
	    ctx->type == PIPE_SHADER_GEOMETRY) {
		unsigned ring =
			ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
							  : SI_ES_RING_ESGS;
		LLVMValueRef offset = lp_build_const_int32(gallivm, ring);

		ctx->esgs_ring =
			build_indexed_load_const(ctx, buf_ptr, offset);
	}

	/* The GS copy shader (running as VS) reads the GSVS ring. */
	if (ctx->is_gs_copy_shader) {
		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);

		ctx->gsvs_ring[0] =
			build_indexed_load_const(ctx, buf_ptr, offset);
	}
	/* The GS writes through four GSVS ring slots (presumably one per
	 * vertex stream - confirm against the descriptor setup). */
	if (ctx->type == PIPE_SHADER_GEOMETRY) {
		int i;
		for (i = 0; i < 4; i++) {
			LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);

			ctx->gsvs_ring[i] =
				build_indexed_load_const(ctx, buf_ptr, offset);
		}
	}
}
5786
5787 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5788 LLVMValueRef param_rw_buffers,
5789 unsigned param_pos_fixed_pt)
5790 {
5791 struct lp_build_tgsi_context *bld_base =
5792 &ctx->radeon_bld.soa.bld_base;
5793 struct gallivm_state *gallivm = bld_base->base.gallivm;
5794 LLVMBuilderRef builder = gallivm->builder;
5795 LLVMValueRef slot, desc, offset, row, bit, address[2];
5796
5797 /* Use the fixed-point gl_FragCoord input.
5798 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5799 * per coordinate to get the repeating effect.
5800 */
5801 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5802 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5803
5804 /* Load the buffer descriptor. */
5805 slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
5806 desc = build_indexed_load_const(ctx, param_rw_buffers, slot);
5807
5808 /* The stipple pattern is 32x32, each row has 32 bits. */
5809 offset = LLVMBuildMul(builder, address[1],
5810 LLVMConstInt(ctx->i32, 4, 0), "");
5811 row = buffer_load_const(builder, desc, offset, ctx->i32);
5812 bit = LLVMBuildLShr(builder, row, address[0], "");
5813 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5814
5815 /* The intrinsic kills the thread if arg < 0. */
5816 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
5817 LLVMConstReal(ctx->f32, -1), "");
5818 lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
5819 }
5820
5821 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5822 struct si_shader_config *conf,
5823 unsigned symbol_offset)
5824 {
5825 unsigned i;
5826 const unsigned char *config =
5827 radeon_shader_binary_config_start(binary, symbol_offset);
5828 bool really_needs_scratch = false;
5829
5830 /* LLVM adds SGPR spills to the scratch size.
5831 * Find out if we really need the scratch buffer.
5832 */
5833 for (i = 0; i < binary->reloc_count; i++) {
5834 const struct radeon_shader_reloc *reloc = &binary->relocs[i];
5835
5836 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
5837 !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5838 really_needs_scratch = true;
5839 break;
5840 }
5841 }
5842
5843 /* XXX: We may be able to emit some of these values directly rather than
5844 * extracting fields to be emitted later.
5845 */
5846
5847 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5848 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5849 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5850 switch (reg) {
5851 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5852 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5853 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5854 case R_00B848_COMPUTE_PGM_RSRC1:
5855 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5856 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5857 conf->float_mode = G_00B028_FLOAT_MODE(value);
5858 conf->rsrc1 = value;
5859 break;
5860 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5861 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5862 break;
5863 case R_00B84C_COMPUTE_PGM_RSRC2:
5864 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5865 conf->rsrc2 = value;
5866 break;
5867 case R_0286CC_SPI_PS_INPUT_ENA:
5868 conf->spi_ps_input_ena = value;
5869 break;
5870 case R_0286D0_SPI_PS_INPUT_ADDR:
5871 conf->spi_ps_input_addr = value;
5872 break;
5873 case R_0286E8_SPI_TMPRING_SIZE:
5874 case R_00B860_COMPUTE_TMPRING_SIZE:
5875 /* WAVESIZE is in units of 256 dwords. */
5876 if (really_needs_scratch)
5877 conf->scratch_bytes_per_wave =
5878 G_00B860_WAVESIZE(value) * 256 * 4;
5879 break;
5880 default:
5881 {
5882 static bool printed;
5883
5884 if (!printed) {
5885 fprintf(stderr, "Warning: LLVM emitted unknown "
5886 "config register: 0x%x\n", reg);
5887 printed = true;
5888 }
5889 }
5890 break;
5891 }
5892
5893 if (!conf->spi_ps_input_addr)
5894 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5895 }
5896 }
5897
5898 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5899 struct si_shader *shader,
5900 struct si_shader_config *config,
5901 uint64_t scratch_va)
5902 {
5903 unsigned i;
5904 uint32_t scratch_rsrc_dword0 = scratch_va;
5905 uint32_t scratch_rsrc_dword1 =
5906 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
5907
5908 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
5909 * correctly.
5910 */
5911 if (HAVE_LLVM >= 0x0309)
5912 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
5913 else
5914 scratch_rsrc_dword1 |=
5915 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5916
5917 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5918 const struct radeon_shader_reloc *reloc =
5919 &shader->binary.relocs[i];
5920 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5921 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5922 &scratch_rsrc_dword0, 4);
5923 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5924 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5925 &scratch_rsrc_dword1, 4);
5926 }
5927 }
5928 }
5929
5930 static unsigned si_get_shader_binary_size(struct si_shader *shader)
5931 {
5932 unsigned size = shader->binary.code_size;
5933
5934 if (shader->prolog)
5935 size += shader->prolog->binary.code_size;
5936 if (shader->epilog)
5937 size += shader->epilog->binary.code_size;
5938 return size;
5939 }
5940
5941 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5942 {
5943 const struct radeon_shader_binary *prolog =
5944 shader->prolog ? &shader->prolog->binary : NULL;
5945 const struct radeon_shader_binary *epilog =
5946 shader->epilog ? &shader->epilog->binary : NULL;
5947 const struct radeon_shader_binary *mainb = &shader->binary;
5948 unsigned bo_size = si_get_shader_binary_size(shader) +
5949 (!epilog ? mainb->rodata_size : 0);
5950 unsigned char *ptr;
5951
5952 assert(!prolog || !prolog->rodata_size);
5953 assert((!prolog && !epilog) || !mainb->rodata_size);
5954 assert(!epilog || !epilog->rodata_size);
5955
5956 r600_resource_reference(&shader->bo, NULL);
5957 shader->bo = si_resource_create_custom(&sscreen->b.b,
5958 PIPE_USAGE_IMMUTABLE,
5959 bo_size);
5960 if (!shader->bo)
5961 return -ENOMEM;
5962
5963 /* Upload. */
5964 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
5965 PIPE_TRANSFER_READ_WRITE);
5966
5967 if (prolog) {
5968 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
5969 ptr += prolog->code_size;
5970 }
5971
5972 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
5973 ptr += mainb->code_size;
5974
5975 if (epilog)
5976 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
5977 else if (mainb->rodata_size > 0)
5978 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
5979
5980 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5981 return 0;
5982 }
5983
5984 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
5985 struct pipe_debug_callback *debug,
5986 const char *name, FILE *file)
5987 {
5988 char *line, *p;
5989 unsigned i, count;
5990
5991 if (binary->disasm_string) {
5992 fprintf(file, "Shader %s disassembly:\n", name);
5993 fprintf(file, "%s", binary->disasm_string);
5994
5995 if (debug && debug->debug_message) {
5996 /* Very long debug messages are cut off, so send the
5997 * disassembly one line at a time. This causes more
5998 * overhead, but on the plus side it simplifies
5999 * parsing of resulting logs.
6000 */
6001 pipe_debug_message(debug, SHADER_INFO,
6002 "Shader Disassembly Begin");
6003
6004 line = binary->disasm_string;
6005 while (*line) {
6006 p = util_strchrnul(line, '\n');
6007 count = p - line;
6008
6009 if (count) {
6010 pipe_debug_message(debug, SHADER_INFO,
6011 "%.*s", count, line);
6012 }
6013
6014 if (!*p)
6015 break;
6016 line = p + 1;
6017 }
6018
6019 pipe_debug_message(debug, SHADER_INFO,
6020 "Shader Disassembly End");
6021 }
6022 } else {
6023 fprintf(file, "Shader %s binary:\n", name);
6024 for (i = 0; i < binary->code_size; i += 4) {
6025 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
6026 binary->code[i + 3], binary->code[i + 2],
6027 binary->code[i + 1], binary->code[i]);
6028 }
6029 }
6030 }
6031
/* Print shader resource statistics (register counts, LDS/scratch usage and
 * the resulting per-SIMD wave-occupancy estimate) to @file, and send a
 * one-line summary through the pipe debug callback.
 */
static void si_shader_dump_stats(struct si_screen *sscreen,
			  struct si_shader_config *conf,
			  unsigned num_inputs,
			  unsigned code_size,
			  struct pipe_debug_callback *debug,
			  unsigned processor,
			  FILE *file)
{
	/* LDS allocation granularity: 512 dwords on CIK+, 256 before. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	/* Hardware maximum of waves per SIMD. */
	unsigned max_simd_waves = 10;
	/* scratch bytes / 64 lanes / 4 bytes per dword.
	 * Assuming SGPRs aren't spilled. */
	unsigned spilled_vgprs = conf->scratch_bytes_per_wave / 64 / 4;

	/* Compute LDS usage for PS. */
	if (processor == PIPE_SHADER_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 48). The maximum
		 * usage is (num_inputs * 48 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * The 48 bytes per input for a single primitive is equal to
		 * 4 bytes/component * 4 components/input * 3 points.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 48, lds_increment);
	}

	/* Compute the per-SIMD wave counts. */
	if (conf->num_sgprs) {
		/* Per-SIMD SGPR budget: 800 on VI+, 512 before. */
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	/* 256 VGPRs per SIMD. */
	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == PIPE_SHADER_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Spilled VGPRs: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n",
			conf->num_sgprs, conf->num_vgprs, spilled_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d Spilled VGPRs: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves, spilled_vgprs);
}
6109
6110 static const char *si_get_shader_name(struct si_shader *shader,
6111 unsigned processor)
6112 {
6113 switch (processor) {
6114 case PIPE_SHADER_VERTEX:
6115 if (shader->key.vs.as_es)
6116 return "Vertex Shader as ES";
6117 else if (shader->key.vs.as_ls)
6118 return "Vertex Shader as LS";
6119 else
6120 return "Vertex Shader as VS";
6121 case PIPE_SHADER_TESS_CTRL:
6122 return "Tessellation Control Shader";
6123 case PIPE_SHADER_TESS_EVAL:
6124 if (shader->key.tes.as_es)
6125 return "Tessellation Evaluation Shader as ES";
6126 else
6127 return "Tessellation Evaluation Shader as VS";
6128 case PIPE_SHADER_GEOMETRY:
6129 if (shader->gs_copy_shader == NULL)
6130 return "GS Copy Shader as VS";
6131 else
6132 return "Geometry Shader";
6133 case PIPE_SHADER_FRAGMENT:
6134 return "Pixel Shader";
6135 case PIPE_SHADER_COMPUTE:
6136 return "Compute Shader";
6137 default:
6138 return "Unknown Shader";
6139 }
6140 }
6141
6142 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6143 struct pipe_debug_callback *debug, unsigned processor,
6144 FILE *file)
6145 {
6146 if (file != stderr ||
6147 (r600_can_dump_shader(&sscreen->b, processor) &&
6148 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6149 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6150
6151 if (shader->prolog)
6152 si_shader_dump_disassembly(&shader->prolog->binary,
6153 debug, "prolog", file);
6154
6155 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6156
6157 if (shader->epilog)
6158 si_shader_dump_disassembly(&shader->epilog->binary,
6159 debug, "epilog", file);
6160 fprintf(file, "\n");
6161 }
6162
6163 si_shader_dump_stats(sscreen, &shader->config,
6164 shader->selector ? shader->selector->info.num_inputs : 0,
6165 si_get_shader_binary_size(shader), debug, processor,
6166 file);
6167 }
6168
6169 int si_compile_llvm(struct si_screen *sscreen,
6170 struct radeon_shader_binary *binary,
6171 struct si_shader_config *conf,
6172 LLVMTargetMachineRef tm,
6173 LLVMModuleRef mod,
6174 struct pipe_debug_callback *debug,
6175 unsigned processor,
6176 const char *name)
6177 {
6178 int r = 0;
6179 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6180
6181 if (r600_can_dump_shader(&sscreen->b, processor)) {
6182 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6183
6184 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6185 fprintf(stderr, "%s LLVM IR:\n\n", name);
6186 LLVMDumpModule(mod);
6187 fprintf(stderr, "\n");
6188 }
6189 }
6190
6191 if (!si_replace_shader(count, binary)) {
6192 r = radeon_llvm_compile(mod, binary, tm, debug);
6193 if (r)
6194 return r;
6195 }
6196
6197 si_shader_binary_read_config(binary, conf, 0);
6198
6199 /* Enable 64-bit and 16-bit denormals, because there is no performance
6200 * cost.
6201 *
6202 * If denormals are enabled, all floating-point output modifiers are
6203 * ignored.
6204 *
6205 * Don't enable denormals for 32-bit floats, because:
6206 * - Floating-point output modifiers would be ignored by the hw.
6207 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6208 * have to stop using those.
6209 * - SI & CI would be very slow.
6210 */
6211 conf->float_mode |= V_00B028_FP_64_DENORMS;
6212
6213 FREE(binary->config);
6214 FREE(binary->global_symbol_offsets);
6215 binary->config = NULL;
6216 binary->global_symbol_offsets = NULL;
6217
6218 /* Some shaders can't have rodata because their binaries can be
6219 * concatenated.
6220 */
6221 if (binary->rodata_size &&
6222 (processor == PIPE_SHADER_VERTEX ||
6223 processor == PIPE_SHADER_TESS_CTRL ||
6224 processor == PIPE_SHADER_TESS_EVAL ||
6225 processor == PIPE_SHADER_FRAGMENT)) {
6226 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6227 return -EINVAL;
6228 }
6229
6230 return r;
6231 }
6232
/* Generate code for the hardware VS shader stage to go with a geometry shader:
 * it fetches the GS outputs from the GSVS ring and exports them like a
 * regular VS would. Returns 0 on success, a negative error code on failure.
 */
static int si_generate_gs_copy_shader(struct si_screen *sscreen,
				      struct si_shader_context *ctx,
				      struct si_shader *gs,
				      struct pipe_debug_callback *debug)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs->selector->info;
	/* Arguments for llvm.SI.buffer.load.dword: rsrc, vaddr, offset,
	 * soffset, offen, idxen, glc, slc, tfe. */
	LLVMValueRef args[9];
	int i, r;

	/* NOTE(review): MALLOC result is not checked; on OOM the fetch loop
	 * below would dereference NULL. */
	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	/* The copy shader is compiled as a hardware VS. */
	si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
	ctx->type = PIPE_SHADER_VERTEX;
	ctx->is_gs_copy_shader = true;

	create_meta_data(ctx);
	create_function(ctx);
	preload_streamout_buffers(ctx);
	preload_ring_buffers(ctx);

	/* Fixed buffer-load arguments; args[2] (the ring offset) is set per
	 * output channel in the loop below. */
	args[0] = ctx->gsvs_ring[0];
	args[1] = lp_build_mul_imm(uint,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						ctx->param_vertex_id),
				   4);
	args[3] = uint->zero;
	args[4] = uint->one;	/* OFFEN */
	args[5] = uint->zero;	/* IDXEN */
	args[6] = uint->one;	/* GLC */
	args[7] = uint->one;	/* SLC */
	args[8] = uint->zero;	/* TFE */

	/* Fetch vertex data from GSVS ring */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		unsigned chan;

		outputs[i].name = gsinfo->output_semantic_name[i];
		outputs[i].sid = gsinfo->output_semantic_index[i];

		for (chan = 0; chan < 4; chan++) {
			/* Base offset of this (output, channel) slice:
			 * (i * 4 + chan) * gs_max_out_vertices * 64 bytes. */
			args[2] = lp_build_const_int32(gallivm,
						       (i * 4 + chan) *
						       gs->selector->gs_max_out_vertices * 16 * 4);

			outputs[i].values[chan] =
				LLVMBuildBitCast(gallivm->builder,
						 lp_build_intrinsic(gallivm->builder,
								    "llvm.SI.buffer.load.dword.i32.i32",
								    ctx->i32, args, 9,
								    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
						 ctx->f32, "");
		}
	}

	/* Export the fetched attributes like a regular VS. */
	si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

	LLVMBuildRet(gallivm->builder, ctx->return_value);

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
		LLVMDumpModule(bld_base->base.gallivm->module);

	radeon_llvm_finalize_module(&ctx->radeon_bld);

	r = si_compile_llvm(sscreen, &ctx->shader->binary,
			    &ctx->shader->config, ctx->tm,
			    bld_base->base.gallivm->module,
			    debug, PIPE_SHADER_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx->shader, debug,
			       PIPE_SHADER_GEOMETRY, stderr);
		r = si_shader_binary_upload(sscreen, ctx->shader);
	}

	radeon_llvm_dispose(&ctx->radeon_bld);

	FREE(outputs);
	return r;
}
6321
/* Print the per-variant compile options (the shader key) for debugging.
 * Only the fields relevant to the given shader stage are printed.
 */
void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		fprintf(f, " instance_divisors = {");
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, " as_es = %u\n", key->vs.as_es);
		fprintf(f, " as_ls = %u\n", key->vs.as_ls);
		fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, " as_es = %u\n", key->tes.as_es);
		fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	/* GS and CS have no key fields worth printing. */
	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
		fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}
}
6370
/* Initialize a si_shader_context: LLVM context/types, TGSI opcode action
 * tables, and pointers to the shader being compiled.
 *
 * \param ctx      context to initialize (zeroed first)
 * \param sscreen  screen the shader belongs to
 * \param shader   shader to compile; may be NULL (then ctx->type = -1)
 * \param tm       LLVM target machine to compile with
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;
	struct lp_build_tgsi_action tmpl = {};

	memset(ctx, 0, sizeof(*ctx));
	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
	ctx->tm = tm;
	ctx->screen = sscreen;
	if (shader && shader->selector)
		ctx->type = shader->selector->info.processor;
	else
		ctx->type = -1;
	ctx->shader = shader;

	/* Cache commonly used LLVM scalar and vector types. */
	ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
	ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);

	bld_base = &ctx->radeon_bld.soa.bld_base;
	if (shader && shader->selector)
		bld_base->info = &shader->selector->info;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes. */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	/* Texture opcodes. */
	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;

	/* Buffer/image load, store and query opcodes. */
	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;

	/* Atomics share a template; only the intrinsic name differs. */
	tmpl.fetch_args = atomic_fetch_args;
	tmpl.emit = atomic_emit;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	/* Derivative opcodes. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* Geometry shader opcodes. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;

	bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
	bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
}
6472
6473 int si_compile_tgsi_shader(struct si_screen *sscreen,
6474 LLVMTargetMachineRef tm,
6475 struct si_shader *shader,
6476 bool is_monolithic,
6477 struct pipe_debug_callback *debug)
6478 {
6479 struct si_shader_selector *sel = shader->selector;
6480 struct si_shader_context ctx;
6481 struct lp_build_tgsi_context *bld_base;
6482 LLVMModuleRef mod;
6483 int r = 0;
6484
6485 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6486 * conversion fails. */
6487 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6488 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6489 si_dump_shader_key(sel->type, &shader->key, stderr);
6490 tgsi_dump(sel->tokens, 0);
6491 si_dump_streamout(&sel->so);
6492 }
6493
6494 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6495 ctx.is_monolithic = is_monolithic;
6496
6497 shader->info.uses_instanceid = sel->info.uses_instanceid;
6498
6499 bld_base = &ctx.radeon_bld.soa.bld_base;
6500 ctx.radeon_bld.load_system_value = declare_system_value;
6501
6502 switch (ctx.type) {
6503 case PIPE_SHADER_VERTEX:
6504 ctx.radeon_bld.load_input = declare_input_vs;
6505 if (shader->key.vs.as_ls)
6506 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6507 else if (shader->key.vs.as_es)
6508 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6509 else
6510 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6511 break;
6512 case PIPE_SHADER_TESS_CTRL:
6513 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6514 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6515 bld_base->emit_store = store_output_tcs;
6516 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6517 break;
6518 case PIPE_SHADER_TESS_EVAL:
6519 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6520 if (shader->key.tes.as_es)
6521 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6522 else
6523 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6524 break;
6525 case PIPE_SHADER_GEOMETRY:
6526 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6527 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6528 break;
6529 case PIPE_SHADER_FRAGMENT:
6530 ctx.radeon_bld.load_input = declare_input_fs;
6531 if (is_monolithic)
6532 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6533 else
6534 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6535 break;
6536 case PIPE_SHADER_COMPUTE:
6537 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6538 break;
6539 default:
6540 assert(!"Unsupported shader type");
6541 return -1;
6542 }
6543
6544 create_meta_data(&ctx);
6545 create_function(&ctx);
6546 preload_constants(&ctx);
6547 preload_shader_buffers(&ctx);
6548 preload_samplers(&ctx);
6549 preload_images(&ctx);
6550 preload_streamout_buffers(&ctx);
6551 preload_ring_buffers(&ctx);
6552
6553 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6554 shader->key.ps.prolog.poly_stipple) {
6555 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6556 SI_PARAM_RW_BUFFERS);
6557 si_llvm_emit_polygon_stipple(&ctx, list,
6558 SI_PARAM_POS_FIXED_PT);
6559 }
6560
6561 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6562 int i;
6563 for (i = 0; i < 4; i++) {
6564 ctx.gs_next_vertex[i] =
6565 lp_build_alloca(bld_base->base.gallivm,
6566 ctx.i32, "");
6567 }
6568 }
6569
6570 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6571 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6572 goto out;
6573 }
6574
6575 LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
6576 mod = bld_base->base.gallivm->module;
6577
6578 /* Dump LLVM IR before any optimization passes */
6579 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6580 r600_can_dump_shader(&sscreen->b, ctx.type))
6581 LLVMDumpModule(mod);
6582
6583 radeon_llvm_finalize_module(&ctx.radeon_bld);
6584
6585 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6586 mod, debug, ctx.type, "TGSI shader");
6587 if (r) {
6588 fprintf(stderr, "LLVM failed to compile shader\n");
6589 goto out;
6590 }
6591
6592 radeon_llvm_dispose(&ctx.radeon_bld);
6593
6594 /* Add the scratch offset to input SGPRs. */
6595 if (shader->config.scratch_bytes_per_wave)
6596 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6597
6598 /* Calculate the number of fragment input VGPRs. */
6599 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6600 shader->info.num_input_vgprs = 0;
6601 shader->info.face_vgpr_index = -1;
6602
6603 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6604 shader->info.num_input_vgprs += 2;
6605 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6606 shader->info.num_input_vgprs += 2;
6607 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6608 shader->info.num_input_vgprs += 2;
6609 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6610 shader->info.num_input_vgprs += 3;
6611 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6612 shader->info.num_input_vgprs += 2;
6613 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6614 shader->info.num_input_vgprs += 2;
6615 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6616 shader->info.num_input_vgprs += 2;
6617 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6618 shader->info.num_input_vgprs += 1;
6619 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6620 shader->info.num_input_vgprs += 1;
6621 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6622 shader->info.num_input_vgprs += 1;
6623 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6624 shader->info.num_input_vgprs += 1;
6625 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6626 shader->info.num_input_vgprs += 1;
6627 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6628 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6629 shader->info.num_input_vgprs += 1;
6630 }
6631 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6632 shader->info.num_input_vgprs += 1;
6633 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6634 shader->info.num_input_vgprs += 1;
6635 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6636 shader->info.num_input_vgprs += 1;
6637 }
6638
6639 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6640 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6641 shader->gs_copy_shader->selector = shader->selector;
6642 ctx.shader = shader->gs_copy_shader;
6643 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6644 shader, debug))) {
6645 free(shader->gs_copy_shader);
6646 shader->gs_copy_shader = NULL;
6647 goto out;
6648 }
6649 }
6650
6651 out:
6652 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6653 FREE(ctx.constants[i]);
6654 return r;
6655 }
6656
6657 /**
6658 * Create, compile and return a shader part (prolog or epilog).
6659 *
6660 * \param sscreen screen
6661 * \param list list of shader parts of the same category
6662 * \param key shader part key
6663 * \param tm LLVM target machine
6664 * \param debug debug callback
6665 * \param compile the callback responsible for compilation
6666 * \return non-NULL on success
6667 */
6668 static struct si_shader_part *
6669 si_get_shader_part(struct si_screen *sscreen,
6670 struct si_shader_part **list,
6671 union si_shader_part_key *key,
6672 LLVMTargetMachineRef tm,
6673 struct pipe_debug_callback *debug,
6674 bool (*compile)(struct si_screen *,
6675 LLVMTargetMachineRef,
6676 struct pipe_debug_callback *,
6677 struct si_shader_part *))
6678 {
6679 struct si_shader_part *result;
6680
6681 pipe_mutex_lock(sscreen->shader_parts_mutex);
6682
6683 /* Find existing. */
6684 for (result = *list; result; result = result->next) {
6685 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6686 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6687 return result;
6688 }
6689 }
6690
6691 /* Compile a new one. */
6692 result = CALLOC_STRUCT(si_shader_part);
6693 result->key = *key;
6694 if (!compile(sscreen, tm, debug, result)) {
6695 FREE(result);
6696 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6697 return NULL;
6698 }
6699
6700 result->next = *list;
6701 *list = result;
6702 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6703 return result;
6704 }
6705
6706 /**
6707 * Create a vertex shader prolog.
6708 *
6709 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6710 * All inputs are returned unmodified. The vertex load indices are
6711 * stored after them, which will used by the API VS for fetching inputs.
6712 *
6713 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6714 * input_v0,
6715 * input_v1,
6716 * input_v2,
6717 * input_v3,
6718 * (VertexID + BaseVertex),
6719 * (InstanceID + StartInstance),
6720 * (InstanceID / 2 + StartInstance)
6721 */
6722 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6723 LLVMTargetMachineRef tm,
6724 struct pipe_debug_callback *debug,
6725 struct si_shader_part *out)
6726 {
6727 union si_shader_part_key *key = &out->key;
6728 struct si_shader shader = {};
6729 struct si_shader_context ctx;
6730 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6731 LLVMTypeRef *params, *returns;
6732 LLVMValueRef ret, func;
6733 int last_sgpr, num_params, num_returns, i;
6734 bool status = true;
6735
6736 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6737 ctx.type = PIPE_SHADER_VERTEX;
6738 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6739 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6740
6741 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6742 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6743 sizeof(LLVMTypeRef));
6744 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6745 key->vs_prolog.last_input + 1) *
6746 sizeof(LLVMTypeRef));
6747 num_params = 0;
6748 num_returns = 0;
6749
6750 /* Declare input and output SGPRs. */
6751 num_params = 0;
6752 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6753 params[num_params++] = ctx.i32;
6754 returns[num_returns++] = ctx.i32;
6755 }
6756 last_sgpr = num_params - 1;
6757
6758 /* 4 preloaded VGPRs (outputs must be floats) */
6759 for (i = 0; i < 4; i++) {
6760 params[num_params++] = ctx.i32;
6761 returns[num_returns++] = ctx.f32;
6762 }
6763
6764 /* Vertex load indices. */
6765 for (i = 0; i <= key->vs_prolog.last_input; i++)
6766 returns[num_returns++] = ctx.f32;
6767
6768 /* Create the function. */
6769 si_create_function(&ctx, returns, num_returns, params,
6770 num_params, -1, last_sgpr);
6771 func = ctx.radeon_bld.main_fn;
6772
6773 /* Copy inputs to outputs. This should be no-op, as the registers match,
6774 * but it will prevent the compiler from overwriting them unintentionally.
6775 */
6776 ret = ctx.return_value;
6777 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6778 LLVMValueRef p = LLVMGetParam(func, i);
6779 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6780 }
6781 for (i = num_params - 4; i < num_params; i++) {
6782 LLVMValueRef p = LLVMGetParam(func, i);
6783 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6784 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6785 }
6786
6787 /* Compute vertex load indices from instance divisors. */
6788 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6789 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6790 LLVMValueRef index;
6791
6792 if (divisor) {
6793 /* InstanceID / Divisor + StartInstance */
6794 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6795 SI_SGPR_START_INSTANCE,
6796 divisor);
6797 } else {
6798 /* VertexID + BaseVertex */
6799 index = LLVMBuildAdd(gallivm->builder,
6800 LLVMGetParam(func, ctx.param_vertex_id),
6801 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6802 }
6803
6804 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6805 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6806 num_params++, "");
6807 }
6808
6809 /* Compile. */
6810 LLVMBuildRet(gallivm->builder, ret);
6811 radeon_llvm_finalize_module(&ctx.radeon_bld);
6812
6813 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6814 gallivm->module, debug, ctx.type,
6815 "Vertex Shader Prolog"))
6816 status = false;
6817
6818 radeon_llvm_dispose(&ctx.radeon_bld);
6819 return status;
6820 }
6821
6822 /**
6823 * Compile the vertex shader epilog. This is also used by the tessellation
6824 * evaluation shader compiled as VS.
6825 *
6826 * The input is PrimitiveID.
6827 *
6828 * If PrimitiveID is required by the pixel shader, export it.
6829 * Otherwise, do nothing.
6830 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Only the address of the member is taken here; ctx itself is
	 * initialized by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. If PrimitiveID isn't exported, the epilog
	 * takes no parameters at all. */
	num_params = key->vs_epilog.states.export_prim_id ?
		(VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   -1, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		LLVMValueRef args[9];

		/* NOTE(review): the enabled-channel mask is 0x0 even though
		 * the X channel carries PrimitiveID below — confirm whether
		 * this is intentional for PARAM exports on this hardware. */
		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6894
6895 /**
6896 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
6897 */
6898 static bool si_get_vs_epilog(struct si_screen *sscreen,
6899 LLVMTargetMachineRef tm,
6900 struct si_shader *shader,
6901 struct pipe_debug_callback *debug,
6902 struct si_vs_epilog_bits *states)
6903 {
6904 union si_shader_part_key epilog_key;
6905
6906 memset(&epilog_key, 0, sizeof(epilog_key));
6907 epilog_key.vs_epilog.states = *states;
6908
6909 /* Set up the PrimitiveID output. */
6910 if (shader->key.vs.epilog.export_prim_id) {
6911 unsigned index = shader->selector->info.num_outputs;
6912 unsigned offset = shader->info.nr_param_exports++;
6913
6914 epilog_key.vs_epilog.prim_id_param_offset = offset;
6915 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6916 shader->info.vs_output_param_offset[index] = offset;
6917 }
6918
6919 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
6920 &epilog_key, tm, debug,
6921 si_compile_vs_epilog);
6922 return shader->epilog != NULL;
6923 }
6924
6925 /**
6926 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6927 */
6928 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6929 LLVMTargetMachineRef tm,
6930 struct si_shader *shader,
6931 struct pipe_debug_callback *debug)
6932 {
6933 struct tgsi_shader_info *info = &shader->selector->info;
6934 union si_shader_part_key prolog_key;
6935 unsigned i;
6936
6937 /* Get the prolog. */
6938 memset(&prolog_key, 0, sizeof(prolog_key));
6939 prolog_key.vs_prolog.states = shader->key.vs.prolog;
6940 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6941 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6942
6943 /* The prolog is a no-op if there are no inputs. */
6944 if (info->num_inputs) {
6945 shader->prolog =
6946 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6947 &prolog_key, tm, debug,
6948 si_compile_vs_prolog);
6949 if (!shader->prolog)
6950 return false;
6951 }
6952
6953 /* Get the epilog. */
6954 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
6955 !si_get_vs_epilog(sscreen, tm, shader, debug,
6956 &shader->key.vs.epilog))
6957 return false;
6958
6959 /* Set the instanceID flag. */
6960 for (i = 0; i < info->num_inputs; i++)
6961 if (prolog_key.vs_prolog.states.instance_divisors[i])
6962 shader->info.uses_instanceid = true;
6963
6964 return true;
6965 }
6966
6967 /**
6968 * Select and compile (or reuse) TES parts (epilog).
6969 */
6970 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
6971 LLVMTargetMachineRef tm,
6972 struct si_shader *shader,
6973 struct pipe_debug_callback *debug)
6974 {
6975 if (shader->key.tes.as_es)
6976 return true;
6977
6978 /* TES compiled as VS. */
6979 return si_get_vs_epilog(sscreen, tm, shader, debug,
6980 &shader->key.tes.epilog);
6981 }
6982
6983 /**
6984 * Compile the TCS epilog. This writes tesselation factors to memory based on
6985 * the output primitive type of the tesselator (determined by TES).
6986 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address of the member is taken here; ctx itself is
	 * initialized by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_array_pointer, last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used.
	 * The full SGPR layout must still be declared so the indices match
	 * the monolithic TCS function signature. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	last_array_pointer = SI_PARAM_RW_BUFFERS;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* Three VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write the tess factors read from LDS out to memory; the three
	 * arguments are the VGPRs declared above, in order. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7049
7050 /**
7051 * Select and compile (or reuse) TCS parts (epilog).
7052 */
7053 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7054 LLVMTargetMachineRef tm,
7055 struct si_shader *shader,
7056 struct pipe_debug_callback *debug)
7057 {
7058 union si_shader_part_key epilog_key;
7059
7060 /* Get the epilog. */
7061 memset(&epilog_key, 0, sizeof(epilog_key));
7062 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7063
7064 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7065 &epilog_key, tm, debug,
7066 si_compile_tcs_epilog);
7067 return shader->epilog != NULL;
7068 }
7069
7070 /**
7071 * Compile the pixel shader prolog. This handles:
7072 * - two-side color selection and interpolation
7073 * - overriding interpolation parameters for the API PS
7074 * - polygon stippling
7075 *
7076 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7077 * overriden by other states. (e.g. per-sample interpolation)
7078 * Interpolated colors are stored after the preloaded VGPRs.
7079 */
static bool si_compile_ps_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address of the member is taken here; ctx itself is
	 * initialized by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.prolog = key->ps_prolog.states;

	/* Number of inputs + 8 color elements. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx.i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx.f32;

	/* Declare outputs (same as inputs + add colors if needed).
	 * The return types deliberately share the params array: entries
	 * [0, num_params) mirror the inputs, followed by one f32 per
	 * interpolated color channel. */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, params, num_returns, params,
			   num_params, -1, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], list;

		/* Get the pointer to rw buffers. The 64-bit address is
		 * reassembled from two 32-bit SGPRs, then cast to a
		 * descriptor-array pointer. */
		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
		list = lp_build_gather_values(gallivm, ptr, 2);
		list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
		list = LLVMBuildIntToPtr(gallivm->builder, list,
					  const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");

		si_llvm_emit_polygon_stipple(&ctx, list, pos);
	}

	/* Interpolate colors. Up to two color inputs, 4 channels each;
	 * interpolated values are appended as extra return values. */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1). */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			/* Two consecutive VGPRs hold the (i, j) barycentrics. */
			interp[0] = LLVMGetParam(func, interp_vgpr);
			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
						     ctx.v2i32, "");
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
		}

		interp_fs_input(&ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append each read channel to the return value. */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Force per-sample interpolation. The six barycentric pairs start
	 * at the first input VGPR: SAMPLE, CENTER, CENTROID for PERSP,
	 * then the same three for LINEAR. */
	if (key->ps_prolog.states.force_persample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2], linear_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Tell LLVM to insert WQM instruction sequence when needed. */
	if (key->ps_prolog.wqm) {
		LLVMAddTargetDependentFunctionAttr(func,
						   "amdgpu-ps-wqm-outputs", "");
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7242
7243 /**
7244 * Compile the pixel shader epilog. This handles everything that must be
7245 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
7246 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Only the address of the member is taken here; ctx itself is
	 * initialized by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* 16 SGPR slots + 8 MRTs * 4 channels + Z/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_array_pointer, last_sgpr, num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_array_pointer = -1;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: 4 channels per written color, plus the
	 * optional depth/stencil/samplemask values. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Ensure the samplemask VGPR location always exists. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, so that export can carry the DONE
	 * bit. If Z/stencil/samplemask are written, the depth export is
	 * last instead and last_color_export stays -1. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Export each written MRT color; its 4 channels are consecutive
	 * input VGPRs. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	/* The hardware requires at least one export; emit a null export
	 * when nothing else was exported. */
	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
	else if (last_color_export == -1)
		si_export_null(bld_base);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7360
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Builds prolog/epilog keys from the shader key and TGSI info, fetches (or
 * compiles) the matching parts from the screen-level caches, and then fixes
 * up SPI_PS_INPUT_ENA so the hardware interpolation setup matches what the
 * selected parts actually consume.
 *
 * \return true on success, false if a required part failed to compile.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
	/* The prolog needs WQM (whole-quad mode) if the main part uses
	 * derivatives and the prolog computes interpolated colors or
	 * re-interpolates at sample locations. */
	prolog_key.ps_prolog.wqm = info->uses_derivatives &&
		(prolog_key.ps_prolog.colors_read ||
		 prolog_key.ps_prolog.states.force_persample_interp);

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			/* Two-sided selection needs the front-face VGPR. */
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* Describe, per color input (COLOR0/COLOR1), which interpolation
		 * VGPRs the prolog will read, and enable them in
		 * SPI_PS_INPUT_ENA accordingly. */
		for (i = 0; i < 2; i++) {
			unsigned location = info->input_interpolate_loc[color[i]];

			/* Skip colors whose components are all unread
			 * (colors_read holds a 4-bit mask per color). */
			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			/* Force per-sample interpolation for the colors here. */
			if (shader->key.ps.prolog.force_persample_interp)
				location = TGSI_INTERPOLATE_LOC_SAMPLE;

			/* color_interp_vgpr_index encodes which (i,j) VGPR pair
			 * feeds the interpolation: -1 = flat/constant,
			 * 0/2/4 = perspective sample/center/centroid,
			 * 6/8/10 = linear sample/center/centroid. */
			switch (info->input_interpolate[color[i]]) {
			case TGSI_INTERPOLATE_CONSTANT:
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persample_interp ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog.  Unlike the prolog, the epilog is always needed
	 * because it performs the exports. */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed.
	 * Center/centroid interpolation is replaced by per-sample
	 * interpolation, so swap the enable bits accordingly. */
	if (shader->key.ps.prolog.force_persample_interp) {
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
		}
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
		}
	}

	/* POS_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7530
7531 static void si_fix_num_sgprs(struct si_shader *shader)
7532 {
7533 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7534
7535 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7536 }
7537
/**
 * Create a shader variant: either compile the whole TGSI shader
 * monolithically, or reuse the pre-compiled main part and attach
 * prolog/epilog parts, then dump and upload the final binary.
 *
 * \return 0 on success, negative error code on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (shader->selector->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over.
		 * is_binary_shared marks the binary as owned by the main
		 * part, so si_shader_destroy won't free it twice. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts.
		 * The combined shader must reserve the maximum of what any
		 * of its parts needs. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7643
7644 void si_shader_destroy(struct si_shader *shader)
7645 {
7646 if (shader->gs_copy_shader) {
7647 si_shader_destroy(shader->gs_copy_shader);
7648 FREE(shader->gs_copy_shader);
7649 }
7650
7651 if (shader->scratch_bo)
7652 r600_resource_reference(&shader->scratch_bo, NULL);
7653
7654 r600_resource_reference(&shader->bo, NULL);
7655
7656 if (!shader->is_binary_shared)
7657 radeon_shader_binary_clean(&shader->binary);
7658 }