radeonsi: Add config parameter to si_shader_apply_scratch_relocs.
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "radeon/r600_cs.h"
37 #include "radeon/radeon_llvm.h"
38 #include "radeon/radeon_elf_util.h"
39 #include "radeon/radeon_llvm_emit.h"
40 #include "util/u_memory.h"
41 #include "util/u_pstipple.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* ELF symbol names for the scratch buffer resource descriptor words.
 * NOTE(review): presumably patched into the shader binary at upload time
 * (see si_shader_apply_scratch_relocs) — confirm against the reloc code. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
59
/* One shader output slot: up to four channel values plus the TGSI
 * semantic that identifies the output. */
struct si_shader_output_values
{
	LLVMValueRef values[4];	/* one LLVM value per channel (x, y, z, w) */
	unsigned name;		/* TGSI_SEMANTIC_* */
	unsigned sid;		/* semantic index */
};
66
/* Per-compilation state for translating one TGSI shader into LLVM IR.
 * Embeds radeon_llvm_context so the lp_build_tgsi_context passed to the
 * TGSI emit callbacks can be cast back to this type (see
 * si_shader_context() below). */
struct si_shader_context
{
	struct radeon_llvm_context radeon_bld;	/* must remain the first member */
	struct si_shader *shader;
	struct si_screen *screen;

	unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of LLVM function parameters; which are meaningful depends
	 * on the shader type (VS/TES/GS/...). */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;

	LLVMTargetMachineRef tm;

	LLVMValueRef const_md;	/* metadata node attached to constant loads
				 * (see build_indexed_load_const) */
	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
	LLVMValueRef lds;	/* base pointer of the LDS array */
	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
	LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS];
	LLVMValueRef images[SI_NUM_IMAGES];
	LLVMValueRef so_buffers[4];	/* streamout buffer descriptors */
	LLVMValueRef esgs_ring;		/* ES->GS ring buffer descriptor */
	LLVMValueRef gsvs_ring[4];
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Frequently used LLVM types. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
127
/* Downcast the generic TGSI build context to our si_shader_context.
 * Valid because radeon_bld is the first member of si_shader_context and
 * bld_base is presumably the first field of radeon_llvm_context — TODO
 * confirm against radeon_llvm.h. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
133
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm);

/* Ideally pass the sample mask input to the PS epilog as v13, which
 * is its usual location, so that the shader doesn't have to add v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 13

/* The VS location of the PrimitiveID input is the same in the epilog,
 * so that the main shader part doesn't have to move it.
 */
#define VS_EPILOG_PRIMID_LOC 2

/* Base parameter indices for the two interpolation weight groups. */
#define PERSPECTIVE_BASE 0
#define LINEAR_BASE 9

/* Offsets of the sample/center/centroid (i,j) pairs within a group. */
#define SAMPLE_OFFSET 0
#define CENTER_OFFSET 2
#define CENTROID_OFSET 4	/* sic: misspelling kept — renaming would break users */

#define USE_SGPR_MAX_SUFFIX_LEN 5
/* LLVM address spaces used by the AMDGPU backend. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3
#define USER_SGPR_ADDR_SPACE 8


/* s_sendmsg message codes — NOTE(review): presumably match the hardware
 * SIMM16 encoding; confirm against the SI ISA documentation. */
#define SENDMSG_GS 2
#define SENDMSG_GS_DONE 3

#define SENDMSG_GS_OP_NOP (0 << 4)
#define SENDMSG_GS_OP_CUT (1 << 4)
#define SENDMSG_GS_OP_EMIT (2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
169
170 /**
171 * Returns a unique index for a semantic name and index. The index must be
172 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
173 * calculated.
174 */
175 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
176 {
177 switch (semantic_name) {
178 case TGSI_SEMANTIC_POSITION:
179 return 0;
180 case TGSI_SEMANTIC_PSIZE:
181 return 1;
182 case TGSI_SEMANTIC_CLIPDIST:
183 assert(index <= 1);
184 return 2 + index;
185 case TGSI_SEMANTIC_GENERIC:
186 if (index <= 63-4)
187 return 4 + index;
188 else
189 /* same explanation as in the default statement,
190 * the only user hitting this is st/nine.
191 */
192 return 0;
193
194 /* patch indices are completely separate and thus start from 0 */
195 case TGSI_SEMANTIC_TESSOUTER:
196 return 0;
197 case TGSI_SEMANTIC_TESSINNER:
198 return 1;
199 case TGSI_SEMANTIC_PATCH:
200 return 2 + index;
201
202 default:
203 /* Don't fail here. The result of this function is only used
204 * for LS, TCS, TES, and GS, where legacy GL semantics can't
205 * occur, but this function is called for all vertex shaders
206 * before it's known whether LS will be compiled or not.
207 */
208 return 0;
209 }
210 }
211
/**
 * Get the value of a shader input parameter and extract a bitfield.
 *
 * \param param    index of the LLVM function parameter
 * \param rshift   bit offset of the field within the 32-bit parameter
 * \param bitwidth width of the field in bits
 */
static LLVMValueRef unpack_param(struct si_shader_context *ctx,
				 unsigned param, unsigned rshift,
				 unsigned bitwidth)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
					  param);

	/* Parameters may be declared as float; reinterpret the bits as int. */
	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
		value = bitcast(&ctx->radeon_bld.soa.bld_base,
				TGSI_TYPE_UNSIGNED, value);

	if (rshift)
		value = LLVMBuildLShr(gallivm->builder, value,
				      lp_build_const_int32(gallivm, rshift), "");

	/* Mask off high bits unless the field reaches bit 31 anyway. */
	if (rshift + bitwidth < 32) {
		unsigned mask = (1 << bitwidth) - 1;
		value = LLVMBuildAnd(gallivm->builder, value,
				     lp_build_const_int32(gallivm, mask), "");
	}

	return value;
}
239
240 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
241 {
242 switch (ctx->type) {
243 case TGSI_PROCESSOR_TESS_CTRL:
244 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
245
246 case TGSI_PROCESSOR_TESS_EVAL:
247 return LLVMGetParam(ctx->radeon_bld.main_fn,
248 ctx->param_tes_rel_patch_id);
249
250 default:
251 assert(0);
252 return NULL;
253 }
254 }
255
256 /* Tessellation shaders pass outputs to the next shader using LDS.
257 *
258 * LS outputs = TCS inputs
259 * TCS outputs = TES inputs
260 *
261 * The LDS layout is:
262 * - TCS inputs for patch 0
263 * - TCS inputs for patch 1
264 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
265 * - ...
266 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
267 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
268 * - TCS outputs for patch 1
269 * - Per-patch TCS outputs for patch 1
270 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
271 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
272 * - ...
273 *
274 * All three shaders VS(LS), TCS, TES share the same LDS space.
275 */
276
277 static LLVMValueRef
278 get_tcs_in_patch_stride(struct si_shader_context *ctx)
279 {
280 if (ctx->type == TGSI_PROCESSOR_VERTEX)
281 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
282 else if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
283 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
284 else {
285 assert(0);
286 return NULL;
287 }
288 }
289
/* LDS stride in dwords of one patch's worth of TCS outputs
 * (bits [0:12] of SI_PARAM_TCS_OUT_LAYOUT). */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
295
/* Dword offset in LDS where TCS outputs for patch 0 start.
 * SI_PARAM_TCS_OUT_OFFSETS bits [0:15] hold the offset divided by 4,
 * hence the multiply. */
static LLVMValueRef
get_tcs_out_patch0_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
				unpack_param(ctx,
					     SI_PARAM_TCS_OUT_OFFSETS,
					     0, 16),
				4);
}
305
/* Dword offset in LDS where per-patch TCS outputs for patch 0 start.
 * SI_PARAM_TCS_OUT_OFFSETS bits [16:31] hold the offset divided by 4. */
static LLVMValueRef
get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
{
	return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
				unpack_param(ctx,
					     SI_PARAM_TCS_OUT_OFFSETS,
					     16, 16),
				4);
}
315
316 static LLVMValueRef
317 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
318 {
319 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
320 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
321 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
322
323 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
324 }
325
/* Dword offset in LDS of the current patch's TCS outputs:
 * patch0 offset + RelPatchID * output patch stride. */
static LLVMValueRef
get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}
339
/* Dword offset in LDS of the current patch's per-patch TCS outputs:
 * patch0 per-patch data offset + RelPatchID * output patch stride. */
static LLVMValueRef
get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef patch0_patch_data_offset =
		get_tcs_out_patch0_patch_data_offset(ctx);
	LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);

	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
			    LLVMBuildMul(gallivm->builder, patch_stride,
					 rel_patch_id, ""),
			    "");
}
354
/**
 * Build an indexed store: equivalent to base_ptr[index] = value,
 * emitted as LLVMBuildGEP + LLVMBuildStore.
 */
static void build_indexed_store(struct si_shader_context *ctx,
				LLVMValueRef base_ptr, LLVMValueRef index,
				LLVMValueRef value)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2], pointer;

	/* First index steps through the pointer itself, second through
	 * the pointed-to array. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = index;

	pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
	LLVMBuildStore(gallivm->builder, value, pointer);
}
369
/**
 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr Where the array starts.
 * \param index The element index into the array.
 */
static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
				       LLVMValueRef base_ptr, LLVMValueRef index)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2], pointer;

	/* First index steps through the pointer itself, second through
	 * the pointed-to array. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = index;

	pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
	return LLVMBuildLoad(gallivm->builder, pointer, "");
}
390
391 /**
392 * Do a load from &base_ptr[index], but also add a flag that it's loading
393 * a constant.
394 */
395 static LLVMValueRef build_indexed_load_const(
396 struct si_shader_context *ctx,
397 LLVMValueRef base_ptr, LLVMValueRef index)
398 {
399 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index);
400 LLVMSetMetadata(result, 1, ctx->const_md);
401 return result;
402 }
403
/**
 * Compute the buffer index for an instanced vertex fetch:
 * InstanceID / divisor + StartInstance.
 *
 * \param param_start_instance LLVM parameter index holding START_INSTANCE
 * \param divisor              instance divisor (values > 1 divide InstanceID)
 */
static LLVMValueRef get_instance_index_for_fetch(
	struct radeon_llvm_context *radeon_bld,
	unsigned param_start_instance, unsigned divisor)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;

	LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
					   ctx->param_instance_id);

	/* The division must be done before START_INSTANCE is added. */
	if (divisor > 1)
		result = LLVMBuildUDiv(gallivm->builder, result,
				       lp_build_const_int32(gallivm, divisor), "");

	return LLVMBuildAdd(gallivm->builder, result,
			    LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
}
423
/**
 * Declare one vertex shader input: compute its buffer index, fetch the
 * vec4 with llvm.SI.vs.load.input, and split it into per-channel SOA
 * input values.
 *
 * \param input_index index of the input among the VS inputs
 * \param decl        the TGSI declaration (unused here)
 */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* Non-monolithic: the prolog computed the index already and
		 * passes it in as a parameter. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		/* XXX: Use a helper function for this. There is one in
		 * tgsi_llvm.c. */
		ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
			LLVMBuildExtractElement(gallivm->builder,
						input, llvm_chan, "");
	}
}
492
493 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
494 unsigned swizzle)
495 {
496 struct si_shader_context *ctx = si_shader_context(bld_base);
497
498 if (swizzle > 0)
499 return bld_base->uint_bld.zero;
500
501 switch (ctx->type) {
502 case TGSI_PROCESSOR_VERTEX:
503 return LLVMGetParam(ctx->radeon_bld.main_fn,
504 ctx->param_vs_prim_id);
505 case TGSI_PROCESSOR_TESS_CTRL:
506 return LLVMGetParam(ctx->radeon_bld.main_fn,
507 SI_PARAM_PATCH_ID);
508 case TGSI_PROCESSOR_TESS_EVAL:
509 return LLVMGetParam(ctx->radeon_bld.main_fn,
510 ctx->param_tes_patch_id);
511 case TGSI_PROCESSOR_GEOMETRY:
512 return LLVMGetParam(ctx->radeon_bld.main_fn,
513 SI_PARAM_PRIMITIVE_ID);
514 default:
515 assert(0);
516 return bld_base->uint_bld.zero;
517 }
518 }
519
/**
 * Return the value of tgsi_ind_register for indexing.
 * This is the indirect index with the constant offset added to it.
 *
 * \param ind       the TGSI indirect register (address register reference)
 * \param rel_index the constant offset to add to the loaded address
 */
static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
				       const struct tgsi_ind_register *ind,
				       int rel_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef result;

	/* Load the current value of the address register, then add the
	 * constant part of the index. */
	result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
	result = LLVMBuildLoad(gallivm->builder, result, "");
	result = LLVMBuildAdd(gallivm->builder, result,
			      lp_build_const_int32(gallivm, rel_index), "");
	return result;
}
537
/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 *
 * \param num upper bound (exclusive); must be >= 1
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
	LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
	LLVMValueRef cc;

	if (util_is_power_of_two(num)) {
		/* Power-of-two bound: a simple mask suffices. */
		result = LLVMBuildAnd(builder, result, c_max, "");
	} else {
		/* In theory, this MAX pattern should result in code that is
		 * as good as the bit-wise AND above.
		 *
		 * In practice, LLVM generates worse code (at the time of
		 * writing), because its value tracking is not strong enough.
		 */
		cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
		result = LLVMBuildSelect(builder, cc, result, c_max, "");
	}

	return result;
}
567
568
/**
 * Calculate a dword address given an input or output register and a stride.
 *
 * Exactly one of \p dst / \p src is non-NULL; the address computation is
 * identical for both.
 *
 * \param dst              destination register (or NULL)
 * \param src              source register (or NULL)
 * \param vertex_dw_stride dword stride between vertices of a 2-D register
 *                         (may be NULL for per-patch registers)
 * \param base_addr        base dword address to add the register offset to
 */
static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
				   const struct tgsi_full_dst_register *dst,
				   const struct tgsi_full_src_register *src,
				   LLVMValueRef vertex_dw_stride,
				   LLVMValueRef base_addr)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	int first, param;
	struct tgsi_full_dst_register reg;

	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		LLVMValueRef index;

		if (reg.Dimension.Indirect)
			index = get_indirect_index(ctx, &reg.DimIndirect,
						   reg.Dimension.Index);
		else
			index = lp_build_const_int32(gallivm, reg.Dimension.Index);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, index,
						      vertex_dw_stride, ""), "");
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Add the relative address of the element. */
		LLVMValueRef ind_index;

		/* For arrays, the semantic lookup must use the array's
		 * first register; the indirect index is relative to it. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		ind_index = get_indirect_index(ctx, &reg.Indirect,
					       reg.Register.Index - first);

		/* Each register occupies 4 dwords. */
		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 LLVMBuildMul(gallivm->builder, ind_index,
						      lp_build_const_int32(gallivm, 4), ""), "");

		param = si_shader_io_get_unique_index(name[first], index[first]);
	} else {
		param = si_shader_io_get_unique_index(name[reg.Register.Index],
						      index[reg.Register.Index]);
	}

	/* Add the base address of the element. */
	return LLVMBuildAdd(gallivm->builder, base_addr,
			    lp_build_const_int32(gallivm, param * 4), "");
}
653
/**
 * Load from LDS.
 *
 * \param type output value type
 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	/* swizzle == ~0 means load all four channels and gather a vec4. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr);
	/* Doubles span two consecutive dwords; load the second half too. */
	if (type == TGSI_TYPE_DOUBLE) {
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr);
		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
694
/**
 * Store to LDS.
 *
 * \param swizzle offset (typically 0..3)
 * \param dw_addr address in dwords
 * \param value value to store
 */
static void lds_store(struct lp_build_tgsi_context *bld_base,
		      unsigned swizzle, LLVMValueRef dw_addr,
		      LLVMValueRef value)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	/* LDS is stored as raw i32 dwords. */
	value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
	build_indexed_store(ctx, ctx->lds,
			    dw_addr, value);
}
716
/* Fetch a TCS input (i.e. an LS output) from LDS.
 * Inputs are per-vertex, so the vertex dword stride (bits [13:20] of
 * SI_PARAM_TCS_IN_LAYOUT) is needed for the address computation. */
static LLVMValueRef fetch_input_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	dw_addr = get_tcs_in_current_patch_offset(ctx);
	dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);

	return lds_load(bld_base, type, swizzle, dw_addr);
}
731
/* Fetch a TCS output from LDS (TCS may read back its own outputs).
 * 2-D registers are per-vertex outputs and need the vertex stride;
 * others are per-patch outputs in the patch-data area. */
static LLVMValueRef fetch_output_tcs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}
751
/* Fetch a TES input (i.e. a TCS output) from LDS.
 * Intentionally identical to fetch_output_tcs: TES inputs live in the
 * same LDS area the TCS wrote its outputs to. */
static LLVMValueRef fetch_input_tes(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type, unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef dw_addr, stride;

	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
	}

	return lds_load(bld_base, type, swizzle, dw_addr);
}
771
/**
 * Store a TCS output to LDS.  Non-OUTPUT registers and vector stores are
 * delegated to the generic TGSI store; vectors get scalarized there and
 * come back through this function channel by channel.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* 2-D registers are per-vertex outputs; others are per-patch. */
	if (reg->Register.Dimension) {
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		lds_store(bld_base, chan_index, dw_addr, value);
	}
}
809
/**
 * Fetch a GS input from the ESGS ring buffer.
 *
 * GS inputs are 2-dimensional (vertex, attribute).  The per-vertex offset
 * comes from the VTX*_OFFSET SGPRs (multiplied by 4, so presumably stored
 * in dwords — confirm against the ES store path), and the attribute offset
 * is encoded as an immediate in args[2].
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	/* PrimitiveID is not stored in the ring; it comes from an SGPR. */
	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	if (!reg->Register.Dimension)
		return NULL;

	/* swizzle == ~0: load all four channels and gather a vec4. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one; /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one; /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	/* Doubles span two consecutive dwords; load the second half too. */
	if (type == TGSI_TYPE_DOUBLE) {
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
		return radeon_llvm_emit_fetch_double(bld_base,
						     value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
888
889 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
890 {
891 switch (interpolate) {
892 case TGSI_INTERPOLATE_CONSTANT:
893 return 0;
894
895 case TGSI_INTERPOLATE_LINEAR:
896 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
897 return SI_PARAM_LINEAR_SAMPLE;
898 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
899 return SI_PARAM_LINEAR_CENTROID;
900 else
901 return SI_PARAM_LINEAR_CENTER;
902 break;
903 case TGSI_INTERPOLATE_COLOR:
904 case TGSI_INTERPOLATE_PERSPECTIVE:
905 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
906 return SI_PARAM_PERSP_SAMPLE;
907 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
908 return SI_PARAM_PERSP_CENTROID;
909 else
910 return SI_PARAM_PERSP_CENTER;
911 break;
912 default:
913 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
914 return -1;
915 }
916 }
917
918 /* This shouldn't be used by explicit INTERP opcodes. */
919 static unsigned select_interp_param(struct si_shader_context *ctx,
920 unsigned param)
921 {
922 if (!ctx->shader->key.ps.prolog.force_persample_interp ||
923 !ctx->is_monolithic)
924 return param;
925
926 /* If the shader doesn't use center/centroid, just return the parameter.
927 *
928 * If the shader only uses one set of (i,j), "si_emit_spi_ps_input" can
929 * switch between center/centroid and sample without shader changes.
930 */
931 switch (param) {
932 case SI_PARAM_PERSP_CENTROID:
933 case SI_PARAM_PERSP_CENTER:
934 return SI_PARAM_PERSP_SAMPLE;
935
936 case SI_PARAM_LINEAR_CENTROID:
937 case SI_PARAM_LINEAR_CENTER:
938 return SI_PARAM_LINEAR_SAMPLE;
939
940 default:
941 return param;
942 }
943 }
944
945 /**
946 * Interpolate a fragment shader input.
947 *
948 * @param ctx context
949 * @param input_index index of the input in hardware
950 * @param semantic_name TGSI_SEMANTIC_*
951 * @param semantic_index semantic index
952 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
953 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
954 * @param interp_param interpolation weights (i,j)
955 * @param prim_mask SI_PARAM_PRIM_MASK
956 * @param face SI_PARAM_FRONT_FACE
957 * @param result the return value (4 components)
958 */
959 static void interp_fs_input(struct si_shader_context *ctx,
960 unsigned input_index,
961 unsigned semantic_name,
962 unsigned semantic_index,
963 unsigned num_interp_inputs,
964 unsigned colors_read_mask,
965 LLVMValueRef interp_param,
966 LLVMValueRef prim_mask,
967 LLVMValueRef face,
968 LLVMValueRef result[4])
969 {
970 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
971 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
972 struct gallivm_state *gallivm = base->gallivm;
973 const char *intr_name;
974 LLVMValueRef attr_number;
975
976 unsigned chan;
977
978 attr_number = lp_build_const_int32(gallivm, input_index);
979
980 /* fs.constant returns the param from the middle vertex, so it's not
981 * really useful for flat shading. It's meant to be used for custom
982 * interpolation (but the intrinsic can't fetch from the other two
983 * vertices).
984 *
985 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
986 * to do the right thing. The only reason we use fs.constant is that
987 * fs.interp cannot be used on integers, because they can be equal
988 * to NaN.
989 */
990 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
991
992 if (semantic_name == TGSI_SEMANTIC_COLOR &&
993 ctx->shader->key.ps.prolog.color_two_side) {
994 LLVMValueRef args[4];
995 LLVMValueRef is_face_positive;
996 LLVMValueRef back_attr_number;
997
998 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
999 * otherwise it's at offset "num_inputs".
1000 */
1001 unsigned back_attr_offset = num_interp_inputs;
1002 if (semantic_index == 1 && colors_read_mask & 0xf)
1003 back_attr_offset += 1;
1004
1005 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1006
1007 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1008 face, uint->zero, "");
1009
1010 args[2] = prim_mask;
1011 args[3] = interp_param;
1012 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1013 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1014 LLVMValueRef front, back;
1015
1016 args[0] = llvm_chan;
1017 args[1] = attr_number;
1018 front = lp_build_intrinsic(gallivm->builder, intr_name,
1019 ctx->f32, args, args[3] ? 4 : 3,
1020 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1021
1022 args[1] = back_attr_number;
1023 back = lp_build_intrinsic(gallivm->builder, intr_name,
1024 ctx->f32, args, args[3] ? 4 : 3,
1025 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1026
1027 result[chan] = LLVMBuildSelect(gallivm->builder,
1028 is_face_positive,
1029 front,
1030 back,
1031 "");
1032 }
1033 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1034 LLVMValueRef args[4];
1035
1036 args[0] = uint->zero;
1037 args[1] = attr_number;
1038 args[2] = prim_mask;
1039 args[3] = interp_param;
1040 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1041 ctx->f32, args, args[3] ? 4 : 3,
1042 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1043 result[1] =
1044 result[2] = lp_build_const_float(gallivm, 0.0f);
1045 result[3] = lp_build_const_float(gallivm, 1.0f);
1046 } else {
1047 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1048 LLVMValueRef args[4];
1049 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1050
1051 args[0] = llvm_chan;
1052 args[1] = attr_number;
1053 args[2] = prim_mask;
1054 args[3] = interp_param;
1055 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1056 ctx->f32, args, args[3] ? 4 : 3,
1057 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1058 }
1059 }
1060 }
1061
/* Declare one fragment shader input: either read the color values the
 * prolog already placed in VGPRs (non-monolithic shaders), or emit
 * interpolation code via interp_fs_input().
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl)
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* 4 read-mask bits per color; select this color's nibble. */
		unsigned mask = colors_read >> (i * 4);
		/* COLOR1 VGPRs start after however many COLOR0 components
		 * were actually read. */
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);

		/* Unread components get an undef instead of consuming a VGPR. */
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
			mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
			mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
			mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
			mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	/* -1 = error, 0 = flat (no weights), >0 = (i,j) parameter index. */
	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = LLVMGetParam(main_fn, interp_param_idx);
	}

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
}
1112
1113 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1114 {
1115 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1116 SI_PARAM_ANCILLARY, 8, 4);
1117 }
1118
1119 /**
1120 * Load a dword from a constant buffer.
1121 */
1122 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1123 LLVMValueRef offset, LLVMTypeRef return_type)
1124 {
1125 LLVMValueRef args[2] = {resource, offset};
1126
1127 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1128 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1129 }
1130
1131 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1132 {
1133 struct si_shader_context *ctx =
1134 si_shader_context(&radeon_bld->soa.bld_base);
1135 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1136 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1137 LLVMBuilderRef builder = gallivm->builder;
1138 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1139 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_DRIVER_STATE_CONST_BUF);
1140 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1141
1142 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1143 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1144 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1145
1146 LLVMValueRef pos[4] = {
1147 buffer_load_const(builder, resource, offset0, ctx->f32),
1148 buffer_load_const(builder, resource, offset1, ctx->f32),
1149 lp_build_const_float(gallivm, 0),
1150 lp_build_const_float(gallivm, 0)
1151 };
1152
1153 return lp_build_gather_values(gallivm, pos, 4);
1154 }
1155
/**
 * Declare a TGSI system value and store the resulting LLVM value in
 * radeon_bld->system_values[index].
 *
 * Most values come straight from shader input SGPRs/VGPRs; a few are
 * unpacked from bitfields, loaded from memory, or assembled into vectors.
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* VERTEXID = relative vertex id + base vertex. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* TCS: 5 bits starting at bit 8 of REL_IDS;
		 * GS: a dedicated input register. */
		if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			/* The W component is inverted with RCP here —
			 * presumably the hw supplies 1/W; confirm against
			 * the SPI position input definition. */
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* Sample position = fractional part of the pixel-center
		 * position inputs; z = w = 0. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count: 6 bits starting at bit 26 of
		 * TCS_OUT_LAYOUT. */
		value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess factors are read back from LDS at the current patch's
		 * data offset plus the output's unique slot (4 dwords each). */
		LLVMValueRef dw_addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
				       lp_build_const_int32(gallivm, param * 4), "");

		value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
				 ~0, dw_addr);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* The compute block size is a compile-time constant taken
		 * from the TGSI properties. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;
		unsigned sizes[3] = {
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
		};

		for (i = 0; i < 3; ++i)
			values[i] = lp_build_const_int32(gallivm, sizes[i]);

		value = lp_build_gather_values(gallivm, values, 3);
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1324
1325 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1326 const struct tgsi_full_declaration *decl)
1327 {
1328 struct si_shader_context *ctx =
1329 si_shader_context(&radeon_bld->soa.bld_base);
1330 struct si_shader_selector *sel = ctx->shader->selector;
1331 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1332
1333 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1334 LLVMValueRef var;
1335
1336 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1337 assert(decl->Range.First == decl->Range.Last);
1338 assert(!ctx->shared_memory);
1339
1340 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1341 LLVMArrayType(ctx->i8, sel->local_size),
1342 "compute_lds",
1343 LOCAL_ADDR_SPACE);
1344 LLVMSetAlignment(var, 4);
1345
1346 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1347 }
1348
1349 static LLVMValueRef fetch_constant(
1350 struct lp_build_tgsi_context *bld_base,
1351 const struct tgsi_full_src_register *reg,
1352 enum tgsi_opcode_type type,
1353 unsigned swizzle)
1354 {
1355 struct si_shader_context *ctx = si_shader_context(bld_base);
1356 struct lp_build_context *base = &bld_base->base;
1357 const struct tgsi_ind_register *ireg = &reg->Indirect;
1358 unsigned buf, idx;
1359
1360 LLVMValueRef addr, bufp;
1361 LLVMValueRef result;
1362
1363 if (swizzle == LP_CHAN_ALL) {
1364 unsigned chan;
1365 LLVMValueRef values[4];
1366 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1367 values[chan] = fetch_constant(bld_base, reg, type, chan);
1368
1369 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1370 }
1371
1372 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1373 idx = reg->Register.Index * 4 + swizzle;
1374
1375 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1376 if (type != TGSI_TYPE_DOUBLE)
1377 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1378 else {
1379 return radeon_llvm_emit_fetch_double(bld_base,
1380 ctx->constants[buf][idx],
1381 ctx->constants[buf][idx + 1]);
1382 }
1383 }
1384
1385 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1386 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1387 LLVMValueRef index;
1388 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1389 reg->Dimension.Index,
1390 SI_NUM_USER_CONST_BUFFERS);
1391 bufp = build_indexed_load_const(ctx, ptr, index);
1392 } else
1393 bufp = ctx->const_buffers[buf];
1394
1395 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1396 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1397 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1398 addr = lp_build_add(&bld_base->uint_bld, addr,
1399 lp_build_const_int32(base->gallivm, idx * 4));
1400
1401 result = buffer_load_const(base->gallivm->builder, bufp,
1402 addr, ctx->f32);
1403
1404 if (type != TGSI_TYPE_DOUBLE)
1405 result = bitcast(bld_base, type, result);
1406 else {
1407 LLVMValueRef addr2, result2;
1408 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1409 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1410 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1411 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1412 lp_build_const_int32(base->gallivm, idx * 4));
1413
1414 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1415 addr2, ctx->f32);
1416
1417 result = radeon_llvm_emit_fetch_double(bld_base,
1418 result, result2);
1419 }
1420 return result;
1421 }
1422
1423 /* Upper 16 bits must be zero. */
1424 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1425 LLVMValueRef val[2])
1426 {
1427 return LLVMBuildOr(gallivm->builder, val[0],
1428 LLVMBuildShl(gallivm->builder, val[1],
1429 lp_build_const_int32(gallivm, 16),
1430 ""), "");
1431 }
1432
1433 /* Upper 16 bits are ignored and will be dropped. */
1434 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1435 LLVMValueRef val[2])
1436 {
1437 LLVMValueRef v[2] = {
1438 LLVMBuildAnd(gallivm->builder, val[0],
1439 lp_build_const_int32(gallivm, 0xffff), ""),
1440 val[1],
1441 };
1442 return si_llvm_pack_two_int16(gallivm, v);
1443 }
1444
1445 /* Initialize arguments for the shader export intrinsic */
1446 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1447 LLVMValueRef *values,
1448 unsigned target,
1449 LLVMValueRef *args)
1450 {
1451 struct si_shader_context *ctx = si_shader_context(bld_base);
1452 struct lp_build_context *uint =
1453 &ctx->radeon_bld.soa.bld_base.uint_bld;
1454 struct lp_build_context *base = &bld_base->base;
1455 struct gallivm_state *gallivm = base->gallivm;
1456 LLVMBuilderRef builder = base->gallivm->builder;
1457 LLVMValueRef val[4];
1458 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1459 unsigned chan;
1460 bool is_int8;
1461
1462 /* Default is 0xf. Adjusted below depending on the format. */
1463 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1464
1465 /* Specify whether the EXEC mask represents the valid mask */
1466 args[1] = uint->zero;
1467
1468 /* Specify whether this is the last export */
1469 args[2] = uint->zero;
1470
1471 /* Specify the target we are exporting */
1472 args[3] = lp_build_const_int32(base->gallivm, target);
1473
1474 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
1475 const union si_shader_key *key = &ctx->shader->key;
1476 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1477 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1478
1479 assert(cbuf >= 0 && cbuf < 8);
1480 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1481 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1482 }
1483
1484 args[4] = uint->zero; /* COMPR flag */
1485 args[5] = base->undef;
1486 args[6] = base->undef;
1487 args[7] = base->undef;
1488 args[8] = base->undef;
1489
1490 switch (spi_shader_col_format) {
1491 case V_028714_SPI_SHADER_ZERO:
1492 args[0] = uint->zero; /* writemask */
1493 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
1494 break;
1495
1496 case V_028714_SPI_SHADER_32_R:
1497 args[0] = uint->one; /* writemask */
1498 args[5] = values[0];
1499 break;
1500
1501 case V_028714_SPI_SHADER_32_GR:
1502 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
1503 args[5] = values[0];
1504 args[6] = values[1];
1505 break;
1506
1507 case V_028714_SPI_SHADER_32_AR:
1508 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
1509 args[5] = values[0];
1510 args[8] = values[3];
1511 break;
1512
1513 case V_028714_SPI_SHADER_FP16_ABGR:
1514 args[4] = uint->one; /* COMPR flag */
1515
1516 for (chan = 0; chan < 2; chan++) {
1517 LLVMValueRef pack_args[2] = {
1518 values[2 * chan],
1519 values[2 * chan + 1]
1520 };
1521 LLVMValueRef packed;
1522
1523 packed = lp_build_intrinsic(base->gallivm->builder,
1524 "llvm.SI.packf16",
1525 ctx->i32, pack_args, 2,
1526 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1527 args[chan + 5] =
1528 LLVMBuildBitCast(base->gallivm->builder,
1529 packed, ctx->f32, "");
1530 }
1531 break;
1532
1533 case V_028714_SPI_SHADER_UNORM16_ABGR:
1534 for (chan = 0; chan < 4; chan++) {
1535 val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
1536 val[chan] = LLVMBuildFMul(builder, val[chan],
1537 lp_build_const_float(gallivm, 65535), "");
1538 val[chan] = LLVMBuildFAdd(builder, val[chan],
1539 lp_build_const_float(gallivm, 0.5), "");
1540 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1541 ctx->i32, "");
1542 }
1543
1544 args[4] = uint->one; /* COMPR flag */
1545 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1546 si_llvm_pack_two_int16(gallivm, val));
1547 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1548 si_llvm_pack_two_int16(gallivm, val+2));
1549 break;
1550
1551 case V_028714_SPI_SHADER_SNORM16_ABGR:
1552 for (chan = 0; chan < 4; chan++) {
1553 /* Clamp between [-1, 1]. */
1554 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1555 values[chan],
1556 lp_build_const_float(gallivm, 1));
1557 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1558 val[chan],
1559 lp_build_const_float(gallivm, -1));
1560 /* Convert to a signed integer in [-32767, 32767]. */
1561 val[chan] = LLVMBuildFMul(builder, val[chan],
1562 lp_build_const_float(gallivm, 32767), "");
1563 /* If positive, add 0.5, else add -0.5. */
1564 val[chan] = LLVMBuildFAdd(builder, val[chan],
1565 LLVMBuildSelect(builder,
1566 LLVMBuildFCmp(builder, LLVMRealOGE,
1567 val[chan], base->zero, ""),
1568 lp_build_const_float(gallivm, 0.5),
1569 lp_build_const_float(gallivm, -0.5), ""), "");
1570 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1571 }
1572
1573 args[4] = uint->one; /* COMPR flag */
1574 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1575 si_llvm_pack_two_int32_as_int16(gallivm, val));
1576 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1577 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
1578 break;
1579
1580 case V_028714_SPI_SHADER_UINT16_ABGR: {
1581 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1582 255 : 65535);
1583 /* Clamp. */
1584 for (chan = 0; chan < 4; chan++) {
1585 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1586 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1587 val[chan], max);
1588 }
1589
1590 args[4] = uint->one; /* COMPR flag */
1591 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1592 si_llvm_pack_two_int16(gallivm, val));
1593 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1594 si_llvm_pack_two_int16(gallivm, val+2));
1595 break;
1596 }
1597
1598 case V_028714_SPI_SHADER_SINT16_ABGR: {
1599 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1600 127 : 32767);
1601 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
1602 -128 : -32768);
1603 /* Clamp. */
1604 for (chan = 0; chan < 4; chan++) {
1605 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1606 val[chan] = lp_build_emit_llvm_binary(bld_base,
1607 TGSI_OPCODE_IMIN,
1608 val[chan], max);
1609 val[chan] = lp_build_emit_llvm_binary(bld_base,
1610 TGSI_OPCODE_IMAX,
1611 val[chan], min);
1612 }
1613
1614 args[4] = uint->one; /* COMPR flag */
1615 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1616 si_llvm_pack_two_int32_as_int16(gallivm, val));
1617 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1618 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
1619 break;
1620 }
1621
1622 case V_028714_SPI_SHADER_32_ABGR:
1623 memcpy(&args[5], values, sizeof(values[0]) * 4);
1624 break;
1625 }
1626 }
1627
1628 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1629 LLVMValueRef alpha)
1630 {
1631 struct si_shader_context *ctx = si_shader_context(bld_base);
1632 struct gallivm_state *gallivm = bld_base->base.gallivm;
1633
1634 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1635 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
1636 SI_PARAM_ALPHA_REF);
1637
1638 LLVMValueRef alpha_pass =
1639 lp_build_cmp(&bld_base->base,
1640 ctx->shader->key.ps.epilog.alpha_func,
1641 alpha, alpha_ref);
1642 LLVMValueRef arg =
1643 lp_build_select(&bld_base->base,
1644 alpha_pass,
1645 lp_build_const_float(gallivm, 1.0f),
1646 lp_build_const_float(gallivm, -1.0f));
1647
1648 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
1649 ctx->voidt, &arg, 1, 0);
1650 } else {
1651 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
1652 ctx->voidt, NULL, 0, 0);
1653 }
1654 }
1655
1656 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
1657 LLVMValueRef alpha,
1658 unsigned samplemask_param)
1659 {
1660 struct si_shader_context *ctx = si_shader_context(bld_base);
1661 struct gallivm_state *gallivm = bld_base->base.gallivm;
1662 LLVMValueRef coverage;
1663
1664 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1665 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
1666 samplemask_param);
1667 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
1668
1669 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
1670 ctx->i32,
1671 &coverage, 1, LLVMReadNoneAttribute);
1672
1673 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
1674 ctx->f32, "");
1675
1676 coverage = LLVMBuildFMul(gallivm->builder, coverage,
1677 lp_build_const_float(gallivm,
1678 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1679
1680 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
1681 }
1682
/* Compute the 8 clip distances by dotting the clip vertex (out_elts)
 * with the user clip planes stored in the driver-state constant buffer,
 * and fill in the export args for the two clip-distance position
 * exports (pos[2] and pos[3]).
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm, SI_DRIVER_STATE_CONST_BUF);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	/* Each export handles 4 clip distances (args[5..8]). */
	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of plane (reg_index*4 + chan),
				 * component const_chan; planes are vec4s. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(base->gallivm->builder, const_resource,
							     args[1], ctx->f32);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Standard position-export header: full writemask,
		 * POS+2/POS+3 target, no COMPR. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
1728
1729 static void si_dump_streamout(struct pipe_stream_output_info *so)
1730 {
1731 unsigned i;
1732
1733 if (so->num_outputs)
1734 fprintf(stderr, "STREAMOUT\n");
1735
1736 for (i = 0; i < so->num_outputs; i++) {
1737 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
1738 so->output[i].start_component;
1739 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
1740 i, so->output[i].output_buffer,
1741 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
1742 so->output[i].register_index,
1743 mask & 1 ? "x" : "",
1744 mask & 2 ? "y" : "",
1745 mask & 4 ? "z" : "",
1746 mask & 8 ? "w" : "");
1747 }
1748 }
1749
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * \param rsrc		buffer resource descriptor
 * \param vdata		value(s) to store
 * \param num_channels	1..4 dwords
 * \param vaddr		per-thread address (VGPR)
 * \param soffset	scalar base offset (SGPR)
 * \param inst_offset	immediate offset (must fit in 12 bits unless offen)
 * \param dfmt, nfmt	BUF_DATA_FORMAT_* / BUF_NUM_FORMAT_*
 * \param offen, idxen	enable vaddr offset / index addressing
 * \param glc, slc, tfe	cache / texture-fail-enable control bits
 */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, Elements(args), 0);
}
1797
1798 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
1799 LLVMValueRef rsrc,
1800 LLVMValueRef vdata,
1801 unsigned num_channels,
1802 LLVMValueRef vaddr,
1803 LLVMValueRef soffset,
1804 unsigned inst_offset)
1805 {
1806 static unsigned dfmt[] = {
1807 V_008F0C_BUF_DATA_FORMAT_32,
1808 V_008F0C_BUF_DATA_FORMAT_32_32,
1809 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1810 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1811 };
1812 assert(num_channels >= 1 && num_channels <= 4);
1813
1814 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
1815 inst_offset, dfmt[num_channels-1],
1816 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
1817 }
1818
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Emits tbuffer stores of the given outputs into the bound streamout
 * buffers, guarded so that only threads within the streamed vertex
 * count (and matching the active stream) write anything.
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", ctx->i32,
					      NULL, 0, LLVMReadNoneAttribute);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Active stream: bits [25:24] of the streamout config. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
		 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			/* streamout_offset is in dwords; convert to bytes. */
			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			/* Skip outputs the shader doesn't actually write. */
			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only write if this output's stream is the
			 * currently active one. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
1936
1937
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits the streamout stores (if transform feedback is enabled), one
 * "llvm.SI.export" per generic parameter, and finally the position
 * exports (POS0..POS3), marking the last position export with the DONE
 * bit.  Also fills shader->info.nr_param_exports, nr_pos_exports and the
 * vs_output_param_offset[] table consumed by the pixel shader input
 * mapping.
 *
 * Note the goto-based re-dispatch on semantic_name: LAYER, VIEWPORT_INDEX
 * and the second pass over CLIPDIST are exported twice — once via the
 * position exports and once as a GENERIC parameter so the PS can read
 * them.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;	/* running count of PARAM exports */
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the "misc" position export (POS1) below. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			/* Saved for the "misc" position export (POS1) below. */
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Saved for POS1; also exported as a parameter. */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			/* Saved for POS1; also exported as a parameter. */
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* Clip distances occupy POS2/POS3. */
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Expanded into clip distance exports by the helper. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Position exports are buffered and emitted last,
			 * so the final one can carry the DONE bit. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			/* Export clip distances a second time as parameters,
			 * so the PS can read gl_ClipDistance. */
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2112
/* Read the tessellation factors that TCS invocations stored in LDS and
 * write them to the tess factor ring buffer, where the fixed-function
 * tessellator picks them up.  Only invocation 0 of each patch does the
 * store.
 *
 * rel_patch_id: patch index relative to the start of the threadgroup.
 * invocation_id: TCS invocation index within the patch.
 * tcs_out_current_patch_data_offset: LDS dword offset of this patch's
 * per-patch output area.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx;

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer factors first, then inner factors — this is the buffer
	 * layout the tessellator expects. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Store the outputs. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 0);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 16);
	lp_build_endif(&if_ctx);
}
2205
/* This only writes the tessellation factor levels.
 *
 * In the monolithic case the factors are written directly via
 * si_write_tess_factors.  In the non-monolithic case the main part
 * instead packs everything the separately-compiled TCS epilog needs
 * (RW_BUFFERS pointer split into two SGPRs, the tess factor buffer
 * soffset, and three VGPRs) into ctx->return_value; the epilog part
 * receives them as function parameters.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer: a 64-bit pointer split into two
		 * 32-bit SGPR return slots. */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR, "");

		/* VGPRs (bitcast to float because the return struct slots
		 * are float-typed). */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 1;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2256
2257 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2258 {
2259 struct si_shader_context *ctx = si_shader_context(bld_base);
2260 struct si_shader *shader = ctx->shader;
2261 struct tgsi_shader_info *info = &shader->selector->info;
2262 struct gallivm_state *gallivm = bld_base->base.gallivm;
2263 unsigned i, chan;
2264 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2265 ctx->param_rel_auto_id);
2266 LLVMValueRef vertex_dw_stride =
2267 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2268 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2269 vertex_dw_stride, "");
2270
2271 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2272 * its inputs from it. */
2273 for (i = 0; i < info->num_outputs; i++) {
2274 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2275 unsigned name = info->output_semantic_name[i];
2276 unsigned index = info->output_semantic_index[i];
2277 int param = si_shader_io_get_unique_index(name, index);
2278 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2279 lp_build_const_int32(gallivm, param * 4), "");
2280
2281 for (chan = 0; chan < 4; chan++) {
2282 lds_store(bld_base, chan, dw_addr,
2283 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2284 }
2285 }
2286 }
2287
2288 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2289 {
2290 struct si_shader_context *ctx = si_shader_context(bld_base);
2291 struct gallivm_state *gallivm = bld_base->base.gallivm;
2292 struct si_shader *es = ctx->shader;
2293 struct tgsi_shader_info *info = &es->selector->info;
2294 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2295 ctx->param_es2gs_offset);
2296 unsigned chan;
2297 int i;
2298
2299 for (i = 0; i < info->num_outputs; i++) {
2300 LLVMValueRef *out_ptr =
2301 ctx->radeon_bld.soa.outputs[i];
2302 int param_index;
2303
2304 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2305 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2306 continue;
2307
2308 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2309 info->output_semantic_index[i]);
2310
2311 for (chan = 0; chan < 4; chan++) {
2312 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2313 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2314
2315 build_tbuffer_store(ctx,
2316 ctx->esgs_ring,
2317 out_val, 1,
2318 LLVMGetUndef(ctx->i32), soffset,
2319 (4 * param_index + chan) * 4,
2320 V_008F0C_BUF_DATA_FORMAT_32,
2321 V_008F0C_BUF_NUM_FORMAT_UINT,
2322 0, 0, 1, 1, 0);
2323 }
2324 }
2325 }
2326
2327 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2328 {
2329 struct si_shader_context *ctx = si_shader_context(bld_base);
2330 struct gallivm_state *gallivm = bld_base->base.gallivm;
2331 LLVMValueRef args[2];
2332
2333 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2334 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2335 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2336 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
2337 }
2338
2339 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2340 {
2341 struct si_shader_context *ctx = si_shader_context(bld_base);
2342 struct gallivm_state *gallivm = bld_base->base.gallivm;
2343 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2344 struct si_shader_output_values *outputs = NULL;
2345 int i,j;
2346
2347 assert(!ctx->is_gs_copy_shader);
2348
2349 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2350
2351 /* Vertex color clamping.
2352 *
2353 * This uses a state constant loaded in a user data SGPR and
2354 * an IF statement is added that clamps all colors if the constant
2355 * is true.
2356 */
2357 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
2358 struct lp_build_if_state if_ctx;
2359 LLVMValueRef cond = NULL;
2360 LLVMValueRef addr, val;
2361
2362 for (i = 0; i < info->num_outputs; i++) {
2363 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2364 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2365 continue;
2366
2367 /* We've found a color. */
2368 if (!cond) {
2369 /* The state is in the first bit of the user SGPR. */
2370 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2371 SI_PARAM_VS_STATE_BITS);
2372 cond = LLVMBuildTrunc(gallivm->builder, cond,
2373 ctx->i1, "");
2374 lp_build_if(&if_ctx, gallivm, cond);
2375 }
2376
2377 for (j = 0; j < 4; j++) {
2378 addr = ctx->radeon_bld.soa.outputs[i][j];
2379 val = LLVMBuildLoad(gallivm->builder, addr, "");
2380 val = radeon_llvm_saturate(bld_base, val);
2381 LLVMBuildStore(gallivm->builder, val, addr);
2382 }
2383 }
2384
2385 if (cond)
2386 lp_build_endif(&if_ctx);
2387 }
2388
2389 for (i = 0; i < info->num_outputs; i++) {
2390 outputs[i].name = info->output_semantic_name[i];
2391 outputs[i].sid = info->output_semantic_index[i];
2392
2393 for (j = 0; j < 4; j++)
2394 outputs[i].values[j] =
2395 LLVMBuildLoad(gallivm->builder,
2396 ctx->radeon_bld.soa.outputs[i][j],
2397 "");
2398 }
2399
2400 if (ctx->is_monolithic) {
2401 /* Export PrimitiveID when PS needs it. */
2402 if (si_vs_exports_prim_id(ctx->shader)) {
2403 outputs[i].name = TGSI_SEMANTIC_PRIMID;
2404 outputs[i].sid = 0;
2405 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2406 get_primitive_id(bld_base, 0));
2407 outputs[i].values[1] = bld_base->base.undef;
2408 outputs[i].values[2] = bld_base->base.undef;
2409 outputs[i].values[3] = bld_base->base.undef;
2410 i++;
2411 }
2412 } else {
2413 /* Return the primitive ID from the LLVM function. */
2414 ctx->return_value =
2415 LLVMBuildInsertValue(gallivm->builder,
2416 ctx->return_value,
2417 bitcast(bld_base, TGSI_TYPE_FLOAT,
2418 get_primitive_id(bld_base, 0)),
2419 VS_EPILOG_PRIMID_LOC, "");
2420 }
2421
2422 si_llvm_export_vs(bld_base, outputs, i);
2423 FREE(outputs);
2424 }
2425
2426 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2427 LLVMValueRef depth, LLVMValueRef stencil,
2428 LLVMValueRef samplemask)
2429 {
2430 struct si_shader_context *ctx = si_shader_context(bld_base);
2431 struct lp_build_context *base = &bld_base->base;
2432 struct lp_build_context *uint = &bld_base->uint_bld;
2433 LLVMValueRef args[9];
2434 unsigned mask = 0;
2435
2436 assert(depth || stencil || samplemask);
2437
2438 args[1] = uint->one; /* whether the EXEC mask is valid */
2439 args[2] = uint->one; /* DONE bit */
2440
2441 /* Specify the target we are exporting */
2442 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2443
2444 args[4] = uint->zero; /* COMP flag */
2445 args[5] = base->undef; /* R, depth */
2446 args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2447 args[7] = base->undef; /* B, sample mask */
2448 args[8] = base->undef; /* A, alpha to mask */
2449
2450 if (depth) {
2451 args[5] = depth;
2452 mask |= 0x1;
2453 }
2454
2455 if (stencil) {
2456 args[6] = stencil;
2457 mask |= 0x2;
2458 }
2459
2460 if (samplemask) {
2461 args[7] = samplemask;
2462 mask |= 0x4;
2463 }
2464
2465 /* SI (except OLAND) has a bug that it only looks
2466 * at the X writemask component. */
2467 if (ctx->screen->b.chip_class == SI &&
2468 ctx->screen->b.family != CHIP_OLAND)
2469 mask |= 0x1;
2470
2471 /* Specify which components to enable */
2472 args[0] = lp_build_const_int32(base->gallivm, mask);
2473
2474 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2475 ctx->voidt, args, 9, 0);
2476 }
2477
/* Emit the color export(s) for one PS color output.
 *
 * Applies the epilog key's post-processing first (clamp, alpha-to-one,
 * alpha test, smoothing), then either broadcasts color0 to all enabled
 * color buffers (FS_COLOR0_WRITES_ALL_CBUFS) or exports to the single
 * MRT given by \p index.  When \p is_last is set, the final emitted
 * export carries the DONE bit.
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is.
		 * A writemask of uint zero marks a NULL export. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
					   ctx->voidt, args[c], 9, 0);
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, args, 9, 0);
	}
}
2546
2547 static void si_export_null(struct lp_build_tgsi_context *bld_base)
2548 {
2549 struct si_shader_context *ctx = si_shader_context(bld_base);
2550 struct lp_build_context *base = &bld_base->base;
2551 struct lp_build_context *uint = &bld_base->uint_bld;
2552 LLVMValueRef args[9];
2553
2554 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
2555 args[1] = uint->one; /* whether the EXEC mask is valid */
2556 args[2] = uint->one; /* DONE bit */
2557 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2558 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
2559 args[5] = uint->undef; /* R */
2560 args[6] = uint->undef; /* G */
2561 args[7] = uint->undef; /* B */
2562 args[8] = uint->undef; /* A */
2563
2564 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2565 ctx->voidt, args, 9, 0);
2566 }
2567
/* Monolithic pixel shader epilogue: read all PS outputs and emit the
 * color exports (via si_export_mrt_color) and the MRTZ export (via
 * si_export_mrt_z).  Exactly one export in the shader must carry the
 * DONE bit, so the last color export is determined up front; when no
 * color buffer is written and there is no MRTZ data, a NULL export is
 * emitted instead.
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* Each color buffer has 4 format bits in spi_format. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z component of gl_FragDepth is used. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
}
2656
/**
 * Return PS outputs in this order:
 *
 * v[0:3] = color0.xyzw
 * v[4:7] = color1.xyzw
 * ...
 * vN+0 = Depth
 * vN+1 = Stencil
 * vN+2 = SampleMask
 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
 *
 * The alpha-ref SGPR is returned via its original location.
 *
 * Used by non-monolithic pixel shaders: the main part returns its
 * outputs through ctx->return_value so the separately-compiled PS
 * epilog receives them as parameters.
 */
static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	unsigned i, j, first_vgpr, vgpr;

	LLVMValueRef color[8][4] = {};
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	LLVMValueRef ret;

	/* Read the output values. */
	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];

		switch (semantic_name) {
		case TGSI_SEMANTIC_COLOR:
			assert(semantic_index < 8);
			for (j = 0; j < 4; j++) {
				LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
				LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
				color[semantic_index][j] = result;
			}
			break;
		case TGSI_SEMANTIC_POSITION:
			/* Only the Z component of gl_FragDepth is used. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		default:
			fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	/* Fill the return structure. */
	ret = ctx->return_value;

	/* Set SGPRs. */
	ret = LLVMBuildInsertValue(builder, ret,
				   bitcast(bld_base, TGSI_TYPE_SIGNED,
					   LLVMGetParam(ctx->radeon_bld.main_fn,
							SI_PARAM_ALPHA_REF)),
				   SI_SGPR_ALPHA_REF, "");

	/* Set VGPRs */
	first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
	for (i = 0; i < ARRAY_SIZE(color); i++) {
		/* Unwritten colors are skipped, packing the rest densely. */
		if (!color[i][0])
			continue;

		for (j = 0; j < 4; j++)
			ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
	}
	if (depth)
		ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
	if (stencil)
		ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
	if (samplemask)
		ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

	/* Add the input sample mask for smoothing at the end. */
	if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
		vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
	ret = LLVMBuildInsertValue(builder, ret,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");

	ctx->return_value = ret;
}
2750
2751 /**
2752 * Given a v8i32 resource descriptor for a buffer, extract the size of the
2753 * buffer in number of elements and return it as an i32.
2754 */
2755 static LLVMValueRef get_buffer_size(
2756 struct lp_build_tgsi_context *bld_base,
2757 LLVMValueRef descriptor)
2758 {
2759 struct si_shader_context *ctx = si_shader_context(bld_base);
2760 struct gallivm_state *gallivm = bld_base->base.gallivm;
2761 LLVMBuilderRef builder = gallivm->builder;
2762 LLVMValueRef size =
2763 LLVMBuildExtractElement(builder, descriptor,
2764 lp_build_const_int32(gallivm, 6), "");
2765
2766 if (ctx->screen->b.chip_class >= VI) {
2767 /* On VI, the descriptor contains the size in bytes,
2768 * but TXQ must return the size in elements.
2769 * The stride is always non-zero for resources using TXQ.
2770 */
2771 LLVMValueRef stride =
2772 LLVMBuildExtractElement(builder, descriptor,
2773 lp_build_const_int32(gallivm, 5), "");
2774 stride = LLVMBuildLShr(builder, stride,
2775 lp_build_const_int32(gallivm, 16), "");
2776 stride = LLVMBuildAnd(builder, stride,
2777 lp_build_const_int32(gallivm, 0x3FFF), "");
2778
2779 size = LLVMBuildUDiv(builder, size, stride, "");
2780 }
2781
2782 return size;
2783 }
2784
2785 /**
2786 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
2787 * intrinsic names).
2788 */
2789 static void build_int_type_name(
2790 LLVMTypeRef type,
2791 char *buf, unsigned bufsize)
2792 {
2793 assert(bufsize >= 6);
2794
2795 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
2796 snprintf(buf, bufsize, "v%ui32",
2797 LLVMGetVectorSize(type));
2798 else
2799 strcpy(buf, "i32");
2800 }
2801
2802 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
2803 struct lp_build_tgsi_context *bld_base,
2804 struct lp_build_emit_data *emit_data);
2805
2806 /* Prevent optimizations (at least of memory accesses) across the current
2807 * point in the program by emitting empty inline assembly that is marked as
2808 * having side effects.
2809 */
2810 static void emit_optimization_barrier(struct si_shader_context *ctx)
2811 {
2812 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2813 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
2814 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
2815 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
2816 }
2817
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	/* memoryBarrier only makes guarantees about atomics and coherent
	 * image accesses (which bypass TC L1), so no special cache handling
	 * is needed here.
	 *
	 * We do, however, have to keep LLVM from re-ordering loads across
	 * the barrier point. */
	emit_optimization_barrier(si_shader_context(bld_base));
}
2834
2835 static LLVMValueRef
2836 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
2837 const struct tgsi_full_src_register *reg)
2838 {
2839 LLVMValueRef ind_index;
2840 LLVMValueRef rsrc_ptr;
2841
2842 if (!reg->Register.Indirect)
2843 return ctx->shader_buffers[reg->Register.Index];
2844
2845 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
2846 reg->Register.Index,
2847 SI_NUM_SHADER_BUFFERS);
2848
2849 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
2850 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
2851 }
2852
2853 static bool tgsi_is_array_sampler(unsigned target)
2854 {
2855 return target == TGSI_TEXTURE_1D_ARRAY ||
2856 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
2857 target == TGSI_TEXTURE_2D_ARRAY ||
2858 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
2859 target == TGSI_TEXTURE_CUBE_ARRAY ||
2860 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
2861 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2862 }
2863
2864 static bool tgsi_is_array_image(unsigned target)
2865 {
2866 return target == TGSI_TEXTURE_3D ||
2867 target == TGSI_TEXTURE_CUBE ||
2868 target == TGSI_TEXTURE_1D_ARRAY ||
2869 target == TGSI_TEXTURE_2D_ARRAY ||
2870 target == TGSI_TEXTURE_CUBE_ARRAY ||
2871 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2872 }
2873
2874 /**
2875 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
2876 *
2877 * At least on Tonga, executing image stores on images with DCC enabled and
2878 * non-trivial can eventually lead to lockups. This can occur when an
2879 * application binds an image as read-only but then uses a shader that writes
2880 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
2881 * program termination) in this case, but it doesn't cost much to be a bit
2882 * nicer: disabling DCC in the shader still leads to undefined results but
2883 * avoids the lockup.
2884 */
2885 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
2886 LLVMValueRef rsrc)
2887 {
2888 if (ctx->screen->b.chip_class <= CIK) {
2889 return rsrc;
2890 } else {
2891 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2892 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
2893 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
2894 LLVMValueRef tmp;
2895
2896 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
2897 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
2898 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
2899 }
2900 }
2901
/**
 * Load the resource descriptor for \p image.
 *
 * \param bld_base  TGSI-to-LLVM translation context
 * \param image     the TGSI_FILE_IMAGE source register
 * \param dcc_off   if true, clear the DCC-enable bit in the descriptor;
 *                  NOTE(review): only applied on the indirect path — the
 *                  direct path returns the preloaded descriptor unmodified
 * \param rsrc      receives the descriptor value
 */
static void
image_fetch_rsrc(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *image,
	bool dcc_off,
	LLVMValueRef *rsrc)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	assert(image->Register.File == TGSI_FILE_IMAGE);

	if (!image->Register.Indirect) {
		/* Fast path: use preloaded resources */
		*rsrc = ctx->images[image->Register.Index];
	} else {
		/* Indexing and manual load */
		LLVMValueRef ind_index;
		LLVMValueRef rsrc_ptr;
		LLVMValueRef tmp;

		/* From the GL_ARB_shader_image_load_store extension spec:
		 *
		 *    If a shader performs an image load, store, or atomic
		 *    operation using an image variable declared as an array,
		 *    and if the index used to select an individual element is
		 *    negative or greater than or equal to the size of the
		 *    array, the results of the operation are undefined but may
		 *    not lead to termination.
		 *
		 * Hence get_bounded_indirect_index, which clamps the index.
		 */
		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
						       image->Register.Index,
						       SI_NUM_IMAGES);

		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
		if (dcc_off)
			tmp = force_dcc_off(ctx, tmp);
		*rsrc = tmp;
	}
}
2945
2946 static LLVMValueRef image_fetch_coords(
2947 struct lp_build_tgsi_context *bld_base,
2948 const struct tgsi_full_instruction *inst,
2949 unsigned src)
2950 {
2951 struct gallivm_state *gallivm = bld_base->base.gallivm;
2952 LLVMBuilderRef builder = gallivm->builder;
2953 unsigned target = inst->Memory.Texture;
2954 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
2955 LLVMValueRef coords[4];
2956 LLVMValueRef tmp;
2957 int chan;
2958
2959 for (chan = 0; chan < num_coords; ++chan) {
2960 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
2961 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
2962 coords[chan] = tmp;
2963 }
2964
2965 if (num_coords == 1)
2966 return coords[0];
2967
2968 if (num_coords == 3) {
2969 /* LLVM has difficulties lowering 3-element vectors. */
2970 coords[3] = bld_base->uint_bld.undef;
2971 num_coords = 4;
2972 }
2973
2974 return lp_build_gather_values(gallivm, coords, num_coords);
2975 }
2976
/**
 * Append the extra mode bits that are used by image load and store.
 *
 * Appends r128, da and (for non-atomics) glc, then slc, in the argument
 * order expected by the llvm.amdgcn.image.* intrinsics.
 */
static void image_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data * emit_data,
		unsigned target,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
	emit_data->args[emit_data->arg_count++] =
		tgsi_is_array_image(target) ? i1true : i1false; /* da */
	if (!atomic) {
		/* Set glc for coherent or volatile accesses. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3000
3001 /**
3002 * Given a 256 bit resource, extract the top half (which stores the buffer
3003 * resource in the case of textures and images).
3004 */
3005 static LLVMValueRef extract_rsrc_top_half(
3006 struct si_shader_context *ctx,
3007 LLVMValueRef rsrc)
3008 {
3009 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3010 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3011 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3012
3013 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3014 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3015 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3016
3017 return rsrc;
3018 }
3019
/**
 * Append the resource and indexing arguments for buffer intrinsics.
 *
 * \param rsrc the v4i32 buffer resource
 * \param index index into the buffer (stride-based)
 * \param offset byte offset into the buffer
 * \param atomic true for atomics, which take no glc argument
 */
static void buffer_append_args(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data,
		LLVMValueRef rsrc,
		LLVMValueRef index,
		LLVMValueRef offset,
		bool atomic)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);

	emit_data->args[emit_data->arg_count++] = rsrc;
	emit_data->args[emit_data->arg_count++] = index; /* vindex */
	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
	if (!atomic) {
		/* Set glc for coherent or volatile accesses. */
		emit_data->args[emit_data->arg_count++] =
			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
			i1true : i1false; /* glc */
	}
	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
}
3049
/* Gather the intrinsic arguments for a LOAD from an SSBO
 * (TGSI_FILE_BUFFER) or an image source. */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	/* Loads produce a vec4 result. */
	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images take the buffer path; the buffer
			 * resource is the top half of the descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3094
3095 static void load_emit_buffer(struct si_shader_context *ctx,
3096 struct lp_build_emit_data *emit_data)
3097 {
3098 const struct tgsi_full_instruction *inst = emit_data->inst;
3099 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3100 LLVMBuilderRef builder = gallivm->builder;
3101 uint writemask = inst->Dst[0].Register.WriteMask;
3102 uint count = util_last_bit(writemask);
3103 const char *intrinsic_name;
3104 LLVMTypeRef dst_type;
3105
3106 switch (count) {
3107 case 1:
3108 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3109 dst_type = ctx->f32;
3110 break;
3111 case 2:
3112 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3113 dst_type = LLVMVectorType(ctx->f32, 2);
3114 break;
3115 default: // 3 & 4
3116 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3117 dst_type = ctx->v4f32;
3118 count = 4;
3119 }
3120
3121 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3122 builder, intrinsic_name, dst_type,
3123 emit_data->args, emit_data->arg_count,
3124 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3125 }
3126
3127 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3128 const struct tgsi_full_instruction *inst,
3129 LLVMTypeRef type, int arg)
3130 {
3131 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3132 LLVMBuilderRef builder = gallivm->builder;
3133 LLVMValueRef offset, ptr;
3134 int addr_space;
3135
3136 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3137 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3138
3139 ptr = ctx->shared_memory;
3140 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3141 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3142 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3143
3144 return ptr;
3145 }
3146
3147 static void load_emit_memory(
3148 struct si_shader_context *ctx,
3149 struct lp_build_emit_data *emit_data)
3150 {
3151 const struct tgsi_full_instruction *inst = emit_data->inst;
3152 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3153 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3154 LLVMBuilderRef builder = gallivm->builder;
3155 unsigned writemask = inst->Dst[0].Register.WriteMask;
3156 LLVMValueRef channels[4], ptr, derived_ptr, index;
3157 int chan;
3158
3159 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3160
3161 for (chan = 0; chan < 4; ++chan) {
3162 if (!(writemask & (1 << chan))) {
3163 channels[chan] = LLVMGetUndef(base->elem_type);
3164 continue;
3165 }
3166
3167 index = lp_build_const_int32(gallivm, chan);
3168 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3169 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3170 }
3171 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3172 }
3173
/* Emit a LOAD from shared memory, an SSBO, or an image. */
static void load_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[32];
	char coords_type[8];

	/* Shared memory uses plain LLVM loads. */
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		load_emit_memory(ctx, emit_data);
		return;
	}

	/* Keep volatile loads from being reordered with other accesses. */
	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
		emit_optimization_barrier(ctx);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		load_emit_buffer(ctx, emit_data);
		return;
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	} else {
		/* The image.load intrinsic name is suffixed with the type of
		 * its coordinate operand (args[0]), e.g. ".v4i32". */
		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
				    coords_type, sizeof(coords_type));

		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.load.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
	}
}
3219
/* Gather the intrinsic arguments for a STORE.
 *
 * The vec4 of source data (Src[1]) always becomes args[0]; the resource
 * and addressing arguments follow, depending on whether the destination
 * is an SSBO or an image.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	/* Stores return nothing. */
	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* Gather the four channels of the value to store into a vec4. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* View the destination register as a source so the resource fetch
	 * helpers can be reused. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Src[0].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[1] = coords;
			/* dcc_off=true: see force_dcc_off for why stores must
			 * not go through DCC. */
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3278
/* Emit the buffer.store intrinsics for an SSBO STORE, splitting the vec4
 * by writemask into runs of consecutive channels.
 *
 * args[0] holds the vec4 source data and args[3] the base byte offset (as
 * set up by store_fetch_args); both are rewritten for each emitted
 * intrinsic call.
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	/* One store per consecutive run of enabled channels. */
	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			/* Push the third channel back for the next pass. */
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Assemble a v2f32 from channels start and start+1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first channel. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMNoUnwindAttribute);
	}
}
3351
3352 static void store_emit_memory(
3353 struct si_shader_context *ctx,
3354 struct lp_build_emit_data *emit_data)
3355 {
3356 const struct tgsi_full_instruction *inst = emit_data->inst;
3357 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3358 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3359 LLVMBuilderRef builder = gallivm->builder;
3360 unsigned writemask = inst->Dst[0].Register.WriteMask;
3361 LLVMValueRef ptr, derived_ptr, data, index;
3362 int chan;
3363
3364 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3365
3366 for (chan = 0; chan < 4; ++chan) {
3367 if (!(writemask & (1 << chan))) {
3368 continue;
3369 }
3370 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3371 index = lp_build_const_int32(gallivm, chan);
3372 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3373 LLVMBuildStore(builder, data, derived_ptr);
3374 }
3375 }
3376
/* Emit a STORE to an SSBO, shared memory, or an image. */
static void store_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	char intrinsic_name[32];
	char coords_type[8];

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		store_emit_buffer(si_shader_context(bld_base), emit_data);
		return;
	} else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
		store_emit_memory(si_shader_context(bld_base), emit_data);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, "llvm.amdgcn.buffer.store.format.v4f32",
			emit_data->dst_type, emit_data->args, emit_data->arg_count,
			LLVMNoUnwindAttribute);
	} else {
		/* The image.store intrinsic name is suffixed with the type
		 * of the coordinate operand (args[1]). */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.store.%s", coords_type);

		emit_data->output[emit_data->chan] =
			lp_build_intrinsic(
				builder, intrinsic_name, emit_data->dst_type,
				emit_data->args, emit_data->arg_count,
				LLVMNoUnwindAttribute);
	}
}
3415
/* Gather the intrinsic arguments for an atomic on an SSBO or image.
 *
 * For ATOMCAS the comparison value is appended *before* the source data,
 * matching the intrinsic argument order (reversed relative to TGSI/GLSL).
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	/* Src[2].x is the source data, reinterpreted as an integer. */
	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		/* Src[3].x is the value to swap in on a matching compare. */
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		/* dcc_off for non-buffer images: see force_dcc_off. */
		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
3475
3476 static void atomic_emit_memory(struct si_shader_context *ctx,
3477 struct lp_build_emit_data *emit_data) {
3478 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3479 LLVMBuilderRef builder = gallivm->builder;
3480 const struct tgsi_full_instruction * inst = emit_data->inst;
3481 LLVMValueRef ptr, result, arg;
3482
3483 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3484
3485 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3486 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3487
3488 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3489 LLVMValueRef new_data;
3490 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
3491 inst, 3, 0);
3492
3493 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3494
3495 #if HAVE_LLVM >= 0x309
3496 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3497 LLVMAtomicOrderingSequentiallyConsistent,
3498 LLVMAtomicOrderingSequentiallyConsistent,
3499 false);
3500 #endif
3501
3502 result = LLVMBuildExtractValue(builder, result, 0, "");
3503 } else {
3504 LLVMAtomicRMWBinOp op;
3505
3506 switch(inst->Instruction.Opcode) {
3507 case TGSI_OPCODE_ATOMUADD:
3508 op = LLVMAtomicRMWBinOpAdd;
3509 break;
3510 case TGSI_OPCODE_ATOMXCHG:
3511 op = LLVMAtomicRMWBinOpXchg;
3512 break;
3513 case TGSI_OPCODE_ATOMAND:
3514 op = LLVMAtomicRMWBinOpAnd;
3515 break;
3516 case TGSI_OPCODE_ATOMOR:
3517 op = LLVMAtomicRMWBinOpOr;
3518 break;
3519 case TGSI_OPCODE_ATOMXOR:
3520 op = LLVMAtomicRMWBinOpXor;
3521 break;
3522 case TGSI_OPCODE_ATOMUMIN:
3523 op = LLVMAtomicRMWBinOpUMin;
3524 break;
3525 case TGSI_OPCODE_ATOMUMAX:
3526 op = LLVMAtomicRMWBinOpUMax;
3527 break;
3528 case TGSI_OPCODE_ATOMIMIN:
3529 op = LLVMAtomicRMWBinOpMin;
3530 break;
3531 case TGSI_OPCODE_ATOMIMAX:
3532 op = LLVMAtomicRMWBinOpMax;
3533 break;
3534 default:
3535 unreachable("unknown atomic opcode");
3536 }
3537
3538 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
3539 LLVMAtomicOrderingSequentiallyConsistent,
3540 false);
3541 }
3542 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
3543 }
3544
/* Emit an atomic on shared memory, an SSBO, or an image. */
static void atomic_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	char intrinsic_name[40];
	LLVMValueRef tmp;

	/* Shared memory uses native LLVM atomics instead of intrinsics. */
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
		atomic_emit_memory(ctx, emit_data);
		return;
	}

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
	} else {
		char coords_type[8];

		/* Image atomics are additionally suffixed with the type of
		 * the coordinate operand (args[1]). */
		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
				    coords_type, sizeof(coords_type));
		snprintf(intrinsic_name, sizeof(intrinsic_name),
			 "llvm.amdgcn.image.atomic.%s.%s",
			 action->intr_name, coords_type);
	}

	/* The intrinsic result is an i32; bitcast it to the base (float)
	 * element type for the TGSI destination. */
	tmp = lp_build_intrinsic(
		builder, intrinsic_name, bld_base->uint_bld.elem_type,
		emit_data->args, emit_data->arg_count,
		LLVMNoUnwindAttribute);
	emit_data->output[emit_data->chan] =
		LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
}
3583
/* Gather the arguments for RESQ (resource size query). */
static void resq_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const struct tgsi_full_src_register *reg = &inst->Src[0];

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (reg->Register.File == TGSI_FILE_BUFFER) {
		/* SSBO: only the resource is needed; resq_emit reads the
		 * size straight out of the descriptor. */
		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
		emit_data->arg_count = 1;
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* Buffer image: resq_emit uses get_buffer_size. */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
		emit_data->arg_count = 1;
	} else {
		/* Full argument list for llvm.SI.getresinfo.i32. */
		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
		emit_data->arg_count = 10;
	}
}
3616
/* Emit RESQ: buffer size, buffer-image size, or image dimensions. */
static void resq_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	LLVMValueRef out;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* SSBO: the size is dword 2 of the v4i32 resource. */
		out = LLVMBuildExtractElement(builder, emit_data->args[0],
					      lp_build_const_int32(gallivm, 2), "");
	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		out = get_buffer_size(bld_base, emit_data->args[0]);
	} else {
		out = lp_build_intrinsic(
			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);

		/* Divide the number of layers by 6 to get the number of cubes. */
		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);

			/* The layer count is in component z (index 2). */
			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
			z = LLVMBuildSDiv(builder, z, imm6, "");
			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
		}
	}

	emit_data->output[emit_data->chan] = out;
}
3653
/* Assemble the standard argument list for texture intrinsics:
 * coords, resource, (sampler), dmask, unorm, r128, da, glc, slc, tfe, lwe.
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF/TXQ take no sampler argument and return integers; all other
	 * opcodes add the sampler and return floats. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
							   tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
3699
3700 static const struct lp_build_tgsi_action tex_action;
3701
/* Kind of descriptor to load from a combined image/sampler descriptor
 * list; see get_sampler_desc_custom for the slot layout. */
enum desc_type {
	DESC_IMAGE,	/* image view, dwords [0:7] of the slot */
	DESC_FMASK,	/* FMASK view, dwords [8:15] */
	DESC_SAMPLER	/* sampler state, dwords [12:15] */
};
3707
/* Return the type "pointer (constant address space) to an array of
 * \p num_elements values of \p elem_type" (0 = unsized array). */
static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
{
	return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
			       CONST_ADDR_SPACE);
}
3713
/**
 * Load an image view, fmask view, or sampler state descriptor from the
 * descriptor list \p list at logical index \p index.
 *
 * Each 16-dword slot holds the image view in dwords [0:7], the FMASK view
 * in [8:15], and the sampler state in [12:15]; the index is scaled and
 * offset accordingly before the load.
 */
static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
					    LLVMValueRef list, LLVMValueRef index,
					    enum desc_type type)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;

	switch (type) {
	case DESC_IMAGE:
		/* The image is at [0:7]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		break;
	case DESC_FMASK:
		/* The FMASK is at [8:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
		break;
	case DESC_SAMPLER:
		/* The sampler state is at [12:15]. */
		index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
		index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
		/* Reinterpret the list with v4i32-sized elements so the
		 * scaled index addresses the sampler words. */
		list = LLVMBuildPointerCast(builder, list,
					    const_array(ctx->v4i32, 0), "");
		break;
	}

	return build_indexed_load_const(ctx, list, index);
}
3745
3746 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
3747 LLVMValueRef index, enum desc_type type)
3748 {
3749 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
3750 SI_PARAM_SAMPLERS);
3751
3752 return get_sampler_desc_custom(ctx, list, index, type);
3753 }
3754
3755 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
3756 *
3757 * SI-CI:
3758 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
3759 * filtering manually. The driver sets img7 to a mask clearing
3760 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
3761 * s_and_b32 samp0, samp0, img7
3762 *
3763 * VI:
3764 * The ANISO_OVERRIDE sampler field enables this fix in TA.
3765 */
3766 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
3767 LLVMValueRef res, LLVMValueRef samp)
3768 {
3769 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3770 LLVMValueRef img7, samp0;
3771
3772 if (ctx->screen->b.chip_class >= VI)
3773 return samp;
3774
3775 img7 = LLVMBuildExtractElement(builder, res,
3776 LLVMConstInt(ctx->i32, 7, 0), "");
3777 samp0 = LLVMBuildExtractElement(builder, samp,
3778 LLVMConstInt(ctx->i32, 0, 0), "");
3779 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
3780 return LLVMBuildInsertElement(builder, samp, samp0,
3781 LLVMConstInt(ctx->i32, 0, 0), "");
3782 }
3783
3784 static void tex_fetch_ptrs(
3785 struct lp_build_tgsi_context *bld_base,
3786 struct lp_build_emit_data *emit_data,
3787 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
3788 {
3789 struct si_shader_context *ctx = si_shader_context(bld_base);
3790 const struct tgsi_full_instruction *inst = emit_data->inst;
3791 unsigned target = inst->Texture.Texture;
3792 unsigned sampler_src;
3793 unsigned sampler_index;
3794
3795 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
3796 sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
3797
3798 if (emit_data->inst->Src[sampler_src].Register.Indirect) {
3799 const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
3800 LLVMValueRef ind_index;
3801
3802 ind_index = get_bounded_indirect_index(ctx,
3803 &reg->Indirect,
3804 reg->Register.Index,
3805 SI_NUM_USER_SAMPLERS);
3806
3807 *res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);
3808
3809 if (target == TGSI_TEXTURE_2D_MSAA ||
3810 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
3811 *samp_ptr = NULL;
3812 *fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
3813 } else {
3814 *samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
3815 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
3816 *fmask_ptr = NULL;
3817 }
3818 } else {
3819 *res_ptr = ctx->sampler_views[sampler_index];
3820 *samp_ptr = ctx->sampler_states[sampler_index];
3821 *fmask_ptr = ctx->fmasks[sampler_index];
3822 }
3823 }
3824
/**
 * Assemble the operands of a texture instruction into emit_data->args
 * (via set_tex_fetch_args) for build_tex_intrinsic to consume.
 *
 * The address vector is packed in hardware operand order: offsets,
 * LOD bias, depth-compare reference, derivatives, coordinates, then
 * LOD/sample index. TXQ and buffer targets take early-out paths with
 * their own argument layouts. For MSAA targets the sample index is
 * remapped through the FMASK; for TG4 the dmask selects the gathered
 * component.
 */
static void tex_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	unsigned target = inst->Texture.Texture;
	LLVMValueRef coords[5], derivs[6];
	LLVMValueRef address[16];
	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
	unsigned count = 0;
	unsigned chan;
	unsigned num_deriv_channels = 0;
	bool has_offset = inst->Texture.NumOffsets > 0;
	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
	unsigned dmask = 0xf;

	tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);

	if (opcode == TGSI_OPCODE_TXQ) {
		if (target == TGSI_TEXTURE_BUFFER) {
			/* Read the size from the buffer descriptor directly. */
			LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
			emit_data->args[0] = get_buffer_size(bld_base, res);
			return;
		}

		/* Textures - set the mip level. */
		address[count++] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);

		set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
				   NULL, address, count, 0xf);
		return;
	}

	if (target == TGSI_TEXTURE_BUFFER) {
		LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);

		/* Bitcast and truncate v8i32 to v16i8: keep only the second
		 * half of the descriptor. */
		LLVMValueRef res = res_ptr;
		res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
		res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
		res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");

		/* Buffer fetch args: resource, vertex index 0, element index. */
		emit_data->dst_type = ctx->v4f32;
		emit_data->args[0] = res;
		emit_data->args[1] = bld_base->uint_bld.zero;
		emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
		emit_data->arg_count = 3;
		return;
	}

	/* Fetch and project texture coordinates (TXP divides x/y/z by w). */
	coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
	for (chan = 0; chan < 3; chan++ ) {
		coords[chan] = lp_build_emit_fetch(bld_base,
						   emit_data->inst, 0,
						   chan);
		if (opcode == TGSI_OPCODE_TXP)
			coords[chan] = lp_build_emit_llvm_binary(bld_base,
								 TGSI_OPCODE_DIV,
								 coords[chan],
								 coords[3]);
	}

	if (opcode == TGSI_OPCODE_TXP)
		coords[3] = bld_base->base.one;

	/* Pack offsets. TXF offsets are applied to the address later instead. */
	if (has_offset && opcode != TGSI_OPCODE_TXF) {
		/* The offsets are six-bit signed integers packed like this:
		 * X=[5:0], Y=[13:8], and Z=[21:16].
		 */
		LLVMValueRef offset[3], pack;

		assert(inst->Texture.NumOffsets == 1);

		for (chan = 0; chan < 3; chan++) {
			offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
								     emit_data->inst, 0, chan);
			offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
						    lp_build_const_int32(gallivm, 0x3f), "");
			if (chan)
				offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
							    lp_build_const_int32(gallivm, chan*8), "");
		}

		pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
		pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
		address[count++] = pack;
	}

	/* Pack LOD bias value (in coord.w for TXB, in src1.x for TXB2). */
	if (opcode == TGSI_OPCODE_TXB)
		address[count++] = coords[3];
	if (opcode == TGSI_OPCODE_TXB2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	/* Pack depth comparison value */
	if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
		if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			/* Shadow cube arrays ran out of coord channels;
			 * the reference value comes from src1.x. */
			address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
		} else {
			assert(ref_pos >= 0);
			address[count++] = coords[ref_pos];
		}
	}

	/* Pack user derivatives (TXD sources 1 and 2) */
	if (opcode == TGSI_OPCODE_TXD) {
		int param, num_src_deriv_channels;

		switch (target) {
		case TGSI_TEXTURE_3D:
			num_src_deriv_channels = 3;
			num_deriv_channels = 3;
			break;
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_SHADOW2D:
		case TGSI_TEXTURE_RECT:
		case TGSI_TEXTURE_SHADOWRECT:
		case TGSI_TEXTURE_2D_ARRAY:
		case TGSI_TEXTURE_SHADOW2D_ARRAY:
			num_src_deriv_channels = 2;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_CUBE:
		case TGSI_TEXTURE_SHADOWCUBE:
		case TGSI_TEXTURE_CUBE_ARRAY:
		case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
			/* Cube derivatives will be converted to 2D. */
			num_src_deriv_channels = 3;
			num_deriv_channels = 2;
			break;
		case TGSI_TEXTURE_1D:
		case TGSI_TEXTURE_SHADOW1D:
		case TGSI_TEXTURE_1D_ARRAY:
		case TGSI_TEXTURE_SHADOW1D_ARRAY:
			num_src_deriv_channels = 1;
			num_deriv_channels = 1;
			break;
		default:
			unreachable("invalid target");
		}

		for (param = 0; param < 2; param++)
			for (chan = 0; chan < num_src_deriv_channels; chan++)
				derivs[param * num_src_deriv_channels + chan] =
					lp_build_emit_fetch(bld_base, inst, param+1, chan);
	}

	/* Cube coords (and cube derivatives, for TXD) are rewritten to the
	 * 2D face-local form the hardware expects. */
	if (target == TGSI_TEXTURE_CUBE ||
	    target == TGSI_TEXTURE_CUBE_ARRAY ||
	    target == TGSI_TEXTURE_SHADOWCUBE ||
	    target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);

	if (opcode == TGSI_OPCODE_TXD)
		for (int i = 0; i < num_deriv_channels * 2; i++)
			address[count++] = derivs[i];

	/* Pack texture coordinates */
	address[count++] = coords[0];
	if (num_coords > 1)
		address[count++] = coords[1];
	if (num_coords > 2)
		address[count++] = coords[2];

	/* Pack LOD or sample index */
	if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
		address[count++] = coords[3];
	else if (opcode == TGSI_OPCODE_TXL2)
		address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);

	if (count > 16) {
		assert(!"Cannot handle more than 16 texture address parameters");
		count = 16;
	}

	/* The intrinsic takes the address as an integer vector. */
	for (chan = 0; chan < count; chan++ ) {
		address[chan] = LLVMBuildBitCast(gallivm->builder,
						 address[chan], ctx->i32, "");
	}

	/* Adjust the sample index according to FMASK.
	 *
	 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * For example, 0x11111100 means there are only 2 samples stored and
	 * the second sample covers 3/4 of the pixel. When reading samples 0
	 * and 1, return physical sample 0 (determined by the first two 0s
	 * in FMASK), otherwise return physical sample 1.
	 *
	 * The sample index should be adjusted as follows:
	 *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
	 */
	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		struct lp_build_context *uint_bld = &bld_base->uint_bld;
		struct lp_build_emit_data txf_emit_data = *emit_data;
		LLVMValueRef txf_address[4];
		unsigned txf_count = count;
		/* NOTE: deliberately shadows the outer "inst"; this is a
		 * synthesized TXF instruction used only for the FMASK read. */
		struct tgsi_full_instruction inst = {};

		memcpy(txf_address, address, sizeof(txf_address));

		/* Zero the array slice (2D only) and the sample index;
		 * the FMASK fetch addresses the base sample plane. */
		if (target == TGSI_TEXTURE_2D_MSAA) {
			txf_address[2] = bld_base->uint_bld.zero;
		}
		txf_address[3] = bld_base->uint_bld.zero;

		/* Read FMASK using TXF. */
		inst.Instruction.Opcode = TGSI_OPCODE_TXF;
		inst.Texture.Texture = target;
		txf_emit_data.inst = &inst;
		txf_emit_data.chan = 0;
		set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
				   target, fmask_ptr, NULL,
				   txf_address, txf_count, 0xf);
		build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);

		/* Initialize some constants. */
		LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
		LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);

		/* Apply the formula. */
		LLVMValueRef fmask =
			LLVMBuildExtractElement(gallivm->builder,
						txf_emit_data.output[0],
						uint_bld->zero, "");

		unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;

		LLVMValueRef sample_index4 =
			LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");

		LLVMValueRef shifted_fmask =
			LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");

		LLVMValueRef final_sample =
			LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");

		/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
		 * resource descriptor is 0 (invalid),
		 */
		LLVMValueRef fmask_desc =
			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
					 ctx->v8i32, "");

		LLVMValueRef fmask_word1 =
			LLVMBuildExtractElement(gallivm->builder, fmask_desc,
						uint_bld->one, "");

		LLVMValueRef word1_is_nonzero =
			LLVMBuildICmp(gallivm->builder, LLVMIntNE,
				      fmask_word1, uint_bld->zero, "");

		/* Replace the MSAA sample index. */
		address[sample_chan] =
			LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
					final_sample, address[sample_chan], "");
	}

	if (opcode == TGSI_OPCODE_TXF) {
		/* add tex offsets (TXF applies them directly to the integer
		 * address instead of packing them as an operand) */
		if (inst->Texture.NumOffsets) {
			struct lp_build_context *uint_bld = &bld_base->uint_bld;
			struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			assert(inst->Texture.NumOffsets == 1);

			/* Cases fall through from higher- to lower-dimension
			 * targets, adding one offset component per level. */
			switch (target) {
			case TGSI_TEXTURE_3D:
				address[2] = lp_build_add(uint_bld, address[2],
						bld->immediates[off->Index][off->SwizzleZ]);
				/* fall through */
			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				address[1] =
					lp_build_add(uint_bld, address[1],
						bld->immediates[off->Index][off->SwizzleY]);
				/* fall through */
			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				address[0] =
					lp_build_add(uint_bld, address[0],
						bld->immediates[off->Index][off->SwizzleX]);
				break;
				/* texture offsets do not apply to other texture targets */
			}
		}
	}

	if (opcode == TGSI_OPCODE_TG4) {
		unsigned gather_comp = 0;

		/* DMASK was repurposed for GATHER4. 4 components are always
		 * returned and DMASK works like a swizzle - it selects
		 * the component to fetch. The only valid DMASK values are
		 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
		 * (red,red,red,red) etc.) The ISA document doesn't mention
		 * this.
		 */

		/* Get the component index from src1.x for Gather4. */
		if (!tgsi_is_shadow_target(target)) {
			LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
			LLVMValueRef comp_imm;
			struct tgsi_src_register src1 = inst->Src[1].Register;

			assert(src1.File == TGSI_FILE_IMMEDIATE);

			comp_imm = imms[src1.Index][src1.SwizzleX];
			gather_comp = LLVMConstIntGetZExtValue(comp_imm);
			gather_comp = CLAMP(gather_comp, 0, 3);
		}

		dmask = 1 << gather_comp;
	}

	set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
			   samp_ptr, address, count, dmask);
}
4161
4162 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4163 struct lp_build_tgsi_context *bld_base,
4164 struct lp_build_emit_data *emit_data)
4165 {
4166 struct lp_build_context *base = &bld_base->base;
4167 unsigned opcode = emit_data->inst->Instruction.Opcode;
4168 unsigned target = emit_data->inst->Texture.Texture;
4169 char intr_name[127];
4170 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4171 bool is_shadow = tgsi_is_shadow_target(target);
4172 char type[64];
4173 const char *name = "llvm.SI.image.sample";
4174 const char *infix = "";
4175
4176 if (opcode == TGSI_OPCODE_TXQ && target == TGSI_TEXTURE_BUFFER) {
4177 /* Just return the buffer size. */
4178 emit_data->output[emit_data->chan] = emit_data->args[0];
4179 return;
4180 }
4181
4182 if (target == TGSI_TEXTURE_BUFFER) {
4183 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4184 base->gallivm->builder,
4185 "llvm.SI.vs.load.input", emit_data->dst_type,
4186 emit_data->args, emit_data->arg_count,
4187 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4188 return;
4189 }
4190
4191 switch (opcode) {
4192 case TGSI_OPCODE_TXF:
4193 name = target == TGSI_TEXTURE_2D_MSAA ||
4194 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4195 "llvm.SI.image.load" :
4196 "llvm.SI.image.load.mip";
4197 is_shadow = false;
4198 has_offset = false;
4199 break;
4200 case TGSI_OPCODE_TXQ:
4201 name = "llvm.SI.getresinfo";
4202 is_shadow = false;
4203 has_offset = false;
4204 break;
4205 case TGSI_OPCODE_LODQ:
4206 name = "llvm.SI.getlod";
4207 is_shadow = false;
4208 has_offset = false;
4209 break;
4210 case TGSI_OPCODE_TEX:
4211 case TGSI_OPCODE_TEX2:
4212 case TGSI_OPCODE_TXP:
4213 break;
4214 case TGSI_OPCODE_TXB:
4215 case TGSI_OPCODE_TXB2:
4216 infix = ".b";
4217 break;
4218 case TGSI_OPCODE_TXL:
4219 case TGSI_OPCODE_TXL2:
4220 infix = ".l";
4221 break;
4222 case TGSI_OPCODE_TXD:
4223 infix = ".d";
4224 break;
4225 case TGSI_OPCODE_TG4:
4226 name = "llvm.SI.gather4";
4227 break;
4228 default:
4229 assert(0);
4230 return;
4231 }
4232
4233 /* Add the type and suffixes .c, .o if needed. */
4234 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4235 sprintf(intr_name, "%s%s%s%s.%s",
4236 name, is_shadow ? ".c" : "", infix,
4237 has_offset ? ".o" : "", type);
4238
4239 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4240 base->gallivm->builder, intr_name, emit_data->dst_type,
4241 emit_data->args, emit_data->arg_count,
4242 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4243
4244 /* Divide the number of layers by 6 to get the number of cubes. */
4245 if (opcode == TGSI_OPCODE_TXQ &&
4246 (target == TGSI_TEXTURE_CUBE_ARRAY ||
4247 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)) {
4248 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4249 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4250 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4251
4252 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4253 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4254 z = LLVMBuildSDiv(builder, z, six, "");
4255
4256 emit_data->output[emit_data->chan] =
4257 LLVMBuildInsertElement(builder, v4, z, two, "");
4258 }
4259 }
4260
4261 static void si_llvm_emit_txqs(
4262 const struct lp_build_tgsi_action *action,
4263 struct lp_build_tgsi_context *bld_base,
4264 struct lp_build_emit_data *emit_data)
4265 {
4266 struct si_shader_context *ctx = si_shader_context(bld_base);
4267 struct gallivm_state *gallivm = bld_base->base.gallivm;
4268 LLVMBuilderRef builder = gallivm->builder;
4269 LLVMValueRef res, samples;
4270 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4271
4272 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4273
4274
4275 /* Read the samples from the descriptor directly. */
4276 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4277 samples = LLVMBuildExtractElement(
4278 builder, res,
4279 lp_build_const_int32(gallivm, 3), "");
4280 samples = LLVMBuildLShr(builder, samples,
4281 lp_build_const_int32(gallivm, 16), "");
4282 samples = LLVMBuildAnd(builder, samples,
4283 lp_build_const_int32(gallivm, 0xf), "");
4284 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4285 samples, "");
4286
4287 emit_data->output[emit_data->chan] = samples;
4288 }
4289
4290 /*
4291 * SI implements derivatives using the local data store (LDS)
4292 * All writes to the LDS happen in all executing threads at
4293 * the same time. TID is the Thread ID for the current
4294 * thread and is a value between 0 and 63, representing
4295 * the thread's position in the wavefront.
4296 *
4297 * For the pixel shader threads are grouped into quads of four pixels.
4298 * The TIDs of the pixels of a quad are:
4299 *
4300 * +------+------+
4301 * |4n + 0|4n + 1|
4302 * +------+------+
4303 * |4n + 2|4n + 3|
4304 * +------+------+
4305 *
4306 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
4307 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
4308 * the current pixel's column, and masking with 0xfffffffe yields the TID
4309 * of the left pixel of the current pixel's row.
4310 *
4311 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
4312 * adding 2 yields the TID of the pixel below the top pixel.
4313 */
4314 /* masks for thread ID. */
4315 #define TID_MASK_TOP_LEFT 0xfffffffc
4316 #define TID_MASK_TOP 0xfffffffd
4317 #define TID_MASK_LEFT 0xfffffffe
4318
/* Emit DDX/DDY/DDX_FINE/DDY_FINE.
 *
 * Derivatives are exchanged through LDS (see the TID-layout comment
 * above): each thread writes its value to lds[tid], then reads back a
 * quad-neighbor reference value and the value one pixel to the right
 * (DDX) or below (DDY), and subtracts them.
 */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned opcode = inst->Instruction.Opcode;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr0, load_ptr1;
	LLVMValueRef tl, trbl, result[4];
	unsigned swizzle[4];
	unsigned c;
	int idx;
	unsigned mask;

	/* This thread's LDS slot: lds[0][tid]. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32,
					NULL, 0, LLVMReadNoneAttribute);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* Choose the quad-neighbor TID mask; the coarse variants always
	 * use the top-left pixel of the quad as the reference. */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
				  lp_build_const_int32(gallivm, mask), "");
	load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* For DDX we want the next X pixel (+1), for DDY the next Y pixel (+2). */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
				  lp_build_const_int32(gallivm, idx), "");
	load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	for (c = 0; c < 4; ++c) {
		unsigned i;

		/* If an earlier channel used the same source swizzle,
		 * reuse its result instead of recomputing it. */
		swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
		for (i = 0; i < c; ++i) {
			if (swizzle[i] == swizzle[c]) {
				result[c] = result[i];
				break;
			}
		}
		if (i != c)	/* inner loop broke out: duplicate found */
			continue;

		/* Publish this thread's value ... */
		LLVMBuildStore(gallivm->builder,
			       LLVMBuildBitCast(gallivm->builder,
						lp_build_emit_fetch(bld_base, inst, 0, c),
						ctx->i32, ""),
			       store_ptr);

		/* ... then compute neighbor - reference. */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
		trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");

		result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
	}

	emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
}
4391
/*
 * This takes an I,J coordinate pair
 * and works out the X and Y derivatives.
 * It returns a 4-element vector: DDX(I), DDX(J), DDY(I), DDY(J).
 *
 * Like si_llvm_emit_ddxy, values are exchanged between the pixels of a
 * quad through LDS (see the TID-layout comment above).
 */
static LLVMValueRef si_llvm_emit_ddxy_interp(
	struct lp_build_tgsi_context *bld_base,
	LLVMValueRef interp_ij)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef indices[2];
	LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
	LLVMValueRef tl, tr, bl, result[4];
	unsigned c;

	/* This thread's LDS slot: lds[0][tid]. */
	indices[0] = bld_base->uint_bld.zero;
	indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32,
					NULL, 0, LLVMReadNoneAttribute);
	store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
				 indices, 2, "");

	/* TID of the left pixel of this pixel's row ... */
	temp = LLVMBuildAnd(gallivm->builder, indices[1],
			    lp_build_const_int32(gallivm, TID_MASK_LEFT), "");

	/* ... and of the top pixel of this pixel's column. */
	temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
			     lp_build_const_int32(gallivm, TID_MASK_TOP), "");

	indices[1] = temp;
	load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	indices[1] = temp2;
	load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
				  indices, 2, "");

	/* Pixel to the right of the left pixel (left + 1). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp,
				  lp_build_const_int32(gallivm, 1), "");
	load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	/* Pixel below the top pixel (top + 2). */
	indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
				  lp_build_const_int32(gallivm, 2), "");
	load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
				   indices, 2, "");

	for (c = 0; c < 2; ++c) {
		LLVMValueRef store_val;
		LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);

		/* Publish this thread's I (c=0) or J (c=1) value. */
		store_val = LLVMBuildExtractElement(gallivm->builder,
						    interp_ij, c_ll, "");
		LLVMBuildStore(gallivm->builder,
			       store_val,
			       store_ptr);

		/* DDX = right - left */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
		tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");

		result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");

		/* DDY = bottom - top */
		tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
		tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");

		bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
		bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");

		result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
	}

	return lp_build_gather_values(gallivm, result, 4);
}
4467
4468 static void interp_fetch_args(
4469 struct lp_build_tgsi_context *bld_base,
4470 struct lp_build_emit_data *emit_data)
4471 {
4472 struct si_shader_context *ctx = si_shader_context(bld_base);
4473 struct gallivm_state *gallivm = bld_base->base.gallivm;
4474 const struct tgsi_full_instruction *inst = emit_data->inst;
4475
4476 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
4477 /* offset is in second src, first two channels */
4478 emit_data->args[0] = lp_build_emit_fetch(bld_base,
4479 emit_data->inst, 1,
4480 TGSI_CHAN_X);
4481 emit_data->args[1] = lp_build_emit_fetch(bld_base,
4482 emit_data->inst, 1,
4483 TGSI_CHAN_Y);
4484 emit_data->arg_count = 2;
4485 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4486 LLVMValueRef sample_position;
4487 LLVMValueRef sample_id;
4488 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
4489
4490 /* fetch sample ID, then fetch its sample position,
4491 * and place into first two channels.
4492 */
4493 sample_id = lp_build_emit_fetch(bld_base,
4494 emit_data->inst, 1, TGSI_CHAN_X);
4495 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
4496 ctx->i32, "");
4497 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
4498
4499 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
4500 sample_position,
4501 lp_build_const_int32(gallivm, 0), "");
4502
4503 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
4504 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
4505 sample_position,
4506 lp_build_const_int32(gallivm, 1), "");
4507 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
4508 emit_data->arg_count = 2;
4509 }
4510 }
4511
/**
 * Emit the INTERP_* opcodes by calling llvm.SI.fs.interp (or
 * llvm.SI.fs.constant for flat-shaded inputs).
 *
 * INTERP_OFFSET and INTERP_SAMPLE start from the center barycentrics
 * and adjust the (I, J) pair with the screen-space derivatives and the
 * offset/sample position fetched by interp_fetch_args; other interp
 * opcodes use the centroid location.
 */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				   struct lp_build_tgsi_context *bld_base,
				   struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* Map (interpolation mode, location) to the function argument
	 * holding the barycentrics; -1 means unsupported, 0 means a
	 * constant (flat) input with no interp param at all. */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	/* Flat inputs use fs.constant, which takes no interp param. */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	for (chan = 0; chan < 2; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
	}
}
4610
4611 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4612 struct lp_build_emit_data *emit_data)
4613 {
4614 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4615 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4616 unsigned stream;
4617
4618 assert(src0.File == TGSI_FILE_IMMEDIATE);
4619
4620 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
4621 return stream;
4622 }
4623
4624 /* Emit one vertex from the geometry shader */
4625 static void si_llvm_emit_vertex(
4626 const struct lp_build_tgsi_action *action,
4627 struct lp_build_tgsi_context *bld_base,
4628 struct lp_build_emit_data *emit_data)
4629 {
4630 struct si_shader_context *ctx = si_shader_context(bld_base);
4631 struct lp_build_context *uint = &bld_base->uint_bld;
4632 struct si_shader *shader = ctx->shader;
4633 struct tgsi_shader_info *info = &shader->selector->info;
4634 struct gallivm_state *gallivm = bld_base->base.gallivm;
4635 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
4636 SI_PARAM_GS2VS_OFFSET);
4637 LLVMValueRef gs_next_vertex;
4638 LLVMValueRef can_emit, kill;
4639 LLVMValueRef args[2];
4640 unsigned chan;
4641 int i;
4642 unsigned stream;
4643
4644 stream = si_llvm_get_stream(bld_base, emit_data);
4645
4646 /* Write vertex attribute values to GSVS ring */
4647 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
4648 ctx->gs_next_vertex[stream],
4649 "");
4650
4651 /* If this thread has already emitted the declared maximum number of
4652 * vertices, kill it: excessive vertex emissions are not supposed to
4653 * have any effect, and GS threads have no externally observable
4654 * effects other than emitting vertices.
4655 */
4656 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
4657 lp_build_const_int32(gallivm,
4658 shader->selector->gs_max_out_vertices), "");
4659 kill = lp_build_select(&bld_base->base, can_emit,
4660 lp_build_const_float(gallivm, 1.0f),
4661 lp_build_const_float(gallivm, -1.0f));
4662
4663 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
4664 ctx->voidt, &kill, 1, 0);
4665
4666 for (i = 0; i < info->num_outputs; i++) {
4667 LLVMValueRef *out_ptr =
4668 ctx->radeon_bld.soa.outputs[i];
4669
4670 for (chan = 0; chan < 4; chan++) {
4671 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
4672 LLVMValueRef voffset =
4673 lp_build_const_int32(gallivm, (i * 4 + chan) *
4674 shader->selector->gs_max_out_vertices);
4675
4676 voffset = lp_build_add(uint, voffset, gs_next_vertex);
4677 voffset = lp_build_mul_imm(uint, voffset, 4);
4678
4679 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
4680
4681 build_tbuffer_store(ctx,
4682 ctx->gsvs_ring[stream],
4683 out_val, 1,
4684 voffset, soffset, 0,
4685 V_008F0C_BUF_DATA_FORMAT_32,
4686 V_008F0C_BUF_NUM_FORMAT_UINT,
4687 1, 0, 1, 1, 0);
4688 }
4689 }
4690 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
4691 lp_build_const_int32(gallivm, 1));
4692
4693 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4694
4695 /* Signal vertex emission */
4696 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
4697 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
4698 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
4699 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
4700 }
4701
4702 /* Cut one primitive from the geometry shader */
4703 static void si_llvm_emit_primitive(
4704 const struct lp_build_tgsi_action *action,
4705 struct lp_build_tgsi_context *bld_base,
4706 struct lp_build_emit_data *emit_data)
4707 {
4708 struct si_shader_context *ctx = si_shader_context(bld_base);
4709 struct gallivm_state *gallivm = bld_base->base.gallivm;
4710 LLVMValueRef args[2];
4711 unsigned stream;
4712
4713 /* Signal primitive cut */
4714 stream = si_llvm_get_stream(bld_base, emit_data);
4715 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
4716 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
4717 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
4718 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
4719 }
4720
4721 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4722 struct lp_build_tgsi_context *bld_base,
4723 struct lp_build_emit_data *emit_data)
4724 {
4725 struct si_shader_context *ctx = si_shader_context(bld_base);
4726 struct gallivm_state *gallivm = bld_base->base.gallivm;
4727
4728 /* The real barrier instruction isn’t needed, because an entire patch
4729 * always fits into a single wave.
4730 */
4731 if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) {
4732 emit_optimization_barrier(ctx);
4733 return;
4734 }
4735
4736 lp_build_intrinsic(gallivm->builder,
4737 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
4738 : "llvm.AMDGPU.barrier.local",
4739 ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
4740 }
4741
/* Dispatch entry for TGSI texture opcodes: tex_fetch_args gathers operands,
 * build_tex_intrinsic emits the sample/load intrinsic. */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};
4746
/* Dispatch entry for TGSI interpolation opcodes (INTERP_*): operands are
 * gathered by interp_fetch_args, code is emitted by build_interp_intrinsic. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
4751
4752 static void si_create_function(struct si_shader_context *ctx,
4753 LLVMTypeRef *returns, unsigned num_returns,
4754 LLVMTypeRef *params, unsigned num_params,
4755 int last_array_pointer, int last_sgpr)
4756 {
4757 int i;
4758
4759 radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
4760 params, num_params);
4761 radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
4762 ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
4763
4764 for (i = 0; i <= last_sgpr; ++i) {
4765 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
4766
4767 /* We tell llvm that array inputs are passed by value to allow Sinking pass
4768 * to move load. Inputs are constant so this is fine. */
4769 if (i <= last_array_pointer)
4770 LLVMAddAttribute(P, LLVMByValAttribute);
4771 else
4772 LLVMAddAttribute(P, LLVMInRegAttribute);
4773 }
4774 }
4775
4776 static void create_meta_data(struct si_shader_context *ctx)
4777 {
4778 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
4779 LLVMValueRef args[3];
4780
4781 args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
4782 args[1] = 0;
4783 args[2] = lp_build_const_int32(gallivm, 1);
4784
4785 ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
4786 }
4787
4788 static void declare_streamout_params(struct si_shader_context *ctx,
4789 struct pipe_stream_output_info *so,
4790 LLVMTypeRef *params, LLVMTypeRef i32,
4791 unsigned *num_params)
4792 {
4793 int i;
4794
4795 /* Streamout SGPRs. */
4796 if (so->num_outputs) {
4797 params[ctx->param_streamout_config = (*num_params)++] = i32;
4798 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
4799 }
4800 /* A streamout buffer offset is loaded if the stride is non-zero. */
4801 for (i = 0; i < 4; i++) {
4802 if (!so->stride[i])
4803 continue;
4804
4805 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
4806 }
4807 }
4808
4809 static unsigned llvm_get_type_size(LLVMTypeRef type)
4810 {
4811 LLVMTypeKind kind = LLVMGetTypeKind(type);
4812
4813 switch (kind) {
4814 case LLVMIntegerTypeKind:
4815 return LLVMGetIntTypeWidth(type) / 8;
4816 case LLVMFloatTypeKind:
4817 return 4;
4818 case LLVMPointerTypeKind:
4819 return 8;
4820 case LLVMVectorTypeKind:
4821 return LLVMGetVectorSize(type) *
4822 llvm_get_type_size(LLVMGetElementType(type));
4823 default:
4824 assert(0);
4825 return 0;
4826 }
4827 }
4828
4829 static void declare_tess_lds(struct si_shader_context *ctx)
4830 {
4831 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4832 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
4833
4834 /* This is the upper bound, maximum is 32 inputs times 32 vertices */
4835 unsigned vertex_data_dw_size = 32*32*4;
4836 unsigned patch_data_dw_size = 32*4;
4837 /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
4838 unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
4839 unsigned lds_dwords = patch_dw_size;
4840
4841 /* The actual size is computed outside of the shader to reduce
4842 * the number of shader variants. */
4843 ctx->lds =
4844 LLVMAddGlobalInAddressSpace(gallivm->module,
4845 LLVMArrayType(i32, lds_dwords),
4846 "tess_lds",
4847 LOCAL_ADDR_SPACE);
4848 }
4849
/* Create the main LLVM function for the current shader stage and declare
 * its parameter list (descriptor arrays, per-stage SGPR inputs, then VGPR
 * system values) and, for non-monolithic parts, the values returned to the
 * epilog.  Also records SGPR/VGPR input counts and allocates LDS globals
 * where the stage needs them. */
static void create_function(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
	/* Sized for the worst case return list built below — TODO confirm
	 * against the per-stage maxima if new returns are added. */
	LLVMTypeRef returns[16+32*4];
	unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
	unsigned num_returns = 0;

	v3i32 = LLVMVectorType(ctx->i32, 3);

	/* Descriptor arrays shared by all stages come first. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
	params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
	last_array_pointer = SI_PARAM_SHADER_BUFFERS;

	switch (ctx->type) {
	case TGSI_PROCESSOR_VERTEX:
		params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
		last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
		params[SI_PARAM_BASE_VERTEX] = ctx->i32;
		params[SI_PARAM_START_INSTANCE] = ctx->i32;
		num_params = SI_PARAM_START_INSTANCE+1;

		if (shader->key.vs.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else if (shader->key.vs.as_ls) {
			params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
		} else {
			if (ctx->is_gs_copy_shader) {
				last_array_pointer = SI_PARAM_CONST_BUFFERS;
				num_params = SI_PARAM_CONST_BUFFERS+1;
			} else {
				params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
				num_params = SI_PARAM_VS_STATE_BITS+1;
			}

			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}

		last_sgpr = num_params-1;

		/* VGPRs */
		params[ctx->param_vertex_id = num_params++] = ctx->i32;
		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_instance_id = num_params++] = ctx->i32;

		if (!ctx->is_monolithic &&
		    !ctx->is_gs_copy_shader) {
			/* Vertex load indices. */
			ctx->param_vertex_index0 = num_params;

			for (i = 0; i < shader->selector->info.num_inputs; i++)
				params[num_params++] = ctx->i32;

			/* PrimitiveID output. */
			if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
					returns[num_returns++] = ctx->f32;
		}
		break;

	case TGSI_PROCESSOR_TESS_CTRL:
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
		params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;

		/* VGPRs */
		params[SI_PARAM_PATCH_ID] = ctx->i32;
		params[SI_PARAM_REL_IDS] = ctx->i32;
		num_params = SI_PARAM_REL_IDS+1;

		if (!ctx->is_monolithic) {
			/* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
			for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */

			for (i = 0; i < 3; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case TGSI_PROCESSOR_TESS_EVAL:
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		num_params = SI_PARAM_TCS_OUT_LAYOUT+1;

		if (shader->key.tes.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else {
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_tes_u = num_params++] = ctx->f32;
		params[ctx->param_tes_v = num_params++] = ctx->f32;
		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;

		/* PrimitiveID output. */
		if (!ctx->is_monolithic && !shader->key.tes.as_es)
			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
				returns[num_returns++] = ctx->f32;
		break;

	case TGSI_PROCESSOR_GEOMETRY:
		params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
		last_sgpr = SI_PARAM_GS_WAVE_ID;

		/* VGPRs */
		params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
		params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
		params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
		num_params = SI_PARAM_GS_INSTANCE_ID+1;
		break;

	case TGSI_PROCESSOR_FRAGMENT:
		params[SI_PARAM_ALPHA_REF] = ctx->f32;
		params[SI_PARAM_PRIM_MASK] = ctx->i32;
		last_sgpr = SI_PARAM_PRIM_MASK;
		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
		params[SI_PARAM_FRONT_FACE] = ctx->i32;
		params[SI_PARAM_ANCILLARY] = ctx->i32;
		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
		num_params = SI_PARAM_POS_FIXED_PT+1;

		if (!ctx->is_monolithic) {
			/* Color inputs from the prolog. */
			if (shader->selector->info.colors_read) {
				unsigned num_color_elements =
					util_bitcount(shader->selector->info.colors_read);

				assert(num_params + num_color_elements <= ARRAY_SIZE(params));
				for (i = 0; i < num_color_elements; i++)
					params[num_params++] = ctx->f32;
			}

			/* Outputs for the epilog. */
			num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
			num_returns =
				num_return_sgprs +
				util_bitcount(shader->selector->info.colors_written) * 4 +
				shader->selector->info.writes_z +
				shader->selector->info.writes_stencil +
				shader->selector->info.writes_samplemask +
				1 /* SampleMaskIn */;

			num_returns = MAX2(num_returns,
					   num_return_sgprs +
					   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

			for (i = 0; i < num_return_sgprs; i++)
				returns[i] = ctx->i32;
			for (; i < num_returns; i++)
				returns[i] = ctx->f32;
		}
		break;

	case TGSI_PROCESSOR_COMPUTE:
		params[SI_PARAM_GRID_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_ID] = v3i32;
		last_sgpr = SI_PARAM_BLOCK_ID;

		params[SI_PARAM_THREAD_ID] = v3i32;
		num_params = SI_PARAM_THREAD_ID + 1;
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	assert(num_params <= Elements(params));

	si_create_function(ctx, returns, num_returns, params,
			   num_params, last_array_pointer, last_sgpr);

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == TGSI_PROCESSOR_FRAGMENT &&
	    !ctx->is_monolithic) {
		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "InitialPSInputAddr",
					  S_0286D0_PERSP_SAMPLE_ENA(1) |
					  S_0286D0_PERSP_CENTER_ENA(1) |
					  S_0286D0_PERSP_CENTROID_ENA(1) |
					  S_0286D0_LINEAR_SAMPLE_ENA(1) |
					  S_0286D0_LINEAR_CENTER_ENA(1) |
					  S_0286D0_LINEAR_CENTROID_ENA(1) |
					  S_0286D0_FRONT_FACE_ENA(1) |
					  S_0286D0_POS_FIXED_PT_ENA(1));
	} else if (ctx->type == TGSI_PROCESSOR_COMPUTE) {
		const unsigned *properties = shader->selector->info.properties;
		unsigned max_work_group_size =
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

		assert(max_work_group_size);

		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "amdgpu-max-work-group-size",
					  max_work_group_size);
	}

	/* Count SGPR/VGPR inputs from the declared parameter types. */
	shader->info.num_input_sgprs = 0;
	shader->info.num_input_vgprs = 0;

	for (i = 0; i <= last_sgpr; ++i)
		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;

	/* Unused fragment shader inputs are eliminated by the compiler,
	 * so we don't know yet how many there will be.
	 */
	if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
		for (; i < num_params; ++i)
			shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;

	/* Derivative and interp-at-offset/sample opcodes exchange data
	 * between lanes through a small LDS scratch area. */
	if (bld_base->info &&
	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
		ctx->lds =
			LLVMAddGlobalInAddressSpace(gallivm->module,
						    LLVMArrayType(ctx->i32, 64),
						    "ddxy_lds",
						    LOCAL_ADDR_SPACE);

	if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
	    ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
	    ctx->type == TGSI_PROCESSOR_TESS_EVAL)
		declare_tess_lds(ctx);
}
5113
5114 static void preload_constants(struct si_shader_context *ctx)
5115 {
5116 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5117 struct gallivm_state *gallivm = bld_base->base.gallivm;
5118 const struct tgsi_shader_info *info = bld_base->info;
5119 unsigned buf;
5120 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5121
5122 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5123 unsigned i, num_const = info->const_file_max[buf] + 1;
5124
5125 if (num_const == 0)
5126 continue;
5127
5128 /* Allocate space for the constant values */
5129 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5130
5131 /* Load the resource descriptor */
5132 ctx->const_buffers[buf] =
5133 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5134
5135 /* Load the constants, we rely on the code sinking to do the rest */
5136 for (i = 0; i < num_const * 4; ++i) {
5137 ctx->constants[buf][i] =
5138 buffer_load_const(gallivm->builder,
5139 ctx->const_buffers[buf],
5140 lp_build_const_int32(gallivm, i * 4),
5141 ctx->f32);
5142 }
5143 }
5144 }
5145
5146 static void preload_shader_buffers(struct si_shader_context *ctx)
5147 {
5148 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5149 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5150 int buf, maxbuf;
5151
5152 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5153 SI_NUM_SHADER_BUFFERS - 1);
5154 for (buf = 0; buf <= maxbuf; ++buf) {
5155 ctx->shader_buffers[buf] =
5156 build_indexed_load_const(
5157 ctx, ptr, lp_build_const_int32(gallivm, buf));
5158 }
5159 }
5160
5161 static void preload_samplers(struct si_shader_context *ctx)
5162 {
5163 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5164 struct gallivm_state *gallivm = bld_base->base.gallivm;
5165 const struct tgsi_shader_info *info = bld_base->info;
5166 unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
5167 LLVMValueRef offset;
5168
5169 if (num_samplers == 0)
5170 return;
5171
5172 /* Load the resources and samplers, we rely on the code sinking to do the rest */
5173 for (i = 0; i < num_samplers; ++i) {
5174 /* Resource */
5175 offset = lp_build_const_int32(gallivm, i);
5176 ctx->sampler_views[i] =
5177 get_sampler_desc(ctx, offset, DESC_IMAGE);
5178
5179 /* FMASK resource */
5180 if (info->is_msaa_sampler[i])
5181 ctx->fmasks[i] =
5182 get_sampler_desc(ctx, offset, DESC_FMASK);
5183 else {
5184 ctx->sampler_states[i] =
5185 get_sampler_desc(ctx, offset, DESC_SAMPLER);
5186 ctx->sampler_states[i] =
5187 sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
5188 ctx->sampler_states[i]);
5189 }
5190 }
5191 }
5192
5193 static void preload_images(struct si_shader_context *ctx)
5194 {
5195 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5196 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5197 struct gallivm_state *gallivm = bld_base->base.gallivm;
5198 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5199 LLVMValueRef res_ptr;
5200 unsigned i;
5201
5202 if (num_images == 0)
5203 return;
5204
5205 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5206
5207 for (i = 0; i < num_images; ++i) {
5208 /* Rely on LLVM to shrink the load for buffer resources. */
5209 LLVMValueRef rsrc =
5210 build_indexed_load_const(ctx, res_ptr,
5211 lp_build_const_int32(gallivm, i));
5212
5213 if (info->images_writemask & (1 << i) &&
5214 !(info->images_buffers & (1 << i)))
5215 rsrc = force_dcc_off(ctx, rsrc);
5216
5217 ctx->images[i] = rsrc;
5218 }
5219 }
5220
5221 static void preload_streamout_buffers(struct si_shader_context *ctx)
5222 {
5223 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5224 struct gallivm_state *gallivm = bld_base->base.gallivm;
5225 unsigned i;
5226
5227 /* Streamout can only be used if the shader is compiled as VS. */
5228 if (!ctx->shader->selector->so.num_outputs ||
5229 (ctx->type == TGSI_PROCESSOR_VERTEX &&
5230 (ctx->shader->key.vs.as_es ||
5231 ctx->shader->key.vs.as_ls)) ||
5232 (ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
5233 ctx->shader->key.tes.as_es))
5234 return;
5235
5236 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5237 SI_PARAM_RW_BUFFERS);
5238
5239 /* Load the resources, we rely on the code sinking to do the rest */
5240 for (i = 0; i < 4; ++i) {
5241 if (ctx->shader->selector->so.stride[i]) {
5242 LLVMValueRef offset = lp_build_const_int32(gallivm,
5243 SI_SO_BUF_OFFSET + i);
5244
5245 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5246 }
5247 }
5248 }
5249
5250 /**
5251 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5252 * for later use.
5253 */
5254 static void preload_ring_buffers(struct si_shader_context *ctx)
5255 {
5256 struct gallivm_state *gallivm =
5257 ctx->radeon_bld.soa.bld_base.base.gallivm;
5258
5259 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5260 SI_PARAM_RW_BUFFERS);
5261
5262 if ((ctx->type == TGSI_PROCESSOR_VERTEX &&
5263 ctx->shader->key.vs.as_es) ||
5264 (ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
5265 ctx->shader->key.tes.as_es) ||
5266 ctx->type == TGSI_PROCESSOR_GEOMETRY) {
5267 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_ESGS);
5268
5269 ctx->esgs_ring =
5270 build_indexed_load_const(ctx, buf_ptr, offset);
5271 }
5272
5273 if (ctx->is_gs_copy_shader) {
5274 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
5275
5276 ctx->gsvs_ring[0] =
5277 build_indexed_load_const(ctx, buf_ptr, offset);
5278 }
5279 if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
5280 int i;
5281 for (i = 0; i < 4; i++) {
5282 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS + i);
5283
5284 ctx->gsvs_ring[i] =
5285 build_indexed_load_const(ctx, buf_ptr, offset);
5286 }
5287 }
5288 }
5289
/* Emit code that samples the 32x32 polygon-stipple texture at the fragment's
 * screen position and kills the fragment based on the texel value.
 * NOTE(review): the texel's 4th component is negated before llvm.AMDGPU.kill,
 * so fragments with a positive texel value are discarded — presumably the
 * stipple texture is uploaded with 0 meaning "draw"; confirm against the
 * u_pstipple upload code. */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
					 LLVMValueRef param_sampler_views,
					 unsigned param_pos_fixed_pt)
{
	struct lp_build_tgsi_context *bld_base =
		&ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct lp_build_emit_data result = {};
	struct tgsi_full_instruction inst = {};
	LLVMValueRef desc, sampler_index, address[2], pix;

	/* Use the fixed-point gl_FragCoord input.
	 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
	 * per coordinate to get the repeating effect.
	 */
	address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
	address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);

	/* Load the sampler view descriptor. */
	sampler_index = lp_build_const_int32(gallivm, SI_POLY_STIPPLE_SAMPLER);
	desc = get_sampler_desc_custom(ctx, param_sampler_views,
				       sampler_index, DESC_IMAGE);

	/* Load the texel. */
	inst.Instruction.Opcode = TGSI_OPCODE_TXF;
	inst.Texture.Texture = TGSI_TEXTURE_2D_MSAA; /* = use load, not load_mip */
	result.inst = &inst;
	set_tex_fetch_args(ctx, &result, TGSI_OPCODE_TXF,
			   inst.Texture.Texture,
			   desc, NULL, address, ARRAY_SIZE(address), 0xf);
	build_tex_intrinsic(&tex_action, bld_base, &result);

	/* Kill the thread accordingly. */
	pix = LLVMBuildExtractElement(gallivm->builder, result.output[0],
				      lp_build_const_int32(gallivm, 3), "");
	pix = bitcast(bld_base, TGSI_TYPE_FLOAT, pix);
	pix = LLVMBuildFNeg(gallivm->builder, pix, "");

	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   LLVMVoidTypeInContext(gallivm->context),
			   &pix, 1, 0);
}
5332
5333 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5334 struct si_shader_config *conf,
5335 unsigned symbol_offset)
5336 {
5337 unsigned i;
5338 const unsigned char *config =
5339 radeon_shader_binary_config_start(binary, symbol_offset);
5340
5341 /* XXX: We may be able to emit some of these values directly rather than
5342 * extracting fields to be emitted later.
5343 */
5344
5345 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5346 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5347 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5348 switch (reg) {
5349 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5350 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5351 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5352 case R_00B848_COMPUTE_PGM_RSRC1:
5353 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5354 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5355 conf->float_mode = G_00B028_FLOAT_MODE(value);
5356 conf->rsrc1 = value;
5357 break;
5358 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5359 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5360 break;
5361 case R_00B84C_COMPUTE_PGM_RSRC2:
5362 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5363 conf->rsrc2 = value;
5364 break;
5365 case R_0286CC_SPI_PS_INPUT_ENA:
5366 conf->spi_ps_input_ena = value;
5367 break;
5368 case R_0286D0_SPI_PS_INPUT_ADDR:
5369 conf->spi_ps_input_addr = value;
5370 break;
5371 case R_0286E8_SPI_TMPRING_SIZE:
5372 case R_00B860_COMPUTE_TMPRING_SIZE:
5373 /* WAVESIZE is in units of 256 dwords. */
5374 conf->scratch_bytes_per_wave =
5375 G_00B860_WAVESIZE(value) * 256 * 4 * 1;
5376 break;
5377 default:
5378 {
5379 static bool printed;
5380
5381 if (!printed) {
5382 fprintf(stderr, "Warning: LLVM emitted unknown "
5383 "config register: 0x%x\n", reg);
5384 printed = true;
5385 }
5386 }
5387 break;
5388 }
5389
5390 if (!conf->spi_ps_input_addr)
5391 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5392 }
5393 }
5394
5395 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5396 struct si_shader *shader,
5397 struct si_shader_config *config,
5398 uint64_t scratch_va)
5399 {
5400 unsigned i;
5401 uint32_t scratch_rsrc_dword0 = scratch_va;
5402 uint32_t scratch_rsrc_dword1 =
5403 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
5404 | S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5405
5406 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5407 const struct radeon_shader_reloc *reloc =
5408 &shader->binary.relocs[i];
5409 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5410 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5411 &scratch_rsrc_dword0, 4);
5412 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5413 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5414 &scratch_rsrc_dword1, 4);
5415 }
5416 }
5417 }
5418
5419 static unsigned si_get_shader_binary_size(struct si_shader *shader)
5420 {
5421 unsigned size = shader->binary.code_size;
5422
5423 if (shader->prolog)
5424 size += shader->prolog->binary.code_size;
5425 if (shader->epilog)
5426 size += shader->epilog->binary.code_size;
5427 return size;
5428 }
5429
5430 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5431 {
5432 const struct radeon_shader_binary *prolog =
5433 shader->prolog ? &shader->prolog->binary : NULL;
5434 const struct radeon_shader_binary *epilog =
5435 shader->epilog ? &shader->epilog->binary : NULL;
5436 const struct radeon_shader_binary *mainb = &shader->binary;
5437 unsigned bo_size = si_get_shader_binary_size(shader) +
5438 (!epilog ? mainb->rodata_size : 0);
5439 unsigned char *ptr;
5440
5441 assert(!prolog || !prolog->rodata_size);
5442 assert((!prolog && !epilog) || !mainb->rodata_size);
5443 assert(!epilog || !epilog->rodata_size);
5444
5445 r600_resource_reference(&shader->bo, NULL);
5446 shader->bo = si_resource_create_custom(&sscreen->b.b,
5447 PIPE_USAGE_IMMUTABLE,
5448 bo_size);
5449 if (!shader->bo)
5450 return -ENOMEM;
5451
5452 /* Upload. */
5453 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
5454 PIPE_TRANSFER_READ_WRITE);
5455
5456 if (prolog) {
5457 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
5458 ptr += prolog->code_size;
5459 }
5460
5461 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
5462 ptr += mainb->code_size;
5463
5464 if (epilog)
5465 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
5466 else if (mainb->rodata_size > 0)
5467 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
5468
5469 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5470 return 0;
5471 }
5472
/* Print a shader binary's disassembly (or a raw hex dump when no disassembly
 * string is available) to `file`, and also forward the disassembly line by
 * line to the debug callback when one is set. */
static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				p = util_strchrnul(line, '\n');
				count = p - line;

				/* Skip empty lines. */
				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				/* util_strchrnul returns the terminating NUL
				 * when no '\n' remains. */
				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		/* No disassembly available: dump the raw dwords, bytes in
		 * most-significant-first order. */
		fprintf(file, "Shader %s binary:\n", name);
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
5520
/* Print shader resource statistics (SGPR/VGPR counts, code size, LDS and
 * scratch usage) and an estimated per-SIMD wave limit to `file`, and send a
 * one-line summary to the debug callback. */
static void si_shader_dump_stats(struct si_screen *sscreen,
				 struct si_shader_config *conf,
				 unsigned num_inputs,
				 unsigned code_size,
				 struct pipe_debug_callback *debug,
				 unsigned processor,
				 FILE *file)
{
	/* LDS allocation granularity in bytes. */
	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
	unsigned lds_per_wave = 0;
	unsigned max_simd_waves = 10;

	/* Compute LDS usage for PS. */
	if (processor == TGSI_PROCESSOR_FRAGMENT) {
		/* The minimum usage per wave is (num_inputs * 36). The maximum
		 * usage is (num_inputs * 36 * 16).
		 * We can get anything in between and it varies between waves.
		 *
		 * Other stages don't know the size at compile time or don't
		 * allocate LDS per wave, but instead they do it per thread group.
		 */
		lds_per_wave = conf->lds_size * lds_increment +
			       align(num_inputs * 36, lds_increment);
	}

	/* Compute the per-SIMD wave counts.
	 * NOTE(review): 800/512 look like the per-SIMD SGPR pool sizes for
	 * VI and SI/CI respectively — confirm against the ISA docs. */
	if (conf->num_sgprs) {
		if (sscreen->b.chip_class >= VI)
			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
		else
			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
	}

	if (conf->num_vgprs)
		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);

	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
	 * that PS can use.
	 */
	if (lds_per_wave)
		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);

	/* Print to the file unconditionally unless it's stderr, in which
	 * case honor the dump-shader debug flags. */
	if (file != stderr ||
	    r600_can_dump_shader(&sscreen->b, processor)) {
		if (processor == TGSI_PROCESSOR_FRAGMENT) {
			fprintf(file, "*** SHADER CONFIG ***\n"
				"SPI_PS_INPUT_ADDR = 0x%04x\n"
				"SPI_PS_INPUT_ENA  = 0x%04x\n",
				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
		}

		fprintf(file, "*** SHADER STATS ***\n"
			"SGPRS: %d\n"
			"VGPRS: %d\n"
			"Code Size: %d bytes\n"
			"LDS: %d blocks\n"
			"Scratch: %d bytes per wave\n"
			"Max Waves: %d\n"
			"********************\n",
			conf->num_sgprs, conf->num_vgprs, code_size,
			conf->lds_size, conf->scratch_bytes_per_wave,
			max_simd_waves);
	}

	pipe_debug_message(debug, SHADER_INFO,
			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
			   "LDS: %d Scratch: %d Max Waves: %d",
			   conf->num_sgprs, conf->num_vgprs, code_size,
			   conf->lds_size, conf->scratch_bytes_per_wave,
			   max_simd_waves);
}
5592
5593 static const char *si_get_shader_name(struct si_shader *shader,
5594 unsigned processor)
5595 {
5596 switch (processor) {
5597 case TGSI_PROCESSOR_VERTEX:
5598 if (shader->key.vs.as_es)
5599 return "Vertex Shader as ES";
5600 else if (shader->key.vs.as_ls)
5601 return "Vertex Shader as LS";
5602 else
5603 return "Vertex Shader as VS";
5604 case TGSI_PROCESSOR_TESS_CTRL:
5605 return "Tessellation Control Shader";
5606 case TGSI_PROCESSOR_TESS_EVAL:
5607 if (shader->key.tes.as_es)
5608 return "Tessellation Evaluation Shader as ES";
5609 else
5610 return "Tessellation Evaluation Shader as VS";
5611 case TGSI_PROCESSOR_GEOMETRY:
5612 if (shader->gs_copy_shader == NULL)
5613 return "GS Copy Shader as VS";
5614 else
5615 return "Geometry Shader";
5616 case TGSI_PROCESSOR_FRAGMENT:
5617 return "Pixel Shader";
5618 case TGSI_PROCESSOR_COMPUTE:
5619 return "Compute Shader";
5620 default:
5621 return "Unknown Shader";
5622 }
5623 }
5624
5625 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
5626 struct pipe_debug_callback *debug, unsigned processor,
5627 FILE *file)
5628 {
5629 if (file != stderr ||
5630 (r600_can_dump_shader(&sscreen->b, processor) &&
5631 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5632 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5633
5634 if (shader->prolog)
5635 si_shader_dump_disassembly(&shader->prolog->binary,
5636 debug, "prolog", file);
5637
5638 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5639
5640 if (shader->epilog)
5641 si_shader_dump_disassembly(&shader->epilog->binary,
5642 debug, "epilog", file);
5643 fprintf(file, "\n");
5644 }
5645
5646 si_shader_dump_stats(sscreen, &shader->config,
5647 shader->selector ? shader->selector->info.num_inputs : 0,
5648 si_get_shader_binary_size(shader), debug, processor,
5649 file);
5650 }
5651
5652 int si_compile_llvm(struct si_screen *sscreen,
5653 struct radeon_shader_binary *binary,
5654 struct si_shader_config *conf,
5655 LLVMTargetMachineRef tm,
5656 LLVMModuleRef mod,
5657 struct pipe_debug_callback *debug,
5658 unsigned processor,
5659 const char *name)
5660 {
5661 int r = 0;
5662 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5663
5664 if (r600_can_dump_shader(&sscreen->b, processor)) {
5665 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5666
5667 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5668 fprintf(stderr, "%s LLVM IR:\n\n", name);
5669 LLVMDumpModule(mod);
5670 fprintf(stderr, "\n");
5671 }
5672 }
5673
5674 if (!si_replace_shader(count, binary)) {
5675 r = radeon_llvm_compile(mod, binary,
5676 r600_get_llvm_processor_name(sscreen->b.family), tm,
5677 debug);
5678 if (r)
5679 return r;
5680 }
5681
5682 si_shader_binary_read_config(binary, conf, 0);
5683
5684 /* Enable 64-bit and 16-bit denormals, because there is no performance
5685 * cost.
5686 *
5687 * If denormals are enabled, all floating-point output modifiers are
5688 * ignored.
5689 *
5690 * Don't enable denormals for 32-bit floats, because:
5691 * - Floating-point output modifiers would be ignored by the hw.
5692 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5693 * have to stop using those.
5694 * - SI & CI would be very slow.
5695 */
5696 conf->float_mode |= V_00B028_FP_64_DENORMS;
5697
5698 FREE(binary->config);
5699 FREE(binary->global_symbol_offsets);
5700 binary->config = NULL;
5701 binary->global_symbol_offsets = NULL;
5702
5703 /* Some shaders can't have rodata because their binaries can be
5704 * concatenated.
5705 */
5706 if (binary->rodata_size &&
5707 (processor == TGSI_PROCESSOR_VERTEX ||
5708 processor == TGSI_PROCESSOR_TESS_CTRL ||
5709 processor == TGSI_PROCESSOR_TESS_EVAL ||
5710 processor == TGSI_PROCESSOR_FRAGMENT)) {
5711 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5712 return -EINVAL;
5713 }
5714
5715 return r;
5716 }
5717
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The copy shader runs in the VS stage: it reads the GS outputs back from
 * the GSVS ring buffer and re-exports them through the standard VS export
 * path.  The compiled result is stored in ctx->shader (set by the caller to
 * gs->gs_copy_shader) and uploaded on success.
 *
 * Returns 0 on success, negative on compile/upload failure.
 */
static int si_generate_gs_copy_shader(struct si_screen *sscreen,
				      struct si_shader_context *ctx,
				      struct si_shader *gs,
				      struct pipe_debug_callback *debug)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader_output_values *outputs;
	struct tgsi_shader_info *gsinfo = &gs->selector->info;
	/* Argument block for llvm.SI.buffer.load.dword; args[2] (the constant
	 * offset) is filled in per channel inside the loop below. */
	LLVMValueRef args[9];
	int i, r;

	/* NOTE(review): MALLOC result is not NULL-checked before use below. */
	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));

	/* Re-initialize the context as a VS-stage shader. */
	si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
	ctx->type = TGSI_PROCESSOR_VERTEX;
	ctx->is_gs_copy_shader = true;

	create_meta_data(ctx);
	create_function(ctx);
	preload_streamout_buffers(ctx);
	preload_ring_buffers(ctx);

	/* Ring resource descriptor and per-vertex byte offset (VertexID * 4). */
	args[0] = ctx->gsvs_ring[0];
	args[1] = lp_build_mul_imm(uint,
				   LLVMGetParam(ctx->radeon_bld.main_fn,
						ctx->param_vertex_id),
				   4);
	args[3] = uint->zero;
	args[4] = uint->one;	/* OFFEN */
	args[5] = uint->zero;	/* IDXEN */
	args[6] = uint->one;	/* GLC */
	args[7] = uint->one;	/* SLC */
	args[8] = uint->zero;	/* TFE */

	/* Fetch vertex data from GSVS ring */
	for (i = 0; i < gsinfo->num_outputs; ++i) {
		unsigned chan;

		outputs[i].name = gsinfo->output_semantic_name[i];
		outputs[i].sid = gsinfo->output_semantic_index[i];

		for (chan = 0; chan < 4; chan++) {
			/* Ring layout: each output channel occupies a run of
			 * gs_max_out_vertices dwords. */
			args[2] = lp_build_const_int32(gallivm,
						       (i * 4 + chan) *
						       gs->selector->gs_max_out_vertices * 16 * 4);

			outputs[i].values[chan] =
				LLVMBuildBitCast(gallivm->builder,
						 lp_build_intrinsic(gallivm->builder,
								 "llvm.SI.buffer.load.dword.i32.i32",
								 ctx->i32, args, 9,
								 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
						 ctx->f32, "");
		}
	}

	si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);

	LLVMBuildRet(gallivm->builder, ctx->return_value);

	/* Dump LLVM IR before any optimization passes */
	if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
	    r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
		LLVMDumpModule(bld_base->base.gallivm->module);

	radeon_llvm_finalize_module(&ctx->radeon_bld);

	r = si_compile_llvm(sscreen, &ctx->shader->binary,
			    &ctx->shader->config, ctx->tm,
			    bld_base->base.gallivm->module,
			    debug, TGSI_PROCESSOR_GEOMETRY,
			    "GS Copy Shader");
	if (!r) {
		if (r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
			fprintf(stderr, "GS Copy Shader:\n");
		si_shader_dump(sscreen, ctx->shader, debug,
			       TGSI_PROCESSOR_GEOMETRY, stderr);
		r = si_shader_binary_upload(sscreen, ctx->shader);
	}

	radeon_llvm_dispose(&ctx->radeon_bld);

	FREE(outputs);
	return r;
}
5806
5807 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
5808 {
5809 int i;
5810
5811 fprintf(f, "SHADER KEY\n");
5812
5813 switch (shader) {
5814 case PIPE_SHADER_VERTEX:
5815 fprintf(f, " instance_divisors = {");
5816 for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
5817 fprintf(f, !i ? "%u" : ", %u",
5818 key->vs.prolog.instance_divisors[i]);
5819 fprintf(f, "}\n");
5820 fprintf(f, " as_es = %u\n", key->vs.as_es);
5821 fprintf(f, " as_ls = %u\n", key->vs.as_ls);
5822 fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
5823 break;
5824
5825 case PIPE_SHADER_TESS_CTRL:
5826 fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
5827 break;
5828
5829 case PIPE_SHADER_TESS_EVAL:
5830 fprintf(f, " as_es = %u\n", key->tes.as_es);
5831 fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
5832 break;
5833
5834 case PIPE_SHADER_GEOMETRY:
5835 case PIPE_SHADER_COMPUTE:
5836 break;
5837
5838 case PIPE_SHADER_FRAGMENT:
5839 fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
5840 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
5841 fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
5842 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
5843 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
5844 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
5845 fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
5846 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
5847 fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
5848 fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
5849 break;
5850
5851 default:
5852 assert(0);
5853 }
5854 }
5855
/* Initialize a si_shader_context: zero it, set up the radeon_llvm context,
 * cache commonly-used LLVM types, and register all TGSI opcode emitters.
 * \p shader may be NULL (used when compiling standalone shader parts).
 *
 * This is a flat registration table; entries are kept in one place on
 * purpose so the opcode -> action mapping is easy to audit.
 */
static void si_init_shader_ctx(struct si_shader_context *ctx,
			       struct si_screen *sscreen,
			       struct si_shader *shader,
			       LLVMTargetMachineRef tm)
{
	struct lp_build_tgsi_context *bld_base;
	struct lp_build_tgsi_action tmpl = {};

	memset(ctx, 0, sizeof(*ctx));
	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
	ctx->tm = tm;
	ctx->screen = sscreen;
	/* Derive the stage from the selector when available; shader parts
	 * set ctx->type explicitly after this call. */
	if (shader && shader->selector)
		ctx->type = shader->selector->info.processor;
	else
		ctx->type = -1;
	ctx->shader = shader;

	/* Cache the LLVM types used throughout code generation. */
	ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
	ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
	ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
	ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);

	bld_base = &ctx->radeon_bld.soa.bld_base;
	if (shader && shader->selector)
		bld_base->info = &shader->selector->info;
	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;

	/* Interpolation opcodes (PS only). */
	bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
	bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;

	/* Texture opcodes all share one action. */
	bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;

	/* Buffer/image load, store and query. */
	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;

	/* Atomics share fetch/emit; only the intrinsic name suffix differs. */
	tmpl.fetch_args = atomic_fetch_args;
	tmpl.emit = atomic_emit;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";

	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;

	/* Derivatives. */
	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
	bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;

	/* GS vertex/primitive emission and barriers. */
	bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
	bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
	bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;

	/* MIN/MAX map to the IEEE minnum/maxnum intrinsics. */
	bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
	bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
}
5956
5957 int si_compile_tgsi_shader(struct si_screen *sscreen,
5958 LLVMTargetMachineRef tm,
5959 struct si_shader *shader,
5960 bool is_monolithic,
5961 struct pipe_debug_callback *debug)
5962 {
5963 struct si_shader_selector *sel = shader->selector;
5964 struct si_shader_context ctx;
5965 struct lp_build_tgsi_context *bld_base;
5966 LLVMModuleRef mod;
5967 int r = 0;
5968
5969 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
5970 * conversion fails. */
5971 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
5972 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
5973 si_dump_shader_key(sel->type, &shader->key, stderr);
5974 tgsi_dump(sel->tokens, 0);
5975 si_dump_streamout(&sel->so);
5976 }
5977
5978 si_init_shader_ctx(&ctx, sscreen, shader, tm);
5979 ctx.is_monolithic = is_monolithic;
5980
5981 shader->info.uses_instanceid = sel->info.uses_instanceid;
5982
5983 bld_base = &ctx.radeon_bld.soa.bld_base;
5984 ctx.radeon_bld.load_system_value = declare_system_value;
5985
5986 switch (ctx.type) {
5987 case TGSI_PROCESSOR_VERTEX:
5988 ctx.radeon_bld.load_input = declare_input_vs;
5989 if (shader->key.vs.as_ls)
5990 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
5991 else if (shader->key.vs.as_es)
5992 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
5993 else
5994 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
5995 break;
5996 case TGSI_PROCESSOR_TESS_CTRL:
5997 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
5998 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
5999 bld_base->emit_store = store_output_tcs;
6000 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6001 break;
6002 case TGSI_PROCESSOR_TESS_EVAL:
6003 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6004 if (shader->key.tes.as_es)
6005 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6006 else
6007 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6008 break;
6009 case TGSI_PROCESSOR_GEOMETRY:
6010 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6011 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6012 break;
6013 case TGSI_PROCESSOR_FRAGMENT:
6014 ctx.radeon_bld.load_input = declare_input_fs;
6015 if (is_monolithic)
6016 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6017 else
6018 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6019 break;
6020 case TGSI_PROCESSOR_COMPUTE:
6021 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6022 break;
6023 default:
6024 assert(!"Unsupported shader type");
6025 return -1;
6026 }
6027
6028 create_meta_data(&ctx);
6029 create_function(&ctx);
6030 preload_constants(&ctx);
6031 preload_shader_buffers(&ctx);
6032 preload_samplers(&ctx);
6033 preload_images(&ctx);
6034 preload_streamout_buffers(&ctx);
6035 preload_ring_buffers(&ctx);
6036
6037 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6038 shader->key.ps.prolog.poly_stipple) {
6039 LLVMValueRef views = LLVMGetParam(ctx.radeon_bld.main_fn,
6040 SI_PARAM_SAMPLERS);
6041 si_llvm_emit_polygon_stipple(&ctx, views,
6042 SI_PARAM_POS_FIXED_PT);
6043 }
6044
6045 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
6046 int i;
6047 for (i = 0; i < 4; i++) {
6048 ctx.gs_next_vertex[i] =
6049 lp_build_alloca(bld_base->base.gallivm,
6050 ctx.i32, "");
6051 }
6052 }
6053
6054 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6055 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6056 goto out;
6057 }
6058
6059 LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
6060 mod = bld_base->base.gallivm->module;
6061
6062 /* Dump LLVM IR before any optimization passes */
6063 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6064 r600_can_dump_shader(&sscreen->b, ctx.type))
6065 LLVMDumpModule(mod);
6066
6067 radeon_llvm_finalize_module(&ctx.radeon_bld);
6068
6069 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6070 mod, debug, ctx.type, "TGSI shader");
6071 if (r) {
6072 fprintf(stderr, "LLVM failed to compile shader\n");
6073 goto out;
6074 }
6075
6076 radeon_llvm_dispose(&ctx.radeon_bld);
6077
6078 /* Add the scratch offset to input SGPRs. */
6079 if (shader->config.scratch_bytes_per_wave)
6080 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6081
6082 /* Calculate the number of fragment input VGPRs. */
6083 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
6084 shader->info.num_input_vgprs = 0;
6085 shader->info.face_vgpr_index = -1;
6086
6087 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6088 shader->info.num_input_vgprs += 2;
6089 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6090 shader->info.num_input_vgprs += 2;
6091 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6092 shader->info.num_input_vgprs += 2;
6093 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6094 shader->info.num_input_vgprs += 3;
6095 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6096 shader->info.num_input_vgprs += 2;
6097 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6098 shader->info.num_input_vgprs += 2;
6099 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6100 shader->info.num_input_vgprs += 2;
6101 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6102 shader->info.num_input_vgprs += 1;
6103 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6104 shader->info.num_input_vgprs += 1;
6105 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6106 shader->info.num_input_vgprs += 1;
6107 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6108 shader->info.num_input_vgprs += 1;
6109 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6110 shader->info.num_input_vgprs += 1;
6111 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6112 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6113 shader->info.num_input_vgprs += 1;
6114 }
6115 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6116 shader->info.num_input_vgprs += 1;
6117 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6118 shader->info.num_input_vgprs += 1;
6119 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6120 shader->info.num_input_vgprs += 1;
6121 }
6122
6123 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
6124 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6125 shader->gs_copy_shader->selector = shader->selector;
6126 ctx.shader = shader->gs_copy_shader;
6127 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6128 shader, debug))) {
6129 free(shader->gs_copy_shader);
6130 shader->gs_copy_shader = NULL;
6131 goto out;
6132 }
6133 }
6134
6135 out:
6136 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6137 FREE(ctx.constants[i]);
6138 return r;
6139 }
6140
6141 /**
6142 * Create, compile and return a shader part (prolog or epilog).
6143 *
6144 * \param sscreen screen
6145 * \param list list of shader parts of the same category
6146 * \param key shader part key
6147 * \param tm LLVM target machine
6148 * \param debug debug callback
6149 * \param compile the callback responsible for compilation
6150 * \return non-NULL on success
6151 */
6152 static struct si_shader_part *
6153 si_get_shader_part(struct si_screen *sscreen,
6154 struct si_shader_part **list,
6155 union si_shader_part_key *key,
6156 LLVMTargetMachineRef tm,
6157 struct pipe_debug_callback *debug,
6158 bool (*compile)(struct si_screen *,
6159 LLVMTargetMachineRef,
6160 struct pipe_debug_callback *,
6161 struct si_shader_part *))
6162 {
6163 struct si_shader_part *result;
6164
6165 pipe_mutex_lock(sscreen->shader_parts_mutex);
6166
6167 /* Find existing. */
6168 for (result = *list; result; result = result->next) {
6169 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6170 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6171 return result;
6172 }
6173 }
6174
6175 /* Compile a new one. */
6176 result = CALLOC_STRUCT(si_shader_part);
6177 result->key = *key;
6178 if (!compile(sscreen, tm, debug, result)) {
6179 FREE(result);
6180 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6181 return NULL;
6182 }
6183
6184 result->next = *list;
6185 *list = result;
6186 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6187 return result;
6188 }
6189
6190 /**
6191 * Create a vertex shader prolog.
6192 *
6193 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6194 * All inputs are returned unmodified. The vertex load indices are
6195 * stored after them, which will used by the API VS for fetching inputs.
6196 *
6197 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6198 * input_v0,
6199 * input_v1,
6200 * input_v2,
6201 * input_v3,
6202 * (VertexID + BaseVertex),
6203 * (InstanceID + StartInstance),
6204 * (InstanceID / 2 + StartInstance)
6205 */
6206 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6207 LLVMTargetMachineRef tm,
6208 struct pipe_debug_callback *debug,
6209 struct si_shader_part *out)
6210 {
6211 union si_shader_part_key *key = &out->key;
6212 struct si_shader shader = {};
6213 struct si_shader_context ctx;
6214 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6215 LLVMTypeRef *params, *returns;
6216 LLVMValueRef ret, func;
6217 int last_sgpr, num_params, num_returns, i;
6218 bool status = true;
6219
6220 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6221 ctx.type = TGSI_PROCESSOR_VERTEX;
6222 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6223 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6224
6225 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6226 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6227 sizeof(LLVMTypeRef));
6228 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6229 key->vs_prolog.last_input + 1) *
6230 sizeof(LLVMTypeRef));
6231 num_params = 0;
6232 num_returns = 0;
6233
6234 /* Declare input and output SGPRs. */
6235 num_params = 0;
6236 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6237 params[num_params++] = ctx.i32;
6238 returns[num_returns++] = ctx.i32;
6239 }
6240 last_sgpr = num_params - 1;
6241
6242 /* 4 preloaded VGPRs (outputs must be floats) */
6243 for (i = 0; i < 4; i++) {
6244 params[num_params++] = ctx.i32;
6245 returns[num_returns++] = ctx.f32;
6246 }
6247
6248 /* Vertex load indices. */
6249 for (i = 0; i <= key->vs_prolog.last_input; i++)
6250 returns[num_returns++] = ctx.f32;
6251
6252 /* Create the function. */
6253 si_create_function(&ctx, returns, num_returns, params,
6254 num_params, -1, last_sgpr);
6255 func = ctx.radeon_bld.main_fn;
6256
6257 /* Copy inputs to outputs. This should be no-op, as the registers match,
6258 * but it will prevent the compiler from overwriting them unintentionally.
6259 */
6260 ret = ctx.return_value;
6261 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6262 LLVMValueRef p = LLVMGetParam(func, i);
6263 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6264 }
6265 for (i = num_params - 4; i < num_params; i++) {
6266 LLVMValueRef p = LLVMGetParam(func, i);
6267 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6268 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6269 }
6270
6271 /* Compute vertex load indices from instance divisors. */
6272 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6273 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6274 LLVMValueRef index;
6275
6276 if (divisor) {
6277 /* InstanceID / Divisor + StartInstance */
6278 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6279 SI_SGPR_START_INSTANCE,
6280 divisor);
6281 } else {
6282 /* VertexID + BaseVertex */
6283 index = LLVMBuildAdd(gallivm->builder,
6284 LLVMGetParam(func, ctx.param_vertex_id),
6285 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6286 }
6287
6288 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6289 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6290 num_params++, "");
6291 }
6292
6293 /* Compile. */
6294 LLVMBuildRet(gallivm->builder, ret);
6295 radeon_llvm_finalize_module(&ctx.radeon_bld);
6296
6297 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6298 gallivm->module, debug, ctx.type,
6299 "Vertex Shader Prolog"))
6300 status = false;
6301
6302 radeon_llvm_dispose(&ctx.radeon_bld);
6303 return status;
6304 }
6305
6306 /**
6307 * Compile the vertex shader epilog. This is also used by the tessellation
6308 * evaluation shader compiled as VS.
6309 *
6310 * The input is PrimitiveID.
6311 *
6312 * If PrimitiveID is required by the pixel shader, export it.
6313 * Otherwise, do nothing.
6314 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = TGSI_PROCESSOR_VERTEX;

	/* Declare input VGPRs. Only declare up through PrimitiveID's slot,
	 * and only when it will actually be exported. */
	num_params = key->vs_epilog.states.export_prim_id ?
			     (VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   -1, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		/* llvm.SI.export argument block; the slot meanings are
		 * annotated inline below and must stay in this order. */
		LLVMValueRef args[9];

		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6378
6379 /**
6380 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
6381 */
6382 static bool si_get_vs_epilog(struct si_screen *sscreen,
6383 LLVMTargetMachineRef tm,
6384 struct si_shader *shader,
6385 struct pipe_debug_callback *debug,
6386 struct si_vs_epilog_bits *states)
6387 {
6388 union si_shader_part_key epilog_key;
6389
6390 memset(&epilog_key, 0, sizeof(epilog_key));
6391 epilog_key.vs_epilog.states = *states;
6392
6393 /* Set up the PrimitiveID output. */
6394 if (shader->key.vs.epilog.export_prim_id) {
6395 unsigned index = shader->selector->info.num_outputs;
6396 unsigned offset = shader->info.nr_param_exports++;
6397
6398 epilog_key.vs_epilog.prim_id_param_offset = offset;
6399 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6400 shader->info.vs_output_param_offset[index] = offset;
6401 }
6402
6403 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
6404 &epilog_key, tm, debug,
6405 si_compile_vs_epilog);
6406 return shader->epilog != NULL;
6407 }
6408
6409 /**
6410 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6411 */
6412 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6413 LLVMTargetMachineRef tm,
6414 struct si_shader *shader,
6415 struct pipe_debug_callback *debug)
6416 {
6417 struct tgsi_shader_info *info = &shader->selector->info;
6418 union si_shader_part_key prolog_key;
6419 unsigned i;
6420
6421 /* Get the prolog. */
6422 memset(&prolog_key, 0, sizeof(prolog_key));
6423 prolog_key.vs_prolog.states = shader->key.vs.prolog;
6424 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6425 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6426
6427 /* The prolog is a no-op if there are no inputs. */
6428 if (info->num_inputs) {
6429 shader->prolog =
6430 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6431 &prolog_key, tm, debug,
6432 si_compile_vs_prolog);
6433 if (!shader->prolog)
6434 return false;
6435 }
6436
6437 /* Get the epilog. */
6438 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
6439 !si_get_vs_epilog(sscreen, tm, shader, debug,
6440 &shader->key.vs.epilog))
6441 return false;
6442
6443 /* Set the instanceID flag. */
6444 for (i = 0; i < info->num_inputs; i++)
6445 if (prolog_key.vs_prolog.states.instance_divisors[i])
6446 shader->info.uses_instanceid = true;
6447
6448 return true;
6449 }
6450
6451 /**
6452 * Select and compile (or reuse) TES parts (epilog).
6453 */
6454 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
6455 LLVMTargetMachineRef tm,
6456 struct si_shader *shader,
6457 struct pipe_debug_callback *debug)
6458 {
6459 if (shader->key.tes.as_es)
6460 return true;
6461
6462 /* TES compiled as VS. */
6463 return si_get_vs_epilog(sscreen, tm, shader, debug,
6464 &shader->key.tes.epilog);
6465 }
6466
/**
 * Compile the TCS epilog. This writes tesselation factors to memory based on
 * the output primitive type of the tesselator (determined by TES).
 *
 * \param out  shader part to fill in; out->key selects the epilog variant,
 *             out->binary / out->config receive the compiled result
 * \return true on success
 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_array_pointer, last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = TGSI_PROCESSOR_TESS_CTRL;
	/* Transfer the epilog variant bits into the dummy shader key so the
	 * shared TCS codegen helpers see the right state. */
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used;
	 * the rest are declared to keep the SGPR layout identical to the
	 * main TCS part. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	last_array_pointer = SI_PARAM_RW_BUFFERS;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* Three VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* The tess factors are read back from LDS. */
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Emit the tess-factor stores using the three VGPRs declared above. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ctx.return_value);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6531
6532 /**
6533 * Select and compile (or reuse) TCS parts (epilog).
6534 */
6535 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6536 LLVMTargetMachineRef tm,
6537 struct si_shader *shader,
6538 struct pipe_debug_callback *debug)
6539 {
6540 union si_shader_part_key epilog_key;
6541
6542 /* Get the epilog. */
6543 memset(&epilog_key, 0, sizeof(epilog_key));
6544 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
6545
6546 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6547 &epilog_key, tm, debug,
6548 si_compile_tcs_epilog);
6549 return shader->epilog != NULL;
6550 }
6551
/**
 * Compile the pixel shader prolog. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overriden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 *
 * \param out  shader part to fill in; out->key selects the prolog variant
 * \return true on success
 */
static bool si_compile_ps_prolog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	LLVMTypeRef *params;
	LLVMValueRef ret, func;
	int last_sgpr, num_params, num_returns, i, num_color_channels;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = TGSI_PROCESSOR_FRAGMENT;
	shader.key.ps.prolog = key->ps_prolog.states;

	/* Number of inputs + 8 color elements. */
	params = alloca((key->ps_prolog.num_input_sgprs +
			 key->ps_prolog.num_input_vgprs + 8) *
			sizeof(LLVMTypeRef));

	/* Declare inputs: all SGPRs first, then all VGPRs. */
	num_params = 0;
	for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
		params[num_params++] = ctx.i32;
	last_sgpr = num_params - 1;

	for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
		params[num_params++] = ctx.f32;

	/* Declare outputs (same as inputs + add colors if needed).
	 * One f32 return element is appended per color channel read. */
	num_returns = num_params;
	num_color_channels = util_bitcount(key->ps_prolog.colors_read);
	for (i = 0; i < num_color_channels; i++)
		params[num_returns++] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, params, num_returns, params,
			   num_params, -1, last_sgpr);
	func = ctx.radeon_bld.main_fn;

	/* Copy inputs to outputs. This should be no-op, as the registers match,
	 * but it will prevent the compiler from overwriting them unintentionally.
	 */
	ret = ctx.return_value;
	for (i = 0; i < num_params; i++) {
		LLVMValueRef p = LLVMGetParam(func, i);
		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
	}

	/* Polygon stippling. */
	if (key->ps_prolog.states.poly_stipple) {
		/* POS_FIXED_PT is always last. */
		unsigned pos = key->ps_prolog.num_input_sgprs +
			       key->ps_prolog.num_input_vgprs - 1;
		LLVMValueRef ptr[2], views;

		/* Get the pointer to sampler views: reassemble the 64-bit
		 * descriptor address from the two 32-bit SGPRs. */
		ptr[0] = LLVMGetParam(func, SI_SGPR_SAMPLERS);
		ptr[1] = LLVMGetParam(func, SI_SGPR_SAMPLERS+1);
		views = lp_build_gather_values(gallivm, ptr, 2);
		views = LLVMBuildBitCast(gallivm->builder, views, ctx.i64, "");
		views = LLVMBuildIntToPtr(gallivm->builder, views,
					  const_array(ctx.v8i32, SI_NUM_SAMPLERS), "");

		si_llvm_emit_polygon_stipple(&ctx, views, pos);
	}

	/* Interpolate colors (front/back color pair, i = 0..1). */
	for (i = 0; i < 2; i++) {
		unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
		unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
				     key->ps_prolog.face_vgpr_index;
		LLVMValueRef interp[2], color[4];
		LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;

		if (!writemask)
			continue;

		/* If the interpolation qualifier is not CONSTANT (-1),
		 * gather the (i, j) barycentric pair for this color. */
		if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
			unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
					       key->ps_prolog.color_interp_vgpr_index[i];

			interp[0] = LLVMGetParam(func, interp_vgpr);
			interp[1] = LLVMGetParam(func, interp_vgpr + 1);
			interp_ij = lp_build_gather_values(gallivm, interp, 2);
			interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
						     ctx.v2i32, "");
		}

		/* Use the absolute location of the input. */
		prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);

		/* Two-side color selection needs the front-face VGPR. */
		if (key->ps_prolog.states.color_two_side) {
			face = LLVMGetParam(func, face_vgpr);
			face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
		}

		interp_fs_input(&ctx,
				key->ps_prolog.color_attr_index[i],
				TGSI_SEMANTIC_COLOR, i,
				key->ps_prolog.num_interp_inputs,
				key->ps_prolog.colors_read, interp_ij,
				prim_mask, face, color);

		/* Append the interpolated channels after the pass-through
		 * inputs in the return value. */
		while (writemask) {
			unsigned chan = u_bit_scan(&writemask);
			ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
						   num_params++, "");
		}
	}

	/* Force per-sample interpolation: replace the CENTER and CENTROID
	 * barycentrics with the SAMPLE ones in the returned VGPRs.
	 * NOTE(review): the fixed offsets below assume the standard VGPR
	 * layout PERSP_{SAMPLE,CENTER,CENTROID}, LINEAR_{SAMPLE,CENTER,
	 * CENTROID} at the start of the PS VGPRs — confirm against the
	 * SPI input layout if this ever changes. */
	if (key->ps_prolog.states.force_persample_interp) {
		unsigned i, base = key->ps_prolog.num_input_sgprs;
		LLVMValueRef persp_sample[2], linear_sample[2];

		/* Read PERSP_SAMPLE. */
		for (i = 0; i < 2; i++)
			persp_sample[i] = LLVMGetParam(func, base + i);
		/* Overwrite PERSP_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 2 + i, "");
		/* Overwrite PERSP_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   persp_sample[i], base + 4 + i, "");
		/* Read LINEAR_SAMPLE. */
		for (i = 0; i < 2; i++)
			linear_sample[i] = LLVMGetParam(func, base + 6 + i);
		/* Overwrite LINEAR_CENTER. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 8 + i, "");
		/* Overwrite LINEAR_CENTROID. */
		for (i = 0; i < 2; i++)
			ret = LLVMBuildInsertValue(gallivm->builder, ret,
						   linear_sample[i], base + 10 + i, "");
	}

	/* Compile. */
	LLVMBuildRet(gallivm->builder, ret);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Prolog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6718
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 *
 * \param out  shader part to fill in; out->key selects the epilog variant
 * \return true on success
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* 16 SGPR params + up to 8 MRTs * 4 channels + Z/stencil/samplemask */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_array_pointer, last_sgpr, num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = TGSI_PROCESSOR_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. Only ALPHA_REF is actually used; the others
	 * keep the layout identical to the main shader part. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_array_pointer = -1;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: 4 channels per written color buffer, then
	 * optional Z, stencil and sample mask. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Ensure the sample mask VGPR slot always exists, so its fixed
	 * location is valid regardless of what the shader writes. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params,
			   last_array_pointer, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, which must carry the "done" bit.
	 * Only needed when there is no Z/stencil/samplemask export, which
	 * would otherwise be the last one. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	/* Emit one export per written MRT, consuming 4 VGPRs each. */
	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask);
	else if (last_color_export == -1)
		/* The hardware requires at least one export per PS. */
		si_export_null(bld_base);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(&ctx.radeon_bld);

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
6836
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Also fixes up shader->config.spi_ps_input_ena to match what the selected
 * prolog actually needs (two-side colors, per-sample interpolation,
 * polygon stippling) and to satisfy hardware minimum requirements.
 *
 * \return false if building either part failed.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen,
				      LLVMTargetMachineRef tm,
				      struct si_shader *shader,
				      struct pipe_debug_callback *debug)
{
	struct tgsi_shader_info *info = &shader->selector->info;
	union si_shader_part_key prolog_key;
	union si_shader_part_key epilog_key;
	unsigned i;

	/* Get the prolog. */
	memset(&prolog_key, 0, sizeof(prolog_key));
	prolog_key.ps_prolog.states = shader->key.ps.prolog;
	prolog_key.ps_prolog.colors_read = info->colors_read;
	prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
	prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;

	if (info->colors_read) {
		unsigned *color = shader->selector->color_attr_index;

		if (shader->key.ps.prolog.color_two_side) {
			/* BCOLORs are stored after the last input. */
			prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
			prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
			shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
		}

		/* For each of the two color inputs, pick which barycentric
		 * VGPR pair the prolog should interpolate with, and enable
		 * the corresponding SPI input. */
		for (i = 0; i < 2; i++) {
			unsigned location = info->input_interpolate_loc[color[i]];

			if (!(info->colors_read & (0xf << i*4)))
				continue;

			prolog_key.ps_prolog.color_attr_index[i] = color[i];

			/* Force per-sample interpolation for the colors here. */
			if (shader->key.ps.prolog.force_persample_interp)
				location = TGSI_INTERPOLATE_LOC_SAMPLE;

			switch (info->input_interpolate[color[i]]) {
			case TGSI_INTERPOLATE_CONSTANT:
				/* -1 means "no interpolation VGPRs needed". */
				prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
				break;
			case TGSI_INTERPOLATE_PERSPECTIVE:
			case TGSI_INTERPOLATE_COLOR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
					shader->config.spi_ps_input_ena |=
						S_0286CC_PERSP_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			case TGSI_INTERPOLATE_LINEAR:
				switch (location) {
				case TGSI_INTERPOLATE_LOC_SAMPLE:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_SAMPLE_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTER:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTER_ENA(1);
					break;
				case TGSI_INTERPOLATE_LOC_CENTROID:
					prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
					shader->config.spi_ps_input_ena |=
						S_0286CC_LINEAR_CENTROID_ENA(1);
					break;
				default:
					assert(0);
				}
				break;
			default:
				assert(0);
			}
		}
	}

	/* The prolog is a no-op if these aren't set. */
	if (prolog_key.ps_prolog.colors_read ||
	    prolog_key.ps_prolog.states.force_persample_interp ||
	    prolog_key.ps_prolog.states.poly_stipple) {
		shader->prolog =
			si_get_shader_part(sscreen, &sscreen->ps_prologs,
					   &prolog_key, tm, debug,
					   si_compile_ps_prolog);
		if (!shader->prolog)
			return false;
	}

	/* Get the epilog. */
	memset(&epilog_key, 0, sizeof(epilog_key));
	epilog_key.ps_epilog.colors_written = info->colors_written;
	epilog_key.ps_epilog.writes_z = info->writes_z;
	epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
	epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
	epilog_key.ps_epilog.states = shader->key.ps.epilog;

	shader->epilog =
		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
				   &epilog_key, tm, debug,
				   si_compile_ps_epilog);
	if (!shader->epilog)
		return false;

	/* Enable POS_FIXED_PT if polygon stippling is enabled. */
	if (shader->key.ps.prolog.poly_stipple) {
		shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
		assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
	}

	/* Set up the enable bits for per-sample shading if needed:
	 * replace the CENTER/CENTROID enables with SAMPLE, matching what
	 * the prolog does with the barycentric VGPRs. */
	if (shader->key.ps.prolog.force_persample_interp) {
		if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
		}
		if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
		    G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
			shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
			shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
		}
	}

	/* POW_W_FLOAT requires that one of the perspective weights is enabled. */
	if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
	    !(shader->config.spi_ps_input_ena & 0xf)) {
		shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
		assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* At least one pair of interpolation weights must be enabled. */
	if (!(shader->config.spi_ps_input_ena & 0x7f)) {
		shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
		assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
	}

	/* The sample mask input is always enabled, because the API shader always
	 * passes it through to the epilog. Disable it here if it's unused.
	 */
	if (!shader->key.ps.epilog.poly_line_smoothing &&
	    !shader->selector->info.reads_samplemask)
		shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;

	return true;
}
7003
7004 static void si_fix_num_sgprs(struct si_shader *shader)
7005 {
7006 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7007
7008 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7009 }
7010
/**
 * Create a shader variant: either compile it monolithically, or reuse the
 * selector's precompiled main part and attach prolog/epilog parts to it.
 *
 * \return 0 on success, non-zero on compilation or upload failure
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader *mainp = shader->selector->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (shader->selector->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    shader->selector->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over. The binary is
		 * shared, not duplicated — see si_shader_destroy. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (shader->selector->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the whole pipeline of parts
		 * must fit in the maximum of the parts' requirements. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7114
7115 void si_shader_destroy(struct si_shader *shader)
7116 {
7117 if (shader->gs_copy_shader) {
7118 si_shader_destroy(shader->gs_copy_shader);
7119 FREE(shader->gs_copy_shader);
7120 }
7121
7122 if (shader->scratch_bo)
7123 r600_resource_reference(&shader->scratch_bo, NULL);
7124
7125 r600_resource_reference(&shader->bo, NULL);
7126
7127 if (!shader->is_binary_shared)
7128 radeon_shader_binary_clean(&shader->binary);
7129 }