radeonsi: decrease GS copy shader user SGPRs to 2
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_bitarit.h"
35 #include "gallivm/lp_bld_flow.h"
36 #include "radeon/r600_cs.h"
37 #include "radeon/radeon_llvm.h"
38 #include "radeon/radeon_elf_util.h"
39 #include "radeon/radeon_llvm_emit.h"
40 #include "util/u_memory.h"
41 #include "util/u_pstipple.h"
42 #include "util/u_string.h"
43 #include "tgsi/tgsi_parse.h"
44 #include "tgsi/tgsi_build.h"
45 #include "tgsi/tgsi_util.h"
46 #include "tgsi/tgsi_dump.h"
47
48 #include "si_pipe.h"
49 #include "si_shader.h"
50 #include "sid.h"
51
52 #include <errno.h>
53
/* Symbol-name strings for dwords 0 and 1 of SCRATCH_RSRC (the code that
 * looks these names up is outside this part of the file). */
static const char *scratch_rsrc_dword0_symbol = "SCRATCH_RSRC_DWORD0";
static const char *scratch_rsrc_dword1_symbol = "SCRATCH_RSRC_DWORD1";
59
60 struct si_shader_output_values
61 {
62 LLVMValueRef values[4];
63 unsigned name;
64 unsigned sid;
65 };
66
67 struct si_shader_context
68 {
69 struct radeon_llvm_context radeon_bld;
70 struct si_shader *shader;
71 struct si_screen *screen;
72
73 unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
74 bool is_gs_copy_shader;
75
76 /* Whether to generate the optimized shader variant compiled as a whole
77 * (without a prolog and epilog)
78 */
79 bool is_monolithic;
80
81 int param_streamout_config;
82 int param_streamout_write_index;
83 int param_streamout_offset[4];
84 int param_vertex_id;
85 int param_rel_auto_id;
86 int param_vs_prim_id;
87 int param_instance_id;
88 int param_vertex_index0;
89 int param_tes_u;
90 int param_tes_v;
91 int param_tes_rel_patch_id;
92 int param_tes_patch_id;
93 int param_es2gs_offset;
94
95 LLVMTargetMachineRef tm;
96
97 LLVMValueRef const_md;
98 LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
99 LLVMValueRef lds;
100 LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
101 LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
102 LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
103 LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
104 LLVMValueRef fmasks[SI_NUM_SAMPLERS];
105 LLVMValueRef images[SI_NUM_IMAGES];
106 LLVMValueRef so_buffers[4];
107 LLVMValueRef esgs_ring;
108 LLVMValueRef gsvs_ring[4];
109 LLVMValueRef gs_next_vertex[4];
110 LLVMValueRef return_value;
111
112 LLVMTypeRef voidt;
113 LLVMTypeRef i1;
114 LLVMTypeRef i8;
115 LLVMTypeRef i32;
116 LLVMTypeRef i64;
117 LLVMTypeRef i128;
118 LLVMTypeRef f32;
119 LLVMTypeRef v16i8;
120 LLVMTypeRef v2i32;
121 LLVMTypeRef v4i32;
122 LLVMTypeRef v4f32;
123 LLVMTypeRef v8i32;
124
125 LLVMValueRef shared_memory;
126 };
127
/* Recover the si_shader_context from a TGSI build context by a plain
 * pointer cast. NOTE(review): this relies on the build context sitting at
 * offset 0 of si_shader_context - confirm against the nested struct layouts.
 */
static struct si_shader_context *
si_shader_context(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = (struct si_shader_context *)bld_base;

	return ctx;
}
133
134 static void si_init_shader_ctx(struct si_shader_context *ctx,
135 struct si_screen *sscreen,
136 struct si_shader *shader,
137 LLVMTargetMachineRef tm);
138
/* Preferred location of the sample mask input for the PS epilog: v13 is
 * where it usually lives, so passing it there spares the shader a v_mov.
 */
#define PS_EPILOG_SAMPLEMASK_MIN_LOC	13

/* PrimitiveID keeps its VS input location in the epilog, so the main
 * shader part never has to move it.
 */
#define VS_EPILOG_PRIMID_LOC		2

#define PERSPECTIVE_BASE		0
#define LINEAR_BASE			9

#define SAMPLE_OFFSET			0
#define CENTER_OFFSET			2
/* NOTE(review): "OFSET" is the macro's actual spelling; users may live
 * outside this chunk, so it is kept as-is. */
#define CENTROID_OFSET			4

#define USE_SGPR_MAX_SUFFIX_LEN		5
#define CONST_ADDR_SPACE		2
#define LOCAL_ADDR_SPACE		3
#define USER_SGPR_ADDR_SPACE		8


/* sendmsg message IDs. */
#define SENDMSG_GS			2
#define SENDMSG_GS_DONE			3

/* GS operation values, already shifted left by 4. */
#define SENDMSG_GS_OP_NOP		(0 << 4)
#define SENDMSG_GS_OP_CUT		(1 << 4)
#define SENDMSG_GS_OP_EMIT		(2 << 4)
#define SENDMSG_GS_OP_EMIT_CUT		(3 << 4)
169
170 /**
171 * Returns a unique index for a semantic name and index. The index must be
172 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
173 * calculated.
174 */
175 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
176 {
177 switch (semantic_name) {
178 case TGSI_SEMANTIC_POSITION:
179 return 0;
180 case TGSI_SEMANTIC_PSIZE:
181 return 1;
182 case TGSI_SEMANTIC_CLIPDIST:
183 assert(index <= 1);
184 return 2 + index;
185 case TGSI_SEMANTIC_GENERIC:
186 if (index <= 63-4)
187 return 4 + index;
188 else
189 /* same explanation as in the default statement,
190 * the only user hitting this is st/nine.
191 */
192 return 0;
193
194 /* patch indices are completely separate and thus start from 0 */
195 case TGSI_SEMANTIC_TESSOUTER:
196 return 0;
197 case TGSI_SEMANTIC_TESSINNER:
198 return 1;
199 case TGSI_SEMANTIC_PATCH:
200 return 2 + index;
201
202 default:
203 /* Don't fail here. The result of this function is only used
204 * for LS, TCS, TES, and GS, where legacy GL semantics can't
205 * occur, but this function is called for all vertex shaders
206 * before it's known whether LS will be compiled or not.
207 */
208 return 0;
209 }
210 }
211
212 /**
213 * Get the value of a shader input parameter and extract a bitfield.
214 */
215 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
216 unsigned param, unsigned rshift,
217 unsigned bitwidth)
218 {
219 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
220 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
221 param);
222
223 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
224 value = bitcast(&ctx->radeon_bld.soa.bld_base,
225 TGSI_TYPE_UNSIGNED, value);
226
227 if (rshift)
228 value = LLVMBuildLShr(gallivm->builder, value,
229 lp_build_const_int32(gallivm, rshift), "");
230
231 if (rshift + bitwidth < 32) {
232 unsigned mask = (1 << bitwidth) - 1;
233 value = LLVMBuildAnd(gallivm->builder, value,
234 lp_build_const_int32(gallivm, mask), "");
235 }
236
237 return value;
238 }
239
240 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
241 {
242 switch (ctx->type) {
243 case TGSI_PROCESSOR_TESS_CTRL:
244 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
245
246 case TGSI_PROCESSOR_TESS_EVAL:
247 return LLVMGetParam(ctx->radeon_bld.main_fn,
248 ctx->param_tes_rel_patch_id);
249
250 default:
251 assert(0);
252 return NULL;
253 }
254 }
255
256 /* Tessellation shaders pass outputs to the next shader using LDS.
257 *
258 * LS outputs = TCS inputs
259 * TCS outputs = TES inputs
260 *
261 * The LDS layout is:
262 * - TCS inputs for patch 0
263 * - TCS inputs for patch 1
264 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
265 * - ...
266 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
267 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
268 * - TCS outputs for patch 1
269 * - Per-patch TCS outputs for patch 1
270 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
271 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
272 * - ...
273 *
274 * All three shaders VS(LS), TCS, TES share the same LDS space.
275 */
276
277 static LLVMValueRef
278 get_tcs_in_patch_stride(struct si_shader_context *ctx)
279 {
280 if (ctx->type == TGSI_PROCESSOR_VERTEX)
281 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
282 else if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
283 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
284 else {
285 assert(0);
286 return NULL;
287 }
288 }
289
290 static LLVMValueRef
291 get_tcs_out_patch_stride(struct si_shader_context *ctx)
292 {
293 return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
294 }
295
296 static LLVMValueRef
297 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
298 {
299 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
300 unpack_param(ctx,
301 SI_PARAM_TCS_OUT_OFFSETS,
302 0, 16),
303 4);
304 }
305
306 static LLVMValueRef
307 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
308 {
309 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
310 unpack_param(ctx,
311 SI_PARAM_TCS_OUT_OFFSETS,
312 16, 16),
313 4);
314 }
315
316 static LLVMValueRef
317 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
318 {
319 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
320 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
321 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
322
323 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
324 }
325
326 static LLVMValueRef
327 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
328 {
329 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
330 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
331 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
332 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
333
334 return LLVMBuildAdd(gallivm->builder, patch0_offset,
335 LLVMBuildMul(gallivm->builder, patch_stride,
336 rel_patch_id, ""),
337 "");
338 }
339
340 static LLVMValueRef
341 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
342 {
343 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
344 LLVMValueRef patch0_patch_data_offset =
345 get_tcs_out_patch0_patch_data_offset(ctx);
346 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
347 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
348
349 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
350 LLVMBuildMul(gallivm->builder, patch_stride,
351 rel_patch_id, ""),
352 "");
353 }
354
355 static void build_indexed_store(struct si_shader_context *ctx,
356 LLVMValueRef base_ptr, LLVMValueRef index,
357 LLVMValueRef value)
358 {
359 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
360 struct gallivm_state *gallivm = bld_base->base.gallivm;
361 LLVMValueRef indices[2], pointer;
362
363 indices[0] = bld_base->uint_bld.zero;
364 indices[1] = index;
365
366 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
367 LLVMBuildStore(gallivm->builder, value, pointer);
368 }
369
370 /**
371 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
372 * It's equivalent to doing a load from &base_ptr[index].
373 *
374 * \param base_ptr Where the array starts.
375 * \param index The element index into the array.
376 */
377 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
378 LLVMValueRef base_ptr, LLVMValueRef index)
379 {
380 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
381 struct gallivm_state *gallivm = bld_base->base.gallivm;
382 LLVMValueRef indices[2], pointer;
383
384 indices[0] = bld_base->uint_bld.zero;
385 indices[1] = index;
386
387 pointer = LLVMBuildGEP(gallivm->builder, base_ptr, indices, 2, "");
388 return LLVMBuildLoad(gallivm->builder, pointer, "");
389 }
390
391 /**
392 * Do a load from &base_ptr[index], but also add a flag that it's loading
393 * a constant.
394 */
395 static LLVMValueRef build_indexed_load_const(
396 struct si_shader_context *ctx,
397 LLVMValueRef base_ptr, LLVMValueRef index)
398 {
399 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index);
400 LLVMSetMetadata(result, 1, ctx->const_md);
401 return result;
402 }
403
404 static LLVMValueRef get_instance_index_for_fetch(
405 struct radeon_llvm_context *radeon_bld,
406 unsigned param_start_instance, unsigned divisor)
407 {
408 struct si_shader_context *ctx =
409 si_shader_context(&radeon_bld->soa.bld_base);
410 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
411
412 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
413 ctx->param_instance_id);
414
415 /* The division must be done before START_INSTANCE is added. */
416 if (divisor > 1)
417 result = LLVMBuildUDiv(gallivm->builder, result,
418 lp_build_const_int32(gallivm, divisor), "");
419
420 return LLVMBuildAdd(gallivm->builder, result,
421 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
422 }
423
424 static void declare_input_vs(
425 struct radeon_llvm_context *radeon_bld,
426 unsigned input_index,
427 const struct tgsi_full_declaration *decl)
428 {
429 struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
430 struct gallivm_state *gallivm = base->gallivm;
431 struct si_shader_context *ctx =
432 si_shader_context(&radeon_bld->soa.bld_base);
433 unsigned divisor =
434 ctx->shader->key.vs.prolog.instance_divisors[input_index];
435
436 unsigned chan;
437
438 LLVMValueRef t_list_ptr;
439 LLVMValueRef t_offset;
440 LLVMValueRef t_list;
441 LLVMValueRef attribute_offset;
442 LLVMValueRef buffer_index;
443 LLVMValueRef args[3];
444 LLVMValueRef input;
445
446 /* Load the T list */
447 t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);
448
449 t_offset = lp_build_const_int32(gallivm, input_index);
450
451 t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);
452
453 /* Build the attribute offset */
454 attribute_offset = lp_build_const_int32(gallivm, 0);
455
456 if (!ctx->is_monolithic) {
457 buffer_index = LLVMGetParam(radeon_bld->main_fn,
458 ctx->param_vertex_index0 +
459 input_index);
460 } else if (divisor) {
461 /* Build index from instance ID, start instance and divisor */
462 ctx->shader->info.uses_instanceid = true;
463 buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
464 SI_PARAM_START_INSTANCE,
465 divisor);
466 } else {
467 /* Load the buffer index for vertices. */
468 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
469 ctx->param_vertex_id);
470 LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
471 SI_PARAM_BASE_VERTEX);
472 buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
473 }
474
475 args[0] = t_list;
476 args[1] = attribute_offset;
477 args[2] = buffer_index;
478 input = lp_build_intrinsic(gallivm->builder,
479 "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
480 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
481
482 /* Break up the vec4 into individual components */
483 for (chan = 0; chan < 4; chan++) {
484 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
485 /* XXX: Use a helper function for this. There is one in
486 * tgsi_llvm.c. */
487 ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
488 LLVMBuildExtractElement(gallivm->builder,
489 input, llvm_chan, "");
490 }
491 }
492
493 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
494 unsigned swizzle)
495 {
496 struct si_shader_context *ctx = si_shader_context(bld_base);
497
498 if (swizzle > 0)
499 return bld_base->uint_bld.zero;
500
501 switch (ctx->type) {
502 case TGSI_PROCESSOR_VERTEX:
503 return LLVMGetParam(ctx->radeon_bld.main_fn,
504 ctx->param_vs_prim_id);
505 case TGSI_PROCESSOR_TESS_CTRL:
506 return LLVMGetParam(ctx->radeon_bld.main_fn,
507 SI_PARAM_PATCH_ID);
508 case TGSI_PROCESSOR_TESS_EVAL:
509 return LLVMGetParam(ctx->radeon_bld.main_fn,
510 ctx->param_tes_patch_id);
511 case TGSI_PROCESSOR_GEOMETRY:
512 return LLVMGetParam(ctx->radeon_bld.main_fn,
513 SI_PARAM_PRIMITIVE_ID);
514 default:
515 assert(0);
516 return bld_base->uint_bld.zero;
517 }
518 }
519
520 /**
521 * Return the value of tgsi_ind_register for indexing.
522 * This is the indirect index with the constant offset added to it.
523 */
524 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
525 const struct tgsi_ind_register *ind,
526 int rel_index)
527 {
528 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
529 LLVMValueRef result;
530
531 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
532 result = LLVMBuildLoad(gallivm->builder, result, "");
533 result = LLVMBuildAdd(gallivm->builder, result,
534 lp_build_const_int32(gallivm, rel_index), "");
535 return result;
536 }
537
538 /**
539 * Like get_indirect_index, but restricts the return value to a (possibly
540 * undefined) value inside [0..num).
541 */
542 static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
543 const struct tgsi_ind_register *ind,
544 int rel_index, unsigned num)
545 {
546 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
547 LLVMBuilderRef builder = gallivm->builder;
548 LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
549 LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
550 LLVMValueRef cc;
551
552 if (util_is_power_of_two(num)) {
553 result = LLVMBuildAnd(builder, result, c_max, "");
554 } else {
555 /* In theory, this MAX pattern should result in code that is
556 * as good as the bit-wise AND above.
557 *
558 * In practice, LLVM generates worse code (at the time of
559 * writing), because its value tracking is not strong enough.
560 */
561 cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
562 result = LLVMBuildSelect(builder, cc, result, c_max, "");
563 }
564
565 return result;
566 }
567
568
569 /**
570 * Calculate a dword address given an input or output register and a stride.
571 */
572 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
573 const struct tgsi_full_dst_register *dst,
574 const struct tgsi_full_src_register *src,
575 LLVMValueRef vertex_dw_stride,
576 LLVMValueRef base_addr)
577 {
578 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
579 struct tgsi_shader_info *info = &ctx->shader->selector->info;
580 ubyte *name, *index, *array_first;
581 int first, param;
582 struct tgsi_full_dst_register reg;
583
584 /* Set the register description. The address computation is the same
585 * for sources and destinations. */
586 if (src) {
587 reg.Register.File = src->Register.File;
588 reg.Register.Index = src->Register.Index;
589 reg.Register.Indirect = src->Register.Indirect;
590 reg.Register.Dimension = src->Register.Dimension;
591 reg.Indirect = src->Indirect;
592 reg.Dimension = src->Dimension;
593 reg.DimIndirect = src->DimIndirect;
594 } else
595 reg = *dst;
596
597 /* If the register is 2-dimensional (e.g. an array of vertices
598 * in a primitive), calculate the base address of the vertex. */
599 if (reg.Register.Dimension) {
600 LLVMValueRef index;
601
602 if (reg.Dimension.Indirect)
603 index = get_indirect_index(ctx, &reg.DimIndirect,
604 reg.Dimension.Index);
605 else
606 index = lp_build_const_int32(gallivm, reg.Dimension.Index);
607
608 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
609 LLVMBuildMul(gallivm->builder, index,
610 vertex_dw_stride, ""), "");
611 }
612
613 /* Get information about the register. */
614 if (reg.Register.File == TGSI_FILE_INPUT) {
615 name = info->input_semantic_name;
616 index = info->input_semantic_index;
617 array_first = info->input_array_first;
618 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
619 name = info->output_semantic_name;
620 index = info->output_semantic_index;
621 array_first = info->output_array_first;
622 } else {
623 assert(0);
624 return NULL;
625 }
626
627 if (reg.Register.Indirect) {
628 /* Add the relative address of the element. */
629 LLVMValueRef ind_index;
630
631 if (reg.Indirect.ArrayID)
632 first = array_first[reg.Indirect.ArrayID];
633 else
634 first = reg.Register.Index;
635
636 ind_index = get_indirect_index(ctx, &reg.Indirect,
637 reg.Register.Index - first);
638
639 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
640 LLVMBuildMul(gallivm->builder, ind_index,
641 lp_build_const_int32(gallivm, 4), ""), "");
642
643 param = si_shader_io_get_unique_index(name[first], index[first]);
644 } else {
645 param = si_shader_io_get_unique_index(name[reg.Register.Index],
646 index[reg.Register.Index]);
647 }
648
649 /* Add the base address of the element. */
650 return LLVMBuildAdd(gallivm->builder, base_addr,
651 lp_build_const_int32(gallivm, param * 4), "");
652 }
653
654 /**
655 * Load from LDS.
656 *
657 * \param type output value type
658 * \param swizzle offset (typically 0..3); it can be ~0, which loads a vec4
659 * \param dw_addr address in dwords
660 */
661 static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
662 enum tgsi_opcode_type type, unsigned swizzle,
663 LLVMValueRef dw_addr)
664 {
665 struct si_shader_context *ctx = si_shader_context(bld_base);
666 struct gallivm_state *gallivm = bld_base->base.gallivm;
667 LLVMValueRef value;
668
669 if (swizzle == ~0) {
670 LLVMValueRef values[TGSI_NUM_CHANNELS];
671
672 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
673 values[chan] = lds_load(bld_base, type, chan, dw_addr);
674
675 return lp_build_gather_values(bld_base->base.gallivm, values,
676 TGSI_NUM_CHANNELS);
677 }
678
679 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
680 lp_build_const_int32(gallivm, swizzle));
681
682 value = build_indexed_load(ctx, ctx->lds, dw_addr);
683 if (type == TGSI_TYPE_DOUBLE) {
684 LLVMValueRef value2;
685 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
686 lp_build_const_int32(gallivm, swizzle + 1));
687 value2 = build_indexed_load(ctx, ctx->lds, dw_addr);
688 return radeon_llvm_emit_fetch_double(bld_base, value, value2);
689 }
690
691 return LLVMBuildBitCast(gallivm->builder, value,
692 tgsi2llvmtype(bld_base, type), "");
693 }
694
695 /**
696 * Store to LDS.
697 *
698 * \param swizzle offset (typically 0..3)
699 * \param dw_addr address in dwords
700 * \param value value to store
701 */
702 static void lds_store(struct lp_build_tgsi_context *bld_base,
703 unsigned swizzle, LLVMValueRef dw_addr,
704 LLVMValueRef value)
705 {
706 struct si_shader_context *ctx = si_shader_context(bld_base);
707 struct gallivm_state *gallivm = bld_base->base.gallivm;
708
709 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
710 lp_build_const_int32(gallivm, swizzle));
711
712 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
713 build_indexed_store(ctx, ctx->lds,
714 dw_addr, value);
715 }
716
717 static LLVMValueRef fetch_input_tcs(
718 struct lp_build_tgsi_context *bld_base,
719 const struct tgsi_full_src_register *reg,
720 enum tgsi_opcode_type type, unsigned swizzle)
721 {
722 struct si_shader_context *ctx = si_shader_context(bld_base);
723 LLVMValueRef dw_addr, stride;
724
725 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
726 dw_addr = get_tcs_in_current_patch_offset(ctx);
727 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
728
729 return lds_load(bld_base, type, swizzle, dw_addr);
730 }
731
732 static LLVMValueRef fetch_output_tcs(
733 struct lp_build_tgsi_context *bld_base,
734 const struct tgsi_full_src_register *reg,
735 enum tgsi_opcode_type type, unsigned swizzle)
736 {
737 struct si_shader_context *ctx = si_shader_context(bld_base);
738 LLVMValueRef dw_addr, stride;
739
740 if (reg->Register.Dimension) {
741 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
742 dw_addr = get_tcs_out_current_patch_offset(ctx);
743 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
744 } else {
745 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
746 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
747 }
748
749 return lds_load(bld_base, type, swizzle, dw_addr);
750 }
751
752 static LLVMValueRef fetch_input_tes(
753 struct lp_build_tgsi_context *bld_base,
754 const struct tgsi_full_src_register *reg,
755 enum tgsi_opcode_type type, unsigned swizzle)
756 {
757 struct si_shader_context *ctx = si_shader_context(bld_base);
758 LLVMValueRef dw_addr, stride;
759
760 if (reg->Register.Dimension) {
761 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
762 dw_addr = get_tcs_out_current_patch_offset(ctx);
763 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
764 } else {
765 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
766 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
767 }
768
769 return lds_load(bld_base, type, swizzle, dw_addr);
770 }
771
772 static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
773 const struct tgsi_full_instruction *inst,
774 const struct tgsi_opcode_info *info,
775 LLVMValueRef dst[4])
776 {
777 struct si_shader_context *ctx = si_shader_context(bld_base);
778 const struct tgsi_full_dst_register *reg = &inst->Dst[0];
779 unsigned chan_index;
780 LLVMValueRef dw_addr, stride;
781
782 /* Only handle per-patch and per-vertex outputs here.
783 * Vectors will be lowered to scalars and this function will be called again.
784 */
785 if (reg->Register.File != TGSI_FILE_OUTPUT ||
786 (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
787 radeon_llvm_emit_store(bld_base, inst, info, dst);
788 return;
789 }
790
791 if (reg->Register.Dimension) {
792 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
793 dw_addr = get_tcs_out_current_patch_offset(ctx);
794 dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
795 } else {
796 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
797 dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
798 }
799
800 TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
801 LLVMValueRef value = dst[chan_index];
802
803 if (inst->Instruction.Saturate)
804 value = radeon_llvm_saturate(bld_base, value);
805
806 lds_store(bld_base, chan_index, dw_addr, value);
807 }
808 }
809
810 static LLVMValueRef fetch_input_gs(
811 struct lp_build_tgsi_context *bld_base,
812 const struct tgsi_full_src_register *reg,
813 enum tgsi_opcode_type type,
814 unsigned swizzle)
815 {
816 struct lp_build_context *base = &bld_base->base;
817 struct si_shader_context *ctx = si_shader_context(bld_base);
818 struct si_shader *shader = ctx->shader;
819 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
820 struct gallivm_state *gallivm = base->gallivm;
821 LLVMValueRef vtx_offset;
822 LLVMValueRef args[9];
823 unsigned vtx_offset_param;
824 struct tgsi_shader_info *info = &shader->selector->info;
825 unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
826 unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
827 unsigned param;
828 LLVMValueRef value;
829
830 if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
831 return get_primitive_id(bld_base, swizzle);
832
833 if (!reg->Register.Dimension)
834 return NULL;
835
836 if (swizzle == ~0) {
837 LLVMValueRef values[TGSI_NUM_CHANNELS];
838 unsigned chan;
839 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
840 values[chan] = fetch_input_gs(bld_base, reg, type, chan);
841 }
842 return lp_build_gather_values(bld_base->base.gallivm, values,
843 TGSI_NUM_CHANNELS);
844 }
845
846 /* Get the vertex offset parameter */
847 vtx_offset_param = reg->Dimension.Index;
848 if (vtx_offset_param < 2) {
849 vtx_offset_param += SI_PARAM_VTX0_OFFSET;
850 } else {
851 assert(vtx_offset_param < 6);
852 vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
853 }
854 vtx_offset = lp_build_mul_imm(uint,
855 LLVMGetParam(ctx->radeon_bld.main_fn,
856 vtx_offset_param),
857 4);
858
859 param = si_shader_io_get_unique_index(semantic_name, semantic_index);
860 args[0] = ctx->esgs_ring;
861 args[1] = vtx_offset;
862 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
863 args[3] = uint->zero;
864 args[4] = uint->one; /* OFFEN */
865 args[5] = uint->zero; /* IDXEN */
866 args[6] = uint->one; /* GLC */
867 args[7] = uint->zero; /* SLC */
868 args[8] = uint->zero; /* TFE */
869
870 value = lp_build_intrinsic(gallivm->builder,
871 "llvm.SI.buffer.load.dword.i32.i32",
872 ctx->i32, args, 9,
873 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
874 if (type == TGSI_TYPE_DOUBLE) {
875 LLVMValueRef value2;
876 args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
877 value2 = lp_build_intrinsic(gallivm->builder,
878 "llvm.SI.buffer.load.dword.i32.i32",
879 ctx->i32, args, 9,
880 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
881 return radeon_llvm_emit_fetch_double(bld_base,
882 value, value2);
883 }
884 return LLVMBuildBitCast(gallivm->builder,
885 value,
886 tgsi2llvmtype(bld_base, type), "");
887 }
888
889 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
890 {
891 switch (interpolate) {
892 case TGSI_INTERPOLATE_CONSTANT:
893 return 0;
894
895 case TGSI_INTERPOLATE_LINEAR:
896 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
897 return SI_PARAM_LINEAR_SAMPLE;
898 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
899 return SI_PARAM_LINEAR_CENTROID;
900 else
901 return SI_PARAM_LINEAR_CENTER;
902 break;
903 case TGSI_INTERPOLATE_COLOR:
904 case TGSI_INTERPOLATE_PERSPECTIVE:
905 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
906 return SI_PARAM_PERSP_SAMPLE;
907 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
908 return SI_PARAM_PERSP_CENTROID;
909 else
910 return SI_PARAM_PERSP_CENTER;
911 break;
912 default:
913 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
914 return -1;
915 }
916 }
917
918 /* This shouldn't be used by explicit INTERP opcodes. */
919 static unsigned select_interp_param(struct si_shader_context *ctx,
920 unsigned param)
921 {
922 if (!ctx->shader->key.ps.prolog.force_persample_interp ||
923 !ctx->is_monolithic)
924 return param;
925
926 /* If the shader doesn't use center/centroid, just return the parameter.
927 *
928 * If the shader only uses one set of (i,j), "si_emit_spi_ps_input" can
929 * switch between center/centroid and sample without shader changes.
930 */
931 switch (param) {
932 case SI_PARAM_PERSP_CENTROID:
933 case SI_PARAM_PERSP_CENTER:
934 return SI_PARAM_PERSP_SAMPLE;
935
936 case SI_PARAM_LINEAR_CENTROID:
937 case SI_PARAM_LINEAR_CENTER:
938 return SI_PARAM_LINEAR_SAMPLE;
939
940 default:
941 return param;
942 }
943 }
944
945 /**
946 * Interpolate a fragment shader input.
947 *
948 * @param ctx context
949 * @param input_index index of the input in hardware
950 * @param semantic_name TGSI_SEMANTIC_*
951 * @param semantic_index semantic index
952 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
953 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
954 * @param interp_param interpolation weights (i,j)
955 * @param prim_mask SI_PARAM_PRIM_MASK
956 * @param face SI_PARAM_FRONT_FACE
957 * @param result the return value (4 components)
958 */
959 static void interp_fs_input(struct si_shader_context *ctx,
960 unsigned input_index,
961 unsigned semantic_name,
962 unsigned semantic_index,
963 unsigned num_interp_inputs,
964 unsigned colors_read_mask,
965 LLVMValueRef interp_param,
966 LLVMValueRef prim_mask,
967 LLVMValueRef face,
968 LLVMValueRef result[4])
969 {
970 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
971 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
972 struct gallivm_state *gallivm = base->gallivm;
973 const char *intr_name;
974 LLVMValueRef attr_number;
975
976 unsigned chan;
977
978 attr_number = lp_build_const_int32(gallivm, input_index);
979
980 /* fs.constant returns the param from the middle vertex, so it's not
981 * really useful for flat shading. It's meant to be used for custom
982 * interpolation (but the intrinsic can't fetch from the other two
983 * vertices).
984 *
985 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
986 * to do the right thing. The only reason we use fs.constant is that
987 * fs.interp cannot be used on integers, because they can be equal
988 * to NaN.
989 */
990 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
991
992 if (semantic_name == TGSI_SEMANTIC_COLOR &&
993 ctx->shader->key.ps.prolog.color_two_side) {
994 LLVMValueRef args[4];
995 LLVMValueRef is_face_positive;
996 LLVMValueRef back_attr_number;
997
998 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
999 * otherwise it's at offset "num_inputs".
1000 */
1001 unsigned back_attr_offset = num_interp_inputs;
1002 if (semantic_index == 1 && colors_read_mask & 0xf)
1003 back_attr_offset += 1;
1004
1005 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1006
1007 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1008 face, uint->zero, "");
1009
1010 args[2] = prim_mask;
1011 args[3] = interp_param;
1012 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1013 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1014 LLVMValueRef front, back;
1015
1016 args[0] = llvm_chan;
1017 args[1] = attr_number;
1018 front = lp_build_intrinsic(gallivm->builder, intr_name,
1019 ctx->f32, args, args[3] ? 4 : 3,
1020 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1021
1022 args[1] = back_attr_number;
1023 back = lp_build_intrinsic(gallivm->builder, intr_name,
1024 ctx->f32, args, args[3] ? 4 : 3,
1025 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1026
1027 result[chan] = LLVMBuildSelect(gallivm->builder,
1028 is_face_positive,
1029 front,
1030 back,
1031 "");
1032 }
1033 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1034 LLVMValueRef args[4];
1035
1036 args[0] = uint->zero;
1037 args[1] = attr_number;
1038 args[2] = prim_mask;
1039 args[3] = interp_param;
1040 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1041 ctx->f32, args, args[3] ? 4 : 3,
1042 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1043 result[1] =
1044 result[2] = lp_build_const_float(gallivm, 0.0f);
1045 result[3] = lp_build_const_float(gallivm, 1.0f);
1046 } else {
1047 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1048 LLVMValueRef args[4];
1049 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1050
1051 args[0] = llvm_chan;
1052 args[1] = attr_number;
1053 args[2] = prim_mask;
1054 args[3] = interp_param;
1055 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1056 ctx->f32, args, args[3] ? 4 : 3,
1057 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1058 }
1059 }
1060 }
1061
1062 static void declare_input_fs(
1063 struct radeon_llvm_context *radeon_bld,
1064 unsigned input_index,
1065 const struct tgsi_full_declaration *decl)
1066 {
1067 struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
1068 struct si_shader_context *ctx =
1069 si_shader_context(&radeon_bld->soa.bld_base);
1070 struct si_shader *shader = ctx->shader;
1071 LLVMValueRef main_fn = radeon_bld->main_fn;
1072 LLVMValueRef interp_param = NULL;
1073 int interp_param_idx;
1074
1075 /* Get colors from input VGPRs (set by the prolog). */
1076 if (!ctx->is_monolithic &&
1077 decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
1078 unsigned i = decl->Semantic.Index;
1079 unsigned colors_read = shader->selector->info.colors_read;
1080 unsigned mask = colors_read >> (i * 4);
1081 unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
1082 (i ? util_bitcount(colors_read & 0xf) : 0);
1083
1084 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
1085 mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
1086 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
1087 mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
1088 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
1089 mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
1090 radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
1091 mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
1092 return;
1093 }
1094
1095 interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
1096 decl->Interp.Location);
1097 if (interp_param_idx == -1)
1098 return;
1099 else if (interp_param_idx) {
1100 interp_param_idx = select_interp_param(ctx,
1101 interp_param_idx);
1102 interp_param = LLVMGetParam(main_fn, interp_param_idx);
1103 }
1104
1105 interp_fs_input(ctx, input_index, decl->Semantic.Name,
1106 decl->Semantic.Index, shader->selector->info.num_inputs,
1107 shader->selector->info.colors_read, interp_param,
1108 LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
1109 LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
1110 &radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
1111 }
1112
1113 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1114 {
1115 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1116 SI_PARAM_ANCILLARY, 8, 4);
1117 }
1118
1119 /**
1120 * Load a dword from a constant buffer.
1121 */
1122 static LLVMValueRef buffer_load_const(LLVMBuilderRef builder, LLVMValueRef resource,
1123 LLVMValueRef offset, LLVMTypeRef return_type)
1124 {
1125 LLVMValueRef args[2] = {resource, offset};
1126
1127 return lp_build_intrinsic(builder, "llvm.SI.load.const", return_type, args, 2,
1128 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1129 }
1130
1131 static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
1132 {
1133 struct si_shader_context *ctx =
1134 si_shader_context(&radeon_bld->soa.bld_base);
1135 struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
1136 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1137 LLVMBuilderRef builder = gallivm->builder;
1138 LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1139 LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
1140 LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);
1141
1142 /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
1143 LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
1144 LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");
1145
1146 LLVMValueRef pos[4] = {
1147 buffer_load_const(builder, resource, offset0, ctx->f32),
1148 buffer_load_const(builder, resource, offset1, ctx->f32),
1149 lp_build_const_float(gallivm, 0),
1150 lp_build_const_float(gallivm, 0)
1151 };
1152
1153 return lp_build_gather_values(gallivm, pos, 4);
1154 }
1155
1156 static void declare_system_value(
1157 struct radeon_llvm_context *radeon_bld,
1158 unsigned index,
1159 const struct tgsi_full_declaration *decl)
1160 {
1161 struct si_shader_context *ctx =
1162 si_shader_context(&radeon_bld->soa.bld_base);
1163 struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
1164 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1165 LLVMValueRef value = 0;
1166
1167 switch (decl->Semantic.Name) {
1168 case TGSI_SEMANTIC_INSTANCEID:
1169 value = LLVMGetParam(radeon_bld->main_fn,
1170 ctx->param_instance_id);
1171 break;
1172
1173 case TGSI_SEMANTIC_VERTEXID:
1174 value = LLVMBuildAdd(gallivm->builder,
1175 LLVMGetParam(radeon_bld->main_fn,
1176 ctx->param_vertex_id),
1177 LLVMGetParam(radeon_bld->main_fn,
1178 SI_PARAM_BASE_VERTEX), "");
1179 break;
1180
1181 case TGSI_SEMANTIC_VERTEXID_NOBASE:
1182 value = LLVMGetParam(radeon_bld->main_fn,
1183 ctx->param_vertex_id);
1184 break;
1185
1186 case TGSI_SEMANTIC_BASEVERTEX:
1187 value = LLVMGetParam(radeon_bld->main_fn,
1188 SI_PARAM_BASE_VERTEX);
1189 break;
1190
1191 case TGSI_SEMANTIC_INVOCATIONID:
1192 if (ctx->type == TGSI_PROCESSOR_TESS_CTRL)
1193 value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
1194 else if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
1195 value = LLVMGetParam(radeon_bld->main_fn,
1196 SI_PARAM_GS_INSTANCE_ID);
1197 else
1198 assert(!"INVOCATIONID not implemented");
1199 break;
1200
1201 case TGSI_SEMANTIC_POSITION:
1202 {
1203 LLVMValueRef pos[4] = {
1204 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1205 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1206 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
1207 lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
1208 LLVMGetParam(radeon_bld->main_fn,
1209 SI_PARAM_POS_W_FLOAT)),
1210 };
1211 value = lp_build_gather_values(gallivm, pos, 4);
1212 break;
1213 }
1214
1215 case TGSI_SEMANTIC_FACE:
1216 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
1217 break;
1218
1219 case TGSI_SEMANTIC_SAMPLEID:
1220 value = get_sample_id(radeon_bld);
1221 break;
1222
1223 case TGSI_SEMANTIC_SAMPLEPOS: {
1224 LLVMValueRef pos[4] = {
1225 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
1226 LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
1227 lp_build_const_float(gallivm, 0),
1228 lp_build_const_float(gallivm, 0)
1229 };
1230 pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
1231 TGSI_OPCODE_FRC, pos[0]);
1232 pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
1233 TGSI_OPCODE_FRC, pos[1]);
1234 value = lp_build_gather_values(gallivm, pos, 4);
1235 break;
1236 }
1237
1238 case TGSI_SEMANTIC_SAMPLEMASK:
1239 /* This can only occur with the OpenGL Core profile, which
1240 * doesn't support smoothing.
1241 */
1242 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
1243 break;
1244
1245 case TGSI_SEMANTIC_TESSCOORD:
1246 {
1247 LLVMValueRef coord[4] = {
1248 LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
1249 LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
1250 bld->zero,
1251 bld->zero
1252 };
1253
1254 /* For triangles, the vector should be (u, v, 1-u-v). */
1255 if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
1256 PIPE_PRIM_TRIANGLES)
1257 coord[2] = lp_build_sub(bld, bld->one,
1258 lp_build_add(bld, coord[0], coord[1]));
1259
1260 value = lp_build_gather_values(gallivm, coord, 4);
1261 break;
1262 }
1263
1264 case TGSI_SEMANTIC_VERTICESIN:
1265 value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
1266 break;
1267
1268 case TGSI_SEMANTIC_TESSINNER:
1269 case TGSI_SEMANTIC_TESSOUTER:
1270 {
1271 LLVMValueRef dw_addr;
1272 int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);
1273
1274 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1275 dw_addr = LLVMBuildAdd(gallivm->builder, dw_addr,
1276 lp_build_const_int32(gallivm, param * 4), "");
1277
1278 value = lds_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
1279 ~0, dw_addr);
1280 break;
1281 }
1282
1283 case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
1284 case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
1285 {
1286 LLVMValueRef buf, slot, val[4];
1287 int i, offset;
1288
1289 slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
1290 buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1291 buf = build_indexed_load_const(ctx, buf, slot);
1292 offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;
1293
1294 for (i = 0; i < 4; i++)
1295 val[i] = buffer_load_const(gallivm->builder, buf,
1296 lp_build_const_int32(gallivm, (offset + i) * 4),
1297 ctx->f32);
1298 value = lp_build_gather_values(gallivm, val, 4);
1299 break;
1300 }
1301
1302 case TGSI_SEMANTIC_PRIMID:
1303 value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
1304 break;
1305
1306 case TGSI_SEMANTIC_GRID_SIZE:
1307 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
1308 break;
1309
1310 case TGSI_SEMANTIC_BLOCK_SIZE:
1311 {
1312 LLVMValueRef values[3];
1313 unsigned i;
1314 unsigned *properties = ctx->shader->selector->info.properties;
1315 unsigned sizes[3] = {
1316 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
1317 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
1318 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
1319 };
1320
1321 for (i = 0; i < 3; ++i)
1322 values[i] = lp_build_const_int32(gallivm, sizes[i]);
1323
1324 value = lp_build_gather_values(gallivm, values, 3);
1325 break;
1326 }
1327
1328 case TGSI_SEMANTIC_BLOCK_ID:
1329 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
1330 break;
1331
1332 case TGSI_SEMANTIC_THREAD_ID:
1333 value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
1334 break;
1335
1336 default:
1337 assert(!"unknown system value");
1338 return;
1339 }
1340
1341 radeon_bld->system_values[index] = value;
1342 }
1343
1344 static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
1345 const struct tgsi_full_declaration *decl)
1346 {
1347 struct si_shader_context *ctx =
1348 si_shader_context(&radeon_bld->soa.bld_base);
1349 struct si_shader_selector *sel = ctx->shader->selector;
1350 struct gallivm_state *gallivm = &radeon_bld->gallivm;
1351
1352 LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
1353 LLVMValueRef var;
1354
1355 assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
1356 assert(decl->Range.First == decl->Range.Last);
1357 assert(!ctx->shared_memory);
1358
1359 var = LLVMAddGlobalInAddressSpace(gallivm->module,
1360 LLVMArrayType(ctx->i8, sel->local_size),
1361 "compute_lds",
1362 LOCAL_ADDR_SPACE);
1363 LLVMSetAlignment(var, 4);
1364
1365 ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
1366 }
1367
1368 static LLVMValueRef fetch_constant(
1369 struct lp_build_tgsi_context *bld_base,
1370 const struct tgsi_full_src_register *reg,
1371 enum tgsi_opcode_type type,
1372 unsigned swizzle)
1373 {
1374 struct si_shader_context *ctx = si_shader_context(bld_base);
1375 struct lp_build_context *base = &bld_base->base;
1376 const struct tgsi_ind_register *ireg = &reg->Indirect;
1377 unsigned buf, idx;
1378
1379 LLVMValueRef addr, bufp;
1380 LLVMValueRef result;
1381
1382 if (swizzle == LP_CHAN_ALL) {
1383 unsigned chan;
1384 LLVMValueRef values[4];
1385 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
1386 values[chan] = fetch_constant(bld_base, reg, type, chan);
1387
1388 return lp_build_gather_values(bld_base->base.gallivm, values, 4);
1389 }
1390
1391 buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
1392 idx = reg->Register.Index * 4 + swizzle;
1393
1394 if (!reg->Register.Indirect && !reg->Dimension.Indirect) {
1395 if (type != TGSI_TYPE_DOUBLE)
1396 return bitcast(bld_base, type, ctx->constants[buf][idx]);
1397 else {
1398 return radeon_llvm_emit_fetch_double(bld_base,
1399 ctx->constants[buf][idx],
1400 ctx->constants[buf][idx + 1]);
1401 }
1402 }
1403
1404 if (reg->Register.Dimension && reg->Dimension.Indirect) {
1405 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
1406 LLVMValueRef index;
1407 index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
1408 reg->Dimension.Index,
1409 SI_NUM_CONST_BUFFERS);
1410 bufp = build_indexed_load_const(ctx, ptr, index);
1411 } else
1412 bufp = ctx->const_buffers[buf];
1413
1414 addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
1415 addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
1416 addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
1417 addr = lp_build_add(&bld_base->uint_bld, addr,
1418 lp_build_const_int32(base->gallivm, idx * 4));
1419
1420 result = buffer_load_const(base->gallivm->builder, bufp,
1421 addr, ctx->f32);
1422
1423 if (type != TGSI_TYPE_DOUBLE)
1424 result = bitcast(bld_base, type, result);
1425 else {
1426 LLVMValueRef addr2, result2;
1427 addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
1428 addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
1429 addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
1430 addr2 = lp_build_add(&bld_base->uint_bld, addr2,
1431 lp_build_const_int32(base->gallivm, idx * 4));
1432
1433 result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
1434 addr2, ctx->f32);
1435
1436 result = radeon_llvm_emit_fetch_double(bld_base,
1437 result, result2);
1438 }
1439 return result;
1440 }
1441
1442 /* Upper 16 bits must be zero. */
1443 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1444 LLVMValueRef val[2])
1445 {
1446 return LLVMBuildOr(gallivm->builder, val[0],
1447 LLVMBuildShl(gallivm->builder, val[1],
1448 lp_build_const_int32(gallivm, 16),
1449 ""), "");
1450 }
1451
1452 /* Upper 16 bits are ignored and will be dropped. */
1453 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1454 LLVMValueRef val[2])
1455 {
1456 LLVMValueRef v[2] = {
1457 LLVMBuildAnd(gallivm->builder, val[0],
1458 lp_build_const_int32(gallivm, 0xffff), ""),
1459 val[1],
1460 };
1461 return si_llvm_pack_two_int16(gallivm, v);
1462 }
1463
1464 /* Initialize arguments for the shader export intrinsic */
1465 static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
1466 LLVMValueRef *values,
1467 unsigned target,
1468 LLVMValueRef *args)
1469 {
1470 struct si_shader_context *ctx = si_shader_context(bld_base);
1471 struct lp_build_context *uint =
1472 &ctx->radeon_bld.soa.bld_base.uint_bld;
1473 struct lp_build_context *base = &bld_base->base;
1474 struct gallivm_state *gallivm = base->gallivm;
1475 LLVMBuilderRef builder = base->gallivm->builder;
1476 LLVMValueRef val[4];
1477 unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
1478 unsigned chan;
1479 bool is_int8;
1480
1481 /* Default is 0xf. Adjusted below depending on the format. */
1482 args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
1483
1484 /* Specify whether the EXEC mask represents the valid mask */
1485 args[1] = uint->zero;
1486
1487 /* Specify whether this is the last export */
1488 args[2] = uint->zero;
1489
1490 /* Specify the target we are exporting */
1491 args[3] = lp_build_const_int32(base->gallivm, target);
1492
1493 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
1494 const union si_shader_key *key = &ctx->shader->key;
1495 unsigned col_formats = key->ps.epilog.spi_shader_col_format;
1496 int cbuf = target - V_008DFC_SQ_EXP_MRT;
1497
1498 assert(cbuf >= 0 && cbuf < 8);
1499 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
1500 is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
1501 }
1502
1503 args[4] = uint->zero; /* COMPR flag */
1504 args[5] = base->undef;
1505 args[6] = base->undef;
1506 args[7] = base->undef;
1507 args[8] = base->undef;
1508
1509 switch (spi_shader_col_format) {
1510 case V_028714_SPI_SHADER_ZERO:
1511 args[0] = uint->zero; /* writemask */
1512 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
1513 break;
1514
1515 case V_028714_SPI_SHADER_32_R:
1516 args[0] = uint->one; /* writemask */
1517 args[5] = values[0];
1518 break;
1519
1520 case V_028714_SPI_SHADER_32_GR:
1521 args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
1522 args[5] = values[0];
1523 args[6] = values[1];
1524 break;
1525
1526 case V_028714_SPI_SHADER_32_AR:
1527 args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
1528 args[5] = values[0];
1529 args[8] = values[3];
1530 break;
1531
1532 case V_028714_SPI_SHADER_FP16_ABGR:
1533 args[4] = uint->one; /* COMPR flag */
1534
1535 for (chan = 0; chan < 2; chan++) {
1536 LLVMValueRef pack_args[2] = {
1537 values[2 * chan],
1538 values[2 * chan + 1]
1539 };
1540 LLVMValueRef packed;
1541
1542 packed = lp_build_intrinsic(base->gallivm->builder,
1543 "llvm.SI.packf16",
1544 ctx->i32, pack_args, 2,
1545 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
1546 args[chan + 5] =
1547 LLVMBuildBitCast(base->gallivm->builder,
1548 packed, ctx->f32, "");
1549 }
1550 break;
1551
1552 case V_028714_SPI_SHADER_UNORM16_ABGR:
1553 for (chan = 0; chan < 4; chan++) {
1554 val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
1555 val[chan] = LLVMBuildFMul(builder, val[chan],
1556 lp_build_const_float(gallivm, 65535), "");
1557 val[chan] = LLVMBuildFAdd(builder, val[chan],
1558 lp_build_const_float(gallivm, 0.5), "");
1559 val[chan] = LLVMBuildFPToUI(builder, val[chan],
1560 ctx->i32, "");
1561 }
1562
1563 args[4] = uint->one; /* COMPR flag */
1564 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1565 si_llvm_pack_two_int16(gallivm, val));
1566 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1567 si_llvm_pack_two_int16(gallivm, val+2));
1568 break;
1569
1570 case V_028714_SPI_SHADER_SNORM16_ABGR:
1571 for (chan = 0; chan < 4; chan++) {
1572 /* Clamp between [-1, 1]. */
1573 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
1574 values[chan],
1575 lp_build_const_float(gallivm, 1));
1576 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
1577 val[chan],
1578 lp_build_const_float(gallivm, -1));
1579 /* Convert to a signed integer in [-32767, 32767]. */
1580 val[chan] = LLVMBuildFMul(builder, val[chan],
1581 lp_build_const_float(gallivm, 32767), "");
1582 /* If positive, add 0.5, else add -0.5. */
1583 val[chan] = LLVMBuildFAdd(builder, val[chan],
1584 LLVMBuildSelect(builder,
1585 LLVMBuildFCmp(builder, LLVMRealOGE,
1586 val[chan], base->zero, ""),
1587 lp_build_const_float(gallivm, 0.5),
1588 lp_build_const_float(gallivm, -0.5), ""), "");
1589 val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
1590 }
1591
1592 args[4] = uint->one; /* COMPR flag */
1593 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1594 si_llvm_pack_two_int32_as_int16(gallivm, val));
1595 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1596 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
1597 break;
1598
1599 case V_028714_SPI_SHADER_UINT16_ABGR: {
1600 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1601 255 : 65535);
1602 /* Clamp. */
1603 for (chan = 0; chan < 4; chan++) {
1604 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1605 val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
1606 val[chan], max);
1607 }
1608
1609 args[4] = uint->one; /* COMPR flag */
1610 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1611 si_llvm_pack_two_int16(gallivm, val));
1612 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1613 si_llvm_pack_two_int16(gallivm, val+2));
1614 break;
1615 }
1616
1617 case V_028714_SPI_SHADER_SINT16_ABGR: {
1618 LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
1619 127 : 32767);
1620 LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
1621 -128 : -32768);
1622 /* Clamp. */
1623 for (chan = 0; chan < 4; chan++) {
1624 val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
1625 val[chan] = lp_build_emit_llvm_binary(bld_base,
1626 TGSI_OPCODE_IMIN,
1627 val[chan], max);
1628 val[chan] = lp_build_emit_llvm_binary(bld_base,
1629 TGSI_OPCODE_IMAX,
1630 val[chan], min);
1631 }
1632
1633 args[4] = uint->one; /* COMPR flag */
1634 args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1635 si_llvm_pack_two_int32_as_int16(gallivm, val));
1636 args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
1637 si_llvm_pack_two_int32_as_int16(gallivm, val+2));
1638 break;
1639 }
1640
1641 case V_028714_SPI_SHADER_32_ABGR:
1642 memcpy(&args[5], values, sizeof(values[0]) * 4);
1643 break;
1644 }
1645 }
1646
1647 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
1648 LLVMValueRef alpha)
1649 {
1650 struct si_shader_context *ctx = si_shader_context(bld_base);
1651 struct gallivm_state *gallivm = bld_base->base.gallivm;
1652
1653 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
1654 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
1655 SI_PARAM_ALPHA_REF);
1656
1657 LLVMValueRef alpha_pass =
1658 lp_build_cmp(&bld_base->base,
1659 ctx->shader->key.ps.epilog.alpha_func,
1660 alpha, alpha_ref);
1661 LLVMValueRef arg =
1662 lp_build_select(&bld_base->base,
1663 alpha_pass,
1664 lp_build_const_float(gallivm, 1.0f),
1665 lp_build_const_float(gallivm, -1.0f));
1666
1667 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
1668 ctx->voidt, &arg, 1, 0);
1669 } else {
1670 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
1671 ctx->voidt, NULL, 0, 0);
1672 }
1673 }
1674
1675 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
1676 LLVMValueRef alpha,
1677 unsigned samplemask_param)
1678 {
1679 struct si_shader_context *ctx = si_shader_context(bld_base);
1680 struct gallivm_state *gallivm = bld_base->base.gallivm;
1681 LLVMValueRef coverage;
1682
1683 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
1684 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
1685 samplemask_param);
1686 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
1687
1688 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
1689 ctx->i32,
1690 &coverage, 1, LLVMReadNoneAttribute);
1691
1692 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
1693 ctx->f32, "");
1694
1695 coverage = LLVMBuildFMul(gallivm->builder, coverage,
1696 lp_build_const_float(gallivm,
1697 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
1698
1699 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
1700 }
1701
1702 static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
1703 LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
1704 {
1705 struct si_shader_context *ctx = si_shader_context(bld_base);
1706 struct lp_build_context *base = &bld_base->base;
1707 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1708 unsigned reg_index;
1709 unsigned chan;
1710 unsigned const_chan;
1711 LLVMValueRef base_elt;
1712 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
1713 LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
1714 SI_VS_CONST_CLIP_PLANES);
1715 LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);
1716
1717 for (reg_index = 0; reg_index < 2; reg_index ++) {
1718 LLVMValueRef *args = pos[2 + reg_index];
1719
1720 args[5] =
1721 args[6] =
1722 args[7] =
1723 args[8] = lp_build_const_float(base->gallivm, 0.0f);
1724
1725 /* Compute dot products of position and user clip plane vectors */
1726 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1727 for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
1728 args[1] = lp_build_const_int32(base->gallivm,
1729 ((reg_index * 4 + chan) * 4 +
1730 const_chan) * 4);
1731 base_elt = buffer_load_const(base->gallivm->builder, const_resource,
1732 args[1], ctx->f32);
1733 args[5 + chan] =
1734 lp_build_add(base, args[5 + chan],
1735 lp_build_mul(base, base_elt,
1736 out_elts[const_chan]));
1737 }
1738 }
1739
1740 args[0] = lp_build_const_int32(base->gallivm, 0xf);
1741 args[1] = uint->zero;
1742 args[2] = uint->zero;
1743 args[3] = lp_build_const_int32(base->gallivm,
1744 V_008DFC_SQ_EXP_POS + 2 + reg_index);
1745 args[4] = uint->zero;
1746 }
1747 }
1748
1749 static void si_dump_streamout(struct pipe_stream_output_info *so)
1750 {
1751 unsigned i;
1752
1753 if (so->num_outputs)
1754 fprintf(stderr, "STREAMOUT\n");
1755
1756 for (i = 0; i < so->num_outputs; i++) {
1757 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
1758 so->output[i].start_component;
1759 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
1760 i, so->output[i].output_buffer,
1761 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
1762 so->output[i].register_index,
1763 mask & 1 ? "x" : "",
1764 mask & 2 ? "y" : "",
1765 mask & 4 ? "z" : "",
1766 mask & 8 ? "w" : "");
1767 }
1768 }
1769
1770 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1771 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1772 * or v4i32 (num_channels=3,4). */
1773 static void build_tbuffer_store(struct si_shader_context *ctx,
1774 LLVMValueRef rsrc,
1775 LLVMValueRef vdata,
1776 unsigned num_channels,
1777 LLVMValueRef vaddr,
1778 LLVMValueRef soffset,
1779 unsigned inst_offset,
1780 unsigned dfmt,
1781 unsigned nfmt,
1782 unsigned offen,
1783 unsigned idxen,
1784 unsigned glc,
1785 unsigned slc,
1786 unsigned tfe)
1787 {
1788 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1789 LLVMValueRef args[] = {
1790 rsrc,
1791 vdata,
1792 LLVMConstInt(ctx->i32, num_channels, 0),
1793 vaddr,
1794 soffset,
1795 LLVMConstInt(ctx->i32, inst_offset, 0),
1796 LLVMConstInt(ctx->i32, dfmt, 0),
1797 LLVMConstInt(ctx->i32, nfmt, 0),
1798 LLVMConstInt(ctx->i32, offen, 0),
1799 LLVMConstInt(ctx->i32, idxen, 0),
1800 LLVMConstInt(ctx->i32, glc, 0),
1801 LLVMConstInt(ctx->i32, slc, 0),
1802 LLVMConstInt(ctx->i32, tfe, 0)
1803 };
1804
1805 /* The instruction offset field has 12 bits */
1806 assert(offen || inst_offset < (1 << 12));
1807
1808 /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
1809 unsigned func = CLAMP(num_channels, 1, 3) - 1;
1810 const char *types[] = {"i32", "v2i32", "v4i32"};
1811 char name[256];
1812 snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
1813
1814 lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
1815 args, Elements(args), 0);
1816 }
1817
1818 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
1819 LLVMValueRef rsrc,
1820 LLVMValueRef vdata,
1821 unsigned num_channels,
1822 LLVMValueRef vaddr,
1823 LLVMValueRef soffset,
1824 unsigned inst_offset)
1825 {
1826 static unsigned dfmt[] = {
1827 V_008F0C_BUF_DATA_FORMAT_32,
1828 V_008F0C_BUF_DATA_FORMAT_32_32,
1829 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1830 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1831 };
1832 assert(num_channels >= 1 && num_channels <= 4);
1833
1834 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
1835 inst_offset, dfmt[num_channels-1],
1836 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
1837 }
1838
1839 /* On SI, the vertex shader is responsible for writing streamout data
1840 * to buffers. */
1841 static void si_llvm_emit_streamout(struct si_shader_context *ctx,
1842 struct si_shader_output_values *outputs,
1843 unsigned noutput)
1844 {
1845 struct pipe_stream_output_info *so = &ctx->shader->selector->so;
1846 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
1847 LLVMBuilderRef builder = gallivm->builder;
1848 int i, j;
1849 struct lp_build_if_state if_ctx;
1850
1851 /* Get bits [22:16], i.e. (so_param >> 16) & 127; */
1852 LLVMValueRef so_vtx_count =
1853 unpack_param(ctx, ctx->param_streamout_config, 16, 7);
1854
1855 LLVMValueRef tid = lp_build_intrinsic(builder, "llvm.SI.tid", ctx->i32,
1856 NULL, 0, LLVMReadNoneAttribute);
1857
1858 /* can_emit = tid < so_vtx_count; */
1859 LLVMValueRef can_emit =
1860 LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
1861
1862 LLVMValueRef stream_id =
1863 unpack_param(ctx, ctx->param_streamout_config, 24, 2);
1864
1865 /* Emit the streamout code conditionally. This actually avoids
1866 * out-of-bounds buffer access. The hw tells us via the SGPR
1867 * (so_vtx_count) which threads are allowed to emit streamout data. */
1868 lp_build_if(&if_ctx, gallivm, can_emit);
1869 {
1870 /* The buffer offset is computed as follows:
1871 * ByteOffset = streamout_offset[buffer_id]*4 +
1872 * (streamout_write_index + thread_id)*stride[buffer_id] +
1873 * attrib_offset
1874 */
1875
1876 LLVMValueRef so_write_index =
1877 LLVMGetParam(ctx->radeon_bld.main_fn,
1878 ctx->param_streamout_write_index);
1879
1880 /* Compute (streamout_write_index + thread_id). */
1881 so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");
1882
1883 /* Compute the write offset for each enabled buffer. */
1884 LLVMValueRef so_write_offset[4] = {};
1885 for (i = 0; i < 4; i++) {
1886 if (!so->stride[i])
1887 continue;
1888
1889 LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
1890 ctx->param_streamout_offset[i]);
1891 so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");
1892
1893 so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
1894 LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
1895 so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
1896 }
1897
1898 /* Write streamout data. */
1899 for (i = 0; i < so->num_outputs; i++) {
1900 unsigned buf_idx = so->output[i].output_buffer;
1901 unsigned reg = so->output[i].register_index;
1902 unsigned start = so->output[i].start_component;
1903 unsigned num_comps = so->output[i].num_components;
1904 unsigned stream = so->output[i].stream;
1905 LLVMValueRef out[4];
1906 struct lp_build_if_state if_ctx_stream;
1907
1908 assert(num_comps && num_comps <= 4);
1909 if (!num_comps || num_comps > 4)
1910 continue;
1911
1912 if (reg >= noutput)
1913 continue;
1914
1915 /* Load the output as int. */
1916 for (j = 0; j < num_comps; j++) {
1917 out[j] = LLVMBuildBitCast(builder,
1918 outputs[reg].values[start+j],
1919 ctx->i32, "");
1920 }
1921
1922 /* Pack the output. */
1923 LLVMValueRef vdata = NULL;
1924
1925 switch (num_comps) {
1926 case 1: /* as i32 */
1927 vdata = out[0];
1928 break;
1929 case 2: /* as v2i32 */
1930 case 3: /* as v4i32 (aligned to 4) */
1931 case 4: /* as v4i32 */
1932 vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
1933 for (j = 0; j < num_comps; j++) {
1934 vdata = LLVMBuildInsertElement(builder, vdata, out[j],
1935 LLVMConstInt(ctx->i32, j, 0), "");
1936 }
1937 break;
1938 }
1939
1940 LLVMValueRef can_emit_stream =
1941 LLVMBuildICmp(builder, LLVMIntEQ,
1942 stream_id,
1943 lp_build_const_int32(gallivm, stream), "");
1944
1945 lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
1946 build_tbuffer_store_dwords(ctx, ctx->so_buffers[buf_idx],
1947 vdata, num_comps,
1948 so_write_offset[buf_idx],
1949 LLVMConstInt(ctx->i32, 0, 0),
1950 so->output[i].dst_offset*4);
1951 lp_build_endif(&if_ctx_stream);
1952 }
1953 }
1954 lp_build_endif(&if_ctx);
1955 }
1956
1957
1958 /* Generate export instructions for hardware VS shader stage */
1959 static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
1960 struct si_shader_output_values *outputs,
1961 unsigned noutput)
1962 {
1963 struct si_shader_context *ctx = si_shader_context(bld_base);
1964 struct si_shader *shader = ctx->shader;
1965 struct lp_build_context *base = &bld_base->base;
1966 struct lp_build_context *uint =
1967 &ctx->radeon_bld.soa.bld_base.uint_bld;
1968 LLVMValueRef args[9];
1969 LLVMValueRef pos_args[4][9] = { { 0 } };
1970 LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
1971 unsigned semantic_name, semantic_index;
1972 unsigned target;
1973 unsigned param_count = 0;
1974 unsigned pos_idx;
1975 int i;
1976
1977 if (outputs && ctx->shader->selector->so.num_outputs) {
1978 si_llvm_emit_streamout(ctx, outputs, noutput);
1979 }
1980
1981 for (i = 0; i < noutput; i++) {
1982 semantic_name = outputs[i].name;
1983 semantic_index = outputs[i].sid;
1984
1985 handle_semantic:
1986 /* Select the correct target */
1987 switch(semantic_name) {
1988 case TGSI_SEMANTIC_PSIZE:
1989 psize_value = outputs[i].values[0];
1990 continue;
1991 case TGSI_SEMANTIC_EDGEFLAG:
1992 edgeflag_value = outputs[i].values[0];
1993 continue;
1994 case TGSI_SEMANTIC_LAYER:
1995 layer_value = outputs[i].values[0];
1996 semantic_name = TGSI_SEMANTIC_GENERIC;
1997 goto handle_semantic;
1998 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1999 viewport_index_value = outputs[i].values[0];
2000 semantic_name = TGSI_SEMANTIC_GENERIC;
2001 goto handle_semantic;
2002 case TGSI_SEMANTIC_POSITION:
2003 target = V_008DFC_SQ_EXP_POS;
2004 break;
2005 case TGSI_SEMANTIC_COLOR:
2006 case TGSI_SEMANTIC_BCOLOR:
2007 target = V_008DFC_SQ_EXP_PARAM + param_count;
2008 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2009 shader->info.vs_output_param_offset[i] = param_count;
2010 param_count++;
2011 break;
2012 case TGSI_SEMANTIC_CLIPDIST:
2013 target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
2014 break;
2015 case TGSI_SEMANTIC_CLIPVERTEX:
2016 si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
2017 continue;
2018 case TGSI_SEMANTIC_PRIMID:
2019 case TGSI_SEMANTIC_FOG:
2020 case TGSI_SEMANTIC_TEXCOORD:
2021 case TGSI_SEMANTIC_GENERIC:
2022 target = V_008DFC_SQ_EXP_PARAM + param_count;
2023 assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
2024 shader->info.vs_output_param_offset[i] = param_count;
2025 param_count++;
2026 break;
2027 default:
2028 target = 0;
2029 fprintf(stderr,
2030 "Warning: SI unhandled vs output type:%d\n",
2031 semantic_name);
2032 }
2033
2034 si_llvm_init_export_args(bld_base, outputs[i].values, target, args);
2035
2036 if (target >= V_008DFC_SQ_EXP_POS &&
2037 target <= (V_008DFC_SQ_EXP_POS + 3)) {
2038 memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
2039 args, sizeof(args));
2040 } else {
2041 lp_build_intrinsic(base->gallivm->builder,
2042 "llvm.SI.export", ctx->voidt,
2043 args, 9, 0);
2044 }
2045
2046 if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
2047 semantic_name = TGSI_SEMANTIC_GENERIC;
2048 goto handle_semantic;
2049 }
2050 }
2051
2052 shader->info.nr_param_exports = param_count;
2053
2054 /* We need to add the position output manually if it's missing. */
2055 if (!pos_args[0][0]) {
2056 pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
2057 pos_args[0][1] = uint->zero; /* EXEC mask */
2058 pos_args[0][2] = uint->zero; /* last export? */
2059 pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
2060 pos_args[0][4] = uint->zero; /* COMPR flag */
2061 pos_args[0][5] = base->zero; /* X */
2062 pos_args[0][6] = base->zero; /* Y */
2063 pos_args[0][7] = base->zero; /* Z */
2064 pos_args[0][8] = base->one; /* W */
2065 }
2066
2067 /* Write the misc vector (point size, edgeflag, layer, viewport). */
2068 if (shader->selector->info.writes_psize ||
2069 shader->selector->info.writes_edgeflag ||
2070 shader->selector->info.writes_viewport_index ||
2071 shader->selector->info.writes_layer) {
2072 pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
2073 shader->selector->info.writes_psize |
2074 (shader->selector->info.writes_edgeflag << 1) |
2075 (shader->selector->info.writes_layer << 2) |
2076 (shader->selector->info.writes_viewport_index << 3));
2077 pos_args[1][1] = uint->zero; /* EXEC mask */
2078 pos_args[1][2] = uint->zero; /* last export? */
2079 pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
2080 pos_args[1][4] = uint->zero; /* COMPR flag */
2081 pos_args[1][5] = base->zero; /* X */
2082 pos_args[1][6] = base->zero; /* Y */
2083 pos_args[1][7] = base->zero; /* Z */
2084 pos_args[1][8] = base->zero; /* W */
2085
2086 if (shader->selector->info.writes_psize)
2087 pos_args[1][5] = psize_value;
2088
2089 if (shader->selector->info.writes_edgeflag) {
2090 /* The output is a float, but the hw expects an integer
2091 * with the first bit containing the edge flag. */
2092 edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
2093 edgeflag_value,
2094 ctx->i32, "");
2095 edgeflag_value = lp_build_min(&bld_base->int_bld,
2096 edgeflag_value,
2097 bld_base->int_bld.one);
2098
2099 /* The LLVM intrinsic expects a float. */
2100 pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
2101 edgeflag_value,
2102 ctx->f32, "");
2103 }
2104
2105 if (shader->selector->info.writes_layer)
2106 pos_args[1][7] = layer_value;
2107
2108 if (shader->selector->info.writes_viewport_index)
2109 pos_args[1][8] = viewport_index_value;
2110 }
2111
2112 for (i = 0; i < 4; i++)
2113 if (pos_args[i][0])
2114 shader->info.nr_pos_exports++;
2115
2116 pos_idx = 0;
2117 for (i = 0; i < 4; i++) {
2118 if (!pos_args[i][0])
2119 continue;
2120
2121 /* Specify the target we are exporting */
2122 pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);
2123
2124 if (pos_idx == shader->info.nr_pos_exports)
2125 /* Specify that this is the last export */
2126 pos_args[i][2] = uint->one;
2127
2128 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2129 ctx->voidt, pos_args[i], 9, 0);
2130 }
2131 }
2132
2133 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
2134 LLVMValueRef rel_patch_id,
2135 LLVMValueRef invocation_id,
2136 LLVMValueRef tcs_out_current_patch_data_offset)
2137 {
2138 struct si_shader_context *ctx = si_shader_context(bld_base);
2139 struct gallivm_state *gallivm = bld_base->base.gallivm;
2140 struct si_shader *shader = ctx->shader;
2141 unsigned tess_inner_index, tess_outer_index;
2142 LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
2143 LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
2144 unsigned stride, outer_comps, inner_comps, i;
2145 struct lp_build_if_state if_ctx;
2146
2147 /* Do this only for invocation 0, because the tess levels are per-patch,
2148 * not per-vertex.
2149 *
2150 * This can't jump, because invocation 0 executes this. It should
2151 * at least mask out the loads and stores for other invocations.
2152 */
2153 lp_build_if(&if_ctx, gallivm,
2154 LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
2155 invocation_id, bld_base->uint_bld.zero, ""));
2156
2157 /* Determine the layout of one tess factor element in the buffer. */
2158 switch (shader->key.tcs.epilog.prim_mode) {
2159 case PIPE_PRIM_LINES:
2160 stride = 2; /* 2 dwords, 1 vec2 store */
2161 outer_comps = 2;
2162 inner_comps = 0;
2163 break;
2164 case PIPE_PRIM_TRIANGLES:
2165 stride = 4; /* 4 dwords, 1 vec4 store */
2166 outer_comps = 3;
2167 inner_comps = 1;
2168 break;
2169 case PIPE_PRIM_QUADS:
2170 stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
2171 outer_comps = 4;
2172 inner_comps = 2;
2173 break;
2174 default:
2175 assert(0);
2176 return;
2177 }
2178
2179 /* Load tess_inner and tess_outer from LDS.
2180 * Any invocation can write them, so we can't get them from a temporary.
2181 */
2182 tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
2183 tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);
2184
2185 lds_base = tcs_out_current_patch_data_offset;
2186 lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
2187 lp_build_const_int32(gallivm,
2188 tess_inner_index * 4), "");
2189 lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
2190 lp_build_const_int32(gallivm,
2191 tess_outer_index * 4), "");
2192
2193 for (i = 0; i < outer_comps; i++)
2194 out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
2195 for (i = 0; i < inner_comps; i++)
2196 out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);
2197
2198 /* Convert the outputs to vectors for stores. */
2199 vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
2200 vec1 = NULL;
2201
2202 if (stride > 4)
2203 vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);
2204
2205 /* Get the buffer. */
2206 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
2207 SI_PARAM_RW_BUFFERS);
2208 buffer = build_indexed_load_const(ctx, rw_buffers,
2209 lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));
2210
2211 /* Get the offset. */
2212 tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
2213 SI_PARAM_TESS_FACTOR_OFFSET);
2214 byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
2215 lp_build_const_int32(gallivm, 4 * stride), "");
2216
2217 /* Store the outputs. */
2218 build_tbuffer_store_dwords(ctx, buffer, vec0,
2219 MIN2(stride, 4), byteoffset, tf_base, 0);
2220 if (vec1)
2221 build_tbuffer_store_dwords(ctx, buffer, vec1,
2222 stride - 4, byteoffset, tf_base, 16);
2223 lp_build_endif(&if_ctx);
2224 }
2225
2226 /* This only writes the tessellation factor levels. */
2227 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
2228 {
2229 struct si_shader_context *ctx = si_shader_context(bld_base);
2230 LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
2231
2232 rel_patch_id = get_rel_patch_id(ctx);
2233 invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
2234 tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
2235
2236 if (!ctx->is_monolithic) {
2237 /* Return epilog parameters from this function. */
2238 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
2239 LLVMValueRef ret = ctx->return_value;
2240 LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
2241 unsigned vgpr;
2242
2243 /* RW_BUFFERS pointer */
2244 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
2245 SI_PARAM_RW_BUFFERS);
2246 rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
2247 rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
2248 rw0 = LLVMBuildExtractElement(builder, rw_buffers,
2249 bld_base->uint_bld.zero, "");
2250 rw1 = LLVMBuildExtractElement(builder, rw_buffers,
2251 bld_base->uint_bld.one, "");
2252 ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
2253 ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");
2254
2255 /* Tess factor buffer soffset is after user SGPRs. */
2256 tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2257 SI_PARAM_TESS_FACTOR_OFFSET);
2258 ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
2259 SI_TCS_NUM_USER_SGPR, "");
2260
2261 /* VGPRs */
2262 rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
2263 invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
2264 tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);
2265
2266 vgpr = SI_TCS_NUM_USER_SGPR + 1;
2267 ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
2268 ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
2269 ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
2270 ctx->return_value = ret;
2271 return;
2272 }
2273
2274 si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
2275 }
2276
2277 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2278 {
2279 struct si_shader_context *ctx = si_shader_context(bld_base);
2280 struct si_shader *shader = ctx->shader;
2281 struct tgsi_shader_info *info = &shader->selector->info;
2282 struct gallivm_state *gallivm = bld_base->base.gallivm;
2283 unsigned i, chan;
2284 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2285 ctx->param_rel_auto_id);
2286 LLVMValueRef vertex_dw_stride =
2287 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2288 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2289 vertex_dw_stride, "");
2290
2291 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2292 * its inputs from it. */
2293 for (i = 0; i < info->num_outputs; i++) {
2294 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2295 unsigned name = info->output_semantic_name[i];
2296 unsigned index = info->output_semantic_index[i];
2297 int param = si_shader_io_get_unique_index(name, index);
2298 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2299 lp_build_const_int32(gallivm, param * 4), "");
2300
2301 for (chan = 0; chan < 4; chan++) {
2302 lds_store(bld_base, chan, dw_addr,
2303 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2304 }
2305 }
2306 }
2307
2308 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2309 {
2310 struct si_shader_context *ctx = si_shader_context(bld_base);
2311 struct gallivm_state *gallivm = bld_base->base.gallivm;
2312 struct si_shader *es = ctx->shader;
2313 struct tgsi_shader_info *info = &es->selector->info;
2314 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2315 ctx->param_es2gs_offset);
2316 unsigned chan;
2317 int i;
2318
2319 for (i = 0; i < info->num_outputs; i++) {
2320 LLVMValueRef *out_ptr =
2321 ctx->radeon_bld.soa.outputs[i];
2322 int param_index;
2323
2324 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2325 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2326 continue;
2327
2328 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2329 info->output_semantic_index[i]);
2330
2331 for (chan = 0; chan < 4; chan++) {
2332 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2333 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2334
2335 build_tbuffer_store(ctx,
2336 ctx->esgs_ring,
2337 out_val, 1,
2338 LLVMGetUndef(ctx->i32), soffset,
2339 (4 * param_index + chan) * 4,
2340 V_008F0C_BUF_DATA_FORMAT_32,
2341 V_008F0C_BUF_NUM_FORMAT_UINT,
2342 0, 0, 1, 1, 0);
2343 }
2344 }
2345 }
2346
2347 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2348 {
2349 struct si_shader_context *ctx = si_shader_context(bld_base);
2350 struct gallivm_state *gallivm = bld_base->base.gallivm;
2351 LLVMValueRef args[2];
2352
2353 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2354 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2355 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2356 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
2357 }
2358
2359 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2360 {
2361 struct si_shader_context *ctx = si_shader_context(bld_base);
2362 struct gallivm_state *gallivm = bld_base->base.gallivm;
2363 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2364 struct si_shader_output_values *outputs = NULL;
2365 int i,j;
2366
2367 assert(!ctx->is_gs_copy_shader);
2368
2369 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2370
2371 /* Vertex color clamping.
2372 *
2373 * This uses a state constant loaded in a user data SGPR and
2374 * an IF statement is added that clamps all colors if the constant
2375 * is true.
2376 */
2377 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
2378 struct lp_build_if_state if_ctx;
2379 LLVMValueRef cond = NULL;
2380 LLVMValueRef addr, val;
2381
2382 for (i = 0; i < info->num_outputs; i++) {
2383 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2384 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2385 continue;
2386
2387 /* We've found a color. */
2388 if (!cond) {
2389 /* The state is in the first bit of the user SGPR. */
2390 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2391 SI_PARAM_VS_STATE_BITS);
2392 cond = LLVMBuildTrunc(gallivm->builder, cond,
2393 ctx->i1, "");
2394 lp_build_if(&if_ctx, gallivm, cond);
2395 }
2396
2397 for (j = 0; j < 4; j++) {
2398 addr = ctx->radeon_bld.soa.outputs[i][j];
2399 val = LLVMBuildLoad(gallivm->builder, addr, "");
2400 val = radeon_llvm_saturate(bld_base, val);
2401 LLVMBuildStore(gallivm->builder, val, addr);
2402 }
2403 }
2404
2405 if (cond)
2406 lp_build_endif(&if_ctx);
2407 }
2408
2409 for (i = 0; i < info->num_outputs; i++) {
2410 outputs[i].name = info->output_semantic_name[i];
2411 outputs[i].sid = info->output_semantic_index[i];
2412
2413 for (j = 0; j < 4; j++)
2414 outputs[i].values[j] =
2415 LLVMBuildLoad(gallivm->builder,
2416 ctx->radeon_bld.soa.outputs[i][j],
2417 "");
2418 }
2419
2420 if (ctx->is_monolithic) {
2421 /* Export PrimitiveID when PS needs it. */
2422 if (si_vs_exports_prim_id(ctx->shader)) {
2423 outputs[i].name = TGSI_SEMANTIC_PRIMID;
2424 outputs[i].sid = 0;
2425 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2426 get_primitive_id(bld_base, 0));
2427 outputs[i].values[1] = bld_base->base.undef;
2428 outputs[i].values[2] = bld_base->base.undef;
2429 outputs[i].values[3] = bld_base->base.undef;
2430 i++;
2431 }
2432 } else {
2433 /* Return the primitive ID from the LLVM function. */
2434 ctx->return_value =
2435 LLVMBuildInsertValue(gallivm->builder,
2436 ctx->return_value,
2437 bitcast(bld_base, TGSI_TYPE_FLOAT,
2438 get_primitive_id(bld_base, 0)),
2439 VS_EPILOG_PRIMID_LOC, "");
2440 }
2441
2442 si_llvm_export_vs(bld_base, outputs, i);
2443 FREE(outputs);
2444 }
2445
2446 static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
2447 LLVMValueRef depth, LLVMValueRef stencil,
2448 LLVMValueRef samplemask)
2449 {
2450 struct si_shader_context *ctx = si_shader_context(bld_base);
2451 struct lp_build_context *base = &bld_base->base;
2452 struct lp_build_context *uint = &bld_base->uint_bld;
2453 LLVMValueRef args[9];
2454 unsigned mask = 0;
2455
2456 assert(depth || stencil || samplemask);
2457
2458 args[1] = uint->one; /* whether the EXEC mask is valid */
2459 args[2] = uint->one; /* DONE bit */
2460
2461 /* Specify the target we are exporting */
2462 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);
2463
2464 args[4] = uint->zero; /* COMP flag */
2465 args[5] = base->undef; /* R, depth */
2466 args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
2467 args[7] = base->undef; /* B, sample mask */
2468 args[8] = base->undef; /* A, alpha to mask */
2469
2470 if (depth) {
2471 args[5] = depth;
2472 mask |= 0x1;
2473 }
2474
2475 if (stencil) {
2476 args[6] = stencil;
2477 mask |= 0x2;
2478 }
2479
2480 if (samplemask) {
2481 args[7] = samplemask;
2482 mask |= 0x4;
2483 }
2484
2485 /* SI (except OLAND) has a bug that it only looks
2486 * at the X writemask component. */
2487 if (ctx->screen->b.chip_class == SI &&
2488 ctx->screen->b.family != CHIP_OLAND)
2489 mask |= 0x1;
2490
2491 /* Specify which components to enable */
2492 args[0] = lp_build_const_int32(base->gallivm, mask);
2493
2494 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2495 ctx->voidt, args, 9, 0);
2496 }
2497
2498 static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
2499 LLVMValueRef *color, unsigned index,
2500 unsigned samplemask_param,
2501 bool is_last)
2502 {
2503 struct si_shader_context *ctx = si_shader_context(bld_base);
2504 struct lp_build_context *base = &bld_base->base;
2505 int i;
2506
2507 /* Clamp color */
2508 if (ctx->shader->key.ps.epilog.clamp_color)
2509 for (i = 0; i < 4; i++)
2510 color[i] = radeon_llvm_saturate(bld_base, color[i]);
2511
2512 /* Alpha to one */
2513 if (ctx->shader->key.ps.epilog.alpha_to_one)
2514 color[3] = base->one;
2515
2516 /* Alpha test */
2517 if (index == 0 &&
2518 ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
2519 si_alpha_test(bld_base, color[3]);
2520
2521 /* Line & polygon smoothing */
2522 if (ctx->shader->key.ps.epilog.poly_line_smoothing)
2523 color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
2524 samplemask_param);
2525
2526 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
2527 if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
2528 LLVMValueRef args[8][9];
2529 int c, last = -1;
2530
2531 /* Get the export arguments, also find out what the last one is. */
2532 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
2533 si_llvm_init_export_args(bld_base, color,
2534 V_008DFC_SQ_EXP_MRT + c, args[c]);
2535 if (args[c][0] != bld_base->uint_bld.zero)
2536 last = c;
2537 }
2538
2539 /* Emit all exports. */
2540 for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
2541 if (is_last && last == c) {
2542 args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
2543 args[c][2] = bld_base->uint_bld.one; /* DONE bit */
2544 } else if (args[c][0] == bld_base->uint_bld.zero)
2545 continue; /* unnecessary NULL export */
2546
2547 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2548 ctx->voidt, args[c], 9, 0);
2549 }
2550 } else {
2551 LLVMValueRef args[9];
2552
2553 /* Export */
2554 si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
2555 args);
2556 if (is_last) {
2557 args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
2558 args[2] = bld_base->uint_bld.one; /* DONE bit */
2559 } else if (args[0] == bld_base->uint_bld.zero)
2560 return; /* unnecessary NULL export */
2561
2562 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2563 ctx->voidt, args, 9, 0);
2564 }
2565 }
2566
2567 static void si_export_null(struct lp_build_tgsi_context *bld_base)
2568 {
2569 struct si_shader_context *ctx = si_shader_context(bld_base);
2570 struct lp_build_context *base = &bld_base->base;
2571 struct lp_build_context *uint = &bld_base->uint_bld;
2572 LLVMValueRef args[9];
2573
2574 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
2575 args[1] = uint->one; /* whether the EXEC mask is valid */
2576 args[2] = uint->one; /* DONE bit */
2577 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
2578 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
2579 args[5] = uint->undef; /* R */
2580 args[6] = uint->undef; /* G */
2581 args[7] = uint->undef; /* B */
2582 args[8] = uint->undef; /* A */
2583
2584 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
2585 ctx->voidt, args, 9, 0);
2586 }
2587
2588 static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
2589 {
2590 struct si_shader_context *ctx = si_shader_context(bld_base);
2591 struct si_shader *shader = ctx->shader;
2592 struct lp_build_context *base = &bld_base->base;
2593 struct tgsi_shader_info *info = &shader->selector->info;
2594 LLVMBuilderRef builder = base->gallivm->builder;
2595 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
2596 int last_color_export = -1;
2597 int i;
2598
2599 /* Determine the last export. If MRTZ is present, it's always last.
2600 * Otherwise, find the last color export.
2601 */
2602 if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
2603 unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;
2604
2605 /* Don't export NULL and return if alpha-test is enabled. */
2606 if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
2607 shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
2608 (spi_format & 0xf) == 0)
2609 spi_format |= V_028714_SPI_SHADER_32_AR;
2610
2611 for (i = 0; i < info->num_outputs; i++) {
2612 unsigned index = info->output_semantic_index[i];
2613
2614 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
2615 continue;
2616
2617 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
2618 if (shader->key.ps.epilog.last_cbuf > 0) {
2619 /* Just set this if any of the colorbuffers are enabled. */
2620 if (spi_format &
2621 ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
2622 last_color_export = i;
2623 continue;
2624 }
2625
2626 if ((spi_format >> (index * 4)) & 0xf)
2627 last_color_export = i;
2628 }
2629
2630 /* If there are no outputs, export NULL. */
2631 if (last_color_export == -1) {
2632 si_export_null(bld_base);
2633 return;
2634 }
2635 }
2636
2637 for (i = 0; i < info->num_outputs; i++) {
2638 unsigned semantic_name = info->output_semantic_name[i];
2639 unsigned semantic_index = info->output_semantic_index[i];
2640 unsigned j;
2641 LLVMValueRef color[4] = {};
2642
2643 /* Select the correct target */
2644 switch (semantic_name) {
2645 case TGSI_SEMANTIC_POSITION:
2646 depth = LLVMBuildLoad(builder,
2647 ctx->radeon_bld.soa.outputs[i][2], "");
2648 break;
2649 case TGSI_SEMANTIC_STENCIL:
2650 stencil = LLVMBuildLoad(builder,
2651 ctx->radeon_bld.soa.outputs[i][1], "");
2652 break;
2653 case TGSI_SEMANTIC_SAMPLEMASK:
2654 samplemask = LLVMBuildLoad(builder,
2655 ctx->radeon_bld.soa.outputs[i][0], "");
2656 break;
2657 case TGSI_SEMANTIC_COLOR:
2658 for (j = 0; j < 4; j++)
2659 color[j] = LLVMBuildLoad(builder,
2660 ctx->radeon_bld.soa.outputs[i][j], "");
2661
2662 si_export_mrt_color(bld_base, color, semantic_index,
2663 SI_PARAM_SAMPLE_COVERAGE,
2664 last_color_export == i);
2665 break;
2666 default:
2667 fprintf(stderr,
2668 "Warning: SI unhandled fs output type:%d\n",
2669 semantic_name);
2670 }
2671 }
2672
2673 if (depth || stencil || samplemask)
2674 si_export_mrt_z(bld_base, depth, stencil, samplemask);
2675 }
2676
2677 /**
2678 * Return PS outputs in this order:
2679 *
2680 * v[0:3] = color0.xyzw
2681 * v[4:7] = color1.xyzw
2682 * ...
2683 * vN+0 = Depth
2684 * vN+1 = Stencil
2685 * vN+2 = SampleMask
2686 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
2687 *
2688 * The alpha-ref SGPR is returned via its original location.
2689 */
2690 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
2691 {
2692 struct si_shader_context *ctx = si_shader_context(bld_base);
2693 struct si_shader *shader = ctx->shader;
2694 struct lp_build_context *base = &bld_base->base;
2695 struct tgsi_shader_info *info = &shader->selector->info;
2696 LLVMBuilderRef builder = base->gallivm->builder;
2697 unsigned i, j, first_vgpr, vgpr;
2698
2699 LLVMValueRef color[8][4] = {};
2700 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
2701 LLVMValueRef ret;
2702
2703 /* Read the output values. */
2704 for (i = 0; i < info->num_outputs; i++) {
2705 unsigned semantic_name = info->output_semantic_name[i];
2706 unsigned semantic_index = info->output_semantic_index[i];
2707
2708 switch (semantic_name) {
2709 case TGSI_SEMANTIC_COLOR:
2710 assert(semantic_index < 8);
2711 for (j = 0; j < 4; j++) {
2712 LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
2713 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
2714 color[semantic_index][j] = result;
2715 }
2716 break;
2717 case TGSI_SEMANTIC_POSITION:
2718 depth = LLVMBuildLoad(builder,
2719 ctx->radeon_bld.soa.outputs[i][2], "");
2720 break;
2721 case TGSI_SEMANTIC_STENCIL:
2722 stencil = LLVMBuildLoad(builder,
2723 ctx->radeon_bld.soa.outputs[i][1], "");
2724 break;
2725 case TGSI_SEMANTIC_SAMPLEMASK:
2726 samplemask = LLVMBuildLoad(builder,
2727 ctx->radeon_bld.soa.outputs[i][0], "");
2728 break;
2729 default:
2730 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
2731 semantic_name);
2732 }
2733 }
2734
2735 /* Fill the return structure. */
2736 ret = ctx->return_value;
2737
2738 /* Set SGPRs. */
2739 ret = LLVMBuildInsertValue(builder, ret,
2740 bitcast(bld_base, TGSI_TYPE_SIGNED,
2741 LLVMGetParam(ctx->radeon_bld.main_fn,
2742 SI_PARAM_ALPHA_REF)),
2743 SI_SGPR_ALPHA_REF, "");
2744
2745 /* Set VGPRs */
2746 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
2747 for (i = 0; i < ARRAY_SIZE(color); i++) {
2748 if (!color[i][0])
2749 continue;
2750
2751 for (j = 0; j < 4; j++)
2752 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
2753 }
2754 if (depth)
2755 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
2756 if (stencil)
2757 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
2758 if (samplemask)
2759 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
2760
2761 /* Add the input sample mask for smoothing at the end. */
2762 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
2763 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
2764 ret = LLVMBuildInsertValue(builder, ret,
2765 LLVMGetParam(ctx->radeon_bld.main_fn,
2766 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
2767
2768 ctx->return_value = ret;
2769 }
2770
2771 /**
2772 * Given a v8i32 resource descriptor for a buffer, extract the size of the
2773 * buffer in number of elements and return it as an i32.
2774 */
2775 static LLVMValueRef get_buffer_size(
2776 struct lp_build_tgsi_context *bld_base,
2777 LLVMValueRef descriptor)
2778 {
2779 struct si_shader_context *ctx = si_shader_context(bld_base);
2780 struct gallivm_state *gallivm = bld_base->base.gallivm;
2781 LLVMBuilderRef builder = gallivm->builder;
2782 LLVMValueRef size =
2783 LLVMBuildExtractElement(builder, descriptor,
2784 lp_build_const_int32(gallivm, 6), "");
2785
2786 if (ctx->screen->b.chip_class >= VI) {
2787 /* On VI, the descriptor contains the size in bytes,
2788 * but TXQ must return the size in elements.
2789 * The stride is always non-zero for resources using TXQ.
2790 */
2791 LLVMValueRef stride =
2792 LLVMBuildExtractElement(builder, descriptor,
2793 lp_build_const_int32(gallivm, 5), "");
2794 stride = LLVMBuildLShr(builder, stride,
2795 lp_build_const_int32(gallivm, 16), "");
2796 stride = LLVMBuildAnd(builder, stride,
2797 lp_build_const_int32(gallivm, 0x3FFF), "");
2798
2799 size = LLVMBuildUDiv(builder, size, stride, "");
2800 }
2801
2802 return size;
2803 }
2804
2805 /**
2806 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
2807 * intrinsic names).
2808 */
2809 static void build_int_type_name(
2810 LLVMTypeRef type,
2811 char *buf, unsigned bufsize)
2812 {
2813 assert(bufsize >= 6);
2814
2815 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
2816 snprintf(buf, bufsize, "v%ui32",
2817 LLVMGetVectorSize(type));
2818 else
2819 strcpy(buf, "i32");
2820 }
2821
2822 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
2823 struct lp_build_tgsi_context *bld_base,
2824 struct lp_build_emit_data *emit_data);
2825
2826 /* Prevent optimizations (at least of memory accesses) across the current
2827 * point in the program by emitting empty inline assembly that is marked as
2828 * having side effects.
2829 */
2830 static void emit_optimization_barrier(struct si_shader_context *ctx)
2831 {
2832 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2833 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
2834 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
2835 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
2836 }
2837
/* TGSI action callback for memory barriers. */
static void membar_emit(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	/* memoryBarrier only gives guarantees for atomics and coherent image
	 * accesses, and those bypass TC L1, so no cache handling is needed.
	 *
	 * What remains is to stop LLVM from re-ordering loads across the
	 * barrier.
	 */
	emit_optimization_barrier(si_shader_context(bld_base));
}
2854
2855 static LLVMValueRef
2856 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
2857 const struct tgsi_full_src_register *reg)
2858 {
2859 LLVMValueRef ind_index;
2860 LLVMValueRef rsrc_ptr;
2861
2862 if (!reg->Register.Indirect)
2863 return ctx->shader_buffers[reg->Register.Index];
2864
2865 ind_index = get_bounded_indirect_index(ctx, &reg->Indirect,
2866 reg->Register.Index,
2867 SI_NUM_SHADER_BUFFERS);
2868
2869 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
2870 return build_indexed_load_const(ctx, rsrc_ptr, ind_index);
2871 }
2872
2873 static bool tgsi_is_array_sampler(unsigned target)
2874 {
2875 return target == TGSI_TEXTURE_1D_ARRAY ||
2876 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
2877 target == TGSI_TEXTURE_2D_ARRAY ||
2878 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
2879 target == TGSI_TEXTURE_CUBE_ARRAY ||
2880 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
2881 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2882 }
2883
2884 static bool tgsi_is_array_image(unsigned target)
2885 {
2886 return target == TGSI_TEXTURE_3D ||
2887 target == TGSI_TEXTURE_CUBE ||
2888 target == TGSI_TEXTURE_1D_ARRAY ||
2889 target == TGSI_TEXTURE_2D_ARRAY ||
2890 target == TGSI_TEXTURE_CUBE_ARRAY ||
2891 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
2892 }
2893
2894 /**
2895 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
2896 *
2897 * At least on Tonga, executing image stores on images with DCC enabled and
2898 * non-trivial can eventually lead to lockups. This can occur when an
2899 * application binds an image as read-only but then uses a shader that writes
2900 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
2901 * program termination) in this case, but it doesn't cost much to be a bit
2902 * nicer: disabling DCC in the shader still leads to undefined results but
2903 * avoids the lockup.
2904 */
2905 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
2906 LLVMValueRef rsrc)
2907 {
2908 if (ctx->screen->b.chip_class <= CIK) {
2909 return rsrc;
2910 } else {
2911 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
2912 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
2913 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
2914 LLVMValueRef tmp;
2915
2916 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
2917 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
2918 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
2919 }
2920 }
2921
2922 /**
2923 * Load the resource descriptor for \p image.
2924 */
2925 static void
2926 image_fetch_rsrc(
2927 struct lp_build_tgsi_context *bld_base,
2928 const struct tgsi_full_src_register *image,
2929 bool dcc_off,
2930 LLVMValueRef *rsrc)
2931 {
2932 struct si_shader_context *ctx = si_shader_context(bld_base);
2933
2934 assert(image->Register.File == TGSI_FILE_IMAGE);
2935
2936 if (!image->Register.Indirect) {
2937 /* Fast path: use preloaded resources */
2938 *rsrc = ctx->images[image->Register.Index];
2939 } else {
2940 /* Indexing and manual load */
2941 LLVMValueRef ind_index;
2942 LLVMValueRef rsrc_ptr;
2943 LLVMValueRef tmp;
2944
2945 /* From the GL_ARB_shader_image_load_store extension spec:
2946 *
2947 * If a shader performs an image load, store, or atomic
2948 * operation using an image variable declared as an array,
2949 * and if the index used to select an individual element is
2950 * negative or greater than or equal to the size of the
2951 * array, the results of the operation are undefined but may
2952 * not lead to termination.
2953 */
2954 ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
2955 image->Register.Index,
2956 SI_NUM_IMAGES);
2957
2958 rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
2959 tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
2960 if (dcc_off)
2961 tmp = force_dcc_off(ctx, tmp);
2962 *rsrc = tmp;
2963 }
2964 }
2965
2966 static LLVMValueRef image_fetch_coords(
2967 struct lp_build_tgsi_context *bld_base,
2968 const struct tgsi_full_instruction *inst,
2969 unsigned src)
2970 {
2971 struct gallivm_state *gallivm = bld_base->base.gallivm;
2972 LLVMBuilderRef builder = gallivm->builder;
2973 unsigned target = inst->Memory.Texture;
2974 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
2975 LLVMValueRef coords[4];
2976 LLVMValueRef tmp;
2977 int chan;
2978
2979 for (chan = 0; chan < num_coords; ++chan) {
2980 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
2981 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
2982 coords[chan] = tmp;
2983 }
2984
2985 if (num_coords == 1)
2986 return coords[0];
2987
2988 if (num_coords == 3) {
2989 /* LLVM has difficulties lowering 3-element vectors. */
2990 coords[3] = bld_base->uint_bld.undef;
2991 num_coords = 4;
2992 }
2993
2994 return lp_build_gather_values(gallivm, coords, num_coords);
2995 }
2996
2997 /**
2998 * Append the extra mode bits that are used by image load and store.
2999 */
3000 static void image_append_args(
3001 struct si_shader_context *ctx,
3002 struct lp_build_emit_data * emit_data,
3003 unsigned target,
3004 bool atomic)
3005 {
3006 const struct tgsi_full_instruction *inst = emit_data->inst;
3007 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3008 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3009
3010 emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
3011 emit_data->args[emit_data->arg_count++] =
3012 tgsi_is_array_image(target) ? i1true : i1false; /* da */
3013 if (!atomic) {
3014 emit_data->args[emit_data->arg_count++] =
3015 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3016 i1true : i1false; /* glc */
3017 }
3018 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3019 }
3020
3021 /**
3022 * Given a 256 bit resource, extract the top half (which stores the buffer
3023 * resource in the case of textures and images).
3024 */
3025 static LLVMValueRef extract_rsrc_top_half(
3026 struct si_shader_context *ctx,
3027 LLVMValueRef rsrc)
3028 {
3029 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3030 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3031 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3032
3033 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3034 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3035 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3036
3037 return rsrc;
3038 }
3039
3040 /**
3041 * Append the resource and indexing arguments for buffer intrinsics.
3042 *
3043 * \param rsrc the v4i32 buffer resource
3044 * \param index index into the buffer (stride-based)
3045 * \param offset byte offset into the buffer
3046 */
3047 static void buffer_append_args(
3048 struct si_shader_context *ctx,
3049 struct lp_build_emit_data *emit_data,
3050 LLVMValueRef rsrc,
3051 LLVMValueRef index,
3052 LLVMValueRef offset,
3053 bool atomic)
3054 {
3055 const struct tgsi_full_instruction *inst = emit_data->inst;
3056 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3057 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3058
3059 emit_data->args[emit_data->arg_count++] = rsrc;
3060 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3061 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3062 if (!atomic) {
3063 emit_data->args[emit_data->arg_count++] =
3064 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3065 i1true : i1false; /* glc */
3066 }
3067 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3068 }
3069
3070 static void load_fetch_args(
3071 struct lp_build_tgsi_context * bld_base,
3072 struct lp_build_emit_data * emit_data)
3073 {
3074 struct si_shader_context *ctx = si_shader_context(bld_base);
3075 struct gallivm_state *gallivm = bld_base->base.gallivm;
3076 const struct tgsi_full_instruction * inst = emit_data->inst;
3077 unsigned target = inst->Memory.Texture;
3078 LLVMValueRef rsrc;
3079
3080 emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
3081
3082 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3083 LLVMBuilderRef builder = gallivm->builder;
3084 LLVMValueRef offset;
3085 LLVMValueRef tmp;
3086
3087 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3088
3089 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3090 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3091
3092 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3093 offset, false);
3094 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3095 LLVMValueRef coords;
3096
3097 image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
3098 coords = image_fetch_coords(bld_base, inst, 1);
3099
3100 if (target == TGSI_TEXTURE_BUFFER) {
3101 rsrc = extract_rsrc_top_half(ctx, rsrc);
3102 buffer_append_args(ctx, emit_data, rsrc, coords,
3103 bld_base->uint_bld.zero, false);
3104 } else {
3105 emit_data->args[0] = coords;
3106 emit_data->args[1] = rsrc;
3107 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
3108 emit_data->arg_count = 3;
3109
3110 image_append_args(ctx, emit_data, target, false);
3111 }
3112 }
3113 }
3114
3115 static void load_emit_buffer(struct si_shader_context *ctx,
3116 struct lp_build_emit_data *emit_data)
3117 {
3118 const struct tgsi_full_instruction *inst = emit_data->inst;
3119 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3120 LLVMBuilderRef builder = gallivm->builder;
3121 uint writemask = inst->Dst[0].Register.WriteMask;
3122 uint count = util_last_bit(writemask);
3123 const char *intrinsic_name;
3124 LLVMTypeRef dst_type;
3125
3126 switch (count) {
3127 case 1:
3128 intrinsic_name = "llvm.amdgcn.buffer.load.f32";
3129 dst_type = ctx->f32;
3130 break;
3131 case 2:
3132 intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
3133 dst_type = LLVMVectorType(ctx->f32, 2);
3134 break;
3135 default: // 3 & 4
3136 intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
3137 dst_type = ctx->v4f32;
3138 count = 4;
3139 }
3140
3141 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3142 builder, intrinsic_name, dst_type,
3143 emit_data->args, emit_data->arg_count,
3144 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3145 }
3146
3147 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3148 const struct tgsi_full_instruction *inst,
3149 LLVMTypeRef type, int arg)
3150 {
3151 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3152 LLVMBuilderRef builder = gallivm->builder;
3153 LLVMValueRef offset, ptr;
3154 int addr_space;
3155
3156 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3157 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3158
3159 ptr = ctx->shared_memory;
3160 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3161 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3162 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3163
3164 return ptr;
3165 }
3166
3167 static void load_emit_memory(
3168 struct si_shader_context *ctx,
3169 struct lp_build_emit_data *emit_data)
3170 {
3171 const struct tgsi_full_instruction *inst = emit_data->inst;
3172 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3173 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3174 LLVMBuilderRef builder = gallivm->builder;
3175 unsigned writemask = inst->Dst[0].Register.WriteMask;
3176 LLVMValueRef channels[4], ptr, derived_ptr, index;
3177 int chan;
3178
3179 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3180
3181 for (chan = 0; chan < 4; ++chan) {
3182 if (!(writemask & (1 << chan))) {
3183 channels[chan] = LLVMGetUndef(base->elem_type);
3184 continue;
3185 }
3186
3187 index = lp_build_const_int32(gallivm, chan);
3188 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3189 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3190 }
3191 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3192 }
3193
3194 static void load_emit(
3195 const struct lp_build_tgsi_action *action,
3196 struct lp_build_tgsi_context *bld_base,
3197 struct lp_build_emit_data *emit_data)
3198 {
3199 struct si_shader_context *ctx = si_shader_context(bld_base);
3200 struct gallivm_state *gallivm = bld_base->base.gallivm;
3201 LLVMBuilderRef builder = gallivm->builder;
3202 const struct tgsi_full_instruction * inst = emit_data->inst;
3203 char intrinsic_name[32];
3204 char coords_type[8];
3205
3206 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3207 load_emit_memory(ctx, emit_data);
3208 return;
3209 }
3210
3211 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3212 emit_optimization_barrier(ctx);
3213
3214 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3215 load_emit_buffer(ctx, emit_data);
3216 return;
3217 }
3218
3219 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3220 emit_data->output[emit_data->chan] =
3221 lp_build_intrinsic(
3222 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3223 emit_data->args, emit_data->arg_count,
3224 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3225 } else {
3226 build_int_type_name(LLVMTypeOf(emit_data->args[0]),
3227 coords_type, sizeof(coords_type));
3228
3229 snprintf(intrinsic_name, sizeof(intrinsic_name),
3230 "llvm.amdgcn.image.load.%s", coords_type);
3231
3232 emit_data->output[emit_data->chan] =
3233 lp_build_intrinsic(
3234 builder, intrinsic_name, emit_data->dst_type,
3235 emit_data->args, emit_data->arg_count,
3236 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
3237 }
3238 }
3239
3240 static void store_fetch_args(
3241 struct lp_build_tgsi_context * bld_base,
3242 struct lp_build_emit_data * emit_data)
3243 {
3244 struct si_shader_context *ctx = si_shader_context(bld_base);
3245 struct gallivm_state *gallivm = bld_base->base.gallivm;
3246 LLVMBuilderRef builder = gallivm->builder;
3247 const struct tgsi_full_instruction * inst = emit_data->inst;
3248 struct tgsi_full_src_register memory;
3249 LLVMValueRef chans[4];
3250 LLVMValueRef data;
3251 LLVMValueRef rsrc;
3252 unsigned chan;
3253
3254 emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
3255
3256 for (chan = 0; chan < 4; ++chan) {
3257 chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
3258 }
3259 data = lp_build_gather_values(gallivm, chans, 4);
3260
3261 emit_data->args[emit_data->arg_count++] = data;
3262
3263 memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
3264
3265 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3266 LLVMValueRef offset;
3267 LLVMValueRef tmp;
3268
3269 rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
3270
3271 tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
3272 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3273
3274 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3275 offset, false);
3276 } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
3277 unsigned target = inst->Memory.Texture;
3278 LLVMValueRef coords;
3279
3280 coords = image_fetch_coords(bld_base, inst, 0);
3281
3282 if (target == TGSI_TEXTURE_BUFFER) {
3283 image_fetch_rsrc(bld_base, &memory, false, &rsrc);
3284
3285 rsrc = extract_rsrc_top_half(ctx, rsrc);
3286 buffer_append_args(ctx, emit_data, rsrc, coords,
3287 bld_base->uint_bld.zero, false);
3288 } else {
3289 emit_data->args[1] = coords;
3290 image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
3291 emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
3292 emit_data->arg_count = 4;
3293
3294 image_append_args(ctx, emit_data, target, false);
3295 }
3296 }
3297 }
3298
3299 static void store_emit_buffer(
3300 struct si_shader_context *ctx,
3301 struct lp_build_emit_data *emit_data)
3302 {
3303 const struct tgsi_full_instruction *inst = emit_data->inst;
3304 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3305 LLVMBuilderRef builder = gallivm->builder;
3306 struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
3307 LLVMValueRef base_data = emit_data->args[0];
3308 LLVMValueRef base_offset = emit_data->args[3];
3309 unsigned writemask = inst->Dst[0].Register.WriteMask;
3310
3311 while (writemask) {
3312 int start, count;
3313 const char *intrinsic_name;
3314 LLVMValueRef data;
3315 LLVMValueRef offset;
3316 LLVMValueRef tmp;
3317
3318 u_bit_scan_consecutive_range(&writemask, &start, &count);
3319
3320 /* Due to an LLVM limitation, split 3-element writes
3321 * into a 2-element and a 1-element write. */
3322 if (count == 3) {
3323 writemask |= 1 << (start + 2);
3324 count = 2;
3325 }
3326
3327 if (count == 4) {
3328 data = base_data;
3329 intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
3330 } else if (count == 2) {
3331 LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
3332
3333 tmp = LLVMBuildExtractElement(
3334 builder, base_data,
3335 lp_build_const_int32(gallivm, start), "");
3336 data = LLVMBuildInsertElement(
3337 builder, LLVMGetUndef(v2f32), tmp,
3338 uint_bld->zero, "");
3339
3340 tmp = LLVMBuildExtractElement(
3341 builder, base_data,
3342 lp_build_const_int32(gallivm, start + 1), "");
3343 data = LLVMBuildInsertElement(
3344 builder, data, tmp, uint_bld->one, "");
3345
3346 intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
3347 } else {
3348 assert(count == 1);
3349 data = LLVMBuildExtractElement(
3350 builder, base_data,
3351 lp_build_const_int32(gallivm, start), "");
3352 intrinsic_name = "llvm.amdgcn.buffer.store.f32";
3353 }
3354
3355 offset = base_offset;
3356 if (start != 0) {
3357 offset = LLVMBuildAdd(
3358 builder, offset,
3359 lp_build_const_int32(gallivm, start * 4), "");
3360 }
3361
3362 emit_data->args[0] = data;
3363 emit_data->args[3] = offset;
3364
3365 lp_build_intrinsic(
3366 builder, intrinsic_name, emit_data->dst_type,
3367 emit_data->args, emit_data->arg_count,
3368 LLVMNoUnwindAttribute);
3369 }
3370 }
3371
3372 static void store_emit_memory(
3373 struct si_shader_context *ctx,
3374 struct lp_build_emit_data *emit_data)
3375 {
3376 const struct tgsi_full_instruction *inst = emit_data->inst;
3377 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3378 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3379 LLVMBuilderRef builder = gallivm->builder;
3380 unsigned writemask = inst->Dst[0].Register.WriteMask;
3381 LLVMValueRef ptr, derived_ptr, data, index;
3382 int chan;
3383
3384 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3385
3386 for (chan = 0; chan < 4; ++chan) {
3387 if (!(writemask & (1 << chan))) {
3388 continue;
3389 }
3390 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3391 index = lp_build_const_int32(gallivm, chan);
3392 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3393 LLVMBuildStore(builder, data, derived_ptr);
3394 }
3395 }
3396
3397 static void store_emit(
3398 const struct lp_build_tgsi_action *action,
3399 struct lp_build_tgsi_context *bld_base,
3400 struct lp_build_emit_data *emit_data)
3401 {
3402 struct gallivm_state *gallivm = bld_base->base.gallivm;
3403 LLVMBuilderRef builder = gallivm->builder;
3404 const struct tgsi_full_instruction * inst = emit_data->inst;
3405 unsigned target = inst->Memory.Texture;
3406 char intrinsic_name[32];
3407 char coords_type[8];
3408
3409 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
3410 store_emit_buffer(si_shader_context(bld_base), emit_data);
3411 return;
3412 } else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
3413 store_emit_memory(si_shader_context(bld_base), emit_data);
3414 return;
3415 }
3416
3417 if (target == TGSI_TEXTURE_BUFFER) {
3418 emit_data->output[emit_data->chan] = lp_build_intrinsic(
3419 builder, "llvm.amdgcn.buffer.store.format.v4f32",
3420 emit_data->dst_type, emit_data->args, emit_data->arg_count,
3421 LLVMNoUnwindAttribute);
3422 } else {
3423 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3424 coords_type, sizeof(coords_type));
3425 snprintf(intrinsic_name, sizeof(intrinsic_name),
3426 "llvm.amdgcn.image.store.%s", coords_type);
3427
3428 emit_data->output[emit_data->chan] =
3429 lp_build_intrinsic(
3430 builder, intrinsic_name, emit_data->dst_type,
3431 emit_data->args, emit_data->arg_count,
3432 LLVMNoUnwindAttribute);
3433 }
3434 }
3435
3436 static void atomic_fetch_args(
3437 struct lp_build_tgsi_context * bld_base,
3438 struct lp_build_emit_data * emit_data)
3439 {
3440 struct si_shader_context *ctx = si_shader_context(bld_base);
3441 struct gallivm_state *gallivm = bld_base->base.gallivm;
3442 LLVMBuilderRef builder = gallivm->builder;
3443 const struct tgsi_full_instruction * inst = emit_data->inst;
3444 LLVMValueRef data1, data2;
3445 LLVMValueRef rsrc;
3446 LLVMValueRef tmp;
3447
3448 emit_data->dst_type = bld_base->base.elem_type;
3449
3450 tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
3451 data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3452
3453 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3454 tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
3455 data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3456 }
3457
3458 /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
3459 * of arguments, which is reversed relative to TGSI (and GLSL)
3460 */
3461 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
3462 emit_data->args[emit_data->arg_count++] = data2;
3463 emit_data->args[emit_data->arg_count++] = data1;
3464
3465 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3466 LLVMValueRef offset;
3467
3468 rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
3469
3470 tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
3471 offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3472
3473 buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
3474 offset, true);
3475 } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
3476 unsigned target = inst->Memory.Texture;
3477 LLVMValueRef coords;
3478
3479 image_fetch_rsrc(bld_base, &inst->Src[0],
3480 target != TGSI_TEXTURE_BUFFER, &rsrc);
3481 coords = image_fetch_coords(bld_base, inst, 1);
3482
3483 if (target == TGSI_TEXTURE_BUFFER) {
3484 rsrc = extract_rsrc_top_half(ctx, rsrc);
3485 buffer_append_args(ctx, emit_data, rsrc, coords,
3486 bld_base->uint_bld.zero, true);
3487 } else {
3488 emit_data->args[emit_data->arg_count++] = coords;
3489 emit_data->args[emit_data->arg_count++] = rsrc;
3490
3491 image_append_args(ctx, emit_data, target, true);
3492 }
3493 }
3494 }
3495
3496 static void atomic_emit_memory(struct si_shader_context *ctx,
3497 struct lp_build_emit_data *emit_data) {
3498 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3499 LLVMBuilderRef builder = gallivm->builder;
3500 const struct tgsi_full_instruction * inst = emit_data->inst;
3501 LLVMValueRef ptr, result, arg;
3502
3503 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
3504
3505 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
3506 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
3507
3508 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3509 LLVMValueRef new_data;
3510 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
3511 inst, 3, 0);
3512
3513 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
3514
3515 #if HAVE_LLVM >= 0x309
3516 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
3517 LLVMAtomicOrderingSequentiallyConsistent,
3518 LLVMAtomicOrderingSequentiallyConsistent,
3519 false);
3520 #endif
3521
3522 result = LLVMBuildExtractValue(builder, result, 0, "");
3523 } else {
3524 LLVMAtomicRMWBinOp op;
3525
3526 switch(inst->Instruction.Opcode) {
3527 case TGSI_OPCODE_ATOMUADD:
3528 op = LLVMAtomicRMWBinOpAdd;
3529 break;
3530 case TGSI_OPCODE_ATOMXCHG:
3531 op = LLVMAtomicRMWBinOpXchg;
3532 break;
3533 case TGSI_OPCODE_ATOMAND:
3534 op = LLVMAtomicRMWBinOpAnd;
3535 break;
3536 case TGSI_OPCODE_ATOMOR:
3537 op = LLVMAtomicRMWBinOpOr;
3538 break;
3539 case TGSI_OPCODE_ATOMXOR:
3540 op = LLVMAtomicRMWBinOpXor;
3541 break;
3542 case TGSI_OPCODE_ATOMUMIN:
3543 op = LLVMAtomicRMWBinOpUMin;
3544 break;
3545 case TGSI_OPCODE_ATOMUMAX:
3546 op = LLVMAtomicRMWBinOpUMax;
3547 break;
3548 case TGSI_OPCODE_ATOMIMIN:
3549 op = LLVMAtomicRMWBinOpMin;
3550 break;
3551 case TGSI_OPCODE_ATOMIMAX:
3552 op = LLVMAtomicRMWBinOpMax;
3553 break;
3554 default:
3555 unreachable("unknown atomic opcode");
3556 }
3557
3558 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
3559 LLVMAtomicOrderingSequentiallyConsistent,
3560 false);
3561 }
3562 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
3563 }
3564
3565 static void atomic_emit(
3566 const struct lp_build_tgsi_action *action,
3567 struct lp_build_tgsi_context *bld_base,
3568 struct lp_build_emit_data *emit_data)
3569 {
3570 struct si_shader_context *ctx = si_shader_context(bld_base);
3571 struct gallivm_state *gallivm = bld_base->base.gallivm;
3572 LLVMBuilderRef builder = gallivm->builder;
3573 const struct tgsi_full_instruction * inst = emit_data->inst;
3574 char intrinsic_name[40];
3575 LLVMValueRef tmp;
3576
3577 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3578 atomic_emit_memory(ctx, emit_data);
3579 return;
3580 }
3581
3582 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
3583 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3584 snprintf(intrinsic_name, sizeof(intrinsic_name),
3585 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
3586 } else {
3587 char coords_type[8];
3588
3589 build_int_type_name(LLVMTypeOf(emit_data->args[1]),
3590 coords_type, sizeof(coords_type));
3591 snprintf(intrinsic_name, sizeof(intrinsic_name),
3592 "llvm.amdgcn.image.atomic.%s.%s",
3593 action->intr_name, coords_type);
3594 }
3595
3596 tmp = lp_build_intrinsic(
3597 builder, intrinsic_name, bld_base->uint_bld.elem_type,
3598 emit_data->args, emit_data->arg_count,
3599 LLVMNoUnwindAttribute);
3600 emit_data->output[emit_data->chan] =
3601 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
3602 }
3603
3604 static void resq_fetch_args(
3605 struct lp_build_tgsi_context * bld_base,
3606 struct lp_build_emit_data * emit_data)
3607 {
3608 struct si_shader_context *ctx = si_shader_context(bld_base);
3609 struct gallivm_state *gallivm = bld_base->base.gallivm;
3610 const struct tgsi_full_instruction *inst = emit_data->inst;
3611 const struct tgsi_full_src_register *reg = &inst->Src[0];
3612
3613 emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
3614
3615 if (reg->Register.File == TGSI_FILE_BUFFER) {
3616 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
3617 emit_data->arg_count = 1;
3618 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3619 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
3620 emit_data->arg_count = 1;
3621 } else {
3622 emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
3623 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
3624 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
3625 emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
3626 emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
3627 emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
3628 bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
3629 emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
3630 emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
3631 emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
3632 emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
3633 emit_data->arg_count = 10;
3634 }
3635 }
3636
3637 static void resq_emit(
3638 const struct lp_build_tgsi_action *action,
3639 struct lp_build_tgsi_context *bld_base,
3640 struct lp_build_emit_data *emit_data)
3641 {
3642 struct gallivm_state *gallivm = bld_base->base.gallivm;
3643 LLVMBuilderRef builder = gallivm->builder;
3644 const struct tgsi_full_instruction *inst = emit_data->inst;
3645 LLVMValueRef out;
3646
3647 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3648 out = LLVMBuildExtractElement(builder, emit_data->args[0],
3649 lp_build_const_int32(gallivm, 2), "");
3650 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3651 out = get_buffer_size(bld_base, emit_data->args[0]);
3652 } else {
3653 out = lp_build_intrinsic(
3654 builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
3655 emit_data->args, emit_data->arg_count,
3656 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
3657
3658 /* Divide the number of layers by 6 to get the number of cubes. */
3659 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
3660 LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
3661 LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
3662
3663 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
3664 z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
3665 z = LLVMBuildSDiv(builder, z, imm6, "");
3666 z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
3667 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
3668 }
3669 }
3670
3671 emit_data->output[emit_data->chan] = out;
3672 }
3673
3674 static void set_tex_fetch_args(struct si_shader_context *ctx,
3675 struct lp_build_emit_data *emit_data,
3676 unsigned opcode, unsigned target,
3677 LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
3678 LLVMValueRef *param, unsigned count,
3679 unsigned dmask)
3680 {
3681 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3682 unsigned num_args;
3683 unsigned is_rect = target == TGSI_TEXTURE_RECT;
3684
3685 /* Pad to power of two vector */
3686 while (count < util_next_power_of_two(count))
3687 param[count++] = LLVMGetUndef(ctx->i32);
3688
3689 /* Texture coordinates. */
3690 if (count > 1)
3691 emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
3692 else
3693 emit_data->args[0] = param[0];
3694
3695 /* Resource. */
3696 emit_data->args[1] = res_ptr;
3697 num_args = 2;
3698
3699 if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
3700 emit_data->dst_type = ctx->v4i32;
3701 else {
3702 emit_data->dst_type = ctx->v4f32;
3703
3704 emit_data->args[num_args++] = samp_ptr;
3705 }
3706
3707 emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
3708 emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
3709 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
3710 emit_data->args[num_args++] = lp_build_const_int32(gallivm,
3711 tgsi_is_array_sampler(target)); /* da */
3712 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
3713 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
3714 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
3715 emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */
3716
3717 emit_data->arg_count = num_args;
3718 }
3719
3720 static const struct lp_build_tgsi_action tex_action;
3721
/* Which part of a sampler slot get_sampler_desc_custom() should load. */
enum desc_type {
	DESC_IMAGE,	/* image view */
	DESC_FMASK,	/* FMASK view */
	DESC_SAMPLER,	/* sampler state */
};
3727
3728 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
3729 {
3730 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
3731 CONST_ADDR_SPACE);
3732 }
3733
3734 /**
3735 * Load an image view, fmask view. or sampler state descriptor.
3736 */
3737 static LLVMValueRef get_sampler_desc_custom(struct si_shader_context *ctx,
3738 LLVMValueRef list, LLVMValueRef index,
3739 enum desc_type type)
3740 {
3741 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3742 LLVMBuilderRef builder = gallivm->builder;
3743
3744 switch (type) {
3745 case DESC_IMAGE:
3746 /* The image is at [0:7]. */
3747 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
3748 break;
3749 case DESC_FMASK:
3750 /* The FMASK is at [8:15]. */
3751 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
3752 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
3753 break;
3754 case DESC_SAMPLER:
3755 /* The sampler state is at [12:15]. */
3756 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
3757 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
3758 list = LLVMBuildPointerCast(builder, list,
3759 const_array(ctx->v4i32, 0), "");
3760 break;
3761 }
3762
3763 return build_indexed_load_const(ctx, list, index);
3764 }
3765
3766 static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
3767 LLVMValueRef index, enum desc_type type)
3768 {
3769 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
3770 SI_PARAM_SAMPLERS);
3771
3772 return get_sampler_desc_custom(ctx, list, index, type);
3773 }
3774
3775 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
3776 *
3777 * SI-CI:
3778 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
3779 * filtering manually. The driver sets img7 to a mask clearing
3780 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
3781 * s_and_b32 samp0, samp0, img7
3782 *
3783 * VI:
3784 * The ANISO_OVERRIDE sampler field enables this fix in TA.
3785 */
3786 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
3787 LLVMValueRef res, LLVMValueRef samp)
3788 {
3789 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3790 LLVMValueRef img7, samp0;
3791
3792 if (ctx->screen->b.chip_class >= VI)
3793 return samp;
3794
3795 img7 = LLVMBuildExtractElement(builder, res,
3796 LLVMConstInt(ctx->i32, 7, 0), "");
3797 samp0 = LLVMBuildExtractElement(builder, samp,
3798 LLVMConstInt(ctx->i32, 0, 0), "");
3799 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
3800 return LLVMBuildInsertElement(builder, samp, samp0,
3801 LLVMConstInt(ctx->i32, 0, 0), "");
3802 }
3803
3804 static void tex_fetch_ptrs(
3805 struct lp_build_tgsi_context *bld_base,
3806 struct lp_build_emit_data *emit_data,
3807 LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
3808 {
3809 struct si_shader_context *ctx = si_shader_context(bld_base);
3810 const struct tgsi_full_instruction *inst = emit_data->inst;
3811 unsigned target = inst->Texture.Texture;
3812 unsigned sampler_src;
3813 unsigned sampler_index;
3814
3815 sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
3816 sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
3817
3818 if (emit_data->inst->Src[sampler_src].Register.Indirect) {
3819 const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];
3820 LLVMValueRef ind_index;
3821
3822 ind_index = get_bounded_indirect_index(ctx,
3823 &reg->Indirect,
3824 reg->Register.Index,
3825 SI_NUM_SAMPLERS);
3826
3827 *res_ptr = get_sampler_desc(ctx, ind_index, DESC_IMAGE);
3828
3829 if (target == TGSI_TEXTURE_2D_MSAA ||
3830 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
3831 *samp_ptr = NULL;
3832 *fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
3833 } else {
3834 *samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
3835 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
3836 *fmask_ptr = NULL;
3837 }
3838 } else {
3839 *res_ptr = ctx->sampler_views[sampler_index];
3840 *samp_ptr = ctx->sampler_states[sampler_index];
3841 *fmask_ptr = ctx->fmasks[sampler_index];
3842 }
3843 }
3844
3845 static void tex_fetch_args(
3846 struct lp_build_tgsi_context *bld_base,
3847 struct lp_build_emit_data *emit_data)
3848 {
3849 struct si_shader_context *ctx = si_shader_context(bld_base);
3850 struct gallivm_state *gallivm = bld_base->base.gallivm;
3851 LLVMBuilderRef builder = gallivm->builder;
3852 const struct tgsi_full_instruction *inst = emit_data->inst;
3853 unsigned opcode = inst->Instruction.Opcode;
3854 unsigned target = inst->Texture.Texture;
3855 LLVMValueRef coords[5], derivs[6];
3856 LLVMValueRef address[16];
3857 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3858 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
3859 unsigned count = 0;
3860 unsigned chan;
3861 unsigned num_deriv_channels = 0;
3862 bool has_offset = inst->Texture.NumOffsets > 0;
3863 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
3864 unsigned dmask = 0xf;
3865
3866 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
3867
3868 if (opcode == TGSI_OPCODE_TXQ) {
3869 if (target == TGSI_TEXTURE_BUFFER) {
3870 /* Read the size from the buffer descriptor directly. */
3871 LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
3872 emit_data->args[0] = get_buffer_size(bld_base, res);
3873 return;
3874 }
3875
3876 /* Textures - set the mip level. */
3877 address[count++] = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
3878
3879 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
3880 NULL, address, count, 0xf);
3881 return;
3882 }
3883
3884 if (target == TGSI_TEXTURE_BUFFER) {
3885 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3886
3887 /* Bitcast and truncate v8i32 to v16i8. */
3888 LLVMValueRef res = res_ptr;
3889 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
3890 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
3891 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
3892
3893 emit_data->dst_type = ctx->v4f32;
3894 emit_data->args[0] = res;
3895 emit_data->args[1] = bld_base->uint_bld.zero;
3896 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
3897 emit_data->arg_count = 3;
3898 return;
3899 }
3900
3901 /* Fetch and project texture coordinates */
3902 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
3903 for (chan = 0; chan < 3; chan++ ) {
3904 coords[chan] = lp_build_emit_fetch(bld_base,
3905 emit_data->inst, 0,
3906 chan);
3907 if (opcode == TGSI_OPCODE_TXP)
3908 coords[chan] = lp_build_emit_llvm_binary(bld_base,
3909 TGSI_OPCODE_DIV,
3910 coords[chan],
3911 coords[3]);
3912 }
3913
3914 if (opcode == TGSI_OPCODE_TXP)
3915 coords[3] = bld_base->base.one;
3916
3917 /* Pack offsets. */
3918 if (has_offset && opcode != TGSI_OPCODE_TXF) {
3919 /* The offsets are six-bit signed integers packed like this:
3920 * X=[5:0], Y=[13:8], and Z=[21:16].
3921 */
3922 LLVMValueRef offset[3], pack;
3923
3924 assert(inst->Texture.NumOffsets == 1);
3925
3926 for (chan = 0; chan < 3; chan++) {
3927 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
3928 emit_data->inst, 0, chan);
3929 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
3930 lp_build_const_int32(gallivm, 0x3f), "");
3931 if (chan)
3932 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
3933 lp_build_const_int32(gallivm, chan*8), "");
3934 }
3935
3936 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
3937 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
3938 address[count++] = pack;
3939 }
3940
3941 /* Pack LOD bias value */
3942 if (opcode == TGSI_OPCODE_TXB)
3943 address[count++] = coords[3];
3944 if (opcode == TGSI_OPCODE_TXB2)
3945 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
3946
3947 /* Pack depth comparison value */
3948 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
3949 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
3950 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
3951 } else {
3952 assert(ref_pos >= 0);
3953 address[count++] = coords[ref_pos];
3954 }
3955 }
3956
3957 /* Pack user derivatives */
3958 if (opcode == TGSI_OPCODE_TXD) {
3959 int param, num_src_deriv_channels;
3960
3961 switch (target) {
3962 case TGSI_TEXTURE_3D:
3963 num_src_deriv_channels = 3;
3964 num_deriv_channels = 3;
3965 break;
3966 case TGSI_TEXTURE_2D:
3967 case TGSI_TEXTURE_SHADOW2D:
3968 case TGSI_TEXTURE_RECT:
3969 case TGSI_TEXTURE_SHADOWRECT:
3970 case TGSI_TEXTURE_2D_ARRAY:
3971 case TGSI_TEXTURE_SHADOW2D_ARRAY:
3972 num_src_deriv_channels = 2;
3973 num_deriv_channels = 2;
3974 break;
3975 case TGSI_TEXTURE_CUBE:
3976 case TGSI_TEXTURE_SHADOWCUBE:
3977 case TGSI_TEXTURE_CUBE_ARRAY:
3978 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
3979 /* Cube derivatives will be converted to 2D. */
3980 num_src_deriv_channels = 3;
3981 num_deriv_channels = 2;
3982 break;
3983 case TGSI_TEXTURE_1D:
3984 case TGSI_TEXTURE_SHADOW1D:
3985 case TGSI_TEXTURE_1D_ARRAY:
3986 case TGSI_TEXTURE_SHADOW1D_ARRAY:
3987 num_src_deriv_channels = 1;
3988 num_deriv_channels = 1;
3989 break;
3990 default:
3991 unreachable("invalid target");
3992 }
3993
3994 for (param = 0; param < 2; param++)
3995 for (chan = 0; chan < num_src_deriv_channels; chan++)
3996 derivs[param * num_src_deriv_channels + chan] =
3997 lp_build_emit_fetch(bld_base, inst, param+1, chan);
3998 }
3999
4000 if (target == TGSI_TEXTURE_CUBE ||
4001 target == TGSI_TEXTURE_CUBE_ARRAY ||
4002 target == TGSI_TEXTURE_SHADOWCUBE ||
4003 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4004 radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4005
4006 if (opcode == TGSI_OPCODE_TXD)
4007 for (int i = 0; i < num_deriv_channels * 2; i++)
4008 address[count++] = derivs[i];
4009
4010 /* Pack texture coordinates */
4011 address[count++] = coords[0];
4012 if (num_coords > 1)
4013 address[count++] = coords[1];
4014 if (num_coords > 2)
4015 address[count++] = coords[2];
4016
4017 /* Pack LOD or sample index */
4018 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4019 address[count++] = coords[3];
4020 else if (opcode == TGSI_OPCODE_TXL2)
4021 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4022
4023 if (count > 16) {
4024 assert(!"Cannot handle more than 16 texture address parameters");
4025 count = 16;
4026 }
4027
4028 for (chan = 0; chan < count; chan++ ) {
4029 address[chan] = LLVMBuildBitCast(gallivm->builder,
4030 address[chan], ctx->i32, "");
4031 }
4032
4033 /* Adjust the sample index according to FMASK.
4034 *
4035 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4036 * which is the identity mapping. Each nibble says which physical sample
4037 * should be fetched to get that sample.
4038 *
4039 * For example, 0x11111100 means there are only 2 samples stored and
4040 * the second sample covers 3/4 of the pixel. When reading samples 0
4041 * and 1, return physical sample 0 (determined by the first two 0s
4042 * in FMASK), otherwise return physical sample 1.
4043 *
4044 * The sample index should be adjusted as follows:
4045 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4046 */
4047 if (target == TGSI_TEXTURE_2D_MSAA ||
4048 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4049 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4050 struct lp_build_emit_data txf_emit_data = *emit_data;
4051 LLVMValueRef txf_address[4];
4052 unsigned txf_count = count;
4053 struct tgsi_full_instruction inst = {};
4054
4055 memcpy(txf_address, address, sizeof(txf_address));
4056
4057 if (target == TGSI_TEXTURE_2D_MSAA) {
4058 txf_address[2] = bld_base->uint_bld.zero;
4059 }
4060 txf_address[3] = bld_base->uint_bld.zero;
4061
4062 /* Read FMASK using TXF. */
4063 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4064 inst.Texture.Texture = target;
4065 txf_emit_data.inst = &inst;
4066 txf_emit_data.chan = 0;
4067 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4068 target, fmask_ptr, NULL,
4069 txf_address, txf_count, 0xf);
4070 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4071
4072 /* Initialize some constants. */
4073 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4074 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4075
4076 /* Apply the formula. */
4077 LLVMValueRef fmask =
4078 LLVMBuildExtractElement(gallivm->builder,
4079 txf_emit_data.output[0],
4080 uint_bld->zero, "");
4081
4082 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4083
4084 LLVMValueRef sample_index4 =
4085 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4086
4087 LLVMValueRef shifted_fmask =
4088 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4089
4090 LLVMValueRef final_sample =
4091 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4092
4093 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4094 * resource descriptor is 0 (invalid),
4095 */
4096 LLVMValueRef fmask_desc =
4097 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4098 ctx->v8i32, "");
4099
4100 LLVMValueRef fmask_word1 =
4101 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4102 uint_bld->one, "");
4103
4104 LLVMValueRef word1_is_nonzero =
4105 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4106 fmask_word1, uint_bld->zero, "");
4107
4108 /* Replace the MSAA sample index. */
4109 address[sample_chan] =
4110 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4111 final_sample, address[sample_chan], "");
4112 }
4113
4114 if (opcode == TGSI_OPCODE_TXF) {
4115 /* add tex offsets */
4116 if (inst->Texture.NumOffsets) {
4117 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4118 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4119 const struct tgsi_texture_offset *off = inst->TexOffsets;
4120
4121 assert(inst->Texture.NumOffsets == 1);
4122
4123 switch (target) {
4124 case TGSI_TEXTURE_3D:
4125 address[2] = lp_build_add(uint_bld, address[2],
4126 bld->immediates[off->Index][off->SwizzleZ]);
4127 /* fall through */
4128 case TGSI_TEXTURE_2D:
4129 case TGSI_TEXTURE_SHADOW2D:
4130 case TGSI_TEXTURE_RECT:
4131 case TGSI_TEXTURE_SHADOWRECT:
4132 case TGSI_TEXTURE_2D_ARRAY:
4133 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4134 address[1] =
4135 lp_build_add(uint_bld, address[1],
4136 bld->immediates[off->Index][off->SwizzleY]);
4137 /* fall through */
4138 case TGSI_TEXTURE_1D:
4139 case TGSI_TEXTURE_SHADOW1D:
4140 case TGSI_TEXTURE_1D_ARRAY:
4141 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4142 address[0] =
4143 lp_build_add(uint_bld, address[0],
4144 bld->immediates[off->Index][off->SwizzleX]);
4145 break;
4146 /* texture offsets do not apply to other texture targets */
4147 }
4148 }
4149 }
4150
4151 if (opcode == TGSI_OPCODE_TG4) {
4152 unsigned gather_comp = 0;
4153
4154 /* DMASK was repurposed for GATHER4. 4 components are always
4155 * returned and DMASK works like a swizzle - it selects
4156 * the component to fetch. The only valid DMASK values are
4157 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4158 * (red,red,red,red) etc.) The ISA document doesn't mention
4159 * this.
4160 */
4161
4162 /* Get the component index from src1.x for Gather4. */
4163 if (!tgsi_is_shadow_target(target)) {
4164 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4165 LLVMValueRef comp_imm;
4166 struct tgsi_src_register src1 = inst->Src[1].Register;
4167
4168 assert(src1.File == TGSI_FILE_IMMEDIATE);
4169
4170 comp_imm = imms[src1.Index][src1.SwizzleX];
4171 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4172 gather_comp = CLAMP(gather_comp, 0, 3);
4173 }
4174
4175 dmask = 1 << gather_comp;
4176 }
4177
4178 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4179 samp_ptr, address, count, dmask);
4180 }
4181
4182 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4183 struct lp_build_tgsi_context *bld_base,
4184 struct lp_build_emit_data *emit_data)
4185 {
4186 struct lp_build_context *base = &bld_base->base;
4187 unsigned opcode = emit_data->inst->Instruction.Opcode;
4188 unsigned target = emit_data->inst->Texture.Texture;
4189 char intr_name[127];
4190 bool has_offset = emit_data->inst->Texture.NumOffsets > 0;
4191 bool is_shadow = tgsi_is_shadow_target(target);
4192 char type[64];
4193 const char *name = "llvm.SI.image.sample";
4194 const char *infix = "";
4195
4196 if (opcode == TGSI_OPCODE_TXQ && target == TGSI_TEXTURE_BUFFER) {
4197 /* Just return the buffer size. */
4198 emit_data->output[emit_data->chan] = emit_data->args[0];
4199 return;
4200 }
4201
4202 if (target == TGSI_TEXTURE_BUFFER) {
4203 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4204 base->gallivm->builder,
4205 "llvm.SI.vs.load.input", emit_data->dst_type,
4206 emit_data->args, emit_data->arg_count,
4207 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4208 return;
4209 }
4210
4211 switch (opcode) {
4212 case TGSI_OPCODE_TXF:
4213 name = target == TGSI_TEXTURE_2D_MSAA ||
4214 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4215 "llvm.SI.image.load" :
4216 "llvm.SI.image.load.mip";
4217 is_shadow = false;
4218 has_offset = false;
4219 break;
4220 case TGSI_OPCODE_TXQ:
4221 name = "llvm.SI.getresinfo";
4222 is_shadow = false;
4223 has_offset = false;
4224 break;
4225 case TGSI_OPCODE_LODQ:
4226 name = "llvm.SI.getlod";
4227 is_shadow = false;
4228 has_offset = false;
4229 break;
4230 case TGSI_OPCODE_TEX:
4231 case TGSI_OPCODE_TEX2:
4232 case TGSI_OPCODE_TXP:
4233 break;
4234 case TGSI_OPCODE_TXB:
4235 case TGSI_OPCODE_TXB2:
4236 infix = ".b";
4237 break;
4238 case TGSI_OPCODE_TXL:
4239 case TGSI_OPCODE_TXL2:
4240 infix = ".l";
4241 break;
4242 case TGSI_OPCODE_TXD:
4243 infix = ".d";
4244 break;
4245 case TGSI_OPCODE_TG4:
4246 name = "llvm.SI.gather4";
4247 break;
4248 default:
4249 assert(0);
4250 return;
4251 }
4252
4253 /* Add the type and suffixes .c, .o if needed. */
4254 build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4255 sprintf(intr_name, "%s%s%s%s.%s",
4256 name, is_shadow ? ".c" : "", infix,
4257 has_offset ? ".o" : "", type);
4258
4259 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4260 base->gallivm->builder, intr_name, emit_data->dst_type,
4261 emit_data->args, emit_data->arg_count,
4262 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4263
4264 /* Divide the number of layers by 6 to get the number of cubes. */
4265 if (opcode == TGSI_OPCODE_TXQ &&
4266 (target == TGSI_TEXTURE_CUBE_ARRAY ||
4267 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)) {
4268 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4269 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4270 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4271
4272 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4273 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4274 z = LLVMBuildSDiv(builder, z, six, "");
4275
4276 emit_data->output[emit_data->chan] =
4277 LLVMBuildInsertElement(builder, v4, z, two, "");
4278 }
4279 }
4280
4281 static void si_llvm_emit_txqs(
4282 const struct lp_build_tgsi_action *action,
4283 struct lp_build_tgsi_context *bld_base,
4284 struct lp_build_emit_data *emit_data)
4285 {
4286 struct si_shader_context *ctx = si_shader_context(bld_base);
4287 struct gallivm_state *gallivm = bld_base->base.gallivm;
4288 LLVMBuilderRef builder = gallivm->builder;
4289 LLVMValueRef res, samples;
4290 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4291
4292 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4293
4294
4295 /* Read the samples from the descriptor directly. */
4296 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4297 samples = LLVMBuildExtractElement(
4298 builder, res,
4299 lp_build_const_int32(gallivm, 3), "");
4300 samples = LLVMBuildLShr(builder, samples,
4301 lp_build_const_int32(gallivm, 16), "");
4302 samples = LLVMBuildAnd(builder, samples,
4303 lp_build_const_int32(gallivm, 0xf), "");
4304 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
4305 samples, "");
4306
4307 emit_data->output[emit_data->chan] = samples;
4308 }
4309
/*
 * SI implements derivatives using the local data store (LDS).
 * All writes to the LDS happen in all executing threads at the same time.
 * TID is the thread ID of the current thread: a value between 0 and 63
 * giving the thread's position in the wavefront.
 *
 * In a pixel shader the threads are grouped into quads of four pixels.
 * The TIDs of the pixels of a quad are:
 *
 *  +------+------+
 *  |4n + 0|4n + 1|
 *  +------+------+
 *  |4n + 2|4n + 3|
 *  +------+------+
 *
 * Clearing the two low bits of the TID (mask 0xfffffffc) therefore gives the
 * TID of the top left pixel of the quad, clearing bit 1 (mask 0xfffffffd)
 * gives the top pixel of the current pixel's column, and clearing bit 0
 * (mask 0xfffffffe) gives the left pixel of the current pixel's row.
 *
 * Adding 1 to the left pixel's TID yields the pixel to its right, and
 * adding 2 to the top pixel's TID yields the pixel below it.
 */
/* Thread ID masks. */
#define TID_MASK_TOP_LEFT (~0x3u) /* 0xfffffffc */
#define TID_MASK_TOP      (~0x2u) /* 0xfffffffd */
#define TID_MASK_LEFT     (~0x1u) /* 0xfffffffe */
4338
4339 static void si_llvm_emit_ddxy(
4340 const struct lp_build_tgsi_action *action,
4341 struct lp_build_tgsi_context *bld_base,
4342 struct lp_build_emit_data *emit_data)
4343 {
4344 struct si_shader_context *ctx = si_shader_context(bld_base);
4345 struct gallivm_state *gallivm = bld_base->base.gallivm;
4346 const struct tgsi_full_instruction *inst = emit_data->inst;
4347 unsigned opcode = inst->Instruction.Opcode;
4348 LLVMValueRef indices[2];
4349 LLVMValueRef store_ptr, load_ptr0, load_ptr1;
4350 LLVMValueRef tl, trbl, result[4];
4351 unsigned swizzle[4];
4352 unsigned c;
4353 int idx;
4354 unsigned mask;
4355
4356 indices[0] = bld_base->uint_bld.zero;
4357 indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32,
4358 NULL, 0, LLVMReadNoneAttribute);
4359 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4360 indices, 2, "");
4361
4362 if (opcode == TGSI_OPCODE_DDX_FINE)
4363 mask = TID_MASK_LEFT;
4364 else if (opcode == TGSI_OPCODE_DDY_FINE)
4365 mask = TID_MASK_TOP;
4366 else
4367 mask = TID_MASK_TOP_LEFT;
4368
4369 indices[1] = LLVMBuildAnd(gallivm->builder, indices[1],
4370 lp_build_const_int32(gallivm, mask), "");
4371 load_ptr0 = LLVMBuildGEP(gallivm->builder, ctx->lds,
4372 indices, 2, "");
4373
4374 /* for DDX we want to next X pixel, DDY next Y pixel. */
4375 idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
4376 indices[1] = LLVMBuildAdd(gallivm->builder, indices[1],
4377 lp_build_const_int32(gallivm, idx), "");
4378 load_ptr1 = LLVMBuildGEP(gallivm->builder, ctx->lds,
4379 indices, 2, "");
4380
4381 for (c = 0; c < 4; ++c) {
4382 unsigned i;
4383
4384 swizzle[c] = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], c);
4385 for (i = 0; i < c; ++i) {
4386 if (swizzle[i] == swizzle[c]) {
4387 result[c] = result[i];
4388 break;
4389 }
4390 }
4391 if (i != c)
4392 continue;
4393
4394 LLVMBuildStore(gallivm->builder,
4395 LLVMBuildBitCast(gallivm->builder,
4396 lp_build_emit_fetch(bld_base, inst, 0, c),
4397 ctx->i32, ""),
4398 store_ptr);
4399
4400 tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
4401 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
4402
4403 trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
4404 trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");
4405
4406 result[c] = LLVMBuildFSub(gallivm->builder, trbl, tl, "");
4407 }
4408
4409 emit_data->output[0] = lp_build_gather_values(gallivm, result, 4);
4410 }
4411
4412 /*
4413 * this takes an I,J coordinate pair,
4414 * and works out the X and Y derivatives.
4415 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4416 */
4417 static LLVMValueRef si_llvm_emit_ddxy_interp(
4418 struct lp_build_tgsi_context *bld_base,
4419 LLVMValueRef interp_ij)
4420 {
4421 struct si_shader_context *ctx = si_shader_context(bld_base);
4422 struct gallivm_state *gallivm = bld_base->base.gallivm;
4423 LLVMValueRef indices[2];
4424 LLVMValueRef store_ptr, load_ptr_x, load_ptr_y, load_ptr_ddx, load_ptr_ddy, temp, temp2;
4425 LLVMValueRef tl, tr, bl, result[4];
4426 unsigned c;
4427
4428 indices[0] = bld_base->uint_bld.zero;
4429 indices[1] = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid", ctx->i32,
4430 NULL, 0, LLVMReadNoneAttribute);
4431 store_ptr = LLVMBuildGEP(gallivm->builder, ctx->lds,
4432 indices, 2, "");
4433
4434 temp = LLVMBuildAnd(gallivm->builder, indices[1],
4435 lp_build_const_int32(gallivm, TID_MASK_LEFT), "");
4436
4437 temp2 = LLVMBuildAnd(gallivm->builder, indices[1],
4438 lp_build_const_int32(gallivm, TID_MASK_TOP), "");
4439
4440 indices[1] = temp;
4441 load_ptr_x = LLVMBuildGEP(gallivm->builder, ctx->lds,
4442 indices, 2, "");
4443
4444 indices[1] = temp2;
4445 load_ptr_y = LLVMBuildGEP(gallivm->builder, ctx->lds,
4446 indices, 2, "");
4447
4448 indices[1] = LLVMBuildAdd(gallivm->builder, temp,
4449 lp_build_const_int32(gallivm, 1), "");
4450 load_ptr_ddx = LLVMBuildGEP(gallivm->builder, ctx->lds,
4451 indices, 2, "");
4452
4453 indices[1] = LLVMBuildAdd(gallivm->builder, temp2,
4454 lp_build_const_int32(gallivm, 2), "");
4455 load_ptr_ddy = LLVMBuildGEP(gallivm->builder, ctx->lds,
4456 indices, 2, "");
4457
4458 for (c = 0; c < 2; ++c) {
4459 LLVMValueRef store_val;
4460 LLVMValueRef c_ll = lp_build_const_int32(gallivm, c);
4461
4462 store_val = LLVMBuildExtractElement(gallivm->builder,
4463 interp_ij, c_ll, "");
4464 LLVMBuildStore(gallivm->builder,
4465 store_val,
4466 store_ptr);
4467
4468 tl = LLVMBuildLoad(gallivm->builder, load_ptr_x, "");
4469 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
4470
4471 tr = LLVMBuildLoad(gallivm->builder, load_ptr_ddx, "");
4472 tr = LLVMBuildBitCast(gallivm->builder, tr, ctx->f32, "");
4473
4474 result[c] = LLVMBuildFSub(gallivm->builder, tr, tl, "");
4475
4476 tl = LLVMBuildLoad(gallivm->builder, load_ptr_y, "");
4477 tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
4478
4479 bl = LLVMBuildLoad(gallivm->builder, load_ptr_ddy, "");
4480 bl = LLVMBuildBitCast(gallivm->builder, bl, ctx->f32, "");
4481
4482 result[c + 2] = LLVMBuildFSub(gallivm->builder, bl, tl, "");
4483 }
4484
4485 return lp_build_gather_values(gallivm, result, 4);
4486 }
4487
4488 static void interp_fetch_args(
4489 struct lp_build_tgsi_context *bld_base,
4490 struct lp_build_emit_data *emit_data)
4491 {
4492 struct si_shader_context *ctx = si_shader_context(bld_base);
4493 struct gallivm_state *gallivm = bld_base->base.gallivm;
4494 const struct tgsi_full_instruction *inst = emit_data->inst;
4495
4496 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
4497 /* offset is in second src, first two channels */
4498 emit_data->args[0] = lp_build_emit_fetch(bld_base,
4499 emit_data->inst, 1,
4500 TGSI_CHAN_X);
4501 emit_data->args[1] = lp_build_emit_fetch(bld_base,
4502 emit_data->inst, 1,
4503 TGSI_CHAN_Y);
4504 emit_data->arg_count = 2;
4505 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4506 LLVMValueRef sample_position;
4507 LLVMValueRef sample_id;
4508 LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);
4509
4510 /* fetch sample ID, then fetch its sample position,
4511 * and place into first two channels.
4512 */
4513 sample_id = lp_build_emit_fetch(bld_base,
4514 emit_data->inst, 1, TGSI_CHAN_X);
4515 sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
4516 ctx->i32, "");
4517 sample_position = load_sample_position(&ctx->radeon_bld, sample_id);
4518
4519 emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
4520 sample_position,
4521 lp_build_const_int32(gallivm, 0), "");
4522
4523 emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
4524 emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
4525 sample_position,
4526 lp_build_const_int32(gallivm, 1), "");
4527 emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
4528 emit_data->arg_count = 2;
4529 }
4530 }
4531
4532 static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
4533 struct lp_build_tgsi_context *bld_base,
4534 struct lp_build_emit_data *emit_data)
4535 {
4536 struct si_shader_context *ctx = si_shader_context(bld_base);
4537 struct si_shader *shader = ctx->shader;
4538 struct gallivm_state *gallivm = bld_base->base.gallivm;
4539 LLVMValueRef interp_param;
4540 const struct tgsi_full_instruction *inst = emit_data->inst;
4541 const char *intr_name;
4542 int input_index = inst->Src[0].Register.Index;
4543 int chan;
4544 int i;
4545 LLVMValueRef attr_number;
4546 LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
4547 int interp_param_idx;
4548 unsigned interp = shader->selector->info.input_interpolate[input_index];
4549 unsigned location;
4550
4551 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
4552
4553 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4554 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
4555 location = TGSI_INTERPOLATE_LOC_CENTER;
4556 else
4557 location = TGSI_INTERPOLATE_LOC_CENTROID;
4558
4559 interp_param_idx = lookup_interp_param_index(interp, location);
4560 if (interp_param_idx == -1)
4561 return;
4562 else if (interp_param_idx)
4563 interp_param = LLVMGetParam(ctx->radeon_bld.main_fn, interp_param_idx);
4564 else
4565 interp_param = NULL;
4566
4567 attr_number = lp_build_const_int32(gallivm, input_index);
4568
4569 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4570 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4571 LLVMValueRef ij_out[2];
4572 LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);
4573
4574 /*
4575 * take the I then J parameters, and the DDX/Y for it, and
4576 * calculate the IJ inputs for the interpolator.
4577 * temp1 = ddx * offset/sample.x + I;
4578 * interp_param.I = ddy * offset/sample.y + temp1;
4579 * temp1 = ddx * offset/sample.x + J;
4580 * interp_param.J = ddy * offset/sample.y + temp1;
4581 */
4582 for (i = 0; i < 2; i++) {
4583 LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
4584 LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
4585 LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
4586 ddxy_out, ix_ll, "");
4587 LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
4588 ddxy_out, iy_ll, "");
4589 LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
4590 interp_param, ix_ll, "");
4591 LLVMValueRef temp1, temp2;
4592
4593 interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
4594 ctx->f32, "");
4595
4596 temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");
4597
4598 temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");
4599
4600 temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");
4601
4602 temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");
4603
4604 ij_out[i] = LLVMBuildBitCast(gallivm->builder,
4605 temp2, ctx->i32, "");
4606 }
4607 interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
4608 }
4609
4610 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
4611 for (chan = 0; chan < 2; chan++) {
4612 LLVMValueRef args[4];
4613 LLVMValueRef llvm_chan;
4614 unsigned schan;
4615
4616 schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
4617 llvm_chan = lp_build_const_int32(gallivm, schan);
4618
4619 args[0] = llvm_chan;
4620 args[1] = attr_number;
4621 args[2] = params;
4622 args[3] = interp_param;
4623
4624 emit_data->output[chan] =
4625 lp_build_intrinsic(gallivm->builder, intr_name,
4626 ctx->f32, args, args[3] ? 4 : 3,
4627 LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
4628 }
4629 }
4630
4631 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
4632 struct lp_build_emit_data *emit_data)
4633 {
4634 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4635 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
4636 unsigned stream;
4637
4638 assert(src0.File == TGSI_FILE_IMMEDIATE);
4639
4640 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
4641 return stream;
4642 }
4643
4644 /* Emit one vertex from the geometry shader */
4645 static void si_llvm_emit_vertex(
4646 const struct lp_build_tgsi_action *action,
4647 struct lp_build_tgsi_context *bld_base,
4648 struct lp_build_emit_data *emit_data)
4649 {
4650 struct si_shader_context *ctx = si_shader_context(bld_base);
4651 struct lp_build_context *uint = &bld_base->uint_bld;
4652 struct si_shader *shader = ctx->shader;
4653 struct tgsi_shader_info *info = &shader->selector->info;
4654 struct gallivm_state *gallivm = bld_base->base.gallivm;
4655 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
4656 SI_PARAM_GS2VS_OFFSET);
4657 LLVMValueRef gs_next_vertex;
4658 LLVMValueRef can_emit, kill;
4659 LLVMValueRef args[2];
4660 unsigned chan;
4661 int i;
4662 unsigned stream;
4663
4664 stream = si_llvm_get_stream(bld_base, emit_data);
4665
4666 /* Write vertex attribute values to GSVS ring */
4667 gs_next_vertex = LLVMBuildLoad(gallivm->builder,
4668 ctx->gs_next_vertex[stream],
4669 "");
4670
4671 /* If this thread has already emitted the declared maximum number of
4672 * vertices, kill it: excessive vertex emissions are not supposed to
4673 * have any effect, and GS threads have no externally observable
4674 * effects other than emitting vertices.
4675 */
4676 can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
4677 lp_build_const_int32(gallivm,
4678 shader->selector->gs_max_out_vertices), "");
4679 kill = lp_build_select(&bld_base->base, can_emit,
4680 lp_build_const_float(gallivm, 1.0f),
4681 lp_build_const_float(gallivm, -1.0f));
4682
4683 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
4684 ctx->voidt, &kill, 1, 0);
4685
4686 for (i = 0; i < info->num_outputs; i++) {
4687 LLVMValueRef *out_ptr =
4688 ctx->radeon_bld.soa.outputs[i];
4689
4690 for (chan = 0; chan < 4; chan++) {
4691 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
4692 LLVMValueRef voffset =
4693 lp_build_const_int32(gallivm, (i * 4 + chan) *
4694 shader->selector->gs_max_out_vertices);
4695
4696 voffset = lp_build_add(uint, voffset, gs_next_vertex);
4697 voffset = lp_build_mul_imm(uint, voffset, 4);
4698
4699 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
4700
4701 build_tbuffer_store(ctx,
4702 ctx->gsvs_ring[stream],
4703 out_val, 1,
4704 voffset, soffset, 0,
4705 V_008F0C_BUF_DATA_FORMAT_32,
4706 V_008F0C_BUF_NUM_FORMAT_UINT,
4707 1, 0, 1, 1, 0);
4708 }
4709 }
4710 gs_next_vertex = lp_build_add(uint, gs_next_vertex,
4711 lp_build_const_int32(gallivm, 1));
4712
4713 LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
4714
4715 /* Signal vertex emission */
4716 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
4717 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
4718 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
4719 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
4720 }
4721
4722 /* Cut one primitive from the geometry shader */
4723 static void si_llvm_emit_primitive(
4724 const struct lp_build_tgsi_action *action,
4725 struct lp_build_tgsi_context *bld_base,
4726 struct lp_build_emit_data *emit_data)
4727 {
4728 struct si_shader_context *ctx = si_shader_context(bld_base);
4729 struct gallivm_state *gallivm = bld_base->base.gallivm;
4730 LLVMValueRef args[2];
4731 unsigned stream;
4732
4733 /* Signal primitive cut */
4734 stream = si_llvm_get_stream(bld_base, emit_data);
4735 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
4736 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
4737 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
4738 ctx->voidt, args, 2, LLVMNoUnwindAttribute);
4739 }
4740
4741 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
4742 struct lp_build_tgsi_context *bld_base,
4743 struct lp_build_emit_data *emit_data)
4744 {
4745 struct si_shader_context *ctx = si_shader_context(bld_base);
4746 struct gallivm_state *gallivm = bld_base->base.gallivm;
4747
4748 /* The real barrier instruction isn’t needed, because an entire patch
4749 * always fits into a single wave.
4750 */
4751 if (ctx->type == TGSI_PROCESSOR_TESS_CTRL) {
4752 emit_optimization_barrier(ctx);
4753 return;
4754 }
4755
4756 lp_build_intrinsic(gallivm->builder,
4757 HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
4758 : "llvm.AMDGPU.barrier.local",
4759 ctx->voidt, NULL, 0, LLVMNoUnwindAttribute);
4760 }
4761
4762 static const struct lp_build_tgsi_action tex_action = {
4763 .fetch_args = tex_fetch_args,
4764 .emit = build_tex_intrinsic,
4765 };
4766
4767 static const struct lp_build_tgsi_action interp_action = {
4768 .fetch_args = interp_fetch_args,
4769 .emit = build_interp_intrinsic,
4770 };
4771
4772 static void si_create_function(struct si_shader_context *ctx,
4773 LLVMTypeRef *returns, unsigned num_returns,
4774 LLVMTypeRef *params, unsigned num_params,
4775 int last_array_pointer, int last_sgpr)
4776 {
4777 int i;
4778
4779 radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
4780 params, num_params);
4781 radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
4782 ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);
4783
4784 for (i = 0; i <= last_sgpr; ++i) {
4785 LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);
4786
4787 /* We tell llvm that array inputs are passed by value to allow Sinking pass
4788 * to move load. Inputs are constant so this is fine. */
4789 if (i <= last_array_pointer)
4790 LLVMAddAttribute(P, LLVMByValAttribute);
4791 else
4792 LLVMAddAttribute(P, LLVMInRegAttribute);
4793 }
4794 }
4795
4796 static void create_meta_data(struct si_shader_context *ctx)
4797 {
4798 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
4799 LLVMValueRef args[3];
4800
4801 args[0] = LLVMMDStringInContext(gallivm->context, "const", 5);
4802 args[1] = 0;
4803 args[2] = lp_build_const_int32(gallivm, 1);
4804
4805 ctx->const_md = LLVMMDNodeInContext(gallivm->context, args, 3);
4806 }
4807
4808 static void declare_streamout_params(struct si_shader_context *ctx,
4809 struct pipe_stream_output_info *so,
4810 LLVMTypeRef *params, LLVMTypeRef i32,
4811 unsigned *num_params)
4812 {
4813 int i;
4814
4815 /* Streamout SGPRs. */
4816 if (so->num_outputs) {
4817 params[ctx->param_streamout_config = (*num_params)++] = i32;
4818 params[ctx->param_streamout_write_index = (*num_params)++] = i32;
4819 }
4820 /* A streamout buffer offset is loaded if the stride is non-zero. */
4821 for (i = 0; i < 4; i++) {
4822 if (!so->stride[i])
4823 continue;
4824
4825 params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
4826 }
4827 }
4828
4829 static unsigned llvm_get_type_size(LLVMTypeRef type)
4830 {
4831 LLVMTypeKind kind = LLVMGetTypeKind(type);
4832
4833 switch (kind) {
4834 case LLVMIntegerTypeKind:
4835 return LLVMGetIntTypeWidth(type) / 8;
4836 case LLVMFloatTypeKind:
4837 return 4;
4838 case LLVMPointerTypeKind:
4839 return 8;
4840 case LLVMVectorTypeKind:
4841 return LLVMGetVectorSize(type) *
4842 llvm_get_type_size(LLVMGetElementType(type));
4843 default:
4844 assert(0);
4845 return 0;
4846 }
4847 }
4848
4849 static void declare_tess_lds(struct si_shader_context *ctx)
4850 {
4851 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4852 LLVMTypeRef i32 = ctx->radeon_bld.soa.bld_base.uint_bld.elem_type;
4853
4854 /* This is the upper bound, maximum is 32 inputs times 32 vertices */
4855 unsigned vertex_data_dw_size = 32*32*4;
4856 unsigned patch_data_dw_size = 32*4;
4857 /* The formula is: TCS inputs + TCS outputs + TCS patch outputs. */
4858 unsigned patch_dw_size = vertex_data_dw_size*2 + patch_data_dw_size;
4859 unsigned lds_dwords = patch_dw_size;
4860
4861 /* The actual size is computed outside of the shader to reduce
4862 * the number of shader variants. */
4863 ctx->lds =
4864 LLVMAddGlobalInAddressSpace(gallivm->module,
4865 LLVMArrayType(i32, lds_dwords),
4866 "tess_lds",
4867 LOCAL_ADDR_SPACE);
4868 }
4869
4870 static void create_function(struct si_shader_context *ctx)
4871 {
4872 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
4873 struct gallivm_state *gallivm = bld_base->base.gallivm;
4874 struct si_shader *shader = ctx->shader;
4875 LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
4876 LLVMTypeRef returns[16+32*4];
4877 unsigned i, last_array_pointer, last_sgpr, num_params, num_return_sgprs;
4878 unsigned num_returns = 0;
4879
4880 v3i32 = LLVMVectorType(ctx->i32, 3);
4881
4882 params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
4883 params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
4884 params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
4885 params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
4886 params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
4887 last_array_pointer = SI_PARAM_SHADER_BUFFERS;
4888
4889 switch (ctx->type) {
4890 case TGSI_PROCESSOR_VERTEX:
4891 params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
4892 last_array_pointer = SI_PARAM_VERTEX_BUFFERS;
4893 params[SI_PARAM_BASE_VERTEX] = ctx->i32;
4894 params[SI_PARAM_START_INSTANCE] = ctx->i32;
4895 num_params = SI_PARAM_START_INSTANCE+1;
4896
4897 if (shader->key.vs.as_es) {
4898 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
4899 } else if (shader->key.vs.as_ls) {
4900 params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
4901 num_params = SI_PARAM_LS_OUT_LAYOUT+1;
4902 } else {
4903 if (ctx->is_gs_copy_shader) {
4904 last_array_pointer = SI_PARAM_RW_BUFFERS;
4905 num_params = SI_PARAM_RW_BUFFERS+1;
4906 } else {
4907 params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
4908 num_params = SI_PARAM_VS_STATE_BITS+1;
4909 }
4910
4911 /* The locations of the other parameters are assigned dynamically. */
4912 declare_streamout_params(ctx, &shader->selector->so,
4913 params, ctx->i32, &num_params);
4914 }
4915
4916 last_sgpr = num_params-1;
4917
4918 /* VGPRs */
4919 params[ctx->param_vertex_id = num_params++] = ctx->i32;
4920 params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
4921 params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
4922 params[ctx->param_instance_id = num_params++] = ctx->i32;
4923
4924 if (!ctx->is_monolithic &&
4925 !ctx->is_gs_copy_shader) {
4926 /* Vertex load indices. */
4927 ctx->param_vertex_index0 = num_params;
4928
4929 for (i = 0; i < shader->selector->info.num_inputs; i++)
4930 params[num_params++] = ctx->i32;
4931
4932 /* PrimitiveID output. */
4933 if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
4934 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
4935 returns[num_returns++] = ctx->f32;
4936 }
4937 break;
4938
4939 case TGSI_PROCESSOR_TESS_CTRL:
4940 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
4941 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
4942 params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
4943 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
4944 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
4945
4946 /* VGPRs */
4947 params[SI_PARAM_PATCH_ID] = ctx->i32;
4948 params[SI_PARAM_REL_IDS] = ctx->i32;
4949 num_params = SI_PARAM_REL_IDS+1;
4950
4951 if (!ctx->is_monolithic) {
4952 /* PARAM_TESS_FACTOR_OFFSET is after user SGPRs. */
4953 for (i = 0; i <= SI_TCS_NUM_USER_SGPR; i++)
4954 returns[num_returns++] = ctx->i32; /* SGPRs */
4955
4956 for (i = 0; i < 3; i++)
4957 returns[num_returns++] = ctx->f32; /* VGPRs */
4958 }
4959 break;
4960
4961 case TGSI_PROCESSOR_TESS_EVAL:
4962 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
4963 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
4964 num_params = SI_PARAM_TCS_OUT_LAYOUT+1;
4965
4966 if (shader->key.tes.as_es) {
4967 params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
4968 } else {
4969 declare_streamout_params(ctx, &shader->selector->so,
4970 params, ctx->i32, &num_params);
4971 }
4972 last_sgpr = num_params - 1;
4973
4974 /* VGPRs */
4975 params[ctx->param_tes_u = num_params++] = ctx->f32;
4976 params[ctx->param_tes_v = num_params++] = ctx->f32;
4977 params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
4978 params[ctx->param_tes_patch_id = num_params++] = ctx->i32;
4979
4980 /* PrimitiveID output. */
4981 if (!ctx->is_monolithic && !shader->key.tes.as_es)
4982 for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
4983 returns[num_returns++] = ctx->f32;
4984 break;
4985
4986 case TGSI_PROCESSOR_GEOMETRY:
4987 params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
4988 params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
4989 last_sgpr = SI_PARAM_GS_WAVE_ID;
4990
4991 /* VGPRs */
4992 params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
4993 params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
4994 params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
4995 params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
4996 params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
4997 params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
4998 params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
4999 params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
5000 num_params = SI_PARAM_GS_INSTANCE_ID+1;
5001 break;
5002
5003 case TGSI_PROCESSOR_FRAGMENT:
5004 params[SI_PARAM_ALPHA_REF] = ctx->f32;
5005 params[SI_PARAM_PRIM_MASK] = ctx->i32;
5006 last_sgpr = SI_PARAM_PRIM_MASK;
5007 params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
5008 params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
5009 params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
5010 params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
5011 params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
5012 params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
5013 params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
5014 params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
5015 params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
5016 params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
5017 params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
5018 params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
5019 params[SI_PARAM_FRONT_FACE] = ctx->i32;
5020 params[SI_PARAM_ANCILLARY] = ctx->i32;
5021 params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
5022 params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
5023 num_params = SI_PARAM_POS_FIXED_PT+1;
5024
5025 if (!ctx->is_monolithic) {
5026 /* Color inputs from the prolog. */
5027 if (shader->selector->info.colors_read) {
5028 unsigned num_color_elements =
5029 util_bitcount(shader->selector->info.colors_read);
5030
5031 assert(num_params + num_color_elements <= ARRAY_SIZE(params));
5032 for (i = 0; i < num_color_elements; i++)
5033 params[num_params++] = ctx->f32;
5034 }
5035
5036 /* Outputs for the epilog. */
5037 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
5038 num_returns =
5039 num_return_sgprs +
5040 util_bitcount(shader->selector->info.colors_written) * 4 +
5041 shader->selector->info.writes_z +
5042 shader->selector->info.writes_stencil +
5043 shader->selector->info.writes_samplemask +
5044 1 /* SampleMaskIn */;
5045
5046 num_returns = MAX2(num_returns,
5047 num_return_sgprs +
5048 PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
5049
5050 for (i = 0; i < num_return_sgprs; i++)
5051 returns[i] = ctx->i32;
5052 for (; i < num_returns; i++)
5053 returns[i] = ctx->f32;
5054 }
5055 break;
5056
5057 case TGSI_PROCESSOR_COMPUTE:
5058 params[SI_PARAM_GRID_SIZE] = v3i32;
5059 params[SI_PARAM_BLOCK_ID] = v3i32;
5060 last_sgpr = SI_PARAM_BLOCK_ID;
5061
5062 params[SI_PARAM_THREAD_ID] = v3i32;
5063 num_params = SI_PARAM_THREAD_ID + 1;
5064 break;
5065 default:
5066 assert(0 && "unimplemented shader");
5067 return;
5068 }
5069
5070 assert(num_params <= Elements(params));
5071
5072 si_create_function(ctx, returns, num_returns, params,
5073 num_params, last_array_pointer, last_sgpr);
5074
5075 /* Reserve register locations for VGPR inputs the PS prolog may need. */
5076 if (ctx->type == TGSI_PROCESSOR_FRAGMENT &&
5077 !ctx->is_monolithic) {
5078 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5079 "InitialPSInputAddr",
5080 S_0286D0_PERSP_SAMPLE_ENA(1) |
5081 S_0286D0_PERSP_CENTER_ENA(1) |
5082 S_0286D0_PERSP_CENTROID_ENA(1) |
5083 S_0286D0_LINEAR_SAMPLE_ENA(1) |
5084 S_0286D0_LINEAR_CENTER_ENA(1) |
5085 S_0286D0_LINEAR_CENTROID_ENA(1) |
5086 S_0286D0_FRONT_FACE_ENA(1) |
5087 S_0286D0_POS_FIXED_PT_ENA(1));
5088 } else if (ctx->type == TGSI_PROCESSOR_COMPUTE) {
5089 const unsigned *properties = shader->selector->info.properties;
5090 unsigned max_work_group_size =
5091 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
5092 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
5093 properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
5094
5095 assert(max_work_group_size);
5096
5097 radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
5098 "amdgpu-max-work-group-size",
5099 max_work_group_size);
5100 }
5101
5102 shader->info.num_input_sgprs = 0;
5103 shader->info.num_input_vgprs = 0;
5104
5105 for (i = 0; i <= last_sgpr; ++i)
5106 shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;
5107
5108 /* Unused fragment shader inputs are eliminated by the compiler,
5109 * so we don't know yet how many there will be.
5110 */
5111 if (ctx->type != TGSI_PROCESSOR_FRAGMENT)
5112 for (; i < num_params; ++i)
5113 shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;
5114
5115 if (bld_base->info &&
5116 (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
5117 bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
5118 bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
5119 bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
5120 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
5121 bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
5122 ctx->lds =
5123 LLVMAddGlobalInAddressSpace(gallivm->module,
5124 LLVMArrayType(ctx->i32, 64),
5125 "ddxy_lds",
5126 LOCAL_ADDR_SPACE);
5127
5128 if ((ctx->type == TGSI_PROCESSOR_VERTEX && shader->key.vs.as_ls) ||
5129 ctx->type == TGSI_PROCESSOR_TESS_CTRL ||
5130 ctx->type == TGSI_PROCESSOR_TESS_EVAL)
5131 declare_tess_lds(ctx);
5132 }
5133
5134 static void preload_constants(struct si_shader_context *ctx)
5135 {
5136 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5137 struct gallivm_state *gallivm = bld_base->base.gallivm;
5138 const struct tgsi_shader_info *info = bld_base->info;
5139 unsigned buf;
5140 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
5141
5142 for (buf = 0; buf < SI_NUM_CONST_BUFFERS; buf++) {
5143 unsigned i, num_const = info->const_file_max[buf] + 1;
5144
5145 if (num_const == 0)
5146 continue;
5147
5148 /* Allocate space for the constant values */
5149 ctx->constants[buf] = CALLOC(num_const * 4, sizeof(LLVMValueRef));
5150
5151 /* Load the resource descriptor */
5152 ctx->const_buffers[buf] =
5153 build_indexed_load_const(ctx, ptr, lp_build_const_int32(gallivm, buf));
5154
5155 /* Load the constants, we rely on the code sinking to do the rest */
5156 for (i = 0; i < num_const * 4; ++i) {
5157 ctx->constants[buf][i] =
5158 buffer_load_const(gallivm->builder,
5159 ctx->const_buffers[buf],
5160 lp_build_const_int32(gallivm, i * 4),
5161 ctx->f32);
5162 }
5163 }
5164 }
5165
5166 static void preload_shader_buffers(struct si_shader_context *ctx)
5167 {
5168 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5169 LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS);
5170 int buf, maxbuf;
5171
5172 maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
5173 SI_NUM_SHADER_BUFFERS - 1);
5174 for (buf = 0; buf <= maxbuf; ++buf) {
5175 ctx->shader_buffers[buf] =
5176 build_indexed_load_const(
5177 ctx, ptr, lp_build_const_int32(gallivm, buf));
5178 }
5179 }
5180
5181 static void preload_samplers(struct si_shader_context *ctx)
5182 {
5183 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5184 struct gallivm_state *gallivm = bld_base->base.gallivm;
5185 const struct tgsi_shader_info *info = bld_base->info;
5186 unsigned i, num_samplers = info->file_max[TGSI_FILE_SAMPLER] + 1;
5187 LLVMValueRef offset;
5188
5189 if (num_samplers == 0)
5190 return;
5191
5192 /* Load the resources and samplers, we rely on the code sinking to do the rest */
5193 for (i = 0; i < num_samplers; ++i) {
5194 /* Resource */
5195 offset = lp_build_const_int32(gallivm, i);
5196 ctx->sampler_views[i] =
5197 get_sampler_desc(ctx, offset, DESC_IMAGE);
5198
5199 /* FMASK resource */
5200 if (info->is_msaa_sampler[i])
5201 ctx->fmasks[i] =
5202 get_sampler_desc(ctx, offset, DESC_FMASK);
5203 else {
5204 ctx->sampler_states[i] =
5205 get_sampler_desc(ctx, offset, DESC_SAMPLER);
5206 ctx->sampler_states[i] =
5207 sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
5208 ctx->sampler_states[i]);
5209 }
5210 }
5211 }
5212
5213 static void preload_images(struct si_shader_context *ctx)
5214 {
5215 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5216 struct tgsi_shader_info *info = &ctx->shader->selector->info;
5217 struct gallivm_state *gallivm = bld_base->base.gallivm;
5218 unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
5219 LLVMValueRef res_ptr;
5220 unsigned i;
5221
5222 if (num_images == 0)
5223 return;
5224
5225 res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
5226
5227 for (i = 0; i < num_images; ++i) {
5228 /* Rely on LLVM to shrink the load for buffer resources. */
5229 LLVMValueRef rsrc =
5230 build_indexed_load_const(ctx, res_ptr,
5231 lp_build_const_int32(gallivm, i));
5232
5233 if (info->images_writemask & (1 << i) &&
5234 !(info->images_buffers & (1 << i)))
5235 rsrc = force_dcc_off(ctx, rsrc);
5236
5237 ctx->images[i] = rsrc;
5238 }
5239 }
5240
5241 static void preload_streamout_buffers(struct si_shader_context *ctx)
5242 {
5243 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5244 struct gallivm_state *gallivm = bld_base->base.gallivm;
5245 unsigned i;
5246
5247 /* Streamout can only be used if the shader is compiled as VS. */
5248 if (!ctx->shader->selector->so.num_outputs ||
5249 (ctx->type == TGSI_PROCESSOR_VERTEX &&
5250 (ctx->shader->key.vs.as_es ||
5251 ctx->shader->key.vs.as_ls)) ||
5252 (ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
5253 ctx->shader->key.tes.as_es))
5254 return;
5255
5256 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5257 SI_PARAM_RW_BUFFERS);
5258
5259 /* Load the resources, we rely on the code sinking to do the rest */
5260 for (i = 0; i < 4; ++i) {
5261 if (ctx->shader->selector->so.stride[i]) {
5262 LLVMValueRef offset = lp_build_const_int32(gallivm,
5263 SI_VS_STREAMOUT_BUF0 + i);
5264
5265 ctx->so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
5266 }
5267 }
5268 }
5269
5270 /**
5271 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5272 * for later use.
5273 */
5274 static void preload_ring_buffers(struct si_shader_context *ctx)
5275 {
5276 struct gallivm_state *gallivm =
5277 ctx->radeon_bld.soa.bld_base.base.gallivm;
5278
5279 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5280 SI_PARAM_RW_BUFFERS);
5281
5282 if ((ctx->type == TGSI_PROCESSOR_VERTEX &&
5283 ctx->shader->key.vs.as_es) ||
5284 (ctx->type == TGSI_PROCESSOR_TESS_EVAL &&
5285 ctx->shader->key.tes.as_es) ||
5286 ctx->type == TGSI_PROCESSOR_GEOMETRY) {
5287 unsigned ring =
5288 ctx->type == TGSI_PROCESSOR_GEOMETRY ? SI_GS_RING_ESGS
5289 : SI_ES_RING_ESGS;
5290 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5291
5292 ctx->esgs_ring =
5293 build_indexed_load_const(ctx, buf_ptr, offset);
5294 }
5295
5296 if (ctx->is_gs_copy_shader) {
5297 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5298
5299 ctx->gsvs_ring[0] =
5300 build_indexed_load_const(ctx, buf_ptr, offset);
5301 }
5302 if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
5303 int i;
5304 for (i = 0; i < 4; i++) {
5305 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5306
5307 ctx->gsvs_ring[i] =
5308 build_indexed_load_const(ctx, buf_ptr, offset);
5309 }
5310 }
5311 }
5312
5313 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5314 LLVMValueRef param_rw_buffers,
5315 unsigned param_pos_fixed_pt)
5316 {
5317 struct lp_build_tgsi_context *bld_base =
5318 &ctx->radeon_bld.soa.bld_base;
5319 struct gallivm_state *gallivm = bld_base->base.gallivm;
5320 LLVMBuilderRef builder = gallivm->builder;
5321 LLVMValueRef slot, desc, offset, row, bit, address[2];
5322
5323 /* Use the fixed-point gl_FragCoord input.
5324 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5325 * per coordinate to get the repeating effect.
5326 */
5327 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5328 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5329
5330 /* Load the buffer descriptor. */
5331 slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
5332 desc = build_indexed_load_const(ctx, param_rw_buffers, slot);
5333
5334 /* The stipple pattern is 32x32, each row has 32 bits. */
5335 offset = LLVMBuildMul(builder, address[1],
5336 LLVMConstInt(ctx->i32, 4, 0), "");
5337 row = buffer_load_const(builder, desc, offset, ctx->i32);
5338 bit = LLVMBuildLShr(builder, row, address[0], "");
5339 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5340
5341 /* The intrinsic kills the thread if arg < 0. */
5342 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
5343 LLVMConstReal(ctx->f32, -1), "");
5344 lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
5345 }
5346
5347 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
5348 struct si_shader_config *conf,
5349 unsigned symbol_offset)
5350 {
5351 unsigned i;
5352 const unsigned char *config =
5353 radeon_shader_binary_config_start(binary, symbol_offset);
5354
5355 /* XXX: We may be able to emit some of these values directly rather than
5356 * extracting fields to be emitted later.
5357 */
5358
5359 for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
5360 unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
5361 unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
5362 switch (reg) {
5363 case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
5364 case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
5365 case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
5366 case R_00B848_COMPUTE_PGM_RSRC1:
5367 conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
5368 conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
5369 conf->float_mode = G_00B028_FLOAT_MODE(value);
5370 conf->rsrc1 = value;
5371 break;
5372 case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
5373 conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
5374 break;
5375 case R_00B84C_COMPUTE_PGM_RSRC2:
5376 conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
5377 conf->rsrc2 = value;
5378 break;
5379 case R_0286CC_SPI_PS_INPUT_ENA:
5380 conf->spi_ps_input_ena = value;
5381 break;
5382 case R_0286D0_SPI_PS_INPUT_ADDR:
5383 conf->spi_ps_input_addr = value;
5384 break;
5385 case R_0286E8_SPI_TMPRING_SIZE:
5386 case R_00B860_COMPUTE_TMPRING_SIZE:
5387 /* WAVESIZE is in units of 256 dwords. */
5388 conf->scratch_bytes_per_wave =
5389 G_00B860_WAVESIZE(value) * 256 * 4 * 1;
5390 break;
5391 default:
5392 {
5393 static bool printed;
5394
5395 if (!printed) {
5396 fprintf(stderr, "Warning: LLVM emitted unknown "
5397 "config register: 0x%x\n", reg);
5398 printed = true;
5399 }
5400 }
5401 break;
5402 }
5403
5404 if (!conf->spi_ps_input_addr)
5405 conf->spi_ps_input_addr = conf->spi_ps_input_ena;
5406 }
5407 }
5408
5409 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5410 struct si_shader *shader,
5411 struct si_shader_config *config,
5412 uint64_t scratch_va)
5413 {
5414 unsigned i;
5415 uint32_t scratch_rsrc_dword0 = scratch_va;
5416 uint32_t scratch_rsrc_dword1 =
5417 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
5418 | S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
5419
5420 for (i = 0 ; i < shader->binary.reloc_count; i++) {
5421 const struct radeon_shader_reloc *reloc =
5422 &shader->binary.relocs[i];
5423 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
5424 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5425 &scratch_rsrc_dword0, 4);
5426 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
5427 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
5428 &scratch_rsrc_dword1, 4);
5429 }
5430 }
5431 }
5432
5433 static unsigned si_get_shader_binary_size(struct si_shader *shader)
5434 {
5435 unsigned size = shader->binary.code_size;
5436
5437 if (shader->prolog)
5438 size += shader->prolog->binary.code_size;
5439 if (shader->epilog)
5440 size += shader->epilog->binary.code_size;
5441 return size;
5442 }
5443
5444 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
5445 {
5446 const struct radeon_shader_binary *prolog =
5447 shader->prolog ? &shader->prolog->binary : NULL;
5448 const struct radeon_shader_binary *epilog =
5449 shader->epilog ? &shader->epilog->binary : NULL;
5450 const struct radeon_shader_binary *mainb = &shader->binary;
5451 unsigned bo_size = si_get_shader_binary_size(shader) +
5452 (!epilog ? mainb->rodata_size : 0);
5453 unsigned char *ptr;
5454
5455 assert(!prolog || !prolog->rodata_size);
5456 assert((!prolog && !epilog) || !mainb->rodata_size);
5457 assert(!epilog || !epilog->rodata_size);
5458
5459 r600_resource_reference(&shader->bo, NULL);
5460 shader->bo = si_resource_create_custom(&sscreen->b.b,
5461 PIPE_USAGE_IMMUTABLE,
5462 bo_size);
5463 if (!shader->bo)
5464 return -ENOMEM;
5465
5466 /* Upload. */
5467 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
5468 PIPE_TRANSFER_READ_WRITE);
5469
5470 if (prolog) {
5471 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
5472 ptr += prolog->code_size;
5473 }
5474
5475 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
5476 ptr += mainb->code_size;
5477
5478 if (epilog)
5479 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
5480 else if (mainb->rodata_size > 0)
5481 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
5482
5483 sscreen->b.ws->buffer_unmap(shader->bo->buf);
5484 return 0;
5485 }
5486
5487 static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
5488 struct pipe_debug_callback *debug,
5489 const char *name, FILE *file)
5490 {
5491 char *line, *p;
5492 unsigned i, count;
5493
5494 if (binary->disasm_string) {
5495 fprintf(file, "Shader %s disassembly:\n", name);
5496 fprintf(file, "%s", binary->disasm_string);
5497
5498 if (debug && debug->debug_message) {
5499 /* Very long debug messages are cut off, so send the
5500 * disassembly one line at a time. This causes more
5501 * overhead, but on the plus side it simplifies
5502 * parsing of resulting logs.
5503 */
5504 pipe_debug_message(debug, SHADER_INFO,
5505 "Shader Disassembly Begin");
5506
5507 line = binary->disasm_string;
5508 while (*line) {
5509 p = util_strchrnul(line, '\n');
5510 count = p - line;
5511
5512 if (count) {
5513 pipe_debug_message(debug, SHADER_INFO,
5514 "%.*s", count, line);
5515 }
5516
5517 if (!*p)
5518 break;
5519 line = p + 1;
5520 }
5521
5522 pipe_debug_message(debug, SHADER_INFO,
5523 "Shader Disassembly End");
5524 }
5525 } else {
5526 fprintf(file, "Shader %s binary:\n", name);
5527 for (i = 0; i < binary->code_size; i += 4) {
5528 fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
5529 binary->code[i + 3], binary->code[i + 2],
5530 binary->code[i + 1], binary->code[i]);
5531 }
5532 }
5533 }
5534
5535 static void si_shader_dump_stats(struct si_screen *sscreen,
5536 struct si_shader_config *conf,
5537 unsigned num_inputs,
5538 unsigned code_size,
5539 struct pipe_debug_callback *debug,
5540 unsigned processor,
5541 FILE *file)
5542 {
5543 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
5544 unsigned lds_per_wave = 0;
5545 unsigned max_simd_waves = 10;
5546
5547 /* Compute LDS usage for PS. */
5548 if (processor == TGSI_PROCESSOR_FRAGMENT) {
5549 /* The minimum usage per wave is (num_inputs * 36). The maximum
5550 * usage is (num_inputs * 36 * 16).
5551 * We can get anything in between and it varies between waves.
5552 *
5553 * Other stages don't know the size at compile time or don't
5554 * allocate LDS per wave, but instead they do it per thread group.
5555 */
5556 lds_per_wave = conf->lds_size * lds_increment +
5557 align(num_inputs * 36, lds_increment);
5558 }
5559
5560 /* Compute the per-SIMD wave counts. */
5561 if (conf->num_sgprs) {
5562 if (sscreen->b.chip_class >= VI)
5563 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
5564 else
5565 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
5566 }
5567
5568 if (conf->num_vgprs)
5569 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
5570
5571 /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
5572 * that PS can use.
5573 */
5574 if (lds_per_wave)
5575 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
5576
5577 if (file != stderr ||
5578 r600_can_dump_shader(&sscreen->b, processor)) {
5579 if (processor == TGSI_PROCESSOR_FRAGMENT) {
5580 fprintf(file, "*** SHADER CONFIG ***\n"
5581 "SPI_PS_INPUT_ADDR = 0x%04x\n"
5582 "SPI_PS_INPUT_ENA = 0x%04x\n",
5583 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
5584 }
5585
5586 fprintf(file, "*** SHADER STATS ***\n"
5587 "SGPRS: %d\n"
5588 "VGPRS: %d\n"
5589 "Code Size: %d bytes\n"
5590 "LDS: %d blocks\n"
5591 "Scratch: %d bytes per wave\n"
5592 "Max Waves: %d\n"
5593 "********************\n",
5594 conf->num_sgprs, conf->num_vgprs, code_size,
5595 conf->lds_size, conf->scratch_bytes_per_wave,
5596 max_simd_waves);
5597 }
5598
5599 pipe_debug_message(debug, SHADER_INFO,
5600 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
5601 "LDS: %d Scratch: %d Max Waves: %d",
5602 conf->num_sgprs, conf->num_vgprs, code_size,
5603 conf->lds_size, conf->scratch_bytes_per_wave,
5604 max_simd_waves);
5605 }
5606
5607 static const char *si_get_shader_name(struct si_shader *shader,
5608 unsigned processor)
5609 {
5610 switch (processor) {
5611 case TGSI_PROCESSOR_VERTEX:
5612 if (shader->key.vs.as_es)
5613 return "Vertex Shader as ES";
5614 else if (shader->key.vs.as_ls)
5615 return "Vertex Shader as LS";
5616 else
5617 return "Vertex Shader as VS";
5618 case TGSI_PROCESSOR_TESS_CTRL:
5619 return "Tessellation Control Shader";
5620 case TGSI_PROCESSOR_TESS_EVAL:
5621 if (shader->key.tes.as_es)
5622 return "Tessellation Evaluation Shader as ES";
5623 else
5624 return "Tessellation Evaluation Shader as VS";
5625 case TGSI_PROCESSOR_GEOMETRY:
5626 if (shader->gs_copy_shader == NULL)
5627 return "GS Copy Shader as VS";
5628 else
5629 return "Geometry Shader";
5630 case TGSI_PROCESSOR_FRAGMENT:
5631 return "Pixel Shader";
5632 case TGSI_PROCESSOR_COMPUTE:
5633 return "Compute Shader";
5634 default:
5635 return "Unknown Shader";
5636 }
5637 }
5638
5639 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
5640 struct pipe_debug_callback *debug, unsigned processor,
5641 FILE *file)
5642 {
5643 if (file != stderr ||
5644 (r600_can_dump_shader(&sscreen->b, processor) &&
5645 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
5646 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
5647
5648 if (shader->prolog)
5649 si_shader_dump_disassembly(&shader->prolog->binary,
5650 debug, "prolog", file);
5651
5652 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
5653
5654 if (shader->epilog)
5655 si_shader_dump_disassembly(&shader->epilog->binary,
5656 debug, "epilog", file);
5657 fprintf(file, "\n");
5658 }
5659
5660 si_shader_dump_stats(sscreen, &shader->config,
5661 shader->selector ? shader->selector->info.num_inputs : 0,
5662 si_get_shader_binary_size(shader), debug, processor,
5663 file);
5664 }
5665
5666 int si_compile_llvm(struct si_screen *sscreen,
5667 struct radeon_shader_binary *binary,
5668 struct si_shader_config *conf,
5669 LLVMTargetMachineRef tm,
5670 LLVMModuleRef mod,
5671 struct pipe_debug_callback *debug,
5672 unsigned processor,
5673 const char *name)
5674 {
5675 int r = 0;
5676 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
5677
5678 if (r600_can_dump_shader(&sscreen->b, processor)) {
5679 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
5680
5681 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
5682 fprintf(stderr, "%s LLVM IR:\n\n", name);
5683 LLVMDumpModule(mod);
5684 fprintf(stderr, "\n");
5685 }
5686 }
5687
5688 if (!si_replace_shader(count, binary)) {
5689 r = radeon_llvm_compile(mod, binary,
5690 r600_get_llvm_processor_name(sscreen->b.family), tm,
5691 debug);
5692 if (r)
5693 return r;
5694 }
5695
5696 si_shader_binary_read_config(binary, conf, 0);
5697
5698 /* Enable 64-bit and 16-bit denormals, because there is no performance
5699 * cost.
5700 *
5701 * If denormals are enabled, all floating-point output modifiers are
5702 * ignored.
5703 *
5704 * Don't enable denormals for 32-bit floats, because:
5705 * - Floating-point output modifiers would be ignored by the hw.
5706 * - Some opcodes don't support denormals, such as v_mad_f32. We would
5707 * have to stop using those.
5708 * - SI & CI would be very slow.
5709 */
5710 conf->float_mode |= V_00B028_FP_64_DENORMS;
5711
5712 FREE(binary->config);
5713 FREE(binary->global_symbol_offsets);
5714 binary->config = NULL;
5715 binary->global_symbol_offsets = NULL;
5716
5717 /* Some shaders can't have rodata because their binaries can be
5718 * concatenated.
5719 */
5720 if (binary->rodata_size &&
5721 (processor == TGSI_PROCESSOR_VERTEX ||
5722 processor == TGSI_PROCESSOR_TESS_CTRL ||
5723 processor == TGSI_PROCESSOR_TESS_EVAL ||
5724 processor == TGSI_PROCESSOR_FRAGMENT)) {
5725 fprintf(stderr, "radeonsi: The shader can't have rodata.");
5726 return -EINVAL;
5727 }
5728
5729 return r;
5730 }
5731
5732 /* Generate code for the hardware VS shader stage to go with a geometry shader */
5733 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
5734 struct si_shader_context *ctx,
5735 struct si_shader *gs,
5736 struct pipe_debug_callback *debug)
5737 {
5738 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
5739 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
5740 struct lp_build_context *uint = &bld_base->uint_bld;
5741 struct si_shader_output_values *outputs;
5742 struct tgsi_shader_info *gsinfo = &gs->selector->info;
5743 LLVMValueRef args[9];
5744 int i, r;
5745
5746 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
5747
5748 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
5749 ctx->type = TGSI_PROCESSOR_VERTEX;
5750 ctx->is_gs_copy_shader = true;
5751
5752 create_meta_data(ctx);
5753 create_function(ctx);
5754 preload_streamout_buffers(ctx);
5755 preload_ring_buffers(ctx);
5756
5757 args[0] = ctx->gsvs_ring[0];
5758 args[1] = lp_build_mul_imm(uint,
5759 LLVMGetParam(ctx->radeon_bld.main_fn,
5760 ctx->param_vertex_id),
5761 4);
5762 args[3] = uint->zero;
5763 args[4] = uint->one; /* OFFEN */
5764 args[5] = uint->zero; /* IDXEN */
5765 args[6] = uint->one; /* GLC */
5766 args[7] = uint->one; /* SLC */
5767 args[8] = uint->zero; /* TFE */
5768
5769 /* Fetch vertex data from GSVS ring */
5770 for (i = 0; i < gsinfo->num_outputs; ++i) {
5771 unsigned chan;
5772
5773 outputs[i].name = gsinfo->output_semantic_name[i];
5774 outputs[i].sid = gsinfo->output_semantic_index[i];
5775
5776 for (chan = 0; chan < 4; chan++) {
5777 args[2] = lp_build_const_int32(gallivm,
5778 (i * 4 + chan) *
5779 gs->selector->gs_max_out_vertices * 16 * 4);
5780
5781 outputs[i].values[chan] =
5782 LLVMBuildBitCast(gallivm->builder,
5783 lp_build_intrinsic(gallivm->builder,
5784 "llvm.SI.buffer.load.dword.i32.i32",
5785 ctx->i32, args, 9,
5786 LLVMReadOnlyAttribute | LLVMNoUnwindAttribute),
5787 ctx->f32, "");
5788 }
5789 }
5790
5791 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
5792
5793 LLVMBuildRet(gallivm->builder, ctx->return_value);
5794
5795 /* Dump LLVM IR before any optimization passes */
5796 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
5797 r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
5798 LLVMDumpModule(bld_base->base.gallivm->module);
5799
5800 radeon_llvm_finalize_module(&ctx->radeon_bld);
5801
5802 r = si_compile_llvm(sscreen, &ctx->shader->binary,
5803 &ctx->shader->config, ctx->tm,
5804 bld_base->base.gallivm->module,
5805 debug, TGSI_PROCESSOR_GEOMETRY,
5806 "GS Copy Shader");
5807 if (!r) {
5808 if (r600_can_dump_shader(&sscreen->b, TGSI_PROCESSOR_GEOMETRY))
5809 fprintf(stderr, "GS Copy Shader:\n");
5810 si_shader_dump(sscreen, ctx->shader, debug,
5811 TGSI_PROCESSOR_GEOMETRY, stderr);
5812 r = si_shader_binary_upload(sscreen, ctx->shader);
5813 }
5814
5815 radeon_llvm_dispose(&ctx->radeon_bld);
5816
5817 FREE(outputs);
5818 return r;
5819 }
5820
5821 void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
5822 {
5823 int i;
5824
5825 fprintf(f, "SHADER KEY\n");
5826
5827 switch (shader) {
5828 case PIPE_SHADER_VERTEX:
5829 fprintf(f, " instance_divisors = {");
5830 for (i = 0; i < Elements(key->vs.prolog.instance_divisors); i++)
5831 fprintf(f, !i ? "%u" : ", %u",
5832 key->vs.prolog.instance_divisors[i]);
5833 fprintf(f, "}\n");
5834 fprintf(f, " as_es = %u\n", key->vs.as_es);
5835 fprintf(f, " as_ls = %u\n", key->vs.as_ls);
5836 fprintf(f, " export_prim_id = %u\n", key->vs.epilog.export_prim_id);
5837 break;
5838
5839 case PIPE_SHADER_TESS_CTRL:
5840 fprintf(f, " prim_mode = %u\n", key->tcs.epilog.prim_mode);
5841 break;
5842
5843 case PIPE_SHADER_TESS_EVAL:
5844 fprintf(f, " as_es = %u\n", key->tes.as_es);
5845 fprintf(f, " export_prim_id = %u\n", key->tes.epilog.export_prim_id);
5846 break;
5847
5848 case PIPE_SHADER_GEOMETRY:
5849 case PIPE_SHADER_COMPUTE:
5850 break;
5851
5852 case PIPE_SHADER_FRAGMENT:
5853 fprintf(f, " prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
5854 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
5855 fprintf(f, " prolog.force_persample_interp = %u\n", key->ps.prolog.force_persample_interp);
5856 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
5857 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
5858 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
5859 fprintf(f, " epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
5860 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
5861 fprintf(f, " epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
5862 fprintf(f, " epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
5863 break;
5864
5865 default:
5866 assert(0);
5867 }
5868 }
5869
5870 static void si_init_shader_ctx(struct si_shader_context *ctx,
5871 struct si_screen *sscreen,
5872 struct si_shader *shader,
5873 LLVMTargetMachineRef tm)
5874 {
5875 struct lp_build_tgsi_context *bld_base;
5876 struct lp_build_tgsi_action tmpl = {};
5877
5878 memset(ctx, 0, sizeof(*ctx));
5879 radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
5880 ctx->tm = tm;
5881 ctx->screen = sscreen;
5882 if (shader && shader->selector)
5883 ctx->type = shader->selector->info.processor;
5884 else
5885 ctx->type = -1;
5886 ctx->shader = shader;
5887
5888 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
5889 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
5890 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
5891 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
5892 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
5893 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
5894 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
5895 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
5896 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
5897 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
5898 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
5899 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
5900
5901 bld_base = &ctx->radeon_bld.soa.bld_base;
5902 if (shader && shader->selector)
5903 bld_base->info = &shader->selector->info;
5904 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
5905
5906 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
5907 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
5908 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
5909
5910 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
5911 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
5912 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
5913 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
5914 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
5915 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
5916 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
5917 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
5918 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
5919 bld_base->op_actions[TGSI_OPCODE_TXQ] = tex_action;
5920 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
5921 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
5922 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
5923
5924 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
5925 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
5926 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
5927 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
5928 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
5929 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
5930
5931 tmpl.fetch_args = atomic_fetch_args;
5932 tmpl.emit = atomic_emit;
5933 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
5934 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
5935 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
5936 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
5937 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
5938 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
5939 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
5940 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
5941 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
5942 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
5943 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
5944 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
5945 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
5946 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
5947 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
5948 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
5949 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
5950 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
5951 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
5952 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
5953
5954 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
5955
5956 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
5957 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
5958 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
5959 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
5960
5961 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
5962 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
5963 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
5964
5965 bld_base->op_actions[TGSI_OPCODE_MAX].emit = build_tgsi_intrinsic_nomem;
5966 bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.maxnum.f32";
5967 bld_base->op_actions[TGSI_OPCODE_MIN].emit = build_tgsi_intrinsic_nomem;
5968 bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.minnum.f32";
5969 }
5970
5971 int si_compile_tgsi_shader(struct si_screen *sscreen,
5972 LLVMTargetMachineRef tm,
5973 struct si_shader *shader,
5974 bool is_monolithic,
5975 struct pipe_debug_callback *debug)
5976 {
5977 struct si_shader_selector *sel = shader->selector;
5978 struct si_shader_context ctx;
5979 struct lp_build_tgsi_context *bld_base;
5980 LLVMModuleRef mod;
5981 int r = 0;
5982
5983 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
5984 * conversion fails. */
5985 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
5986 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
5987 si_dump_shader_key(sel->type, &shader->key, stderr);
5988 tgsi_dump(sel->tokens, 0);
5989 si_dump_streamout(&sel->so);
5990 }
5991
5992 si_init_shader_ctx(&ctx, sscreen, shader, tm);
5993 ctx.is_monolithic = is_monolithic;
5994
5995 shader->info.uses_instanceid = sel->info.uses_instanceid;
5996
5997 bld_base = &ctx.radeon_bld.soa.bld_base;
5998 ctx.radeon_bld.load_system_value = declare_system_value;
5999
6000 switch (ctx.type) {
6001 case TGSI_PROCESSOR_VERTEX:
6002 ctx.radeon_bld.load_input = declare_input_vs;
6003 if (shader->key.vs.as_ls)
6004 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6005 else if (shader->key.vs.as_es)
6006 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6007 else
6008 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6009 break;
6010 case TGSI_PROCESSOR_TESS_CTRL:
6011 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6012 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6013 bld_base->emit_store = store_output_tcs;
6014 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6015 break;
6016 case TGSI_PROCESSOR_TESS_EVAL:
6017 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6018 if (shader->key.tes.as_es)
6019 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6020 else
6021 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6022 break;
6023 case TGSI_PROCESSOR_GEOMETRY:
6024 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6025 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6026 break;
6027 case TGSI_PROCESSOR_FRAGMENT:
6028 ctx.radeon_bld.load_input = declare_input_fs;
6029 if (is_monolithic)
6030 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6031 else
6032 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6033 break;
6034 case TGSI_PROCESSOR_COMPUTE:
6035 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6036 break;
6037 default:
6038 assert(!"Unsupported shader type");
6039 return -1;
6040 }
6041
6042 create_meta_data(&ctx);
6043 create_function(&ctx);
6044 preload_constants(&ctx);
6045 preload_shader_buffers(&ctx);
6046 preload_samplers(&ctx);
6047 preload_images(&ctx);
6048 preload_streamout_buffers(&ctx);
6049 preload_ring_buffers(&ctx);
6050
6051 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6052 shader->key.ps.prolog.poly_stipple) {
6053 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6054 SI_PARAM_RW_BUFFERS);
6055 si_llvm_emit_polygon_stipple(&ctx, list,
6056 SI_PARAM_POS_FIXED_PT);
6057 }
6058
6059 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
6060 int i;
6061 for (i = 0; i < 4; i++) {
6062 ctx.gs_next_vertex[i] =
6063 lp_build_alloca(bld_base->base.gallivm,
6064 ctx.i32, "");
6065 }
6066 }
6067
6068 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6069 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6070 goto out;
6071 }
6072
6073 LLVMBuildRet(bld_base->base.gallivm->builder, ctx.return_value);
6074 mod = bld_base->base.gallivm->module;
6075
6076 /* Dump LLVM IR before any optimization passes */
6077 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6078 r600_can_dump_shader(&sscreen->b, ctx.type))
6079 LLVMDumpModule(mod);
6080
6081 radeon_llvm_finalize_module(&ctx.radeon_bld);
6082
6083 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6084 mod, debug, ctx.type, "TGSI shader");
6085 if (r) {
6086 fprintf(stderr, "LLVM failed to compile shader\n");
6087 goto out;
6088 }
6089
6090 radeon_llvm_dispose(&ctx.radeon_bld);
6091
6092 /* Add the scratch offset to input SGPRs. */
6093 if (shader->config.scratch_bytes_per_wave)
6094 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6095
6096 /* Calculate the number of fragment input VGPRs. */
6097 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
6098 shader->info.num_input_vgprs = 0;
6099 shader->info.face_vgpr_index = -1;
6100
6101 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6102 shader->info.num_input_vgprs += 2;
6103 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6104 shader->info.num_input_vgprs += 2;
6105 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6106 shader->info.num_input_vgprs += 2;
6107 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6108 shader->info.num_input_vgprs += 3;
6109 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6110 shader->info.num_input_vgprs += 2;
6111 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6112 shader->info.num_input_vgprs += 2;
6113 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6114 shader->info.num_input_vgprs += 2;
6115 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6116 shader->info.num_input_vgprs += 1;
6117 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6118 shader->info.num_input_vgprs += 1;
6119 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6120 shader->info.num_input_vgprs += 1;
6121 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6122 shader->info.num_input_vgprs += 1;
6123 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6124 shader->info.num_input_vgprs += 1;
6125 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6126 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6127 shader->info.num_input_vgprs += 1;
6128 }
6129 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6130 shader->info.num_input_vgprs += 1;
6131 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6132 shader->info.num_input_vgprs += 1;
6133 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6134 shader->info.num_input_vgprs += 1;
6135 }
6136
6137 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
6138 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6139 shader->gs_copy_shader->selector = shader->selector;
6140 ctx.shader = shader->gs_copy_shader;
6141 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6142 shader, debug))) {
6143 free(shader->gs_copy_shader);
6144 shader->gs_copy_shader = NULL;
6145 goto out;
6146 }
6147 }
6148
6149 out:
6150 for (int i = 0; i < SI_NUM_CONST_BUFFERS; i++)
6151 FREE(ctx.constants[i]);
6152 return r;
6153 }
6154
6155 /**
6156 * Create, compile and return a shader part (prolog or epilog).
6157 *
6158 * \param sscreen screen
6159 * \param list list of shader parts of the same category
6160 * \param key shader part key
6161 * \param tm LLVM target machine
6162 * \param debug debug callback
6163 * \param compile the callback responsible for compilation
6164 * \return non-NULL on success
6165 */
6166 static struct si_shader_part *
6167 si_get_shader_part(struct si_screen *sscreen,
6168 struct si_shader_part **list,
6169 union si_shader_part_key *key,
6170 LLVMTargetMachineRef tm,
6171 struct pipe_debug_callback *debug,
6172 bool (*compile)(struct si_screen *,
6173 LLVMTargetMachineRef,
6174 struct pipe_debug_callback *,
6175 struct si_shader_part *))
6176 {
6177 struct si_shader_part *result;
6178
6179 pipe_mutex_lock(sscreen->shader_parts_mutex);
6180
6181 /* Find existing. */
6182 for (result = *list; result; result = result->next) {
6183 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6184 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6185 return result;
6186 }
6187 }
6188
6189 /* Compile a new one. */
6190 result = CALLOC_STRUCT(si_shader_part);
6191 result->key = *key;
6192 if (!compile(sscreen, tm, debug, result)) {
6193 FREE(result);
6194 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6195 return NULL;
6196 }
6197
6198 result->next = *list;
6199 *list = result;
6200 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6201 return result;
6202 }
6203
6204 /**
6205 * Create a vertex shader prolog.
6206 *
6207 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6208 * All inputs are returned unmodified. The vertex load indices are
6209 * stored after them, which will used by the API VS for fetching inputs.
6210 *
6211 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6212 * input_v0,
6213 * input_v1,
6214 * input_v2,
6215 * input_v3,
6216 * (VertexID + BaseVertex),
6217 * (InstanceID + StartInstance),
6218 * (InstanceID / 2 + StartInstance)
6219 */
6220 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6221 LLVMTargetMachineRef tm,
6222 struct pipe_debug_callback *debug,
6223 struct si_shader_part *out)
6224 {
6225 union si_shader_part_key *key = &out->key;
6226 struct si_shader shader = {};
6227 struct si_shader_context ctx;
6228 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6229 LLVMTypeRef *params, *returns;
6230 LLVMValueRef ret, func;
6231 int last_sgpr, num_params, num_returns, i;
6232 bool status = true;
6233
6234 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6235 ctx.type = TGSI_PROCESSOR_VERTEX;
6236 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6237 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6238
6239 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6240 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6241 sizeof(LLVMTypeRef));
6242 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6243 key->vs_prolog.last_input + 1) *
6244 sizeof(LLVMTypeRef));
6245 num_params = 0;
6246 num_returns = 0;
6247
6248 /* Declare input and output SGPRs. */
6249 num_params = 0;
6250 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6251 params[num_params++] = ctx.i32;
6252 returns[num_returns++] = ctx.i32;
6253 }
6254 last_sgpr = num_params - 1;
6255
6256 /* 4 preloaded VGPRs (outputs must be floats) */
6257 for (i = 0; i < 4; i++) {
6258 params[num_params++] = ctx.i32;
6259 returns[num_returns++] = ctx.f32;
6260 }
6261
6262 /* Vertex load indices. */
6263 for (i = 0; i <= key->vs_prolog.last_input; i++)
6264 returns[num_returns++] = ctx.f32;
6265
6266 /* Create the function. */
6267 si_create_function(&ctx, returns, num_returns, params,
6268 num_params, -1, last_sgpr);
6269 func = ctx.radeon_bld.main_fn;
6270
6271 /* Copy inputs to outputs. This should be no-op, as the registers match,
6272 * but it will prevent the compiler from overwriting them unintentionally.
6273 */
6274 ret = ctx.return_value;
6275 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6276 LLVMValueRef p = LLVMGetParam(func, i);
6277 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6278 }
6279 for (i = num_params - 4; i < num_params; i++) {
6280 LLVMValueRef p = LLVMGetParam(func, i);
6281 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6282 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6283 }
6284
6285 /* Compute vertex load indices from instance divisors. */
6286 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6287 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6288 LLVMValueRef index;
6289
6290 if (divisor) {
6291 /* InstanceID / Divisor + StartInstance */
6292 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6293 SI_SGPR_START_INSTANCE,
6294 divisor);
6295 } else {
6296 /* VertexID + BaseVertex */
6297 index = LLVMBuildAdd(gallivm->builder,
6298 LLVMGetParam(func, ctx.param_vertex_id),
6299 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6300 }
6301
6302 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6303 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6304 num_params++, "");
6305 }
6306
6307 /* Compile. */
6308 LLVMBuildRet(gallivm->builder, ret);
6309 radeon_llvm_finalize_module(&ctx.radeon_bld);
6310
6311 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6312 gallivm->module, debug, ctx.type,
6313 "Vertex Shader Prolog"))
6314 status = false;
6315
6316 radeon_llvm_dispose(&ctx.radeon_bld);
6317 return status;
6318 }
6319
6320 /**
6321 * Compile the vertex shader epilog. This is also used by the tessellation
6322 * evaluation shader compiled as VS.
6323 *
6324 * The input is PrimitiveID.
6325 *
6326 * If PrimitiveID is required by the pixel shader, export it.
6327 * Otherwise, do nothing.
6328 */
6329 static bool si_compile_vs_epilog(struct si_screen *sscreen,
6330 LLVMTargetMachineRef tm,
6331 struct pipe_debug_callback *debug,
6332 struct si_shader_part *out)
6333 {
6334 union si_shader_part_key *key = &out->key;
6335 struct si_shader_context ctx;
6336 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6337 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
6338 LLVMTypeRef params[5];
6339 int num_params, i;
6340 bool status = true;
6341
6342 si_init_shader_ctx(&ctx, sscreen, NULL, tm);
6343 ctx.type = TGSI_PROCESSOR_VERTEX;
6344
6345 /* Declare input VGPRs. */
6346 num_params = key->vs_epilog.states.export_prim_id ?
6347 (VS_EPILOG_PRIMID_LOC + 1) : 0;
6348 assert(num_params <= ARRAY_SIZE(params));
6349
6350 for (i = 0; i < num_params; i++)
6351 params[i] = ctx.f32;
6352
6353 /* Create the function. */
6354 si_create_function(&ctx, NULL, 0, params, num_params,
6355 -1, -1);
6356
6357 /* Emit exports. */
6358 if (key->vs_epilog.states.export_prim_id) {
6359 struct lp_build_context *base = &bld_base->base;
6360 struct lp_build_context *uint = &bld_base->uint_bld;
6361 LLVMValueRef args[9];
6362
6363 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
6364 args[1] = uint->zero; /* whether the EXEC mask is valid */
6365 args[2] = uint->zero; /* DONE bit */
6366 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
6367 key->vs_epilog.prim_id_param_offset);
6368 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
6369 args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
6370 VS_EPILOG_PRIMID_LOC); /* X */
6371 args[6] = uint->undef; /* Y */
6372 args[7] = uint->undef; /* Z */
6373 args[8] = uint->undef; /* W */
6374
6375 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
6376 LLVMVoidTypeInContext(base->gallivm->context),
6377 args, 9, 0);
6378 }
6379
6380 /* Compile. */
6381 LLVMBuildRet(gallivm->builder, ctx.return_value);
6382 radeon_llvm_finalize_module(&ctx.radeon_bld);
6383
6384 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6385 gallivm->module, debug, ctx.type,
6386 "Vertex Shader Epilog"))
6387 status = false;
6388
6389 radeon_llvm_dispose(&ctx.radeon_bld);
6390 return status;
6391 }
6392
6393 /**
6394 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
6395 */
6396 static bool si_get_vs_epilog(struct si_screen *sscreen,
6397 LLVMTargetMachineRef tm,
6398 struct si_shader *shader,
6399 struct pipe_debug_callback *debug,
6400 struct si_vs_epilog_bits *states)
6401 {
6402 union si_shader_part_key epilog_key;
6403
6404 memset(&epilog_key, 0, sizeof(epilog_key));
6405 epilog_key.vs_epilog.states = *states;
6406
6407 /* Set up the PrimitiveID output. */
6408 if (shader->key.vs.epilog.export_prim_id) {
6409 unsigned index = shader->selector->info.num_outputs;
6410 unsigned offset = shader->info.nr_param_exports++;
6411
6412 epilog_key.vs_epilog.prim_id_param_offset = offset;
6413 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
6414 shader->info.vs_output_param_offset[index] = offset;
6415 }
6416
6417 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
6418 &epilog_key, tm, debug,
6419 si_compile_vs_epilog);
6420 return shader->epilog != NULL;
6421 }
6422
6423 /**
6424 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
6425 */
6426 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
6427 LLVMTargetMachineRef tm,
6428 struct si_shader *shader,
6429 struct pipe_debug_callback *debug)
6430 {
6431 struct tgsi_shader_info *info = &shader->selector->info;
6432 union si_shader_part_key prolog_key;
6433 unsigned i;
6434
6435 /* Get the prolog. */
6436 memset(&prolog_key, 0, sizeof(prolog_key));
6437 prolog_key.vs_prolog.states = shader->key.vs.prolog;
6438 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6439 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
6440
6441 /* The prolog is a no-op if there are no inputs. */
6442 if (info->num_inputs) {
6443 shader->prolog =
6444 si_get_shader_part(sscreen, &sscreen->vs_prologs,
6445 &prolog_key, tm, debug,
6446 si_compile_vs_prolog);
6447 if (!shader->prolog)
6448 return false;
6449 }
6450
6451 /* Get the epilog. */
6452 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
6453 !si_get_vs_epilog(sscreen, tm, shader, debug,
6454 &shader->key.vs.epilog))
6455 return false;
6456
6457 /* Set the instanceID flag. */
6458 for (i = 0; i < info->num_inputs; i++)
6459 if (prolog_key.vs_prolog.states.instance_divisors[i])
6460 shader->info.uses_instanceid = true;
6461
6462 return true;
6463 }
6464
6465 /**
6466 * Select and compile (or reuse) TES parts (epilog).
6467 */
6468 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
6469 LLVMTargetMachineRef tm,
6470 struct si_shader *shader,
6471 struct pipe_debug_callback *debug)
6472 {
6473 if (shader->key.tes.as_es)
6474 return true;
6475
6476 /* TES compiled as VS. */
6477 return si_get_vs_epilog(sscreen, tm, shader, debug,
6478 &shader->key.tes.epilog);
6479 }
6480
6481 /**
6482 * Compile the TCS epilog. This writes tesselation factors to memory based on
6483 * the output primitive type of the tesselator (determined by TES).
6484 */
6485 static bool si_compile_tcs_epilog(struct si_screen *sscreen,
6486 LLVMTargetMachineRef tm,
6487 struct pipe_debug_callback *debug,
6488 struct si_shader_part *out)
6489 {
6490 union si_shader_part_key *key = &out->key;
6491 struct si_shader shader = {};
6492 struct si_shader_context ctx;
6493 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6494 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
6495 LLVMTypeRef params[16];
6496 LLVMValueRef func;
6497 int last_array_pointer, last_sgpr, num_params;
6498 bool status = true;
6499
6500 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6501 ctx.type = TGSI_PROCESSOR_TESS_CTRL;
6502 shader.key.tcs.epilog = key->tcs_epilog.states;
6503
6504 /* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used. */
6505 params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
6506 last_array_pointer = SI_PARAM_RW_BUFFERS;
6507 params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
6508 params[SI_PARAM_SAMPLERS] = ctx.i64;
6509 params[SI_PARAM_IMAGES] = ctx.i64;
6510 params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
6511 params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
6512 params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
6513 params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
6514 params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
6515 last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
6516 num_params = last_sgpr + 1;
6517
6518 params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
6519 params[num_params++] = ctx.i32; /* invocation ID within the patch */
6520 params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */
6521
6522 /* Create the function. */
6523 si_create_function(&ctx, NULL, 0, params, num_params,
6524 last_array_pointer, last_sgpr);
6525 declare_tess_lds(&ctx);
6526 func = ctx.radeon_bld.main_fn;
6527
6528 si_write_tess_factors(bld_base,
6529 LLVMGetParam(func, last_sgpr + 1),
6530 LLVMGetParam(func, last_sgpr + 2),
6531 LLVMGetParam(func, last_sgpr + 3));
6532
6533 /* Compile. */
6534 LLVMBuildRet(gallivm->builder, ctx.return_value);
6535 radeon_llvm_finalize_module(&ctx.radeon_bld);
6536
6537 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6538 gallivm->module, debug, ctx.type,
6539 "Tessellation Control Shader Epilog"))
6540 status = false;
6541
6542 radeon_llvm_dispose(&ctx.radeon_bld);
6543 return status;
6544 }
6545
6546 /**
6547 * Select and compile (or reuse) TCS parts (epilog).
6548 */
6549 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
6550 LLVMTargetMachineRef tm,
6551 struct si_shader *shader,
6552 struct pipe_debug_callback *debug)
6553 {
6554 union si_shader_part_key epilog_key;
6555
6556 /* Get the epilog. */
6557 memset(&epilog_key, 0, sizeof(epilog_key));
6558 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
6559
6560 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
6561 &epilog_key, tm, debug,
6562 si_compile_tcs_epilog);
6563 return shader->epilog != NULL;
6564 }
6565
6566 /**
6567 * Compile the pixel shader prolog. This handles:
6568 * - two-side color selection and interpolation
6569 * - overriding interpolation parameters for the API PS
6570 * - polygon stippling
6571 *
6572 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
6573 * overriden by other states. (e.g. per-sample interpolation)
6574 * Interpolated colors are stored after the preloaded VGPRs.
6575 */
6576 static bool si_compile_ps_prolog(struct si_screen *sscreen,
6577 LLVMTargetMachineRef tm,
6578 struct pipe_debug_callback *debug,
6579 struct si_shader_part *out)
6580 {
6581 union si_shader_part_key *key = &out->key;
6582 struct si_shader shader = {};
6583 struct si_shader_context ctx;
6584 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6585 LLVMTypeRef *params;
6586 LLVMValueRef ret, func;
6587 int last_sgpr, num_params, num_returns, i, num_color_channels;
6588 bool status = true;
6589
6590 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6591 ctx.type = TGSI_PROCESSOR_FRAGMENT;
6592 shader.key.ps.prolog = key->ps_prolog.states;
6593
6594 /* Number of inputs + 8 color elements. */
6595 params = alloca((key->ps_prolog.num_input_sgprs +
6596 key->ps_prolog.num_input_vgprs + 8) *
6597 sizeof(LLVMTypeRef));
6598
6599 /* Declare inputs. */
6600 num_params = 0;
6601 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
6602 params[num_params++] = ctx.i32;
6603 last_sgpr = num_params - 1;
6604
6605 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
6606 params[num_params++] = ctx.f32;
6607
6608 /* Declare outputs (same as inputs + add colors if needed) */
6609 num_returns = num_params;
6610 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
6611 for (i = 0; i < num_color_channels; i++)
6612 params[num_returns++] = ctx.f32;
6613
6614 /* Create the function. */
6615 si_create_function(&ctx, params, num_returns, params,
6616 num_params, -1, last_sgpr);
6617 func = ctx.radeon_bld.main_fn;
6618
6619 /* Copy inputs to outputs. This should be no-op, as the registers match,
6620 * but it will prevent the compiler from overwriting them unintentionally.
6621 */
6622 ret = ctx.return_value;
6623 for (i = 0; i < num_params; i++) {
6624 LLVMValueRef p = LLVMGetParam(func, i);
6625 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6626 }
6627
6628 /* Polygon stippling. */
6629 if (key->ps_prolog.states.poly_stipple) {
6630 /* POS_FIXED_PT is always last. */
6631 unsigned pos = key->ps_prolog.num_input_sgprs +
6632 key->ps_prolog.num_input_vgprs - 1;
6633 LLVMValueRef ptr[2], list;
6634
6635 /* Get the pointer to rw buffers. */
6636 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
6637 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
6638 list = lp_build_gather_values(gallivm, ptr, 2);
6639 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
6640 list = LLVMBuildIntToPtr(gallivm->builder, list,
6641 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
6642
6643 si_llvm_emit_polygon_stipple(&ctx, list, pos);
6644 }
6645
6646 /* Interpolate colors. */
6647 for (i = 0; i < 2; i++) {
6648 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
6649 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
6650 key->ps_prolog.face_vgpr_index;
6651 LLVMValueRef interp[2], color[4];
6652 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
6653
6654 if (!writemask)
6655 continue;
6656
6657 /* If the interpolation qualifier is not CONSTANT (-1). */
6658 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
6659 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
6660 key->ps_prolog.color_interp_vgpr_index[i];
6661
6662 interp[0] = LLVMGetParam(func, interp_vgpr);
6663 interp[1] = LLVMGetParam(func, interp_vgpr + 1);
6664 interp_ij = lp_build_gather_values(gallivm, interp, 2);
6665 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
6666 ctx.v2i32, "");
6667 }
6668
6669 /* Use the absolute location of the input. */
6670 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
6671
6672 if (key->ps_prolog.states.color_two_side) {
6673 face = LLVMGetParam(func, face_vgpr);
6674 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
6675 }
6676
6677 interp_fs_input(&ctx,
6678 key->ps_prolog.color_attr_index[i],
6679 TGSI_SEMANTIC_COLOR, i,
6680 key->ps_prolog.num_interp_inputs,
6681 key->ps_prolog.colors_read, interp_ij,
6682 prim_mask, face, color);
6683
6684 while (writemask) {
6685 unsigned chan = u_bit_scan(&writemask);
6686 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
6687 num_params++, "");
6688 }
6689 }
6690
6691 /* Force per-sample interpolation. */
6692 if (key->ps_prolog.states.force_persample_interp) {
6693 unsigned i, base = key->ps_prolog.num_input_sgprs;
6694 LLVMValueRef persp_sample[2], linear_sample[2];
6695
6696 /* Read PERSP_SAMPLE. */
6697 for (i = 0; i < 2; i++)
6698 persp_sample[i] = LLVMGetParam(func, base + i);
6699 /* Overwrite PERSP_CENTER. */
6700 for (i = 0; i < 2; i++)
6701 ret = LLVMBuildInsertValue(gallivm->builder, ret,
6702 persp_sample[i], base + 2 + i, "");
6703 /* Overwrite PERSP_CENTROID. */
6704 for (i = 0; i < 2; i++)
6705 ret = LLVMBuildInsertValue(gallivm->builder, ret,
6706 persp_sample[i], base + 4 + i, "");
6707 /* Read LINEAR_SAMPLE. */
6708 for (i = 0; i < 2; i++)
6709 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
6710 /* Overwrite LINEAR_CENTER. */
6711 for (i = 0; i < 2; i++)
6712 ret = LLVMBuildInsertValue(gallivm->builder, ret,
6713 linear_sample[i], base + 8 + i, "");
6714 /* Overwrite LINEAR_CENTROID. */
6715 for (i = 0; i < 2; i++)
6716 ret = LLVMBuildInsertValue(gallivm->builder, ret,
6717 linear_sample[i], base + 10 + i, "");
6718 }
6719
6720 /* Compile. */
6721 LLVMBuildRet(gallivm->builder, ret);
6722 radeon_llvm_finalize_module(&ctx.radeon_bld);
6723
6724 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6725 gallivm->module, debug, ctx.type,
6726 "Fragment Shader Prolog"))
6727 status = false;
6728
6729 radeon_llvm_dispose(&ctx.radeon_bld);
6730 return status;
6731 }
6732
6733 /**
6734 * Compile the pixel shader epilog. This handles everything that must be
6735 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
6736 */
6737 static bool si_compile_ps_epilog(struct si_screen *sscreen,
6738 LLVMTargetMachineRef tm,
6739 struct pipe_debug_callback *debug,
6740 struct si_shader_part *out)
6741 {
6742 union si_shader_part_key *key = &out->key;
6743 struct si_shader shader = {};
6744 struct si_shader_context ctx;
6745 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6746 struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
6747 LLVMTypeRef params[16+8*4+3];
6748 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
6749 int last_array_pointer, last_sgpr, num_params, i;
6750 bool status = true;
6751
6752 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6753 ctx.type = TGSI_PROCESSOR_FRAGMENT;
6754 shader.key.ps.epilog = key->ps_epilog.states;
6755
6756 /* Declare input SGPRs. */
6757 params[SI_PARAM_RW_BUFFERS] = ctx.i64;
6758 params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
6759 params[SI_PARAM_SAMPLERS] = ctx.i64;
6760 params[SI_PARAM_IMAGES] = ctx.i64;
6761 params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
6762 params[SI_PARAM_ALPHA_REF] = ctx.f32;
6763 last_array_pointer = -1;
6764 last_sgpr = SI_PARAM_ALPHA_REF;
6765
6766 /* Declare input VGPRs. */
6767 num_params = (last_sgpr + 1) +
6768 util_bitcount(key->ps_epilog.colors_written) * 4 +
6769 key->ps_epilog.writes_z +
6770 key->ps_epilog.writes_stencil +
6771 key->ps_epilog.writes_samplemask;
6772
6773 num_params = MAX2(num_params,
6774 last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6775
6776 assert(num_params <= ARRAY_SIZE(params));
6777
6778 for (i = last_sgpr + 1; i < num_params; i++)
6779 params[i] = ctx.f32;
6780
6781 /* Create the function. */
6782 si_create_function(&ctx, NULL, 0, params, num_params,
6783 last_array_pointer, last_sgpr);
6784 /* Disable elimination of unused inputs. */
6785 radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
6786 "InitialPSInputAddr", 0xffffff);
6787
6788 /* Process colors. */
6789 unsigned vgpr = last_sgpr + 1;
6790 unsigned colors_written = key->ps_epilog.colors_written;
6791 int last_color_export = -1;
6792
6793 /* Find the last color export. */
6794 if (!key->ps_epilog.writes_z &&
6795 !key->ps_epilog.writes_stencil &&
6796 !key->ps_epilog.writes_samplemask) {
6797 unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;
6798
6799 /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
6800 if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
6801 /* Just set this if any of the colorbuffers are enabled. */
6802 if (spi_format &
6803 ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
6804 last_color_export = 0;
6805 } else {
6806 for (i = 0; i < 8; i++)
6807 if (colors_written & (1 << i) &&
6808 (spi_format >> (i * 4)) & 0xf)
6809 last_color_export = i;
6810 }
6811 }
6812
6813 while (colors_written) {
6814 LLVMValueRef color[4];
6815 int mrt = u_bit_scan(&colors_written);
6816
6817 for (i = 0; i < 4; i++)
6818 color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
6819
6820 si_export_mrt_color(bld_base, color, mrt,
6821 num_params - 1,
6822 mrt == last_color_export);
6823 }
6824
6825 /* Process depth, stencil, samplemask. */
6826 if (key->ps_epilog.writes_z)
6827 depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
6828 if (key->ps_epilog.writes_stencil)
6829 stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
6830 if (key->ps_epilog.writes_samplemask)
6831 samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
6832
6833 if (depth || stencil || samplemask)
6834 si_export_mrt_z(bld_base, depth, stencil, samplemask);
6835 else if (last_color_export == -1)
6836 si_export_null(bld_base);
6837
6838 /* Compile. */
6839 LLVMBuildRetVoid(gallivm->builder);
6840 radeon_llvm_finalize_module(&ctx.radeon_bld);
6841
6842 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6843 gallivm->module, debug, ctx.type,
6844 "Fragment Shader Epilog"))
6845 status = false;
6846
6847 radeon_llvm_dispose(&ctx.radeon_bld);
6848 return status;
6849 }
6850
6851 /**
6852 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
6853 */
6854 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
6855 LLVMTargetMachineRef tm,
6856 struct si_shader *shader,
6857 struct pipe_debug_callback *debug)
6858 {
6859 struct tgsi_shader_info *info = &shader->selector->info;
6860 union si_shader_part_key prolog_key;
6861 union si_shader_part_key epilog_key;
6862 unsigned i;
6863
6864 /* Get the prolog. */
6865 memset(&prolog_key, 0, sizeof(prolog_key));
6866 prolog_key.ps_prolog.states = shader->key.ps.prolog;
6867 prolog_key.ps_prolog.colors_read = info->colors_read;
6868 prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
6869 prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
6870
6871 if (info->colors_read) {
6872 unsigned *color = shader->selector->color_attr_index;
6873
6874 if (shader->key.ps.prolog.color_two_side) {
6875 /* BCOLORs are stored after the last input. */
6876 prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
6877 prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
6878 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
6879 }
6880
6881 for (i = 0; i < 2; i++) {
6882 unsigned location = info->input_interpolate_loc[color[i]];
6883
6884 if (!(info->colors_read & (0xf << i*4)))
6885 continue;
6886
6887 prolog_key.ps_prolog.color_attr_index[i] = color[i];
6888
6889 /* Force per-sample interpolation for the colors here. */
6890 if (shader->key.ps.prolog.force_persample_interp)
6891 location = TGSI_INTERPOLATE_LOC_SAMPLE;
6892
6893 switch (info->input_interpolate[color[i]]) {
6894 case TGSI_INTERPOLATE_CONSTANT:
6895 prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
6896 break;
6897 case TGSI_INTERPOLATE_PERSPECTIVE:
6898 case TGSI_INTERPOLATE_COLOR:
6899 switch (location) {
6900 case TGSI_INTERPOLATE_LOC_SAMPLE:
6901 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
6902 shader->config.spi_ps_input_ena |=
6903 S_0286CC_PERSP_SAMPLE_ENA(1);
6904 break;
6905 case TGSI_INTERPOLATE_LOC_CENTER:
6906 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
6907 shader->config.spi_ps_input_ena |=
6908 S_0286CC_PERSP_CENTER_ENA(1);
6909 break;
6910 case TGSI_INTERPOLATE_LOC_CENTROID:
6911 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
6912 shader->config.spi_ps_input_ena |=
6913 S_0286CC_PERSP_CENTROID_ENA(1);
6914 break;
6915 default:
6916 assert(0);
6917 }
6918 break;
6919 case TGSI_INTERPOLATE_LINEAR:
6920 switch (location) {
6921 case TGSI_INTERPOLATE_LOC_SAMPLE:
6922 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
6923 shader->config.spi_ps_input_ena |=
6924 S_0286CC_LINEAR_SAMPLE_ENA(1);
6925 break;
6926 case TGSI_INTERPOLATE_LOC_CENTER:
6927 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
6928 shader->config.spi_ps_input_ena |=
6929 S_0286CC_LINEAR_CENTER_ENA(1);
6930 break;
6931 case TGSI_INTERPOLATE_LOC_CENTROID:
6932 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
6933 shader->config.spi_ps_input_ena |=
6934 S_0286CC_LINEAR_CENTROID_ENA(1);
6935 break;
6936 default:
6937 assert(0);
6938 }
6939 break;
6940 default:
6941 assert(0);
6942 }
6943 }
6944 }
6945
6946 /* The prolog is a no-op if these aren't set. */
6947 if (prolog_key.ps_prolog.colors_read ||
6948 prolog_key.ps_prolog.states.force_persample_interp ||
6949 prolog_key.ps_prolog.states.poly_stipple) {
6950 shader->prolog =
6951 si_get_shader_part(sscreen, &sscreen->ps_prologs,
6952 &prolog_key, tm, debug,
6953 si_compile_ps_prolog);
6954 if (!shader->prolog)
6955 return false;
6956 }
6957
6958 /* Get the epilog. */
6959 memset(&epilog_key, 0, sizeof(epilog_key));
6960 epilog_key.ps_epilog.colors_written = info->colors_written;
6961 epilog_key.ps_epilog.writes_z = info->writes_z;
6962 epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
6963 epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
6964 epilog_key.ps_epilog.states = shader->key.ps.epilog;
6965
6966 shader->epilog =
6967 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
6968 &epilog_key, tm, debug,
6969 si_compile_ps_epilog);
6970 if (!shader->epilog)
6971 return false;
6972
6973 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
6974 if (shader->key.ps.prolog.poly_stipple) {
6975 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
6976 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
6977 }
6978
6979 /* Set up the enable bits for per-sample shading if needed. */
6980 if (shader->key.ps.prolog.force_persample_interp) {
6981 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
6982 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
6983 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
6984 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
6985 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
6986 }
6987 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
6988 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena)) {
6989 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
6990 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
6991 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
6992 }
6993 }
6994
6995 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
6996 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
6997 !(shader->config.spi_ps_input_ena & 0xf)) {
6998 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
6999 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7000 }
7001
7002 /* At least one pair of interpolation weights must be enabled. */
7003 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7004 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7005 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7006 }
7007
7008 /* The sample mask input is always enabled, because the API shader always
7009 * passes it through to the epilog. Disable it here if it's unused.
7010 */
7011 if (!shader->key.ps.epilog.poly_line_smoothing &&
7012 !shader->selector->info.reads_samplemask)
7013 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7014
7015 return true;
7016 }
7017
7018 static void si_fix_num_sgprs(struct si_shader *shader)
7019 {
7020 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7021
7022 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7023 }
7024
7025 int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
7026 struct si_shader *shader,
7027 struct pipe_debug_callback *debug)
7028 {
7029 struct si_shader *mainp = shader->selector->main_shader_part;
7030 int r;
7031
7032 /* LS, ES, VS are compiled on demand if the main part hasn't been
7033 * compiled for that stage.
7034 */
7035 if (!mainp ||
7036 (shader->selector->type == PIPE_SHADER_VERTEX &&
7037 (shader->key.vs.as_es != mainp->key.vs.as_es ||
7038 shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
7039 (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
7040 shader->key.tes.as_es != mainp->key.tes.as_es) ||
7041 shader->selector->type == PIPE_SHADER_COMPUTE) {
7042 /* Monolithic shader (compiled as a whole, has many variants,
7043 * may take a long time to compile).
7044 */
7045 r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
7046 if (r)
7047 return r;
7048 } else {
7049 /* The shader consists of 2-3 parts:
7050 *
7051 * - the middle part is the user shader, it has 1 variant only
7052 * and it was compiled during the creation of the shader
7053 * selector
7054 * - the prolog part is inserted at the beginning
7055 * - the epilog part is inserted at the end
7056 *
7057 * The prolog and epilog have many (but simple) variants.
7058 */
7059
7060 /* Copy the compiled TGSI shader data over. */
7061 shader->is_binary_shared = true;
7062 shader->binary = mainp->binary;
7063 shader->config = mainp->config;
7064 shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
7065 shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
7066 shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
7067 memcpy(shader->info.vs_output_param_offset,
7068 mainp->info.vs_output_param_offset,
7069 sizeof(mainp->info.vs_output_param_offset));
7070 shader->info.uses_instanceid = mainp->info.uses_instanceid;
7071 shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
7072 shader->info.nr_param_exports = mainp->info.nr_param_exports;
7073
7074 /* Select prologs and/or epilogs. */
7075 switch (shader->selector->type) {
7076 case PIPE_SHADER_VERTEX:
7077 if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
7078 return -1;
7079 break;
7080 case PIPE_SHADER_TESS_CTRL:
7081 if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
7082 return -1;
7083 break;
7084 case PIPE_SHADER_TESS_EVAL:
7085 if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
7086 return -1;
7087 break;
7088 case PIPE_SHADER_FRAGMENT:
7089 if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
7090 return -1;
7091
7092 /* Make sure we have at least as many VGPRs as there
7093 * are allocated inputs.
7094 */
7095 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7096 shader->info.num_input_vgprs);
7097 break;
7098 }
7099
7100 /* Update SGPR and VGPR counts. */
7101 if (shader->prolog) {
7102 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7103 shader->prolog->config.num_sgprs);
7104 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7105 shader->prolog->config.num_vgprs);
7106 }
7107 if (shader->epilog) {
7108 shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
7109 shader->epilog->config.num_sgprs);
7110 shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
7111 shader->epilog->config.num_vgprs);
7112 }
7113 }
7114
7115 si_fix_num_sgprs(shader);
7116 si_shader_dump(sscreen, shader, debug, shader->selector->info.processor,
7117 stderr);
7118
7119 /* Upload. */
7120 r = si_shader_binary_upload(sscreen, shader);
7121 if (r) {
7122 fprintf(stderr, "LLVM failed to upload shader\n");
7123 return r;
7124 }
7125
7126 return 0;
7127 }
7128
7129 void si_shader_destroy(struct si_shader *shader)
7130 {
7131 if (shader->gs_copy_shader) {
7132 si_shader_destroy(shader->gs_copy_shader);
7133 FREE(shader->gs_copy_shader);
7134 }
7135
7136 if (shader->scratch_bo)
7137 r600_resource_reference(&shader->scratch_bo, NULL);
7138
7139 r600_resource_reference(&shader->bo, NULL);
7140
7141 if (!shader->is_binary_shared)
7142 radeon_shader_binary_clean(&shader->binary);
7143 }