radeonsi: move LLVM ALU codegen into radeonsi
[mesa.git] / src / gallium / drivers / radeonsi / si_shader.c
1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Tom Stellard <thomas.stellard@amd.com>
25 * Michel Dänzer <michel.daenzer@amd.com>
26 * Christian König <christian.koenig@amd.com>
27 */
28
29 #include "gallivm/lp_bld_const.h"
30 #include "gallivm/lp_bld_gather.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_logic.h"
33 #include "gallivm/lp_bld_arit.h"
34 #include "gallivm/lp_bld_flow.h"
35 #include "gallivm/lp_bld_misc.h"
36 #include "radeon/radeon_llvm.h"
37 #include "radeon/radeon_elf_util.h"
38 #include "radeon/radeon_llvm_emit.h"
39 #include "util/u_memory.h"
40 #include "util/u_string.h"
41 #include "tgsi/tgsi_build.h"
42 #include "tgsi/tgsi_util.h"
43 #include "tgsi/tgsi_dump.h"
44
45 #include "si_shader_internal.h"
46 #include "si_pipe.h"
47 #include "sid.h"
48
49
/* Names of the symbols holding the scratch buffer resource descriptor
 * dwords. NOTE(review): presumably patched into the shader binary at
 * upload time — confirm against the ELF/binary upload code. */
static const char *scratch_rsrc_dword0_symbol =
	"SCRATCH_RSRC_DWORD0";

static const char *scratch_rsrc_dword1_symbol =
	"SCRATCH_RSRC_DWORD1";
55
/* One shader output: up to 4 channel values plus its TGSI semantic. */
struct si_shader_output_values
{
	LLVMValueRef values[4]; /* one LLVM value per channel (x,y,z,w) */
	unsigned name;          /* TGSI_SEMANTIC_* */
	unsigned sid;           /* semantic index */
};
62
struct si_shader_context
{
	/* Must be first: code casts lp_build_tgsi_context* straight to
	 * si_shader_context* (see si_shader_context()). */
	struct radeon_llvm_context radeon_bld;
	struct si_shader *shader;
	struct si_screen *screen;

	unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
	bool is_gs_copy_shader;

	/* Whether to generate the optimized shader variant compiled as a whole
	 * (without a prolog and epilog)
	 */
	bool is_monolithic;

	/* Indices of the main function's parameters, for LLVMGetParam. */
	int param_streamout_config;
	int param_streamout_write_index;
	int param_streamout_offset[4];
	int param_vertex_id;
	int param_rel_auto_id;
	int param_vs_prim_id;
	int param_instance_id;
	int param_vertex_index0;
	int param_tes_u;
	int param_tes_v;
	int param_tes_rel_patch_id;
	int param_tes_patch_id;
	int param_es2gs_offset;
	int param_oc_lds;

	/* Sets a bit if the dynamic HS control word was 0x80000000. The bit is
	 * 0x800000 for VS, 0x1 for ES.
	 */
	int param_tess_offchip;

	LLVMTargetMachineRef tm;

	/* Metadata kind IDs used to tag loads (see build_indexed_load and
	 * build_indexed_load_const). */
	unsigned invariant_load_md_kind;
	unsigned range_md_kind;
	unsigned uniform_md_kind;
	LLVMValueRef empty_md;

	/* Preloaded descriptors. */
	LLVMValueRef esgs_ring;
	LLVMValueRef gsvs_ring[4];

	LLVMValueRef lds; /* base pointer for LDS loads/stores (lds_load/lds_store) */
	LLVMValueRef gs_next_vertex[4];
	LLVMValueRef return_value;

	/* Cached LLVM types, built once per context. */
	LLVMTypeRef voidt;
	LLVMTypeRef i1;
	LLVMTypeRef i8;
	LLVMTypeRef i32;
	LLVMTypeRef i64;
	LLVMTypeRef i128;
	LLVMTypeRef f32;
	LLVMTypeRef v16i8;
	LLVMTypeRef v2i32;
	LLVMTypeRef v4i32;
	LLVMTypeRef v4f32;
	LLVMTypeRef v8i32;

	LLVMValueRef shared_memory;
};
127
/* Recover the si_shader_context from the embedded TGSI build context.
 * NOTE(review): valid only because bld_base lives at offset 0 of the
 * embedded radeon_llvm_context, which is the struct's first member —
 * the usual container-of pattern. */
static struct si_shader_context *si_shader_context(
	struct lp_build_tgsi_context *bld_base)
{
	return (struct si_shader_context *)bld_base;
}
133
134 static void si_init_shader_ctx(struct si_shader_context *ctx,
135 struct si_screen *sscreen,
136 struct si_shader *shader,
137 LLVMTargetMachineRef tm);
138
139 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
140 struct lp_build_tgsi_context *bld_base,
141 struct lp_build_emit_data *emit_data);
142
143 static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
144 FILE *f);
145
146 /* Ideally pass the sample mask input to the PS epilog as v13, which
147 * is its usual location, so that the shader doesn't have to add v_mov.
148 */
149 #define PS_EPILOG_SAMPLEMASK_MIN_LOC 13
150
151 /* The VS location of the PrimitiveID input is the same in the epilog,
152 * so that the main shader part doesn't have to move it.
153 */
154 #define VS_EPILOG_PRIMID_LOC 2
155
156 enum {
157 CONST_ADDR_SPACE = 2,
158 LOCAL_ADDR_SPACE = 3,
159 };
160
161 #define SENDMSG_GS 2
162 #define SENDMSG_GS_DONE 3
163
164 #define SENDMSG_GS_OP_NOP (0 << 4)
165 #define SENDMSG_GS_OP_CUT (1 << 4)
166 #define SENDMSG_GS_OP_EMIT (2 << 4)
167 #define SENDMSG_GS_OP_EMIT_CUT (3 << 4)
168
169 /**
170 * Returns a unique index for a semantic name and index. The index must be
171 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
172 * calculated.
173 */
174 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
175 {
176 switch (semantic_name) {
177 case TGSI_SEMANTIC_POSITION:
178 return 0;
179 case TGSI_SEMANTIC_PSIZE:
180 return 1;
181 case TGSI_SEMANTIC_CLIPDIST:
182 assert(index <= 1);
183 return 2 + index;
184 case TGSI_SEMANTIC_GENERIC:
185 if (index <= 63-4)
186 return 4 + index;
187 else
188 /* same explanation as in the default statement,
189 * the only user hitting this is st/nine.
190 */
191 return 0;
192
193 /* patch indices are completely separate and thus start from 0 */
194 case TGSI_SEMANTIC_TESSOUTER:
195 return 0;
196 case TGSI_SEMANTIC_TESSINNER:
197 return 1;
198 case TGSI_SEMANTIC_PATCH:
199 return 2 + index;
200
201 default:
202 /* Don't fail here. The result of this function is only used
203 * for LS, TCS, TES, and GS, where legacy GL semantics can't
204 * occur, but this function is called for all vertex shaders
205 * before it's known whether LS will be compiled or not.
206 */
207 return 0;
208 }
209 }
210
211 /**
212 * Get the value of a shader input parameter and extract a bitfield.
213 */
214 static LLVMValueRef unpack_param(struct si_shader_context *ctx,
215 unsigned param, unsigned rshift,
216 unsigned bitwidth)
217 {
218 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
219 LLVMValueRef value = LLVMGetParam(ctx->radeon_bld.main_fn,
220 param);
221
222 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
223 value = bitcast(&ctx->radeon_bld.soa.bld_base,
224 TGSI_TYPE_UNSIGNED, value);
225
226 if (rshift)
227 value = LLVMBuildLShr(gallivm->builder, value,
228 lp_build_const_int32(gallivm, rshift), "");
229
230 if (rshift + bitwidth < 32) {
231 unsigned mask = (1 << bitwidth) - 1;
232 value = LLVMBuildAnd(gallivm->builder, value,
233 lp_build_const_int32(gallivm, mask), "");
234 }
235
236 return value;
237 }
238
239 static LLVMValueRef get_rel_patch_id(struct si_shader_context *ctx)
240 {
241 switch (ctx->type) {
242 case PIPE_SHADER_TESS_CTRL:
243 return unpack_param(ctx, SI_PARAM_REL_IDS, 0, 8);
244
245 case PIPE_SHADER_TESS_EVAL:
246 return LLVMGetParam(ctx->radeon_bld.main_fn,
247 ctx->param_tes_rel_patch_id);
248
249 default:
250 assert(0);
251 return NULL;
252 }
253 }
254
255 /* Tessellation shaders pass outputs to the next shader using LDS.
256 *
257 * LS outputs = TCS inputs
258 * TCS outputs = TES inputs
259 *
260 * The LDS layout is:
261 * - TCS inputs for patch 0
262 * - TCS inputs for patch 1
263 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
264 * - ...
265 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
266 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
267 * - TCS outputs for patch 1
268 * - Per-patch TCS outputs for patch 1
269 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
270 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
271 * - ...
272 *
273 * All three shaders VS(LS), TCS, TES share the same LDS space.
274 */
275
276 static LLVMValueRef
277 get_tcs_in_patch_stride(struct si_shader_context *ctx)
278 {
279 if (ctx->type == PIPE_SHADER_VERTEX)
280 return unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 0, 13);
281 else if (ctx->type == PIPE_SHADER_TESS_CTRL)
282 return unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 0, 13);
283 else {
284 assert(0);
285 return NULL;
286 }
287 }
288
/* Stride of one patch's worth of TCS outputs: 13-bit field of
 * SI_PARAM_TCS_OUT_LAYOUT, bits [0:12]. */
static LLVMValueRef
get_tcs_out_patch_stride(struct si_shader_context *ctx)
{
	return unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 0, 13);
}
294
295 static LLVMValueRef
296 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
297 {
298 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
299 unpack_param(ctx,
300 SI_PARAM_TCS_OUT_OFFSETS,
301 0, 16),
302 4);
303 }
304
305 static LLVMValueRef
306 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
307 {
308 return lp_build_mul_imm(&ctx->radeon_bld.soa.bld_base.uint_bld,
309 unpack_param(ctx,
310 SI_PARAM_TCS_OUT_OFFSETS,
311 16, 16),
312 4);
313 }
314
315 static LLVMValueRef
316 get_tcs_in_current_patch_offset(struct si_shader_context *ctx)
317 {
318 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
319 LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
320 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
321
322 return LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, "");
323 }
324
325 static LLVMValueRef
326 get_tcs_out_current_patch_offset(struct si_shader_context *ctx)
327 {
328 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
329 LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
330 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
331 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
332
333 return LLVMBuildAdd(gallivm->builder, patch0_offset,
334 LLVMBuildMul(gallivm->builder, patch_stride,
335 rel_patch_id, ""),
336 "");
337 }
338
339 static LLVMValueRef
340 get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
341 {
342 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
343 LLVMValueRef patch0_patch_data_offset =
344 get_tcs_out_patch0_patch_data_offset(ctx);
345 LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
346 LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
347
348 return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
349 LLVMBuildMul(gallivm->builder, patch_stride,
350 rel_patch_id, ""),
351 "");
352 }
353
354 static LLVMValueRef build_gep0(struct si_shader_context *ctx,
355 LLVMValueRef base_ptr, LLVMValueRef index)
356 {
357 LLVMValueRef indices[2] = {
358 LLVMConstInt(ctx->i32, 0, 0),
359 index,
360 };
361 return LLVMBuildGEP(ctx->radeon_bld.gallivm.builder, base_ptr,
362 indices, 2, "");
363 }
364
365 static void build_indexed_store(struct si_shader_context *ctx,
366 LLVMValueRef base_ptr, LLVMValueRef index,
367 LLVMValueRef value)
368 {
369 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
370 struct gallivm_state *gallivm = bld_base->base.gallivm;
371
372 LLVMBuildStore(gallivm->builder, value,
373 build_gep0(ctx, base_ptr, index));
374 }
375
376 /**
377 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
378 * It's equivalent to doing a load from &base_ptr[index].
379 *
380 * \param base_ptr Where the array starts.
381 * \param index The element index into the array.
382 * \param uniform Whether the base_ptr and index can be assumed to be
383 * dynamically uniform
384 */
385 static LLVMValueRef build_indexed_load(struct si_shader_context *ctx,
386 LLVMValueRef base_ptr, LLVMValueRef index,
387 bool uniform)
388 {
389 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
390 struct gallivm_state *gallivm = bld_base->base.gallivm;
391 LLVMValueRef pointer;
392
393 pointer = build_gep0(ctx, base_ptr, index);
394 if (uniform)
395 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
396 return LLVMBuildLoad(gallivm->builder, pointer, "");
397 }
398
399 /**
400 * Do a load from &base_ptr[index], but also add a flag that it's loading
401 * a constant from a dynamically uniform index.
402 */
403 static LLVMValueRef build_indexed_load_const(
404 struct si_shader_context *ctx,
405 LLVMValueRef base_ptr, LLVMValueRef index)
406 {
407 LLVMValueRef result = build_indexed_load(ctx, base_ptr, index, true);
408 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
409 return result;
410 }
411
412 static LLVMValueRef get_instance_index_for_fetch(
413 struct radeon_llvm_context *radeon_bld,
414 unsigned param_start_instance, unsigned divisor)
415 {
416 struct si_shader_context *ctx =
417 si_shader_context(&radeon_bld->soa.bld_base);
418 struct gallivm_state *gallivm = radeon_bld->soa.bld_base.base.gallivm;
419
420 LLVMValueRef result = LLVMGetParam(radeon_bld->main_fn,
421 ctx->param_instance_id);
422
423 /* The division must be done before START_INSTANCE is added. */
424 if (divisor > 1)
425 result = LLVMBuildUDiv(gallivm->builder, result,
426 lp_build_const_int32(gallivm, divisor), "");
427
428 return LLVMBuildAdd(gallivm->builder, result,
429 LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
430 }
431
/* Declare one VS input attribute: fetch it from the vertex buffer and
 * split the resulting vec4 into per-channel values in out[0..3]. */
static void declare_input_vs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = base->gallivm;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	unsigned divisor =
		ctx->shader->key.vs.prolog.instance_divisors[input_index];

	unsigned chan;

	LLVMValueRef t_list_ptr;
	LLVMValueRef t_offset;
	LLVMValueRef t_list;
	LLVMValueRef attribute_offset;
	LLVMValueRef buffer_index;
	LLVMValueRef args[3];
	LLVMValueRef input;

	/* Load the T list */
	t_list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_VERTEX_BUFFERS);

	t_offset = lp_build_const_int32(gallivm, input_index);

	t_list = build_indexed_load_const(ctx, t_list_ptr, t_offset);

	/* Build the attribute offset */
	attribute_offset = lp_build_const_int32(gallivm, 0);

	if (!ctx->is_monolithic) {
		/* Non-monolithic: the fetch index arrives as a dedicated
		 * function parameter per attribute (param_vertex_index0 + i).
		 * NOTE(review): presumably computed by the VS prolog part —
		 * confirm against the prolog generator. */
		buffer_index = LLVMGetParam(radeon_bld->main_fn,
					    ctx->param_vertex_index0 +
					    input_index);
	} else if (divisor) {
		/* Build index from instance ID, start instance and divisor */
		ctx->shader->info.uses_instanceid = true;
		buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
							    SI_PARAM_START_INSTANCE,
							    divisor);
	} else {
		/* Load the buffer index for vertices. */
		LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
						      ctx->param_vertex_id);
		LLVMValueRef base_vertex = LLVMGetParam(radeon_bld->main_fn,
							SI_PARAM_BASE_VERTEX);
		buffer_index = LLVMBuildAdd(gallivm->builder, base_vertex, vertex_id, "");
	}

	args[0] = t_list;
	args[1] = attribute_offset;
	args[2] = buffer_index;
	input = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
				   LLVMReadNoneAttribute);

	/* Break up the vec4 into individual components */
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
		out[chan] = LLVMBuildExtractElement(gallivm->builder,
						    input, llvm_chan, "");
	}
}
498
/* Return the PrimitiveID system value for the current stage.
 * PrimitiveID is scalar, so any swizzle other than .x yields 0. */
static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
				     unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);

	if (swizzle > 0)
		return bld_base->uint_bld.zero;

	/* Each stage receives PrimitiveID through a different parameter. */
	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    ctx->param_vs_prim_id);
	case PIPE_SHADER_TESS_CTRL:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    SI_PARAM_PATCH_ID);
	case PIPE_SHADER_TESS_EVAL:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    ctx->param_tes_patch_id);
	case PIPE_SHADER_GEOMETRY:
		return LLVMGetParam(ctx->radeon_bld.main_fn,
				    SI_PARAM_PRIMITIVE_ID);
	default:
		assert(0);
		return bld_base->uint_bld.zero;
	}
}
525
526 /**
527 * Return the value of tgsi_ind_register for indexing.
528 * This is the indirect index with the constant offset added to it.
529 */
530 static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
531 const struct tgsi_ind_register *ind,
532 int rel_index)
533 {
534 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
535 LLVMValueRef result;
536
537 result = ctx->radeon_bld.soa.addr[ind->Index][ind->Swizzle];
538 result = LLVMBuildLoad(gallivm->builder, result, "");
539 result = LLVMBuildAdd(gallivm->builder, result,
540 lp_build_const_int32(gallivm, rel_index), "");
541 return result;
542 }
543
/**
 * Like get_indirect_index, but restricts the return value to a (possibly
 * undefined) value inside [0..num).
 */
static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
					       const struct tgsi_ind_register *ind,
					       int rel_index, unsigned num)
{
	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);

	/* LLVM 3.8: If indirect resource indexing is used:
	 * - SI & CIK hang
	 * - VI crashes
	 */
	/* NOTE: on affected LLVM versions the index is discarded entirely
	 * and an undef value is returned instead. */
	if (HAVE_LLVM <= 0x0308)
		return LLVMGetUndef(ctx->i32);

	return radeon_llvm_bound_index(&ctx->radeon_bld, result, num);
}
563
564
565 /**
566 * Calculate a dword address given an input or output register and a stride.
567 */
568 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
569 const struct tgsi_full_dst_register *dst,
570 const struct tgsi_full_src_register *src,
571 LLVMValueRef vertex_dw_stride,
572 LLVMValueRef base_addr)
573 {
574 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
575 struct tgsi_shader_info *info = &ctx->shader->selector->info;
576 ubyte *name, *index, *array_first;
577 int first, param;
578 struct tgsi_full_dst_register reg;
579
580 /* Set the register description. The address computation is the same
581 * for sources and destinations. */
582 if (src) {
583 reg.Register.File = src->Register.File;
584 reg.Register.Index = src->Register.Index;
585 reg.Register.Indirect = src->Register.Indirect;
586 reg.Register.Dimension = src->Register.Dimension;
587 reg.Indirect = src->Indirect;
588 reg.Dimension = src->Dimension;
589 reg.DimIndirect = src->DimIndirect;
590 } else
591 reg = *dst;
592
593 /* If the register is 2-dimensional (e.g. an array of vertices
594 * in a primitive), calculate the base address of the vertex. */
595 if (reg.Register.Dimension) {
596 LLVMValueRef index;
597
598 if (reg.Dimension.Indirect)
599 index = get_indirect_index(ctx, &reg.DimIndirect,
600 reg.Dimension.Index);
601 else
602 index = lp_build_const_int32(gallivm, reg.Dimension.Index);
603
604 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
605 LLVMBuildMul(gallivm->builder, index,
606 vertex_dw_stride, ""), "");
607 }
608
609 /* Get information about the register. */
610 if (reg.Register.File == TGSI_FILE_INPUT) {
611 name = info->input_semantic_name;
612 index = info->input_semantic_index;
613 array_first = info->input_array_first;
614 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
615 name = info->output_semantic_name;
616 index = info->output_semantic_index;
617 array_first = info->output_array_first;
618 } else {
619 assert(0);
620 return NULL;
621 }
622
623 if (reg.Register.Indirect) {
624 /* Add the relative address of the element. */
625 LLVMValueRef ind_index;
626
627 if (reg.Indirect.ArrayID)
628 first = array_first[reg.Indirect.ArrayID];
629 else
630 first = reg.Register.Index;
631
632 ind_index = get_indirect_index(ctx, &reg.Indirect,
633 reg.Register.Index - first);
634
635 base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
636 LLVMBuildMul(gallivm->builder, ind_index,
637 lp_build_const_int32(gallivm, 4), ""), "");
638
639 param = si_shader_io_get_unique_index(name[first], index[first]);
640 } else {
641 param = si_shader_io_get_unique_index(name[reg.Register.Index],
642 index[reg.Register.Index]);
643 }
644
645 /* Add the base address of the element. */
646 return LLVMBuildAdd(gallivm->builder, base_addr,
647 lp_build_const_int32(gallivm, param * 4), "");
648 }
649
650 /* The offchip buffer layout for TCS->TES is
651 *
652 * - attribute 0 of patch 0 vertex 0
653 * - attribute 0 of patch 0 vertex 1
654 * - attribute 0 of patch 0 vertex 2
655 * ...
656 * - attribute 0 of patch 1 vertex 0
657 * - attribute 0 of patch 1 vertex 1
658 * ...
659 * - attribute 1 of patch 0 vertex 0
660 * - attribute 1 of patch 0 vertex 1
661 * ...
662 * - per patch attribute 0 of patch 0
663 * - per patch attribute 0 of patch 1
664 * ...
665 *
666 * Note that every attribute has 4 components.
667 */
/* Compute the offchip-buffer byte address for a TCS output / TES input.
 *
 * \param vertex_index  vertex within the patch, or NULL for a per-patch
 *                      attribute
 * \param param_index   attribute index (each attribute is 16 bytes: vec4)
 */
static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx,
					       LLVMValueRef vertex_index,
					       LLVMValueRef param_index)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
	LLVMValueRef param_stride, constant16;

	/* Patch/vertex counts come packed in TCS_OFFCHIP_LAYOUT. */
	vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6);
	num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9);
	total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch,
				      num_patches, "");

	constant16 = lp_build_const_int32(gallivm, 16);
	if (vertex_index) {
		/* Per-vertex attribute: index by global vertex number;
		 * consecutive attributes are total_vertices apart. */
		base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx),
					 vertices_per_patch, "");

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 vertex_index, "");

		param_stride = total_vertices;
	} else {
		/* Per-patch attribute: index by patch number;
		 * consecutive attributes are num_patches apart. */
		base_addr = get_rel_patch_id(ctx);
		param_stride = num_patches;
	}

	base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
				 LLVMBuildMul(gallivm->builder, param_index,
					      param_stride, ""), "");

	/* Scale to bytes: every attribute is a 16-byte vec4. */
	base_addr = LLVMBuildMul(gallivm->builder, base_addr, constant16, "");

	if (!vertex_index) {
		/* Per-patch data starts after all per-vertex data;
		 * the offset is packed in TCS_OFFCHIP_LAYOUT bits [16:31]. */
		LLVMValueRef patch_data_offset =
			unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 16, 16);

		base_addr = LLVMBuildAdd(gallivm->builder, base_addr,
					 patch_data_offset, "");
	}
	return base_addr;
}
710
/* Like get_tcs_tes_buffer_address, but derive the vertex and attribute
 * indices from a TGSI register (src takes precedence over dst). */
static LLVMValueRef get_tcs_tes_buffer_address_from_reg(
				struct si_shader_context *ctx,
				const struct tgsi_full_dst_register *dst,
				const struct tgsi_full_src_register *src)
{
	struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
	struct tgsi_shader_info *info = &ctx->shader->selector->info;
	ubyte *name, *index, *array_first;
	struct tgsi_full_src_register reg;
	LLVMValueRef vertex_index = NULL;
	LLVMValueRef param_index = NULL;
	unsigned param_index_base, param_base;

	reg = src ? *src : tgsi_full_src_register_from_dst(dst);

	/* 2D register: the first dimension selects the vertex. */
	if (reg.Register.Dimension) {

		if (reg.Dimension.Indirect)
			vertex_index = get_indirect_index(ctx, &reg.DimIndirect,
							  reg.Dimension.Index);
		else
			vertex_index = lp_build_const_int32(gallivm,
							    reg.Dimension.Index);
	}

	/* Get information about the register. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return NULL;
	}

	if (reg.Register.Indirect) {
		/* Indirect addressing: the dynamic part is relative to the
		 * array's first register (or the register itself). */
		if (reg.Indirect.ArrayID)
			param_base = array_first[reg.Indirect.ArrayID];
		else
			param_base = reg.Register.Index;

		param_index = get_indirect_index(ctx, &reg.Indirect,
						 reg.Register.Index - param_base);

	} else {
		param_base = reg.Register.Index;
		param_index = lp_build_const_int32(gallivm, 0);
	}

	/* Translate the semantic into the unique attribute slot and add
	 * the dynamic part. */
	param_index_base = si_shader_io_get_unique_index(name[param_base],
							 index[param_base]);

	param_index = LLVMBuildAdd(gallivm->builder, param_index,
				   lp_build_const_int32(gallivm, param_index_base),
				   "");

	return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
}
773
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4). */
static void build_tbuffer_store(struct si_shader_context *ctx,
				LLVMValueRef rsrc,
				LLVMValueRef vdata,
				unsigned num_channels,
				LLVMValueRef vaddr,
				LLVMValueRef soffset,
				unsigned inst_offset,
				unsigned dfmt,
				unsigned nfmt,
				unsigned offen,
				unsigned idxen,
				unsigned glc,
				unsigned slc,
				unsigned tfe)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	/* Argument order matches the llvm.SI.tbuffer.store.* intrinsic. */
	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		vaddr,
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt, 0),
		LLVMConstInt(ctx->i32, nfmt, 0),
		LLVMConstInt(ctx->i32, offen, 0),
		LLVMConstInt(ctx->i32, idxen, 0),
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, tfe, 0)
	};

	/* The instruction offset field has 12 bits */
	assert(offen || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	/* num_channels 3 and 4 share the v4i32 form. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	lp_build_intrinsic(gallivm->builder, name, ctx->voidt,
			   args, ARRAY_SIZE(args), 0);
}
821
822 static void build_tbuffer_store_dwords(struct si_shader_context *ctx,
823 LLVMValueRef rsrc,
824 LLVMValueRef vdata,
825 unsigned num_channels,
826 LLVMValueRef vaddr,
827 LLVMValueRef soffset,
828 unsigned inst_offset)
829 {
830 static unsigned dfmt[] = {
831 V_008F0C_BUF_DATA_FORMAT_32,
832 V_008F0C_BUF_DATA_FORMAT_32_32,
833 V_008F0C_BUF_DATA_FORMAT_32_32_32,
834 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
835 };
836 assert(num_channels >= 1 && num_channels <= 4);
837
838 build_tbuffer_store(ctx, rsrc, vdata, num_channels, vaddr, soffset,
839 inst_offset, dfmt[num_channels-1],
840 V_008F0C_BUF_NUM_FORMAT_UINT, 1, 0, 1, 1, 0);
841 }
842
/* Emit a typed buffer load of 1, 2, or 4 dwords (num_channels 3 uses the
 * 4-channel form). Uses llvm.amdgcn.buffer.load.* on LLVM >= 3.9 and the
 * legacy llvm.SI.buffer.load.dword.* otherwise.
 *
 * \param vindex   buffer element index, or NULL
 * \param voffset  additional dynamic byte offset, or NULL
 * \param soffset  scalar byte offset, or NULL (legacy path requires it)
 */
static LLVMValueRef build_buffer_load(struct si_shader_context *ctx,
				      LLVMValueRef rsrc,
				      int num_channels,
				      LLVMValueRef vindex,
				      LLVMValueRef voffset,
				      LLVMValueRef soffset,
				      unsigned inst_offset,
				      unsigned glc,
				      unsigned slc)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	if (HAVE_LLVM >= 0x309) {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""),
			vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0)
		};

		LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
				       ctx->v4f32};
		const char *type_names[] = {"f32", "v2f32", "v4f32"};
		char name[256];

		/* The new intrinsic takes a single combined offset;
		 * fold voffset and soffset into it. */
		if (voffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], voffset,
					       "");
		}

		if (soffset) {
			args[2] = LLVMBuildAdd(gallivm->builder, args[2], soffset,
					       "");
		}

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
			 type_names[func]);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	} else {
		LLVMValueRef args[] = {
			LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v16i8, ""),
			voffset ? voffset : vindex,
			soffset,
			LLVMConstInt(ctx->i32, inst_offset, 0),
			LLVMConstInt(ctx->i32, voffset ? 1 : 0, 0), // offen
			LLVMConstInt(ctx->i32, vindex ? 1 : 0, 0), //idxen
			LLVMConstInt(ctx->i32, glc, 0),
			LLVMConstInt(ctx->i32, slc, 0),
			LLVMConstInt(ctx->i32, 0, 0), // TFE
		};

		LLVMTypeRef types[] = {ctx->i32, LLVMVectorType(ctx->i32, 2),
				       ctx->v4i32};
		const char *type_names[] = {"i32", "v2i32", "v4i32"};
		const char *arg_type = "i32";
		char name[256];

		/* With both an index and an offset, the legacy intrinsic
		 * takes them packed as a v2i32 address. */
		if (voffset && vindex) {
			LLVMValueRef vaddr[] = {vindex, voffset};

			arg_type = "v2i32";
			args[1] = lp_build_gather_values(gallivm, vaddr, 2);
		}

		snprintf(name, sizeof(name), "llvm.SI.buffer.load.dword.%s.%s",
			 type_names[func], arg_type);

		return lp_build_intrinsic(gallivm->builder, name, types[func], args,
					  ARRAY_SIZE(args), LLVMReadOnlyAttribute);
	}
}
918
/* Load one channel (or, with swizzle == ~0, a whole vec4) of the given
 * TGSI type from a buffer at base+offset. 64-bit types are assembled
 * from two adjacent dword loads. */
static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base,
				enum tgsi_opcode_type type, unsigned swizzle,
				LLVMValueRef buffer, LLVMValueRef offset,
				LLVMValueRef base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value, value2;
	LLVMTypeRef llvm_type = tgsi2llvmtype(bld_base, type);
	LLVMTypeRef vec_type = LLVMVectorType(llvm_type, 4);

	if (swizzle == ~0) {
		/* Full vec4 load. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		return LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
	}

	if (!tgsi_type_is_64bit(type)) {
		/* 32-bit: load the vec4 and extract the requested channel. */
		value = build_buffer_load(ctx, buffer, 4, NULL, base, offset,
					  0, 1, 0);

		value = LLVMBuildBitCast(gallivm->builder, value, vec_type, "");
		return LLVMBuildExtractElement(gallivm->builder, value,
					       lp_build_const_int32(gallivm, swizzle), "");
	}

	/* 64-bit: load the two dwords at swizzle*4 and swizzle*4+4 and
	 * combine them. */
	value = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				  swizzle * 4, 1, 0);

	value2 = build_buffer_load(ctx, buffer, 1, NULL, base, offset,
				   swizzle * 4 + 4, 1, 0);

	return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
}
954
/**
 * Load from LDS.
 *
 * \param type		output value type
 * \param swizzle	offset (typically 0..3); it can be ~0, which loads a vec4
 * \param dw_addr	address in dwords
 */
static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
			     enum tgsi_opcode_type type, unsigned swizzle,
			     LLVMValueRef dw_addr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef value;

	if (swizzle == ~0) {
		/* vec4: recurse once per channel and gather. */
		LLVMValueRef values[TGSI_NUM_CHANNELS];

		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
			values[chan] = lds_load(bld_base, type, chan, dw_addr);

		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
			       lp_build_const_int32(gallivm, swizzle));

	value = build_indexed_load(ctx, ctx->lds, dw_addr, false);
	if (tgsi_type_is_64bit(type)) {
		/* 64-bit types occupy two consecutive dwords. */
		LLVMValueRef value2;
		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
				       lp_build_const_int32(gallivm, swizzle + 1));
		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
		return radeon_llvm_emit_fetch_64bit(bld_base, type, value, value2);
	}

	return LLVMBuildBitCast(gallivm->builder, value,
				tgsi2llvmtype(bld_base, type), "");
}
995
996 /**
997 * Store to LDS.
998 *
999 * \param swizzle offset (typically 0..3)
1000 * \param dw_addr address in dwords
1001 * \param value value to store
1002 */
1003 static void lds_store(struct lp_build_tgsi_context *bld_base,
1004 unsigned swizzle, LLVMValueRef dw_addr,
1005 LLVMValueRef value)
1006 {
1007 struct si_shader_context *ctx = si_shader_context(bld_base);
1008 struct gallivm_state *gallivm = bld_base->base.gallivm;
1009
1010 dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
1011 lp_build_const_int32(gallivm, swizzle));
1012
1013 value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
1014 build_indexed_store(ctx, ctx->lds,
1015 dw_addr, value);
1016 }
1017
1018 static LLVMValueRef fetch_input_tcs(
1019 struct lp_build_tgsi_context *bld_base,
1020 const struct tgsi_full_src_register *reg,
1021 enum tgsi_opcode_type type, unsigned swizzle)
1022 {
1023 struct si_shader_context *ctx = si_shader_context(bld_base);
1024 LLVMValueRef dw_addr, stride;
1025
1026 stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
1027 dw_addr = get_tcs_in_current_patch_offset(ctx);
1028 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1029
1030 return lds_load(bld_base, type, swizzle, dw_addr);
1031 }
1032
1033 static LLVMValueRef fetch_output_tcs(
1034 struct lp_build_tgsi_context *bld_base,
1035 const struct tgsi_full_src_register *reg,
1036 enum tgsi_opcode_type type, unsigned swizzle)
1037 {
1038 struct si_shader_context *ctx = si_shader_context(bld_base);
1039 LLVMValueRef dw_addr, stride;
1040
1041 if (reg->Register.Dimension) {
1042 stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
1043 dw_addr = get_tcs_out_current_patch_offset(ctx);
1044 dw_addr = get_dw_address(ctx, NULL, reg, stride, dw_addr);
1045 } else {
1046 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
1047 dw_addr = get_dw_address(ctx, NULL, reg, NULL, dw_addr);
1048 }
1049
1050 return lds_load(bld_base, type, swizzle, dw_addr);
1051 }
1052
1053 static LLVMValueRef fetch_input_tes(
1054 struct lp_build_tgsi_context *bld_base,
1055 const struct tgsi_full_src_register *reg,
1056 enum tgsi_opcode_type type, unsigned swizzle)
1057 {
1058 struct si_shader_context *ctx = si_shader_context(bld_base);
1059 struct gallivm_state *gallivm = bld_base->base.gallivm;
1060 LLVMValueRef rw_buffers, buffer, base, addr;
1061
1062 rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
1063 SI_PARAM_RW_BUFFERS);
1064 buffer = build_indexed_load_const(ctx, rw_buffers,
1065 lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));
1066
1067 base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
1068 addr = get_tcs_tes_buffer_address_from_reg(ctx, NULL, reg);
1069
1070 return buffer_load(bld_base, type, swizzle, buffer, base, addr);
1071 }
1072
/* Store a TCS output both to LDS (so later TCS invocations can read it back)
 * and to the off-chip tess ring buffer (for the TES to consume).
 *
 * Anything that is not a per-patch or per-vertex OUTPUT register falls back
 * to the generic store path.
 */
static void store_output_tcs(struct lp_build_tgsi_context *bld_base,
			     const struct tgsi_full_instruction *inst,
			     const struct tgsi_opcode_info *info,
			     LLVMValueRef dst[4])
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
	unsigned chan_index;
	LLVMValueRef dw_addr, stride;
	LLVMValueRef rw_buffers, buffer, base, buf_addr;
	LLVMValueRef values[4];

	/* Only handle per-patch and per-vertex outputs here.
	 * Vectors will be lowered to scalars and this function will be called again.
	 */
	if (reg->Register.File != TGSI_FILE_OUTPUT ||
	    (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) {
		radeon_llvm_emit_store(bld_base, inst, info, dst);
		return;
	}

	/* Compute the LDS dword address of the output. */
	if (reg->Register.Dimension) {
		/* Per-vertex output: stride is in bits 20:13 of TCS_OUT_LAYOUT. */
		stride = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 13, 8);
		dw_addr = get_tcs_out_current_patch_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, stride, dw_addr);
	} else {
		/* Per-patch output: no vertex stride. */
		dw_addr = get_tcs_out_current_patch_data_offset(ctx);
		dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr);
	}

	/* Fetch the off-chip tess ring descriptor and buffer address. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
	buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL);


	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan_index) {
		LLVMValueRef value = dst[chan_index];

		if (inst->Instruction.Saturate)
			value = radeon_llvm_saturate(bld_base, value);

		/* Always keep a copy in LDS for TCS-side readbacks. */
		lds_store(bld_base, chan_index, dw_addr, value);

		value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, "");
		values[chan_index] = value;

		/* Partial writemask: store each written dword separately.
		 * (A full 0xF mask is stored as one 4-dword write below.) */
		if (inst->Dst[0].Register.WriteMask != 0xF) {
			build_tbuffer_store_dwords(ctx, buffer, value, 1,
						   buf_addr, base,
						   4 * chan_index);
		}
	}

	/* Full writemask: combine the four channels into one vec4 store. */
	if (inst->Dst[0].Register.WriteMask == 0xF) {
		LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm,
							    values, 4);
		build_tbuffer_store_dwords(ctx, buffer, value, 4, buf_addr,
					   base, 0);
	}
}
1138
/* Fetch a GS input from the ESGS ring buffer.
 *
 * PRIMID is special-cased (it is not stored in the ring); swizzle == ~0
 * gathers all four channels into a vec4.
 */
static LLVMValueRef fetch_input_gs(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct lp_build_context *base = &bld_base->base;
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *uint =	&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMValueRef vtx_offset;
	LLVMValueRef args[9];
	unsigned vtx_offset_param;
	struct tgsi_shader_info *info = &shader->selector->info;
	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
	unsigned param;
	LLVMValueRef value;

	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);

	/* GS inputs are always 2-dimensional (vertex, attribute). */
	if (!reg->Register.Dimension)
		return NULL;

	/* Gather all four channels for a vec4 fetch. */
	if (swizzle == ~0) {
		LLVMValueRef values[TGSI_NUM_CHANNELS];
		unsigned chan;
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			values[chan] = fetch_input_gs(bld_base, reg, type, chan);
		}
		return lp_build_gather_values(bld_base->base.gallivm, values,
					      TGSI_NUM_CHANNELS);
	}

	/* Get the vertex offset parameter: vertices 0-1 use VTX0/VTX1,
	 * vertices 2-5 use VTX2..VTX5. */
	vtx_offset_param = reg->Dimension.Index;
	if (vtx_offset_param < 2) {
		vtx_offset_param += SI_PARAM_VTX0_OFFSET;
	} else {
		assert(vtx_offset_param < 6);
		vtx_offset_param += SI_PARAM_VTX2_OFFSET - 2;
	}
	/* The offset parameter is in dwords; convert to bytes. */
	vtx_offset = lp_build_mul_imm(uint,
				      LLVMGetParam(ctx->radeon_bld.main_fn,
						   vtx_offset_param),
				      4);

	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
	args[0] = ctx->esgs_ring;
	args[1] = vtx_offset;
	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
	args[3] = uint->zero;
	args[4] = uint->one;  /* OFFEN */
	args[5] = uint->zero; /* IDXEN */
	args[6] = uint->one;  /* GLC */
	args[7] = uint->zero; /* SLC */
	args[8] = uint->zero; /* TFE */

	value = lp_build_intrinsic(gallivm->builder,
				   "llvm.SI.buffer.load.dword.i32.i32",
				   ctx->i32, args, 9,
				   LLVMReadOnlyAttribute);
	/* 64-bit inputs need a second dword load from the next slot. */
	if (tgsi_type_is_64bit(type)) {
		LLVMValueRef value2;
		args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle + 1) * 256);
		value2 = lp_build_intrinsic(gallivm->builder,
					    "llvm.SI.buffer.load.dword.i32.i32",
					    ctx->i32, args, 9,
					    LLVMReadOnlyAttribute);
		return radeon_llvm_emit_fetch_64bit(bld_base, type,
						    value, value2);
	}
	return LLVMBuildBitCast(gallivm->builder,
				value,
				tgsi2llvmtype(bld_base, type), "");
}
1217
1218 static int lookup_interp_param_index(unsigned interpolate, unsigned location)
1219 {
1220 switch (interpolate) {
1221 case TGSI_INTERPOLATE_CONSTANT:
1222 return 0;
1223
1224 case TGSI_INTERPOLATE_LINEAR:
1225 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1226 return SI_PARAM_LINEAR_SAMPLE;
1227 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1228 return SI_PARAM_LINEAR_CENTROID;
1229 else
1230 return SI_PARAM_LINEAR_CENTER;
1231 break;
1232 case TGSI_INTERPOLATE_COLOR:
1233 case TGSI_INTERPOLATE_PERSPECTIVE:
1234 if (location == TGSI_INTERPOLATE_LOC_SAMPLE)
1235 return SI_PARAM_PERSP_SAMPLE;
1236 else if (location == TGSI_INTERPOLATE_LOC_CENTROID)
1237 return SI_PARAM_PERSP_CENTROID;
1238 else
1239 return SI_PARAM_PERSP_CENTER;
1240 break;
1241 default:
1242 fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
1243 return -1;
1244 }
1245 }
1246
/* Apply the shader key's forced-interpolation overrides to an interp
 * parameter index (monolithic shaders only).
 *
 * This shouldn't be used by explicit INTERP opcodes.
 *
 * NOTE(review): the force_*_center checks run after force_*_sample, so if
 * both flags were somehow set, CENTER would win (SAMPLE is remapped back to
 * CENTER). Presumably the state tracker never sets both — verify.
 */
static unsigned select_interp_param(struct si_shader_context *ctx,
				    unsigned param)
{
	/* Non-monolithic shaders handle this in the PS prolog instead. */
	if (!ctx->is_monolithic)
		return param;

	if (ctx->shader->key.ps.prolog.force_persp_sample_interp) {
		switch (param) {
		case SI_PARAM_PERSP_CENTROID:
		case SI_PARAM_PERSP_CENTER:
			return SI_PARAM_PERSP_SAMPLE;
		}
	}
	if (ctx->shader->key.ps.prolog.force_linear_sample_interp) {
		switch (param) {
		case SI_PARAM_LINEAR_CENTROID:
		case SI_PARAM_LINEAR_CENTER:
			return SI_PARAM_LINEAR_SAMPLE;
		}
	}
	if (ctx->shader->key.ps.prolog.force_persp_center_interp) {
		switch (param) {
		case SI_PARAM_PERSP_CENTROID:
		case SI_PARAM_PERSP_SAMPLE:
			return SI_PARAM_PERSP_CENTER;
		}
	}
	if (ctx->shader->key.ps.prolog.force_linear_center_interp) {
		switch (param) {
		case SI_PARAM_LINEAR_CENTROID:
		case SI_PARAM_LINEAR_SAMPLE:
			return SI_PARAM_LINEAR_CENTER;
		}
	}

	return param;
}
1285
1286 /**
1287 * Interpolate a fragment shader input.
1288 *
1289 * @param ctx context
1290 * @param input_index index of the input in hardware
1291 * @param semantic_name TGSI_SEMANTIC_*
1292 * @param semantic_index semantic index
1293 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
1294 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
1295 * @param interp_param interpolation weights (i,j)
1296 * @param prim_mask SI_PARAM_PRIM_MASK
1297 * @param face SI_PARAM_FRONT_FACE
1298 * @param result the return value (4 components)
1299 */
1300 static void interp_fs_input(struct si_shader_context *ctx,
1301 unsigned input_index,
1302 unsigned semantic_name,
1303 unsigned semantic_index,
1304 unsigned num_interp_inputs,
1305 unsigned colors_read_mask,
1306 LLVMValueRef interp_param,
1307 LLVMValueRef prim_mask,
1308 LLVMValueRef face,
1309 LLVMValueRef result[4])
1310 {
1311 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
1312 struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
1313 struct gallivm_state *gallivm = base->gallivm;
1314 const char *intr_name;
1315 LLVMValueRef attr_number;
1316
1317 unsigned chan;
1318
1319 attr_number = lp_build_const_int32(gallivm, input_index);
1320
1321 /* fs.constant returns the param from the middle vertex, so it's not
1322 * really useful for flat shading. It's meant to be used for custom
1323 * interpolation (but the intrinsic can't fetch from the other two
1324 * vertices).
1325 *
1326 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
1327 * to do the right thing. The only reason we use fs.constant is that
1328 * fs.interp cannot be used on integers, because they can be equal
1329 * to NaN.
1330 */
1331 intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
1332
1333 if (semantic_name == TGSI_SEMANTIC_COLOR &&
1334 ctx->shader->key.ps.prolog.color_two_side) {
1335 LLVMValueRef args[4];
1336 LLVMValueRef is_face_positive;
1337 LLVMValueRef back_attr_number;
1338
1339 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
1340 * otherwise it's at offset "num_inputs".
1341 */
1342 unsigned back_attr_offset = num_interp_inputs;
1343 if (semantic_index == 1 && colors_read_mask & 0xf)
1344 back_attr_offset += 1;
1345
1346 back_attr_number = lp_build_const_int32(gallivm, back_attr_offset);
1347
1348 is_face_positive = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
1349 face, uint->zero, "");
1350
1351 args[2] = prim_mask;
1352 args[3] = interp_param;
1353 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1354 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1355 LLVMValueRef front, back;
1356
1357 args[0] = llvm_chan;
1358 args[1] = attr_number;
1359 front = lp_build_intrinsic(gallivm->builder, intr_name,
1360 ctx->f32, args, args[3] ? 4 : 3,
1361 LLVMReadNoneAttribute);
1362
1363 args[1] = back_attr_number;
1364 back = lp_build_intrinsic(gallivm->builder, intr_name,
1365 ctx->f32, args, args[3] ? 4 : 3,
1366 LLVMReadNoneAttribute);
1367
1368 result[chan] = LLVMBuildSelect(gallivm->builder,
1369 is_face_positive,
1370 front,
1371 back,
1372 "");
1373 }
1374 } else if (semantic_name == TGSI_SEMANTIC_FOG) {
1375 LLVMValueRef args[4];
1376
1377 args[0] = uint->zero;
1378 args[1] = attr_number;
1379 args[2] = prim_mask;
1380 args[3] = interp_param;
1381 result[0] = lp_build_intrinsic(gallivm->builder, intr_name,
1382 ctx->f32, args, args[3] ? 4 : 3,
1383 LLVMReadNoneAttribute);
1384 result[1] =
1385 result[2] = lp_build_const_float(gallivm, 0.0f);
1386 result[3] = lp_build_const_float(gallivm, 1.0f);
1387 } else {
1388 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1389 LLVMValueRef args[4];
1390 LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
1391
1392 args[0] = llvm_chan;
1393 args[1] = attr_number;
1394 args[2] = prim_mask;
1395 args[3] = interp_param;
1396 result[chan] = lp_build_intrinsic(gallivm->builder, intr_name,
1397 ctx->f32, args, args[3] ? 4 : 3,
1398 LLVMReadNoneAttribute);
1399 }
1400 }
1401 }
1402
/* LLVMGetParam with bc_optimize resolved.
 *
 * Returns the interpolation weights for interp_param_idx; for CENTROID
 * parameters in monolithic shaders with bc_optimize enabled, selects
 * between CENTER and CENTROID at runtime based on PRIM_MASK bit 31.
 */
static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
				     int interp_param_idx)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef main_fn = ctx->radeon_bld.main_fn;
	LLVMValueRef param = NULL;

	/* Handle PRIM_MASK[31] (bc_optimize). */
	if (ctx->is_monolithic &&
	    ((ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
	      interp_param_idx == SI_PARAM_PERSP_CENTROID) ||
	     (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
	      interp_param_idx == SI_PARAM_LINEAR_CENTROID))) {
		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
		 * The hw doesn't compute CENTROID if the whole wave only
		 * contains fully-covered quads.
		 */
		/* Extract bit 31 of PRIM_MASK as an i1 condition. */
		LLVMValueRef bc_optimize =
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK);
		bc_optimize = LLVMBuildLShr(builder,
					    bc_optimize,
					    LLVMConstInt(ctx->i32, 31, 0), "");
		bc_optimize = LLVMBuildTrunc(builder, bc_optimize, ctx->i1, "");

		if (ctx->shader->key.ps.prolog.bc_optimize_for_persp &&
		    interp_param_idx == SI_PARAM_PERSP_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_PERSP_CENTROID),
						"");
		}
		if (ctx->shader->key.ps.prolog.bc_optimize_for_linear &&
		    interp_param_idx == SI_PARAM_LINEAR_CENTROID) {
			param = LLVMBuildSelect(builder, bc_optimize,
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTER),
						LLVMGetParam(main_fn,
							     SI_PARAM_LINEAR_CENTROID),
						"");
		}
	}

	/* No bc_optimize remapping applied: use the parameter directly. */
	if (!param)
		param = LLVMGetParam(main_fn, interp_param_idx);
	return param;
}
1452
/* Declare (i.e. emit code computing) a fragment shader input.
 *
 * In non-monolithic shaders, colors arrive pre-interpolated in VGPRs set
 * up by the PS prolog; everything else is interpolated here via
 * interp_fs_input().
 */
static void declare_input_fs(
	struct radeon_llvm_context *radeon_bld,
	unsigned input_index,
	const struct tgsi_full_declaration *decl,
	LLVMValueRef out[4])
{
	struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader *shader = ctx->shader;
	LLVMValueRef main_fn = radeon_bld->main_fn;
	LLVMValueRef interp_param = NULL;
	int interp_param_idx;

	/* Get colors from input VGPRs (set by the prolog). */
	if (!ctx->is_monolithic &&
	    decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
		unsigned i = decl->Semantic.Index;
		unsigned colors_read = shader->selector->info.colors_read;
		/* Color VGPRs follow POS_FIXED_PT; COLOR1's components come
		 * after however many COLOR0 components were read. */
		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
				  (i ? util_bitcount(colors_read & 0xf) : 0);
		unsigned mask = colors_read >> (i * 4);

		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
		return;
	}

	interp_param_idx = lookup_interp_param_index(decl->Interp.Interpolate,
						     decl->Interp.Location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx) {
		interp_param_idx = select_interp_param(ctx,
						       interp_param_idx);
		interp_param = get_interp_param(ctx, interp_param_idx);
	}

	if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
	    decl->Interp.Interpolate == TGSI_INTERPOLATE_COLOR &&
	    ctx->shader->key.ps.prolog.flatshade_colors)
		interp_param = NULL; /* load the constant color */

	interp_fs_input(ctx, input_index, decl->Semantic.Name,
			decl->Semantic.Index, shader->selector->info.num_inputs,
			shader->selector->info.colors_read, interp_param,
			LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
			LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
			&out[0]);
}
1505
1506 static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
1507 {
1508 return unpack_param(si_shader_context(&radeon_bld->soa.bld_base),
1509 SI_PARAM_ANCILLARY, 8, 4);
1510 }
1511
1512 /**
1513 * Set range metadata on an instruction. This can only be used on load and
1514 * call instructions. If you know an instruction can only produce the values
1515 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1516 * \p lo is the minimum value inclusive.
1517 * \p hi is the maximum value exclusive.
1518 */
1519 static void set_range_metadata(struct si_shader_context *ctx,
1520 LLVMValueRef value, unsigned lo, unsigned hi)
1521 {
1522 LLVMValueRef range_md, md_args[2];
1523 LLVMTypeRef type = LLVMTypeOf(value);
1524 LLVMContextRef context = LLVMGetTypeContext(type);
1525
1526 md_args[0] = LLVMConstInt(type, lo, false);
1527 md_args[1] = LLVMConstInt(type, hi, false);
1528 range_md = LLVMMDNodeInContext(context, md_args, 2);
1529 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1530 }
1531
/* Return the thread's index within the wave (0..63), with range metadata
 * attached so the optimizer knows the bound.
 */
static LLVMValueRef get_thread_id(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMValueRef tid;

	if (HAVE_LLVM < 0x0308) {
		/* Older LLVM only has the legacy tid intrinsic. */
		tid = lp_build_intrinsic(gallivm->builder, "llvm.SI.tid",
				ctx->i32,   NULL, 0, LLVMReadNoneAttribute);
	} else {
		/* mbcnt.lo counts set bits in the low 32 lanes below this
		 * lane; its result is fed as the accumulator into mbcnt.hi,
		 * which adds the high 32 lanes, yielding the lane index. */
		LLVMValueRef tid_args[2];
		tid_args[0] = lp_build_const_int32(gallivm, 0xffffffff);
		tid_args[1] = lp_build_const_int32(gallivm, 0);
		tid_args[1] = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.lo", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);

		tid = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.mbcnt.hi", ctx->i32,
					tid_args, 2, LLVMReadNoneAttribute);
	}
	/* Lane index is always in [0, 64). */
	set_range_metadata(ctx, tid, 0, 64);
	return tid;
}
1555
1556 /**
1557 * Load a dword from a constant buffer.
1558 */
1559 static LLVMValueRef buffer_load_const(struct si_shader_context *ctx,
1560 LLVMValueRef resource,
1561 LLVMValueRef offset)
1562 {
1563 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
1564 LLVMValueRef args[2] = {resource, offset};
1565
1566 return lp_build_intrinsic(builder, "llvm.SI.load.const", ctx->f32, args, 2,
1567 LLVMReadNoneAttribute);
1568 }
1569
/* Load the (x, y) position of the given sample from the driver's sample
 * position constant buffer and return it as a vec4 (x, y, 0, 0).
 */
static LLVMValueRef load_sample_position(struct radeon_llvm_context *radeon_bld, LLVMValueRef sample_id)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *uint_bld = &radeon_bld->soa.bld_base.uint_bld;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	LLVMValueRef desc = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef buf_index = lp_build_const_int32(gallivm, SI_PS_CONST_SAMPLE_POSITIONS);
	LLVMValueRef resource = build_indexed_load_const(ctx, desc, buf_index);

	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
	LLVMValueRef offset1 = LLVMBuildAdd(builder, offset0, lp_build_const_int32(gallivm, 4), "");

	LLVMValueRef pos[4] = {
		buffer_load_const(ctx, resource, offset0),
		buffer_load_const(ctx, resource, offset1),
		lp_build_const_float(gallivm, 0),
		lp_build_const_float(gallivm, 0)
	};

	return lp_build_gather_values(gallivm, pos, 4);
}
1594
/* Emit code that computes a TGSI system value and store it in
 * radeon_bld->system_values[index] for later fetches.
 */
static void declare_system_value(
	struct radeon_llvm_context *radeon_bld,
	unsigned index,
	const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct lp_build_context *bld = &radeon_bld->soa.bld_base.base;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;
	LLVMValueRef value = 0;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_INSTANCEID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_instance_id);
		break;

	case TGSI_SEMANTIC_VERTEXID:
		/* The VGPR holds the index relative to BASE_VERTEX;
		 * VERTEXID is the absolute index. */
		value = LLVMBuildAdd(gallivm->builder,
				     LLVMGetParam(radeon_bld->main_fn,
						  ctx->param_vertex_id),
				     LLVMGetParam(radeon_bld->main_fn,
						  SI_PARAM_BASE_VERTEX), "");
		break;

	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     ctx->param_vertex_id);
		break;

	case TGSI_SEMANTIC_BASEVERTEX:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_BASE_VERTEX);
		break;

	case TGSI_SEMANTIC_BASEINSTANCE:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_START_INSTANCE);
		break;

	case TGSI_SEMANTIC_DRAWID:
		value = LLVMGetParam(radeon_bld->main_fn,
				     SI_PARAM_DRAWID);
		break;

	case TGSI_SEMANTIC_INVOCATIONID:
		/* TCS: packed into bits 12:8 of REL_IDS; GS: its own SGPR. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
		else if (ctx->type == PIPE_SHADER_GEOMETRY)
			value = LLVMGetParam(radeon_bld->main_fn,
					     SI_PARAM_GS_INSTANCE_ID);
		else
			assert(!"INVOCATIONID not implemented");
		break;

	case TGSI_SEMANTIC_POSITION:
	{
		/* gl_FragCoord: w is delivered as 1/w, so reciprocate it. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Z_FLOAT),
			lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base, TGSI_OPCODE_RCP,
						 LLVMGetParam(radeon_bld->main_fn,
							      SI_PARAM_POS_W_FLOAT)),
		};
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_FACE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_FRONT_FACE);
		break;

	case TGSI_SEMANTIC_SAMPLEID:
		value = get_sample_id(radeon_bld);
		break;

	case TGSI_SEMANTIC_SAMPLEPOS: {
		/* The sample position within the pixel is the fractional
		 * part of the fragment position. */
		LLVMValueRef pos[4] = {
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_X_FLOAT),
			LLVMGetParam(radeon_bld->main_fn, SI_PARAM_POS_Y_FLOAT),
			lp_build_const_float(gallivm, 0),
			lp_build_const_float(gallivm, 0)
		};
		pos[0] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[0]);
		pos[1] = lp_build_emit_llvm_unary(&radeon_bld->soa.bld_base,
						  TGSI_OPCODE_FRC, pos[1]);
		value = lp_build_gather_values(gallivm, pos, 4);
		break;
	}

	case TGSI_SEMANTIC_SAMPLEMASK:
		/* This can only occur with the OpenGL Core profile, which
		 * doesn't support smoothing.
		 */
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_SAMPLE_COVERAGE);
		break;

	case TGSI_SEMANTIC_TESSCOORD:
	{
		LLVMValueRef coord[4] = {
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_u),
			LLVMGetParam(radeon_bld->main_fn, ctx->param_tes_v),
			bld->zero,
			bld->zero
		};

		/* For triangles, the vector should be (u, v, 1-u-v). */
		if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
		    PIPE_PRIM_TRIANGLES)
			coord[2] = lp_build_sub(bld, bld->one,
						lp_build_add(bld, coord[0], coord[1]));

		value = lp_build_gather_values(gallivm, coord, 4);
		break;
	}

	case TGSI_SEMANTIC_VERTICESIN:
		/* Patch vertex count is packed into a layout SGPR whose
		 * bit position differs between TCS and TES. */
		if (ctx->type == PIPE_SHADER_TESS_CTRL)
			value = unpack_param(ctx, SI_PARAM_TCS_OUT_LAYOUT, 26, 6);
		else if (ctx->type == PIPE_SHADER_TESS_EVAL)
			value = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 7);
		else
			assert(!"invalid shader stage for TGSI_SEMANTIC_VERTICESIN");
		break;

	case TGSI_SEMANTIC_TESSINNER:
	case TGSI_SEMANTIC_TESSOUTER:
	{
		/* Tess levels live in the off-chip tess ring buffer; load
		 * all four components (~0 swizzle = vec4). */
		LLVMValueRef rw_buffers, buffer, base, addr;
		int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0);

		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					SI_PARAM_RW_BUFFERS);
		buffer = build_indexed_load_const(ctx, rw_buffers,
		        lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

		base = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);
		addr = get_tcs_tes_buffer_address(ctx, NULL,
		                          lp_build_const_int32(gallivm, param));

		value = buffer_load(&radeon_bld->soa.bld_base, TGSI_TYPE_FLOAT,
		                    ~0, buffer, base, addr);

		break;
	}

	case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI:
	case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI:
	{
		/* Default tess levels (no TCS bound) come from a driver
		 * constant buffer: outer at dwords 0..3, inner at 4..7. */
		LLVMValueRef buf, slot, val[4];
		int i, offset;

		slot = lp_build_const_int32(gallivm, SI_HS_CONST_DEFAULT_TESS_LEVELS);
		buf = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
		buf = build_indexed_load_const(ctx, buf, slot);
		offset = decl->Semantic.Name == TGSI_SEMANTIC_DEFAULT_TESSINNER_SI ? 4 : 0;

		for (i = 0; i < 4; i++)
			val[i] = buffer_load_const(ctx, buf,
						   lp_build_const_int32(gallivm, (offset + i) * 4));
		value = lp_build_gather_values(gallivm, val, 4);
		break;
	}

	case TGSI_SEMANTIC_PRIMID:
		value = get_primitive_id(&radeon_bld->soa.bld_base, 0);
		break;

	case TGSI_SEMANTIC_GRID_SIZE:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE);
		break;

	case TGSI_SEMANTIC_BLOCK_SIZE:
	{
		/* Fixed block sizes are constant-folded; otherwise the size
		 * is passed in as a user SGPR. */
		LLVMValueRef values[3];
		unsigned i;
		unsigned *properties = ctx->shader->selector->info.properties;

		if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) {
			unsigned sizes[3] = {
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT],
				properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]
			};

			for (i = 0; i < 3; ++i)
				values[i] = lp_build_const_int32(gallivm, sizes[i]);

			value = lp_build_gather_values(gallivm, values, 3);
		} else {
			value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE);
		}
		break;
	}

	case TGSI_SEMANTIC_BLOCK_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID);
		break;

	case TGSI_SEMANTIC_THREAD_ID:
		value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID);
		break;

#if HAVE_LLVM >= 0x0309
	case TGSI_SEMANTIC_HELPER_INVOCATION:
		/* ps.live is true for non-helper lanes, so invert it and
		 * sign-extend to get a 0/~0 i32 boolean. */
		value = lp_build_intrinsic(gallivm->builder,
					   "llvm.amdgcn.ps.live",
					   ctx->i1, NULL, 0,
					   LLVMReadNoneAttribute);
		value = LLVMBuildNot(gallivm->builder, value, "");
		value = LLVMBuildSExt(gallivm->builder, value, ctx->i32, "");
		break;
#endif

	default:
		assert(!"unknown system value");
		return;
	}

	radeon_bld->system_values[index] = value;
}
1818
/* Declare the compute shader's shared (LDS) memory block as a global in the
 * local address space and keep an i8* to it in ctx->shared_memory.
 *
 * Only one shared-memory declaration per shader is supported.
 */
static void declare_compute_memory(struct radeon_llvm_context *radeon_bld,
                                   const struct tgsi_full_declaration *decl)
{
	struct si_shader_context *ctx =
		si_shader_context(&radeon_bld->soa.bld_base);
	struct si_shader_selector *sel = ctx->shader->selector;
	struct gallivm_state *gallivm = &radeon_bld->gallivm;

	LLVMTypeRef i8p = LLVMPointerType(ctx->i8, LOCAL_ADDR_SPACE);
	LLVMValueRef var;

	assert(decl->Declaration.MemType == TGSI_MEMORY_TYPE_SHARED);
	assert(decl->Range.First == decl->Range.Last);
	assert(!ctx->shared_memory);

	/* The size comes from the selector (sel->local_size bytes). */
	var = LLVMAddGlobalInAddressSpace(gallivm->module,
	                                  LLVMArrayType(ctx->i8, sel->local_size),
	                                  "compute_lds",
	                                  LOCAL_ADDR_SPACE);
	/* Dword alignment for the LDS block. */
	LLVMSetAlignment(var, 4);

	ctx->shared_memory = LLVMBuildBitCast(gallivm->builder, var, i8p, "");
}
1842
1843 static LLVMValueRef load_const_buffer_desc(struct si_shader_context *ctx, int i)
1844 {
1845 LLVMValueRef list_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
1846 SI_PARAM_CONST_BUFFERS);
1847
1848 return build_indexed_load_const(ctx, list_ptr,
1849 LLVMConstInt(ctx->i32, i, 0));
1850 }
1851
/* Fetch a value from a constant buffer, handling 2D (buffer index)
 * indirection, register indirection, vec4 fetches (LP_CHAN_ALL) and
 * 64-bit types.
 */
static LLVMValueRef fetch_constant(
	struct lp_build_tgsi_context *bld_base,
	const struct tgsi_full_src_register *reg,
	enum tgsi_opcode_type type,
	unsigned swizzle)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	const struct tgsi_ind_register *ireg = &reg->Indirect;
	unsigned buf, idx;

	LLVMValueRef addr, bufp;
	LLVMValueRef result;

	/* Gather all four channels for a vec4 fetch. */
	if (swizzle == LP_CHAN_ALL) {
		unsigned chan;
		LLVMValueRef values[4];
		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
			values[chan] = fetch_constant(bld_base, reg, type, chan);

		return lp_build_gather_values(bld_base->base.gallivm, values, 4);
	}

	buf = reg->Register.Dimension ? reg->Dimension.Index : 0;
	idx = reg->Register.Index * 4 + swizzle;

	/* Indirect buffer index: clamp and load the descriptor dynamically. */
	if (reg->Register.Dimension && reg->Dimension.Indirect) {
		LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_CONST_BUFFERS);
		LLVMValueRef index;
		index = get_bounded_indirect_index(ctx, &reg->DimIndirect,
						   reg->Dimension.Index,
						   SI_NUM_CONST_BUFFERS);
		bufp = build_indexed_load_const(ctx, ptr, index);
	} else
		bufp = load_const_buffer_desc(ctx, buf);

	if (reg->Register.Indirect) {
		/* Indirect register index: addr-reg value * 16 bytes per vec4
		 * plus the static dword offset (idx * 4 bytes). */
		addr = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
		addr = LLVMBuildLoad(base->gallivm->builder, addr, "load addr reg");
		addr = lp_build_mul_imm(&bld_base->uint_bld, addr, 16);
		addr = lp_build_add(&bld_base->uint_bld, addr,
		    lp_build_const_int32(base->gallivm, idx * 4));
	} else {
		addr = LLVMConstInt(ctx->i32, idx * 4, 0);
	}

	result = buffer_load_const(ctx, bufp, addr);

	if (!tgsi_type_is_64bit(type))
		result = bitcast(bld_base, type, result);
	else {
		/* 64-bit: load the adjacent dword and combine. */
		LLVMValueRef addr2, result2;

		addr2 = lp_build_add(&bld_base->uint_bld, addr,
				     LLVMConstInt(ctx->i32, 4, 0));
		result2 = buffer_load_const(ctx, bufp, addr2);

		result = radeon_llvm_emit_fetch_64bit(bld_base, type,
						      result, result2);
	}
	return result;
}
1914
1915 /* Upper 16 bits must be zero. */
1916 static LLVMValueRef si_llvm_pack_two_int16(struct gallivm_state *gallivm,
1917 LLVMValueRef val[2])
1918 {
1919 return LLVMBuildOr(gallivm->builder, val[0],
1920 LLVMBuildShl(gallivm->builder, val[1],
1921 lp_build_const_int32(gallivm, 16),
1922 ""), "");
1923 }
1924
1925 /* Upper 16 bits are ignored and will be dropped. */
1926 static LLVMValueRef si_llvm_pack_two_int32_as_int16(struct gallivm_state *gallivm,
1927 LLVMValueRef val[2])
1928 {
1929 LLVMValueRef v[2] = {
1930 LLVMBuildAnd(gallivm->builder, val[0],
1931 lp_build_const_int32(gallivm, 0xffff), ""),
1932 val[1],
1933 };
1934 return si_llvm_pack_two_int16(gallivm, v);
1935 }
1936
/* Initialize arguments for the shader export intrinsic (llvm.SI.export).
 *
 * The nine argument slots are:
 *   args[0]   component writemask
 *   args[1]   whether the EXEC mask represents the valid mask
 *   args[2]   whether this is the last export
 *   args[3]   export target
 *   args[4]   COMPR flag (set when two 16-bit values are packed per slot)
 *   args[5-8] the values to export
 *
 * For fragment shaders the values are converted according to the
 * per-MRT SPI_SHADER_COL_FORMAT from the shader key; all other shader
 * types keep the default SPI_SHADER_32_ABGR layout (raw 32-bit values).
 */
static void si_llvm_init_export_args(struct lp_build_tgsi_context *bld_base,
				     LLVMValueRef *values,
				     unsigned target,
				     LLVMValueRef *args)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint =
				&ctx->radeon_bld.soa.bld_base.uint_bld;
	struct lp_build_context *base = &bld_base->base;
	struct gallivm_state *gallivm = base->gallivm;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef val[4];
	unsigned spi_shader_col_format = V_028714_SPI_SHADER_32_ABGR;
	unsigned chan;
	bool is_int8;

	/* Default is 0xf. Adjusted below depending on the format. */
	args[0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */

	/* Specify whether the EXEC mask represents the valid mask */
	args[1] = uint->zero;

	/* Specify whether this is the last export */
	args[2] = uint->zero;

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, target);

	if (ctx->type == PIPE_SHADER_FRAGMENT) {
		const union si_shader_key *key = &ctx->shader->key;
		unsigned col_formats = key->ps.epilog.spi_shader_col_format;
		/* Each MRT uses a 4-bit field in spi_shader_col_format. */
		int cbuf = target - V_008DFC_SQ_EXP_MRT;

		assert(cbuf >= 0 && cbuf < 8);
		spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
		/* NOTE: is_int8 stays uninitialized for non-fragment shaders,
		 * but it is only read below in the UINT16/SINT16 cases, which
		 * are only selected by the fragment-shader key. */
		is_int8 = (key->ps.epilog.color_is_int8 >> cbuf) & 0x1;
	}

	args[4] = uint->zero; /* COMPR flag */
	args[5] = base->undef;
	args[6] = base->undef;
	args[7] = base->undef;
	args[8] = base->undef;

	switch (spi_shader_col_format) {
	case V_028714_SPI_SHADER_ZERO:
		/* Nothing to write: redirect the export to the NULL target. */
		args[0] = uint->zero; /* writemask */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
		break;

	case V_028714_SPI_SHADER_32_R:
		args[0] = uint->one; /* writemask */
		args[5] = values[0];
		break;

	case V_028714_SPI_SHADER_32_GR:
		args[0] = lp_build_const_int32(base->gallivm, 0x3); /* writemask */
		args[5] = values[0];
		args[6] = values[1];
		break;

	case V_028714_SPI_SHADER_32_AR:
		/* Export R in slot X and A in slot W (writemask 0b1001). */
		args[0] = lp_build_const_int32(base->gallivm, 0x9); /* writemask */
		args[5] = values[0];
		args[8] = values[3];
		break;

	case V_028714_SPI_SHADER_FP16_ABGR:
		args[4] = uint->one; /* COMPR flag */

		/* Pack pairs of f32 channels into f16x2 via llvm.SI.packf16. */
		for (chan = 0; chan < 2; chan++) {
			LLVMValueRef pack_args[2] = {
				values[2 * chan],
				values[2 * chan + 1]
			};
			LLVMValueRef packed;

			packed = lp_build_intrinsic(base->gallivm->builder,
						    "llvm.SI.packf16",
						    ctx->i32, pack_args, 2,
						    LLVMReadNoneAttribute);
			args[chan + 5] =
				LLVMBuildBitCast(base->gallivm->builder,
						 packed, ctx->f32, "");
		}
		break;

	case V_028714_SPI_SHADER_UNORM16_ABGR:
		/* Clamp to [0, 1], scale to [0, 65535], round to nearest by
		 * adding 0.5 before truncation. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = radeon_llvm_saturate(bld_base, values[chan]);
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 65535), "");
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  lp_build_const_float(gallivm, 0.5), "");
			val[chan] = LLVMBuildFPToUI(builder, val[chan],
						    ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_SNORM16_ABGR:
		for (chan = 0; chan < 4; chan++) {
			/* Clamp between [-1, 1]. */
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MIN,
							      values[chan],
							      lp_build_const_float(gallivm, 1));
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_MAX,
							      val[chan],
							      lp_build_const_float(gallivm, -1));
			/* Convert to a signed integer in [-32767, 32767]. */
			val[chan] = LLVMBuildFMul(builder, val[chan],
						  lp_build_const_float(gallivm, 32767), "");
			/* If positive, add 0.5, else add -0.5. */
			val[chan] = LLVMBuildFAdd(builder, val[chan],
						  LLVMBuildSelect(builder,
								  LLVMBuildFCmp(builder, LLVMRealOGE,
										val[chan], base->zero, ""),
								  lp_build_const_float(gallivm, 0.5),
								  lp_build_const_float(gallivm, -0.5), ""), "");
			val[chan] = LLVMBuildFPToSI(builder, val[chan], ctx->i32, "");
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;

	case V_028714_SPI_SHADER_UINT16_ABGR: {
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							255 : 65535);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_UMIN,
							      val[chan], max);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_SINT16_ABGR: {
		LLVMValueRef max = lp_build_const_int32(gallivm, is_int8 ?
							127 : 32767);
		LLVMValueRef min = lp_build_const_int32(gallivm, is_int8 ?
							-128 : -32768);
		/* Clamp. */
		for (chan = 0; chan < 4; chan++) {
			val[chan] = bitcast(bld_base, TGSI_TYPE_UNSIGNED, values[chan]);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMIN,
							      val[chan], max);
			val[chan] = lp_build_emit_llvm_binary(bld_base,
							      TGSI_OPCODE_IMAX,
							      val[chan], min);
		}

		args[4] = uint->one; /* COMPR flag */
		args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val));
		args[6] = bitcast(bld_base, TGSI_TYPE_FLOAT,
				  si_llvm_pack_two_int32_as_int16(gallivm, val+2));
		break;
	}

	case V_028714_SPI_SHADER_32_ABGR:
		/* Raw 32-bit export: copy all four values through unchanged. */
		memcpy(&args[5], values, sizeof(values[0]) * 4);
		break;
	}
}
2119
2120 static void si_alpha_test(struct lp_build_tgsi_context *bld_base,
2121 LLVMValueRef alpha)
2122 {
2123 struct si_shader_context *ctx = si_shader_context(bld_base);
2124 struct gallivm_state *gallivm = bld_base->base.gallivm;
2125
2126 if (ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER) {
2127 LLVMValueRef alpha_ref = LLVMGetParam(ctx->radeon_bld.main_fn,
2128 SI_PARAM_ALPHA_REF);
2129
2130 LLVMValueRef alpha_pass =
2131 lp_build_cmp(&bld_base->base,
2132 ctx->shader->key.ps.epilog.alpha_func,
2133 alpha, alpha_ref);
2134 LLVMValueRef arg =
2135 lp_build_select(&bld_base->base,
2136 alpha_pass,
2137 lp_build_const_float(gallivm, 1.0f),
2138 lp_build_const_float(gallivm, -1.0f));
2139
2140 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
2141 ctx->voidt, &arg, 1, 0);
2142 } else {
2143 lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kilp",
2144 ctx->voidt, NULL, 0, 0);
2145 }
2146 }
2147
2148 static LLVMValueRef si_scale_alpha_by_sample_mask(struct lp_build_tgsi_context *bld_base,
2149 LLVMValueRef alpha,
2150 unsigned samplemask_param)
2151 {
2152 struct si_shader_context *ctx = si_shader_context(bld_base);
2153 struct gallivm_state *gallivm = bld_base->base.gallivm;
2154 LLVMValueRef coverage;
2155
2156 /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
2157 coverage = LLVMGetParam(ctx->radeon_bld.main_fn,
2158 samplemask_param);
2159 coverage = bitcast(bld_base, TGSI_TYPE_SIGNED, coverage);
2160
2161 coverage = lp_build_intrinsic(gallivm->builder, "llvm.ctpop.i32",
2162 ctx->i32,
2163 &coverage, 1, LLVMReadNoneAttribute);
2164
2165 coverage = LLVMBuildUIToFP(gallivm->builder, coverage,
2166 ctx->f32, "");
2167
2168 coverage = LLVMBuildFMul(gallivm->builder, coverage,
2169 lp_build_const_float(gallivm,
2170 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
2171
2172 return LLVMBuildFMul(gallivm->builder, alpha, coverage, "");
2173 }
2174
/* Compute the eight user clip distances from a CLIPVERTEX output.
 *
 * Each clip distance is the dot product of the clip vertex (out_elts)
 * with one user clip plane read from the SI_VS_CONST_CLIP_PLANES
 * constant buffer. The results fill the two clip-distance position
 * exports pos[2] and pos[3] (targets POS+2 and POS+3), four distances
 * per export.
 *
 * NOTE: args[1] is temporarily used as the constant-buffer byte offset
 * inside the loop and is reset to its real meaning (EXEC-mask flag)
 * afterwards.
 */
static void si_llvm_emit_clipvertex(struct lp_build_tgsi_context *bld_base,
				    LLVMValueRef (*pos)[9], LLVMValueRef *out_elts)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &ctx->radeon_bld.soa.bld_base.uint_bld;
	unsigned reg_index;
	unsigned chan;
	unsigned const_chan;
	LLVMValueRef base_elt;
	LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	LLVMValueRef constbuf_index = lp_build_const_int32(base->gallivm,
							   SI_VS_CONST_CLIP_PLANES);
	LLVMValueRef const_resource = build_indexed_load_const(ctx, ptr, constbuf_index);

	for (reg_index = 0; reg_index < 2; reg_index ++) {
		LLVMValueRef *args = pos[2 + reg_index];

		/* Start the four dot products at 0. */
		args[5] =
		args[6] =
		args[7] =
		args[8] = lp_build_const_float(base->gallivm, 0.0f);

		/* Compute dot products of position and user clip plane vectors */
		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
			for (const_chan = 0; const_chan < TGSI_NUM_CHANNELS; const_chan++) {
				/* Byte offset of plane (reg_index*4 + chan),
				 * component const_chan, 4 bytes each. */
				args[1] = lp_build_const_int32(base->gallivm,
							       ((reg_index * 4 + chan) * 4 +
								const_chan) * 4);
				base_elt = buffer_load_const(ctx, const_resource,
							     args[1]);
				args[5 + chan] =
					lp_build_add(base, args[5 + chan],
						     lp_build_mul(base, base_elt,
								  out_elts[const_chan]));
			}
		}

		/* Fill in the remaining export arguments. */
		args[0] = lp_build_const_int32(base->gallivm, 0xf);
		args[1] = uint->zero;
		args[2] = uint->zero;
		args[3] = lp_build_const_int32(base->gallivm,
					       V_008DFC_SQ_EXP_POS + 2 + reg_index);
		args[4] = uint->zero;
	}
}
2221
2222 static void si_dump_streamout(struct pipe_stream_output_info *so)
2223 {
2224 unsigned i;
2225
2226 if (so->num_outputs)
2227 fprintf(stderr, "STREAMOUT\n");
2228
2229 for (i = 0; i < so->num_outputs; i++) {
2230 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
2231 so->output[i].start_component;
2232 fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
2233 i, so->output[i].output_buffer,
2234 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
2235 so->output[i].register_index,
2236 mask & 1 ? "x" : "",
2237 mask & 2 ? "y" : "",
2238 mask & 4 ? "z" : "",
2239 mask & 8 ? "w" : "");
2240 }
2241 }
2242
/* On SI, the vertex shader is responsible for writing streamout data
 * to buffers.
 *
 * Loads the streamout buffer descriptors from SI_PARAM_RW_BUFFERS,
 * then — only for threads allowed to emit (tid < so_vtx_count from the
 * streamout-config SGPR) — packs each enabled output into an i32
 * vector and stores it to its buffer at
 *   streamout_offset*4 + (write_index + tid)*stride + dst_offset*4.
 * Outputs are additionally gated on matching the stream id from the
 * same config SGPR.
 */
static void si_llvm_emit_streamout(struct si_shader_context *ctx,
				   struct si_shader_output_values *outputs,
				   unsigned noutput)
{
	struct pipe_stream_output_info *so = &ctx->shader->selector->so;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	int i, j;
	struct lp_build_if_state if_ctx;
	LLVMValueRef so_buffers[4];
	LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_RW_BUFFERS);

	/* Load the descriptors. */
	for (i = 0; i < 4; ++i) {
		if (ctx->shader->selector->so.stride[i]) {
			LLVMValueRef offset = lp_build_const_int32(gallivm,
								   SI_VS_STREAMOUT_BUF0 + i);

			so_buffers[i] = build_indexed_load_const(ctx, buf_ptr, offset);
		}
	}

	/* Get bits [22:16], i.e. (so_param >> 16) & 127; */
	LLVMValueRef so_vtx_count =
		unpack_param(ctx, ctx->param_streamout_config, 16, 7);

	LLVMValueRef tid = get_thread_id(ctx);

	/* can_emit = tid < so_vtx_count; */
	LLVMValueRef can_emit =
		LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");

	/* Stream id is in bits [25:24] of the streamout config SGPR. */
	LLVMValueRef stream_id =
		unpack_param(ctx, ctx->param_streamout_config, 24, 2);

	/* Emit the streamout code conditionally. This actually avoids
	 * out-of-bounds buffer access. The hw tells us via the SGPR
	 * (so_vtx_count) which threads are allowed to emit streamout data. */
	lp_build_if(&if_ctx, gallivm, can_emit);
	{
		/* The buffer offset is computed as follows:
		 *   ByteOffset = streamout_offset[buffer_id]*4 +
		 *                (streamout_write_index + thread_id)*stride[buffer_id] +
		 *                attrib_offset
                 */

		LLVMValueRef so_write_index =
			LLVMGetParam(ctx->radeon_bld.main_fn,
				     ctx->param_streamout_write_index);

		/* Compute (streamout_write_index + thread_id). */
		so_write_index = LLVMBuildAdd(builder, so_write_index, tid, "");

		/* Compute the write offset for each enabled buffer. */
		LLVMValueRef so_write_offset[4] = {};
		for (i = 0; i < 4; i++) {
			if (!so->stride[i])
				continue;

			LLVMValueRef so_offset = LLVMGetParam(ctx->radeon_bld.main_fn,
							      ctx->param_streamout_offset[i]);
			so_offset = LLVMBuildMul(builder, so_offset, LLVMConstInt(ctx->i32, 4, 0), "");

			so_write_offset[i] = LLVMBuildMul(builder, so_write_index,
							  LLVMConstInt(ctx->i32, so->stride[i]*4, 0), "");
			so_write_offset[i] = LLVMBuildAdd(builder, so_write_offset[i], so_offset, "");
		}

		/* Write streamout data. */
		for (i = 0; i < so->num_outputs; i++) {
			unsigned buf_idx = so->output[i].output_buffer;
			unsigned reg = so->output[i].register_index;
			unsigned start = so->output[i].start_component;
			unsigned num_comps = so->output[i].num_components;
			unsigned stream = so->output[i].stream;
			LLVMValueRef out[4];
			struct lp_build_if_state if_ctx_stream;

			/* Skip malformed or out-of-range outputs defensively. */
			assert(num_comps && num_comps <= 4);
			if (!num_comps || num_comps > 4)
				continue;

			if (reg >= noutput)
				continue;

			/* Load the output as int. */
			for (j = 0; j < num_comps; j++) {
				out[j] = LLVMBuildBitCast(builder,
							  outputs[reg].values[start+j],
							  ctx->i32, "");
			}

			/* Pack the output. */
			LLVMValueRef vdata = NULL;

			switch (num_comps) {
			case 1: /* as i32 */
				vdata = out[0];
				break;
			case 2: /* as v2i32 */
			case 3: /* as v4i32 (aligned to 4) */
			case 4: /* as v4i32 */
				vdata = LLVMGetUndef(LLVMVectorType(ctx->i32, util_next_power_of_two(num_comps)));
				for (j = 0; j < num_comps; j++) {
					vdata = LLVMBuildInsertElement(builder, vdata, out[j],
								       LLVMConstInt(ctx->i32, j, 0), "");
				}
				break;
			}

			/* Only store if this output belongs to the currently
			 * selected vertex stream. */
			LLVMValueRef can_emit_stream =
				LLVMBuildICmp(builder, LLVMIntEQ,
					      stream_id,
					      lp_build_const_int32(gallivm, stream), "");

			lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
			build_tbuffer_store_dwords(ctx, so_buffers[buf_idx],
						   vdata, num_comps,
						   so_write_offset[buf_idx],
						   LLVMConstInt(ctx->i32, 0, 0),
						   so->output[i].dst_offset*4);
			lp_build_endif(&if_ctx_stream);
		}
	}
	lp_build_endif(&if_ctx);
}
2372
2373
/* Generate export instructions for hardware VS shader stage.
 *
 * Emits streamout stores (if enabled), one parameter export per
 * generic/color output, the misc vector (point size, edge flag, layer,
 * viewport index) when any of those are written, and the position
 * exports. Position exports are buffered in pos_args first so that the
 * last one can be flagged as "done" and pos0 can be synthesized if the
 * shader never wrote a position.
 *
 * Also records nr_param_exports / nr_pos_exports /
 * vs_output_param_offset in shader->info for later state setup.
 */
static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base,
			      struct si_shader_output_values *outputs,
			      unsigned noutput)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint =
		&ctx->radeon_bld.soa.bld_base.uint_bld;
	LLVMValueRef args[9];
	LLVMValueRef pos_args[4][9] = { { 0 } };
	LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL, viewport_index_value = NULL;
	unsigned semantic_name, semantic_index;
	unsigned target;
	unsigned param_count = 0;
	unsigned pos_idx;
	int i;

	if (outputs && ctx->shader->selector->so.num_outputs) {
		si_llvm_emit_streamout(ctx, outputs, noutput);
	}

	for (i = 0; i < noutput; i++) {
		semantic_name = outputs[i].name;
		semantic_index = outputs[i].sid;

handle_semantic:
		/* Select the correct target */
		switch(semantic_name) {
		case TGSI_SEMANTIC_PSIZE:
			/* Saved for the misc vector below; no export here. */
			psize_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_EDGEFLAG:
			edgeflag_value = outputs[i].values[0];
			continue;
		case TGSI_SEMANTIC_LAYER:
			/* Saved for the misc vector, then also exported as a
			 * generic parameter (goto re-runs the switch). */
			layer_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			viewport_index_value = outputs[i].values[0];
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		case TGSI_SEMANTIC_POSITION:
			target = V_008DFC_SQ_EXP_POS;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			target = V_008DFC_SQ_EXP_POS + 2 + semantic_index;
			break;
		case TGSI_SEMANTIC_CLIPVERTEX:
			/* Expanded into clip-distance position exports. */
			si_llvm_emit_clipvertex(bld_base, pos_args, outputs[i].values);
			continue;
		case TGSI_SEMANTIC_PRIMID:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
		case TGSI_SEMANTIC_GENERIC:
			target = V_008DFC_SQ_EXP_PARAM + param_count;
			assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
			shader->info.vs_output_param_offset[i] = param_count;
			param_count++;
			break;
		default:
			target = 0;
			fprintf(stderr,
				"Warning: SI unhandled vs output type:%d\n",
				semantic_name);
		}

		si_llvm_init_export_args(bld_base, outputs[i].values, target, args);

		if (target >= V_008DFC_SQ_EXP_POS &&
		    target <= (V_008DFC_SQ_EXP_POS + 3)) {
			/* Buffer position exports; emitted together at the end. */
			memcpy(pos_args[target - V_008DFC_SQ_EXP_POS],
			       args, sizeof(args));
		} else {
			lp_build_intrinsic(base->gallivm->builder,
					   "llvm.SI.export", ctx->voidt,
					   args, 9, 0);
		}

		if (semantic_name == TGSI_SEMANTIC_CLIPDIST) {
			/* Clip distances are also exported as parameters. */
			semantic_name = TGSI_SEMANTIC_GENERIC;
			goto handle_semantic;
		}
	}

	shader->info.nr_param_exports = param_count;

	/* We need to add the position output manually if it's missing. */
	if (!pos_args[0][0]) {
		pos_args[0][0] = lp_build_const_int32(base->gallivm, 0xf); /* writemask */
		pos_args[0][1] = uint->zero; /* EXEC mask */
		pos_args[0][2] = uint->zero; /* last export? */
		pos_args[0][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS);
		pos_args[0][4] = uint->zero; /* COMPR flag */
		pos_args[0][5] = base->zero; /* X */
		pos_args[0][6] = base->zero; /* Y */
		pos_args[0][7] = base->zero; /* Z */
		pos_args[0][8] = base->one;  /* W */
	}

	/* Write the misc vector (point size, edgeflag, layer, viewport). */
	if (shader->selector->info.writes_psize ||
	    shader->selector->info.writes_edgeflag ||
	    shader->selector->info.writes_viewport_index ||
	    shader->selector->info.writes_layer) {
		pos_args[1][0] = lp_build_const_int32(base->gallivm, /* writemask */
						      shader->selector->info.writes_psize |
						      (shader->selector->info.writes_edgeflag << 1) |
						      (shader->selector->info.writes_layer << 2) |
						      (shader->selector->info.writes_viewport_index << 3));
		pos_args[1][1] = uint->zero; /* EXEC mask */
		pos_args[1][2] = uint->zero; /* last export? */
		pos_args[1][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + 1);
		pos_args[1][4] = uint->zero; /* COMPR flag */
		pos_args[1][5] = base->zero; /* X */
		pos_args[1][6] = base->zero; /* Y */
		pos_args[1][7] = base->zero; /* Z */
		pos_args[1][8] = base->zero; /* W */

		if (shader->selector->info.writes_psize)
			pos_args[1][5] = psize_value;

		if (shader->selector->info.writes_edgeflag) {
			/* The output is a float, but the hw expects an integer
			 * with the first bit containing the edge flag. */
			edgeflag_value = LLVMBuildFPToUI(base->gallivm->builder,
							 edgeflag_value,
							 ctx->i32, "");
			edgeflag_value = lp_build_min(&bld_base->int_bld,
						      edgeflag_value,
						      bld_base->int_bld.one);

			/* The LLVM intrinsic expects a float. */
			pos_args[1][6] = LLVMBuildBitCast(base->gallivm->builder,
							  edgeflag_value,
							  ctx->f32, "");
		}

		if (shader->selector->info.writes_layer)
			pos_args[1][7] = layer_value;

		if (shader->selector->info.writes_viewport_index)
			pos_args[1][8] = viewport_index_value;
	}

	for (i = 0; i < 4; i++)
		if (pos_args[i][0])
			shader->info.nr_pos_exports++;

	pos_idx = 0;
	for (i = 0; i < 4; i++) {
		if (!pos_args[i][0])
			continue;

		/* Specify the target we are exporting */
		pos_args[i][3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_POS + pos_idx++);

		if (pos_idx == shader->info.nr_pos_exports)
			/* Specify that this is the last export */
			pos_args[i][2] = uint->one;

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   ctx->voidt, pos_args[i], 9, 0);
	}
}
2548
/* Copy TCS inputs from LDS to the off-chip tessellation ring buffer.
 *
 * For the invocation's vertex, every input selected by the shader-key
 * bitmask (tcs.epilog.inputs_to_copy) is loaded as a 4-dword vector
 * from LDS and stored to the SI_HS_RING_TESS_OFFCHIP buffer at the
 * address computed by get_tcs_tes_buffer_address(), so the TES can
 * read it later.
 */
static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef invocation_id, rw_buffers, buffer, buffer_offset;
	LLVMValueRef lds_vertex_stride, lds_vertex_offset, lds_base;
	uint64_t inputs;

	/* Invocation id is in bits [12:8] of SI_PARAM_REL_IDS. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);

	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP));

	buffer_offset = LLVMGetParam(ctx->radeon_bld.main_fn, ctx->param_oc_lds);

	/* LDS address of this invocation's vertex:
	 * current patch base + invocation_id * per-vertex dword stride
	 * (stride is in bits [20:13] of SI_PARAM_TCS_IN_LAYOUT). */
	lds_vertex_stride = unpack_param(ctx, SI_PARAM_TCS_IN_LAYOUT, 13, 8);
	lds_vertex_offset = LLVMBuildMul(gallivm->builder, invocation_id,
	                                 lds_vertex_stride, "");
	lds_base = get_tcs_in_current_patch_offset(ctx);
	lds_base = LLVMBuildAdd(gallivm->builder, lds_base, lds_vertex_offset, "");

	/* Iterate over the set bits of the inputs_to_copy mask. */
	inputs = ctx->shader->key.tcs.epilog.inputs_to_copy;
	while (inputs) {
		unsigned i = u_bit_scan64(&inputs);

		LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base,
		                       lp_build_const_int32(gallivm, 4 * i),
		                       "");

		LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx,
		                                      invocation_id,
		                                      lp_build_const_int32(gallivm, i));

		/* swizzle ~0 loads all four components. */
		LLVMValueRef value = lds_load(bld_base, TGSI_TYPE_SIGNED, ~0,
		                              lds_ptr);

		build_tbuffer_store_dwords(ctx, buffer, value, 4, buffer_addr,
		                           buffer_offset, 0);
	}
}
2590
/* Write the tessellation factors for the current patch to the
 * tess-factor ring buffer.
 *
 * After a barrier, only invocation 0 of the patch reads the
 * TESSINNER/TESSOUTER levels from LDS (any invocation may have written
 * them) and stores them to SI_HS_RING_TESS_FACTOR. The layout per
 * patch depends on the primitive mode: 2 dwords for lines, 4 for
 * triangles, 6 for quads. Patch 0 additionally writes the dynamic HS
 * control word (0x80000000) at offset 0.
 */
static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
				  LLVMValueRef rel_patch_id,
				  LLVMValueRef invocation_id,
				  LLVMValueRef tcs_out_current_patch_data_offset)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	unsigned tess_inner_index, tess_outer_index;
	LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
	LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base;
	unsigned stride, outer_comps, inner_comps, i;
	struct lp_build_if_state if_ctx, inner_if_ctx;

	/* Make sure all LDS writes from every invocation are visible. */
	si_llvm_emit_barrier(NULL, bld_base, NULL);

	/* Do this only for invocation 0, because the tess levels are per-patch,
	 * not per-vertex.
	 *
	 * This can't jump, because invocation 0 executes this. It should
	 * at least mask out the loads and stores for other invocations.
	 */
	lp_build_if(&if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  invocation_id, bld_base->uint_bld.zero, ""));

	/* Determine the layout of one tess factor element in the buffer. */
	switch (shader->key.tcs.epilog.prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 2; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 4; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		break;
	case PIPE_PRIM_QUADS:
		stride = 6; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		break;
	default:
		assert(0);
		return;
	}

	/* Load tess_inner and tess_outer from LDS.
	 * Any invocation can write them, so we can't get them from a temporary.
	 */
	tess_inner_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSINNER, 0);
	tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0);

	lds_base = tcs_out_current_patch_data_offset;
	lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_inner_index * 4), "");
	lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
				 lp_build_const_int32(gallivm,
						      tess_outer_index * 4), "");

	/* Outer factors first, then inner, packed contiguously. */
	for (i = 0; i < outer_comps; i++)
		out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer);
	for (i = 0; i < inner_comps; i++)
		out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner);

	/* Convert the outputs to vectors for stores. */
	vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
	vec1 = NULL;

	if (stride > 4)
		vec1 = lp_build_gather_values(gallivm, out+4, stride - 4);

	/* Get the buffer. */
	rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
				  SI_PARAM_RW_BUFFERS);
	buffer = build_indexed_load_const(ctx, rw_buffers,
			lp_build_const_int32(gallivm, SI_HS_RING_TESS_FACTOR));

	/* Get the offset. */
	tf_base = LLVMGetParam(ctx->radeon_bld.main_fn,
			       SI_PARAM_TESS_FACTOR_OFFSET);
	byteoffset = LLVMBuildMul(gallivm->builder, rel_patch_id,
				  lp_build_const_int32(gallivm, 4 * stride), "");

	/* Only the thread handling patch 0 writes the control word. */
	lp_build_if(&inner_if_ctx, gallivm,
		    LLVMBuildICmp(gallivm->builder, LLVMIntEQ,
				  rel_patch_id, bld_base->uint_bld.zero, ""));

	/* Store the dynamic HS control word. */
	build_tbuffer_store_dwords(ctx, buffer,
				   lp_build_const_int32(gallivm, 0x80000000),
				   1, lp_build_const_int32(gallivm, 0), tf_base, 0);

	lp_build_endif(&inner_if_ctx);

	/* Store the tessellation factors. */
	build_tbuffer_store_dwords(ctx, buffer, vec0,
				   MIN2(stride, 4), byteoffset, tf_base, 4);
	if (vec1)
		build_tbuffer_store_dwords(ctx, buffer, vec1,
					   stride - 4, byteoffset, tf_base, 20);
	lp_build_endif(&if_ctx);
}
2696
/* This only writes the tessellation factor levels.
 *
 * For monolithic shaders, the tess factors (and the key-selected TCS
 * inputs) are written directly here. For non-monolithic shaders, the
 * values a separately-compiled epilog needs — the RW_BUFFERS pointer
 * split into two i32s, the tess-factor buffer soffset, and the three
 * VGPRs (rel_patch_id, invocation_id, tf_lds_offset) — are packed into
 * ctx->return_value instead.
 */
static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

	rel_patch_id = get_rel_patch_id(ctx);
	/* Invocation id is in bits [12:8] of SI_PARAM_REL_IDS. */
	invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5);
	tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

	if (!ctx->is_monolithic) {
		/* Return epilog parameters from this function. */
		LLVMBuilderRef builder = bld_base->base.gallivm->builder;
		LLVMValueRef ret = ctx->return_value;
		LLVMValueRef rw_buffers, rw0, rw1, tf_soffset;
		unsigned vgpr;

		/* RW_BUFFERS pointer */
		rw_buffers = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_RW_BUFFERS);
		/* Split the 64-bit pointer into two i32 return slots. */
		rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, "");
		rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, "");
		rw0 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.zero, "");
		rw1 = LLVMBuildExtractElement(builder, rw_buffers,
					      bld_base->uint_bld.one, "");
		ret = LLVMBuildInsertValue(builder, ret, rw0, 0, "");
		ret = LLVMBuildInsertValue(builder, ret, rw1, 1, "");

		/* Tess factor buffer soffset is after user SGPRs. */
		tf_soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					  SI_PARAM_TESS_FACTOR_OFFSET);
		ret = LLVMBuildInsertValue(builder, ret, tf_soffset,
					   SI_TCS_NUM_USER_SGPR + 1, "");

		/* VGPRs */
		/* VGPR return slots are float-typed, hence the bitcasts. */
		rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id);
		invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id);
		tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset);

		vgpr = SI_TCS_NUM_USER_SGPR + 2;
		ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
		ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
		ctx->return_value = ret;
		return;
	}

	si_copy_tcs_inputs(bld_base);
	si_write_tess_factors(bld_base, rel_patch_id, invocation_id, tf_lds_offset);
}
2748
2749 static void si_llvm_emit_ls_epilogue(struct lp_build_tgsi_context *bld_base)
2750 {
2751 struct si_shader_context *ctx = si_shader_context(bld_base);
2752 struct si_shader *shader = ctx->shader;
2753 struct tgsi_shader_info *info = &shader->selector->info;
2754 struct gallivm_state *gallivm = bld_base->base.gallivm;
2755 unsigned i, chan;
2756 LLVMValueRef vertex_id = LLVMGetParam(ctx->radeon_bld.main_fn,
2757 ctx->param_rel_auto_id);
2758 LLVMValueRef vertex_dw_stride =
2759 unpack_param(ctx, SI_PARAM_LS_OUT_LAYOUT, 13, 8);
2760 LLVMValueRef base_dw_addr = LLVMBuildMul(gallivm->builder, vertex_id,
2761 vertex_dw_stride, "");
2762
2763 /* Write outputs to LDS. The next shader (TCS aka HS) will read
2764 * its inputs from it. */
2765 for (i = 0; i < info->num_outputs; i++) {
2766 LLVMValueRef *out_ptr = ctx->radeon_bld.soa.outputs[i];
2767 unsigned name = info->output_semantic_name[i];
2768 unsigned index = info->output_semantic_index[i];
2769 int param = si_shader_io_get_unique_index(name, index);
2770 LLVMValueRef dw_addr = LLVMBuildAdd(gallivm->builder, base_dw_addr,
2771 lp_build_const_int32(gallivm, param * 4), "");
2772
2773 for (chan = 0; chan < 4; chan++) {
2774 lds_store(bld_base, chan, dw_addr,
2775 LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""));
2776 }
2777 }
2778 }
2779
2780 static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context *bld_base)
2781 {
2782 struct si_shader_context *ctx = si_shader_context(bld_base);
2783 struct gallivm_state *gallivm = bld_base->base.gallivm;
2784 struct si_shader *es = ctx->shader;
2785 struct tgsi_shader_info *info = &es->selector->info;
2786 LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
2787 ctx->param_es2gs_offset);
2788 unsigned chan;
2789 int i;
2790
2791 for (i = 0; i < info->num_outputs; i++) {
2792 LLVMValueRef *out_ptr =
2793 ctx->radeon_bld.soa.outputs[i];
2794 int param_index;
2795
2796 if (info->output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX ||
2797 info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
2798 continue;
2799
2800 param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
2801 info->output_semantic_index[i]);
2802
2803 for (chan = 0; chan < 4; chan++) {
2804 LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
2805 out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
2806
2807 build_tbuffer_store(ctx,
2808 ctx->esgs_ring,
2809 out_val, 1,
2810 LLVMGetUndef(ctx->i32), soffset,
2811 (4 * param_index + chan) * 4,
2812 V_008F0C_BUF_DATA_FORMAT_32,
2813 V_008F0C_BUF_NUM_FORMAT_UINT,
2814 0, 0, 1, 1, 0);
2815 }
2816 }
2817 }
2818
2819 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
2820 {
2821 struct si_shader_context *ctx = si_shader_context(bld_base);
2822 struct gallivm_state *gallivm = bld_base->base.gallivm;
2823 LLVMValueRef args[2];
2824
2825 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_NOP | SENDMSG_GS_DONE);
2826 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
2827 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
2828 ctx->voidt, args, 2, 0);
2829 }
2830
2831 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
2832 {
2833 struct si_shader_context *ctx = si_shader_context(bld_base);
2834 struct gallivm_state *gallivm = bld_base->base.gallivm;
2835 struct tgsi_shader_info *info = &ctx->shader->selector->info;
2836 struct si_shader_output_values *outputs = NULL;
2837 int i,j;
2838
2839 assert(!ctx->is_gs_copy_shader);
2840
2841 outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0]));
2842
2843 /* Vertex color clamping.
2844 *
2845 * This uses a state constant loaded in a user data SGPR and
2846 * an IF statement is added that clamps all colors if the constant
2847 * is true.
2848 */
2849 if (ctx->type == PIPE_SHADER_VERTEX) {
2850 struct lp_build_if_state if_ctx;
2851 LLVMValueRef cond = NULL;
2852 LLVMValueRef addr, val;
2853
2854 for (i = 0; i < info->num_outputs; i++) {
2855 if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
2856 info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
2857 continue;
2858
2859 /* We've found a color. */
2860 if (!cond) {
2861 /* The state is in the first bit of the user SGPR. */
2862 cond = LLVMGetParam(ctx->radeon_bld.main_fn,
2863 SI_PARAM_VS_STATE_BITS);
2864 cond = LLVMBuildTrunc(gallivm->builder, cond,
2865 ctx->i1, "");
2866 lp_build_if(&if_ctx, gallivm, cond);
2867 }
2868
2869 for (j = 0; j < 4; j++) {
2870 addr = ctx->radeon_bld.soa.outputs[i][j];
2871 val = LLVMBuildLoad(gallivm->builder, addr, "");
2872 val = radeon_llvm_saturate(bld_base, val);
2873 LLVMBuildStore(gallivm->builder, val, addr);
2874 }
2875 }
2876
2877 if (cond)
2878 lp_build_endif(&if_ctx);
2879 }
2880
2881 for (i = 0; i < info->num_outputs; i++) {
2882 outputs[i].name = info->output_semantic_name[i];
2883 outputs[i].sid = info->output_semantic_index[i];
2884
2885 for (j = 0; j < 4; j++)
2886 outputs[i].values[j] =
2887 LLVMBuildLoad(gallivm->builder,
2888 ctx->radeon_bld.soa.outputs[i][j],
2889 "");
2890 }
2891
2892 if (ctx->is_monolithic) {
2893 /* Export PrimitiveID when PS needs it. */
2894 if (si_vs_exports_prim_id(ctx->shader)) {
2895 outputs[i].name = TGSI_SEMANTIC_PRIMID;
2896 outputs[i].sid = 0;
2897 outputs[i].values[0] = bitcast(bld_base, TGSI_TYPE_FLOAT,
2898 get_primitive_id(bld_base, 0));
2899 outputs[i].values[1] = bld_base->base.undef;
2900 outputs[i].values[2] = bld_base->base.undef;
2901 outputs[i].values[3] = bld_base->base.undef;
2902 i++;
2903 }
2904 } else {
2905 /* Return the primitive ID from the LLVM function. */
2906 ctx->return_value =
2907 LLVMBuildInsertValue(gallivm->builder,
2908 ctx->return_value,
2909 bitcast(bld_base, TGSI_TYPE_FLOAT,
2910 get_primitive_id(bld_base, 0)),
2911 VS_EPILOG_PRIMID_LOC, "");
2912 }
2913
2914 si_llvm_export_vs(bld_base, outputs, i);
2915 FREE(outputs);
2916 }
2917
/* Deferred pixel-shader exports: color/MRTZ export arguments are collected
 * here and emitted together at the end of the shader by si_emit_ps_exports().
 */
struct si_ps_exports {
	unsigned num; /* number of valid entries in args[] */
	LLVMValueRef args[10][9]; /* 9 operands per llvm.SI.export call */
};
2922
2923 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
2924 bool writes_samplemask)
2925 {
2926 if (writes_z) {
2927 /* Z needs 32 bits. */
2928 if (writes_samplemask)
2929 return V_028710_SPI_SHADER_32_ABGR;
2930 else if (writes_stencil)
2931 return V_028710_SPI_SHADER_32_GR;
2932 else
2933 return V_028710_SPI_SHADER_32_R;
2934 } else if (writes_stencil || writes_samplemask) {
2935 /* Both stencil and sample mask need only 16 bits. */
2936 return V_028710_SPI_SHADER_UINT16_ABGR;
2937 } else {
2938 return V_028710_SPI_SHADER_ZERO;
2939 }
2940 }
2941
/* Build the MRTZ export (depth / stencil / sample mask) and append it to
 * \p exp. At least one of the three values must be non-NULL; the export
 * format selects how they are packed into the export channels.
 */
static void si_export_mrt_z(struct lp_build_tgsi_context *bld_base,
			    LLVMValueRef depth, LLVMValueRef stencil,
			    LLVMValueRef samplemask, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	struct lp_build_context *uint = &bld_base->uint_bld;
	LLVMValueRef args[9];
	unsigned mask = 0;
	/* Format depends on which of the three values are present. */
	unsigned format = si_get_spi_shader_z_format(depth != NULL,
						     stencil != NULL,
						     samplemask != NULL);

	assert(depth || stencil || samplemask);

	args[1] = uint->one; /* whether the EXEC mask is valid */
	args[2] = uint->one; /* DONE bit */

	/* Specify the target we are exporting */
	args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_MRTZ);

	args[4] = uint->zero; /* COMP flag */
	args[5] = base->undef; /* R, depth */
	args[6] = base->undef; /* G, stencil test value[0:7], stencil op value[8:15] */
	args[7] = base->undef; /* B, sample mask */
	args[8] = base->undef; /* A, alpha to mask */

	if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
		/* 16-bit compressed export: depth is not allowed here;
		 * stencil and sample mask share the low channels. */
		assert(!depth);
		args[4] = uint->one; /* COMPR flag */

		if (stencil) {
			/* Stencil should be in X[23:16]. */
			stencil = bitcast(bld_base, TGSI_TYPE_UNSIGNED, stencil);
			stencil = LLVMBuildShl(base->gallivm->builder, stencil,
					       LLVMConstInt(ctx->i32, 16, 0), "");
			args[5] = bitcast(bld_base, TGSI_TYPE_FLOAT, stencil);
			mask |= 0x3;
		}
		if (samplemask) {
			/* SampleMask should be in Y[15:0]. */
			args[6] = samplemask;
			mask |= 0xc;
		}
	} else {
		/* 32-bit export: one value per channel. */
		if (depth) {
			args[5] = depth;
			mask |= 0x1;
		}
		if (stencil) {
			args[6] = stencil;
			mask |= 0x2;
		}
		if (samplemask) {
			args[7] = samplemask;
			mask |= 0x4;
		}
	}

	/* SI (except OLAND) has a bug that it only looks
	 * at the X writemask component. */
	if (ctx->screen->b.chip_class == SI &&
	    ctx->screen->b.family != CHIP_OLAND)
		mask |= 0x1;

	/* Specify which components to enable */
	args[0] = lp_build_const_int32(base->gallivm, mask);

	memcpy(exp->args[exp->num++], args, sizeof(args));
}
3012
/* Apply the PS epilog color transforms (clamp, alpha-to-one, alpha test,
 * line/poly smoothing) to \p color, then append its MRT export(s) to \p exp.
 *
 * \param color             the 4 color components (may be modified in place)
 * \param index             color output index (MRT number)
 * \param samplemask_param  function parameter index of the sample coverage
 * \param is_last           whether this is the final color export (sets the
 *                          valid-EXEC and DONE bits)
 */
static void si_export_mrt_color(struct lp_build_tgsi_context *bld_base,
				LLVMValueRef *color, unsigned index,
				unsigned samplemask_param,
				bool is_last, struct si_ps_exports *exp)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *base = &bld_base->base;
	int i;

	/* Clamp color */
	if (ctx->shader->key.ps.epilog.clamp_color)
		for (i = 0; i < 4; i++)
			color[i] = radeon_llvm_saturate(bld_base, color[i]);

	/* Alpha to one */
	if (ctx->shader->key.ps.epilog.alpha_to_one)
		color[3] = base->one;

	/* Alpha test: only applies to the first color output. */
	if (index == 0 &&
	    ctx->shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
		si_alpha_test(bld_base, color[3]);

	/* Line & polygon smoothing */
	if (ctx->shader->key.ps.epilog.poly_line_smoothing)
		color[3] = si_scale_alpha_by_sample_mask(bld_base, color[3],
							 samplemask_param);

	/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
	if (ctx->shader->key.ps.epilog.last_cbuf > 0) {
		LLVMValueRef args[8][9];
		int c, last = -1;

		/* Get the export arguments, also find out what the last one is. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			si_llvm_init_export_args(bld_base, color,
						 V_008DFC_SQ_EXP_MRT + c, args[c]);
			if (args[c][0] != bld_base->uint_bld.zero)
				last = c;
		}

		/* Emit all exports. */
		for (c = 0; c <= ctx->shader->key.ps.epilog.last_cbuf; c++) {
			if (is_last && last == c) {
				args[c][1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
				args[c][2] = bld_base->uint_bld.one; /* DONE bit */
			} else if (args[c][0] == bld_base->uint_bld.zero)
				continue; /* unnecessary NULL export */

			memcpy(exp->args[exp->num++], args[c], sizeof(args[c]));
		}
	} else {
		LLVMValueRef args[9];

		/* Export */
		si_llvm_init_export_args(bld_base, color, V_008DFC_SQ_EXP_MRT + index,
					 args);
		if (is_last) {
			args[1] = bld_base->uint_bld.one; /* whether the EXEC mask is valid */
			args[2] = bld_base->uint_bld.one; /* DONE bit */
		} else if (args[0] == bld_base->uint_bld.zero)
			return; /* unnecessary NULL export */

		memcpy(exp->args[exp->num++], args, sizeof(args));
	}
}
3079
3080 static void si_emit_ps_exports(struct si_shader_context *ctx,
3081 struct si_ps_exports *exp)
3082 {
3083 for (unsigned i = 0; i < exp->num; i++)
3084 lp_build_intrinsic(ctx->radeon_bld.gallivm.builder,
3085 "llvm.SI.export", ctx->voidt,
3086 exp->args[i], 9, 0);
3087 }
3088
3089 static void si_export_null(struct lp_build_tgsi_context *bld_base)
3090 {
3091 struct si_shader_context *ctx = si_shader_context(bld_base);
3092 struct lp_build_context *base = &bld_base->base;
3093 struct lp_build_context *uint = &bld_base->uint_bld;
3094 LLVMValueRef args[9];
3095
3096 args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
3097 args[1] = uint->one; /* whether the EXEC mask is valid */
3098 args[2] = uint->one; /* DONE bit */
3099 args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_NULL);
3100 args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
3101 args[5] = base->undef; /* R */
3102 args[6] = base->undef; /* G */
3103 args[7] = base->undef; /* B */
3104 args[8] = base->undef; /* A */
3105
3106 lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
3107 ctx->voidt, args, 9, 0);
3108 }
3109
/* Monolithic FS epilogue: read all PS outputs back from their allocas and
 * emit the MRT color exports and, if present, the MRTZ export.
 */
static void si_llvm_emit_fs_epilogue(struct lp_build_tgsi_context *bld_base)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct lp_build_context *base = &bld_base->base;
	struct tgsi_shader_info *info = &shader->selector->info;
	LLVMBuilderRef builder = base->gallivm->builder;
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_color_export = -1;
	int i;
	struct si_ps_exports exp = {};

	/* Determine the last export. If MRTZ is present, it's always last.
	 * Otherwise, find the last color export.
	 */
	if (!info->writes_z && !info->writes_stencil && !info->writes_samplemask) {
		unsigned spi_format = shader->key.ps.epilog.spi_shader_col_format;

		/* Don't export NULL and return if alpha-test is enabled. */
		if (shader->key.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS &&
		    shader->key.ps.epilog.alpha_func != PIPE_FUNC_NEVER &&
		    (spi_format & 0xf) == 0)
			spi_format |= V_028714_SPI_SHADER_32_AR;

		for (i = 0; i < info->num_outputs; i++) {
			unsigned index = info->output_semantic_index[i];

			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR)
				continue;

			/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
			if (shader->key.ps.epilog.last_cbuf > 0) {
				/* Just set this if any of the colorbuffers are enabled. */
				if (spi_format &
				    ((1llu << (4 * (shader->key.ps.epilog.last_cbuf + 1))) - 1))
					last_color_export = i;
				continue;
			}

			/* 4 format bits per color buffer in spi_format. */
			if ((spi_format >> (index * 4)) & 0xf)
				last_color_export = i;
		}

		/* If there are no outputs, export NULL. */
		if (last_color_export == -1) {
			si_export_null(bld_base);
			return;
		}
	}

	for (i = 0; i < info->num_outputs; i++) {
		unsigned semantic_name = info->output_semantic_name[i];
		unsigned semantic_index = info->output_semantic_index[i];
		unsigned j;
		LLVMValueRef color[4] = {};

		/* Select the correct target */
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			/* Depth is the Z channel of POSITION. */
			depth = LLVMBuildLoad(builder,
					      ctx->radeon_bld.soa.outputs[i][2], "");
			break;
		case TGSI_SEMANTIC_STENCIL:
			stencil = LLVMBuildLoad(builder,
						ctx->radeon_bld.soa.outputs[i][1], "");
			break;
		case TGSI_SEMANTIC_SAMPLEMASK:
			samplemask = LLVMBuildLoad(builder,
						   ctx->radeon_bld.soa.outputs[i][0], "");
			break;
		case TGSI_SEMANTIC_COLOR:
			for (j = 0; j < 4; j++)
				color[j] = LLVMBuildLoad(builder,
							 ctx->radeon_bld.soa.outputs[i][j], "");

			si_export_mrt_color(bld_base, color, semantic_index,
					    SI_PARAM_SAMPLE_COVERAGE,
					    last_color_export == i, &exp);
			break;
		default:
			fprintf(stderr,
				"Warning: SI unhandled fs output type:%d\n",
				semantic_name);
		}
	}

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);

	si_emit_ps_exports(ctx, &exp);
}
3201
3202 /**
3203 * Return PS outputs in this order:
3204 *
3205 * v[0:3] = color0.xyzw
3206 * v[4:7] = color1.xyzw
3207 * ...
3208 * vN+0 = Depth
3209 * vN+1 = Stencil
3210 * vN+2 = SampleMask
3211 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
3212 *
3213 * The alpha-ref SGPR is returned via its original location.
3214 */
3215 static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
3216 {
3217 struct si_shader_context *ctx = si_shader_context(bld_base);
3218 struct si_shader *shader = ctx->shader;
3219 struct lp_build_context *base = &bld_base->base;
3220 struct tgsi_shader_info *info = &shader->selector->info;
3221 LLVMBuilderRef builder = base->gallivm->builder;
3222 unsigned i, j, first_vgpr, vgpr;
3223
3224 LLVMValueRef color[8][4] = {};
3225 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
3226 LLVMValueRef ret;
3227
3228 /* Read the output values. */
3229 for (i = 0; i < info->num_outputs; i++) {
3230 unsigned semantic_name = info->output_semantic_name[i];
3231 unsigned semantic_index = info->output_semantic_index[i];
3232
3233 switch (semantic_name) {
3234 case TGSI_SEMANTIC_COLOR:
3235 assert(semantic_index < 8);
3236 for (j = 0; j < 4; j++) {
3237 LLVMValueRef ptr = ctx->radeon_bld.soa.outputs[i][j];
3238 LLVMValueRef result = LLVMBuildLoad(builder, ptr, "");
3239 color[semantic_index][j] = result;
3240 }
3241 break;
3242 case TGSI_SEMANTIC_POSITION:
3243 depth = LLVMBuildLoad(builder,
3244 ctx->radeon_bld.soa.outputs[i][2], "");
3245 break;
3246 case TGSI_SEMANTIC_STENCIL:
3247 stencil = LLVMBuildLoad(builder,
3248 ctx->radeon_bld.soa.outputs[i][1], "");
3249 break;
3250 case TGSI_SEMANTIC_SAMPLEMASK:
3251 samplemask = LLVMBuildLoad(builder,
3252 ctx->radeon_bld.soa.outputs[i][0], "");
3253 break;
3254 default:
3255 fprintf(stderr, "Warning: SI unhandled fs output type:%d\n",
3256 semantic_name);
3257 }
3258 }
3259
3260 /* Fill the return structure. */
3261 ret = ctx->return_value;
3262
3263 /* Set SGPRs. */
3264 ret = LLVMBuildInsertValue(builder, ret,
3265 bitcast(bld_base, TGSI_TYPE_SIGNED,
3266 LLVMGetParam(ctx->radeon_bld.main_fn,
3267 SI_PARAM_ALPHA_REF)),
3268 SI_SGPR_ALPHA_REF, "");
3269
3270 /* Set VGPRs */
3271 first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
3272 for (i = 0; i < ARRAY_SIZE(color); i++) {
3273 if (!color[i][0])
3274 continue;
3275
3276 for (j = 0; j < 4; j++)
3277 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
3278 }
3279 if (depth)
3280 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
3281 if (stencil)
3282 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
3283 if (samplemask)
3284 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
3285
3286 /* Add the input sample mask for smoothing at the end. */
3287 if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
3288 vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
3289 ret = LLVMBuildInsertValue(builder, ret,
3290 LLVMGetParam(ctx->radeon_bld.main_fn,
3291 SI_PARAM_SAMPLE_COVERAGE), vgpr++, "");
3292
3293 ctx->return_value = ret;
3294 }
3295
3296 /**
3297 * Given a v8i32 resource descriptor for a buffer, extract the size of the
3298 * buffer in number of elements and return it as an i32.
3299 */
3300 static LLVMValueRef get_buffer_size(
3301 struct lp_build_tgsi_context *bld_base,
3302 LLVMValueRef descriptor)
3303 {
3304 struct si_shader_context *ctx = si_shader_context(bld_base);
3305 struct gallivm_state *gallivm = bld_base->base.gallivm;
3306 LLVMBuilderRef builder = gallivm->builder;
3307 LLVMValueRef size =
3308 LLVMBuildExtractElement(builder, descriptor,
3309 lp_build_const_int32(gallivm, 6), "");
3310
3311 if (ctx->screen->b.chip_class >= VI) {
3312 /* On VI, the descriptor contains the size in bytes,
3313 * but TXQ must return the size in elements.
3314 * The stride is always non-zero for resources using TXQ.
3315 */
3316 LLVMValueRef stride =
3317 LLVMBuildExtractElement(builder, descriptor,
3318 lp_build_const_int32(gallivm, 5), "");
3319 stride = LLVMBuildLShr(builder, stride,
3320 lp_build_const_int32(gallivm, 16), "");
3321 stride = LLVMBuildAnd(builder, stride,
3322 lp_build_const_int32(gallivm, 0x3FFF), "");
3323
3324 size = LLVMBuildUDiv(builder, size, stride, "");
3325 }
3326
3327 return size;
3328 }
3329
3330 /**
3331 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
3332 * intrinsic names).
3333 */
3334 static void build_type_name_for_intr(
3335 LLVMTypeRef type,
3336 char *buf, unsigned bufsize)
3337 {
3338 LLVMTypeRef elem_type = type;
3339
3340 assert(bufsize >= 8);
3341
3342 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
3343 int ret = snprintf(buf, bufsize, "v%u",
3344 LLVMGetVectorSize(type));
3345 if (ret < 0) {
3346 char *type_name = LLVMPrintTypeToString(type);
3347 fprintf(stderr, "Error building type name for: %s\n",
3348 type_name);
3349 return;
3350 }
3351 elem_type = LLVMGetElementType(type);
3352 buf += ret;
3353 bufsize -= ret;
3354 }
3355 switch (LLVMGetTypeKind(elem_type)) {
3356 default: break;
3357 case LLVMIntegerTypeKind:
3358 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
3359 break;
3360 case LLVMFloatTypeKind:
3361 snprintf(buf, bufsize, "f32");
3362 break;
3363 case LLVMDoubleTypeKind:
3364 snprintf(buf, bufsize, "f64");
3365 break;
3366 }
3367 }
3368
3369 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
3370 struct lp_build_tgsi_context *bld_base,
3371 struct lp_build_emit_data *emit_data);
3372
3373 /* Prevent optimizations (at least of memory accesses) across the current
3374 * point in the program by emitting empty inline assembly that is marked as
3375 * having side effects.
3376 */
3377 static void emit_optimization_barrier(struct si_shader_context *ctx)
3378 {
3379 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3380 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3381 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
3382 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
3383 }
3384
3385 static void emit_waitcnt(struct si_shader_context *ctx)
3386 {
3387 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3388 LLVMBuilderRef builder = gallivm->builder;
3389 LLVMValueRef args[1] = {
3390 lp_build_const_int32(gallivm, 0xf70)
3391 };
3392 lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt",
3393 ctx->voidt, args, 1, 0);
3394 }
3395
/* TGSI MEMBAR: implemented as a wait for in-flight memory operations. */
static void membar_emit(
		const struct lp_build_tgsi_action *action,
		struct lp_build_tgsi_context *bld_base,
		struct lp_build_emit_data *emit_data)
{
	emit_waitcnt(si_shader_context(bld_base));
}
3405
3406 static LLVMValueRef
3407 shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
3408 const struct tgsi_full_src_register *reg)
3409 {
3410 LLVMValueRef index;
3411 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
3412 SI_PARAM_SHADER_BUFFERS);
3413
3414 if (!reg->Register.Indirect)
3415 index = LLVMConstInt(ctx->i32, reg->Register.Index, 0);
3416 else
3417 index = get_bounded_indirect_index(ctx, &reg->Indirect,
3418 reg->Register.Index,
3419 SI_NUM_SHADER_BUFFERS);
3420
3421 return build_indexed_load_const(ctx, rsrc_ptr, index);
3422 }
3423
3424 static bool tgsi_is_array_sampler(unsigned target)
3425 {
3426 return target == TGSI_TEXTURE_1D_ARRAY ||
3427 target == TGSI_TEXTURE_SHADOW1D_ARRAY ||
3428 target == TGSI_TEXTURE_2D_ARRAY ||
3429 target == TGSI_TEXTURE_SHADOW2D_ARRAY ||
3430 target == TGSI_TEXTURE_CUBE_ARRAY ||
3431 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY ||
3432 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3433 }
3434
3435 static bool tgsi_is_array_image(unsigned target)
3436 {
3437 return target == TGSI_TEXTURE_3D ||
3438 target == TGSI_TEXTURE_CUBE ||
3439 target == TGSI_TEXTURE_1D_ARRAY ||
3440 target == TGSI_TEXTURE_2D_ARRAY ||
3441 target == TGSI_TEXTURE_CUBE_ARRAY ||
3442 target == TGSI_TEXTURE_2D_ARRAY_MSAA;
3443 }
3444
3445 /**
3446 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
3447 *
3448 * At least on Tonga, executing image stores on images with DCC enabled and
3449 * non-trivial can eventually lead to lockups. This can occur when an
3450 * application binds an image as read-only but then uses a shader that writes
3451 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
3452 * program termination) in this case, but it doesn't cost much to be a bit
3453 * nicer: disabling DCC in the shader still leads to undefined results but
3454 * avoids the lockup.
3455 */
3456 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
3457 LLVMValueRef rsrc)
3458 {
3459 if (ctx->screen->b.chip_class <= CIK) {
3460 return rsrc;
3461 } else {
3462 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
3463 LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
3464 LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
3465 LLVMValueRef tmp;
3466
3467 tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
3468 tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
3469 return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
3470 }
3471 }
3472
3473 /**
3474 * Load the resource descriptor for \p image.
3475 */
3476 static void
3477 image_fetch_rsrc(
3478 struct lp_build_tgsi_context *bld_base,
3479 const struct tgsi_full_src_register *image,
3480 bool dcc_off,
3481 LLVMValueRef *rsrc)
3482 {
3483 struct si_shader_context *ctx = si_shader_context(bld_base);
3484 LLVMValueRef rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
3485 SI_PARAM_IMAGES);
3486 LLVMValueRef index, tmp;
3487
3488 assert(image->Register.File == TGSI_FILE_IMAGE);
3489
3490 if (!image->Register.Indirect) {
3491 const struct tgsi_shader_info *info = bld_base->info;
3492
3493 index = LLVMConstInt(ctx->i32, image->Register.Index, 0);
3494
3495 if (info->images_writemask & (1 << image->Register.Index) &&
3496 !(info->images_buffers & (1 << image->Register.Index)))
3497 dcc_off = true;
3498 } else {
3499 /* From the GL_ARB_shader_image_load_store extension spec:
3500 *
3501 * If a shader performs an image load, store, or atomic
3502 * operation using an image variable declared as an array,
3503 * and if the index used to select an individual element is
3504 * negative or greater than or equal to the size of the
3505 * array, the results of the operation are undefined but may
3506 * not lead to termination.
3507 */
3508 index = get_bounded_indirect_index(ctx, &image->Indirect,
3509 image->Register.Index,
3510 SI_NUM_IMAGES);
3511 }
3512
3513 tmp = build_indexed_load_const(ctx, rsrc_ptr, index);
3514 if (dcc_off)
3515 tmp = force_dcc_off(ctx, tmp);
3516 *rsrc = tmp;
3517 }
3518
3519 static LLVMValueRef image_fetch_coords(
3520 struct lp_build_tgsi_context *bld_base,
3521 const struct tgsi_full_instruction *inst,
3522 unsigned src)
3523 {
3524 struct gallivm_state *gallivm = bld_base->base.gallivm;
3525 LLVMBuilderRef builder = gallivm->builder;
3526 unsigned target = inst->Memory.Texture;
3527 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
3528 LLVMValueRef coords[4];
3529 LLVMValueRef tmp;
3530 int chan;
3531
3532 for (chan = 0; chan < num_coords; ++chan) {
3533 tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
3534 tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
3535 coords[chan] = tmp;
3536 }
3537
3538 if (num_coords == 1)
3539 return coords[0];
3540
3541 if (num_coords == 3) {
3542 /* LLVM has difficulties lowering 3-element vectors. */
3543 coords[3] = bld_base->uint_bld.undef;
3544 num_coords = 4;
3545 }
3546
3547 return lp_build_gather_values(gallivm, coords, num_coords);
3548 }
3549
3550 /**
3551 * Append the extra mode bits that are used by image load and store.
3552 */
3553 static void image_append_args(
3554 struct si_shader_context *ctx,
3555 struct lp_build_emit_data * emit_data,
3556 unsigned target,
3557 bool atomic)
3558 {
3559 const struct tgsi_full_instruction *inst = emit_data->inst;
3560 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3561 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3562 LLVMValueRef r128 = i1false;
3563 LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false;
3564 LLVMValueRef glc =
3565 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3566 i1true : i1false;
3567 LLVMValueRef slc = i1false;
3568 LLVMValueRef lwe = i1false;
3569
3570 if (atomic || (HAVE_LLVM <= 0x0309)) {
3571 emit_data->args[emit_data->arg_count++] = r128;
3572 emit_data->args[emit_data->arg_count++] = da;
3573 if (!atomic) {
3574 emit_data->args[emit_data->arg_count++] = glc;
3575 }
3576 emit_data->args[emit_data->arg_count++] = slc;
3577 return;
3578 }
3579
3580 /* HAVE_LLVM >= 0x0400 */
3581 emit_data->args[emit_data->arg_count++] = glc;
3582 emit_data->args[emit_data->arg_count++] = slc;
3583 emit_data->args[emit_data->arg_count++] = lwe;
3584 emit_data->args[emit_data->arg_count++] = da;
3585 }
3586
3587 /**
3588 * Given a 256 bit resource, extract the top half (which stores the buffer
3589 * resource in the case of textures and images).
3590 */
3591 static LLVMValueRef extract_rsrc_top_half(
3592 struct si_shader_context *ctx,
3593 LLVMValueRef rsrc)
3594 {
3595 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3596 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
3597 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
3598
3599 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
3600 rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
3601 rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
3602
3603 return rsrc;
3604 }
3605
3606 /**
3607 * Append the resource and indexing arguments for buffer intrinsics.
3608 *
3609 * \param rsrc the v4i32 buffer resource
3610 * \param index index into the buffer (stride-based)
3611 * \param offset byte offset into the buffer
3612 */
3613 static void buffer_append_args(
3614 struct si_shader_context *ctx,
3615 struct lp_build_emit_data *emit_data,
3616 LLVMValueRef rsrc,
3617 LLVMValueRef index,
3618 LLVMValueRef offset,
3619 bool atomic)
3620 {
3621 const struct tgsi_full_instruction *inst = emit_data->inst;
3622 LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
3623 LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
3624
3625 emit_data->args[emit_data->arg_count++] = rsrc;
3626 emit_data->args[emit_data->arg_count++] = index; /* vindex */
3627 emit_data->args[emit_data->arg_count++] = offset; /* voffset */
3628 if (!atomic) {
3629 emit_data->args[emit_data->arg_count++] =
3630 inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
3631 i1true : i1false; /* glc */
3632 }
3633 emit_data->args[emit_data->arg_count++] = i1false; /* slc */
3634 }
3635
/* Gather the intrinsic arguments for a TGSI LOAD from a shader buffer or an
 * image into emit_data. LDS (TGSI_FILE_MEMORY) loads take no arguments here;
 * they are handled entirely in load_emit_memory().
 */
static void load_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	unsigned target = inst->Memory.Texture;
	LLVMValueRef rsrc;

	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMBuilderRef builder = gallivm->builder;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x holds the byte offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images use the buffer path; the buffer
			 * descriptor is the top half of the image resource. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			emit_data->args[0] = coords;
			emit_data->args[1] = rsrc;
			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 3;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3680
/* Emit a TGSI LOAD from a shader buffer, picking the narrowest
 * llvm.amdgcn.buffer.load variant that covers the destination writemask.
 */
static void load_emit_buffer(struct si_shader_context *ctx,
			     struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	uint writemask = inst->Dst[0].Register.WriteMask;
	uint count = util_last_bit(writemask);
	const char *intrinsic_name;
	LLVMTypeRef dst_type;

	switch (count) {
	case 1:
		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
		dst_type = ctx->f32;
		break;
	case 2:
		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
		dst_type = LLVMVectorType(ctx->f32, 2);
		break;
	default: // 3 & 4: 3-component loads use the 4-component intrinsic
		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
		dst_type = ctx->v4f32;
		count = 4;
	}

	emit_data->output[emit_data->chan] = lp_build_intrinsic(
			builder, intrinsic_name, dst_type,
			emit_data->args, emit_data->arg_count,
			LLVMReadOnlyAttribute);
}
3712
3713 static LLVMValueRef get_memory_ptr(struct si_shader_context *ctx,
3714 const struct tgsi_full_instruction *inst,
3715 LLVMTypeRef type, int arg)
3716 {
3717 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3718 LLVMBuilderRef builder = gallivm->builder;
3719 LLVMValueRef offset, ptr;
3720 int addr_space;
3721
3722 offset = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, arg, 0);
3723 offset = LLVMBuildBitCast(builder, offset, ctx->i32, "");
3724
3725 ptr = ctx->shared_memory;
3726 ptr = LLVMBuildGEP(builder, ptr, &offset, 1, "");
3727 addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3728 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, addr_space), "");
3729
3730 return ptr;
3731 }
3732
3733 static void load_emit_memory(
3734 struct si_shader_context *ctx,
3735 struct lp_build_emit_data *emit_data)
3736 {
3737 const struct tgsi_full_instruction *inst = emit_data->inst;
3738 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3739 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3740 LLVMBuilderRef builder = gallivm->builder;
3741 unsigned writemask = inst->Dst[0].Register.WriteMask;
3742 LLVMValueRef channels[4], ptr, derived_ptr, index;
3743 int chan;
3744
3745 ptr = get_memory_ptr(ctx, inst, base->elem_type, 1);
3746
3747 for (chan = 0; chan < 4; ++chan) {
3748 if (!(writemask & (1 << chan))) {
3749 channels[chan] = LLVMGetUndef(base->elem_type);
3750 continue;
3751 }
3752
3753 index = lp_build_const_int32(gallivm, chan);
3754 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3755 channels[chan] = LLVMBuildLoad(builder, derived_ptr, "");
3756 }
3757 emit_data->output[emit_data->chan] = lp_build_gather_values(gallivm, channels, 4);
3758 }
3759
3760 static void get_image_intr_name(const char *base_name,
3761 LLVMTypeRef data_type,
3762 LLVMTypeRef coords_type,
3763 LLVMTypeRef rsrc_type,
3764 char *out_name, unsigned out_len)
3765 {
3766 char coords_type_name[8];
3767
3768 build_type_name_for_intr(coords_type, coords_type_name,
3769 sizeof(coords_type_name));
3770
3771 if (HAVE_LLVM <= 0x0309) {
3772 snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name);
3773 } else {
3774 char data_type_name[8];
3775 char rsrc_type_name[8];
3776
3777 build_type_name_for_intr(data_type, data_type_name,
3778 sizeof(data_type_name));
3779 build_type_name_for_intr(rsrc_type, rsrc_type_name,
3780 sizeof(rsrc_type_name));
3781 snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
3782 data_type_name, coords_type_name, rsrc_type_name);
3783 }
3784 }
3785
3786 static void load_emit(
3787 const struct lp_build_tgsi_action *action,
3788 struct lp_build_tgsi_context *bld_base,
3789 struct lp_build_emit_data *emit_data)
3790 {
3791 struct si_shader_context *ctx = si_shader_context(bld_base);
3792 struct gallivm_state *gallivm = bld_base->base.gallivm;
3793 LLVMBuilderRef builder = gallivm->builder;
3794 const struct tgsi_full_instruction * inst = emit_data->inst;
3795 char intrinsic_name[64];
3796
3797 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
3798 load_emit_memory(ctx, emit_data);
3799 return;
3800 }
3801
3802 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
3803 emit_waitcnt(ctx);
3804
3805 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3806 load_emit_buffer(ctx, emit_data);
3807 return;
3808 }
3809
3810 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
3811 emit_data->output[emit_data->chan] =
3812 lp_build_intrinsic(
3813 builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
3814 emit_data->args, emit_data->arg_count,
3815 LLVMReadOnlyAttribute);
3816 } else {
3817 get_image_intr_name("llvm.amdgcn.image.load",
3818 emit_data->dst_type, /* vdata */
3819 LLVMTypeOf(emit_data->args[0]), /* coords */
3820 LLVMTypeOf(emit_data->args[1]), /* rsrc */
3821 intrinsic_name, sizeof(intrinsic_name));
3822
3823 emit_data->output[emit_data->chan] =
3824 lp_build_intrinsic(
3825 builder, intrinsic_name, emit_data->dst_type,
3826 emit_data->args, emit_data->arg_count,
3827 LLVMReadOnlyAttribute);
3828 }
3829 }
3830
/**
 * Fetch arguments for TGSI_OPCODE_STORE.
 *
 * args[0] is the 4-channel value to store (gathered from Src[1]); the
 * remaining arguments (resource, offsets/coordinates, glc/slc flags)
 * are appended by buffer_append_args or image_append_args depending on
 * the destination file.
 */
static void store_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	struct tgsi_full_src_register memory;
	LLVMValueRef chans[4];
	LLVMValueRef data;
	LLVMValueRef rsrc;
	unsigned chan;

	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);

	/* Gather all four channels of the value to store into a vector. */
	for (chan = 0; chan < 4; ++chan) {
		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
	}
	data = lp_build_gather_values(gallivm, chans, 4);

	emit_data->args[emit_data->arg_count++] = data;

	/* View the destination register as a source so the resource fetch
	 * helpers can be reused. */
	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);

	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;
		LLVMValueRef tmp;

		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);

		/* Src[0].x is the offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, false);
	} else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		coords = image_fetch_coords(bld_base, inst, 0);

		if (target == TGSI_TEXTURE_BUFFER) {
			image_fetch_rsrc(bld_base, &memory, false, &rsrc);

			/* Buffer images only use the second half of the
			 * image descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, false);
		} else {
			/* Image store argument layout: vdata, coords, rsrc,
			 * dmask, then the flags added by image_append_args. */
			emit_data->args[1] = coords;
			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
			emit_data->arg_count = 4;

			image_append_args(ctx, emit_data, target, false);
		}
	}
}
3889
/**
 * Emit stores to an SSBO (TGSI_FILE_BUFFER destination).
 *
 * The writemask is split into runs of consecutive enabled channels and
 * each run becomes one llvm.amdgcn.buffer.store intrinsic of matching
 * width (1, 2 or 4 elements).
 */
static void store_emit_buffer(
		struct si_shader_context *ctx,
		struct lp_build_emit_data *emit_data)
{
	const struct tgsi_full_instruction *inst = emit_data->inst;
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
	/* args[0] = full 4-channel value, args[3] = base offset; both are
	 * overwritten per run below. */
	LLVMValueRef base_data = emit_data->args[0];
	LLVMValueRef base_offset = emit_data->args[3];
	unsigned writemask = inst->Dst[0].Register.WriteMask;

	while (writemask) {
		int start, count;
		const char *intrinsic_name;
		LLVMValueRef data;
		LLVMValueRef offset;
		LLVMValueRef tmp;

		u_bit_scan_consecutive_range(&writemask, &start, &count);

		/* Due to an LLVM limitation, split 3-element writes
		 * into a 2-element and a 1-element write. */
		if (count == 3) {
			writemask |= 1 << (start + 2);
			count = 2;
		}

		if (count == 4) {
			data = base_data;
			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
		} else if (count == 2) {
			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);

			/* Build a v2f32 from channels start and start+1. */
			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			data = LLVMBuildInsertElement(
				builder, LLVMGetUndef(v2f32), tmp,
				uint_bld->zero, "");

			tmp = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start + 1), "");
			data = LLVMBuildInsertElement(
				builder, data, tmp, uint_bld->one, "");

			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
		} else {
			assert(count == 1);
			data = LLVMBuildExtractElement(
				builder, base_data,
				lp_build_const_int32(gallivm, start), "");
			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
		}

		/* Advance the byte offset to the run's first channel. */
		offset = base_offset;
		if (start != 0) {
			offset = LLVMBuildAdd(
				builder, offset,
				lp_build_const_int32(gallivm, start * 4), "");
		}

		emit_data->args[0] = data;
		emit_data->args[3] = offset;

		lp_build_intrinsic(
			builder, intrinsic_name, emit_data->dst_type,
			emit_data->args, emit_data->arg_count, 0);
	}
}
3961
3962 static void store_emit_memory(
3963 struct si_shader_context *ctx,
3964 struct lp_build_emit_data *emit_data)
3965 {
3966 const struct tgsi_full_instruction *inst = emit_data->inst;
3967 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
3968 struct lp_build_context *base = &ctx->radeon_bld.soa.bld_base.base;
3969 LLVMBuilderRef builder = gallivm->builder;
3970 unsigned writemask = inst->Dst[0].Register.WriteMask;
3971 LLVMValueRef ptr, derived_ptr, data, index;
3972 int chan;
3973
3974 ptr = get_memory_ptr(ctx, inst, base->elem_type, 0);
3975
3976 for (chan = 0; chan < 4; ++chan) {
3977 if (!(writemask & (1 << chan))) {
3978 continue;
3979 }
3980 data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 1, chan);
3981 index = lp_build_const_int32(gallivm, chan);
3982 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3983 LLVMBuildStore(builder, data, derived_ptr);
3984 }
3985 }
3986
3987 static void store_emit(
3988 const struct lp_build_tgsi_action *action,
3989 struct lp_build_tgsi_context *bld_base,
3990 struct lp_build_emit_data *emit_data)
3991 {
3992 struct si_shader_context *ctx = si_shader_context(bld_base);
3993 struct gallivm_state *gallivm = bld_base->base.gallivm;
3994 LLVMBuilderRef builder = gallivm->builder;
3995 const struct tgsi_full_instruction * inst = emit_data->inst;
3996 unsigned target = inst->Memory.Texture;
3997 char intrinsic_name[64];
3998
3999 if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY) {
4000 store_emit_memory(ctx, emit_data);
4001 return;
4002 }
4003
4004 if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
4005 emit_waitcnt(ctx);
4006
4007 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
4008 store_emit_buffer(ctx, emit_data);
4009 return;
4010 }
4011
4012 if (target == TGSI_TEXTURE_BUFFER) {
4013 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4014 builder, "llvm.amdgcn.buffer.store.format.v4f32",
4015 emit_data->dst_type, emit_data->args,
4016 emit_data->arg_count, 0);
4017 } else {
4018 get_image_intr_name("llvm.amdgcn.image.store",
4019 LLVMTypeOf(emit_data->args[0]), /* vdata */
4020 LLVMTypeOf(emit_data->args[1]), /* coords */
4021 LLVMTypeOf(emit_data->args[2]), /* rsrc */
4022 intrinsic_name, sizeof(intrinsic_name));
4023
4024 emit_data->output[emit_data->chan] =
4025 lp_build_intrinsic(
4026 builder, intrinsic_name, emit_data->dst_type,
4027 emit_data->args, emit_data->arg_count, 0);
4028 }
4029 }
4030
/**
 * Fetch arguments for the TGSI atomic opcodes (ATOMUADD, ATOMCAS, ...).
 *
 * Source layout: Src[0] = resource, Src[1] = offset or coordinates,
 * Src[2] = data, Src[3] = second data value (ATOMCAS only).
 */
static void atomic_fetch_args(
		struct lp_build_tgsi_context * bld_base,
		struct lp_build_emit_data * emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMBuilderRef builder = gallivm->builder;
	const struct tgsi_full_instruction * inst = emit_data->inst;
	LLVMValueRef data1, data2;
	LLVMValueRef rsrc;
	LLVMValueRef tmp;

	emit_data->dst_type = bld_base->base.elem_type;

	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
	}

	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
	 * of arguments, which is reversed relative to TGSI (and GLSL)
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
		emit_data->args[emit_data->arg_count++] = data2;
	emit_data->args[emit_data->arg_count++] = data1;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		LLVMValueRef offset;

		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);

		/* Src[1].x is the offset into the buffer. */
		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");

		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
				   offset, true);
	} else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		unsigned target = inst->Memory.Texture;
		LLVMValueRef coords;

		image_fetch_rsrc(bld_base, &inst->Src[0],
				 target != TGSI_TEXTURE_BUFFER, &rsrc);
		coords = image_fetch_coords(bld_base, inst, 1);

		if (target == TGSI_TEXTURE_BUFFER) {
			/* Buffer images only use the second half of the
			 * image descriptor. */
			rsrc = extract_rsrc_top_half(ctx, rsrc);
			buffer_append_args(ctx, emit_data, rsrc, coords,
					   bld_base->uint_bld.zero, true);
		} else {
			emit_data->args[emit_data->arg_count++] = coords;
			emit_data->args[emit_data->arg_count++] = rsrc;

			image_append_args(ctx, emit_data, target, true);
		}
	}
}
4090
4091 static void atomic_emit_memory(struct si_shader_context *ctx,
4092 struct lp_build_emit_data *emit_data) {
4093 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4094 LLVMBuilderRef builder = gallivm->builder;
4095 const struct tgsi_full_instruction * inst = emit_data->inst;
4096 LLVMValueRef ptr, result, arg;
4097
4098 ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
4099
4100 arg = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base, inst, 2, 0);
4101 arg = LLVMBuildBitCast(builder, arg, ctx->i32, "");
4102
4103 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4104 LLVMValueRef new_data;
4105 new_data = lp_build_emit_fetch(&ctx->radeon_bld.soa.bld_base,
4106 inst, 3, 0);
4107
4108 new_data = LLVMBuildBitCast(builder, new_data, ctx->i32, "");
4109
4110 #if HAVE_LLVM >= 0x309
4111 result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
4112 LLVMAtomicOrderingSequentiallyConsistent,
4113 LLVMAtomicOrderingSequentiallyConsistent,
4114 false);
4115 #endif
4116
4117 result = LLVMBuildExtractValue(builder, result, 0, "");
4118 } else {
4119 LLVMAtomicRMWBinOp op;
4120
4121 switch(inst->Instruction.Opcode) {
4122 case TGSI_OPCODE_ATOMUADD:
4123 op = LLVMAtomicRMWBinOpAdd;
4124 break;
4125 case TGSI_OPCODE_ATOMXCHG:
4126 op = LLVMAtomicRMWBinOpXchg;
4127 break;
4128 case TGSI_OPCODE_ATOMAND:
4129 op = LLVMAtomicRMWBinOpAnd;
4130 break;
4131 case TGSI_OPCODE_ATOMOR:
4132 op = LLVMAtomicRMWBinOpOr;
4133 break;
4134 case TGSI_OPCODE_ATOMXOR:
4135 op = LLVMAtomicRMWBinOpXor;
4136 break;
4137 case TGSI_OPCODE_ATOMUMIN:
4138 op = LLVMAtomicRMWBinOpUMin;
4139 break;
4140 case TGSI_OPCODE_ATOMUMAX:
4141 op = LLVMAtomicRMWBinOpUMax;
4142 break;
4143 case TGSI_OPCODE_ATOMIMIN:
4144 op = LLVMAtomicRMWBinOpMin;
4145 break;
4146 case TGSI_OPCODE_ATOMIMAX:
4147 op = LLVMAtomicRMWBinOpMax;
4148 break;
4149 default:
4150 unreachable("unknown atomic opcode");
4151 }
4152
4153 result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
4154 LLVMAtomicOrderingSequentiallyConsistent,
4155 false);
4156 }
4157 emit_data->output[emit_data->chan] = LLVMBuildBitCast(builder, result, emit_data->dst_type, "");
4158 }
4159
4160 static void atomic_emit(
4161 const struct lp_build_tgsi_action *action,
4162 struct lp_build_tgsi_context *bld_base,
4163 struct lp_build_emit_data *emit_data)
4164 {
4165 struct si_shader_context *ctx = si_shader_context(bld_base);
4166 struct gallivm_state *gallivm = bld_base->base.gallivm;
4167 LLVMBuilderRef builder = gallivm->builder;
4168 const struct tgsi_full_instruction * inst = emit_data->inst;
4169 char intrinsic_name[40];
4170 LLVMValueRef tmp;
4171
4172 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) {
4173 atomic_emit_memory(ctx, emit_data);
4174 return;
4175 }
4176
4177 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
4178 inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4179 snprintf(intrinsic_name, sizeof(intrinsic_name),
4180 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
4181 } else {
4182 LLVMValueRef coords;
4183 char coords_type[8];
4184
4185 if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4186 coords = emit_data->args[2];
4187 else
4188 coords = emit_data->args[1];
4189
4190 build_type_name_for_intr(LLVMTypeOf(coords), coords_type, sizeof(coords_type));
4191 snprintf(intrinsic_name, sizeof(intrinsic_name),
4192 "llvm.amdgcn.image.atomic.%s.%s",
4193 action->intr_name, coords_type);
4194 }
4195
4196 tmp = lp_build_intrinsic(
4197 builder, intrinsic_name, bld_base->uint_bld.elem_type,
4198 emit_data->args, emit_data->arg_count, 0);
4199 emit_data->output[emit_data->chan] =
4200 LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
4201 }
4202
4203 static void resq_fetch_args(
4204 struct lp_build_tgsi_context * bld_base,
4205 struct lp_build_emit_data * emit_data)
4206 {
4207 struct si_shader_context *ctx = si_shader_context(bld_base);
4208 struct gallivm_state *gallivm = bld_base->base.gallivm;
4209 const struct tgsi_full_instruction *inst = emit_data->inst;
4210 const struct tgsi_full_src_register *reg = &inst->Src[0];
4211
4212 emit_data->dst_type = ctx->v4i32;
4213
4214 if (reg->Register.File == TGSI_FILE_BUFFER) {
4215 emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
4216 emit_data->arg_count = 1;
4217 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4218 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
4219 emit_data->arg_count = 1;
4220 } else {
4221 emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
4222 image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
4223 emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
4224 emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
4225 emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
4226 emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
4227 bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
4228 emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
4229 emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
4230 emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
4231 emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
4232 emit_data->arg_count = 10;
4233 }
4234 }
4235
4236 static void resq_emit(
4237 const struct lp_build_tgsi_action *action,
4238 struct lp_build_tgsi_context *bld_base,
4239 struct lp_build_emit_data *emit_data)
4240 {
4241 struct gallivm_state *gallivm = bld_base->base.gallivm;
4242 LLVMBuilderRef builder = gallivm->builder;
4243 const struct tgsi_full_instruction *inst = emit_data->inst;
4244 LLVMValueRef out;
4245
4246 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4247 out = LLVMBuildExtractElement(builder, emit_data->args[0],
4248 lp_build_const_int32(gallivm, 2), "");
4249 } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
4250 out = get_buffer_size(bld_base, emit_data->args[0]);
4251 } else {
4252 out = lp_build_intrinsic(
4253 builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
4254 emit_data->args, emit_data->arg_count,
4255 LLVMReadNoneAttribute);
4256
4257 /* Divide the number of layers by 6 to get the number of cubes. */
4258 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
4259 LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
4260 LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
4261
4262 LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
4263 z = LLVMBuildSDiv(builder, z, imm6, "");
4264 out = LLVMBuildInsertElement(builder, out, z, imm2, "");
4265 }
4266 }
4267
4268 emit_data->output[emit_data->chan] = out;
4269 }
4270
/**
 * Fill emit_data->args for a SI texture intrinsic.
 *
 * Final argument layout: coords (padded to a power-of-two vector),
 * rsrc, sampler (omitted for TXF/TXQ), dmask, unorm, r128, da, glc,
 * slc, tfe, lwe.
 */
static void set_tex_fetch_args(struct si_shader_context *ctx,
			       struct lp_build_emit_data *emit_data,
			       unsigned opcode, unsigned target,
			       LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
			       LLVMValueRef *param, unsigned count,
			       unsigned dmask)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	unsigned num_args;
	unsigned is_rect = target == TGSI_TEXTURE_RECT;

	/* Pad to power of two vector */
	while (count < util_next_power_of_two(count))
		param[count++] = LLVMGetUndef(ctx->i32);

	/* Texture coordinates. */
	if (count > 1)
		emit_data->args[0] = lp_build_gather_values(gallivm, param, count);
	else
		emit_data->args[0] = param[0];

	/* Resource. */
	emit_data->args[1] = res_ptr;
	num_args = 2;

	/* TXF and TXQ return integer data and take no sampler. */
	if (opcode == TGSI_OPCODE_TXF || opcode == TGSI_OPCODE_TXQ)
		emit_data->dst_type = ctx->v4i32;
	else {
		emit_data->dst_type = ctx->v4f32;

		emit_data->args[num_args++] = samp_ptr;
	}

	emit_data->args[num_args++] = lp_build_const_int32(gallivm, dmask);
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, is_rect); /* unorm */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* r128 */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm,
							   tgsi_is_array_sampler(target)); /* da */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* glc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* slc */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* tfe */
	emit_data->args[num_args++] = lp_build_const_int32(gallivm, 0); /* lwe */

	emit_data->arg_count = num_args;
}
4316
/* Forward declaration; the definition appears later in the file. */
static const struct lp_build_tgsi_action tex_action;

/* Kind of descriptor loaded by load_sampler_desc_custom(). */
enum desc_type {
	DESC_IMAGE,   /* image view descriptor, dwords [0:7] of a slot */
	DESC_FMASK,   /* FMASK descriptor, dwords [8:15] */
	DESC_SAMPLER  /* sampler state, dwords [12:15] */
};
4324
4325 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
4326 {
4327 return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
4328 CONST_ADDR_SPACE);
4329 }
4330
4331 /**
4332 * Load an image view, fmask view. or sampler state descriptor.
4333 */
4334 static LLVMValueRef load_sampler_desc_custom(struct si_shader_context *ctx,
4335 LLVMValueRef list, LLVMValueRef index,
4336 enum desc_type type)
4337 {
4338 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
4339 LLVMBuilderRef builder = gallivm->builder;
4340
4341 switch (type) {
4342 case DESC_IMAGE:
4343 /* The image is at [0:7]. */
4344 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4345 break;
4346 case DESC_FMASK:
4347 /* The FMASK is at [8:15]. */
4348 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
4349 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 1, 0), "");
4350 break;
4351 case DESC_SAMPLER:
4352 /* The sampler state is at [12:15]. */
4353 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4354 index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->i32, 3, 0), "");
4355 list = LLVMBuildPointerCast(builder, list,
4356 const_array(ctx->v4i32, 0), "");
4357 break;
4358 }
4359
4360 return build_indexed_load_const(ctx, list, index);
4361 }
4362
4363 static LLVMValueRef load_sampler_desc(struct si_shader_context *ctx,
4364 LLVMValueRef index, enum desc_type type)
4365 {
4366 LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
4367 SI_PARAM_SAMPLERS);
4368
4369 return load_sampler_desc_custom(ctx, list, index, type);
4370 }
4371
4372 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4373 *
4374 * SI-CI:
4375 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4376 * filtering manually. The driver sets img7 to a mask clearing
4377 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4378 * s_and_b32 samp0, samp0, img7
4379 *
4380 * VI:
4381 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4382 */
4383 static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
4384 LLVMValueRef res, LLVMValueRef samp)
4385 {
4386 LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
4387 LLVMValueRef img7, samp0;
4388
4389 if (ctx->screen->b.chip_class >= VI)
4390 return samp;
4391
4392 img7 = LLVMBuildExtractElement(builder, res,
4393 LLVMConstInt(ctx->i32, 7, 0), "");
4394 samp0 = LLVMBuildExtractElement(builder, samp,
4395 LLVMConstInt(ctx->i32, 0, 0), "");
4396 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4397 return LLVMBuildInsertElement(builder, samp, samp0,
4398 LLVMConstInt(ctx->i32, 0, 0), "");
4399 }
4400
/**
 * Load the resource, sampler and (for MSAA targets) FMASK descriptors
 * for a texture instruction. The sampler register is always the last
 * source of the instruction.
 */
static void tex_fetch_ptrs(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data,
	LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	const struct tgsi_full_instruction *inst = emit_data->inst;
	unsigned target = inst->Texture.Texture;
	unsigned sampler_src;
	unsigned sampler_index;
	LLVMValueRef index;

	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;

	if (emit_data->inst->Src[sampler_src].Register.Indirect) {
		const struct tgsi_full_src_register *reg = &emit_data->inst->Src[sampler_src];

		/* Clamp the dynamic index to the valid sampler range. */
		index = get_bounded_indirect_index(ctx,
						   &reg->Indirect,
						   reg->Register.Index,
						   SI_NUM_SAMPLERS);
	} else {
		index = LLVMConstInt(ctx->i32, sampler_index, 0);
	}

	*res_ptr = load_sampler_desc(ctx, index, DESC_IMAGE);

	if (target == TGSI_TEXTURE_2D_MSAA ||
	    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
		/* MSAA targets use the FMASK instead of a sampler state. */
		if (samp_ptr)
			*samp_ptr = NULL;
		if (fmask_ptr)
			*fmask_ptr = load_sampler_desc(ctx, index, DESC_FMASK);
	} else {
		if (samp_ptr) {
			*samp_ptr = load_sampler_desc(ctx, index, DESC_SAMPLER);
			/* Apply the SI-CI anisotropic filtering workaround. */
			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
		}
		if (fmask_ptr)
			*fmask_ptr = NULL;
	}
}
4444
4445 static void txq_fetch_args(
4446 struct lp_build_tgsi_context *bld_base,
4447 struct lp_build_emit_data *emit_data)
4448 {
4449 struct si_shader_context *ctx = si_shader_context(bld_base);
4450 struct gallivm_state *gallivm = bld_base->base.gallivm;
4451 LLVMBuilderRef builder = gallivm->builder;
4452 const struct tgsi_full_instruction *inst = emit_data->inst;
4453 unsigned target = inst->Texture.Texture;
4454 LLVMValueRef res_ptr;
4455 LLVMValueRef address;
4456
4457 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, NULL, NULL);
4458
4459 if (target == TGSI_TEXTURE_BUFFER) {
4460 /* Read the size from the buffer descriptor directly. */
4461 LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
4462 emit_data->args[0] = get_buffer_size(bld_base, res);
4463 return;
4464 }
4465
4466 /* Textures - set the mip level. */
4467 address = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_X);
4468
4469 set_tex_fetch_args(ctx, emit_data, TGSI_OPCODE_TXQ, target, res_ptr,
4470 NULL, &address, 1, 0xf);
4471 }
4472
4473 static void txq_emit(const struct lp_build_tgsi_action *action,
4474 struct lp_build_tgsi_context *bld_base,
4475 struct lp_build_emit_data *emit_data)
4476 {
4477 struct lp_build_context *base = &bld_base->base;
4478 unsigned target = emit_data->inst->Texture.Texture;
4479
4480 if (target == TGSI_TEXTURE_BUFFER) {
4481 /* Just return the buffer size. */
4482 emit_data->output[emit_data->chan] = emit_data->args[0];
4483 return;
4484 }
4485
4486 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4487 base->gallivm->builder, "llvm.SI.getresinfo.i32",
4488 emit_data->dst_type, emit_data->args, emit_data->arg_count,
4489 LLVMReadNoneAttribute);
4490
4491 /* Divide the number of layers by 6 to get the number of cubes. */
4492 if (target == TGSI_TEXTURE_CUBE_ARRAY ||
4493 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4494 LLVMBuilderRef builder = bld_base->base.gallivm->builder;
4495 LLVMValueRef two = lp_build_const_int32(bld_base->base.gallivm, 2);
4496 LLVMValueRef six = lp_build_const_int32(bld_base->base.gallivm, 6);
4497
4498 LLVMValueRef v4 = emit_data->output[emit_data->chan];
4499 LLVMValueRef z = LLVMBuildExtractElement(builder, v4, two, "");
4500 z = LLVMBuildSDiv(builder, z, six, "");
4501
4502 emit_data->output[emit_data->chan] =
4503 LLVMBuildInsertElement(builder, v4, z, two, "");
4504 }
4505 }
4506
4507 static void tex_fetch_args(
4508 struct lp_build_tgsi_context *bld_base,
4509 struct lp_build_emit_data *emit_data)
4510 {
4511 struct si_shader_context *ctx = si_shader_context(bld_base);
4512 struct gallivm_state *gallivm = bld_base->base.gallivm;
4513 const struct tgsi_full_instruction *inst = emit_data->inst;
4514 unsigned opcode = inst->Instruction.Opcode;
4515 unsigned target = inst->Texture.Texture;
4516 LLVMValueRef coords[5], derivs[6];
4517 LLVMValueRef address[16];
4518 unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
4519 int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
4520 unsigned count = 0;
4521 unsigned chan;
4522 unsigned num_deriv_channels = 0;
4523 bool has_offset = inst->Texture.NumOffsets > 0;
4524 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
4525 unsigned dmask = 0xf;
4526
4527 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
4528
4529 if (target == TGSI_TEXTURE_BUFFER) {
4530 LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
4531
4532 /* Bitcast and truncate v8i32 to v16i8. */
4533 LLVMValueRef res = res_ptr;
4534 res = LLVMBuildBitCast(gallivm->builder, res, v2i128, "");
4535 res = LLVMBuildExtractElement(gallivm->builder, res, bld_base->uint_bld.one, "");
4536 res = LLVMBuildBitCast(gallivm->builder, res, ctx->v16i8, "");
4537
4538 emit_data->dst_type = ctx->v4f32;
4539 emit_data->args[0] = res;
4540 emit_data->args[1] = bld_base->uint_bld.zero;
4541 emit_data->args[2] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_X);
4542 emit_data->arg_count = 3;
4543 return;
4544 }
4545
4546 /* Fetch and project texture coordinates */
4547 coords[3] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
4548 for (chan = 0; chan < 3; chan++ ) {
4549 coords[chan] = lp_build_emit_fetch(bld_base,
4550 emit_data->inst, 0,
4551 chan);
4552 if (opcode == TGSI_OPCODE_TXP)
4553 coords[chan] = lp_build_emit_llvm_binary(bld_base,
4554 TGSI_OPCODE_DIV,
4555 coords[chan],
4556 coords[3]);
4557 }
4558
4559 if (opcode == TGSI_OPCODE_TXP)
4560 coords[3] = bld_base->base.one;
4561
4562 /* Pack offsets. */
4563 if (has_offset && opcode != TGSI_OPCODE_TXF) {
4564 /* The offsets are six-bit signed integers packed like this:
4565 * X=[5:0], Y=[13:8], and Z=[21:16].
4566 */
4567 LLVMValueRef offset[3], pack;
4568
4569 assert(inst->Texture.NumOffsets == 1);
4570
4571 for (chan = 0; chan < 3; chan++) {
4572 offset[chan] = lp_build_emit_fetch_texoffset(bld_base,
4573 emit_data->inst, 0, chan);
4574 offset[chan] = LLVMBuildAnd(gallivm->builder, offset[chan],
4575 lp_build_const_int32(gallivm, 0x3f), "");
4576 if (chan)
4577 offset[chan] = LLVMBuildShl(gallivm->builder, offset[chan],
4578 lp_build_const_int32(gallivm, chan*8), "");
4579 }
4580
4581 pack = LLVMBuildOr(gallivm->builder, offset[0], offset[1], "");
4582 pack = LLVMBuildOr(gallivm->builder, pack, offset[2], "");
4583 address[count++] = pack;
4584 }
4585
4586 /* Pack LOD bias value */
4587 if (opcode == TGSI_OPCODE_TXB)
4588 address[count++] = coords[3];
4589 if (opcode == TGSI_OPCODE_TXB2)
4590 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4591
4592 /* Pack depth comparison value */
4593 if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
4594 LLVMValueRef z;
4595
4596 if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4597 z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4598 } else {
4599 assert(ref_pos >= 0);
4600 z = coords[ref_pos];
4601 }
4602
4603 /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4604 * so the depth comparison value isn't clamped for Z16 and
4605 * Z24 anymore. Do it manually here.
4606 *
4607 * It's unnecessary if the original texture format was
4608 * Z32_FLOAT, but we don't know that here.
4609 */
4610 if (ctx->screen->b.chip_class == VI)
4611 z = radeon_llvm_saturate(bld_base, z);
4612
4613 address[count++] = z;
4614 }
4615
4616 /* Pack user derivatives */
4617 if (opcode == TGSI_OPCODE_TXD) {
4618 int param, num_src_deriv_channels;
4619
4620 switch (target) {
4621 case TGSI_TEXTURE_3D:
4622 num_src_deriv_channels = 3;
4623 num_deriv_channels = 3;
4624 break;
4625 case TGSI_TEXTURE_2D:
4626 case TGSI_TEXTURE_SHADOW2D:
4627 case TGSI_TEXTURE_RECT:
4628 case TGSI_TEXTURE_SHADOWRECT:
4629 case TGSI_TEXTURE_2D_ARRAY:
4630 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4631 num_src_deriv_channels = 2;
4632 num_deriv_channels = 2;
4633 break;
4634 case TGSI_TEXTURE_CUBE:
4635 case TGSI_TEXTURE_SHADOWCUBE:
4636 case TGSI_TEXTURE_CUBE_ARRAY:
4637 case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
4638 /* Cube derivatives will be converted to 2D. */
4639 num_src_deriv_channels = 3;
4640 num_deriv_channels = 2;
4641 break;
4642 case TGSI_TEXTURE_1D:
4643 case TGSI_TEXTURE_SHADOW1D:
4644 case TGSI_TEXTURE_1D_ARRAY:
4645 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4646 num_src_deriv_channels = 1;
4647 num_deriv_channels = 1;
4648 break;
4649 default:
4650 unreachable("invalid target");
4651 }
4652
4653 for (param = 0; param < 2; param++)
4654 for (chan = 0; chan < num_src_deriv_channels; chan++)
4655 derivs[param * num_src_deriv_channels + chan] =
4656 lp_build_emit_fetch(bld_base, inst, param+1, chan);
4657 }
4658
4659 if (target == TGSI_TEXTURE_CUBE ||
4660 target == TGSI_TEXTURE_CUBE_ARRAY ||
4661 target == TGSI_TEXTURE_SHADOWCUBE ||
4662 target == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4663 si_prepare_cube_coords(bld_base, emit_data, coords, derivs);
4664
4665 if (opcode == TGSI_OPCODE_TXD)
4666 for (int i = 0; i < num_deriv_channels * 2; i++)
4667 address[count++] = derivs[i];
4668
4669 /* Pack texture coordinates */
4670 address[count++] = coords[0];
4671 if (num_coords > 1)
4672 address[count++] = coords[1];
4673 if (num_coords > 2)
4674 address[count++] = coords[2];
4675
4676 /* Pack LOD or sample index */
4677 if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXF)
4678 address[count++] = coords[3];
4679 else if (opcode == TGSI_OPCODE_TXL2)
4680 address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
4681
4682 if (count > 16) {
4683 assert(!"Cannot handle more than 16 texture address parameters");
4684 count = 16;
4685 }
4686
4687 for (chan = 0; chan < count; chan++ ) {
4688 address[chan] = LLVMBuildBitCast(gallivm->builder,
4689 address[chan], ctx->i32, "");
4690 }
4691
4692 /* Adjust the sample index according to FMASK.
4693 *
4694 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
4695 * which is the identity mapping. Each nibble says which physical sample
4696 * should be fetched to get that sample.
4697 *
4698 * For example, 0x11111100 means there are only 2 samples stored and
4699 * the second sample covers 3/4 of the pixel. When reading samples 0
4700 * and 1, return physical sample 0 (determined by the first two 0s
4701 * in FMASK), otherwise return physical sample 1.
4702 *
4703 * The sample index should be adjusted as follows:
4704 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
4705 */
4706 if (target == TGSI_TEXTURE_2D_MSAA ||
4707 target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
4708 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4709 struct lp_build_emit_data txf_emit_data = *emit_data;
4710 LLVMValueRef txf_address[4];
4711 unsigned txf_count = count;
4712 struct tgsi_full_instruction inst = {};
4713
4714 memcpy(txf_address, address, sizeof(txf_address));
4715
4716 if (target == TGSI_TEXTURE_2D_MSAA) {
4717 txf_address[2] = bld_base->uint_bld.zero;
4718 }
4719 txf_address[3] = bld_base->uint_bld.zero;
4720
4721 /* Read FMASK using TXF. */
4722 inst.Instruction.Opcode = TGSI_OPCODE_TXF;
4723 inst.Texture.Texture = target;
4724 txf_emit_data.inst = &inst;
4725 txf_emit_data.chan = 0;
4726 set_tex_fetch_args(ctx, &txf_emit_data, TGSI_OPCODE_TXF,
4727 target, fmask_ptr, NULL,
4728 txf_address, txf_count, 0xf);
4729 build_tex_intrinsic(&tex_action, bld_base, &txf_emit_data);
4730
4731 /* Initialize some constants. */
4732 LLVMValueRef four = LLVMConstInt(ctx->i32, 4, 0);
4733 LLVMValueRef F = LLVMConstInt(ctx->i32, 0xF, 0);
4734
4735 /* Apply the formula. */
4736 LLVMValueRef fmask =
4737 LLVMBuildExtractElement(gallivm->builder,
4738 txf_emit_data.output[0],
4739 uint_bld->zero, "");
4740
4741 unsigned sample_chan = target == TGSI_TEXTURE_2D_MSAA ? 2 : 3;
4742
4743 LLVMValueRef sample_index4 =
4744 LLVMBuildMul(gallivm->builder, address[sample_chan], four, "");
4745
4746 LLVMValueRef shifted_fmask =
4747 LLVMBuildLShr(gallivm->builder, fmask, sample_index4, "");
4748
4749 LLVMValueRef final_sample =
4750 LLVMBuildAnd(gallivm->builder, shifted_fmask, F, "");
4751
4752 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
4753 * resource descriptor is 0 (invalid),
4754 */
4755 LLVMValueRef fmask_desc =
4756 LLVMBuildBitCast(gallivm->builder, fmask_ptr,
4757 ctx->v8i32, "");
4758
4759 LLVMValueRef fmask_word1 =
4760 LLVMBuildExtractElement(gallivm->builder, fmask_desc,
4761 uint_bld->one, "");
4762
4763 LLVMValueRef word1_is_nonzero =
4764 LLVMBuildICmp(gallivm->builder, LLVMIntNE,
4765 fmask_word1, uint_bld->zero, "");
4766
4767 /* Replace the MSAA sample index. */
4768 address[sample_chan] =
4769 LLVMBuildSelect(gallivm->builder, word1_is_nonzero,
4770 final_sample, address[sample_chan], "");
4771 }
4772
4773 if (opcode == TGSI_OPCODE_TXF) {
4774 /* add tex offsets */
4775 if (inst->Texture.NumOffsets) {
4776 struct lp_build_context *uint_bld = &bld_base->uint_bld;
4777 struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
4778 const struct tgsi_texture_offset *off = inst->TexOffsets;
4779
4780 assert(inst->Texture.NumOffsets == 1);
4781
4782 switch (target) {
4783 case TGSI_TEXTURE_3D:
4784 address[2] = lp_build_add(uint_bld, address[2],
4785 bld->immediates[off->Index][off->SwizzleZ]);
4786 /* fall through */
4787 case TGSI_TEXTURE_2D:
4788 case TGSI_TEXTURE_SHADOW2D:
4789 case TGSI_TEXTURE_RECT:
4790 case TGSI_TEXTURE_SHADOWRECT:
4791 case TGSI_TEXTURE_2D_ARRAY:
4792 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4793 address[1] =
4794 lp_build_add(uint_bld, address[1],
4795 bld->immediates[off->Index][off->SwizzleY]);
4796 /* fall through */
4797 case TGSI_TEXTURE_1D:
4798 case TGSI_TEXTURE_SHADOW1D:
4799 case TGSI_TEXTURE_1D_ARRAY:
4800 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4801 address[0] =
4802 lp_build_add(uint_bld, address[0],
4803 bld->immediates[off->Index][off->SwizzleX]);
4804 break;
4805 /* texture offsets do not apply to other texture targets */
4806 }
4807 }
4808 }
4809
4810 if (opcode == TGSI_OPCODE_TG4) {
4811 unsigned gather_comp = 0;
4812
4813 /* DMASK was repurposed for GATHER4. 4 components are always
4814 * returned and DMASK works like a swizzle - it selects
4815 * the component to fetch. The only valid DMASK values are
4816 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4817 * (red,red,red,red) etc.) The ISA document doesn't mention
4818 * this.
4819 */
4820
4821 /* Get the component index from src1.x for Gather4. */
4822 if (!tgsi_is_shadow_target(target)) {
4823 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
4824 LLVMValueRef comp_imm;
4825 struct tgsi_src_register src1 = inst->Src[1].Register;
4826
4827 assert(src1.File == TGSI_FILE_IMMEDIATE);
4828
4829 comp_imm = imms[src1.Index][src1.SwizzleX];
4830 gather_comp = LLVMConstIntGetZExtValue(comp_imm);
4831 gather_comp = CLAMP(gather_comp, 0, 3);
4832 }
4833
4834 dmask = 1 << gather_comp;
4835 }
4836
4837 set_tex_fetch_args(ctx, emit_data, opcode, target, res_ptr,
4838 samp_ptr, address, count, dmask);
4839 }
4840
4841 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
4842 * incorrectly forces nearest filtering if the texture format is integer.
4843 * The only effect it has on Gather4, which always returns 4 texels for
4844 * bilinear filtering, is that the final coordinates are off by 0.5 of
4845 * the texel size.
4846 *
4847 * The workaround is to subtract 0.5 from the unnormalized coordinates,
4848 * or (0.5 / size) from the normalized coordinates.
4849 */
static void si_lower_gather4_integer(struct si_shader_context *ctx,
				     struct lp_build_emit_data *emit_data,
				     const char *intr_name,
				     unsigned coord_vgpr_index)
{
	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
	LLVMValueRef coord = emit_data->args[0];
	LLVMValueRef half_texel[2];
	int c;

	/* RECT targets use unnormalized coordinates, so the correction is
	 * a constant -0.5 texels in X and Y. */
	if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_RECT ||
	    emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
		half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
	} else {
		struct tgsi_full_instruction txq_inst = {};
		struct lp_build_emit_data txq_emit_data = {};

		/* Query the texture size. */
		txq_inst.Texture.Texture = emit_data->inst->Texture.Texture;
		txq_emit_data.inst = &txq_inst;
		txq_emit_data.dst_type = ctx->v4i32;
		set_tex_fetch_args(ctx, &txq_emit_data, TGSI_OPCODE_TXQ,
				   txq_inst.Texture.Texture,
				   emit_data->args[1], NULL,
				   &ctx->radeon_bld.soa.bld_base.uint_bld.zero,
				   1, 0xf);
		txq_emit(NULL, &ctx->radeon_bld.soa.bld_base, &txq_emit_data);

		/* Compute -0.5 / size. */
		for (c = 0; c < 2; c++) {
			half_texel[c] =
				LLVMBuildExtractElement(builder, txq_emit_data.output[0],
							LLVMConstInt(ctx->i32, c, 0), "");
			half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
			/* 1/size via RCP, then scale by -0.5. */
			half_texel[c] =
				lp_build_emit_llvm_unary(&ctx->radeon_bld.soa.bld_base,
							 TGSI_OPCODE_RCP, half_texel[c]);
			half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
						      LLVMConstReal(ctx->f32, -0.5), "");
		}
	}

	/* Add the correction to the X and Y coordinates.  Coordinates are
	 * packed as i32 lanes in args[0], so bitcast around the float add. */
	for (c = 0; c < 2; c++) {
		LLVMValueRef tmp;
		LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);

		tmp = LLVMBuildExtractElement(builder, coord, index, "");
		tmp = LLVMBuildBitCast(builder, tmp, ctx->f32, "");
		tmp = LLVMBuildFAdd(builder, tmp, half_texel[c], "");
		tmp = LLVMBuildBitCast(builder, tmp, ctx->i32, "");
		coord = LLVMBuildInsertElement(builder, coord, tmp, index, "");
	}

	/* Emit the sample/gather intrinsic with the adjusted coordinates. */
	emit_data->args[0] = coord;
	emit_data->output[emit_data->chan] =
		lp_build_intrinsic(builder, intr_name, emit_data->dst_type,
				   emit_data->args, emit_data->arg_count,
				   LLVMReadNoneAttribute);
}
4909
4910 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
4911 struct lp_build_tgsi_context *bld_base,
4912 struct lp_build_emit_data *emit_data)
4913 {
4914 struct si_shader_context *ctx = si_shader_context(bld_base);
4915 struct lp_build_context *base = &bld_base->base;
4916 const struct tgsi_full_instruction *inst = emit_data->inst;
4917 unsigned opcode = inst->Instruction.Opcode;
4918 unsigned target = inst->Texture.Texture;
4919 char intr_name[127];
4920 bool has_offset = inst->Texture.NumOffsets > 0;
4921 bool is_shadow = tgsi_is_shadow_target(target);
4922 char type[64];
4923 const char *name = "llvm.SI.image.sample";
4924 const char *infix = "";
4925
4926 if (target == TGSI_TEXTURE_BUFFER) {
4927 emit_data->output[emit_data->chan] = lp_build_intrinsic(
4928 base->gallivm->builder,
4929 "llvm.SI.vs.load.input", emit_data->dst_type,
4930 emit_data->args, emit_data->arg_count,
4931 LLVMReadNoneAttribute);
4932 return;
4933 }
4934
4935 switch (opcode) {
4936 case TGSI_OPCODE_TXF:
4937 name = target == TGSI_TEXTURE_2D_MSAA ||
4938 target == TGSI_TEXTURE_2D_ARRAY_MSAA ?
4939 "llvm.SI.image.load" :
4940 "llvm.SI.image.load.mip";
4941 is_shadow = false;
4942 has_offset = false;
4943 break;
4944 case TGSI_OPCODE_LODQ:
4945 name = "llvm.SI.getlod";
4946 is_shadow = false;
4947 has_offset = false;
4948 break;
4949 case TGSI_OPCODE_TEX:
4950 case TGSI_OPCODE_TEX2:
4951 case TGSI_OPCODE_TXP:
4952 if (ctx->type != PIPE_SHADER_FRAGMENT)
4953 infix = ".lz";
4954 break;
4955 case TGSI_OPCODE_TXB:
4956 case TGSI_OPCODE_TXB2:
4957 assert(ctx->type == PIPE_SHADER_FRAGMENT);
4958 infix = ".b";
4959 break;
4960 case TGSI_OPCODE_TXL:
4961 case TGSI_OPCODE_TXL2:
4962 infix = ".l";
4963 break;
4964 case TGSI_OPCODE_TXD:
4965 infix = ".d";
4966 break;
4967 case TGSI_OPCODE_TG4:
4968 name = "llvm.SI.gather4";
4969 infix = ".lz";
4970 break;
4971 default:
4972 assert(0);
4973 return;
4974 }
4975
4976 /* Add the type and suffixes .c, .o if needed. */
4977 build_type_name_for_intr(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
4978 sprintf(intr_name, "%s%s%s%s.%s",
4979 name, is_shadow ? ".c" : "", infix,
4980 has_offset ? ".o" : "", type);
4981
4982 /* The hardware needs special lowering for Gather4 with integer formats. */
4983 if (opcode == TGSI_OPCODE_TG4) {
4984 struct tgsi_shader_info *info = &ctx->shader->selector->info;
4985 /* This will also work with non-constant indexing because of how
4986 * glsl_to_tgsi works and we intent to preserve that behavior.
4987 */
4988 const unsigned src_idx = 2;
4989 unsigned sampler = inst->Src[src_idx].Register.Index;
4990
4991 assert(inst->Src[src_idx].Register.File == TGSI_FILE_SAMPLER);
4992
4993 if (info->sampler_type[sampler] == TGSI_RETURN_TYPE_SINT ||
4994 info->sampler_type[sampler] == TGSI_RETURN_TYPE_UINT) {
4995 /* Texture coordinates start after:
4996 * {offset, bias, z-compare, derivatives}
4997 * Only the offset and z-compare can occur here.
4998 */
4999 si_lower_gather4_integer(ctx, emit_data, intr_name,
5000 (int)has_offset + (int)is_shadow);
5001 return;
5002 }
5003 }
5004
5005 emit_data->output[emit_data->chan] = lp_build_intrinsic(
5006 base->gallivm->builder, intr_name, emit_data->dst_type,
5007 emit_data->args, emit_data->arg_count,
5008 LLVMReadNoneAttribute);
5009 }
5010
5011 static void si_llvm_emit_txqs(
5012 const struct lp_build_tgsi_action *action,
5013 struct lp_build_tgsi_context *bld_base,
5014 struct lp_build_emit_data *emit_data)
5015 {
5016 struct si_shader_context *ctx = si_shader_context(bld_base);
5017 struct gallivm_state *gallivm = bld_base->base.gallivm;
5018 LLVMBuilderRef builder = gallivm->builder;
5019 LLVMValueRef res, samples;
5020 LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
5021
5022 tex_fetch_ptrs(bld_base, emit_data, &res_ptr, &samp_ptr, &fmask_ptr);
5023
5024
5025 /* Read the samples from the descriptor directly. */
5026 res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
5027 samples = LLVMBuildExtractElement(
5028 builder, res,
5029 lp_build_const_int32(gallivm, 3), "");
5030 samples = LLVMBuildLShr(builder, samples,
5031 lp_build_const_int32(gallivm, 16), "");
5032 samples = LLVMBuildAnd(builder, samples,
5033 lp_build_const_int32(gallivm, 0xf), "");
5034 samples = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1),
5035 samples, "");
5036
5037 emit_data->output[emit_data->chan] = samples;
5038 }
5039
5040 /*
5041 * SI implements derivatives using the local data store (LDS)
5042 * All writes to the LDS happen in all executing threads at
5043 * the same time. TID is the Thread ID for the current
5044 * thread and is a value between 0 and 63, representing
5045 * the thread's position in the wavefront.
5046 *
 * For the pixel shader, threads are grouped into quads of four pixels.
5048 * The TIDs of the pixels of a quad are:
5049 *
5050 * +------+------+
5051 * |4n + 0|4n + 1|
5052 * +------+------+
5053 * |4n + 2|4n + 3|
5054 * +------+------+
5055 *
5056 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
5057 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
5058 * the current pixel's column, and masking with 0xfffffffe yields the TID
5059 * of the left pixel of the current pixel's row.
5060 *
5061 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
5062 * adding 2 yields the TID of the pixel below the top pixel.
5063 */
/* masks for thread ID. */
#define TID_MASK_TOP_LEFT 0xfffffffc
#define TID_MASK_TOP 0xfffffffd
#define TID_MASK_LEFT 0xfffffffe

/* Emit DDX/DDY and their _FINE variants by exchanging the input value
 * between the pixels of a quad and subtracting. */
static void si_llvm_emit_ddxy(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	unsigned opcode = emit_data->info->opcode;
	LLVMValueRef thread_id, tl, trbl, tl_tid, trbl_tid, val, args[2];
	int idx;
	unsigned mask;

	thread_id = get_thread_id(ctx);

	/* Pick the "base" pixel of the quad: coarse derivatives always use
	 * the top-left pixel; fine derivatives use the top pixel of the
	 * column (DDY) or the left pixel of the row (DDX). */
	if (opcode == TGSI_OPCODE_DDX_FINE)
		mask = TID_MASK_LEFT;
	else if (opcode == TGSI_OPCODE_DDY_FINE)
		mask = TID_MASK_TOP;
	else
		mask = TID_MASK_TOP_LEFT;

	tl_tid = LLVMBuildAnd(gallivm->builder, thread_id,
			      lp_build_const_int32(gallivm, mask), "");

	/* for DDX we want the next X pixel, for DDY the next Y pixel. */
	idx = (opcode == TGSI_OPCODE_DDX || opcode == TGSI_OPCODE_DDX_FINE) ? 1 : 2;
	trbl_tid = LLVMBuildAdd(gallivm->builder, tl_tid,
				lp_build_const_int32(gallivm, idx), "");

	val = LLVMBuildBitCast(gallivm->builder, emit_data->args[0], ctx->i32, "");

	if (ctx->screen->has_ds_bpermute) {
		/* Exchange lanes directly with ds_bpermute; its offset
		 * operand is in bytes, hence tid * 4. */
		args[0] = LLVMBuildMul(gallivm->builder, tl_tid,
				       lp_build_const_int32(gallivm, 4), "");
		args[1] = val;
		tl = lp_build_intrinsic(gallivm->builder,
					"llvm.amdgcn.ds.bpermute", ctx->i32,
					args, 2, LLVMReadNoneAttribute);

		args[0] = LLVMBuildMul(gallivm->builder, trbl_tid,
				       lp_build_const_int32(gallivm, 4), "");
		trbl = lp_build_intrinsic(gallivm->builder,
					  "llvm.amdgcn.ds.bpermute", ctx->i32,
					  args, 2, LLVMReadNoneAttribute);
	} else {
		/* Fallback: bounce the value through LDS — every thread
		 * stores to its own slot, then reads the neighbors' slots. */
		LLVMValueRef store_ptr, load_ptr0, load_ptr1;

		store_ptr = build_gep0(ctx, ctx->lds, thread_id);
		load_ptr0 = build_gep0(ctx, ctx->lds, tl_tid);
		load_ptr1 = build_gep0(ctx, ctx->lds, trbl_tid);

		LLVMBuildStore(gallivm->builder, val, store_ptr);
		tl = LLVMBuildLoad(gallivm->builder, load_ptr0, "");
		trbl = LLVMBuildLoad(gallivm->builder, load_ptr1, "");
	}

	tl = LLVMBuildBitCast(gallivm->builder, tl, ctx->f32, "");
	trbl = LLVMBuildBitCast(gallivm->builder, trbl, ctx->f32, "");

	/* derivative = neighbor - base */
	emit_data->output[emit_data->chan] =
		LLVMBuildFSub(gallivm->builder, trbl, tl, "");
}
5131
5132 /*
5133 * this takes an I,J coordinate pair,
5134 * and works out the X and Y derivatives.
5135 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
5136 */
5137 static LLVMValueRef si_llvm_emit_ddxy_interp(
5138 struct lp_build_tgsi_context *bld_base,
5139 LLVMValueRef interp_ij)
5140 {
5141 struct si_shader_context *ctx = si_shader_context(bld_base);
5142 struct gallivm_state *gallivm = bld_base->base.gallivm;
5143 LLVMValueRef result[4], a;
5144 unsigned i;
5145
5146 for (i = 0; i < 2; i++) {
5147 a = LLVMBuildExtractElement(gallivm->builder, interp_ij,
5148 LLVMConstInt(ctx->i32, i, 0), "");
5149 result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
5150 result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
5151 }
5152
5153 return lp_build_gather_values(gallivm, result, 4);
5154 }
5155
/* Fetch the extra operands of INTERP_OFFSET / INTERP_SAMPLE into
 * args[0..1] as an (x, y) offset from the pixel center.  Other INTERP_*
 * opcodes take no extra arguments and are left untouched. */
static void interp_fetch_args(
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	const struct tgsi_full_instruction *inst = emit_data->inst;

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
		/* offset is in second src, first two channels */
		emit_data->args[0] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_X);
		emit_data->args[1] = lp_build_emit_fetch(bld_base,
							 emit_data->inst, 1,
							 TGSI_CHAN_Y);
		emit_data->arg_count = 2;
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef sample_position;
		LLVMValueRef sample_id;
		LLVMValueRef halfval = lp_build_const_float(gallivm, 0.5f);

		/* fetch sample ID, then fetch its sample position,
		 * and place into first two channels.
		 */
		sample_id = lp_build_emit_fetch(bld_base,
						emit_data->inst, 1, TGSI_CHAN_X);
		sample_id = LLVMBuildBitCast(gallivm->builder, sample_id,
					     ctx->i32, "");
		sample_position = load_sample_position(&ctx->radeon_bld, sample_id);

		/* Turn the sample position into an offset from the pixel
		 * center by subtracting 0.5 from each component. */
		emit_data->args[0] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     lp_build_const_int32(gallivm, 0), "");

		emit_data->args[0] = LLVMBuildFSub(gallivm->builder, emit_data->args[0], halfval, "");
		emit_data->args[1] = LLVMBuildExtractElement(gallivm->builder,
							     sample_position,
							     lp_build_const_int32(gallivm, 1), "");
		emit_data->args[1] = LLVMBuildFSub(gallivm->builder, emit_data->args[1], halfval, "");
		emit_data->arg_count = 2;
	}
}
5199
/* Emit the INTERP_CENTROID / INTERP_SAMPLE / INTERP_OFFSET opcodes by
 * re-interpolating a fragment shader input, optionally with an adjusted
 * (I, J) barycentric pair computed from the offset in args[0..1]. */
static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
				struct lp_build_tgsi_context *bld_base,
				struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct si_shader *shader = ctx->shader;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef interp_param;
	const struct tgsi_full_instruction *inst = emit_data->inst;
	const char *intr_name;
	int input_index = inst->Src[0].Register.Index;
	int chan;
	int i;
	LLVMValueRef attr_number;
	LLVMValueRef params = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_PRIM_MASK);
	int interp_param_idx;
	unsigned interp = shader->selector->info.input_interpolate[input_index];
	unsigned location;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* OFFSET and SAMPLE start from the center IJ pair (adjusted below);
	 * CENTROID uses the centroid IJ pair directly. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE)
		location = TGSI_INTERPOLATE_LOC_CENTER;
	else
		location = TGSI_INTERPOLATE_LOC_CENTROID;

	/* Index -1 means the lookup failed; index 0 yields no IJ pair,
	 * which selects the constant (non-interpolated) path below. */
	interp_param_idx = lookup_interp_param_index(interp, location);
	if (interp_param_idx == -1)
		return;
	else if (interp_param_idx)
		interp_param = get_interp_param(ctx, interp_param_idx);
	else
		interp_param = NULL;

	attr_number = lp_build_const_int32(gallivm, input_index);

	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		LLVMValueRef ij_out[2];
		LLVMValueRef ddxy_out = si_llvm_emit_ddxy_interp(bld_base, interp_param);

		/*
		 * take the I then J parameters, and the DDX/Y for it, and
		 * calculate the IJ inputs for the interpolator.
		 * temp1 = ddx * offset/sample.x + I;
		 * interp_param.I = ddy * offset/sample.y + temp1;
		 * temp1 = ddx * offset/sample.x + J;
		 * interp_param.J = ddy * offset/sample.y + temp1;
		 */
		for (i = 0; i < 2; i++) {
			LLVMValueRef ix_ll = lp_build_const_int32(gallivm, i);
			LLVMValueRef iy_ll = lp_build_const_int32(gallivm, i + 2);
			LLVMValueRef ddx_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, ix_ll, "");
			LLVMValueRef ddy_el = LLVMBuildExtractElement(gallivm->builder,
								      ddxy_out, iy_ll, "");
			LLVMValueRef interp_el = LLVMBuildExtractElement(gallivm->builder,
									 interp_param, ix_ll, "");
			LLVMValueRef temp1, temp2;

			/* IJ values are packed as i32; do the math in f32. */
			interp_el = LLVMBuildBitCast(gallivm->builder, interp_el,
						     ctx->f32, "");

			temp1 = LLVMBuildFMul(gallivm->builder, ddx_el, emit_data->args[0], "");

			temp1 = LLVMBuildFAdd(gallivm->builder, temp1, interp_el, "");

			temp2 = LLVMBuildFMul(gallivm->builder, ddy_el, emit_data->args[1], "");

			temp2 = LLVMBuildFAdd(gallivm->builder, temp2, temp1, "");

			ij_out[i] = LLVMBuildBitCast(gallivm->builder,
						     temp2, ctx->i32, "");
		}
		interp_param = lp_build_gather_values(bld_base->base.gallivm, ij_out, 2);
	}

	/* With no IJ pair the input is fetched with fs.constant (3 args);
	 * otherwise it is interpolated with fs.interp (4 args). */
	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
	for (chan = 0; chan < 4; chan++) {
		LLVMValueRef args[4];
		LLVMValueRef llvm_chan;
		unsigned schan;

		schan = tgsi_util_get_full_src_register_swizzle(&inst->Src[0], chan);
		llvm_chan = lp_build_const_int32(gallivm, schan);

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = interp_param;

		emit_data->output[chan] =
			lp_build_intrinsic(gallivm->builder, intr_name,
					   ctx->f32, args, args[3] ? 4 : 3,
					   LLVMReadNoneAttribute);
	}
}
5298
5299 static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
5300 struct lp_build_emit_data *emit_data)
5301 {
5302 LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
5303 struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
5304 unsigned stream;
5305
5306 assert(src0.File == TGSI_FILE_IMMEDIATE);
5307
5308 stream = LLVMConstIntGetZExtValue(imms[src0.Index][src0.SwizzleX]) & 0x3;
5309 return stream;
5310 }
5311
/* Emit one vertex from the geometry shader: store all declared outputs
 * to the GSVS ring for the instruction's vertex stream, bump the
 * per-stream vertex counter, and send the EMIT message. */
static void si_llvm_emit_vertex(
	const struct lp_build_tgsi_action *action,
	struct lp_build_tgsi_context *bld_base,
	struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct lp_build_context *uint = &bld_base->uint_bld;
	struct si_shader *shader = ctx->shader;
	struct tgsi_shader_info *info = &shader->selector->info;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	LLVMValueRef soffset = LLVMGetParam(ctx->radeon_bld.main_fn,
					    SI_PARAM_GS2VS_OFFSET);
	LLVMValueRef gs_next_vertex;
	LLVMValueRef can_emit, kill;
	LLVMValueRef args[2];
	unsigned chan;
	int i;
	unsigned stream;

	stream = si_llvm_get_stream(bld_base, emit_data);

	/* Write vertex attribute values to GSVS ring */
	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
				       ctx->gs_next_vertex[stream],
				       "");

	/* If this thread has already emitted the declared maximum number of
	 * vertices, kill it: excessive vertex emissions are not supposed to
	 * have any effect, and GS threads have no externally observable
	 * effects other than emitting vertices.
	 */
	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
				 lp_build_const_int32(gallivm,
						      shader->selector->gs_max_out_vertices), "");
	kill = lp_build_select(&bld_base->base, can_emit,
			       lp_build_const_float(gallivm, 1.0f),
			       lp_build_const_float(gallivm, -1.0f));

	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
			   ctx->voidt, &kill, 1, 0);

	/* Store every channel of every output, one dword at a time. */
	for (i = 0; i < info->num_outputs; i++) {
		LLVMValueRef *out_ptr =
			ctx->radeon_bld.soa.outputs[i];

		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
			/* Ring layout: all vertices of one (output, channel)
			 * pair are contiguous; the stride between pairs is
			 * gs_max_out_vertices dwords. */
			LLVMValueRef voffset =
				lp_build_const_int32(gallivm, (i * 4 + chan) *
						     shader->selector->gs_max_out_vertices);

			voffset = lp_build_add(uint, voffset, gs_next_vertex);
			voffset = lp_build_mul_imm(uint, voffset, 4);

			out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");

			build_tbuffer_store(ctx,
					    ctx->gsvs_ring[stream],
					    out_val, 1,
					    voffset, soffset, 0,
					    V_008F0C_BUF_DATA_FORMAT_32,
					    V_008F0C_BUF_NUM_FORMAT_UINT,
					    1, 0, 1, 1, 0);
		}
	}
	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
				      lp_build_const_int32(gallivm, 1));

	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);

	/* Signal vertex emission */
	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
	args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
			   ctx->voidt, args, 2, 0);
}
5389
5390 /* Cut one primitive from the geometry shader */
5391 static void si_llvm_emit_primitive(
5392 const struct lp_build_tgsi_action *action,
5393 struct lp_build_tgsi_context *bld_base,
5394 struct lp_build_emit_data *emit_data)
5395 {
5396 struct si_shader_context *ctx = si_shader_context(bld_base);
5397 struct gallivm_state *gallivm = bld_base->base.gallivm;
5398 LLVMValueRef args[2];
5399 unsigned stream;
5400
5401 /* Signal primitive cut */
5402 stream = si_llvm_get_stream(bld_base, emit_data);
5403 args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS | (stream << 8));
5404 args[1] = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
5405 lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
5406 ctx->voidt, args, 2, 0);
5407 }
5408
/* Emit the BARRIER opcode. */
static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
				 struct lp_build_tgsi_context *bld_base,
				 struct lp_build_emit_data *emit_data)
{
	struct si_shader_context *ctx = si_shader_context(bld_base);
	struct gallivm_state *gallivm = bld_base->base.gallivm;

	/* The real barrier instruction isn't needed, because an entire patch
	 * always fits into a single wave.
	 */
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		emit_optimization_barrier(ctx);
		return;
	}

	lp_build_intrinsic(gallivm->builder,
			   HAVE_LLVM >= 0x0309 ? "llvm.amdgcn.s.barrier"
					       : "llvm.AMDGPU.barrier.local",
			   ctx->voidt, NULL, 0, 0);
}
5429
/* TGSI action entry for texture sampling opcodes (TEX, TXB, TXL, TXD,
 * TXF, TG4, LODQ, ...). */
static const struct lp_build_tgsi_action tex_action = {
	.fetch_args = tex_fetch_args,
	.emit = build_tex_intrinsic,
};

/* TGSI action entry for the INTERP_* fragment shader opcodes. */
static const struct lp_build_tgsi_action interp_action = {
	.fetch_args = interp_fetch_args,
	.emit = build_interp_intrinsic,
};
5439
/* Create the main LLVM function for the shader and annotate its
 * parameters.  Arguments 0..last_sgpr are SGPR arguments; the rest are
 * VGPR arguments. */
static void si_create_function(struct si_shader_context *ctx,
			       LLVMTypeRef *returns, unsigned num_returns,
			       LLVMTypeRef *params, unsigned num_params,
			       int last_sgpr)
{
	int i;

	radeon_llvm_create_func(&ctx->radeon_bld, returns, num_returns,
				params, num_params);
	radeon_llvm_shader_type(ctx->radeon_bld.main_fn, ctx->type);
	/* Start from an undef return value; epilogs fill in the pieces. */
	ctx->return_value = LLVMGetUndef(ctx->radeon_bld.return_type);

	for (i = 0; i <= last_sgpr; ++i) {
		LLVMValueRef P = LLVMGetParam(ctx->radeon_bld.main_fn, i);

		/* The combination of:
		 * - ByVal
		 * - dereferenceable
		 * - invariant.load
		 * allows the optimization passes to move loads and reduces
		 * SGPR spilling significantly.
		 */
		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
			LLVMAddAttribute(P, LLVMByValAttribute);
			lp_add_attr_dereferenceable(P, UINT64_MAX);
		} else
			LLVMAddAttribute(P, LLVMInRegAttribute);
	}

	if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
		/* These were copied from some LLVM test. */
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "less-precise-fpmad",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-infs-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "no-nans-fp-math",
						   "true");
		LLVMAddTargetDependentFunctionAttr(ctx->radeon_bld.main_fn,
						   "unsafe-fp-math",
						   "true");
	}
}
5485
5486 static void create_meta_data(struct si_shader_context *ctx)
5487 {
5488 struct gallivm_state *gallivm = ctx->radeon_bld.soa.bld_base.base.gallivm;
5489
5490 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5491 "invariant.load", 14);
5492 ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5493 "range", 5);
5494 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
5495 "amdgpu.uniform", 14);
5496
5497 ctx->empty_md = LLVMMDNodeInContext(gallivm->context, NULL, 0);
5498 }
5499
/* Append the streamout SGPR arguments to the parameter list and record
 * their indices in ctx.  *num_params is advanced past the added
 * parameters. */
static void declare_streamout_params(struct si_shader_context *ctx,
				     struct pipe_stream_output_info *so,
				     LLVMTypeRef *params, LLVMTypeRef i32,
				     unsigned *num_params)
{
	int i;

	/* Streamout SGPRs. */
	if (so->num_outputs) {
		/* TES reuses the tess_offchip parameter slot for the
		 * streamout config. */
		if (ctx->type != PIPE_SHADER_TESS_EVAL)
			params[ctx->param_streamout_config = (*num_params)++] = i32;
		else
			ctx->param_streamout_config = ctx->param_tess_offchip;

		params[ctx->param_streamout_write_index = (*num_params)++] = i32;
	}
	/* A streamout buffer offset is loaded if the stride is non-zero. */
	for (i = 0; i < 4; i++) {
		if (!so->stride[i])
			continue;

		params[ctx->param_streamout_offset[i] = (*num_params)++] = i32;
	}
}
5524
5525 static unsigned llvm_get_type_size(LLVMTypeRef type)
5526 {
5527 LLVMTypeKind kind = LLVMGetTypeKind(type);
5528
5529 switch (kind) {
5530 case LLVMIntegerTypeKind:
5531 return LLVMGetIntTypeWidth(type) / 8;
5532 case LLVMFloatTypeKind:
5533 return 4;
5534 case LLVMPointerTypeKind:
5535 return 8;
5536 case LLVMVectorTypeKind:
5537 return LLVMGetVectorSize(type) *
5538 llvm_get_type_size(LLVMGetElementType(type));
5539 default:
5540 assert(0);
5541 return 0;
5542 }
5543 }
5544
/* Create ctx->lds: a pointer to address 0 of the local (LDS) address
 * space, typed as an i32 array spanning the whole LDS allocation
 * (64 KB on CIK and later, 32 KB otherwise), for tessellation data. */
static void declare_tess_lds(struct si_shader_context *ctx)
{
	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct lp_build_context *uint = &bld_base->uint_bld;

	unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
	ctx->lds = LLVMBuildIntToPtr(gallivm->builder, uint->zero,
		LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), LOCAL_ADDR_SPACE),
		"tess_lds");
}
5556
/**
 * Create the LLVM function for the current shader stage and declare all of
 * its inputs (SGPRs first, then VGPRs) and, for non-monolithic parts, its
 * return values, which are consumed by the separately-compiled epilog.
 *
 * Also sets "last_sgpr" boundaries, adds function attributes, declares LDS
 * where the stage needs it, and counts input SGPRs/VGPRs into shader->info.
 */
static void create_function(struct si_shader_context *ctx)
{
	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
	struct gallivm_state *gallivm = bld_base->base.gallivm;
	struct si_shader *shader = ctx->shader;
	LLVMTypeRef params[SI_NUM_PARAMS + SI_NUM_VERTEX_BUFFERS], v3i32;
	/* NOTE(review): 16+32*4 looks sized for SGPR returns + 32 vec4
	 * outputs — TODO confirm against the epilog return layouts. */
	LLVMTypeRef returns[16+32*4];
	unsigned i, last_sgpr, num_params, num_return_sgprs;
	unsigned num_returns = 0;

	v3i32 = LLVMVectorType(ctx->i32, 3);

	/* Descriptor arrays shared by all shader stages. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
	params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);

	switch (ctx->type) {
	case PIPE_SHADER_VERTEX:
		params[SI_PARAM_VERTEX_BUFFERS] = const_array(ctx->v16i8, SI_NUM_VERTEX_BUFFERS);
		params[SI_PARAM_BASE_VERTEX] = ctx->i32;
		params[SI_PARAM_START_INSTANCE] = ctx->i32;
		params[SI_PARAM_DRAWID] = ctx->i32;
		num_params = SI_PARAM_DRAWID+1;

		/* Trailing SGPRs depend on which hardware stage the VS runs as. */
		if (shader->key.vs.as_es) {
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else if (shader->key.vs.as_ls) {
			params[SI_PARAM_LS_OUT_LAYOUT] = ctx->i32;
			num_params = SI_PARAM_LS_OUT_LAYOUT+1;
		} else {
			if (ctx->is_gs_copy_shader) {
				num_params = SI_PARAM_RW_BUFFERS+1;
			} else {
				params[SI_PARAM_VS_STATE_BITS] = ctx->i32;
				num_params = SI_PARAM_VS_STATE_BITS+1;
			}

			/* The locations of the other parameters are assigned dynamically. */
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
		}

		last_sgpr = num_params-1;

		/* VGPRs */
		params[ctx->param_vertex_id = num_params++] = ctx->i32;
		params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
		params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
		params[ctx->param_instance_id = num_params++] = ctx->i32;

		if (!ctx->is_monolithic &&
		    !ctx->is_gs_copy_shader) {
			/* Vertex load indices. */
			ctx->param_vertex_index0 = num_params;

			for (i = 0; i < shader->selector->info.num_inputs; i++)
				params[num_params++] = ctx->i32;

			/* PrimitiveID output. */
			if (!shader->key.vs.as_es && !shader->key.vs.as_ls)
				for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
					returns[num_returns++] = ctx->f32;
		}
		break;

	case PIPE_SHADER_TESS_CTRL:
		params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
		params[SI_PARAM_TCS_OUT_OFFSETS] = ctx->i32;
		params[SI_PARAM_TCS_OUT_LAYOUT] = ctx->i32;
		params[SI_PARAM_TCS_IN_LAYOUT] = ctx->i32;
		params[ctx->param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx->i32;
		params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx->i32;
		last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;

		/* VGPRs */
		params[SI_PARAM_PATCH_ID] = ctx->i32;
		params[SI_PARAM_REL_IDS] = ctx->i32;
		num_params = SI_PARAM_REL_IDS+1;

		if (!ctx->is_monolithic) {
			/* SI_PARAM_TCS_OC_LDS and PARAM_TESS_FACTOR_OFFSET are
			 * placed after the user SGPRs.
			 */
			for (i = 0; i < SI_TCS_NUM_USER_SGPR + 2; i++)
				returns[num_returns++] = ctx->i32; /* SGPRs */

			for (i = 0; i < 3; i++)
				returns[num_returns++] = ctx->f32; /* VGPRs */
		}
		break;

	case PIPE_SHADER_TESS_EVAL:
		params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx->i32;
		num_params = SI_PARAM_TCS_OFFCHIP_LAYOUT+1;

		/* SGPR ordering differs between the ES and VS variants. */
		if (shader->key.tes.as_es) {
			params[ctx->param_oc_lds = num_params++] = ctx->i32;
			params[ctx->param_tess_offchip = num_params++] = ctx->i32;
			params[ctx->param_es2gs_offset = num_params++] = ctx->i32;
		} else {
			params[ctx->param_tess_offchip = num_params++] = ctx->i32;
			declare_streamout_params(ctx, &shader->selector->so,
						 params, ctx->i32, &num_params);
			params[ctx->param_oc_lds = num_params++] = ctx->i32;
		}
		last_sgpr = num_params - 1;

		/* VGPRs */
		params[ctx->param_tes_u = num_params++] = ctx->f32;
		params[ctx->param_tes_v = num_params++] = ctx->f32;
		params[ctx->param_tes_rel_patch_id = num_params++] = ctx->i32;
		params[ctx->param_tes_patch_id = num_params++] = ctx->i32;

		/* PrimitiveID output. */
		if (!ctx->is_monolithic && !shader->key.tes.as_es)
			for (i = 0; i <= VS_EPILOG_PRIMID_LOC; i++)
				returns[num_returns++] = ctx->f32;
		break;

	case PIPE_SHADER_GEOMETRY:
		params[SI_PARAM_GS2VS_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_WAVE_ID] = ctx->i32;
		last_sgpr = SI_PARAM_GS_WAVE_ID;

		/* VGPRs */
		params[SI_PARAM_VTX0_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX1_OFFSET] = ctx->i32;
		params[SI_PARAM_PRIMITIVE_ID] = ctx->i32;
		params[SI_PARAM_VTX2_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX3_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX4_OFFSET] = ctx->i32;
		params[SI_PARAM_VTX5_OFFSET] = ctx->i32;
		params[SI_PARAM_GS_INSTANCE_ID] = ctx->i32;
		num_params = SI_PARAM_GS_INSTANCE_ID+1;
		break;

	case PIPE_SHADER_FRAGMENT:
		params[SI_PARAM_ALPHA_REF] = ctx->f32;
		params[SI_PARAM_PRIM_MASK] = ctx->i32;
		last_sgpr = SI_PARAM_PRIM_MASK;
		/* Everything below this point is VGPR input. */
		params[SI_PARAM_PERSP_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTER] = ctx->v2i32;
		params[SI_PARAM_PERSP_CENTROID] = ctx->v2i32;
		params[SI_PARAM_PERSP_PULL_MODEL] = v3i32;
		params[SI_PARAM_LINEAR_SAMPLE] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTER] = ctx->v2i32;
		params[SI_PARAM_LINEAR_CENTROID] = ctx->v2i32;
		params[SI_PARAM_LINE_STIPPLE_TEX] = ctx->f32;
		params[SI_PARAM_POS_X_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Y_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_Z_FLOAT] = ctx->f32;
		params[SI_PARAM_POS_W_FLOAT] = ctx->f32;
		params[SI_PARAM_FRONT_FACE] = ctx->i32;
		params[SI_PARAM_ANCILLARY] = ctx->i32;
		params[SI_PARAM_SAMPLE_COVERAGE] = ctx->f32;
		params[SI_PARAM_POS_FIXED_PT] = ctx->i32;
		num_params = SI_PARAM_POS_FIXED_PT+1;

		if (!ctx->is_monolithic) {
			/* Color inputs from the prolog. */
			if (shader->selector->info.colors_read) {
				unsigned num_color_elements =
					util_bitcount(shader->selector->info.colors_read);

				assert(num_params + num_color_elements <= ARRAY_SIZE(params));
				for (i = 0; i < num_color_elements; i++)
					params[num_params++] = ctx->f32;
			}

			/* Outputs for the epilog. */
			num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
			num_returns =
				num_return_sgprs +
				util_bitcount(shader->selector->info.colors_written) * 4 +
				shader->selector->info.writes_z +
				shader->selector->info.writes_stencil +
				shader->selector->info.writes_samplemask +
				1 /* SampleMaskIn */;

			num_returns = MAX2(num_returns,
					   num_return_sgprs +
					   PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

			for (i = 0; i < num_return_sgprs; i++)
				returns[i] = ctx->i32;
			for (; i < num_returns; i++)
				returns[i] = ctx->f32;
		}
		break;

	case PIPE_SHADER_COMPUTE:
		params[SI_PARAM_GRID_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_SIZE] = v3i32;
		params[SI_PARAM_BLOCK_ID] = v3i32;
		last_sgpr = SI_PARAM_BLOCK_ID;

		params[SI_PARAM_THREAD_ID] = v3i32;
		num_params = SI_PARAM_THREAD_ID + 1;
		break;
	default:
		assert(0 && "unimplemented shader");
		return;
	}

	assert(num_params <= ARRAY_SIZE(params));

	si_create_function(ctx, returns, num_returns, params,
			   num_params, last_sgpr);

	/* Reserve register locations for VGPR inputs the PS prolog may need. */
	if (ctx->type == PIPE_SHADER_FRAGMENT &&
	    !ctx->is_monolithic) {
		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "InitialPSInputAddr",
					  S_0286D0_PERSP_SAMPLE_ENA(1) |
					  S_0286D0_PERSP_CENTER_ENA(1) |
					  S_0286D0_PERSP_CENTROID_ENA(1) |
					  S_0286D0_LINEAR_SAMPLE_ENA(1) |
					  S_0286D0_LINEAR_CENTER_ENA(1) |
					  S_0286D0_LINEAR_CENTROID_ENA(1) |
					  S_0286D0_FRONT_FACE_ENA(1) |
					  S_0286D0_POS_FIXED_PT_ENA(1));
	} else if (ctx->type == PIPE_SHADER_COMPUTE) {
		const unsigned *properties = shader->selector->info.properties;
		unsigned max_work_group_size =
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
			properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

		if (!max_work_group_size) {
			/* This is a variable group size compute shader,
			 * compile it for the maximum possible group size.
			 */
			max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
		}

		radeon_llvm_add_attribute(ctx->radeon_bld.main_fn,
					  "amdgpu-max-work-group-size",
					  max_work_group_size);
	}

	/* Count input registers: each parameter costs type_size/4 GPRs. */
	shader->info.num_input_sgprs = 0;
	shader->info.num_input_vgprs = 0;

	for (i = 0; i <= last_sgpr; ++i)
		shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 4;

	/* Unused fragment shader inputs are eliminated by the compiler,
	 * so we don't know yet how many there will be.
	 */
	if (ctx->type != PIPE_SHADER_FRAGMENT)
		for (; i < num_params; ++i)
			shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 4;

	/* Without ds_bpermute, derivatives and interpolation at an offset
	 * go through a small LDS scratch area instead. */
	if (!ctx->screen->has_ds_bpermute &&
	    bld_base->info &&
	    (bld_base->info->opcode_count[TGSI_OPCODE_DDX] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDX_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_DDY_FINE] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0 ||
	     bld_base->info->opcode_count[TGSI_OPCODE_INTERP_SAMPLE] > 0))
		ctx->lds =
			LLVMAddGlobalInAddressSpace(gallivm->module,
						    LLVMArrayType(ctx->i32, 64),
						    "ddxy_lds",
						    LOCAL_ADDR_SPACE);

	/* Tessellation stages (and the VS feeding them) communicate via LDS. */
	if ((ctx->type == PIPE_SHADER_VERTEX && shader->key.vs.as_ls) ||
	    ctx->type == PIPE_SHADER_TESS_CTRL ||
	    ctx->type == PIPE_SHADER_TESS_EVAL)
		declare_tess_lds(ctx);
}
5832
5833 /**
5834 * Load ESGS and GSVS ring buffer resource descriptors and save the variables
5835 * for later use.
5836 */
5837 static void preload_ring_buffers(struct si_shader_context *ctx)
5838 {
5839 struct gallivm_state *gallivm =
5840 ctx->radeon_bld.soa.bld_base.base.gallivm;
5841
5842 LLVMValueRef buf_ptr = LLVMGetParam(ctx->radeon_bld.main_fn,
5843 SI_PARAM_RW_BUFFERS);
5844
5845 if ((ctx->type == PIPE_SHADER_VERTEX &&
5846 ctx->shader->key.vs.as_es) ||
5847 (ctx->type == PIPE_SHADER_TESS_EVAL &&
5848 ctx->shader->key.tes.as_es) ||
5849 ctx->type == PIPE_SHADER_GEOMETRY) {
5850 unsigned ring =
5851 ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS
5852 : SI_ES_RING_ESGS;
5853 LLVMValueRef offset = lp_build_const_int32(gallivm, ring);
5854
5855 ctx->esgs_ring =
5856 build_indexed_load_const(ctx, buf_ptr, offset);
5857 }
5858
5859 if (ctx->is_gs_copy_shader) {
5860 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_VS_RING_GSVS);
5861
5862 ctx->gsvs_ring[0] =
5863 build_indexed_load_const(ctx, buf_ptr, offset);
5864 }
5865 if (ctx->type == PIPE_SHADER_GEOMETRY) {
5866 int i;
5867 for (i = 0; i < 4; i++) {
5868 LLVMValueRef offset = lp_build_const_int32(gallivm, SI_GS_RING_GSVS0 + i);
5869
5870 ctx->gsvs_ring[i] =
5871 build_indexed_load_const(ctx, buf_ptr, offset);
5872 }
5873 }
5874 }
5875
5876 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
5877 LLVMValueRef param_rw_buffers,
5878 unsigned param_pos_fixed_pt)
5879 {
5880 struct lp_build_tgsi_context *bld_base =
5881 &ctx->radeon_bld.soa.bld_base;
5882 struct gallivm_state *gallivm = bld_base->base.gallivm;
5883 LLVMBuilderRef builder = gallivm->builder;
5884 LLVMValueRef slot, desc, offset, row, bit, address[2];
5885
5886 /* Use the fixed-point gl_FragCoord input.
5887 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
5888 * per coordinate to get the repeating effect.
5889 */
5890 address[0] = unpack_param(ctx, param_pos_fixed_pt, 0, 5);
5891 address[1] = unpack_param(ctx, param_pos_fixed_pt, 16, 5);
5892
5893 /* Load the buffer descriptor. */
5894 slot = lp_build_const_int32(gallivm, SI_PS_CONST_POLY_STIPPLE);
5895 desc = build_indexed_load_const(ctx, param_rw_buffers, slot);
5896
5897 /* The stipple pattern is 32x32, each row has 32 bits. */
5898 offset = LLVMBuildMul(builder, address[1],
5899 LLVMConstInt(ctx->i32, 4, 0), "");
5900 row = buffer_load_const(ctx, desc, offset);
5901 row = LLVMBuildBitCast(builder, row, ctx->i32, "");
5902 bit = LLVMBuildLShr(builder, row, address[0], "");
5903 bit = LLVMBuildTrunc(builder, bit, ctx->i1, "");
5904
5905 /* The intrinsic kills the thread if arg < 0. */
5906 bit = LLVMBuildSelect(builder, bit, LLVMConstReal(ctx->f32, 0),
5907 LLVMConstReal(ctx->f32, -1), "");
5908 lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
5909 }
5910
/**
 * Parse the config section emitted by LLVM for a compiled shader and
 * fill in *conf (register counts, LDS size, scratch size, etc).
 *
 * The config section is a sequence of little-endian (register, value)
 * dword pairs.
 *
 * \param symbol_offset  offset of the symbol whose config block to read;
 *                       a binary may contain config for several symbols.
 */
void si_shader_binary_read_config(struct radeon_shader_binary *binary,
				  struct si_shader_config *conf,
				  unsigned symbol_offset)
{
	unsigned i;
	const unsigned char *config =
		radeon_shader_binary_config_start(binary, symbol_offset);
	bool really_needs_scratch = false;

	/* LLVM adds SGPR spills to the scratch size.
	 * Find out if we really need the scratch buffer.
	 */
	for (i = 0; i < binary->reloc_count; i++) {
		const struct radeon_shader_reloc *reloc = &binary->relocs[i];

		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
			really_needs_scratch = true;
			break;
		}
	}

	/* XXX: We may be able to emit some of these values directly rather than
	 * extracting fields to be emitted later.
	 */

	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
		unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* The RSRC1 registers share a layout, so one case handles
		 * all stages (the G_00B028_* accessors apply to each). */
		case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
		case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
		case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
		case R_00B848_COMPUTE_PGM_RSRC1:
			/* Hardware encodes counts in units of 8 SGPRs / 4 VGPRs. */
			conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
			conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
			conf->float_mode =  G_00B028_FLOAT_MODE(value);
			conf->rsrc1 = value;
			break;
		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
			conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
			break;
		case R_00B84C_COMPUTE_PGM_RSRC2:
			conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
			conf->rsrc2 = value;
			break;
		case R_0286CC_SPI_PS_INPUT_ENA:
			conf->spi_ps_input_ena = value;
			break;
		case R_0286D0_SPI_PS_INPUT_ADDR:
			conf->spi_ps_input_addr = value;
			break;
		case R_0286E8_SPI_TMPRING_SIZE:
		case R_00B860_COMPUTE_TMPRING_SIZE:
			/* WAVESIZE is in units of 256 dwords. */
			if (really_needs_scratch)
				conf->scratch_bytes_per_wave =
					G_00B860_WAVESIZE(value) * 256 * 4;
			break;
		case 0x4: /* SPILLED_SGPRS */
			conf->spilled_sgprs = value;
			break;
		case 0x8: /* SPILLED_VGPRS */
			conf->spilled_vgprs = value;
			break;
		default:
		{
			/* Warn only once per process about unknown registers. */
			static bool printed;

			if (!printed) {
				fprintf(stderr, "Warning: LLVM emitted unknown "
					"config register: 0x%x\n", reg);
				printed = true;
			}
		}
		break;
		}
	}

	/* Fall back to INPUT_ENA when LLVM didn't emit INPUT_ADDR. */
	if (!conf->spi_ps_input_addr)
		conf->spi_ps_input_addr = conf->spi_ps_input_ena;
}
5993
5994 void si_shader_apply_scratch_relocs(struct si_context *sctx,
5995 struct si_shader *shader,
5996 struct si_shader_config *config,
5997 uint64_t scratch_va)
5998 {
5999 unsigned i;
6000 uint32_t scratch_rsrc_dword0 = scratch_va;
6001 uint32_t scratch_rsrc_dword1 =
6002 S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
6003
6004 /* Enable scratch coalescing if LLVM sets ELEMENT_SIZE & INDEX_STRIDE
6005 * correctly.
6006 */
6007 if (HAVE_LLVM >= 0x0309)
6008 scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1);
6009 else
6010 scratch_rsrc_dword1 |=
6011 S_008F04_STRIDE(config->scratch_bytes_per_wave / 64);
6012
6013 for (i = 0 ; i < shader->binary.reloc_count; i++) {
6014 const struct radeon_shader_reloc *reloc =
6015 &shader->binary.relocs[i];
6016 if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
6017 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6018 &scratch_rsrc_dword0, 4);
6019 } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
6020 util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
6021 &scratch_rsrc_dword1, 4);
6022 }
6023 }
6024 }
6025
6026 static unsigned si_get_shader_binary_size(struct si_shader *shader)
6027 {
6028 unsigned size = shader->binary.code_size;
6029
6030 if (shader->prolog)
6031 size += shader->prolog->binary.code_size;
6032 if (shader->epilog)
6033 size += shader->epilog->binary.code_size;
6034 return size;
6035 }
6036
6037 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
6038 {
6039 const struct radeon_shader_binary *prolog =
6040 shader->prolog ? &shader->prolog->binary : NULL;
6041 const struct radeon_shader_binary *epilog =
6042 shader->epilog ? &shader->epilog->binary : NULL;
6043 const struct radeon_shader_binary *mainb = &shader->binary;
6044 unsigned bo_size = si_get_shader_binary_size(shader) +
6045 (!epilog ? mainb->rodata_size : 0);
6046 unsigned char *ptr;
6047
6048 assert(!prolog || !prolog->rodata_size);
6049 assert((!prolog && !epilog) || !mainb->rodata_size);
6050 assert(!epilog || !epilog->rodata_size);
6051
6052 r600_resource_reference(&shader->bo, NULL);
6053 shader->bo = si_resource_create_custom(&sscreen->b.b,
6054 PIPE_USAGE_IMMUTABLE,
6055 bo_size);
6056 if (!shader->bo)
6057 return -ENOMEM;
6058
6059 /* Upload. */
6060 ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
6061 PIPE_TRANSFER_READ_WRITE);
6062
6063 if (prolog) {
6064 util_memcpy_cpu_to_le32(ptr, prolog->code, prolog->code_size);
6065 ptr += prolog->code_size;
6066 }
6067
6068 util_memcpy_cpu_to_le32(ptr, mainb->code, mainb->code_size);
6069 ptr += mainb->code_size;
6070
6071 if (epilog)
6072 util_memcpy_cpu_to_le32(ptr, epilog->code, epilog->code_size);
6073 else if (mainb->rodata_size > 0)
6074 util_memcpy_cpu_to_le32(ptr, mainb->rodata, mainb->rodata_size);
6075
6076 sscreen->b.ws->buffer_unmap(shader->bo->buf);
6077 return 0;
6078 }
6079
/**
 * Print the disassembly of one shader part ("prolog"/"main"/"epilog")
 * to \p file and, when a debug callback is set, also forward it through
 * pipe_debug_message. Falls back to a hex dump of the machine code when
 * no disassembly string is available.
 */
static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary,
				       struct pipe_debug_callback *debug,
				       const char *name, FILE *file)
{
	char *line, *p;
	unsigned i, count;

	if (binary->disasm_string) {
		fprintf(file, "Shader %s disassembly:\n", name);
		fprintf(file, "%s", binary->disasm_string);

		if (debug && debug->debug_message) {
			/* Very long debug messages are cut off, so send the
			 * disassembly one line at a time. This causes more
			 * overhead, but on the plus side it simplifies
			 * parsing of resulting logs.
			 */
			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly Begin");

			line = binary->disasm_string;
			while (*line) {
				/* Find the end of the current line (or of the
				 * string: strchrnul returns the terminator then). */
				p = util_strchrnul(line, '\n');
				count = p - line;

				/* Skip empty lines. */
				if (count) {
					pipe_debug_message(debug, SHADER_INFO,
							   "%.*s", count, line);
				}

				if (!*p)
					break;
				line = p + 1;
			}

			pipe_debug_message(debug, SHADER_INFO,
					   "Shader Disassembly End");
		}
	} else {
		/* No disassembly: dump raw dwords, byte-swapped for display. */
		fprintf(file, "Shader %s binary:\n", name);
		for (i = 0; i < binary->code_size; i += 4) {
			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
				binary->code[i + 3], binary->code[i + 2],
				binary->code[i + 1], binary->code[i]);
		}
	}
}
6127
6128 static void si_shader_dump_stats(struct si_screen *sscreen,
6129 struct si_shader_config *conf,
6130 unsigned num_inputs,
6131 unsigned code_size,
6132 struct pipe_debug_callback *debug,
6133 unsigned processor,
6134 FILE *file)
6135 {
6136 unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
6137 unsigned lds_per_wave = 0;
6138 unsigned max_simd_waves = 10;
6139
6140 /* Compute LDS usage for PS. */
6141 if (processor == PIPE_SHADER_FRAGMENT) {
6142 /* The minimum usage per wave is (num_inputs * 48). The maximum
6143 * usage is (num_inputs * 48 * 16).
6144 * We can get anything in between and it varies between waves.
6145 *
6146 * The 48 bytes per input for a single primitive is equal to
6147 * 4 bytes/component * 4 components/input * 3 points.
6148 *
6149 * Other stages don't know the size at compile time or don't
6150 * allocate LDS per wave, but instead they do it per thread group.
6151 */
6152 lds_per_wave = conf->lds_size * lds_increment +
6153 align(num_inputs * 48, lds_increment);
6154 }
6155
6156 /* Compute the per-SIMD wave counts. */
6157 if (conf->num_sgprs) {
6158 if (sscreen->b.chip_class >= VI)
6159 max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
6160 else
6161 max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
6162 }
6163
6164 if (conf->num_vgprs)
6165 max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
6166
6167 /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
6168 * that PS can use.
6169 */
6170 if (lds_per_wave)
6171 max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
6172
6173 if (file != stderr ||
6174 r600_can_dump_shader(&sscreen->b, processor)) {
6175 if (processor == PIPE_SHADER_FRAGMENT) {
6176 fprintf(file, "*** SHADER CONFIG ***\n"
6177 "SPI_PS_INPUT_ADDR = 0x%04x\n"
6178 "SPI_PS_INPUT_ENA = 0x%04x\n",
6179 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
6180 }
6181
6182 fprintf(file, "*** SHADER STATS ***\n"
6183 "SGPRS: %d\n"
6184 "VGPRS: %d\n"
6185 "Spilled SGPRs: %d\n"
6186 "Spilled VGPRs: %d\n"
6187 "Code Size: %d bytes\n"
6188 "LDS: %d blocks\n"
6189 "Scratch: %d bytes per wave\n"
6190 "Max Waves: %d\n"
6191 "********************\n\n\n",
6192 conf->num_sgprs, conf->num_vgprs,
6193 conf->spilled_sgprs, conf->spilled_vgprs, code_size,
6194 conf->lds_size, conf->scratch_bytes_per_wave,
6195 max_simd_waves);
6196 }
6197
6198 pipe_debug_message(debug, SHADER_INFO,
6199 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
6200 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
6201 "Spilled VGPRs: %d",
6202 conf->num_sgprs, conf->num_vgprs, code_size,
6203 conf->lds_size, conf->scratch_bytes_per_wave,
6204 max_simd_waves, conf->spilled_sgprs,
6205 conf->spilled_vgprs);
6206 }
6207
6208 static const char *si_get_shader_name(struct si_shader *shader,
6209 unsigned processor)
6210 {
6211 switch (processor) {
6212 case PIPE_SHADER_VERTEX:
6213 if (shader->key.vs.as_es)
6214 return "Vertex Shader as ES";
6215 else if (shader->key.vs.as_ls)
6216 return "Vertex Shader as LS";
6217 else
6218 return "Vertex Shader as VS";
6219 case PIPE_SHADER_TESS_CTRL:
6220 return "Tessellation Control Shader";
6221 case PIPE_SHADER_TESS_EVAL:
6222 if (shader->key.tes.as_es)
6223 return "Tessellation Evaluation Shader as ES";
6224 else
6225 return "Tessellation Evaluation Shader as VS";
6226 case PIPE_SHADER_GEOMETRY:
6227 if (shader->gs_copy_shader == NULL)
6228 return "GS Copy Shader as VS";
6229 else
6230 return "Geometry Shader";
6231 case PIPE_SHADER_FRAGMENT:
6232 return "Pixel Shader";
6233 case PIPE_SHADER_COMPUTE:
6234 return "Compute Shader";
6235 default:
6236 return "Unknown Shader";
6237 }
6238 }
6239
6240 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
6241 struct pipe_debug_callback *debug, unsigned processor,
6242 FILE *file)
6243 {
6244 if (file != stderr ||
6245 r600_can_dump_shader(&sscreen->b, processor))
6246 si_dump_shader_key(processor, &shader->key, file);
6247
6248 if (file != stderr && shader->binary.llvm_ir_string) {
6249 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n",
6250 si_get_shader_name(shader, processor));
6251 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
6252 }
6253
6254 if (file != stderr ||
6255 (r600_can_dump_shader(&sscreen->b, processor) &&
6256 !(sscreen->b.debug_flags & DBG_NO_ASM))) {
6257 fprintf(file, "\n%s:\n", si_get_shader_name(shader, processor));
6258
6259 if (shader->prolog)
6260 si_shader_dump_disassembly(&shader->prolog->binary,
6261 debug, "prolog", file);
6262
6263 si_shader_dump_disassembly(&shader->binary, debug, "main", file);
6264
6265 if (shader->epilog)
6266 si_shader_dump_disassembly(&shader->epilog->binary,
6267 debug, "epilog", file);
6268 fprintf(file, "\n");
6269 }
6270
6271 si_shader_dump_stats(sscreen, &shader->config,
6272 shader->selector ? shader->selector->info.num_inputs : 0,
6273 si_get_shader_binary_size(shader), debug, processor,
6274 file);
6275 }
6276
6277 int si_compile_llvm(struct si_screen *sscreen,
6278 struct radeon_shader_binary *binary,
6279 struct si_shader_config *conf,
6280 LLVMTargetMachineRef tm,
6281 LLVMModuleRef mod,
6282 struct pipe_debug_callback *debug,
6283 unsigned processor,
6284 const char *name)
6285 {
6286 int r = 0;
6287 unsigned count = p_atomic_inc_return(&sscreen->b.num_compilations);
6288
6289 if (r600_can_dump_shader(&sscreen->b, processor)) {
6290 fprintf(stderr, "radeonsi: Compiling shader %d\n", count);
6291
6292 if (!(sscreen->b.debug_flags & (DBG_NO_IR | DBG_PREOPT_IR))) {
6293 fprintf(stderr, "%s LLVM IR:\n\n", name);
6294 LLVMDumpModule(mod);
6295 fprintf(stderr, "\n");
6296 }
6297 }
6298
6299 if (sscreen->record_llvm_ir) {
6300 char *ir = LLVMPrintModuleToString(mod);
6301 binary->llvm_ir_string = strdup(ir);
6302 LLVMDisposeMessage(ir);
6303 }
6304
6305 if (!si_replace_shader(count, binary)) {
6306 r = radeon_llvm_compile(mod, binary, tm, debug);
6307 if (r)
6308 return r;
6309 }
6310
6311 si_shader_binary_read_config(binary, conf, 0);
6312
6313 /* Enable 64-bit and 16-bit denormals, because there is no performance
6314 * cost.
6315 *
6316 * If denormals are enabled, all floating-point output modifiers are
6317 * ignored.
6318 *
6319 * Don't enable denormals for 32-bit floats, because:
6320 * - Floating-point output modifiers would be ignored by the hw.
6321 * - Some opcodes don't support denormals, such as v_mad_f32. We would
6322 * have to stop using those.
6323 * - SI & CI would be very slow.
6324 */
6325 conf->float_mode |= V_00B028_FP_64_DENORMS;
6326
6327 FREE(binary->config);
6328 FREE(binary->global_symbol_offsets);
6329 binary->config = NULL;
6330 binary->global_symbol_offsets = NULL;
6331
6332 /* Some shaders can't have rodata because their binaries can be
6333 * concatenated.
6334 */
6335 if (binary->rodata_size &&
6336 (processor == PIPE_SHADER_VERTEX ||
6337 processor == PIPE_SHADER_TESS_CTRL ||
6338 processor == PIPE_SHADER_TESS_EVAL ||
6339 processor == PIPE_SHADER_FRAGMENT)) {
6340 fprintf(stderr, "radeonsi: The shader can't have rodata.");
6341 return -EINVAL;
6342 }
6343
6344 return r;
6345 }
6346
6347 static void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
6348 {
6349 if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
6350 LLVMBuildRetVoid(ctx->radeon_bld.gallivm.builder);
6351 else
6352 LLVMBuildRet(ctx->radeon_bld.gallivm.builder, ret);
6353 }
6354
6355 /* Generate code for the hardware VS shader stage to go with a geometry shader */
6356 static int si_generate_gs_copy_shader(struct si_screen *sscreen,
6357 struct si_shader_context *ctx,
6358 struct si_shader *gs,
6359 struct pipe_debug_callback *debug)
6360 {
6361 struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
6362 struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
6363 struct lp_build_context *uint = &bld_base->uint_bld;
6364 struct si_shader_output_values *outputs;
6365 struct tgsi_shader_info *gsinfo = &gs->selector->info;
6366 LLVMValueRef args[9];
6367 int i, r;
6368
6369 outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
6370
6371 si_init_shader_ctx(ctx, sscreen, ctx->shader, ctx->tm);
6372 ctx->type = PIPE_SHADER_VERTEX;
6373 ctx->is_gs_copy_shader = true;
6374
6375 create_meta_data(ctx);
6376 create_function(ctx);
6377 preload_ring_buffers(ctx);
6378
6379 args[0] = ctx->gsvs_ring[0];
6380 args[1] = lp_build_mul_imm(uint,
6381 LLVMGetParam(ctx->radeon_bld.main_fn,
6382 ctx->param_vertex_id),
6383 4);
6384 args[3] = uint->zero;
6385 args[4] = uint->one; /* OFFEN */
6386 args[5] = uint->zero; /* IDXEN */
6387 args[6] = uint->one; /* GLC */
6388 args[7] = uint->one; /* SLC */
6389 args[8] = uint->zero; /* TFE */
6390
6391 /* Fetch vertex data from GSVS ring */
6392 for (i = 0; i < gsinfo->num_outputs; ++i) {
6393 unsigned chan;
6394
6395 outputs[i].name = gsinfo->output_semantic_name[i];
6396 outputs[i].sid = gsinfo->output_semantic_index[i];
6397
6398 for (chan = 0; chan < 4; chan++) {
6399 args[2] = lp_build_const_int32(gallivm,
6400 (i * 4 + chan) *
6401 gs->selector->gs_max_out_vertices * 16 * 4);
6402
6403 outputs[i].values[chan] =
6404 LLVMBuildBitCast(gallivm->builder,
6405 lp_build_intrinsic(gallivm->builder,
6406 "llvm.SI.buffer.load.dword.i32.i32",
6407 ctx->i32, args, 9,
6408 LLVMReadOnlyAttribute),
6409 ctx->f32, "");
6410 }
6411 }
6412
6413 si_llvm_export_vs(bld_base, outputs, gsinfo->num_outputs);
6414
6415 LLVMBuildRetVoid(gallivm->builder);
6416
6417 /* Dump LLVM IR before any optimization passes */
6418 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6419 r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6420 LLVMDumpModule(bld_base->base.gallivm->module);
6421
6422 radeon_llvm_finalize_module(
6423 &ctx->radeon_bld,
6424 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_GEOMETRY));
6425
6426 r = si_compile_llvm(sscreen, &ctx->shader->binary,
6427 &ctx->shader->config, ctx->tm,
6428 bld_base->base.gallivm->module,
6429 debug, PIPE_SHADER_GEOMETRY,
6430 "GS Copy Shader");
6431 if (!r) {
6432 if (r600_can_dump_shader(&sscreen->b, PIPE_SHADER_GEOMETRY))
6433 fprintf(stderr, "GS Copy Shader:\n");
6434 si_shader_dump(sscreen, ctx->shader, debug,
6435 PIPE_SHADER_GEOMETRY, stderr);
6436 r = si_shader_binary_upload(sscreen, ctx->shader);
6437 }
6438
6439 radeon_llvm_dispose(&ctx->radeon_bld);
6440
6441 FREE(outputs);
6442 return r;
6443 }
6444
/**
 * Print the shader key (the state the shader variant was compiled for)
 * to \p f, with the fields that apply to the given shader stage.
 */
static void si_dump_shader_key(unsigned shader, union si_shader_key *key,
			       FILE *f)
{
	int i;

	fprintf(f, "SHADER KEY\n");

	switch (shader) {
	case PIPE_SHADER_VERTEX:
		fprintf(f, "  instance_divisors = {");
		/* First element without a comma, the rest prefixed with ", ". */
		for (i = 0; i < ARRAY_SIZE(key->vs.prolog.instance_divisors); i++)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.prolog.instance_divisors[i]);
		fprintf(f, "}\n");
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.epilog.export_prim_id);
		break;

	case PIPE_SHADER_TESS_CTRL:
		fprintf(f, "  prim_mode = %u\n", key->tcs.epilog.prim_mode);
		break;

	case PIPE_SHADER_TESS_EVAL:
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.epilog.export_prim_id);
		break;

	/* GS and CS have no key fields to print. */
	case PIPE_SHADER_GEOMETRY:
	case PIPE_SHADER_COMPUTE:
		break;

	case PIPE_SHADER_FRAGMENT:
		fprintf(f, "  prolog.color_two_side = %u\n", key->ps.prolog.color_two_side);
		fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.prolog.flatshade_colors);
		fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.prolog.poly_stipple);
		fprintf(f, "  prolog.force_persp_sample_interp = %u\n", key->ps.prolog.force_persp_sample_interp);
		fprintf(f, "  prolog.force_linear_sample_interp = %u\n", key->ps.prolog.force_linear_sample_interp);
		fprintf(f, "  prolog.force_persp_center_interp = %u\n", key->ps.prolog.force_persp_center_interp);
		fprintf(f, "  prolog.force_linear_center_interp = %u\n", key->ps.prolog.force_linear_center_interp);
		fprintf(f, "  prolog.bc_optimize_for_persp = %u\n", key->ps.prolog.bc_optimize_for_persp);
		fprintf(f, "  prolog.bc_optimize_for_linear = %u\n", key->ps.prolog.bc_optimize_for_linear);
		fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n", key->ps.epilog.spi_shader_col_format);
		fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.epilog.color_is_int8);
		fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.epilog.last_cbuf);
		fprintf(f, "  epilog.alpha_func = %u\n", key->ps.epilog.alpha_func);
		fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.epilog.alpha_to_one);
		fprintf(f, "  epilog.poly_line_smoothing = %u\n", key->ps.epilog.poly_line_smoothing);
		fprintf(f, "  epilog.clamp_color = %u\n", key->ps.epilog.clamp_color);
		break;

	default:
		assert(0);
	}
}
6500
6501 static void si_init_shader_ctx(struct si_shader_context *ctx,
6502 struct si_screen *sscreen,
6503 struct si_shader *shader,
6504 LLVMTargetMachineRef tm)
6505 {
6506 struct lp_build_tgsi_context *bld_base;
6507 struct lp_build_tgsi_action tmpl = {};
6508
6509 memset(ctx, 0, sizeof(*ctx));
6510 radeon_llvm_context_init(
6511 &ctx->radeon_bld, "amdgcn--",
6512 (shader && shader->selector) ? &shader->selector->info : NULL,
6513 (shader && shader->selector) ? shader->selector->tokens : NULL);
6514 si_shader_context_init_alu(&ctx->radeon_bld.soa.bld_base);
6515 ctx->tm = tm;
6516 ctx->screen = sscreen;
6517 if (shader && shader->selector)
6518 ctx->type = shader->selector->info.processor;
6519 else
6520 ctx->type = -1;
6521 ctx->shader = shader;
6522
6523 ctx->voidt = LLVMVoidTypeInContext(ctx->radeon_bld.gallivm.context);
6524 ctx->i1 = LLVMInt1TypeInContext(ctx->radeon_bld.gallivm.context);
6525 ctx->i8 = LLVMInt8TypeInContext(ctx->radeon_bld.gallivm.context);
6526 ctx->i32 = LLVMInt32TypeInContext(ctx->radeon_bld.gallivm.context);
6527 ctx->i64 = LLVMInt64TypeInContext(ctx->radeon_bld.gallivm.context);
6528 ctx->i128 = LLVMIntTypeInContext(ctx->radeon_bld.gallivm.context, 128);
6529 ctx->f32 = LLVMFloatTypeInContext(ctx->radeon_bld.gallivm.context);
6530 ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
6531 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
6532 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
6533 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
6534 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
6535
6536 bld_base = &ctx->radeon_bld.soa.bld_base;
6537 bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
6538
6539 bld_base->op_actions[TGSI_OPCODE_INTERP_CENTROID] = interp_action;
6540 bld_base->op_actions[TGSI_OPCODE_INTERP_SAMPLE] = interp_action;
6541 bld_base->op_actions[TGSI_OPCODE_INTERP_OFFSET] = interp_action;
6542
6543 bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
6544 bld_base->op_actions[TGSI_OPCODE_TEX2] = tex_action;
6545 bld_base->op_actions[TGSI_OPCODE_TXB] = tex_action;
6546 bld_base->op_actions[TGSI_OPCODE_TXB2] = tex_action;
6547 bld_base->op_actions[TGSI_OPCODE_TXD] = tex_action;
6548 bld_base->op_actions[TGSI_OPCODE_TXF] = tex_action;
6549 bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
6550 bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
6551 bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
6552 bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
6553 bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
6554 bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
6555 bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
6556 bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
6557
6558 bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
6559 bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
6560 bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
6561 bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
6562 bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
6563 bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
6564
6565 tmpl.fetch_args = atomic_fetch_args;
6566 tmpl.emit = atomic_emit;
6567 bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
6568 bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
6569 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
6570 bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
6571 bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
6572 bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
6573 bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
6574 bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
6575 bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
6576 bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
6577 bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
6578 bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
6579 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
6580 bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
6581 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
6582 bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
6583 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
6584 bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
6585 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
6586 bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
6587
6588 bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
6589
6590 bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
6591 bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
6592 bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
6593 bld_base->op_actions[TGSI_OPCODE_DDY_FINE].emit = si_llvm_emit_ddxy;
6594
6595 bld_base->op_actions[TGSI_OPCODE_EMIT].emit = si_llvm_emit_vertex;
6596 bld_base->op_actions[TGSI_OPCODE_ENDPRIM].emit = si_llvm_emit_primitive;
6597 bld_base->op_actions[TGSI_OPCODE_BARRIER].emit = si_llvm_emit_barrier;
6598 }
6599
6600 int si_compile_tgsi_shader(struct si_screen *sscreen,
6601 LLVMTargetMachineRef tm,
6602 struct si_shader *shader,
6603 bool is_monolithic,
6604 struct pipe_debug_callback *debug)
6605 {
6606 struct si_shader_selector *sel = shader->selector;
6607 struct si_shader_context ctx;
6608 struct lp_build_tgsi_context *bld_base;
6609 LLVMModuleRef mod;
6610 int r = 0;
6611
6612 /* Dump TGSI code before doing TGSI->LLVM conversion in case the
6613 * conversion fails. */
6614 if (r600_can_dump_shader(&sscreen->b, sel->info.processor) &&
6615 !(sscreen->b.debug_flags & DBG_NO_TGSI)) {
6616 tgsi_dump(sel->tokens, 0);
6617 si_dump_streamout(&sel->so);
6618 }
6619
6620 si_init_shader_ctx(&ctx, sscreen, shader, tm);
6621 ctx.is_monolithic = is_monolithic;
6622
6623 shader->info.uses_instanceid = sel->info.uses_instanceid;
6624
6625 bld_base = &ctx.radeon_bld.soa.bld_base;
6626 ctx.radeon_bld.load_system_value = declare_system_value;
6627
6628 switch (ctx.type) {
6629 case PIPE_SHADER_VERTEX:
6630 ctx.radeon_bld.load_input = declare_input_vs;
6631 if (shader->key.vs.as_ls)
6632 bld_base->emit_epilogue = si_llvm_emit_ls_epilogue;
6633 else if (shader->key.vs.as_es)
6634 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6635 else
6636 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6637 break;
6638 case PIPE_SHADER_TESS_CTRL:
6639 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tcs;
6640 bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = fetch_output_tcs;
6641 bld_base->emit_store = store_output_tcs;
6642 bld_base->emit_epilogue = si_llvm_emit_tcs_epilogue;
6643 break;
6644 case PIPE_SHADER_TESS_EVAL:
6645 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_tes;
6646 if (shader->key.tes.as_es)
6647 bld_base->emit_epilogue = si_llvm_emit_es_epilogue;
6648 else
6649 bld_base->emit_epilogue = si_llvm_emit_vs_epilogue;
6650 break;
6651 case PIPE_SHADER_GEOMETRY:
6652 bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = fetch_input_gs;
6653 bld_base->emit_epilogue = si_llvm_emit_gs_epilogue;
6654 break;
6655 case PIPE_SHADER_FRAGMENT:
6656 ctx.radeon_bld.load_input = declare_input_fs;
6657 if (is_monolithic)
6658 bld_base->emit_epilogue = si_llvm_emit_fs_epilogue;
6659 else
6660 bld_base->emit_epilogue = si_llvm_return_fs_outputs;
6661 break;
6662 case PIPE_SHADER_COMPUTE:
6663 ctx.radeon_bld.declare_memory_region = declare_compute_memory;
6664 break;
6665 default:
6666 assert(!"Unsupported shader type");
6667 return -1;
6668 }
6669
6670 create_meta_data(&ctx);
6671 create_function(&ctx);
6672 preload_ring_buffers(&ctx);
6673
6674 if (ctx.is_monolithic && sel->type == PIPE_SHADER_FRAGMENT &&
6675 shader->key.ps.prolog.poly_stipple) {
6676 LLVMValueRef list = LLVMGetParam(ctx.radeon_bld.main_fn,
6677 SI_PARAM_RW_BUFFERS);
6678 si_llvm_emit_polygon_stipple(&ctx, list,
6679 SI_PARAM_POS_FIXED_PT);
6680 }
6681
6682 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6683 int i;
6684 for (i = 0; i < 4; i++) {
6685 ctx.gs_next_vertex[i] =
6686 lp_build_alloca(bld_base->base.gallivm,
6687 ctx.i32, "");
6688 }
6689 }
6690
6691 if (!lp_build_tgsi_llvm(bld_base, sel->tokens)) {
6692 fprintf(stderr, "Failed to translate shader from TGSI to LLVM\n");
6693 goto out;
6694 }
6695
6696 si_llvm_build_ret(&ctx, ctx.return_value);
6697 mod = bld_base->base.gallivm->module;
6698
6699 /* Dump LLVM IR before any optimization passes */
6700 if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
6701 r600_can_dump_shader(&sscreen->b, ctx.type))
6702 LLVMDumpModule(mod);
6703
6704 radeon_llvm_finalize_module(
6705 &ctx.radeon_bld,
6706 r600_extra_shader_checks(&sscreen->b, ctx.type));
6707
6708 r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
6709 mod, debug, ctx.type, "TGSI shader");
6710 if (r) {
6711 fprintf(stderr, "LLVM failed to compile shader\n");
6712 goto out;
6713 }
6714
6715 radeon_llvm_dispose(&ctx.radeon_bld);
6716
6717 /* Validate SGPR and VGPR usage for compute to detect compiler bugs.
6718 * LLVM 3.9svn has this bug.
6719 */
6720 if (sel->type == PIPE_SHADER_COMPUTE) {
6721 unsigned *props = sel->info.properties;
6722 unsigned wave_size = 64;
6723 unsigned max_vgprs = 256;
6724 unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
6725 unsigned max_sgprs_per_wave = 128;
6726 unsigned max_block_threads;
6727
6728 if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH])
6729 max_block_threads = props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
6730 props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
6731 props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
6732 else
6733 max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK;
6734
6735 unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
6736 unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
6737
6738 max_vgprs = max_vgprs / min_waves_per_simd;
6739 max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
6740
6741 if (shader->config.num_sgprs > max_sgprs ||
6742 shader->config.num_vgprs > max_vgprs) {
6743 fprintf(stderr, "LLVM failed to compile a shader correctly: "
6744 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
6745 shader->config.num_sgprs, shader->config.num_vgprs,
6746 max_sgprs, max_vgprs);
6747
6748 /* Just terminate the process, because dependent
6749 * shaders can hang due to bad input data, but use
6750 * the env var to allow shader-db to work.
6751 */
6752 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
6753 abort();
6754 }
6755 }
6756
6757 /* Add the scratch offset to input SGPRs. */
6758 if (shader->config.scratch_bytes_per_wave)
6759 shader->info.num_input_sgprs += 1; /* scratch byte offset */
6760
6761 /* Calculate the number of fragment input VGPRs. */
6762 if (ctx.type == PIPE_SHADER_FRAGMENT) {
6763 shader->info.num_input_vgprs = 0;
6764 shader->info.face_vgpr_index = -1;
6765
6766 if (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6767 shader->info.num_input_vgprs += 2;
6768 if (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr))
6769 shader->info.num_input_vgprs += 2;
6770 if (G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_addr))
6771 shader->info.num_input_vgprs += 2;
6772 if (G_0286CC_PERSP_PULL_MODEL_ENA(shader->config.spi_ps_input_addr))
6773 shader->info.num_input_vgprs += 3;
6774 if (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_addr))
6775 shader->info.num_input_vgprs += 2;
6776 if (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr))
6777 shader->info.num_input_vgprs += 2;
6778 if (G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_addr))
6779 shader->info.num_input_vgprs += 2;
6780 if (G_0286CC_LINE_STIPPLE_TEX_ENA(shader->config.spi_ps_input_addr))
6781 shader->info.num_input_vgprs += 1;
6782 if (G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_addr))
6783 shader->info.num_input_vgprs += 1;
6784 if (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_addr))
6785 shader->info.num_input_vgprs += 1;
6786 if (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_addr))
6787 shader->info.num_input_vgprs += 1;
6788 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_addr))
6789 shader->info.num_input_vgprs += 1;
6790 if (G_0286CC_FRONT_FACE_ENA(shader->config.spi_ps_input_addr)) {
6791 shader->info.face_vgpr_index = shader->info.num_input_vgprs;
6792 shader->info.num_input_vgprs += 1;
6793 }
6794 if (G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr))
6795 shader->info.num_input_vgprs += 1;
6796 if (G_0286CC_SAMPLE_COVERAGE_ENA(shader->config.spi_ps_input_addr))
6797 shader->info.num_input_vgprs += 1;
6798 if (G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr))
6799 shader->info.num_input_vgprs += 1;
6800 }
6801
6802 if (ctx.type == PIPE_SHADER_GEOMETRY) {
6803 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
6804 shader->gs_copy_shader->selector = shader->selector;
6805 ctx.shader = shader->gs_copy_shader;
6806 if ((r = si_generate_gs_copy_shader(sscreen, &ctx,
6807 shader, debug))) {
6808 free(shader->gs_copy_shader);
6809 shader->gs_copy_shader = NULL;
6810 goto out;
6811 }
6812 }
6813
6814 out:
6815 return r;
6816 }
6817
6818 /**
6819 * Create, compile and return a shader part (prolog or epilog).
6820 *
6821 * \param sscreen screen
6822 * \param list list of shader parts of the same category
6823 * \param key shader part key
6824 * \param tm LLVM target machine
6825 * \param debug debug callback
6826 * \param compile the callback responsible for compilation
6827 * \return non-NULL on success
6828 */
6829 static struct si_shader_part *
6830 si_get_shader_part(struct si_screen *sscreen,
6831 struct si_shader_part **list,
6832 union si_shader_part_key *key,
6833 LLVMTargetMachineRef tm,
6834 struct pipe_debug_callback *debug,
6835 bool (*compile)(struct si_screen *,
6836 LLVMTargetMachineRef,
6837 struct pipe_debug_callback *,
6838 struct si_shader_part *))
6839 {
6840 struct si_shader_part *result;
6841
6842 pipe_mutex_lock(sscreen->shader_parts_mutex);
6843
6844 /* Find existing. */
6845 for (result = *list; result; result = result->next) {
6846 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
6847 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6848 return result;
6849 }
6850 }
6851
6852 /* Compile a new one. */
6853 result = CALLOC_STRUCT(si_shader_part);
6854 result->key = *key;
6855 if (!compile(sscreen, tm, debug, result)) {
6856 FREE(result);
6857 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6858 return NULL;
6859 }
6860
6861 result->next = *list;
6862 *list = result;
6863 pipe_mutex_unlock(sscreen->shader_parts_mutex);
6864 return result;
6865 }
6866
6867 /**
6868 * Create a vertex shader prolog.
6869 *
6870 * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
6871 * All inputs are returned unmodified. The vertex load indices are
6872 * stored after them, which will used by the API VS for fetching inputs.
6873 *
6874 * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
6875 * input_v0,
6876 * input_v1,
6877 * input_v2,
6878 * input_v3,
6879 * (VertexID + BaseVertex),
6880 * (InstanceID + StartInstance),
6881 * (InstanceID / 2 + StartInstance)
6882 */
6883 static bool si_compile_vs_prolog(struct si_screen *sscreen,
6884 LLVMTargetMachineRef tm,
6885 struct pipe_debug_callback *debug,
6886 struct si_shader_part *out)
6887 {
6888 union si_shader_part_key *key = &out->key;
6889 struct si_shader shader = {};
6890 struct si_shader_context ctx;
6891 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
6892 LLVMTypeRef *params, *returns;
6893 LLVMValueRef ret, func;
6894 int last_sgpr, num_params, num_returns, i;
6895 bool status = true;
6896
6897 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
6898 ctx.type = PIPE_SHADER_VERTEX;
6899 ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
6900 ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
6901
6902 /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
6903 params = alloca((key->vs_prolog.num_input_sgprs + 4) *
6904 sizeof(LLVMTypeRef));
6905 returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
6906 key->vs_prolog.last_input + 1) *
6907 sizeof(LLVMTypeRef));
6908 num_params = 0;
6909 num_returns = 0;
6910
6911 /* Declare input and output SGPRs. */
6912 num_params = 0;
6913 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6914 params[num_params++] = ctx.i32;
6915 returns[num_returns++] = ctx.i32;
6916 }
6917 last_sgpr = num_params - 1;
6918
6919 /* 4 preloaded VGPRs (outputs must be floats) */
6920 for (i = 0; i < 4; i++) {
6921 params[num_params++] = ctx.i32;
6922 returns[num_returns++] = ctx.f32;
6923 }
6924
6925 /* Vertex load indices. */
6926 for (i = 0; i <= key->vs_prolog.last_input; i++)
6927 returns[num_returns++] = ctx.f32;
6928
6929 /* Create the function. */
6930 si_create_function(&ctx, returns, num_returns, params,
6931 num_params, last_sgpr);
6932 func = ctx.radeon_bld.main_fn;
6933
6934 /* Copy inputs to outputs. This should be no-op, as the registers match,
6935 * but it will prevent the compiler from overwriting them unintentionally.
6936 */
6937 ret = ctx.return_value;
6938 for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
6939 LLVMValueRef p = LLVMGetParam(func, i);
6940 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6941 }
6942 for (i = num_params - 4; i < num_params; i++) {
6943 LLVMValueRef p = LLVMGetParam(func, i);
6944 p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
6945 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
6946 }
6947
6948 /* Compute vertex load indices from instance divisors. */
6949 for (i = 0; i <= key->vs_prolog.last_input; i++) {
6950 unsigned divisor = key->vs_prolog.states.instance_divisors[i];
6951 LLVMValueRef index;
6952
6953 if (divisor) {
6954 /* InstanceID / Divisor + StartInstance */
6955 index = get_instance_index_for_fetch(&ctx.radeon_bld,
6956 SI_SGPR_START_INSTANCE,
6957 divisor);
6958 } else {
6959 /* VertexID + BaseVertex */
6960 index = LLVMBuildAdd(gallivm->builder,
6961 LLVMGetParam(func, ctx.param_vertex_id),
6962 LLVMGetParam(func, SI_SGPR_BASE_VERTEX), "");
6963 }
6964
6965 index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
6966 ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
6967 num_params++, "");
6968 }
6969
6970 /* Compile. */
6971 si_llvm_build_ret(&ctx, ret);
6972 radeon_llvm_finalize_module(
6973 &ctx.radeon_bld,
6974 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_VERTEX));
6975
6976 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
6977 gallivm->module, debug, ctx.type,
6978 "Vertex Shader Prolog"))
6979 status = false;
6980
6981 radeon_llvm_dispose(&ctx.radeon_bld);
6982 return status;
6983 }
6984
/**
 * Compile the vertex shader epilog. This is also used by the tessellation
 * evaluation shader compiled as VS.
 *
 * The input is PrimitiveID.
 *
 * If PrimitiveID is required by the pixel shader, export it.
 * Otherwise, do nothing.
 */
static bool si_compile_vs_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader_context ctx;
	/* Address computation only; ctx is initialized by si_init_shader_ctx
	 * below, so these pointers are not dereferenced until then. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[5];
	int num_params, i;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, NULL, tm);
	ctx.type = PIPE_SHADER_VERTEX;

	/* Declare input VGPRs. If PrimitiveID isn't exported, the epilog
	 * takes no inputs at all. */
	num_params = key->vs_epilog.states.export_prim_id ?
			(VS_EPILOG_PRIMID_LOC + 1) : 0;
	assert(num_params <= ARRAY_SIZE(params));

	for (i = 0; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. No return values, no SGPR inputs (-1). */
	si_create_function(&ctx, NULL, 0, params, num_params, -1);

	/* Emit exports. */
	if (key->vs_epilog.states.export_prim_id) {
		struct lp_build_context *base = &bld_base->base;
		struct lp_build_context *uint = &bld_base->uint_bld;
		LLVMValueRef args[9];

		/* NOTE(review): the enabled-channel mask is 0 even though X
		 * is written below — presumably intentional for this PARAM
		 * export; confirm against the GCN export encoding. */
		args[0] = lp_build_const_int32(base->gallivm, 0x0); /* enabled channels */
		args[1] = uint->zero; /* whether the EXEC mask is valid */
		args[2] = uint->zero; /* DONE bit */
		args[3] = lp_build_const_int32(base->gallivm, V_008DFC_SQ_EXP_PARAM +
					       key->vs_epilog.prim_id_param_offset);
		args[4] = uint->zero; /* COMPR flag (0 = 32-bit export) */
		args[5] = LLVMGetParam(ctx.radeon_bld.main_fn,
				       VS_EPILOG_PRIMID_LOC); /* X */
		args[6] = uint->undef; /* Y */
		args[7] = uint->undef; /* Z */
		args[8] = uint->undef; /* W */

		lp_build_intrinsic(base->gallivm->builder, "llvm.SI.export",
				   LLVMVoidTypeInContext(base->gallivm->context),
				   args, 9, 0);
	}

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(
		&ctx.radeon_bld,
		r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_VERTEX));

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Vertex Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7058
7059 /**
7060 * Create & compile a vertex shader epilog. This a helper used by VS and TES.
7061 */
7062 static bool si_get_vs_epilog(struct si_screen *sscreen,
7063 LLVMTargetMachineRef tm,
7064 struct si_shader *shader,
7065 struct pipe_debug_callback *debug,
7066 struct si_vs_epilog_bits *states)
7067 {
7068 union si_shader_part_key epilog_key;
7069
7070 memset(&epilog_key, 0, sizeof(epilog_key));
7071 epilog_key.vs_epilog.states = *states;
7072
7073 /* Set up the PrimitiveID output. */
7074 if (shader->key.vs.epilog.export_prim_id) {
7075 unsigned index = shader->selector->info.num_outputs;
7076 unsigned offset = shader->info.nr_param_exports++;
7077
7078 epilog_key.vs_epilog.prim_id_param_offset = offset;
7079 assert(index < ARRAY_SIZE(shader->info.vs_output_param_offset));
7080 shader->info.vs_output_param_offset[index] = offset;
7081 }
7082
7083 shader->epilog = si_get_shader_part(sscreen, &sscreen->vs_epilogs,
7084 &epilog_key, tm, debug,
7085 si_compile_vs_epilog);
7086 return shader->epilog != NULL;
7087 }
7088
7089 /**
7090 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
7091 */
7092 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
7093 LLVMTargetMachineRef tm,
7094 struct si_shader *shader,
7095 struct pipe_debug_callback *debug)
7096 {
7097 struct tgsi_shader_info *info = &shader->selector->info;
7098 union si_shader_part_key prolog_key;
7099 unsigned i;
7100
7101 /* Get the prolog. */
7102 memset(&prolog_key, 0, sizeof(prolog_key));
7103 prolog_key.vs_prolog.states = shader->key.vs.prolog;
7104 prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7105 prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
7106
7107 /* The prolog is a no-op if there are no inputs. */
7108 if (info->num_inputs) {
7109 shader->prolog =
7110 si_get_shader_part(sscreen, &sscreen->vs_prologs,
7111 &prolog_key, tm, debug,
7112 si_compile_vs_prolog);
7113 if (!shader->prolog)
7114 return false;
7115 }
7116
7117 /* Get the epilog. */
7118 if (!shader->key.vs.as_es && !shader->key.vs.as_ls &&
7119 !si_get_vs_epilog(sscreen, tm, shader, debug,
7120 &shader->key.vs.epilog))
7121 return false;
7122
7123 /* Set the instanceID flag. */
7124 for (i = 0; i < info->num_inputs; i++)
7125 if (prolog_key.vs_prolog.states.instance_divisors[i])
7126 shader->info.uses_instanceid = true;
7127
7128 return true;
7129 }
7130
7131 /**
7132 * Select and compile (or reuse) TES parts (epilog).
7133 */
7134 static bool si_shader_select_tes_parts(struct si_screen *sscreen,
7135 LLVMTargetMachineRef tm,
7136 struct si_shader *shader,
7137 struct pipe_debug_callback *debug)
7138 {
7139 if (shader->key.tes.as_es)
7140 return true;
7141
7142 /* TES compiled as VS. */
7143 return si_get_vs_epilog(sscreen, tm, shader, debug,
7144 &shader->key.tes.epilog);
7145 }
7146
/**
 * Compile the TCS epilog. This writes tesselation factors to memory based on
 * the output primitive type of the tesselator (determined by TES).
 */
static bool si_compile_tcs_epilog(struct si_screen *sscreen,
				  LLVMTargetMachineRef tm,
				  struct pipe_debug_callback *debug,
				  struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Address computation only; ctx is initialized by si_init_shader_ctx
	 * below, so these pointers are not dereferenced until then. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	LLVMTypeRef params[16];
	LLVMValueRef func;
	int last_sgpr, num_params;
	bool status = true;

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_TESS_CTRL;
	shader.key.tcs.epilog = key->tcs_epilog.states;

	/* Declare inputs. Only RW_BUFFERS and TESS_FACTOR_OFFSET are used.
	 * The full SGPR layout must still be declared so the parameter
	 * indices match the TCS main part. */
	params[SI_PARAM_RW_BUFFERS] = const_array(ctx.v16i8, SI_NUM_RW_BUFFERS);
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_TCS_OFFCHIP_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
	params[ctx.param_oc_lds = SI_PARAM_TCS_OC_LDS] = ctx.i32;
	params[SI_PARAM_TESS_FACTOR_OFFSET] = ctx.i32;
	last_sgpr = SI_PARAM_TESS_FACTOR_OFFSET;
	num_params = last_sgpr + 1;

	/* VGPR inputs follow the SGPRs. */
	params[num_params++] = ctx.i32; /* patch index within the wave (REL_PATCH_ID) */
	params[num_params++] = ctx.i32; /* invocation ID within the patch */
	params[num_params++] = ctx.i32; /* LDS offset where tess factors should be loaded from */

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	declare_tess_lds(&ctx);
	func = ctx.radeon_bld.main_fn;

	/* Write out the tess factors; the three VGPRs declared above are
	 * passed through directly. */
	si_write_tess_factors(bld_base,
			      LLVMGetParam(func, last_sgpr + 1),
			      LLVMGetParam(func, last_sgpr + 2),
			      LLVMGetParam(func, last_sgpr + 3));

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(
		&ctx.radeon_bld,
		r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_TESS_CTRL));

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Tessellation Control Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7213
7214 /**
7215 * Select and compile (or reuse) TCS parts (epilog).
7216 */
7217 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
7218 LLVMTargetMachineRef tm,
7219 struct si_shader *shader,
7220 struct pipe_debug_callback *debug)
7221 {
7222 union si_shader_part_key epilog_key;
7223
7224 /* Get the epilog. */
7225 memset(&epilog_key, 0, sizeof(epilog_key));
7226 epilog_key.tcs_epilog.states = shader->key.tcs.epilog;
7227
7228 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
7229 &epilog_key, tm, debug,
7230 si_compile_tcs_epilog);
7231 return shader->epilog != NULL;
7232 }
7233
7234 /**
7235 * Compile the pixel shader prolog. This handles:
7236 * - two-side color selection and interpolation
7237 * - overriding interpolation parameters for the API PS
7238 * - polygon stippling
7239 *
7240 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
7241 * overriden by other states. (e.g. per-sample interpolation)
7242 * Interpolated colors are stored after the preloaded VGPRs.
7243 */
7244 static bool si_compile_ps_prolog(struct si_screen *sscreen,
7245 LLVMTargetMachineRef tm,
7246 struct pipe_debug_callback *debug,
7247 struct si_shader_part *out)
7248 {
7249 union si_shader_part_key *key = &out->key;
7250 struct si_shader shader = {};
7251 struct si_shader_context ctx;
7252 struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
7253 LLVMTypeRef *params;
7254 LLVMValueRef ret, func;
7255 int last_sgpr, num_params, num_returns, i, num_color_channels;
7256 bool status = true;
7257
7258 si_init_shader_ctx(&ctx, sscreen, &shader, tm);
7259 ctx.type = PIPE_SHADER_FRAGMENT;
7260 shader.key.ps.prolog = key->ps_prolog.states;
7261
7262 /* Number of inputs + 8 color elements. */
7263 params = alloca((key->ps_prolog.num_input_sgprs +
7264 key->ps_prolog.num_input_vgprs + 8) *
7265 sizeof(LLVMTypeRef));
7266
7267 /* Declare inputs. */
7268 num_params = 0;
7269 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++)
7270 params[num_params++] = ctx.i32;
7271 last_sgpr = num_params - 1;
7272
7273 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++)
7274 params[num_params++] = ctx.f32;
7275
7276 /* Declare outputs (same as inputs + add colors if needed) */
7277 num_returns = num_params;
7278 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
7279 for (i = 0; i < num_color_channels; i++)
7280 params[num_returns++] = ctx.f32;
7281
7282 /* Create the function. */
7283 si_create_function(&ctx, params, num_returns, params,
7284 num_params, last_sgpr);
7285 func = ctx.radeon_bld.main_fn;
7286
7287 /* Copy inputs to outputs. This should be no-op, as the registers match,
7288 * but it will prevent the compiler from overwriting them unintentionally.
7289 */
7290 ret = ctx.return_value;
7291 for (i = 0; i < num_params; i++) {
7292 LLVMValueRef p = LLVMGetParam(func, i);
7293 ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
7294 }
7295
7296 /* Polygon stippling. */
7297 if (key->ps_prolog.states.poly_stipple) {
7298 /* POS_FIXED_PT is always last. */
7299 unsigned pos = key->ps_prolog.num_input_sgprs +
7300 key->ps_prolog.num_input_vgprs - 1;
7301 LLVMValueRef ptr[2], list;
7302
7303 /* Get the pointer to rw buffers. */
7304 ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
7305 ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
7306 list = lp_build_gather_values(gallivm, ptr, 2);
7307 list = LLVMBuildBitCast(gallivm->builder, list, ctx.i64, "");
7308 list = LLVMBuildIntToPtr(gallivm->builder, list,
7309 const_array(ctx.v16i8, SI_NUM_RW_BUFFERS), "");
7310
7311 si_llvm_emit_polygon_stipple(&ctx, list, pos);
7312 }
7313
7314 if (key->ps_prolog.states.bc_optimize_for_persp ||
7315 key->ps_prolog.states.bc_optimize_for_linear) {
7316 unsigned i, base = key->ps_prolog.num_input_sgprs;
7317 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
7318
7319 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
7320 * The hw doesn't compute CENTROID if the whole wave only
7321 * contains fully-covered quads.
7322 *
7323 * PRIM_MASK is after user SGPRs.
7324 */
7325 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7326 bc_optimize = LLVMBuildLShr(gallivm->builder, bc_optimize,
7327 LLVMConstInt(ctx.i32, 31, 0), "");
7328 bc_optimize = LLVMBuildTrunc(gallivm->builder, bc_optimize,
7329 ctx.i1, "");
7330
7331 if (key->ps_prolog.states.bc_optimize_for_persp) {
7332 /* Read PERSP_CENTER. */
7333 for (i = 0; i < 2; i++)
7334 center[i] = LLVMGetParam(func, base + 2 + i);
7335 /* Read PERSP_CENTROID. */
7336 for (i = 0; i < 2; i++)
7337 centroid[i] = LLVMGetParam(func, base + 4 + i);
7338 /* Select PERSP_CENTROID. */
7339 for (i = 0; i < 2; i++) {
7340 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7341 center[i], centroid[i], "");
7342 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7343 tmp, base + 4 + i, "");
7344 }
7345 }
7346 if (key->ps_prolog.states.bc_optimize_for_linear) {
7347 /* Read LINEAR_CENTER. */
7348 for (i = 0; i < 2; i++)
7349 center[i] = LLVMGetParam(func, base + 8 + i);
7350 /* Read LINEAR_CENTROID. */
7351 for (i = 0; i < 2; i++)
7352 centroid[i] = LLVMGetParam(func, base + 10 + i);
7353 /* Select LINEAR_CENTROID. */
7354 for (i = 0; i < 2; i++) {
7355 tmp = LLVMBuildSelect(gallivm->builder, bc_optimize,
7356 center[i], centroid[i], "");
7357 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7358 tmp, base + 10 + i, "");
7359 }
7360 }
7361 }
7362
7363 /* Force per-sample interpolation. */
7364 if (key->ps_prolog.states.force_persp_sample_interp) {
7365 unsigned i, base = key->ps_prolog.num_input_sgprs;
7366 LLVMValueRef persp_sample[2];
7367
7368 /* Read PERSP_SAMPLE. */
7369 for (i = 0; i < 2; i++)
7370 persp_sample[i] = LLVMGetParam(func, base + i);
7371 /* Overwrite PERSP_CENTER. */
7372 for (i = 0; i < 2; i++)
7373 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7374 persp_sample[i], base + 2 + i, "");
7375 /* Overwrite PERSP_CENTROID. */
7376 for (i = 0; i < 2; i++)
7377 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7378 persp_sample[i], base + 4 + i, "");
7379 }
7380 if (key->ps_prolog.states.force_linear_sample_interp) {
7381 unsigned i, base = key->ps_prolog.num_input_sgprs;
7382 LLVMValueRef linear_sample[2];
7383
7384 /* Read LINEAR_SAMPLE. */
7385 for (i = 0; i < 2; i++)
7386 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
7387 /* Overwrite LINEAR_CENTER. */
7388 for (i = 0; i < 2; i++)
7389 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7390 linear_sample[i], base + 8 + i, "");
7391 /* Overwrite LINEAR_CENTROID. */
7392 for (i = 0; i < 2; i++)
7393 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7394 linear_sample[i], base + 10 + i, "");
7395 }
7396
7397 /* Force center interpolation. */
7398 if (key->ps_prolog.states.force_persp_center_interp) {
7399 unsigned i, base = key->ps_prolog.num_input_sgprs;
7400 LLVMValueRef persp_center[2];
7401
7402 /* Read PERSP_CENTER. */
7403 for (i = 0; i < 2; i++)
7404 persp_center[i] = LLVMGetParam(func, base + 2 + i);
7405 /* Overwrite PERSP_SAMPLE. */
7406 for (i = 0; i < 2; i++)
7407 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7408 persp_center[i], base + i, "");
7409 /* Overwrite PERSP_CENTROID. */
7410 for (i = 0; i < 2; i++)
7411 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7412 persp_center[i], base + 4 + i, "");
7413 }
7414 if (key->ps_prolog.states.force_linear_center_interp) {
7415 unsigned i, base = key->ps_prolog.num_input_sgprs;
7416 LLVMValueRef linear_center[2];
7417
7418 /* Read LINEAR_CENTER. */
7419 for (i = 0; i < 2; i++)
7420 linear_center[i] = LLVMGetParam(func, base + 8 + i);
7421 /* Overwrite LINEAR_SAMPLE. */
7422 for (i = 0; i < 2; i++)
7423 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7424 linear_center[i], base + 6 + i, "");
7425 /* Overwrite LINEAR_CENTROID. */
7426 for (i = 0; i < 2; i++)
7427 ret = LLVMBuildInsertValue(gallivm->builder, ret,
7428 linear_center[i], base + 10 + i, "");
7429 }
7430
7431 /* Interpolate colors. */
7432 for (i = 0; i < 2; i++) {
7433 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
7434 unsigned face_vgpr = key->ps_prolog.num_input_sgprs +
7435 key->ps_prolog.face_vgpr_index;
7436 LLVMValueRef interp[2], color[4];
7437 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
7438
7439 if (!writemask)
7440 continue;
7441
7442 /* If the interpolation qualifier is not CONSTANT (-1). */
7443 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
7444 unsigned interp_vgpr = key->ps_prolog.num_input_sgprs +
7445 key->ps_prolog.color_interp_vgpr_index[i];
7446
7447 /* Get the (i,j) updated by bc_optimize handling. */
7448 interp[0] = LLVMBuildExtractValue(gallivm->builder, ret,
7449 interp_vgpr, "");
7450 interp[1] = LLVMBuildExtractValue(gallivm->builder, ret,
7451 interp_vgpr + 1, "");
7452 interp_ij = lp_build_gather_values(gallivm, interp, 2);
7453 interp_ij = LLVMBuildBitCast(gallivm->builder, interp_ij,
7454 ctx.v2i32, "");
7455 }
7456
7457 /* Use the absolute location of the input. */
7458 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
7459
7460 if (key->ps_prolog.states.color_two_side) {
7461 face = LLVMGetParam(func, face_vgpr);
7462 face = LLVMBuildBitCast(gallivm->builder, face, ctx.i32, "");
7463 }
7464
7465 interp_fs_input(&ctx,
7466 key->ps_prolog.color_attr_index[i],
7467 TGSI_SEMANTIC_COLOR, i,
7468 key->ps_prolog.num_interp_inputs,
7469 key->ps_prolog.colors_read, interp_ij,
7470 prim_mask, face, color);
7471
7472 while (writemask) {
7473 unsigned chan = u_bit_scan(&writemask);
7474 ret = LLVMBuildInsertValue(gallivm->builder, ret, color[chan],
7475 num_params++, "");
7476 }
7477 }
7478
7479 /* Tell LLVM to insert WQM instruction sequence when needed. */
7480 if (key->ps_prolog.wqm) {
7481 LLVMAddTargetDependentFunctionAttr(func,
7482 "amdgpu-ps-wqm-outputs", "");
7483 }
7484
7485 /* Compile. */
7486 si_llvm_build_ret(&ctx, ret);
7487 radeon_llvm_finalize_module(
7488 &ctx.radeon_bld,
7489 r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));
7490
7491 if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
7492 gallivm->module, debug, ctx.type,
7493 "Fragment Shader Prolog"))
7494 status = false;
7495
7496 radeon_llvm_dispose(&ctx.radeon_bld);
7497 return status;
7498 }
7499
/**
 * Compile the pixel shader epilog. This handles everything that must be
 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
 */
static bool si_compile_ps_epilog(struct si_screen *sscreen,
				 LLVMTargetMachineRef tm,
				 struct pipe_debug_callback *debug,
				 struct si_shader_part *out)
{
	union si_shader_part_key *key = &out->key;
	struct si_shader shader = {};
	struct si_shader_context ctx;
	/* Note: only the addresses of these ctx members are taken here;
	 * the members themselves are set up by si_init_shader_ctx below. */
	struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
	struct lp_build_tgsi_context *bld_base = &ctx.radeon_bld.soa.bld_base;
	/* Worst case: 16 SGPR slots + 8 MRTs * 4 channels + Z/stencil/samplemask. */
	LLVMTypeRef params[16+8*4+3];
	LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
	int last_sgpr, num_params, i;
	bool status = true;
	struct si_ps_exports exp = {};

	si_init_shader_ctx(&ctx, sscreen, &shader, tm);
	ctx.type = PIPE_SHADER_FRAGMENT;
	shader.key.ps.epilog = key->ps_epilog.states;

	/* Declare input SGPRs. These must mirror the main shader's SGPR
	 * layout so the epilog can be appended to any variant. */
	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
	params[SI_PARAM_SAMPLERS] = ctx.i64;
	params[SI_PARAM_IMAGES] = ctx.i64;
	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
	params[SI_PARAM_ALPHA_REF] = ctx.f32;
	last_sgpr = SI_PARAM_ALPHA_REF;

	/* Declare input VGPRs: 4 channels per written color, plus optional
	 * depth/stencil/samplemask values passed through by the main part. */
	num_params = (last_sgpr + 1) +
		     util_bitcount(key->ps_epilog.colors_written) * 4 +
		     key->ps_epilog.writes_z +
		     key->ps_epilog.writes_stencil +
		     key->ps_epilog.writes_samplemask;

	/* Ensure a fixed minimum so the samplemask VGPR location is stable. */
	num_params = MAX2(num_params,
			  last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

	assert(num_params <= ARRAY_SIZE(params));

	for (i = last_sgpr + 1; i < num_params; i++)
		params[i] = ctx.f32;

	/* Create the function. */
	si_create_function(&ctx, NULL, 0, params, num_params, last_sgpr);
	/* Disable elimination of unused inputs. */
	radeon_llvm_add_attribute(ctx.radeon_bld.main_fn,
				  "InitialPSInputAddr", 0xffffff);

	/* Process colors. */
	unsigned vgpr = last_sgpr + 1;
	unsigned colors_written = key->ps_epilog.colors_written;
	int last_color_export = -1;

	/* Find the last color export, so it can be flagged "done".
	 * Only applies when no Z/stencil/samplemask export will follow. */
	if (!key->ps_epilog.writes_z &&
	    !key->ps_epilog.writes_stencil &&
	    !key->ps_epilog.writes_samplemask) {
		unsigned spi_format = key->ps_epilog.states.spi_shader_col_format;

		/* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
		if (colors_written == 0x1 && key->ps_epilog.states.last_cbuf > 0) {
			/* Just set this if any of the colorbuffers are enabled. */
			if (spi_format &
			    ((1llu << (4 * (key->ps_epilog.states.last_cbuf + 1))) - 1))
				last_color_export = 0;
		} else {
			for (i = 0; i < 8; i++)
				if (colors_written & (1 << i) &&
				    (spi_format >> (i * 4)) & 0xf)
					last_color_export = i;
		}
	}

	while (colors_written) {
		LLVMValueRef color[4];
		int mrt = u_bit_scan(&colors_written);

		/* VGPRs are consumed in ascending MRT order, 4 per color. */
		for (i = 0; i < 4; i++)
			color[i] = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

		/* num_params - 1 is the samplemask VGPR index — presumably
		 * used for alpha-to-coverage; verify in si_export_mrt_color. */
		si_export_mrt_color(bld_base, color, mrt,
				    num_params - 1,
				    mrt == last_color_export, &exp);
	}

	/* Process depth, stencil, samplemask. */
	if (key->ps_epilog.writes_z)
		depth = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_stencil)
		stencil = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);
	if (key->ps_epilog.writes_samplemask)
		samplemask = LLVMGetParam(ctx.radeon_bld.main_fn, vgpr++);

	if (depth || stencil || samplemask)
		si_export_mrt_z(bld_base, depth, stencil, samplemask, &exp);
	else if (last_color_export == -1)
		/* The hw requires at least one export per PS. */
		si_export_null(bld_base);

	if (exp.num)
		si_emit_ps_exports(&ctx, &exp);

	/* Compile. */
	LLVMBuildRetVoid(gallivm->builder);
	radeon_llvm_finalize_module(
		&ctx.radeon_bld,
		r600_extra_shader_checks(&sscreen->b, PIPE_SHADER_FRAGMENT));

	if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
			    gallivm->module, debug, ctx.type,
			    "Fragment Shader Epilog"))
		status = false;

	radeon_llvm_dispose(&ctx.radeon_bld);
	return status;
}
7621
7622 /**
7623 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
7624 */
7625 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
7626 LLVMTargetMachineRef tm,
7627 struct si_shader *shader,
7628 struct pipe_debug_callback *debug)
7629 {
7630 struct tgsi_shader_info *info = &shader->selector->info;
7631 union si_shader_part_key prolog_key;
7632 union si_shader_part_key epilog_key;
7633 unsigned i;
7634
7635 /* Get the prolog. */
7636 memset(&prolog_key, 0, sizeof(prolog_key));
7637 prolog_key.ps_prolog.states = shader->key.ps.prolog;
7638 prolog_key.ps_prolog.colors_read = info->colors_read;
7639 prolog_key.ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
7640 prolog_key.ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
7641 prolog_key.ps_prolog.wqm = info->uses_derivatives &&
7642 (prolog_key.ps_prolog.colors_read ||
7643 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7644 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7645 prolog_key.ps_prolog.states.force_persp_center_interp ||
7646 prolog_key.ps_prolog.states.force_linear_center_interp ||
7647 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7648 prolog_key.ps_prolog.states.bc_optimize_for_linear);
7649
7650 if (info->colors_read) {
7651 unsigned *color = shader->selector->color_attr_index;
7652
7653 if (shader->key.ps.prolog.color_two_side) {
7654 /* BCOLORs are stored after the last input. */
7655 prolog_key.ps_prolog.num_interp_inputs = info->num_inputs;
7656 prolog_key.ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
7657 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
7658 }
7659
7660 for (i = 0; i < 2; i++) {
7661 unsigned interp = info->input_interpolate[color[i]];
7662 unsigned location = info->input_interpolate_loc[color[i]];
7663
7664 if (!(info->colors_read & (0xf << i*4)))
7665 continue;
7666
7667 prolog_key.ps_prolog.color_attr_index[i] = color[i];
7668
7669 if (shader->key.ps.prolog.flatshade_colors &&
7670 interp == TGSI_INTERPOLATE_COLOR)
7671 interp = TGSI_INTERPOLATE_CONSTANT;
7672
7673 switch (interp) {
7674 case TGSI_INTERPOLATE_CONSTANT:
7675 prolog_key.ps_prolog.color_interp_vgpr_index[i] = -1;
7676 break;
7677 case TGSI_INTERPOLATE_PERSPECTIVE:
7678 case TGSI_INTERPOLATE_COLOR:
7679 /* Force the interpolation location for colors here. */
7680 if (shader->key.ps.prolog.force_persp_sample_interp)
7681 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7682 if (shader->key.ps.prolog.force_persp_center_interp)
7683 location = TGSI_INTERPOLATE_LOC_CENTER;
7684
7685 switch (location) {
7686 case TGSI_INTERPOLATE_LOC_SAMPLE:
7687 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 0;
7688 shader->config.spi_ps_input_ena |=
7689 S_0286CC_PERSP_SAMPLE_ENA(1);
7690 break;
7691 case TGSI_INTERPOLATE_LOC_CENTER:
7692 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 2;
7693 shader->config.spi_ps_input_ena |=
7694 S_0286CC_PERSP_CENTER_ENA(1);
7695 break;
7696 case TGSI_INTERPOLATE_LOC_CENTROID:
7697 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 4;
7698 shader->config.spi_ps_input_ena |=
7699 S_0286CC_PERSP_CENTROID_ENA(1);
7700 break;
7701 default:
7702 assert(0);
7703 }
7704 break;
7705 case TGSI_INTERPOLATE_LINEAR:
7706 /* Force the interpolation location for colors here. */
7707 if (shader->key.ps.prolog.force_linear_sample_interp)
7708 location = TGSI_INTERPOLATE_LOC_SAMPLE;
7709 if (shader->key.ps.prolog.force_linear_center_interp)
7710 location = TGSI_INTERPOLATE_LOC_CENTER;
7711
7712 switch (location) {
7713 case TGSI_INTERPOLATE_LOC_SAMPLE:
7714 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 6;
7715 shader->config.spi_ps_input_ena |=
7716 S_0286CC_LINEAR_SAMPLE_ENA(1);
7717 break;
7718 case TGSI_INTERPOLATE_LOC_CENTER:
7719 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 8;
7720 shader->config.spi_ps_input_ena |=
7721 S_0286CC_LINEAR_CENTER_ENA(1);
7722 break;
7723 case TGSI_INTERPOLATE_LOC_CENTROID:
7724 prolog_key.ps_prolog.color_interp_vgpr_index[i] = 10;
7725 shader->config.spi_ps_input_ena |=
7726 S_0286CC_LINEAR_CENTROID_ENA(1);
7727 break;
7728 default:
7729 assert(0);
7730 }
7731 break;
7732 default:
7733 assert(0);
7734 }
7735 }
7736 }
7737
7738 /* The prolog is a no-op if these aren't set. */
7739 if (prolog_key.ps_prolog.colors_read ||
7740 prolog_key.ps_prolog.states.force_persp_sample_interp ||
7741 prolog_key.ps_prolog.states.force_linear_sample_interp ||
7742 prolog_key.ps_prolog.states.force_persp_center_interp ||
7743 prolog_key.ps_prolog.states.force_linear_center_interp ||
7744 prolog_key.ps_prolog.states.bc_optimize_for_persp ||
7745 prolog_key.ps_prolog.states.bc_optimize_for_linear ||
7746 prolog_key.ps_prolog.states.poly_stipple) {
7747 shader->prolog =
7748 si_get_shader_part(sscreen, &sscreen->ps_prologs,
7749 &prolog_key, tm, debug,
7750 si_compile_ps_prolog);
7751 if (!shader->prolog)
7752 return false;
7753 }
7754
7755 /* Get the epilog. */
7756 memset(&epilog_key, 0, sizeof(epilog_key));
7757 epilog_key.ps_epilog.colors_written = info->colors_written;
7758 epilog_key.ps_epilog.writes_z = info->writes_z;
7759 epilog_key.ps_epilog.writes_stencil = info->writes_stencil;
7760 epilog_key.ps_epilog.writes_samplemask = info->writes_samplemask;
7761 epilog_key.ps_epilog.states = shader->key.ps.epilog;
7762
7763 shader->epilog =
7764 si_get_shader_part(sscreen, &sscreen->ps_epilogs,
7765 &epilog_key, tm, debug,
7766 si_compile_ps_epilog);
7767 if (!shader->epilog)
7768 return false;
7769
7770 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
7771 if (shader->key.ps.prolog.poly_stipple) {
7772 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
7773 assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
7774 }
7775
7776 /* Set up the enable bits for per-sample shading if needed. */
7777 if (shader->key.ps.prolog.force_persp_sample_interp &&
7778 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7779 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7780 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
7781 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7782 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
7783 }
7784 if (shader->key.ps.prolog.force_linear_sample_interp &&
7785 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
7786 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7787 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
7788 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7789 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
7790 }
7791 if (shader->key.ps.prolog.force_persp_center_interp &&
7792 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7793 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7794 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
7795 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
7796 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7797 }
7798 if (shader->key.ps.prolog.force_linear_center_interp &&
7799 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
7800 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
7801 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
7802 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
7803 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7804 }
7805
7806 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
7807 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
7808 !(shader->config.spi_ps_input_ena & 0xf)) {
7809 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
7810 assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
7811 }
7812
7813 /* At least one pair of interpolation weights must be enabled. */
7814 if (!(shader->config.spi_ps_input_ena & 0x7f)) {
7815 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
7816 assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
7817 }
7818
7819 /* The sample mask input is always enabled, because the API shader always
7820 * passes it through to the epilog. Disable it here if it's unused.
7821 */
7822 if (!shader->key.ps.epilog.poly_line_smoothing &&
7823 !shader->selector->info.reads_samplemask)
7824 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
7825
7826 return true;
7827 }
7828
7829 static void si_fix_num_sgprs(struct si_shader *shader)
7830 {
7831 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
7832
7833 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
7834 }
7835
/* Create a shader variant: either compile it monolithically or assemble it
 * from the precompiled main part plus selected prolog/epilog parts, then
 * dump and upload the final binary.
 *
 * Returns 0 on success, a negative error code on failure.
 */
int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
		     struct si_shader *shader,
		     struct pipe_debug_callback *debug)
{
	struct si_shader_selector *sel = shader->selector;
	struct si_shader *mainp = sel->main_shader_part;
	int r;

	/* LS, ES, VS are compiled on demand if the main part hasn't been
	 * compiled for that stage.
	 */
	if (!mainp ||
	    (sel->type == PIPE_SHADER_VERTEX &&
	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
	    (sel->type == PIPE_SHADER_TESS_EVAL &&
	     shader->key.tes.as_es != mainp->key.tes.as_es) ||
	    (sel->type == PIPE_SHADER_TESS_CTRL &&
	     shader->key.tcs.epilog.inputs_to_copy) ||
	    sel->type == PIPE_SHADER_COMPUTE) {
		/* Monolithic shader (compiled as a whole, has many variants,
		 * may take a long time to compile).
		 */
		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
		if (r)
			return r;
	} else {
		/* The shader consists of 2-3 parts:
		 *
		 * - the middle part is the user shader, it has 1 variant only
		 *   and it was compiled during the creation of the shader
		 *   selector
		 * - the prolog part is inserted at the beginning
		 * - the epilog part is inserted at the end
		 *
		 * The prolog and epilog have many (but simple) variants.
		 */

		/* Copy the compiled TGSI shader data over.
		 * is_binary_shared marks that this variant does NOT own the
		 * binary, so si_shader_destroy won't free it. */
		shader->is_binary_shared = true;
		shader->binary = mainp->binary;
		shader->config = mainp->config;
		shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
		shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
		shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
		memcpy(shader->info.vs_output_param_offset,
		       mainp->info.vs_output_param_offset,
		       sizeof(mainp->info.vs_output_param_offset));
		shader->info.uses_instanceid = mainp->info.uses_instanceid;
		shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
		shader->info.nr_param_exports = mainp->info.nr_param_exports;

		/* Select prologs and/or epilogs. */
		switch (sel->type) {
		case PIPE_SHADER_VERTEX:
			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_CTRL:
			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_TESS_EVAL:
			if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
				return -1;
			break;
		case PIPE_SHADER_FRAGMENT:
			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
				return -1;

			/* Make sure we have at least as many VGPRs as there
			 * are allocated inputs.
			 */
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->info.num_input_vgprs);
			break;
		}

		/* Update SGPR and VGPR counts: the combined shader must
		 * satisfy the register needs of every part. */
		if (shader->prolog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->prolog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->prolog->config.num_vgprs);
		}
		if (shader->epilog) {
			shader->config.num_sgprs = MAX2(shader->config.num_sgprs,
							shader->epilog->config.num_sgprs);
			shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
							shader->epilog->config.num_vgprs);
		}
	}

	si_fix_num_sgprs(shader);
	si_shader_dump(sscreen, shader, debug, sel->info.processor,
		       stderr);

	/* Upload. */
	r = si_shader_binary_upload(sscreen, shader);
	if (r) {
		fprintf(stderr, "LLVM failed to upload shader\n");
		return r;
	}

	return 0;
}
7942
7943 void si_shader_destroy(struct si_shader *shader)
7944 {
7945 if (shader->gs_copy_shader) {
7946 si_shader_destroy(shader->gs_copy_shader);
7947 FREE(shader->gs_copy_shader);
7948 }
7949
7950 if (shader->scratch_bo)
7951 r600_resource_reference(&shader->scratch_bo, NULL);
7952
7953 r600_resource_reference(&shader->bo, NULL);
7954
7955 if (!shader->is_binary_shared)
7956 radeon_shader_binary_clean(&shader->binary);
7957
7958 free(shader->shader_log);
7959 }