/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
24 #include "si_shader_internal.h"
27 #include "gallivm/lp_bld_const.h"
28 #include "gallivm/lp_bld_gather.h"
29 #include "gallivm/lp_bld_flow.h"
30 #include "gallivm/lp_bld_init.h"
31 #include "gallivm/lp_bld_intr.h"
32 #include "gallivm/lp_bld_misc.h"
33 #include "gallivm/lp_bld_swizzle.h"
34 #include "tgsi/tgsi_info.h"
35 #include "tgsi/tgsi_parse.h"
36 #include "util/u_math.h"
37 #include "util/u_memory.h"
38 #include "util/u_debug.h"
41 #include <llvm-c/Transforms/IPO.h>
42 #include <llvm-c/Transforms/Scalar.h>
/* AMDGPU calling-convention IDs as understood by the LLVM AMDGPU backend. */
enum si_llvm_calling_convention {
	RADEON_LLVM_AMDGPU_VS = 87,
	RADEON_LLVM_AMDGPU_GS = 88,
	RADEON_LLVM_AMDGPU_PS = 89,
	RADEON_LLVM_AMDGPU_CS = 90,
	RADEON_LLVM_AMDGPU_HS = 93,
};
/* Context passed to the LLVM diagnostic handler callback.
 * retval is set to non-zero when an LLVM error diagnostic is seen
 * (read back by si_llvm_compile after codegen — see its "diag.retval" check).
 */
struct si_llvm_diagnostics {
	struct pipe_debug_callback *debug;
	unsigned retval;
};
57 static void si_diagnostic_handler(LLVMDiagnosticInfoRef di
, void *context
)
59 struct si_llvm_diagnostics
*diag
= (struct si_llvm_diagnostics
*)context
;
60 LLVMDiagnosticSeverity severity
= LLVMGetDiagInfoSeverity(di
);
61 char *description
= LLVMGetDiagInfoDescription(di
);
62 const char *severity_str
= NULL
;
66 severity_str
= "error";
69 severity_str
= "warning";
72 severity_str
= "remark";
75 severity_str
= "note";
78 severity_str
= "unknown";
81 pipe_debug_message(diag
->debug
, SHADER_INFO
,
82 "LLVM diagnostic (%s): %s", severity_str
, description
);
84 if (severity
== LLVMDSError
) {
86 fprintf(stderr
,"LLVM triggered Diagnostic Handler: %s\n", description
);
89 LLVMDisposeMessage(description
);
93 * Compile an LLVM module to machine code.
95 * @returns 0 for success, 1 for failure
97 unsigned si_llvm_compile(LLVMModuleRef M
, struct ac_shader_binary
*binary
,
98 LLVMTargetMachineRef tm
,
99 struct pipe_debug_callback
*debug
)
101 struct si_llvm_diagnostics diag
;
103 LLVMContextRef llvm_ctx
;
104 LLVMMemoryBufferRef out_buffer
;
105 unsigned buffer_size
;
106 const char *buffer_data
;
112 /* Setup Diagnostic Handler*/
113 llvm_ctx
= LLVMGetModuleContext(M
);
115 LLVMContextSetDiagnosticHandler(llvm_ctx
, si_diagnostic_handler
, &diag
);
118 mem_err
= LLVMTargetMachineEmitToMemoryBuffer(tm
, M
, LLVMObjectFile
, &err
,
121 /* Process Errors/Warnings */
123 fprintf(stderr
, "%s: %s", __FUNCTION__
, err
);
124 pipe_debug_message(debug
, SHADER_INFO
,
125 "LLVM emit error: %s", err
);
131 /* Extract Shader Code*/
132 buffer_size
= LLVMGetBufferSize(out_buffer
);
133 buffer_data
= LLVMGetBufferStart(out_buffer
);
135 if (!ac_elf_read(buffer_data
, buffer_size
, binary
)) {
136 fprintf(stderr
, "radeonsi: cannot read an ELF shader binary\n");
141 LLVMDisposeMemoryBuffer(out_buffer
);
144 if (diag
.retval
!= 0)
145 pipe_debug_message(debug
, SHADER_INFO
, "LLVM compile failed");
149 LLVMTypeRef
tgsi2llvmtype(struct lp_build_tgsi_context
*bld_base
,
150 enum tgsi_opcode_type type
)
152 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
155 case TGSI_TYPE_UNSIGNED
:
156 case TGSI_TYPE_SIGNED
:
158 case TGSI_TYPE_UNSIGNED64
:
159 case TGSI_TYPE_SIGNED64
:
161 case TGSI_TYPE_DOUBLE
:
163 case TGSI_TYPE_UNTYPED
:
164 case TGSI_TYPE_FLOAT
:
171 LLVMValueRef
bitcast(struct lp_build_tgsi_context
*bld_base
,
172 enum tgsi_opcode_type type
, LLVMValueRef value
)
174 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
175 LLVMTypeRef dst_type
= tgsi2llvmtype(bld_base
, type
);
178 return LLVMBuildBitCast(ctx
->ac
.builder
, value
, dst_type
, "");
184 * Return a value that is equal to the given i32 \p index if it lies in [0,num)
185 * or an undefined value in the same interval otherwise.
187 LLVMValueRef
si_llvm_bound_index(struct si_shader_context
*ctx
,
191 LLVMBuilderRef builder
= ctx
->ac
.builder
;
192 LLVMValueRef c_max
= LLVMConstInt(ctx
->i32
, num
- 1, 0);
195 if (util_is_power_of_two(num
)) {
196 index
= LLVMBuildAnd(builder
, index
, c_max
, "");
198 /* In theory, this MAX pattern should result in code that is
199 * as good as the bit-wise AND above.
201 * In practice, LLVM generates worse code (at the time of
202 * writing), because its value tracking is not strong enough.
204 cc
= LLVMBuildICmp(builder
, LLVMIntULE
, index
, c_max
, "");
205 index
= LLVMBuildSelect(builder
, cc
, index
, c_max
, "");
211 static LLVMValueRef
emit_swizzle(struct lp_build_tgsi_context
*bld_base
,
218 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
219 LLVMValueRef swizzles
[4];
221 swizzles
[0] = LLVMConstInt(ctx
->i32
, swizzle_x
, 0);
222 swizzles
[1] = LLVMConstInt(ctx
->i32
, swizzle_y
, 0);
223 swizzles
[2] = LLVMConstInt(ctx
->i32
, swizzle_z
, 0);
224 swizzles
[3] = LLVMConstInt(ctx
->i32
, swizzle_w
, 0);
226 return LLVMBuildShuffleVector(ctx
->ac
.builder
,
228 LLVMGetUndef(LLVMTypeOf(value
)),
229 LLVMConstVector(swizzles
, 4), "");
233 * Return the description of the array covering the given temporary register
237 get_temp_array_id(struct lp_build_tgsi_context
*bld_base
,
239 const struct tgsi_ind_register
*reg
)
241 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
242 unsigned num_arrays
= ctx
->bld_base
.info
->array_max
[TGSI_FILE_TEMPORARY
];
245 if (reg
&& reg
->ArrayID
> 0 && reg
->ArrayID
<= num_arrays
)
248 for (i
= 0; i
< num_arrays
; i
++) {
249 const struct tgsi_array_info
*array
= &ctx
->temp_arrays
[i
];
251 if (reg_index
>= array
->range
.First
&& reg_index
<= array
->range
.Last
)
258 static struct tgsi_declaration_range
259 get_array_range(struct lp_build_tgsi_context
*bld_base
,
260 unsigned File
, unsigned reg_index
,
261 const struct tgsi_ind_register
*reg
)
263 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
264 struct tgsi_declaration_range range
;
266 if (File
== TGSI_FILE_TEMPORARY
) {
267 unsigned array_id
= get_temp_array_id(bld_base
, reg_index
, reg
);
269 return ctx
->temp_arrays
[array_id
- 1].range
;
273 range
.Last
= bld_base
->info
->file_max
[File
];
278 * For indirect registers, construct a pointer directly to the requested
279 * element using getelementptr if possible.
281 * Returns NULL if the insertelement/extractelement fallback for array access
285 get_pointer_into_array(struct si_shader_context
*ctx
,
289 const struct tgsi_ind_register
*reg_indirect
)
292 struct tgsi_array_info
*array
;
293 LLVMBuilderRef builder
= ctx
->ac
.builder
;
294 LLVMValueRef idxs
[2];
298 if (file
!= TGSI_FILE_TEMPORARY
)
301 array_id
= get_temp_array_id(&ctx
->bld_base
, reg_index
, reg_indirect
);
305 alloca
= ctx
->temp_array_allocas
[array_id
- 1];
309 array
= &ctx
->temp_arrays
[array_id
- 1];
311 if (!(array
->writemask
& (1 << swizzle
)))
312 return ctx
->undef_alloca
;
314 index
= si_get_indirect_index(ctx
, reg_indirect
, 1,
315 reg_index
- ctx
->temp_arrays
[array_id
- 1].range
.First
);
317 /* Ensure that the index is within a valid range, to guard against
318 * VM faults and overwriting critical data (e.g. spilled resource
321 * TODO It should be possible to avoid the additional instructions
322 * if LLVM is changed so that it guarantuees:
323 * 1. the scratch space descriptor isolates the current wave (this
324 * could even save the scratch offset SGPR at the cost of an
325 * additional SALU instruction)
326 * 2. the memory for allocas must be allocated at the _end_ of the
327 * scratch space (after spilled registers)
329 index
= si_llvm_bound_index(ctx
, index
, array
->range
.Last
- array
->range
.First
+ 1);
331 index
= LLVMBuildMul(
333 LLVMConstInt(ctx
->i32
, util_bitcount(array
->writemask
), 0),
335 index
= LLVMBuildAdd(
337 LLVMConstInt(ctx
->i32
,
338 util_bitcount(array
->writemask
& ((1 << swizzle
) - 1)), 0),
340 idxs
[0] = ctx
->i32_0
;
342 return LLVMBuildGEP(ctx
->ac
.builder
, alloca
, idxs
, 2, "");
346 si_llvm_emit_fetch_64bit(struct lp_build_tgsi_context
*bld_base
,
351 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
354 result
= LLVMGetUndef(LLVMVectorType(ctx
->i32
, 2));
356 result
= LLVMBuildInsertElement(ctx
->ac
.builder
,
358 ac_to_integer(&ctx
->ac
, ptr
),
360 result
= LLVMBuildInsertElement(ctx
->ac
.builder
,
362 ac_to_integer(&ctx
->ac
, ptr2
),
364 return LLVMBuildBitCast(ctx
->ac
.builder
, result
, type
, "");
368 emit_array_fetch(struct lp_build_tgsi_context
*bld_base
,
369 unsigned File
, enum tgsi_opcode_type type
,
370 struct tgsi_declaration_range range
,
373 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
374 unsigned i
, size
= range
.Last
- range
.First
+ 1;
375 LLVMTypeRef vec
= LLVMVectorType(tgsi2llvmtype(bld_base
, type
), size
);
376 LLVMValueRef result
= LLVMGetUndef(vec
);
378 struct tgsi_full_src_register tmp_reg
= {};
379 tmp_reg
.Register
.File
= File
;
381 for (i
= 0; i
< size
; ++i
) {
382 tmp_reg
.Register
.Index
= i
+ range
.First
;
383 LLVMValueRef temp
= si_llvm_emit_fetch(bld_base
, &tmp_reg
, type
, swizzle
);
384 result
= LLVMBuildInsertElement(ctx
->ac
.builder
, result
, temp
,
385 LLVMConstInt(ctx
->i32
, i
, 0), "array_vector");
391 load_value_from_array(struct lp_build_tgsi_context
*bld_base
,
393 enum tgsi_opcode_type type
,
396 const struct tgsi_ind_register
*reg_indirect
)
398 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
399 LLVMBuilderRef builder
= ctx
->ac
.builder
;
402 ptr
= get_pointer_into_array(ctx
, file
, swizzle
, reg_index
, reg_indirect
);
404 LLVMValueRef val
= LLVMBuildLoad(builder
, ptr
, "");
405 if (tgsi_type_is_64bit(type
)) {
406 LLVMValueRef ptr_hi
, val_hi
;
407 ptr_hi
= LLVMBuildGEP(builder
, ptr
, &ctx
->i32_1
, 1, "");
408 val_hi
= LLVMBuildLoad(builder
, ptr_hi
, "");
409 val
= si_llvm_emit_fetch_64bit(bld_base
, tgsi2llvmtype(bld_base
, type
),
415 struct tgsi_declaration_range range
=
416 get_array_range(bld_base
, file
, reg_index
, reg_indirect
);
418 si_get_indirect_index(ctx
, reg_indirect
, 1, reg_index
- range
.First
);
420 emit_array_fetch(bld_base
, file
, type
, range
, swizzle
);
421 return LLVMBuildExtractElement(builder
, array
, index
, "");
426 store_value_to_array(struct lp_build_tgsi_context
*bld_base
,
431 const struct tgsi_ind_register
*reg_indirect
)
433 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
434 LLVMBuilderRef builder
= ctx
->ac
.builder
;
437 ptr
= get_pointer_into_array(ctx
, file
, chan_index
, reg_index
, reg_indirect
);
439 LLVMBuildStore(builder
, value
, ptr
);
442 struct tgsi_declaration_range range
= get_array_range(bld_base
, file
, reg_index
, reg_indirect
);
443 LLVMValueRef index
= si_get_indirect_index(ctx
, reg_indirect
, 1, reg_index
- range
.First
);
445 emit_array_fetch(bld_base
, file
, TGSI_TYPE_FLOAT
, range
, chan_index
);
446 LLVMValueRef temp_ptr
;
448 array
= LLVMBuildInsertElement(builder
, array
, value
, index
, "");
450 size
= range
.Last
- range
.First
+ 1;
451 for (i
= 0; i
< size
; ++i
) {
453 case TGSI_FILE_OUTPUT
:
454 temp_ptr
= ctx
->outputs
[i
+ range
.First
][chan_index
];
457 case TGSI_FILE_TEMPORARY
:
458 if (range
.First
+ i
>= ctx
->temps_count
)
460 temp_ptr
= ctx
->temps
[(i
+ range
.First
) * TGSI_NUM_CHANNELS
+ chan_index
];
466 value
= LLVMBuildExtractElement(builder
, array
,
467 LLVMConstInt(ctx
->i32
, i
, 0), "");
468 LLVMBuildStore(builder
, value
, temp_ptr
);
473 /* If this is true, preload FS inputs at the beginning of shaders. Otherwise,
474 * reload them at each use. This must be true if the shader is using
475 * derivatives and KILL, because KILL can leave the WQM and then a lazy
476 * input load isn't in the WQM anymore.
478 static bool si_preload_fs_inputs(struct si_shader_context
*ctx
)
480 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
482 return sel
->info
.uses_derivatives
&&
487 get_output_ptr(struct lp_build_tgsi_context
*bld_base
, unsigned index
,
490 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
492 assert(index
<= ctx
->bld_base
.info
->file_max
[TGSI_FILE_OUTPUT
]);
493 return ctx
->outputs
[index
][chan
];
496 LLVMValueRef
si_llvm_emit_fetch(struct lp_build_tgsi_context
*bld_base
,
497 const struct tgsi_full_src_register
*reg
,
498 enum tgsi_opcode_type type
,
501 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
502 LLVMBuilderRef builder
= ctx
->ac
.builder
;
503 LLVMValueRef result
= NULL
, ptr
, ptr2
;
506 LLVMValueRef values
[TGSI_NUM_CHANNELS
];
508 for (chan
= 0; chan
< TGSI_NUM_CHANNELS
; chan
++) {
509 values
[chan
] = si_llvm_emit_fetch(bld_base
, reg
, type
, chan
);
511 return lp_build_gather_values(&ctx
->gallivm
, values
,
515 if (reg
->Register
.Indirect
) {
516 LLVMValueRef load
= load_value_from_array(bld_base
, reg
->Register
.File
, type
,
517 swizzle
, reg
->Register
.Index
, ®
->Indirect
);
518 return bitcast(bld_base
, type
, load
);
521 switch(reg
->Register
.File
) {
522 case TGSI_FILE_IMMEDIATE
: {
523 LLVMTypeRef ctype
= tgsi2llvmtype(bld_base
, type
);
524 if (tgsi_type_is_64bit(type
)) {
525 result
= LLVMGetUndef(LLVMVectorType(ctx
->i32
, 2));
526 result
= LLVMConstInsertElement(result
,
527 ctx
->imms
[reg
->Register
.Index
* TGSI_NUM_CHANNELS
+ swizzle
],
529 result
= LLVMConstInsertElement(result
,
530 ctx
->imms
[reg
->Register
.Index
* TGSI_NUM_CHANNELS
+ swizzle
+ 1],
532 return LLVMConstBitCast(result
, ctype
);
534 return LLVMConstBitCast(ctx
->imms
[reg
->Register
.Index
* TGSI_NUM_CHANNELS
+ swizzle
], ctype
);
538 case TGSI_FILE_INPUT
: {
539 unsigned index
= reg
->Register
.Index
;
540 LLVMValueRef input
[4];
542 /* I don't think doing this for vertex shaders is beneficial.
543 * For those, we want to make sure the VMEM loads are executed
544 * only once. Fragment shaders don't care much, because
545 * v_interp instructions are much cheaper than VMEM loads.
547 if (!si_preload_fs_inputs(ctx
) &&
548 ctx
->bld_base
.info
->processor
== PIPE_SHADER_FRAGMENT
)
549 ctx
->load_input(ctx
, index
, &ctx
->input_decls
[index
], input
);
551 memcpy(input
, &ctx
->inputs
[index
* 4], sizeof(input
));
553 result
= input
[swizzle
];
555 if (tgsi_type_is_64bit(type
)) {
557 ptr2
= input
[swizzle
+ 1];
558 return si_llvm_emit_fetch_64bit(bld_base
, tgsi2llvmtype(bld_base
, type
),
564 case TGSI_FILE_TEMPORARY
:
565 if (reg
->Register
.Index
>= ctx
->temps_count
)
566 return LLVMGetUndef(tgsi2llvmtype(bld_base
, type
));
567 ptr
= ctx
->temps
[reg
->Register
.Index
* TGSI_NUM_CHANNELS
+ swizzle
];
568 if (tgsi_type_is_64bit(type
)) {
569 ptr2
= ctx
->temps
[reg
->Register
.Index
* TGSI_NUM_CHANNELS
+ swizzle
+ 1];
570 return si_llvm_emit_fetch_64bit(bld_base
, tgsi2llvmtype(bld_base
, type
),
571 LLVMBuildLoad(builder
, ptr
, ""),
572 LLVMBuildLoad(builder
, ptr2
, ""));
574 result
= LLVMBuildLoad(builder
, ptr
, "");
577 case TGSI_FILE_OUTPUT
:
578 ptr
= get_output_ptr(bld_base
, reg
->Register
.Index
, swizzle
);
579 if (tgsi_type_is_64bit(type
)) {
580 ptr2
= get_output_ptr(bld_base
, reg
->Register
.Index
, swizzle
+ 1);
581 return si_llvm_emit_fetch_64bit(bld_base
, tgsi2llvmtype(bld_base
, type
),
582 LLVMBuildLoad(builder
, ptr
, ""),
583 LLVMBuildLoad(builder
, ptr2
, ""));
585 result
= LLVMBuildLoad(builder
, ptr
, "");
589 return LLVMGetUndef(tgsi2llvmtype(bld_base
, type
));
592 return bitcast(bld_base
, type
, result
);
595 static LLVMValueRef
fetch_system_value(struct lp_build_tgsi_context
*bld_base
,
596 const struct tgsi_full_src_register
*reg
,
597 enum tgsi_opcode_type type
,
600 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
601 LLVMBuilderRef builder
= ctx
->ac
.builder
;
602 LLVMValueRef cval
= ctx
->system_values
[reg
->Register
.Index
];
604 if (tgsi_type_is_64bit(type
)) {
607 assert(swizzle
== 0 || swizzle
== 2);
609 lo
= LLVMBuildExtractElement(
610 builder
, cval
, LLVMConstInt(ctx
->i32
, swizzle
, 0), "");
611 hi
= LLVMBuildExtractElement(
612 builder
, cval
, LLVMConstInt(ctx
->i32
, swizzle
+ 1, 0), "");
614 return si_llvm_emit_fetch_64bit(bld_base
, tgsi2llvmtype(bld_base
, type
),
618 if (LLVMGetTypeKind(LLVMTypeOf(cval
)) == LLVMVectorTypeKind
) {
619 cval
= LLVMBuildExtractElement(
620 builder
, cval
, LLVMConstInt(ctx
->i32
, swizzle
, 0), "");
622 assert(swizzle
== 0);
625 return bitcast(bld_base
, type
, cval
);
628 static void emit_declaration(struct lp_build_tgsi_context
*bld_base
,
629 const struct tgsi_full_declaration
*decl
)
631 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
632 LLVMBuilderRef builder
= ctx
->ac
.builder
;
633 unsigned first
, last
, i
;
634 switch(decl
->Declaration
.File
) {
635 case TGSI_FILE_ADDRESS
:
638 for (idx
= decl
->Range
.First
; idx
<= decl
->Range
.Last
; idx
++) {
640 for (chan
= 0; chan
< TGSI_NUM_CHANNELS
; chan
++) {
641 ctx
->addrs
[idx
][chan
] = lp_build_alloca_undef(
649 case TGSI_FILE_TEMPORARY
:
652 LLVMValueRef array_alloca
= NULL
;
654 unsigned writemask
= decl
->Declaration
.UsageMask
;
655 first
= decl
->Range
.First
;
656 last
= decl
->Range
.Last
;
657 decl_size
= 4 * ((last
- first
) + 1);
659 if (decl
->Declaration
.Array
) {
660 unsigned id
= decl
->Array
.ArrayID
- 1;
663 writemask
&= ctx
->temp_arrays
[id
].writemask
;
664 ctx
->temp_arrays
[id
].writemask
= writemask
;
665 array_size
= ((last
- first
) + 1) * util_bitcount(writemask
);
667 /* If the array has more than 16 elements, store it
668 * in memory using an alloca that spans the entire
671 * Otherwise, store each array element individually.
672 * We will then generate vectors (per-channel, up to
673 * <16 x float> if the usagemask is a single bit) for
674 * indirect addressing.
676 * Note that 16 is the number of vector elements that
677 * LLVM will store in a register, so theoretically an
678 * array with up to 4 * 16 = 64 elements could be
679 * handled this way, but whether that's a good idea
680 * depends on VGPR register pressure elsewhere.
682 * FIXME: We shouldn't need to have the non-alloca
683 * code path for arrays. LLVM should be smart enough to
684 * promote allocas into registers when profitable.
686 if (array_size
> 16 ||
687 !ctx
->screen
->llvm_has_working_vgpr_indexing
) {
688 array_alloca
= lp_build_alloca_undef(&ctx
->gallivm
,
689 LLVMArrayType(ctx
->f32
,
690 array_size
), "array");
691 ctx
->temp_array_allocas
[id
] = array_alloca
;
695 if (!ctx
->temps_count
) {
696 ctx
->temps_count
= bld_base
->info
->file_max
[TGSI_FILE_TEMPORARY
] + 1;
697 ctx
->temps
= MALLOC(TGSI_NUM_CHANNELS
* ctx
->temps_count
* sizeof(LLVMValueRef
));
700 for (i
= 0; i
< decl_size
; ++i
) {
702 snprintf(name
, sizeof(name
), "TEMP%d.%c",
703 first
+ i
/ 4, "xyzw"[i
% 4]);
705 ctx
->temps
[first
* TGSI_NUM_CHANNELS
+ i
] =
706 lp_build_alloca_undef(&ctx
->gallivm
,
711 LLVMValueRef idxs
[2] = {
717 if (writemask
!= TGSI_WRITEMASK_XYZW
&&
718 !ctx
->undef_alloca
) {
719 /* Create a dummy alloca. We use it so that we
720 * have a pointer that is safe to load from if
721 * a shader ever reads from a channel that
722 * it never writes to.
724 ctx
->undef_alloca
= lp_build_alloca_undef(
729 for (i
= 0; i
< decl_size
; ++i
) {
731 if (writemask
& (1 << (i
% 4))) {
733 snprintf(name
, sizeof(name
), "TEMP%d.%c",
734 first
+ i
/ 4, "xyzw"[i
% 4]);
736 idxs
[1] = LLVMConstInt(ctx
->i32
, j
, 0);
737 ptr
= LLVMBuildGEP(builder
, array_alloca
, idxs
, 2, name
);
740 ptr
= ctx
->undef_alloca
;
742 ctx
->temps
[first
* TGSI_NUM_CHANNELS
+ i
] = ptr
;
747 case TGSI_FILE_INPUT
:
750 for (idx
= decl
->Range
.First
; idx
<= decl
->Range
.Last
; idx
++) {
751 if (ctx
->load_input
&&
752 ctx
->input_decls
[idx
].Declaration
.File
!= TGSI_FILE_INPUT
) {
753 ctx
->input_decls
[idx
] = *decl
;
754 ctx
->input_decls
[idx
].Range
.First
= idx
;
755 ctx
->input_decls
[idx
].Range
.Last
= idx
;
756 ctx
->input_decls
[idx
].Semantic
.Index
+= idx
- decl
->Range
.First
;
758 if (si_preload_fs_inputs(ctx
) ||
759 bld_base
->info
->processor
!= PIPE_SHADER_FRAGMENT
)
760 ctx
->load_input(ctx
, idx
, &ctx
->input_decls
[idx
],
761 &ctx
->inputs
[idx
* 4]);
767 case TGSI_FILE_SYSTEM_VALUE
:
770 for (idx
= decl
->Range
.First
; idx
<= decl
->Range
.Last
; idx
++) {
771 si_load_system_value(ctx
, idx
, decl
);
776 case TGSI_FILE_OUTPUT
:
780 for (idx
= decl
->Range
.First
; idx
<= decl
->Range
.Last
; idx
++) {
782 assert(idx
< RADEON_LLVM_MAX_OUTPUTS
);
783 if (ctx
->outputs
[idx
][0])
785 for (chan
= 0; chan
< TGSI_NUM_CHANNELS
; chan
++) {
787 snprintf(name
, sizeof(name
), "OUT%d.%c",
788 idx
, "xyzw"[chan
% 4]);
790 ctx
->outputs
[idx
][chan
] = lp_build_alloca_undef(
798 case TGSI_FILE_MEMORY
:
799 si_tgsi_declare_compute_memory(ctx
, decl
);
807 void si_llvm_emit_store(struct lp_build_tgsi_context
*bld_base
,
808 const struct tgsi_full_instruction
*inst
,
809 const struct tgsi_opcode_info
*info
,
813 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
814 const struct tgsi_full_dst_register
*reg
= &inst
->Dst
[index
];
815 LLVMBuilderRef builder
= ctx
->ac
.builder
;
816 LLVMValueRef temp_ptr
, temp_ptr2
= NULL
;
817 bool is_vec_store
= false;
818 enum tgsi_opcode_type dtype
= tgsi_opcode_infer_dst_type(inst
->Instruction
.Opcode
, index
);
821 LLVMTypeKind k
= LLVMGetTypeKind(LLVMTypeOf(dst
[0]));
822 is_vec_store
= (k
== LLVMVectorTypeKind
);
826 LLVMValueRef values
[4] = {};
827 uint32_t writemask
= reg
->Register
.WriteMask
;
829 unsigned chan
= u_bit_scan(&writemask
);
830 LLVMValueRef index
= LLVMConstInt(ctx
->i32
, chan
, 0);
831 values
[chan
] = LLVMBuildExtractElement(ctx
->ac
.builder
,
834 bld_base
->emit_store(bld_base
, inst
, info
, index
, values
);
838 uint32_t writemask
= reg
->Register
.WriteMask
;
840 unsigned chan_index
= u_bit_scan(&writemask
);
841 LLVMValueRef value
= dst
[chan_index
];
843 if (tgsi_type_is_64bit(dtype
) && (chan_index
== 1 || chan_index
== 3))
845 if (inst
->Instruction
.Saturate
)
846 value
= ac_build_clamp(&ctx
->ac
, value
);
848 if (reg
->Register
.File
== TGSI_FILE_ADDRESS
) {
849 temp_ptr
= ctx
->addrs
[reg
->Register
.Index
][chan_index
];
850 LLVMBuildStore(builder
, value
, temp_ptr
);
854 if (!tgsi_type_is_64bit(dtype
))
855 value
= ac_to_float(&ctx
->ac
, value
);
857 if (reg
->Register
.Indirect
) {
858 unsigned file
= reg
->Register
.File
;
859 unsigned reg_index
= reg
->Register
.Index
;
860 store_value_to_array(bld_base
, value
, file
, chan_index
,
861 reg_index
, ®
->Indirect
);
863 switch(reg
->Register
.File
) {
864 case TGSI_FILE_OUTPUT
:
865 temp_ptr
= ctx
->outputs
[reg
->Register
.Index
][chan_index
];
866 if (tgsi_type_is_64bit(dtype
))
867 temp_ptr2
= ctx
->outputs
[reg
->Register
.Index
][chan_index
+ 1];
870 case TGSI_FILE_TEMPORARY
:
872 if (reg
->Register
.Index
>= ctx
->temps_count
)
875 temp_ptr
= ctx
->temps
[ TGSI_NUM_CHANNELS
* reg
->Register
.Index
+ chan_index
];
876 if (tgsi_type_is_64bit(dtype
))
877 temp_ptr2
= ctx
->temps
[ TGSI_NUM_CHANNELS
* reg
->Register
.Index
+ chan_index
+ 1];
884 if (!tgsi_type_is_64bit(dtype
))
885 LLVMBuildStore(builder
, value
, temp_ptr
);
887 LLVMValueRef ptr
= LLVMBuildBitCast(builder
, value
,
888 LLVMVectorType(ctx
->i32
, 2), "");
890 value
= LLVMBuildExtractElement(builder
, ptr
,
892 val2
= LLVMBuildExtractElement(builder
, ptr
,
895 LLVMBuildStore(builder
, ac_to_float(&ctx
->ac
, value
), temp_ptr
);
896 LLVMBuildStore(builder
, ac_to_float(&ctx
->ac
, val2
), temp_ptr2
);
/* Map the builder's program counter to the TGSI dump line suffix. */
static int get_line(int pc)
{
	/* Subtract 1 so that the number shown is that of the corresponding
	 * opcode in the TGSI dump, e.g. an if block has the same suffix as
	 * the instruction number of the corresponding TGSI IF.
	 */
	return pc - 1;
}
911 static void bgnloop_emit(const struct lp_build_tgsi_action
*action
,
912 struct lp_build_tgsi_context
*bld_base
,
913 struct lp_build_emit_data
*emit_data
)
915 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
916 ac_build_bgnloop(&ctx
->ac
, get_line(bld_base
->pc
));
919 static void brk_emit(const struct lp_build_tgsi_action
*action
,
920 struct lp_build_tgsi_context
*bld_base
,
921 struct lp_build_emit_data
*emit_data
)
923 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
924 ac_build_break(&ctx
->ac
);
927 static void cont_emit(const struct lp_build_tgsi_action
*action
,
928 struct lp_build_tgsi_context
*bld_base
,
929 struct lp_build_emit_data
*emit_data
)
931 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
932 ac_build_continue(&ctx
->ac
);
935 static void else_emit(const struct lp_build_tgsi_action
*action
,
936 struct lp_build_tgsi_context
*bld_base
,
937 struct lp_build_emit_data
*emit_data
)
939 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
940 ac_build_else(&ctx
->ac
, get_line(bld_base
->pc
));
943 static void endif_emit(const struct lp_build_tgsi_action
*action
,
944 struct lp_build_tgsi_context
*bld_base
,
945 struct lp_build_emit_data
*emit_data
)
947 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
948 ac_build_endif(&ctx
->ac
, get_line(bld_base
->pc
));
951 static void endloop_emit(const struct lp_build_tgsi_action
*action
,
952 struct lp_build_tgsi_context
*bld_base
,
953 struct lp_build_emit_data
*emit_data
)
955 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
956 ac_build_endloop(&ctx
->ac
, get_line(bld_base
->pc
));
959 static void if_emit(const struct lp_build_tgsi_action
*action
,
960 struct lp_build_tgsi_context
*bld_base
,
961 struct lp_build_emit_data
*emit_data
)
963 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
964 ac_build_if(&ctx
->ac
, emit_data
->args
[0], get_line(bld_base
->pc
));
967 static void uif_emit(const struct lp_build_tgsi_action
*action
,
968 struct lp_build_tgsi_context
*bld_base
,
969 struct lp_build_emit_data
*emit_data
)
971 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
972 ac_build_uif(&ctx
->ac
, emit_data
->args
[0], get_line(bld_base
->pc
));
975 static void emit_immediate(struct lp_build_tgsi_context
*bld_base
,
976 const struct tgsi_full_immediate
*imm
)
979 struct si_shader_context
*ctx
= si_shader_context(bld_base
);
981 for (i
= 0; i
< 4; ++i
) {
982 ctx
->imms
[ctx
->imms_num
* TGSI_NUM_CHANNELS
+ i
] =
983 LLVMConstInt(ctx
->i32
, imm
->u
[i
].Uint
, false );
989 void si_llvm_context_init(struct si_shader_context
*ctx
,
990 struct si_screen
*sscreen
,
991 LLVMTargetMachineRef tm
)
995 /* Initialize the gallivm object:
996 * We are only using the module, context, and builder fields of this struct.
997 * This should be enough for us to be able to pass our gallivm struct to the
998 * helper functions in the gallivm module.
1000 memset(ctx
, 0, sizeof(*ctx
));
1001 ctx
->screen
= sscreen
;
1004 ctx
->gallivm
.context
= LLVMContextCreate();
1005 ctx
->gallivm
.module
= LLVMModuleCreateWithNameInContext("tgsi",
1006 ctx
->gallivm
.context
);
1007 LLVMSetTarget(ctx
->gallivm
.module
, "amdgcn--");
1009 LLVMTargetDataRef data_layout
= LLVMCreateTargetDataLayout(tm
);
1010 char *data_layout_str
= LLVMCopyStringRepOfTargetData(data_layout
);
1011 LLVMSetDataLayout(ctx
->gallivm
.module
, data_layout_str
);
1012 LLVMDisposeTargetData(data_layout
);
1013 LLVMDisposeMessage(data_layout_str
);
1015 bool unsafe_fpmath
= (sscreen
->debug_flags
& DBG(UNSAFE_MATH
)) != 0;
1016 enum ac_float_mode float_mode
=
1017 unsafe_fpmath
? AC_FLOAT_MODE_UNSAFE_FP_MATH
:
1018 AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH
;
1020 ctx
->gallivm
.builder
= ac_create_builder(ctx
->gallivm
.context
,
1023 ac_llvm_context_init(&ctx
->ac
, ctx
->gallivm
.context
,
1024 sscreen
->info
.chip_class
, sscreen
->info
.family
);
1025 ctx
->ac
.module
= ctx
->gallivm
.module
;
1026 ctx
->ac
.builder
= ctx
->gallivm
.builder
;
1028 struct lp_build_tgsi_context
*bld_base
= &ctx
->bld_base
;
1030 type
.floating
= true;
1037 lp_build_context_init(&bld_base
->base
, &ctx
->gallivm
, type
);
1038 lp_build_context_init(&ctx
->bld_base
.uint_bld
, &ctx
->gallivm
, lp_uint_type(type
));
1039 lp_build_context_init(&ctx
->bld_base
.int_bld
, &ctx
->gallivm
, lp_int_type(type
));
1041 lp_build_context_init(&ctx
->bld_base
.dbl_bld
, &ctx
->gallivm
, type
);
1042 lp_build_context_init(&ctx
->bld_base
.uint64_bld
, &ctx
->gallivm
, lp_uint_type(type
));
1043 lp_build_context_init(&ctx
->bld_base
.int64_bld
, &ctx
->gallivm
, lp_int_type(type
));
1046 bld_base
->emit_swizzle
= emit_swizzle
;
1047 bld_base
->emit_declaration
= emit_declaration
;
1048 bld_base
->emit_immediate
= emit_immediate
;
1050 bld_base
->op_actions
[TGSI_OPCODE_BGNLOOP
].emit
= bgnloop_emit
;
1051 bld_base
->op_actions
[TGSI_OPCODE_BRK
].emit
= brk_emit
;
1052 bld_base
->op_actions
[TGSI_OPCODE_CONT
].emit
= cont_emit
;
1053 bld_base
->op_actions
[TGSI_OPCODE_IF
].emit
= if_emit
;
1054 bld_base
->op_actions
[TGSI_OPCODE_UIF
].emit
= uif_emit
;
1055 bld_base
->op_actions
[TGSI_OPCODE_ELSE
].emit
= else_emit
;
1056 bld_base
->op_actions
[TGSI_OPCODE_ENDIF
].emit
= endif_emit
;
1057 bld_base
->op_actions
[TGSI_OPCODE_ENDLOOP
].emit
= endloop_emit
;
1059 si_shader_context_init_alu(&ctx
->bld_base
);
1060 si_shader_context_init_mem(ctx
);
1062 ctx
->voidt
= LLVMVoidTypeInContext(ctx
->ac
.context
);
1063 ctx
->i1
= LLVMInt1TypeInContext(ctx
->ac
.context
);
1064 ctx
->i8
= LLVMInt8TypeInContext(ctx
->ac
.context
);
1065 ctx
->i32
= LLVMInt32TypeInContext(ctx
->ac
.context
);
1066 ctx
->i64
= LLVMInt64TypeInContext(ctx
->ac
.context
);
1067 ctx
->i128
= LLVMIntTypeInContext(ctx
->ac
.context
, 128);
1068 ctx
->f32
= LLVMFloatTypeInContext(ctx
->ac
.context
);
1069 ctx
->v2i32
= LLVMVectorType(ctx
->i32
, 2);
1070 ctx
->v4i32
= LLVMVectorType(ctx
->i32
, 4);
1071 ctx
->v4f32
= LLVMVectorType(ctx
->f32
, 4);
1072 ctx
->v8i32
= LLVMVectorType(ctx
->i32
, 8);
1074 ctx
->i32_0
= LLVMConstInt(ctx
->i32
, 0, 0);
1075 ctx
->i32_1
= LLVMConstInt(ctx
->i32
, 1, 0);
1078 /* Set the context to a certain TGSI shader. Can be called repeatedly
1079 * to change the shader. */
1080 void si_llvm_context_set_tgsi(struct si_shader_context
*ctx
,
1081 struct si_shader
*shader
)
1083 const struct tgsi_shader_info
*info
= NULL
;
1084 const struct tgsi_token
*tokens
= NULL
;
1086 if (shader
&& shader
->selector
) {
1087 info
= &shader
->selector
->info
;
1088 tokens
= shader
->selector
->tokens
;
1091 ctx
->shader
= shader
;
1092 ctx
->type
= info
? info
->processor
: -1;
1093 ctx
->bld_base
.info
= info
;
1095 /* Clean up the old contents. */
1096 FREE(ctx
->temp_arrays
);
1097 ctx
->temp_arrays
= NULL
;
1098 FREE(ctx
->temp_array_allocas
);
1099 ctx
->temp_array_allocas
= NULL
;
1107 ctx
->temps_count
= 0;
1112 ctx
->num_const_buffers
= util_last_bit(info
->const_buffers_declared
);
1113 ctx
->num_shader_buffers
= util_last_bit(info
->shader_buffers_declared
);
1115 ctx
->num_samplers
= util_last_bit(info
->samplers_declared
);
1116 ctx
->num_images
= util_last_bit(info
->images_declared
);
1121 if (info
->array_max
[TGSI_FILE_TEMPORARY
] > 0) {
1122 int size
= info
->array_max
[TGSI_FILE_TEMPORARY
];
1124 ctx
->temp_arrays
= CALLOC(size
, sizeof(ctx
->temp_arrays
[0]));
1125 ctx
->temp_array_allocas
= CALLOC(size
, sizeof(ctx
->temp_array_allocas
[0]));
1127 tgsi_scan_arrays(tokens
, TGSI_FILE_TEMPORARY
, size
,
1130 if (info
->file_max
[TGSI_FILE_IMMEDIATE
] >= 0) {
1131 int size
= info
->file_max
[TGSI_FILE_IMMEDIATE
] + 1;
1132 ctx
->imms
= MALLOC(size
* TGSI_NUM_CHANNELS
* sizeof(LLVMValueRef
));
1135 /* Re-set these to start with a clean slate. */
1136 ctx
->bld_base
.num_instructions
= 0;
1137 ctx
->bld_base
.pc
= 0;
1138 memset(ctx
->outputs
, 0, sizeof(ctx
->outputs
));
1140 ctx
->bld_base
.emit_store
= si_llvm_emit_store
;
1141 ctx
->bld_base
.emit_fetch_funcs
[TGSI_FILE_IMMEDIATE
] = si_llvm_emit_fetch
;
1142 ctx
->bld_base
.emit_fetch_funcs
[TGSI_FILE_INPUT
] = si_llvm_emit_fetch
;
1143 ctx
->bld_base
.emit_fetch_funcs
[TGSI_FILE_TEMPORARY
] = si_llvm_emit_fetch
;
1144 ctx
->bld_base
.emit_fetch_funcs
[TGSI_FILE_OUTPUT
] = si_llvm_emit_fetch
;
1145 ctx
->bld_base
.emit_fetch_funcs
[TGSI_FILE_SYSTEM_VALUE
] = fetch_system_value
;
1148 void si_llvm_create_func(struct si_shader_context
*ctx
,
1150 LLVMTypeRef
*return_types
, unsigned num_return_elems
,
1151 LLVMTypeRef
*ParamTypes
, unsigned ParamCount
)
1153 LLVMTypeRef main_fn_type
, ret_type
;
1154 LLVMBasicBlockRef main_fn_body
;
1155 enum si_llvm_calling_convention call_conv
;
1156 unsigned real_shader_type
;
1158 if (num_return_elems
)
1159 ret_type
= LLVMStructTypeInContext(ctx
->ac
.context
,
1161 num_return_elems
, true);
1163 ret_type
= ctx
->voidt
;
1165 /* Setup the function */
1166 ctx
->return_type
= ret_type
;
1167 main_fn_type
= LLVMFunctionType(ret_type
, ParamTypes
, ParamCount
, 0);
1168 ctx
->main_fn
= LLVMAddFunction(ctx
->gallivm
.module
, name
, main_fn_type
);
1169 main_fn_body
= LLVMAppendBasicBlockInContext(ctx
->ac
.context
,
1170 ctx
->main_fn
, "main_body");
1171 LLVMPositionBuilderAtEnd(ctx
->ac
.builder
, main_fn_body
);
1173 real_shader_type
= ctx
->type
;
1175 /* LS is merged into HS (TCS), and ES is merged into GS. */
1176 if (ctx
->screen
->info
.chip_class
>= GFX9
) {
1177 if (ctx
->shader
->key
.as_ls
)
1178 real_shader_type
= PIPE_SHADER_TESS_CTRL
;
1179 else if (ctx
->shader
->key
.as_es
)
1180 real_shader_type
= PIPE_SHADER_GEOMETRY
;
1183 switch (real_shader_type
) {
1184 case PIPE_SHADER_VERTEX
:
1185 case PIPE_SHADER_TESS_EVAL
:
1186 call_conv
= RADEON_LLVM_AMDGPU_VS
;
1188 case PIPE_SHADER_TESS_CTRL
:
1189 call_conv
= HAVE_LLVM
>= 0x0500 ? RADEON_LLVM_AMDGPU_HS
:
1190 RADEON_LLVM_AMDGPU_VS
;
1192 case PIPE_SHADER_GEOMETRY
:
1193 call_conv
= RADEON_LLVM_AMDGPU_GS
;
1195 case PIPE_SHADER_FRAGMENT
:
1196 call_conv
= RADEON_LLVM_AMDGPU_PS
;
1198 case PIPE_SHADER_COMPUTE
:
1199 call_conv
= RADEON_LLVM_AMDGPU_CS
;
1202 unreachable("Unhandle shader type");
1205 LLVMSetFunctionCallConv(ctx
->main_fn
, call_conv
);
1208 void si_llvm_optimize_module(struct si_shader_context
*ctx
)
1210 struct gallivm_state
*gallivm
= &ctx
->gallivm
;
1211 const char *triple
= LLVMGetTarget(gallivm
->module
);
1212 LLVMTargetLibraryInfoRef target_library_info
;
1214 /* Dump LLVM IR before any optimization passes */
1215 if (ctx
->screen
->debug_flags
& DBG(PREOPT_IR
) &&
1216 si_can_dump_shader(ctx
->screen
, ctx
->type
))
1217 LLVMDumpModule(ctx
->gallivm
.module
);
1219 /* Create the pass manager */
1220 gallivm
->passmgr
= LLVMCreatePassManager();
1222 target_library_info
= gallivm_create_target_library_info(triple
);
1223 LLVMAddTargetLibraryInfo(target_library_info
, gallivm
->passmgr
);
1225 if (si_extra_shader_checks(ctx
->screen
, ctx
->type
))
1226 LLVMAddVerifierPass(gallivm
->passmgr
);
1228 LLVMAddAlwaysInlinerPass(gallivm
->passmgr
);
1230 /* This pass should eliminate all the load and store instructions */
1231 LLVMAddPromoteMemoryToRegisterPass(gallivm
->passmgr
);
1233 /* Add some optimization passes */
1234 LLVMAddScalarReplAggregatesPass(gallivm
->passmgr
);
1235 LLVMAddLICMPass(gallivm
->passmgr
);
1236 LLVMAddAggressiveDCEPass(gallivm
->passmgr
);
1237 LLVMAddCFGSimplificationPass(gallivm
->passmgr
);
1238 /* This is recommended by the instruction combining pass. */
1239 LLVMAddEarlyCSEMemSSAPass(gallivm
->passmgr
);
1240 LLVMAddInstructionCombiningPass(gallivm
->passmgr
);
1243 LLVMRunPassManager(gallivm
->passmgr
, ctx
->gallivm
.module
);
1245 LLVMDisposeBuilder(ctx
->ac
.builder
);
1246 LLVMDisposePassManager(gallivm
->passmgr
);
1247 gallivm_dispose_target_library_info(target_library_info
);
1250 void si_llvm_dispose(struct si_shader_context
*ctx
)
1252 LLVMDisposeModule(ctx
->gallivm
.module
);
1253 LLVMContextDispose(ctx
->gallivm
.context
);
1254 FREE(ctx
->temp_arrays
);
1255 ctx
->temp_arrays
= NULL
;
1256 FREE(ctx
->temp_array_allocas
);
1257 ctx
->temp_array_allocas
= NULL
;
1260 ctx
->temps_count
= 0;
1264 ac_llvm_context_dispose(&ctx
->ac
);