/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "si_shader_internal.h"
30 * Return a value that is equal to the given i32 \p index if it lies in [0,num)
31 * or an undefined value in the same interval otherwise.
33 static LLVMValueRef
si_llvm_bound_index(struct si_shader_context
*ctx
, LLVMValueRef index
,
36 LLVMBuilderRef builder
= ctx
->ac
.builder
;
37 LLVMValueRef c_max
= LLVMConstInt(ctx
->ac
.i32
, num
- 1, 0);
40 if (util_is_power_of_two_or_zero(num
)) {
41 index
= LLVMBuildAnd(builder
, index
, c_max
, "");
43 /* In theory, this MAX pattern should result in code that is
44 * as good as the bit-wise AND above.
46 * In practice, LLVM generates worse code (at the time of
47 * writing), because its value tracking is not strong enough.
49 cc
= LLVMBuildICmp(builder
, LLVMIntULE
, index
, c_max
, "");
50 index
= LLVMBuildSelect(builder
, cc
, index
, c_max
, "");
56 static LLVMValueRef
load_const_buffer_desc_fast_path(struct si_shader_context
*ctx
)
58 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->const_and_shader_buffers
);
59 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
61 /* Do the bounds checking with a descriptor, because
62 * doing computation and manual bounds checking of 64-bit
63 * addresses generates horrible VALU code with very high
64 * VGPR usage and very low SIMD occupancy.
66 ptr
= LLVMBuildPtrToInt(ctx
->ac
.builder
, ptr
, ctx
->ac
.intptr
, "");
68 LLVMValueRef desc0
, desc1
;
70 desc1
= LLVMConstInt(ctx
->ac
.i32
, S_008F04_BASE_ADDRESS_HI(ctx
->screen
->info
.address32_hi
), 0);
72 uint32_t rsrc3
= S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X
) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y
) |
73 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z
) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W
);
75 if (ctx
->screen
->info
.chip_class
>= GFX10
)
76 rsrc3
|= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT
) |
77 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW
) | S_008F0C_RESOURCE_LEVEL(1);
79 rsrc3
|= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT
) |
80 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32
);
82 LLVMValueRef desc_elems
[] = {desc0
, desc1
,
83 LLVMConstInt(ctx
->ac
.i32
, sel
->info
.constbuf0_num_slots
* 16, 0),
84 LLVMConstInt(ctx
->ac
.i32
, rsrc3
, false)};
86 return ac_build_gather_values(&ctx
->ac
, desc_elems
, 4);
89 static LLVMValueRef
load_ubo(struct ac_shader_abi
*abi
, LLVMValueRef index
)
91 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
92 struct si_shader_selector
*sel
= ctx
->shader
->selector
;
94 LLVMValueRef ptr
= ac_get_arg(&ctx
->ac
, ctx
->const_and_shader_buffers
);
96 if (sel
->info
.const_buffers_declared
== 1 && sel
->info
.shader_buffers_declared
== 0) {
97 return load_const_buffer_desc_fast_path(ctx
);
100 index
= si_llvm_bound_index(ctx
, index
, ctx
->num_const_buffers
);
102 LLVMBuildAdd(ctx
->ac
.builder
, index
, LLVMConstInt(ctx
->ac
.i32
, SI_NUM_SHADER_BUFFERS
, 0), "");
104 return ac_build_load_to_sgpr(&ctx
->ac
, ptr
, index
);
107 static LLVMValueRef
load_ssbo(struct ac_shader_abi
*abi
, LLVMValueRef index
, bool write
)
109 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
111 /* Fast path if the shader buffer is in user SGPRs. */
112 if (LLVMIsConstant(index
) &&
113 LLVMConstIntGetZExtValue(index
) < ctx
->shader
->selector
->cs_num_shaderbufs_in_user_sgprs
)
114 return ac_get_arg(&ctx
->ac
, ctx
->cs_shaderbuf
[LLVMConstIntGetZExtValue(index
)]);
116 LLVMValueRef rsrc_ptr
= ac_get_arg(&ctx
->ac
, ctx
->const_and_shader_buffers
);
118 index
= si_llvm_bound_index(ctx
, index
, ctx
->num_shader_buffers
);
119 index
= LLVMBuildSub(ctx
->ac
.builder
, LLVMConstInt(ctx
->ac
.i32
, SI_NUM_SHADER_BUFFERS
- 1, 0),
122 return ac_build_load_to_sgpr(&ctx
->ac
, rsrc_ptr
, index
);
126 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
128 * At least on Tonga, executing image stores on images with DCC enabled and
129 * non-trivial can eventually lead to lockups. This can occur when an
130 * application binds an image as read-only but then uses a shader that writes
131 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
132 * program termination) in this case, but it doesn't cost much to be a bit
133 * nicer: disabling DCC in the shader still leads to undefined results but
136 static LLVMValueRef
force_dcc_off(struct si_shader_context
*ctx
, LLVMValueRef rsrc
)
138 if (ctx
->screen
->info
.chip_class
<= GFX7
) {
141 LLVMValueRef i32_6
= LLVMConstInt(ctx
->ac
.i32
, 6, 0);
142 LLVMValueRef i32_C
= LLVMConstInt(ctx
->ac
.i32
, C_008F28_COMPRESSION_EN
, 0);
145 tmp
= LLVMBuildExtractElement(ctx
->ac
.builder
, rsrc
, i32_6
, "");
146 tmp
= LLVMBuildAnd(ctx
->ac
.builder
, tmp
, i32_C
, "");
147 return LLVMBuildInsertElement(ctx
->ac
.builder
, rsrc
, tmp
, i32_6
, "");
151 /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
152 * adjust "index" to point to FMASK. */
153 static LLVMValueRef
si_load_image_desc(struct si_shader_context
*ctx
, LLVMValueRef list
,
154 LLVMValueRef index
, enum ac_descriptor_type desc_type
,
155 bool uses_store
, bool bindless
)
157 LLVMBuilderRef builder
= ctx
->ac
.builder
;
160 if (desc_type
== AC_DESC_BUFFER
) {
161 index
= ac_build_imad(&ctx
->ac
, index
, LLVMConstInt(ctx
->ac
.i32
, 2, 0), ctx
->ac
.i32_1
);
162 list
= LLVMBuildPointerCast(builder
, list
, ac_array_in_const32_addr_space(ctx
->ac
.v4i32
), "");
164 assert(desc_type
== AC_DESC_IMAGE
|| desc_type
== AC_DESC_FMASK
);
168 rsrc
= ac_build_load_to_sgpr_uint_wraparound(&ctx
->ac
, list
, index
);
170 rsrc
= ac_build_load_to_sgpr(&ctx
->ac
, list
, index
);
172 if (desc_type
== AC_DESC_IMAGE
&& uses_store
)
173 rsrc
= force_dcc_off(ctx
, rsrc
);
178 * Load an image view, fmask view. or sampler state descriptor.
180 static LLVMValueRef
si_load_sampler_desc(struct si_shader_context
*ctx
, LLVMValueRef list
,
181 LLVMValueRef index
, enum ac_descriptor_type type
)
183 LLVMBuilderRef builder
= ctx
->ac
.builder
;
187 /* The image is at [0:7]. */
188 index
= LLVMBuildMul(builder
, index
, LLVMConstInt(ctx
->ac
.i32
, 2, 0), "");
191 /* The buffer is in [4:7]. */
192 index
= ac_build_imad(&ctx
->ac
, index
, LLVMConstInt(ctx
->ac
.i32
, 4, 0), ctx
->ac
.i32_1
);
193 list
= LLVMBuildPointerCast(builder
, list
, ac_array_in_const32_addr_space(ctx
->ac
.v4i32
), "");
196 /* The FMASK is at [8:15]. */
197 index
= ac_build_imad(&ctx
->ac
, index
, LLVMConstInt(ctx
->ac
.i32
, 2, 0), ctx
->ac
.i32_1
);
199 case AC_DESC_SAMPLER
:
200 /* The sampler state is at [12:15]. */
201 index
= ac_build_imad(&ctx
->ac
, index
, LLVMConstInt(ctx
->ac
.i32
, 4, 0),
202 LLVMConstInt(ctx
->ac
.i32
, 3, 0));
203 list
= LLVMBuildPointerCast(builder
, list
, ac_array_in_const32_addr_space(ctx
->ac
.v4i32
), "");
205 case AC_DESC_PLANE_0
:
206 case AC_DESC_PLANE_1
:
207 case AC_DESC_PLANE_2
:
208 /* Only used for the multiplane image support for Vulkan. Should
209 * never be reached in radeonsi.
211 unreachable("Plane descriptor requested in radeonsi.");
214 return ac_build_load_to_sgpr(&ctx
->ac
, list
, index
);
217 static LLVMValueRef
si_nir_load_sampler_desc(struct ac_shader_abi
*abi
, unsigned descriptor_set
,
218 unsigned base_index
, unsigned constant_index
,
219 LLVMValueRef dynamic_index
,
220 enum ac_descriptor_type desc_type
, bool image
,
221 bool write
, bool bindless
)
223 struct si_shader_context
*ctx
= si_shader_context_from_abi(abi
);
224 LLVMBuilderRef builder
= ctx
->ac
.builder
;
225 unsigned const_index
= base_index
+ constant_index
;
227 assert(!descriptor_set
);
228 assert(desc_type
<= AC_DESC_BUFFER
);
231 LLVMValueRef list
= ac_get_arg(&ctx
->ac
, ctx
->bindless_samplers_and_images
);
233 /* dynamic_index is the bindless handle */
235 /* Bindless image descriptors use 16-dword slots. */
237 LLVMBuildMul(ctx
->ac
.builder
, dynamic_index
, LLVMConstInt(ctx
->ac
.i64
, 2, 0), "");
238 /* FMASK is right after the image. */
239 if (desc_type
== AC_DESC_FMASK
) {
240 dynamic_index
= LLVMBuildAdd(ctx
->ac
.builder
, dynamic_index
, ctx
->ac
.i32_1
, "");
243 return si_load_image_desc(ctx
, list
, dynamic_index
, desc_type
, write
, true);
246 /* Since bindless handle arithmetic can contain an unsigned integer
247 * wraparound and si_load_sampler_desc assumes there isn't any,
248 * use GEP without "inbounds" (inside ac_build_pointer_add)
249 * to prevent incorrect code generation and hangs.
252 LLVMBuildMul(ctx
->ac
.builder
, dynamic_index
, LLVMConstInt(ctx
->ac
.i64
, 2, 0), "");
253 list
= ac_build_pointer_add(&ctx
->ac
, list
, dynamic_index
);
254 return si_load_sampler_desc(ctx
, list
, ctx
->ac
.i32_0
, desc_type
);
257 unsigned num_slots
= image
? ctx
->num_images
: ctx
->num_samplers
;
258 assert(const_index
< num_slots
|| dynamic_index
);
260 LLVMValueRef list
= ac_get_arg(&ctx
->ac
, ctx
->samplers_and_images
);
261 LLVMValueRef index
= LLVMConstInt(ctx
->ac
.i32
, const_index
, false);
264 index
= LLVMBuildAdd(builder
, index
, dynamic_index
, "");
266 /* From the GL_ARB_shader_image_load_store extension spec:
268 * If a shader performs an image load, store, or atomic
269 * operation using an image variable declared as an array,
270 * and if the index used to select an individual element is
271 * negative or greater than or equal to the size of the
272 * array, the results of the operation are undefined but may
273 * not lead to termination.
275 index
= si_llvm_bound_index(ctx
, index
, num_slots
);
279 /* Fast path if the image is in user SGPRs. */
280 if (!dynamic_index
&&
281 const_index
< ctx
->shader
->selector
->cs_num_images_in_user_sgprs
&&
282 (desc_type
== AC_DESC_IMAGE
|| desc_type
== AC_DESC_BUFFER
))
283 return ac_get_arg(&ctx
->ac
, ctx
->cs_image
[const_index
]);
285 /* FMASKs are separate from images. */
286 if (desc_type
== AC_DESC_FMASK
) {
288 LLVMBuildAdd(ctx
->ac
.builder
, index
, LLVMConstInt(ctx
->ac
.i32
, SI_NUM_IMAGES
, 0), "");
290 index
= LLVMBuildSub(ctx
->ac
.builder
, LLVMConstInt(ctx
->ac
.i32
, SI_NUM_IMAGE_SLOTS
- 1, 0),
292 return si_load_image_desc(ctx
, list
, index
, desc_type
, write
, false);
295 index
= LLVMBuildAdd(ctx
->ac
.builder
, index
,
296 LLVMConstInt(ctx
->ac
.i32
, SI_NUM_IMAGE_SLOTS
/ 2, 0), "");
297 return si_load_sampler_desc(ctx
, list
, index
, desc_type
);
300 void si_llvm_init_resource_callbacks(struct si_shader_context
*ctx
)
302 ctx
->abi
.load_ubo
= load_ubo
;
303 ctx
->abi
.load_ssbo
= load_ssbo
;
304 ctx
->abi
.load_sampler_desc
= si_nir_load_sampler_desc
;