radeonsi: kill point size VS output if it's not used by the rasterizer
[mesa.git] / src / gallium / drivers / radeonsi / si_shader_llvm_resources.c
1 /*
2 * Copyright 2020 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28
29 /**
30 * Return a value that is equal to the given i32 \p index if it lies in [0,num)
31 * or an undefined value in the same interval otherwise.
32 */
33 static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index,
34 unsigned num)
35 {
36 LLVMBuilderRef builder = ctx->ac.builder;
37 LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
38 LLVMValueRef cc;
39
40 if (util_is_power_of_two_or_zero(num)) {
41 index = LLVMBuildAnd(builder, index, c_max, "");
42 } else {
43 /* In theory, this MAX pattern should result in code that is
44 * as good as the bit-wise AND above.
45 *
46 * In practice, LLVM generates worse code (at the time of
47 * writing), because its value tracking is not strong enough.
48 */
49 cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
50 index = LLVMBuildSelect(builder, cc, index, c_max, "");
51 }
52
53 return index;
54 }
55
56 static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
57 {
58 LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
59 struct si_shader_selector *sel = ctx->shader->selector;
60
61 /* Do the bounds checking with a descriptor, because
62 * doing computation and manual bounds checking of 64-bit
63 * addresses generates horrible VALU code with very high
64 * VGPR usage and very low SIMD occupancy.
65 */
66 ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
67
68 LLVMValueRef desc0, desc1;
69 desc0 = ptr;
70 desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
71
72 uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
73 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
74
75 if (ctx->screen->info.chip_class >= GFX10)
76 rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
77 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
78 else
79 rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
80 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
81
82 LLVMValueRef desc_elems[] = {desc0, desc1,
83 LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
84 LLVMConstInt(ctx->ac.i32, rsrc3, false)};
85
86 return ac_build_gather_values(&ctx->ac, desc_elems, 4);
87 }
88
89 static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
90 {
91 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
92 struct si_shader_selector *sel = ctx->shader->selector;
93
94 LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
95
96 if (sel->info.base.num_ubos == 1 && sel->info.base.num_ssbos == 0) {
97 return load_const_buffer_desc_fast_path(ctx);
98 }
99
100 index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
101 index =
102 LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");
103
104 return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
105 }
106
107 static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
108 {
109 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
110
111 /* Fast path if the shader buffer is in user SGPRs. */
112 if (LLVMIsConstant(index) &&
113 LLVMConstIntGetZExtValue(index) < ctx->shader->selector->cs_num_shaderbufs_in_user_sgprs)
114 return ac_get_arg(&ctx->ac, ctx->cs_shaderbuf[LLVMConstIntGetZExtValue(index)]);
115
116 LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
117
118 index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
119 index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
120 index, "");
121
122 return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
123 }
124
125 /**
126 * Given a 256-bit resource descriptor, force the DCC enable bit to off.
127 *
128 * At least on Tonga, executing image stores on images with DCC enabled and
129 * non-trivial can eventually lead to lockups. This can occur when an
130 * application binds an image as read-only but then uses a shader that writes
131 * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
132 * program termination) in this case, but it doesn't cost much to be a bit
133 * nicer: disabling DCC in the shader still leads to undefined results but
134 * avoids the lockup.
135 */
136 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
137 {
138 if (ctx->screen->info.chip_class <= GFX7) {
139 return rsrc;
140 } else {
141 LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
142 LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
143 LLVMValueRef tmp;
144
145 tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
146 tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
147 return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
148 }
149 }
150
151 /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
152 * adjust "index" to point to FMASK. */
153 static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,
154 LLVMValueRef index, enum ac_descriptor_type desc_type,
155 bool uses_store, bool bindless)
156 {
157 LLVMBuilderRef builder = ctx->ac.builder;
158 LLVMValueRef rsrc;
159
160 if (desc_type == AC_DESC_BUFFER) {
161 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
162 list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
163 } else {
164 assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);
165 }
166
167 if (bindless)
168 rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
169 else
170 rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
171
172 if (desc_type == AC_DESC_IMAGE && uses_store)
173 rsrc = force_dcc_off(ctx, rsrc);
174 return rsrc;
175 }
176
177 /**
178 * Load an image view, fmask view. or sampler state descriptor.
179 */
180 static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,
181 LLVMValueRef index, enum ac_descriptor_type type)
182 {
183 LLVMBuilderRef builder = ctx->ac.builder;
184
185 switch (type) {
186 case AC_DESC_IMAGE:
187 /* The image is at [0:7]. */
188 index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
189 break;
190 case AC_DESC_BUFFER:
191 /* The buffer is in [4:7]. */
192 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
193 list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
194 break;
195 case AC_DESC_FMASK:
196 /* The FMASK is at [8:15]. */
197 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
198 break;
199 case AC_DESC_SAMPLER:
200 /* The sampler state is at [12:15]. */
201 index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
202 LLVMConstInt(ctx->ac.i32, 3, 0));
203 list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
204 break;
205 case AC_DESC_PLANE_0:
206 case AC_DESC_PLANE_1:
207 case AC_DESC_PLANE_2:
208 /* Only used for the multiplane image support for Vulkan. Should
209 * never be reached in radeonsi.
210 */
211 unreachable("Plane descriptor requested in radeonsi.");
212 }
213
214 return ac_build_load_to_sgpr(&ctx->ac, list, index);
215 }
216
217 static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set,
218 unsigned base_index, unsigned constant_index,
219 LLVMValueRef dynamic_index,
220 enum ac_descriptor_type desc_type, bool image,
221 bool write, bool bindless)
222 {
223 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
224 LLVMBuilderRef builder = ctx->ac.builder;
225 unsigned const_index = base_index + constant_index;
226
227 assert(!descriptor_set);
228 assert(desc_type <= AC_DESC_BUFFER);
229
230 if (bindless) {
231 LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);
232
233 /* dynamic_index is the bindless handle */
234 if (image) {
235 /* Bindless image descriptors use 16-dword slots. */
236 dynamic_index =
237 LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
238 /* FMASK is right after the image. */
239 if (desc_type == AC_DESC_FMASK) {
240 dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, "");
241 }
242
243 return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true);
244 }
245
246 /* Since bindless handle arithmetic can contain an unsigned integer
247 * wraparound and si_load_sampler_desc assumes there isn't any,
248 * use GEP without "inbounds" (inside ac_build_pointer_add)
249 * to prevent incorrect code generation and hangs.
250 */
251 dynamic_index =
252 LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
253 list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
254 return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
255 }
256
257 unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
258 assert(const_index < num_slots || dynamic_index);
259
260 LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
261 LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);
262
263 if (dynamic_index) {
264 index = LLVMBuildAdd(builder, index, dynamic_index, "");
265
266 /* From the GL_ARB_shader_image_load_store extension spec:
267 *
268 * If a shader performs an image load, store, or atomic
269 * operation using an image variable declared as an array,
270 * and if the index used to select an individual element is
271 * negative or greater than or equal to the size of the
272 * array, the results of the operation are undefined but may
273 * not lead to termination.
274 */
275 index = si_llvm_bound_index(ctx, index, num_slots);
276 }
277
278 if (image) {
279 /* Fast path if the image is in user SGPRs. */
280 if (!dynamic_index &&
281 const_index < ctx->shader->selector->cs_num_images_in_user_sgprs &&
282 (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER))
283 return ac_get_arg(&ctx->ac, ctx->cs_image[const_index]);
284
285 /* FMASKs are separate from images. */
286 if (desc_type == AC_DESC_FMASK) {
287 index =
288 LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
289 }
290 index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
291 index, "");
292 return si_load_image_desc(ctx, list, index, desc_type, write, false);
293 }
294
295 index = LLVMBuildAdd(ctx->ac.builder, index,
296 LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
297 return si_load_sampler_desc(ctx, list, index, desc_type);
298 }
299
300 void si_llvm_init_resource_callbacks(struct si_shader_context *ctx)
301 {
302 ctx->abi.load_ubo = load_ubo;
303 ctx->abi.load_ssbo = load_ssbo;
304 ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
305 }