1 /*
2 * Copyright © 2016 Bas Nieuwenhuizen
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "ac_nir_to_llvm.h"
25
26 #include "ac_binary.h"
27 #include "ac_llvm_build.h"
28 #include "ac_llvm_util.h"
29 #include "ac_shader_abi.h"
30 #include "ac_shader_util.h"
31 #include "nir/nir.h"
32 #include "nir/nir_deref.h"
33 #include "sid.h"
34 #include "util/bitscan.h"
35 #include "util/u_math.h"
36 #include <llvm/Config/llvm-config.h>
37
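/* Per-shader translation state used while lowering NIR to LLVM IR: the LLVM
 * context/builder, the ABI callbacks and argument layout supplied by the
 * driver, and lookup tables (SSA defs, phis, variables, blocks) used by the
 * visitors below.
 */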
38 struct ac_nir_context {
39 struct ac_llvm_context ac;
40 struct ac_shader_abi *abi;
41 const struct ac_shader_args *args;
42
43 gl_shader_stage stage;
44 shader_info *info;
45
46 LLVMValueRef *ssa_defs;
47
48 LLVMValueRef scratch;
49 LLVMValueRef constant_data;
50
51 struct hash_table *defs;
52 struct hash_table *phis;
53 struct hash_table *vars;
54 struct hash_table *verified_interp;
55
56 LLVMValueRef main_function;
57 LLVMBasicBlockRef continue_block;
58 LLVMBasicBlockRef break_block;
59
60 int num_locals;
61 LLVMValueRef *locals;
62 };
63
64 static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
65 const nir_instr *instr, bool image);
66
67 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
68 enum ac_descriptor_type desc_type, const nir_instr *instr,
69 LLVMValueRef index, bool image, bool write);
70
71 static void build_store_values_extended(struct ac_llvm_context *ac, LLVMValueRef *values,
72 unsigned value_count, unsigned value_stride,
73 LLVMValueRef vec)
74 {
75 LLVMBuilderRef builder = ac->builder;
76 unsigned i;
77
78 for (i = 0; i < value_count; i++) {
79 LLVMValueRef ptr = values[i * value_stride];
80 LLVMValueRef index = LLVMConstInt(ac->i32, i, false);
81 LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, "");
82 LLVMBuildStore(builder, value, ptr);
83 }
84 }
85
86 static LLVMTypeRef get_def_type(struct ac_nir_context *ctx, const nir_ssa_def *def)
87 {
88 LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size);
89 if (def->num_components > 1) {
90 type = LLVMVectorType(type, def->num_components);
91 }
92 return type;
93 }
94
95 static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src)
96 {
97 assert(src.is_ssa);
98 return nir->ssa_defs[src.ssa->index];
99 }
100
101 static LLVMValueRef get_memory_ptr(struct ac_nir_context *ctx, nir_src src, unsigned bit_size)
102 {
103 LLVMValueRef ptr = get_src(ctx, src);
104 ptr = LLVMBuildGEP(ctx->ac.builder, ctx->ac.lds, &ptr, 1, "");
105 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
106
107 LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, bit_size);
108
109 return LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(type, addr_space), "");
110 }
111
112 static LLVMBasicBlockRef get_block(struct ac_nir_context *nir, const struct nir_block *b)
113 {
114 struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b);
115 return (LLVMBasicBlockRef)entry->data;
116 }
117
118 static LLVMValueRef get_alu_src(struct ac_nir_context *ctx, nir_alu_src src,
119 unsigned num_components)
120 {
121 LLVMValueRef value = get_src(ctx, src.src);
122 bool need_swizzle = false;
123
124 assert(value);
125 unsigned src_components = ac_get_llvm_num_components(value);
126 for (unsigned i = 0; i < num_components; ++i) {
127 assert(src.swizzle[i] < src_components);
128 if (src.swizzle[i] != i)
129 need_swizzle = true;
130 }
131
132 if (need_swizzle || num_components != src_components) {
133 LLVMValueRef masks[] = {LLVMConstInt(ctx->ac.i32, src.swizzle[0], false),
134 LLVMConstInt(ctx->ac.i32, src.swizzle[1], false),
135 LLVMConstInt(ctx->ac.i32, src.swizzle[2], false),
136 LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)};
137
138 if (src_components > 1 && num_components == 1) {
139 value = LLVMBuildExtractElement(ctx->ac.builder, value, masks[0], "");
140 } else if (src_components == 1 && num_components > 1) {
141 LLVMValueRef values[] = {value, value, value, value};
142 value = ac_build_gather_values(&ctx->ac, values, num_components);
143 } else {
144 LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
145 value = LLVMBuildShuffleVector(ctx->ac.builder, value, value, swizzle, "");
146 }
147 }
148 assert(!src.negate);
149 assert(!src.abs);
150 return value;
151 }
152
153 static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx, LLVMIntPredicate pred,
154 LLVMValueRef src0, LLVMValueRef src1)
155 {
156 LLVMTypeRef src0_type = LLVMTypeOf(src0);
157 LLVMTypeRef src1_type = LLVMTypeOf(src1);
158
159 if (LLVMGetTypeKind(src0_type) == LLVMPointerTypeKind &&
160 LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) {
161 src1 = LLVMBuildIntToPtr(ctx->builder, src1, src0_type, "");
162 } else if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind &&
163 LLVMGetTypeKind(src0_type) != LLVMPointerTypeKind) {
164 src0 = LLVMBuildIntToPtr(ctx->builder, src0, src1_type, "");
165 }
166
167 LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, "");
168 return LLVMBuildSelect(ctx->builder, result, LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
169 ctx->i32_0, "");
170 }
171
172 static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx, LLVMRealPredicate pred,
173 LLVMValueRef src0, LLVMValueRef src1)
174 {
175 LLVMValueRef result;
176 src0 = ac_to_float(ctx, src0);
177 src1 = ac_to_float(ctx, src1);
178 result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, "");
179 return LLVMBuildSelect(ctx->builder, result, LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
180 ctx->i32_0, "");
181 }
182
183 static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx, const char *intrin,
184 LLVMTypeRef result_type, LLVMValueRef src0)
185 {
186 char name[64], type[64];
187 LLVMValueRef params[] = {
188 ac_to_float(ctx, src0),
189 };
190
191 ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
192 ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
193 assert(length < sizeof(name));
194 return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
195 }
196
197 static LLVMValueRef emit_intrin_1f_param_scalar(struct ac_llvm_context *ctx, const char *intrin,
198 LLVMTypeRef result_type, LLVMValueRef src0)
199 {
200 if (LLVMGetTypeKind(result_type) != LLVMVectorTypeKind)
201 return emit_intrin_1f_param(ctx, intrin, result_type, src0);
202
203 LLVMTypeRef elem_type = LLVMGetElementType(result_type);
204 LLVMValueRef ret = LLVMGetUndef(result_type);
205
206 /* Scalarize the intrinsic, because vectors are not supported. */
207 for (unsigned i = 0; i < LLVMGetVectorSize(result_type); i++) {
208 char name[64], type[64];
209 LLVMValueRef params[] = {
210 ac_to_float(ctx, ac_llvm_extract_elem(ctx, src0, i)),
211 };
212
213 ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
214 ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
215 assert(length < sizeof(name));
216 ret = LLVMBuildInsertElement(
217 ctx->builder, ret,
218 ac_build_intrinsic(ctx, name, elem_type, params, 1, AC_FUNC_ATTR_READNONE),
219 LLVMConstInt(ctx->i32, i, 0), "");
220 }
221 return ret;
222 }
223
224 static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx, const char *intrin,
225 LLVMTypeRef result_type, LLVMValueRef src0,
226 LLVMValueRef src1)
227 {
228 char name[64], type[64];
229 LLVMValueRef params[] = {
230 ac_to_float(ctx, src0),
231 ac_to_float(ctx, src1),
232 };
233
234 ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
235 ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
236 assert(length < sizeof(name));
237 return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
238 }
239
240 static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx, const char *intrin,
241 LLVMTypeRef result_type, LLVMValueRef src0,
242 LLVMValueRef src1, LLVMValueRef src2)
243 {
244 char name[64], type[64];
245 LLVMValueRef params[] = {
246 ac_to_float(ctx, src0),
247 ac_to_float(ctx, src1),
248 ac_to_float(ctx, src2),
249 };
250
251 ac_build_type_name_for_intr(LLVMTypeOf(params[0]), type, sizeof(type));
252 ASSERTED const int length = snprintf(name, sizeof(name), "%s.%s", intrin, type);
253 assert(length < sizeof(name));
254 return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
255 }
256
257 static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx, LLVMValueRef src0, LLVMValueRef src1,
258 LLVMValueRef src2)
259 {
260 LLVMTypeRef src1_type = LLVMTypeOf(src1);
261 LLVMTypeRef src2_type = LLVMTypeOf(src2);
262
263 if (LLVMGetTypeKind(src1_type) == LLVMPointerTypeKind &&
264 LLVMGetTypeKind(src2_type) != LLVMPointerTypeKind) {
265 src2 = LLVMBuildIntToPtr(ctx->builder, src2, src1_type, "");
266 } else if (LLVMGetTypeKind(src2_type) == LLVMPointerTypeKind &&
267 LLVMGetTypeKind(src1_type) != LLVMPointerTypeKind) {
268 src1 = LLVMBuildIntToPtr(ctx->builder, src1, src2_type, "");
269 }
270
271 LLVMValueRef v =
272 LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, LLVMConstNull(LLVMTypeOf(src0)), "");
273 return LLVMBuildSelect(ctx->builder, v, ac_to_integer_or_pointer(ctx, src1),
274 ac_to_integer_or_pointer(ctx, src2), "");
275 }
276
277 static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx, LLVMValueRef src0)
278 {
279 return ac_build_imax(ctx, src0, LLVMBuildNeg(ctx->builder, src0, ""));
280 }
281
282 static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx, const char *intrin,
283 LLVMValueRef src0, LLVMValueRef src1)
284 {
285 LLVMTypeRef ret_type;
286 LLVMTypeRef types[] = {ctx->i32, ctx->i1};
287 LLVMValueRef res;
288 LLVMValueRef params[] = {src0, src1};
289 ret_type = LLVMStructTypeInContext(ctx->context, types, 2, true);
290
291 res = ac_build_intrinsic(ctx, intrin, ret_type, params, 2, AC_FUNC_ATTR_READNONE);
292
293 res = LLVMBuildExtractValue(ctx->builder, res, 1, "");
294 res = LLVMBuildZExt(ctx->builder, res, ctx->i32, "");
295 return res;
296 }
297
298 static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
299 {
300 assert(ac_get_elem_bits(ctx, LLVMTypeOf(src0)) == 32);
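/* Booleans are represented as 0 / 0xffffffff here, so masking with 0x3f800000
 * (the IEEE-754 bit pattern of 1.0f) yields 0.0f for false and 1.0f for true
 * once reinterpreted as float below.
 */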
301 LLVMValueRef result =
302 LLVMBuildAnd(ctx->builder, src0, ac_const_uint_vec(ctx, LLVMTypeOf(src0), 0x3f800000), "");
303 result = ac_to_float(ctx, result);
304
305 switch (bitsize) {
306 case 16: {
307 bool vec2 = LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind;
308 return LLVMBuildFPTrunc(ctx->builder, result, vec2 ? ctx->v2f16 : ctx->f16, "");
309 }
310 case 32:
311 return result;
312 case 64:
313 return LLVMBuildFPExt(ctx->builder, result, ctx->f64, "");
314 default:
315 unreachable("Unsupported bit size.");
316 }
317 }
318
319 static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx, LLVMValueRef src0)
320 {
321 src0 = ac_to_float(ctx, src0);
322 LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0));
323 return LLVMBuildSExt(ctx->builder, LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, zero, ""),
324 ctx->i32, "");
325 }
326
327 static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
328 {
329 LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, "");
330
331 switch (bitsize) {
332 case 8:
333 return LLVMBuildTrunc(ctx->builder, result, ctx->i8, "");
334 case 16:
335 return LLVMBuildTrunc(ctx->builder, result, ctx->i16, "");
336 case 32:
337 return result;
338 case 64:
339 return LLVMBuildZExt(ctx->builder, result, ctx->i64, "");
340 default:
341 unreachable("Unsupported bit size.");
342 }
343 }
344
345 static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx, LLVMValueRef src0)
346 {
347 LLVMValueRef zero = LLVMConstNull(LLVMTypeOf(src0));
348 return LLVMBuildSExt(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, zero, ""),
349 ctx->i32, "");
350 }
351
352 static LLVMValueRef emit_f2f16(struct ac_llvm_context *ctx, LLVMValueRef src0)
353 {
354 LLVMValueRef result;
355 LLVMValueRef cond = NULL;
356
357 src0 = ac_to_float(ctx, src0);
358 result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->f16, "");
359
360 if (ctx->chip_class >= GFX8) {
361 LLVMValueRef args[2];
362 /* Check if the result is a denormal - and flush to 0 if so. */
363 args[0] = result;
364 args[1] = LLVMConstInt(ctx->i32, N_SUBNORMAL | P_SUBNORMAL, false);
365 cond =
366 ac_build_intrinsic(ctx, "llvm.amdgcn.class.f16", ctx->i1, args, 2, AC_FUNC_ATTR_READNONE);
367 }
368
369 /* need to convert back up to f32 */
370 result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, "");
371
372 if (ctx->chip_class >= GFX8)
373 result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, "");
374 else {
375 /* for GFX6-GFX7 */
376 /* 0x38800000 is the smallest normal half-float value (2^-14) as a 32-bit float,
377 * so compare the result and flush to 0 if it's smaller.
378 */
379 LLVMValueRef temp, cond2;
380 temp = emit_intrin_1f_param(ctx, "llvm.fabs", ctx->f32, result);
381 cond = LLVMBuildFCmp(
382 ctx->builder, LLVMRealOGT,
383 LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""),
384 temp, "");
385 cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealONE, temp, ctx->f32_0, "");
386 cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
387 result = LLVMBuildSelect(ctx->builder, cond, ctx->f32_0, result, "");
388 }
389 return result;
390 }
391
392 static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx, LLVMValueRef src0,
393 LLVMValueRef src1)
394 {
395 LLVMValueRef dst64, result;
396 src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, "");
397 src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, "");
398
399 dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
400 dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
401 result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
402 return result;
403 }
404
405 static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx, LLVMValueRef src0,
406 LLVMValueRef src1)
407 {
408 LLVMValueRef dst64, result;
409 src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, "");
410 src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, "");
411
412 dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
413 dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
414 result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
415 return result;
416 }
417
418 static LLVMValueRef emit_bfm(struct ac_llvm_context *ctx, LLVMValueRef bits, LLVMValueRef offset)
419 {
420 /* mask = ((1 << bits) - 1) << offset */
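/* E.g. bits = 5, offset = 3 gives ((1 << 5) - 1) << 3 = 0x000000f8, a run of
 * five set bits starting at bit 3.
 */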
421 return LLVMBuildShl(
422 ctx->builder,
423 LLVMBuildSub(ctx->builder, LLVMBuildShl(ctx->builder, ctx->i32_1, bits, ""), ctx->i32_1, ""),
424 offset, "");
425 }
426
427 static LLVMValueRef emit_bitfield_select(struct ac_llvm_context *ctx, LLVMValueRef mask,
428 LLVMValueRef insert, LLVMValueRef base)
429 {
430 /* Calculate:
431 * (mask & insert) | (~mask & base) = base ^ (mask & (insert ^ base))
432 * Use the right-hand side, which the LLVM backend can convert to V_BFI.
433 */
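/* Bit-wise check of the identity: where mask = 1 the result is
 * base ^ (insert ^ base) = insert, and where mask = 0 it is base ^ 0 = base,
 * i.e. exactly (mask & insert) | (~mask & base).
 */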
434 return LLVMBuildXor(
435 ctx->builder, base,
436 LLVMBuildAnd(ctx->builder, mask, LLVMBuildXor(ctx->builder, insert, base, ""), ""), "");
437 }
438
439 static LLVMValueRef emit_pack_2x16(struct ac_llvm_context *ctx, LLVMValueRef src0,
440 LLVMValueRef (*pack)(struct ac_llvm_context *ctx,
441 LLVMValueRef args[2]))
442 {
443 LLVMValueRef comp[2];
444
445 src0 = ac_to_float(ctx, src0);
446 comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, "");
447 comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, "");
448
449 return LLVMBuildBitCast(ctx->builder, pack(ctx, comp), ctx->i32, "");
450 }
451
452 static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx, LLVMValueRef src0)
453 {
454 LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
455 LLVMValueRef temps[2], val;
456 int i;
457
458 for (i = 0; i < 2; i++) {
459 val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0;
460 val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, "");
461 val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, "");
462 temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, "");
463 }
464 return ac_build_gather_values(ctx, temps, 2);
465 }
466
467 static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx, nir_op op, LLVMValueRef src0)
468 {
469 unsigned mask;
470 int idx;
471 LLVMValueRef result;
472
473 if (op == nir_op_fddx_fine)
474 mask = AC_TID_MASK_LEFT;
475 else if (op == nir_op_fddy_fine)
476 mask = AC_TID_MASK_TOP;
477 else
478 mask = AC_TID_MASK_TOP_LEFT;
479
480 /* For DDX we want the next X pixel, for DDY the next Y pixel. */
481 if (op == nir_op_fddx_fine || op == nir_op_fddx_coarse || op == nir_op_fddx)
482 idx = 1;
483 else
484 idx = 2;
485
486 result = ac_build_ddxy(&ctx->ac, mask, idx, src0);
487 return result;
488 }
489
490 struct waterfall_context {
491 LLVMBasicBlockRef phi_bb[2];
492 bool use_waterfall;
493 };
494
495 /* To deal with divergent descriptors we can create a loop that handles all
496 * lanes with the same descriptor on a given iteration (henceforth a
497 * waterfall loop).
498 *
499 * These helpers create the begin and end of the loop, leaving the caller
500 * to implement the body.
501 *
502 * params:
503 * - ctx is the usual nir context
504 * - wctx is a temporary struct containing some loop info. Can be left uninitialized.
505 * - value is the possibly divergent value for which we build the loop
506 * - divergent is whether value is actually divergent. If false, we just pass
507 * things through.
508 */
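/* Typical usage from an intrinsic visitor (a sketch; the variable names are
 * illustrative, see enter_waterfall_ssbo() further down for a real caller):
 *
 *   struct waterfall_context wctx;
 *   LLVMValueRef rsrc_index = enter_waterfall(ctx, &wctx, index, divergent);
 *   ... emit the descriptor load / memory access using rsrc_index ...
 *   result = exit_waterfall(ctx, &wctx, result);
 */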
509 static LLVMValueRef enter_waterfall(struct ac_nir_context *ctx, struct waterfall_context *wctx,
510 LLVMValueRef value, bool divergent)
511 {
512 /* If the app claims the value is divergent but it is constant we can
513 * end up with a dynamic index of NULL. */
514 if (!value)
515 divergent = false;
516
517 wctx->use_waterfall = divergent;
518 if (!divergent)
519 return value;
520
521 ac_build_bgnloop(&ctx->ac, 6000);
522
523 LLVMValueRef scalar_value = ac_build_readlane(&ctx->ac, value, NULL);
524
525 LLVMValueRef active =
526 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, value, scalar_value, "uniform_active");
527
528 wctx->phi_bb[0] = LLVMGetInsertBlock(ctx->ac.builder);
529 ac_build_ifcc(&ctx->ac, active, 6001);
530
531 return scalar_value;
532 }
533
534 static LLVMValueRef exit_waterfall(struct ac_nir_context *ctx, struct waterfall_context *wctx,
535 LLVMValueRef value)
536 {
537 LLVMValueRef ret = NULL;
538 LLVMValueRef phi_src[2];
539 LLVMValueRef cc_phi_src[2] = {
540 LLVMConstInt(ctx->ac.i32, 0, false),
541 LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
542 };
543
544 if (!wctx->use_waterfall)
545 return value;
546
547 wctx->phi_bb[1] = LLVMGetInsertBlock(ctx->ac.builder);
548
549 ac_build_endif(&ctx->ac, 6001);
550
551 if (value) {
552 phi_src[0] = LLVMGetUndef(LLVMTypeOf(value));
553 phi_src[1] = value;
554
555 ret = ac_build_phi(&ctx->ac, LLVMTypeOf(value), 2, phi_src, wctx->phi_bb);
556 }
557
558 /*
559 * By using the optimization barrier on the exit decision, we decouple
560 * the operations from the break, and hence avoid LLVM hoisting the
561 * operation into the break block.
562 */
563 LLVMValueRef cc = ac_build_phi(&ctx->ac, ctx->ac.i32, 2, cc_phi_src, wctx->phi_bb);
564 ac_build_optimization_barrier(&ctx->ac, &cc);
565
566 LLVMValueRef active =
567 LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, cc, ctx->ac.i32_0, "uniform_active2");
568 ac_build_ifcc(&ctx->ac, active, 6002);
569 ac_build_break(&ctx->ac);
570 ac_build_endif(&ctx->ac, 6002);
571
572 ac_build_endloop(&ctx->ac, 6000);
573 return ret;
574 }
575
576 static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
577 {
578 LLVMValueRef src[4], result = NULL;
579 unsigned num_components = instr->dest.dest.ssa.num_components;
580 unsigned src_components;
581 LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);
582
583 assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
584 switch (instr->op) {
585 case nir_op_vec2:
586 case nir_op_vec3:
587 case nir_op_vec4:
588 src_components = 1;
589 break;
590 case nir_op_pack_half_2x16:
591 case nir_op_pack_snorm_2x16:
592 case nir_op_pack_unorm_2x16:
593 src_components = 2;
594 break;
595 case nir_op_unpack_half_2x16:
596 src_components = 1;
597 break;
598 case nir_op_cube_face_coord:
599 case nir_op_cube_face_index:
600 src_components = 3;
601 break;
602 default:
603 src_components = num_components;
604 break;
605 }
606 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
607 src[i] = get_alu_src(ctx, instr->src[i], src_components);
608
609 switch (instr->op) {
610 case nir_op_mov:
611 result = src[0];
612 break;
613 case nir_op_fneg:
614 src[0] = ac_to_float(&ctx->ac, src[0]);
615 result = LLVMBuildFNeg(ctx->ac.builder, src[0], "");
616 if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) {
617 /* fneg will be optimized by the backend compiler with the sign
618 * bit removed via XOR. This is probably an LLVM bug.
619 */
620 result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
621 }
622 break;
623 case nir_op_ineg:
624 result = LLVMBuildNeg(ctx->ac.builder, src[0], "");
625 break;
626 case nir_op_inot:
627 result = LLVMBuildNot(ctx->ac.builder, src[0], "");
628 break;
629 case nir_op_iadd:
630 result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], "");
631 break;
632 case nir_op_fadd:
633 src[0] = ac_to_float(&ctx->ac, src[0]);
634 src[1] = ac_to_float(&ctx->ac, src[1]);
635 result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], "");
636 break;
637 case nir_op_fsub:
638 src[0] = ac_to_float(&ctx->ac, src[0]);
639 src[1] = ac_to_float(&ctx->ac, src[1]);
640 result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], "");
641 break;
642 case nir_op_isub:
643 result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], "");
644 break;
645 case nir_op_imul:
646 result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], "");
647 break;
648 case nir_op_imod:
649 result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
650 break;
651 case nir_op_umod:
652 result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], "");
653 break;
654 case nir_op_irem:
655 result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
656 break;
657 case nir_op_idiv:
658 result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], "");
659 break;
660 case nir_op_udiv:
661 result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], "");
662 break;
663 case nir_op_fmul:
664 src[0] = ac_to_float(&ctx->ac, src[0]);
665 src[1] = ac_to_float(&ctx->ac, src[1]);
666 result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
667 break;
668 case nir_op_frcp:
669 /* For doubles, we need precise division to pass GLCTS. */
670 if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && ac_get_type_size(def_type) == 8) {
671 result = LLVMBuildFDiv(ctx->ac.builder, ctx->ac.f64_1, ac_to_float(&ctx->ac, src[0]), "");
672 } else {
673 result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rcp",
674 ac_to_float_type(&ctx->ac, def_type), src[0]);
675 }
676 if (ctx->abi->clamp_div_by_zero)
677 result = ac_build_fmin(&ctx->ac, result,
678 LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
679 break;
680 case nir_op_iand:
681 result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
682 break;
683 case nir_op_ior:
684 result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], "");
685 break;
686 case nir_op_ixor:
687 result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], "");
688 break;
689 case nir_op_ishl:
690 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) <
691 ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
692 src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
693 else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) >
694 ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
695 src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
696 result = LLVMBuildShl(ctx->ac.builder, src[0], src[1], "");
697 break;
698 case nir_op_ishr:
699 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) <
700 ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
701 src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
702 else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) >
703 ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
704 src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
705 result = LLVMBuildAShr(ctx->ac.builder, src[0], src[1], "");
706 break;
707 case nir_op_ushr:
708 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) <
709 ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
710 src[1] = LLVMBuildZExt(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
711 else if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[1])) >
712 ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])))
713 src[1] = LLVMBuildTrunc(ctx->ac.builder, src[1], LLVMTypeOf(src[0]), "");
714 result = LLVMBuildLShr(ctx->ac.builder, src[0], src[1], "");
715 break;
716 case nir_op_ilt32:
717 result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]);
718 break;
719 case nir_op_ine32:
720 result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]);
721 break;
722 case nir_op_ieq32:
723 result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]);
724 break;
725 case nir_op_ige32:
726 result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]);
727 break;
728 case nir_op_ult32:
729 result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]);
730 break;
731 case nir_op_uge32:
732 result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]);
733 break;
734 case nir_op_feq32:
735 result = emit_float_cmp(&ctx->ac, LLVMRealOEQ, src[0], src[1]);
736 break;
737 case nir_op_fneu32:
738 result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]);
739 break;
740 case nir_op_flt32:
741 result = emit_float_cmp(&ctx->ac, LLVMRealOLT, src[0], src[1]);
742 break;
743 case nir_op_fge32:
744 result = emit_float_cmp(&ctx->ac, LLVMRealOGE, src[0], src[1]);
745 break;
746 case nir_op_fabs:
747 result =
748 emit_intrin_1f_param(&ctx->ac, "llvm.fabs", ac_to_float_type(&ctx->ac, def_type), src[0]);
749 if (ctx->ac.float_mode == AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO) {
750 /* fabs will be optimized by the backend compiler with the sign
751 * bit removed via AND.
752 */
753 result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
754 }
755 break;
756 case nir_op_iabs:
757 result = emit_iabs(&ctx->ac, src[0]);
758 break;
759 case nir_op_imax:
760 result = ac_build_imax(&ctx->ac, src[0], src[1]);
761 break;
762 case nir_op_imin:
763 result = ac_build_imin(&ctx->ac, src[0], src[1]);
764 break;
765 case nir_op_umax:
766 result = ac_build_umax(&ctx->ac, src[0], src[1]);
767 break;
768 case nir_op_umin:
769 result = ac_build_umin(&ctx->ac, src[0], src[1]);
770 break;
771 case nir_op_isign:
772 result = ac_build_isign(&ctx->ac, src[0]);
773 break;
774 case nir_op_fsign:
775 src[0] = ac_to_float(&ctx->ac, src[0]);
776 result = ac_build_fsign(&ctx->ac, src[0]);
777 break;
778 case nir_op_ffloor:
779 result =
780 emit_intrin_1f_param(&ctx->ac, "llvm.floor", ac_to_float_type(&ctx->ac, def_type), src[0]);
781 break;
782 case nir_op_ftrunc:
783 result =
784 emit_intrin_1f_param(&ctx->ac, "llvm.trunc", ac_to_float_type(&ctx->ac, def_type), src[0]);
785 break;
786 case nir_op_fceil:
787 result =
788 emit_intrin_1f_param(&ctx->ac, "llvm.ceil", ac_to_float_type(&ctx->ac, def_type), src[0]);
789 break;
790 case nir_op_fround_even:
791 result =
792 emit_intrin_1f_param(&ctx->ac, "llvm.rint", ac_to_float_type(&ctx->ac, def_type), src[0]);
793 break;
794 case nir_op_ffract:
795 result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.fract",
796 ac_to_float_type(&ctx->ac, def_type), src[0]);
797 break;
798 case nir_op_fsin:
799 result =
800 emit_intrin_1f_param(&ctx->ac, "llvm.sin", ac_to_float_type(&ctx->ac, def_type), src[0]);
801 break;
802 case nir_op_fcos:
803 result =
804 emit_intrin_1f_param(&ctx->ac, "llvm.cos", ac_to_float_type(&ctx->ac, def_type), src[0]);
805 break;
806 case nir_op_fsqrt:
807 result =
808 emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", ac_to_float_type(&ctx->ac, def_type), src[0]);
809 break;
810 case nir_op_fexp2:
811 result =
812 emit_intrin_1f_param(&ctx->ac, "llvm.exp2", ac_to_float_type(&ctx->ac, def_type), src[0]);
813 break;
814 case nir_op_flog2:
815 result =
816 emit_intrin_1f_param(&ctx->ac, "llvm.log2", ac_to_float_type(&ctx->ac, def_type), src[0]);
817 break;
818 case nir_op_frsq:
819 result = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.rsq",
820 ac_to_float_type(&ctx->ac, def_type), src[0]);
821 if (ctx->abi->clamp_div_by_zero)
822 result = ac_build_fmin(&ctx->ac, result,
823 LLVMConstReal(ac_to_float_type(&ctx->ac, def_type), FLT_MAX));
824 break;
825 case nir_op_frexp_exp:
826 src[0] = ac_to_float(&ctx->ac, src[0]);
827 result = ac_build_frexp_exp(&ctx->ac, src[0], ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])));
828 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 16)
829 result = LLVMBuildSExt(ctx->ac.builder, result, ctx->ac.i32, "");
830 break;
831 case nir_op_frexp_sig:
832 src[0] = ac_to_float(&ctx->ac, src[0]);
833 result = ac_build_frexp_mant(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
834 break;
835 case nir_op_fpow:
836 result = emit_intrin_2f_param(&ctx->ac, "llvm.pow", ac_to_float_type(&ctx->ac, def_type),
837 src[0], src[1]);
838 break;
839 case nir_op_fmax:
840 result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum", ac_to_float_type(&ctx->ac, def_type),
841 src[0], src[1]);
842 if (ctx->ac.chip_class < GFX9 && instr->dest.dest.ssa.bit_size == 32) {
843 /* Only pre-GFX9 chips do not flush denorms. */
844 result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
845 }
846 break;
847 case nir_op_fmin:
848 result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum", ac_to_float_type(&ctx->ac, def_type),
849 src[0], src[1]);
850 if (ctx->ac.chip_class < GFX9 && instr->dest.dest.ssa.bit_size == 32) {
851 /* Only pre-GFX9 chips do not flush denorms. */
852 result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
853 }
854 break;
855 case nir_op_ffma:
856 /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
857 result =
858 emit_intrin_3f_param(&ctx->ac, ctx->ac.chip_class >= GFX10 ? "llvm.fma" : "llvm.fmuladd",
859 ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
860 break;
861 case nir_op_ldexp:
862 src[0] = ac_to_float(&ctx->ac, src[0]);
863 if (ac_get_elem_bits(&ctx->ac, def_type) == 32)
864 result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2,
865 AC_FUNC_ATTR_READNONE);
866 else if (ac_get_elem_bits(&ctx->ac, def_type) == 16)
867 result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2,
868 AC_FUNC_ATTR_READNONE);
869 else
870 result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2,
871 AC_FUNC_ATTR_READNONE);
872 break;
873 case nir_op_bfm:
874 result = emit_bfm(&ctx->ac, src[0], src[1]);
875 break;
876 case nir_op_bitfield_select:
877 result = emit_bitfield_select(&ctx->ac, src[0], src[1], src[2]);
878 break;
879 case nir_op_ubfe:
880 result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], false);
881 break;
882 case nir_op_ibfe:
883 result = ac_build_bfe(&ctx->ac, src[0], src[1], src[2], true);
884 break;
885 case nir_op_bitfield_reverse:
886 result = ac_build_bitfield_reverse(&ctx->ac, src[0]);
887 break;
888 case nir_op_bit_count:
889 result = ac_build_bit_count(&ctx->ac, src[0]);
890 break;
891 case nir_op_vec2:
892 case nir_op_vec3:
893 case nir_op_vec4:
894 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
895 src[i] = ac_to_integer(&ctx->ac, src[i]);
896 result = ac_build_gather_values(&ctx->ac, src, num_components);
897 break;
898 case nir_op_f2i8:
899 case nir_op_f2i16:
900 case nir_op_f2i32:
901 case nir_op_f2i64:
902 src[0] = ac_to_float(&ctx->ac, src[0]);
903 result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, "");
904 break;
905 case nir_op_f2u8:
906 case nir_op_f2u16:
907 case nir_op_f2u32:
908 case nir_op_f2u64:
909 src[0] = ac_to_float(&ctx->ac, src[0]);
910 result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, "");
911 break;
912 case nir_op_i2f16:
913 case nir_op_i2f32:
914 case nir_op_i2f64:
915 result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
916 break;
917 case nir_op_u2f16:
918 case nir_op_u2f32:
919 case nir_op_u2f64:
920 result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
921 break;
922 case nir_op_f2f16_rtz:
923 case nir_op_f2f16:
924 case nir_op_f2fmp:
925 src[0] = ac_to_float(&ctx->ac, src[0]);
926
927 /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it,
928 * all f32->f16 conversions have to round towards zero, because both scalar
929 * and vec2 down-conversions have to round equally.
930 */
931 if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL || instr->op == nir_op_f2f16_rtz) {
932 src[0] = ac_to_float(&ctx->ac, src[0]);
933
934 if (LLVMTypeOf(src[0]) == ctx->ac.f64)
935 src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
936
937 /* Fast path conversion. This only works if NIR is vectorized
938 * to vec2 16-bit.
939 */
940 if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
941 LLVMValueRef args[] = {
942 ac_llvm_extract_elem(&ctx->ac, src[0], 0),
943 ac_llvm_extract_elem(&ctx->ac, src[0], 1),
944 };
945 result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
946 break;
947 }
948
949 assert(ac_get_llvm_num_components(src[0]) == 1);
950 LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)};
951 result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
952 result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
953 } else {
954 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
955 result =
956 LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
957 else
958 result =
959 LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
960 }
961 break;
962 case nir_op_f2f16_rtne:
963 case nir_op_f2f32:
964 case nir_op_f2f64:
965 src[0] = ac_to_float(&ctx->ac, src[0]);
966 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
967 result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
968 else
969 result =
970 LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
971 break;
972 case nir_op_u2u8:
973 case nir_op_u2u16:
974 case nir_op_u2ump:
975 case nir_op_u2u32:
976 case nir_op_u2u64:
977 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
978 result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, "");
979 else
980 result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
981 break;
982 case nir_op_i2i8:
983 case nir_op_i2i16:
984 case nir_op_i2imp:
985 case nir_op_i2i32:
986 case nir_op_i2i64:
987 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
988 result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, "");
989 else
990 result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
991 break;
992 case nir_op_b32csel:
993 result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]);
994 break;
995 case nir_op_find_lsb:
996 result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]);
997 break;
998 case nir_op_ufind_msb:
999 result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32);
1000 break;
1001 case nir_op_ifind_msb:
1002 result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32);
1003 break;
1004 case nir_op_uadd_carry:
1005 result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]);
1006 break;
1007 case nir_op_usub_borrow:
1008 result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]);
1009 break;
1010 case nir_op_b2f16:
1011 case nir_op_b2f32:
1012 case nir_op_b2f64:
1013 result = emit_b2f(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
1014 break;
1015 case nir_op_f2b32:
1016 result = emit_f2b(&ctx->ac, src[0]);
1017 break;
1018 case nir_op_b2i8:
1019 case nir_op_b2i16:
1020 case nir_op_b2i32:
1021 case nir_op_b2i64:
1022 result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
1023 break;
1024 case nir_op_i2b32:
1025 result = emit_i2b(&ctx->ac, src[0]);
1026 break;
1027 case nir_op_fquantize2f16:
1028 result = emit_f2f16(&ctx->ac, src[0]);
1029 break;
1030 case nir_op_umul_high:
1031 result = emit_umul_high(&ctx->ac, src[0], src[1]);
1032 break;
1033 case nir_op_imul_high:
1034 result = emit_imul_high(&ctx->ac, src[0], src[1]);
1035 break;
1036 case nir_op_pack_half_2x16:
1037 result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pkrtz_f16);
1038 break;
1039 case nir_op_pack_snorm_2x16:
1040 result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_i16);
1041 break;
1042 case nir_op_pack_unorm_2x16:
1043 result = emit_pack_2x16(&ctx->ac, src[0], ac_build_cvt_pknorm_u16);
1044 break;
1045 case nir_op_unpack_half_2x16:
1046 result = emit_unpack_half_2x16(&ctx->ac, src[0]);
1047 break;
1048 case nir_op_fddx:
1049 case nir_op_fddy:
1050 case nir_op_fddx_fine:
1051 case nir_op_fddy_fine:
1052 case nir_op_fddx_coarse:
1053 case nir_op_fddy_coarse:
1054 result = emit_ddxy(ctx, instr->op, src[0]);
1055 break;
1056
1057 case nir_op_unpack_64_2x32_split_x: {
1058 assert(ac_get_llvm_num_components(src[0]) == 1);
1059 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i32, "");
1060 result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_0, "");
1061 break;
1062 }
1063
1064 case nir_op_unpack_64_2x32_split_y: {
1065 assert(ac_get_llvm_num_components(src[0]) == 1);
1066 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i32, "");
1067 result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, "");
1068 break;
1069 }
1070
1071 case nir_op_pack_64_2x32_split: {
1072 LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2);
1073 result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, "");
1074 break;
1075 }
1076
1077 case nir_op_pack_32_2x16_split: {
1078 LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2);
1079 result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, "");
1080 break;
1081 }
1082
1083 case nir_op_unpack_32_2x16_split_x: {
1084 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, "");
1085 result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_0, "");
1086 break;
1087 }
1088
1089 case nir_op_unpack_32_2x16_split_y: {
1090 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, "");
1091 result = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, "");
1092 break;
1093 }
1094
1095 case nir_op_cube_face_coord: {
1096 src[0] = ac_to_float(&ctx->ac, src[0]);
1097 LLVMValueRef results[2];
1098 LLVMValueRef in[3];
1099 for (unsigned chan = 0; chan < 3; chan++)
1100 in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
1101 results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc", ctx->ac.f32, in, 3,
1102 AC_FUNC_ATTR_READNONE);
1103 results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc", ctx->ac.f32, in, 3,
1104 AC_FUNC_ATTR_READNONE);
1105 LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubema", ctx->ac.f32, in, 3,
1106 AC_FUNC_ATTR_READNONE);
1107 results[0] = ac_build_fdiv(&ctx->ac, results[0], ma);
1108 results[1] = ac_build_fdiv(&ctx->ac, results[1], ma);
1109 LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5);
1110 results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, "");
1111 results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, "");
1112 result = ac_build_gather_values(&ctx->ac, results, 2);
1113 break;
1114 }
1115
1116 case nir_op_cube_face_index: {
1117 src[0] = ac_to_float(&ctx->ac, src[0]);
1118 LLVMValueRef in[3];
1119 for (unsigned chan = 0; chan < 3; chan++)
1120 in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
1121 result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubeid", ctx->ac.f32, in, 3,
1122 AC_FUNC_ATTR_READNONE);
1123 break;
1124 }
1125
1126 default:
1127 fprintf(stderr, "Unknown NIR alu instr: ");
1128 nir_print_instr(&instr->instr, stderr);
1129 fprintf(stderr, "\n");
1130 abort();
1131 }
1132
1133 if (result) {
1134 assert(instr->dest.dest.is_ssa);
1135 result = ac_to_integer_or_pointer(&ctx->ac, result);
1136 ctx->ssa_defs[instr->dest.dest.ssa.index] = result;
1137 }
1138 }
1139
1140 static void visit_load_const(struct ac_nir_context *ctx, const nir_load_const_instr *instr)
1141 {
1142 LLVMValueRef values[4], value = NULL;
1143 LLVMTypeRef element_type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
1144
1145 for (unsigned i = 0; i < instr->def.num_components; ++i) {
1146 switch (instr->def.bit_size) {
1147 case 8:
1148 values[i] = LLVMConstInt(element_type, instr->value[i].u8, false);
1149 break;
1150 case 16:
1151 values[i] = LLVMConstInt(element_type, instr->value[i].u16, false);
1152 break;
1153 case 32:
1154 values[i] = LLVMConstInt(element_type, instr->value[i].u32, false);
1155 break;
1156 case 64:
1157 values[i] = LLVMConstInt(element_type, instr->value[i].u64, false);
1158 break;
1159 default:
1160 fprintf(stderr, "unsupported nir load_const bit_size: %d\n", instr->def.bit_size);
1161 abort();
1162 }
1163 }
1164 if (instr->def.num_components > 1) {
1165 value = LLVMConstVector(values, instr->def.num_components);
1166 } else
1167 value = values[0];
1168
1169 ctx->ssa_defs[instr->def.index] = value;
1170 }
1171
1172 static LLVMValueRef get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor,
1173 bool in_elements)
1174 {
1175 LLVMValueRef size =
1176 LLVMBuildExtractElement(ctx->ac.builder, descriptor, LLVMConstInt(ctx->ac.i32, 2, false), "");
1177
1178 /* GFX8 only */
1179 if (ctx->ac.chip_class == GFX8 && in_elements) {
1180 /* On GFX8, the descriptor contains the size in bytes,
1181 * but TXQ must return the size in elements.
1182 * The stride is always non-zero for resources using TXQ.
1183 */
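/* For example, a buffer of 100 elements with a 16-byte stride is described
 * with size = 1600 bytes, so the division below reports 100 elements.
 */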
1184 LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, descriptor, ctx->ac.i32_1, "");
1185 stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, false), "");
1186 stride = LLVMBuildAnd(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 0x3fff, false), "");
1187
1188 size = LLVMBuildUDiv(ctx->ac.builder, size, stride, "");
1189 }
1190 return size;
1191 }
1192
1193 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
1194 * incorrectly forces nearest filtering if the texture format is integer.
1195 * The only effect it has on Gather4, which always returns 4 texels for
1196 * bilinear filtering, is that the final coordinates are off by 0.5 of
1197 * the texel size.
1198 *
1199 * The workaround is to subtract 0.5 from the unnormalized coordinates,
1200 * or (0.5 / size) from the normalized coordinates.
1201 *
1202 * However, cube textures with 8_8_8_8 data formats require a different
1203 * workaround of overriding the num format to USCALED/SSCALED. This would lose
1204 * precision in 32-bit data formats, so it needs to be applied dynamically at
1205 * runtime. In this case, return an i1 value that indicates whether the
1206 * descriptor was overridden (and hence a fixup of the sampler result is needed).
1207 */
1208 static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx, nir_variable *var,
1209 struct ac_image_args *args, const nir_tex_instr *instr)
1210 {
1211 const struct glsl_type *type = glsl_without_array(var->type);
1212 enum glsl_base_type stype = glsl_get_sampler_result_type(type);
1213 LLVMValueRef wa_8888 = NULL;
1214 LLVMValueRef half_texel[2];
1215 LLVMValueRef result;
1216
1217 assert(stype == GLSL_TYPE_INT || stype == GLSL_TYPE_UINT);
1218
1219 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1220 LLVMValueRef formats;
1221 LLVMValueRef data_format;
1222 LLVMValueRef wa_formats;
1223
1224 formats = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, "");
1225
1226 data_format = LLVMBuildLShr(ctx->builder, formats, LLVMConstInt(ctx->i32, 20, false), "");
1227 data_format =
1228 LLVMBuildAnd(ctx->builder, data_format, LLVMConstInt(ctx->i32, (1u << 6) - 1, false), "");
1229 wa_8888 = LLVMBuildICmp(ctx->builder, LLVMIntEQ, data_format,
1230 LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), "");
1231
1232 uint32_t wa_num_format = stype == GLSL_TYPE_UINT
1233 ? S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_USCALED)
1234 : S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_SSCALED);
1235 wa_formats = LLVMBuildAnd(ctx->builder, formats,
1236 LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), "");
1237 wa_formats =
1238 LLVMBuildOr(ctx->builder, wa_formats, LLVMConstInt(ctx->i32, wa_num_format, false), "");
1239
1240 formats = LLVMBuildSelect(ctx->builder, wa_8888, wa_formats, formats, "");
1241 args->resource =
1242 LLVMBuildInsertElement(ctx->builder, args->resource, formats, ctx->i32_1, "");
1243 }
1244
1245 if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
1246 assert(!wa_8888);
1247 half_texel[0] = half_texel[1] = LLVMConstReal(ctx->f32, -0.5);
1248 } else {
1249 struct ac_image_args resinfo = {};
1250 LLVMBasicBlockRef bbs[2];
1251
1252 LLVMValueRef unnorm = NULL;
1253 LLVMValueRef default_offset = ctx->f32_0;
1254 if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
1255 /* In Vulkan, whether the sampler uses unnormalized
1256 * coordinates or not is a dynamic property of the
1257 * sampler. Hence, to figure out whether or not we
1258 * need to divide by the texture size, we need to test
1259 * the sampler at runtime. This tests the bit set by
1260 * radv_init_sampler().
1261 */
1262 LLVMValueRef sampler0 =
1263 LLVMBuildExtractElement(ctx->builder, args->sampler, ctx->i32_0, "");
1264 sampler0 = LLVMBuildLShr(ctx->builder, sampler0, LLVMConstInt(ctx->i32, 15, false), "");
1265 sampler0 = LLVMBuildAnd(ctx->builder, sampler0, ctx->i32_1, "");
1266 unnorm = LLVMBuildICmp(ctx->builder, LLVMIntEQ, sampler0, ctx->i32_1, "");
1267 default_offset = LLVMConstReal(ctx->f32, -0.5);
1268 }
1269
1270 bbs[0] = LLVMGetInsertBlock(ctx->builder);
1271 if (wa_8888 || unnorm) {
1272 assert(!(wa_8888 && unnorm));
1273 LLVMValueRef not_needed = wa_8888 ? wa_8888 : unnorm;
1274 /* Skip the texture size query entirely if we don't need it. */
1275 ac_build_ifcc(ctx, LLVMBuildNot(ctx->builder, not_needed, ""), 2000);
1276 bbs[1] = LLVMGetInsertBlock(ctx->builder);
1277 }
1278
1279 /* Query the texture size. */
1280 resinfo.dim = ac_get_sampler_dim(ctx->chip_class, instr->sampler_dim, instr->is_array);
1281 resinfo.opcode = ac_image_get_resinfo;
1282 resinfo.dmask = 0xf;
1283 resinfo.lod = ctx->i32_0;
1284 resinfo.resource = args->resource;
1285 resinfo.attributes = AC_FUNC_ATTR_READNONE;
1286 LLVMValueRef size = ac_build_image_opcode(ctx, &resinfo);
1287
1288 /* Compute -0.5 / size. */
1289 for (unsigned c = 0; c < 2; c++) {
1290 half_texel[c] =
1291 LLVMBuildExtractElement(ctx->builder, size, LLVMConstInt(ctx->i32, c, 0), "");
1292 half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, "");
1293 half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]);
1294 half_texel[c] =
1295 LLVMBuildFMul(ctx->builder, half_texel[c], LLVMConstReal(ctx->f32, -0.5), "");
1296 }
1297
1298 if (wa_8888 || unnorm) {
1299 ac_build_endif(ctx, 2000);
1300
1301 for (unsigned c = 0; c < 2; c++) {
1302 LLVMValueRef values[2] = {default_offset, half_texel[c]};
1303 half_texel[c] = ac_build_phi(ctx, ctx->f32, 2, values, bbs);
1304 }
1305 }
1306 }
1307
1308 for (unsigned c = 0; c < 2; c++) {
1309 LLVMValueRef tmp;
1310 tmp = LLVMBuildBitCast(ctx->builder, args->coords[c], ctx->f32, "");
1311 args->coords[c] = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], "");
1312 }
1313
1314 args->attributes = AC_FUNC_ATTR_READNONE;
1315 result = ac_build_image_opcode(ctx, args);
1316
1317 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1318 LLVMValueRef tmp, tmp2;
1319
1320 /* if the cube workaround is in place, f2i the result. */
1321 for (unsigned c = 0; c < 4; c++) {
1322 tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), "");
1323 if (stype == GLSL_TYPE_UINT)
1324 tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, "");
1325 else
1326 tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, "");
1327 tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
1328 tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, "");
1329 tmp = LLVMBuildSelect(ctx->builder, wa_8888, tmp2, tmp, "");
1330 tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
1331 result =
1332 LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), "");
1333 }
1334 }
1335 return result;
1336 }
1337
1338 static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr)
1339 {
1340 nir_deref_instr *texture_deref_instr = NULL;
1341
1342 for (unsigned i = 0; i < instr->num_srcs; i++) {
1343 switch (instr->src[i].src_type) {
1344 case nir_tex_src_texture_deref:
1345 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
1346 break;
1347 default:
1348 break;
1349 }
1350 }
1351 return texture_deref_instr;
1352 }
1353
1354 static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, const nir_tex_instr *instr,
1355 struct ac_image_args *args)
1356 {
1357 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
1358 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
1359
1360 assert(instr->dest.is_ssa);
1361 return ac_build_buffer_load_format(&ctx->ac, args->resource, args->coords[0], ctx->ac.i32_0,
1362 util_last_bit(mask), 0, true,
1363 instr->dest.ssa.bit_size == 16);
1364 }
1365
1366 args->opcode = ac_image_sample;
1367
1368 switch (instr->op) {
1369 case nir_texop_txf:
1370 case nir_texop_txf_ms:
1371 case nir_texop_samples_identical:
1372 args->opcode = args->level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS
1373 ? ac_image_load
1374 : ac_image_load_mip;
1375 args->level_zero = false;
1376 break;
1377 case nir_texop_txs:
1378 case nir_texop_query_levels:
1379 args->opcode = ac_image_get_resinfo;
1380 if (!args->lod)
1381 args->lod = ctx->ac.i32_0;
1382 args->level_zero = false;
1383 break;
1384 case nir_texop_tex:
1385 if (ctx->stage != MESA_SHADER_FRAGMENT) {
1386 assert(!args->lod);
1387 args->level_zero = true;
1388 }
1389 break;
1390 case nir_texop_tg4:
1391 args->opcode = ac_image_gather4;
1392 if (!args->lod && !args->bias)
1393 args->level_zero = true;
1394 break;
1395 case nir_texop_lod:
1396 args->opcode = ac_image_get_lod;
1397 break;
1398 case nir_texop_fragment_fetch:
1399 case nir_texop_fragment_mask_fetch:
1400 args->opcode = ac_image_load;
1401 args->level_zero = false;
1402 break;
1403 default:
1404 break;
1405 }
1406
1407 if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= GFX8) {
1408 nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr);
1409 nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr);
1410 const struct glsl_type *type = glsl_without_array(var->type);
1411 enum glsl_base_type stype = glsl_get_sampler_result_type(type);
1412 if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
1413 return lower_gather4_integer(&ctx->ac, var, args, instr);
1414 }
1415 }
1416
1417 /* Fixup for GFX9 which allocates 1D textures as 2D. */
1418 if (instr->op == nir_texop_lod && ctx->ac.chip_class == GFX9) {
1419 if ((args->dim == ac_image_2darray || args->dim == ac_image_2d) && !args->coords[1]) {
1420 args->coords[1] = ctx->ac.i32_0;
1421 }
1422 }
1423
1424 args->attributes = AC_FUNC_ATTR_READNONE;
1425 bool cs_derivs =
1426 ctx->stage == MESA_SHADER_COMPUTE && ctx->info->cs.derivative_group != DERIVATIVE_GROUP_NONE;
1427 if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) {
1428 /* Prevent texture instructions with implicit derivatives from being
1429 * sunk into branches. */
1430 switch (instr->op) {
1431 case nir_texop_tex:
1432 case nir_texop_txb:
1433 case nir_texop_lod:
1434 args->attributes |= AC_FUNC_ATTR_CONVERGENT;
1435 break;
1436 default:
1437 break;
1438 }
1439 }
1440
1441 return ac_build_image_opcode(&ctx->ac, args);
1442 }
1443
1444 static LLVMValueRef visit_vulkan_resource_reindex(struct ac_nir_context *ctx,
1445 nir_intrinsic_instr *instr)
1446 {
1447 LLVMValueRef ptr = get_src(ctx, instr->src[0]);
1448 LLVMValueRef index = get_src(ctx, instr->src[1]);
1449
1450 LLVMValueRef result = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
1451 LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
1452 return result;
1453 }
1454
1455 static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1456 {
1457 LLVMValueRef ptr, addr;
1458 LLVMValueRef src0 = get_src(ctx, instr->src[0]);
1459 unsigned index = nir_intrinsic_base(instr);
1460
1461 addr = LLVMConstInt(ctx->ac.i32, index, 0);
1462 addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, "");
1463
1464 /* Load constant values from user SGPRs when possible, otherwise
1465 * fall back to the default path that loads directly from memory.
1466 */
1467 if (LLVMIsConstant(src0) && instr->dest.ssa.bit_size == 32) {
1468 unsigned count = instr->dest.ssa.num_components;
1469 unsigned offset = index;
1470
1471 offset += LLVMConstIntGetZExtValue(src0);
1472 offset /= 4;
1473
1474 offset -= ctx->args->base_inline_push_consts;
1475
1476 unsigned num_inline_push_consts = ctx->args->num_inline_push_consts;
1477 if (offset + count <= num_inline_push_consts) {
1478 LLVMValueRef push_constants[num_inline_push_consts];
1479 for (unsigned i = 0; i < num_inline_push_consts; i++)
1480 push_constants[i] = ac_get_arg(&ctx->ac, ctx->args->inline_push_consts[i]);
1481 return ac_build_gather_values(&ctx->ac, push_constants + offset, count);
1482 }
1483 }
1484
1485 ptr =
1486 LLVMBuildGEP(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->push_constants), &addr, 1, "");
1487
1488 if (instr->dest.ssa.bit_size == 8) {
1489 unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 1;
1490 LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i8, 4 * load_dwords);
1491 ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
1492 LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
1493
1494 LLVMValueRef params[3];
1495 if (load_dwords > 1) {
1496 LLVMValueRef res_vec = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.v2i32, "");
1497 params[0] = LLVMBuildExtractElement(ctx->ac.builder, res_vec,
1498 LLVMConstInt(ctx->ac.i32, 1, false), "");
1499 params[1] = LLVMBuildExtractElement(ctx->ac.builder, res_vec,
1500 LLVMConstInt(ctx->ac.i32, 0, false), "");
1501 } else {
1502 res = LLVMBuildBitCast(ctx->ac.builder, res, ctx->ac.i32, "");
1503 params[0] = ctx->ac.i32_0;
1504 params[1] = res;
1505 }
1506 params[2] = addr;
1507 res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", ctx->ac.i32, params, 3, 0);
1508
1509 res = LLVMBuildTrunc(
1510 ctx->ac.builder, res,
1511 LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
1512 if (instr->dest.ssa.num_components > 1)
1513 res = LLVMBuildBitCast(ctx->ac.builder, res,
1514 LLVMVectorType(ctx->ac.i8, instr->dest.ssa.num_components), "");
1515 return res;
1516 } else if (instr->dest.ssa.bit_size == 16) {
1517 unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
1518 LLVMTypeRef vec_type = LLVMVectorType(ctx->ac.i16, 2 * load_dwords);
1519 ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
1520 LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
1521 res = LLVMBuildBitCast(ctx->ac.builder, res, vec_type, "");
1522 LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, "");
1523 cond = LLVMBuildTrunc(ctx->ac.builder, cond, ctx->ac.i1, "");
1524 LLVMValueRef mask[] = {
1525 LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
1526 LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
1527 LLVMConstInt(ctx->ac.i32, 4, false)};
1528 LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components);
1529 LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components);
1530 LLVMValueRef shuffle_aligned =
1531 LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, "");
1532 LLVMValueRef shuffle_unaligned =
1533 LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, "");
1534 res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, "");
1535 return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), "");
1536 }
1537
1538 ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa));
1539
1540 return LLVMBuildLoad(ctx->ac.builder, ptr, "");
1541 }
1542
1543 static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx,
1544 const nir_intrinsic_instr *instr)
1545 {
1546 LLVMValueRef index = get_src(ctx, instr->src[0]);
1547
1548 return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false);
1549 }
1550
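1550 /* Expand each set bit of a writemask into `multiplier` consecutive bits,
1550  * e.g. widen_mask(0b101, 2) == 0b110011. Used when a value is reinterpreted
1550  * as `multiplier` times as many narrower channels, e.g. 64-bit -> 2x 32-bit.
1550  */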
1551 static uint32_t widen_mask(uint32_t mask, unsigned multiplier)
1552 {
1553 uint32_t new_mask = 0;
1554 for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
1555 if (mask & (1u << i))
1556 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
1557 return new_mask;
1558 }
1559
1560 static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src,
1561 unsigned start, unsigned count)
1562 {
1563 LLVMValueRef mask[] = {ctx->i32_0, ctx->i32_1, LLVMConstInt(ctx->i32, 2, false),
1564 LLVMConstInt(ctx->i32, 3, false)};
1565
1566 unsigned src_elements = ac_get_llvm_num_components(src);
1567
1568 if (count == src_elements) {
1569 assert(start == 0);
1570 return src;
1571 } else if (count == 1) {
1572 assert(start < src_elements);
1573 return LLVMBuildExtractElement(ctx->builder, src, mask[start], "");
1574 } else {
1575 assert(start + count <= src_elements);
1576 assert(count <= 4);
1577 LLVMValueRef swizzle = LLVMConstVector(&mask[start], count);
1578 return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, "");
1579 }
1580 }
1581
1582 static unsigned get_cache_policy(struct ac_nir_context *ctx, enum gl_access_qualifier access,
1583 bool may_store_unaligned, bool writeonly_memory)
1584 {
1585 unsigned cache_policy = 0;
1586
1587 /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All
1588 * store opcodes not aligned to a dword are affected. The only way to
1589 * get unaligned stores is through shader images.
1590 */
1591 if (((may_store_unaligned && ctx->ac.chip_class == GFX6) ||
1592 /* If this is write-only, don't keep data in L1 to prevent
1593 * evicting L1 cache lines that may be needed by other
1594 * instructions.
1595 */
1596 writeonly_memory || access & (ACCESS_COHERENT | ACCESS_VOLATILE))) {
1597 cache_policy |= ac_glc;
1598 }
1599
1600 if (access & ACCESS_STREAM_CACHE_POLICY)
1601 cache_policy |= ac_slc | ac_glc;
1602
1603 return cache_policy;
1604 }
1605
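1605 /* If the SSBO index is marked ACCESS_NON_UNIFORM, wrap it in a waterfall loop
1605  * so the descriptor is always fetched with a uniform index per iteration.
1605  */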
1606 static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, struct waterfall_context *wctx,
1607 const nir_intrinsic_instr *instr, nir_src src)
1608 {
1609 return enter_waterfall(ctx, wctx, get_src(ctx, src),
1610 nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
1611 }
1612
1613 static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1614 {
1615 if (ctx->ac.postponed_kill) {
1616 LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
1617 ac_build_ifcc(&ctx->ac, cond, 7000);
1618 }
1619
1620 LLVMValueRef src_data = get_src(ctx, instr->src[0]);
1621 int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8;
1622 unsigned writemask = nir_intrinsic_write_mask(instr);
1623 enum gl_access_qualifier access = nir_intrinsic_access(instr);
1624 bool writeonly_memory = access & ACCESS_NON_READABLE;
1625 unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
1626
1627 struct waterfall_context wctx;
1628 LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]);
1629
1630 LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true);
1631 LLVMValueRef base_data = src_data;
1632 base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
1633 LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
1634
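1634    /* Emit one store per contiguous writemask range, further splitting ranges
1634     * that exceed 4 dwords or that the hardware can't store in one go
1634     * (unaligned 16-bit stores, sub-dword stores on GFX6).
1634     */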
1635 while (writemask) {
1636 int start, count;
1637 LLVMValueRef data, offset;
1638 LLVMTypeRef data_type;
1639
1640 u_bit_scan_consecutive_range(&writemask, &start, &count);
1641
1642       /* Due to a limitation in LLVM < 9, split 3-element
1643        * writes into a 2-element and a 1-element write. */
1644 if (count == 3 && (elem_size_bytes != 4 || !ac_has_vec3_support(ctx->ac.chip_class, false))) {
1645 writemask |= 1 << (start + 2);
1646 count = 2;
1647 }
1648 int num_bytes = count * elem_size_bytes; /* count in bytes */
1649
1650       /* We can only store 4 dwords at a time; larger writes
1651        * can only happen for 64-bit vectors. */
1652 if (num_bytes > 16) {
1653 writemask |= ((1u << (count - 2)) - 1u) << (start + 2);
1654 count = 2;
1655 num_bytes = 16;
1656 }
1657
1658       /* Check the alignment of 16-bit stores. */
1659 if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) {
1660 writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
1661 count = 1;
1662 num_bytes = 2;
1663 }
1664
1665 /* Due to alignment issues, split stores of 8-bit/16-bit
1666 * vectors.
1667 */
1668 if (ctx->ac.chip_class == GFX6 && count > 1 && elem_size_bytes < 4) {
1669 writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
1670 count = 1;
1671 num_bytes = elem_size_bytes;
1672 }
1673
1674 data = extract_vector_range(&ctx->ac, base_data, start, count);
1675
1676 offset = LLVMBuildAdd(ctx->ac.builder, base_offset,
1677 LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), "");
1678
1679 if (num_bytes == 1) {
1680 ac_build_tbuffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
1681 } else if (num_bytes == 2) {
1682 ac_build_tbuffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
1683 } else {
1684 int num_channels = num_bytes / 4;
1685
1686 switch (num_bytes) {
1687 case 16: /* v4f32 */
1688 data_type = ctx->ac.v4f32;
1689 break;
1690 case 12: /* v3f32 */
1691 data_type = ctx->ac.v3f32;
1692 break;
1693 case 8: /* v2f32 */
1694 data_type = ctx->ac.v2f32;
1695 break;
1696 case 4: /* f32 */
1697 data_type = ctx->ac.f32;
1698 break;
1699 default:
1700 unreachable("Malformed vector store.");
1701 }
1702 data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, "");
1703
1704 ac_build_buffer_store_dword(&ctx->ac, rsrc, data, num_channels, offset, ctx->ac.i32_0, 0,
1705 cache_policy);
1706 }
1707 }
1708
1709 exit_waterfall(ctx, &wctx, NULL);
1710
1711 if (ctx->ac.postponed_kill)
1712 ac_build_endif(&ctx->ac, 7000);
1713 }
1714
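1714 /* Emulate a 64-bit SSBO compare-and-swap with a global-memory cmpxchg: the
1714  * 64-bit address is rebuilt from the descriptor base (dwords 0-1, upper
1714  * address bits sign-extended) plus the byte offset. With robust buffer
1714  * access the atomic is predicated on offset < num_records and 0 is returned
1714  * when the access is out of bounds.
1714  */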
1715 static LLVMValueRef emit_ssbo_comp_swap_64(struct ac_nir_context *ctx, LLVMValueRef descriptor,
1716 LLVMValueRef offset, LLVMValueRef compare,
1717 LLVMValueRef exchange)
1718 {
1719 LLVMBasicBlockRef start_block = NULL, then_block = NULL;
1720 if (ctx->abi->robust_buffer_access) {
1721 LLVMValueRef size = ac_llvm_extract_elem(&ctx->ac, descriptor, 2);
1722
1723 LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, "");
1724 start_block = LLVMGetInsertBlock(ctx->ac.builder);
1725
1726 ac_build_ifcc(&ctx->ac, cond, -1);
1727
1728 then_block = LLVMGetInsertBlock(ctx->ac.builder);
1729 }
1730
1731 LLVMValueRef ptr_parts[2] = {
1732 ac_llvm_extract_elem(&ctx->ac, descriptor, 0),
1733 LLVMBuildAnd(ctx->ac.builder, ac_llvm_extract_elem(&ctx->ac, descriptor, 1),
1734 LLVMConstInt(ctx->ac.i32, 65535, 0), "")};
1735
1736 ptr_parts[1] = LLVMBuildTrunc(ctx->ac.builder, ptr_parts[1], ctx->ac.i16, "");
1737 ptr_parts[1] = LLVMBuildSExt(ctx->ac.builder, ptr_parts[1], ctx->ac.i32, "");
1738
1739 offset = LLVMBuildZExt(ctx->ac.builder, offset, ctx->ac.i64, "");
1740
1741 LLVMValueRef ptr = ac_build_gather_values(&ctx->ac, ptr_parts, 2);
1742 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, "");
1743 ptr = LLVMBuildAdd(ctx->ac.builder, ptr, offset, "");
1744 ptr = LLVMBuildIntToPtr(ctx->ac.builder, ptr, LLVMPointerType(ctx->ac.i64, AC_ADDR_SPACE_GLOBAL),
1745 "");
1746
1747 LLVMValueRef result =
1748 ac_build_atomic_cmp_xchg(&ctx->ac, ptr, compare, exchange, "singlethread-one-as");
1749 result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, "");
1750
1751 if (ctx->abi->robust_buffer_access) {
1752 ac_build_endif(&ctx->ac, -1);
1753
1754 LLVMBasicBlockRef incoming_blocks[2] = {
1755 start_block,
1756 then_block,
1757 };
1758
1759 LLVMValueRef incoming_values[2] = {
1760 LLVMConstInt(ctx->ac.i64, 0, 0),
1761 result,
1762 };
1763 LLVMValueRef ret = LLVMBuildPhi(ctx->ac.builder, ctx->ac.i64, "");
1764 LLVMAddIncoming(ret, incoming_values, incoming_blocks, 2);
1765 return ret;
1766 } else {
1767 return result;
1768 }
1769 }
1770
1771 static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1772 {
1773 if (ctx->ac.postponed_kill) {
1774 LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
1775 ac_build_ifcc(&ctx->ac, cond, 7001);
1776 }
1777
1778 LLVMTypeRef return_type = LLVMTypeOf(get_src(ctx, instr->src[2]));
1779 const char *op;
1780 char name[64], type[8];
1781 LLVMValueRef params[6], descriptor;
1782 LLVMValueRef result;
1783 int arg_count = 0;
1784
1785 struct waterfall_context wctx;
1786 LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
1787
1788 switch (instr->intrinsic) {
1789 case nir_intrinsic_ssbo_atomic_add:
1790 op = "add";
1791 break;
1792 case nir_intrinsic_ssbo_atomic_imin:
1793 op = "smin";
1794 break;
1795 case nir_intrinsic_ssbo_atomic_umin:
1796 op = "umin";
1797 break;
1798 case nir_intrinsic_ssbo_atomic_imax:
1799 op = "smax";
1800 break;
1801 case nir_intrinsic_ssbo_atomic_umax:
1802 op = "umax";
1803 break;
1804 case nir_intrinsic_ssbo_atomic_and:
1805 op = "and";
1806 break;
1807 case nir_intrinsic_ssbo_atomic_or:
1808 op = "or";
1809 break;
1810 case nir_intrinsic_ssbo_atomic_xor:
1811 op = "xor";
1812 break;
1813 case nir_intrinsic_ssbo_atomic_exchange:
1814 op = "swap";
1815 break;
1816 case nir_intrinsic_ssbo_atomic_comp_swap:
1817 op = "cmpswap";
1818 break;
1819 default:
1820 abort();
1821 }
1822
1823 descriptor = ctx->abi->load_ssbo(ctx->abi, rsrc_base, true);
1824
1825 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap && return_type == ctx->ac.i64) {
1826 result = emit_ssbo_comp_swap_64(ctx, descriptor, get_src(ctx, instr->src[1]),
1827 get_src(ctx, instr->src[2]), get_src(ctx, instr->src[3]));
1828 } else {
1829 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
1830 params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
1831 }
1832 params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
1833 params[arg_count++] = descriptor;
1834
1835 if (LLVM_VERSION_MAJOR >= 9) {
1836 /* XXX: The new raw/struct atomic intrinsics are buggy with
1837 * LLVM 8, see r358579.
1838 */
1839 params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
1840 params[arg_count++] = ctx->ac.i32_0; /* soffset */
1841 params[arg_count++] = ctx->ac.i32_0; /* slc */
1842
1843 ac_build_type_name_for_intr(return_type, type, sizeof(type));
1844 snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
1845 } else {
1846 params[arg_count++] = ctx->ac.i32_0; /* vindex */
1847 params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
1848 params[arg_count++] = ctx->ac.i1false; /* slc */
1849
1850 assert(return_type == ctx->ac.i32);
1851 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.atomic.%s", op);
1852 }
1853
1854 result = ac_build_intrinsic(&ctx->ac, name, return_type, params, arg_count, 0);
1855 }
1856
1857 result = exit_waterfall(ctx, &wctx, result);
1858 if (ctx->ac.postponed_kill)
1859 ac_build_endif(&ctx->ac, 7001);
1860 return result;
1861 }
1862
1863 static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1864 {
1865 struct waterfall_context wctx;
1866 LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[0]);
1867
1868 int elem_size_bytes = instr->dest.ssa.bit_size / 8;
1869 int num_components = instr->num_components;
1870 enum gl_access_qualifier access = nir_intrinsic_access(instr);
1871 unsigned cache_policy = get_cache_policy(ctx, access, false, false);
1872
1873 LLVMValueRef offset = get_src(ctx, instr->src[1]);
1874 LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi, rsrc_base, false);
1875 LLVMValueRef vindex = ctx->ac.i32_0;
1876
1877 LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa);
1878 LLVMTypeRef def_elem_type = num_components > 1 ? LLVMGetElementType(def_type) : def_type;
1879
1880 LLVMValueRef results[4];
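1881    /* Load in chunks of at most 16 bytes (4 dwords); sub-dword components
1881     * that are not dword-aligned fall back to single byte/short loads.
1881     */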
1881 for (int i = 0; i < num_components;) {
1882 int num_elems = num_components - i;
1883 if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0)
1884 num_elems = 1;
1885 if (num_elems * elem_size_bytes > 16)
1886 num_elems = 16 / elem_size_bytes;
1887 int load_bytes = num_elems * elem_size_bytes;
1888
1889 LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * elem_size_bytes, false);
1890
1891 LLVMValueRef ret;
1892
1893 if (load_bytes == 1) {
1894 ret = ac_build_tbuffer_load_byte(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset,
1895 cache_policy);
1896 } else if (load_bytes == 2) {
1897 ret = ac_build_tbuffer_load_short(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset,
1898 cache_policy);
1899 } else {
1900 int num_channels = util_next_power_of_two(load_bytes) / 4;
1901 bool can_speculate = access & ACCESS_CAN_REORDER;
1902
1903 ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, vindex, offset, immoffset, 0,
1904 cache_policy, can_speculate, false);
1905 }
1906
1907 LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret)));
1908 ret = LLVMBuildBitCast(ctx->ac.builder, ret, byte_vec, "");
1909 ret = ac_trim_vector(&ctx->ac, ret, load_bytes);
1910
1911 LLVMTypeRef ret_type = LLVMVectorType(def_elem_type, num_elems);
1912 ret = LLVMBuildBitCast(ctx->ac.builder, ret, ret_type, "");
1913
1914 for (unsigned j = 0; j < num_elems; j++) {
1915 results[i + j] =
1916 LLVMBuildExtractElement(ctx->ac.builder, ret, LLVMConstInt(ctx->ac.i32, j, false), "");
1917 }
1918 i += num_elems;
1919 }
1920
1921 LLVMValueRef ret = ac_build_gather_values(&ctx->ac, results, num_components);
1922 return exit_waterfall(ctx, &wctx, ret);
1923 }
1924
1925 static LLVMValueRef enter_waterfall_ubo(struct ac_nir_context *ctx, struct waterfall_context *wctx,
1926 const nir_intrinsic_instr *instr)
1927 {
1928 return enter_waterfall(ctx, wctx, get_src(ctx, instr->src[0]),
1929 nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
1930 }
1931
1932 static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
1933 {
1934 struct waterfall_context wctx;
1935 LLVMValueRef rsrc_base = enter_waterfall_ubo(ctx, &wctx, instr);
1936
1937 LLVMValueRef ret;
1938 LLVMValueRef rsrc = rsrc_base;
1939 LLVMValueRef offset = get_src(ctx, instr->src[1]);
1940 int num_components = instr->num_components;
1941
1942 if (ctx->abi->load_ubo)
1943 rsrc = ctx->abi->load_ubo(ctx->abi, rsrc);
1944
1945 if (instr->dest.ssa.bit_size == 64)
1946 num_components *= 2;
1947
1948 if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) {
1949 unsigned load_bytes = instr->dest.ssa.bit_size / 8;
1950 LLVMValueRef results[num_components];
1951 for (unsigned i = 0; i < num_components; ++i) {
1952 LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, load_bytes * i, 0);
1953
1954 if (load_bytes == 1) {
1955 results[i] =
1956 ac_build_tbuffer_load_byte(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset, 0);
1957 } else {
1958 assert(load_bytes == 2);
1959 results[i] =
1960 ac_build_tbuffer_load_short(&ctx->ac, rsrc, offset, ctx->ac.i32_0, immoffset, 0);
1961 }
1962 }
1963 ret = ac_build_gather_values(&ctx->ac, results, num_components);
1964 } else {
1965 ret =
1966 ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset, NULL, 0, 0, true, true);
1967
1968 ret = ac_trim_vector(&ctx->ac, ret, num_components);
1969 }
1970
1971 ret = LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
1972
1973 return exit_waterfall(ctx, &wctx, ret);
1974 }
1975
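1975 /* Walk a deref chain and split the access into a constant offset plus an
1975  * optional dynamic offset, both counted in attribute slots. For per-vertex
1975  * IO the leading array index is returned separately through vertex_index_out
1975  * or vertex_index_ref.
1975  */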
1976 static void get_deref_offset(struct ac_nir_context *ctx, nir_deref_instr *instr, bool vs_in,
1977 unsigned *vertex_index_out, LLVMValueRef *vertex_index_ref,
1978 unsigned *const_out, LLVMValueRef *indir_out)
1979 {
1980 nir_variable *var = nir_deref_instr_get_variable(instr);
1981 nir_deref_path path;
1982 unsigned idx_lvl = 1;
1983
1984 nir_deref_path_init(&path, instr, NULL);
1985
1986 if (vertex_index_out != NULL || vertex_index_ref != NULL) {
1987 if (vertex_index_ref) {
1988 *vertex_index_ref = get_src(ctx, path.path[idx_lvl]->arr.index);
1989 if (vertex_index_out)
1990 *vertex_index_out = 0;
1991 } else {
1992 *vertex_index_out = nir_src_as_uint(path.path[idx_lvl]->arr.index);
1993 }
1994 ++idx_lvl;
1995 }
1996
1997 uint32_t const_offset = 0;
1998 LLVMValueRef offset = NULL;
1999
2000 if (var->data.compact) {
2001 assert(instr->deref_type == nir_deref_type_array);
2002 const_offset = nir_src_as_uint(instr->arr.index);
2003 goto out;
2004 }
2005
2006 for (; path.path[idx_lvl]; ++idx_lvl) {
2007 const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type;
2008 if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) {
2009 unsigned index = path.path[idx_lvl]->strct.index;
2010
2011 for (unsigned i = 0; i < index; i++) {
2012 const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
2013 const_offset += glsl_count_attribute_slots(ft, vs_in);
2014 }
2015 } else if (path.path[idx_lvl]->deref_type == nir_deref_type_array) {
2016 unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in);
2017 if (nir_src_is_const(path.path[idx_lvl]->arr.index)) {
2018 const_offset += size * nir_src_as_uint(path.path[idx_lvl]->arr.index);
2019 } else {
2020 LLVMValueRef array_off =
2021 LLVMBuildMul(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, size, 0),
2022 get_src(ctx, path.path[idx_lvl]->arr.index), "");
2023 if (offset)
2024 offset = LLVMBuildAdd(ctx->ac.builder, offset, array_off, "");
2025 else
2026 offset = array_off;
2027 }
2028 } else
2029          unreachable("Unhandled deref type in get_deref_offset");
2030 }
2031
2032 out:
2033 nir_deref_path_finish(&path);
2034
2035 if (const_offset && offset)
2036 offset =
2037 LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, const_offset, 0), "");
2038
2039 *const_out = const_offset;
2040 *indir_out = offset;
2041 }
2042
2043 static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx, nir_intrinsic_instr *instr,
2044 bool load_inputs)
2045 {
2046 LLVMValueRef result;
2047 LLVMValueRef vertex_index = NULL;
2048 LLVMValueRef indir_index = NULL;
2049 unsigned const_index = 0;
2050
2051 nir_variable *var =
2052 nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
2053
2054 unsigned location = var->data.location;
2055 unsigned driver_location = var->data.driver_location;
2056 const bool is_patch = var->data.patch || var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
2057 var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
2058 const bool is_compact = var->data.compact;
2059
2060 get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false, NULL,
2061 is_patch ? NULL : &vertex_index, &const_index, &indir_index);
2062
2063 LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa);
2064
2065 LLVMTypeRef src_component_type;
2066 if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind)
2067 src_component_type = LLVMGetElementType(dest_type);
2068 else
2069 src_component_type = dest_type;
2070
2071 result =
2072 ctx->abi->load_tess_varyings(ctx->abi, src_component_type, vertex_index, indir_index,
2073 const_index, location, driver_location, var->data.location_frac,
2074 instr->num_components, is_patch, is_compact, load_inputs);
2075 if (instr->dest.ssa.bit_size == 16) {
2076 result = ac_to_integer(&ctx->ac, result);
2077 result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, "");
2078 }
2079 return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
2080 }
2081
2082 static unsigned type_scalar_size_bytes(const struct glsl_type *type)
2083 {
2084 assert(glsl_type_is_vector_or_scalar(type) || glsl_type_is_matrix(type));
2085 return glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
2086 }
2087
2088 static LLVMValueRef visit_load_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
2089 {
2090 nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
2091 nir_variable *var = nir_deref_instr_get_variable(deref);
2092
2093 LLVMValueRef values[8];
2094 int idx = 0;
2095 int ve = instr->dest.ssa.num_components;
2096 unsigned comp = 0;
2097 LLVMValueRef indir_index;
2098 LLVMValueRef ret;
2099 unsigned const_index;
2100 unsigned stride = 4;
2101 int mode = deref->mode;
2102
2103 if (var) {
2104 bool vs_in = ctx->stage == MESA_SHADER_VERTEX && var->data.mode == nir_var_shader_in;
2105 idx = var->data.driver_location;
2106 comp = var->data.location_frac;
2107 mode = var->data.mode;
2108
2109 get_deref_offset(ctx, deref, vs_in, NULL, NULL, &const_index, &indir_index);
2110
2111 if (var->data.compact) {
2112 stride = 1;
2113 const_index += comp;
2114 comp = 0;
2115 }
2116 }
2117
2118 if (instr->dest.ssa.bit_size == 64 &&
2119 (deref->mode == nir_var_shader_in || deref->mode == nir_var_shader_out ||
2120 deref->mode == nir_var_function_temp))
2121 ve *= 2;
2122
2123 switch (mode) {
2124 case nir_var_shader_in:
2125 /* TODO: remove this after RADV switches to lowered IO */
2126 if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_TESS_EVAL) {
2127 return load_tess_varyings(ctx, instr, true);
2128 }
2129
2130 if (ctx->stage == MESA_SHADER_GEOMETRY) {
2131 LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
2132 LLVMValueRef indir_index;
2133 unsigned const_index, vertex_index;
2134 get_deref_offset(ctx, deref, false, &vertex_index, NULL, &const_index, &indir_index);
2135 assert(indir_index == NULL);
2136
2137 return ctx->abi->load_inputs(ctx->abi, var->data.location, var->data.driver_location,
2138 var->data.location_frac, instr->num_components, vertex_index,
2139 const_index, type);
2140 }
2141
2142 for (unsigned chan = comp; chan < ve + comp; chan++) {
2143 if (indir_index) {
2144 unsigned count =
2145 glsl_count_attribute_slots(var->type, ctx->stage == MESA_SHADER_VERTEX);
2146 count -= chan / 4;
2147 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2148 &ctx->ac, ctx->abi->inputs + idx + chan, count, stride, false, true);
2149
2150 values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, "");
2151 } else
2152 values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
2153 }
2154 break;
2155 case nir_var_function_temp:
2156 for (unsigned chan = 0; chan < ve; chan++) {
2157 if (indir_index) {
2158 unsigned count = glsl_count_attribute_slots(var->type, false);
2159 count -= chan / 4;
2160 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2161 &ctx->ac, ctx->locals + idx + chan, count, stride, true, true);
2162
2163 values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, "");
2164 } else {
2165 values[chan] =
2166 LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], "");
2167 }
2168 }
2169 break;
2170 case nir_var_shader_out:
2171 /* TODO: remove this after RADV switches to lowered IO */
2172 if (ctx->stage == MESA_SHADER_TESS_CTRL) {
2173 return load_tess_varyings(ctx, instr, false);
2174 }
2175
2176 if (ctx->stage == MESA_SHADER_FRAGMENT && var->data.fb_fetch_output && ctx->abi->emit_fbfetch)
2177 return ctx->abi->emit_fbfetch(ctx->abi);
2178
2179 for (unsigned chan = comp; chan < ve + comp; chan++) {
2180 if (indir_index) {
2181 unsigned count = glsl_count_attribute_slots(var->type, false);
2182 count -= chan / 4;
2183 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2184 &ctx->ac, ctx->abi->outputs + idx + chan, count, stride, true, true);
2185
2186 values[chan] = LLVMBuildExtractElement(ctx->ac.builder, tmp_vec, indir_index, "");
2187 } else {
2188 values[chan] = LLVMBuildLoad(ctx->ac.builder,
2189 ctx->abi->outputs[idx + chan + const_index * stride], "");
2190 }
2191 }
2192 break;
2193 case nir_var_mem_global: {
2194 LLVMValueRef address = get_src(ctx, instr->src[0]);
2195 LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa);
2196 unsigned explicit_stride = glsl_get_explicit_stride(deref->type);
2197 unsigned natural_stride = type_scalar_size_bytes(deref->type);
2198 unsigned stride = explicit_stride ? explicit_stride : natural_stride;
2199 int elem_size_bytes = ac_get_elem_bits(&ctx->ac, result_type) / 8;
2200 bool split_loads = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4;
2201
2202 if (stride != natural_stride || split_loads) {
2203 if (LLVMGetTypeKind(result_type) == LLVMVectorTypeKind)
2204 result_type = LLVMGetElementType(result_type);
2205
2206 LLVMTypeRef ptr_type =
2207 LLVMPointerType(result_type, LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
2208 address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, "");
2209
2210 for (unsigned i = 0; i < instr->dest.ssa.num_components; ++i) {
2211 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * stride / natural_stride, 0);
2212 values[i] =
2213 LLVMBuildLoad(ctx->ac.builder, ac_build_gep_ptr(&ctx->ac, address, offset), "");
2214
2215 if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
2216 LLVMSetOrdering(values[i], LLVMAtomicOrderingMonotonic);
2217 }
2218 return ac_build_gather_values(&ctx->ac, values, instr->dest.ssa.num_components);
2219 } else {
2220 LLVMTypeRef ptr_type =
2221 LLVMPointerType(result_type, LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
2222 address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, "");
2223 LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, "");
2224
2225 if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
2226 LLVMSetOrdering(val, LLVMAtomicOrderingMonotonic);
2227 return val;
2228 }
2229 }
2230 default:
2231       unreachable("unhandled variable mode");
2232 }
2233 ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp);
2234 return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
2235 }
2236
2237 static void visit_store_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
2238 {
2239 if (ctx->ac.postponed_kill) {
2240 LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
2241 ac_build_ifcc(&ctx->ac, cond, 7002);
2242 }
2243
2244 nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
2245 nir_variable *var = nir_deref_instr_get_variable(deref);
2246
2247 LLVMValueRef temp_ptr, value;
2248 int idx = 0;
2249 unsigned comp = 0;
2250 LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1]));
2251 int writemask = instr->const_index[0];
2252 LLVMValueRef indir_index;
2253 unsigned const_index;
2254
2255 if (var) {
2256 get_deref_offset(ctx, deref, false, NULL, NULL, &const_index, &indir_index);
2257 idx = var->data.driver_location;
2258 comp = var->data.location_frac;
2259
2260 if (var->data.compact) {
2261 const_index += comp;
2262 comp = 0;
2263 }
2264 }
2265
2266 if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64 &&
2267 (deref->mode == nir_var_shader_out || deref->mode == nir_var_function_temp)) {
2268
2269 src = LLVMBuildBitCast(ctx->ac.builder, src,
2270 LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), "");
2271
2272 writemask = widen_mask(writemask, 2);
2273 }
2274
2275 writemask = writemask << comp;
2276
2277 switch (deref->mode) {
2278 case nir_var_shader_out:
2279 /* TODO: remove this after RADV switches to lowered IO */
2280 if (ctx->stage == MESA_SHADER_TESS_CTRL) {
2281 LLVMValueRef vertex_index = NULL;
2282 LLVMValueRef indir_index = NULL;
2283 unsigned const_index = 0;
2284 const bool is_patch = var->data.patch ||
2285 var->data.location == VARYING_SLOT_TESS_LEVEL_INNER ||
2286 var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER;
2287
2288 get_deref_offset(ctx, deref, false, NULL, is_patch ? NULL : &vertex_index, &const_index,
2289 &indir_index);
2290
2291 ctx->abi->store_tcs_outputs(ctx->abi, var, vertex_index, indir_index, const_index, src,
2292 writemask, var->data.location_frac, var->data.driver_location);
2293 break;
2294 }
2295
2296 for (unsigned chan = 0; chan < 8; chan++) {
2297 int stride = 4;
2298 if (!(writemask & (1 << chan)))
2299 continue;
2300
2301 value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp);
2302
2303 if (var->data.compact)
2304 stride = 1;
2305 if (indir_index) {
2306 unsigned count = glsl_count_attribute_slots(var->type, false);
2307 count -= chan / 4;
2308 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2309 &ctx->ac, ctx->abi->outputs + idx + chan, count, stride, true, true);
2310
2311 tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, value, indir_index, "");
2312 build_store_values_extended(&ctx->ac, ctx->abi->outputs + idx + chan, count, stride,
2313 tmp_vec);
2314
2315 } else {
2316 temp_ptr = ctx->abi->outputs[idx + chan + const_index * stride];
2317
2318 LLVMBuildStore(ctx->ac.builder, value, temp_ptr);
2319 }
2320 }
2321 break;
2322 case nir_var_function_temp:
2323 for (unsigned chan = 0; chan < 8; chan++) {
2324 if (!(writemask & (1 << chan)))
2325 continue;
2326
2327 value = ac_llvm_extract_elem(&ctx->ac, src, chan);
2328 if (indir_index) {
2329 unsigned count = glsl_count_attribute_slots(var->type, false);
2330 count -= chan / 4;
2331 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2332 &ctx->ac, ctx->locals + idx + chan, count, 4, true, true);
2333
2334 tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec, value, indir_index, "");
2335 build_store_values_extended(&ctx->ac, ctx->locals + idx + chan, count, 4, tmp_vec);
2336 } else {
2337 temp_ptr = ctx->locals[idx + chan + const_index * 4];
2338
2339 LLVMBuildStore(ctx->ac.builder, value, temp_ptr);
2340 }
2341 }
2342 break;
2343
2344 case nir_var_mem_global: {
2345 int writemask = instr->const_index[0];
2346 LLVMValueRef address = get_src(ctx, instr->src[0]);
2347 LLVMValueRef val = get_src(ctx, instr->src[1]);
2348
2349 unsigned explicit_stride = glsl_get_explicit_stride(deref->type);
2350 unsigned natural_stride = type_scalar_size_bytes(deref->type);
2351 unsigned stride = explicit_stride ? explicit_stride : natural_stride;
2352 int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(val)) / 8;
2353 bool split_stores = ctx->ac.chip_class == GFX6 && elem_size_bytes < 4;
2354
2355 LLVMTypeRef ptr_type =
2356 LLVMPointerType(LLVMTypeOf(val), LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
2357 address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, "");
2358
2359 if (writemask == (1u << ac_get_llvm_num_components(val)) - 1 && stride == natural_stride &&
2360 !split_stores) {
2361 LLVMTypeRef ptr_type =
2362 LLVMPointerType(LLVMTypeOf(val), LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
2363 address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, "");
2364
2365 val = LLVMBuildBitCast(ctx->ac.builder, val, LLVMGetElementType(LLVMTypeOf(address)), "");
2366 LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, val, address);
2367
2368 if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
2369 LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic);
2370 } else {
2371 LLVMTypeRef val_type = LLVMTypeOf(val);
2372 if (LLVMGetTypeKind(LLVMTypeOf(val)) == LLVMVectorTypeKind)
2373 val_type = LLVMGetElementType(val_type);
2374
2375 LLVMTypeRef ptr_type =
2376 LLVMPointerType(val_type, LLVMGetPointerAddressSpace(LLVMTypeOf(address)));
2377 address = LLVMBuildBitCast(ctx->ac.builder, address, ptr_type, "");
2378 for (unsigned chan = 0; chan < 4; chan++) {
2379 if (!(writemask & (1 << chan)))
2380 continue;
2381
2382 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, chan * stride / natural_stride, 0);
2383
2384 LLVMValueRef ptr = ac_build_gep_ptr(&ctx->ac, address, offset);
2385 LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val, chan);
2386 src = LLVMBuildBitCast(ctx->ac.builder, src, LLVMGetElementType(LLVMTypeOf(ptr)), "");
2387 LLVMValueRef store = LLVMBuildStore(ctx->ac.builder, src, ptr);
2388
2389 if (nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE))
2390 LLVMSetOrdering(store, LLVMAtomicOrderingMonotonic);
2391 }
2392 }
2393 break;
2394 }
2395 default:
2396 abort();
2397 break;
2398 }
2399
2400 if (ctx->ac.postponed_kill)
2401 ac_build_endif(&ctx->ac, 7002);
2402 }
2403
2404 static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
2405 {
2406 if (ctx->ac.postponed_kill) {
2407 LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
2408 ac_build_ifcc(&ctx->ac, cond, 7002);
2409 }
2410
2411 unsigned base = nir_intrinsic_base(instr);
2412 unsigned writemask = nir_intrinsic_write_mask(instr);
2413 unsigned component = nir_intrinsic_component(instr);
2414 LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
2415 nir_src offset = *nir_get_io_offset_src(instr);
2416 LLVMValueRef indir_index = NULL;
2417
2418 if (nir_src_is_const(offset))
2419 assert(nir_src_as_uint(offset) == 0);
2420 else
2421 indir_index = get_src(ctx, offset);
2422
2423 switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) {
2424 case 32:
2425 break;
2426 case 64:
2427 writemask = widen_mask(writemask, 2);
2428 src = LLVMBuildBitCast(ctx->ac.builder, src,
2429 LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2), "");
2430 break;
2431 default:
2432 unreachable("unhandled store_output bit size");
2433 return;
2434 }
2435
2436 writemask <<= component;
2437
2438 if (ctx->stage == MESA_SHADER_TESS_CTRL) {
2439 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
2440 LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL;
2441
2442 ctx->abi->store_tcs_outputs(ctx->abi, NULL, vertex_index, indir_index, 0, src, writemask,
2443 component, base * 4);
2444 return;
2445 }
2446
2447 /* No indirect indexing is allowed after this point. */
2448 assert(!indir_index);
2449
2450 for (unsigned chan = 0; chan < 8; chan++) {
2451 if (!(writemask & (1 << chan)))
2452 continue;
2453
2454 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
2455 LLVMBuildStore(ctx->ac.builder, value, ctx->abi->outputs[base * 4 + chan]);
2456 }
2457
2458 if (ctx->ac.postponed_kill)
2459 ac_build_endif(&ctx->ac, 7002);
2460 }
2461
2462 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
2463 {
2464 switch (dim) {
2465 case GLSL_SAMPLER_DIM_BUF:
2466 return 1;
2467 case GLSL_SAMPLER_DIM_1D:
2468 return array ? 2 : 1;
2469 case GLSL_SAMPLER_DIM_2D:
2470 return array ? 3 : 2;
2471 case GLSL_SAMPLER_DIM_MS:
2472 return array ? 4 : 3;
2473 case GLSL_SAMPLER_DIM_3D:
2474 case GLSL_SAMPLER_DIM_CUBE:
2475 return 3;
2476 case GLSL_SAMPLER_DIM_RECT:
2477 case GLSL_SAMPLER_DIM_SUBPASS:
2478 return 2;
2479 case GLSL_SAMPLER_DIM_SUBPASS_MS:
2480 return 3;
2481 default:
2482 break;
2483 }
2484 return 0;
2485 }
2486
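2486 /* MSAA surfaces with FMASK store their samples in a remapped order: look up
2486  * the FMASK value for this texel and use it to translate the logical sample
2486  * index into the physical sample slot.
2486  */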
2487 static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx,
2488 LLVMValueRef coord_x, LLVMValueRef coord_y,
2489 LLVMValueRef coord_z, LLVMValueRef sample_index,
2490 LLVMValueRef fmask_desc_ptr)
2491 {
2492 unsigned sample_chan = coord_z ? 3 : 2;
2493 LLVMValueRef addr[4] = {coord_x, coord_y, coord_z};
2494 addr[sample_chan] = sample_index;
2495
2496 ac_apply_fmask_to_sample(ctx, fmask_desc_ptr, addr, coord_z != NULL);
2497 return addr[sample_chan];
2498 }
2499
2500 static nir_deref_instr *get_image_deref(const nir_intrinsic_instr *instr)
2501 {
2502 assert(instr->src[0].is_ssa);
2503 return nir_instr_as_deref(instr->src[0].ssa->parent_instr);
2504 }
2505
2506 static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
2507 const nir_intrinsic_instr *instr,
2508 LLVMValueRef dynamic_index,
2509 enum ac_descriptor_type desc_type, bool write)
2510 {
2511 nir_deref_instr *deref_instr = instr->src[0].ssa->parent_instr->type == nir_instr_type_deref
2512 ? nir_instr_as_deref(instr->src[0].ssa->parent_instr)
2513 : NULL;
2514
2515 return get_sampler_desc(ctx, deref_instr, desc_type, &instr->instr, dynamic_index, true, write);
2516 }
2517
2518 static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
2519 LLVMValueRef dynamic_desc_index, struct ac_image_args *args,
2520 enum glsl_sampler_dim dim, bool is_array)
2521 {
2522 LLVMValueRef src0 = get_src(ctx, instr->src[1]);
2523 LLVMValueRef masks[] = {
2524 LLVMConstInt(ctx->ac.i32, 0, false),
2525 LLVMConstInt(ctx->ac.i32, 1, false),
2526 LLVMConstInt(ctx->ac.i32, 2, false),
2527 LLVMConstInt(ctx->ac.i32, 3, false),
2528 };
2529 LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
2530
2531 int count;
2532 ASSERTED bool add_frag_pos =
2533 (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
2534 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
2535 bool gfx9_1d = ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
2536 assert(!add_frag_pos && "Input attachments should be lowered by this point.");
2537 count = image_type_to_components_count(dim, is_array);
2538
2539 if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load ||
2540 instr->intrinsic == nir_intrinsic_bindless_image_load)) {
2541 LLVMValueRef fmask_load_address[3];
2542
2543 fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
2544 fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], "");
2545 if (is_array)
2546 fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], "");
2547 else
2548 fmask_load_address[2] = NULL;
2549
2550 sample_index = adjust_sample_index_using_fmask(
2551 &ctx->ac, fmask_load_address[0], fmask_load_address[1], fmask_load_address[2],
2552 sample_index,
2553 get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), AC_DESC_FMASK,
2554 &instr->instr, dynamic_desc_index, true, false));
2555 }
2556 if (count == 1 && !gfx9_1d) {
2557 if (instr->src[1].ssa->num_components)
2558 args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
2559 else
2560 args->coords[0] = src0;
2561 } else {
2562 int chan;
2563 if (is_ms)
2564 count--;
2565 for (chan = 0; chan < count; ++chan) {
2566 args->coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan);
2567 }
2568
2569 if (gfx9_1d) {
2570 if (is_array) {
2571 args->coords[2] = args->coords[1];
2572 args->coords[1] = ctx->ac.i32_0;
2573 } else
2574 args->coords[1] = ctx->ac.i32_0;
2575 count++;
2576 }
2577 if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
2578 /* The hw can't bind a slice of a 3D image as a 2D
2579 * image, because it ignores BASE_ARRAY if the target
2580 * is 3D. The workaround is to read BASE_ARRAY and set
2581 * it as the 3rd address operand for all 2D images.
2582 */
2583 LLVMValueRef first_layer, const5, mask;
2584
2585 const5 = LLVMConstInt(ctx->ac.i32, 5, 0);
2586 mask = LLVMConstInt(ctx->ac.i32, S_008F24_BASE_ARRAY(~0), 0);
2587 first_layer = LLVMBuildExtractElement(ctx->ac.builder, args->resource, const5, "");
2588 first_layer = LLVMBuildAnd(ctx->ac.builder, first_layer, mask, "");
2589
2590 args->coords[count] = first_layer;
2591 count++;
2592 }
2593
2594 if (is_ms) {
2595 args->coords[count] = sample_index;
2596 count++;
2597 }
2598 }
2599 }
2600
2601 static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
2602 const nir_intrinsic_instr *instr,
2603 LLVMValueRef dynamic_index, bool write, bool atomic)
2604 {
2605 LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, write);
2606 if (ctx->ac.chip_class == GFX9 && LLVM_VERSION_MAJOR < 9 && atomic) {
2607 LLVMValueRef elem_count =
2608 LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
2609 LLVMValueRef stride =
2610 LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
2611 stride = LLVMBuildLShr(ctx->ac.builder, stride, LLVMConstInt(ctx->ac.i32, 16, 0), "");
2612
2613 LLVMValueRef new_elem_count = LLVMBuildSelect(
2614 ctx->ac.builder, LLVMBuildICmp(ctx->ac.builder, LLVMIntUGT, elem_count, stride, ""),
2615 elem_count, stride, "");
2616
2617 rsrc = LLVMBuildInsertElement(ctx->ac.builder, rsrc, new_elem_count,
2618 LLVMConstInt(ctx->ac.i32, 2, 0), "");
2619 }
2620 return rsrc;
2621 }
2622
2623 static LLVMValueRef enter_waterfall_image(struct ac_nir_context *ctx,
2624 struct waterfall_context *wctx,
2625 const nir_intrinsic_instr *instr)
2626 {
2627 nir_deref_instr *deref_instr = NULL;
2628
2629 if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref)
2630 deref_instr = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
2631
2632 LLVMValueRef value = get_sampler_desc_index(ctx, deref_instr, &instr->instr, true);
2633 return enter_waterfall(ctx, wctx, value, nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM);
2634 }
2635
2636 static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
2637 bool bindless)
2638 {
2639 LLVMValueRef res;
2640
2641 enum glsl_sampler_dim dim;
2642 enum gl_access_qualifier access = nir_intrinsic_access(instr);
2643 bool is_array;
2644 if (bindless) {
2645 dim = nir_intrinsic_image_dim(instr);
2646 is_array = nir_intrinsic_image_array(instr);
2647 } else {
2648 const nir_deref_instr *image_deref = get_image_deref(instr);
2649 const struct glsl_type *type = image_deref->type;
2650 const nir_variable *var = nir_deref_instr_get_variable(image_deref);
2651 dim = glsl_get_sampler_dim(type);
2652 access |= var->data.access;
2653 is_array = glsl_sampler_type_is_array(type);
2654 }
2655
2656 struct waterfall_context wctx;
2657 LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
2658
2659 struct ac_image_args args = {};
2660
2661 args.cache_policy = get_cache_policy(ctx, access, false, false);
2662
2663 if (dim == GLSL_SAMPLER_DIM_BUF) {
2664 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
2665 unsigned num_channels = util_last_bit(mask);
2666 LLVMValueRef rsrc, vindex;
2667
2668 rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, false, false);
2669 vindex =
2670 LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, "");
2671
2672 assert(instr->dest.is_ssa);
2673 bool can_speculate = access & ACCESS_CAN_REORDER;
2674 res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels,
2675 args.cache_policy, can_speculate,
2676 instr->dest.ssa.bit_size == 16);
2677 res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels);
2678
2679 res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
2680 res = ac_to_integer(&ctx->ac, res);
2681 } else {
2682 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
2683
2684 args.opcode = level_zero ? ac_image_load : ac_image_load_mip;
2685 args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
2686 get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
2687 args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
2688 if (!level_zero)
2689 args.lod = get_src(ctx, instr->src[3]);
2690 args.dmask = 15;
2691 args.attributes = AC_FUNC_ATTR_READONLY;
2692
2693 assert(instr->dest.is_ssa);
2694 args.d16 = instr->dest.ssa.bit_size == 16;
2695
2696 res = ac_build_image_opcode(&ctx->ac, &args);
2697 }
2698 return exit_waterfall(ctx, &wctx, res);
2699 }
2700
2701 static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
2702 bool bindless)
2703 {
2704 if (ctx->ac.postponed_kill) {
2705 LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
2706 ac_build_ifcc(&ctx->ac, cond, 7003);
2707 }
2708
2709 enum glsl_sampler_dim dim;
2710 enum gl_access_qualifier access = nir_intrinsic_access(instr);
2711 bool is_array;
2712
2713 if (bindless) {
2714 dim = nir_intrinsic_image_dim(instr);
2715 is_array = nir_intrinsic_image_array(instr);
2716 } else {
2717 const nir_deref_instr *image_deref = get_image_deref(instr);
2718 const struct glsl_type *type = image_deref->type;
2719 const nir_variable *var = nir_deref_instr_get_variable(image_deref);
2720 dim = glsl_get_sampler_dim(type);
2721 access |= var->data.access;
2722 is_array = glsl_sampler_type_is_array(type);
2723 }
2724
2725 struct waterfall_context wctx;
2726 LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
2727
2728 bool writeonly_memory = access & ACCESS_NON_READABLE;
2729 struct ac_image_args args = {};
2730
2731 args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);
2732
2733 if (dim == GLSL_SAMPLER_DIM_BUF) {
2734 LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, false);
2735 LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
2736 unsigned src_channels = ac_get_llvm_num_components(src);
2737 LLVMValueRef vindex;
2738
2739 if (src_channels == 3)
2740 src = ac_build_expand_to_vec4(&ctx->ac, src, 3);
2741
2742 vindex =
2743 LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, "");
2744
2745 ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.cache_policy);
2746 } else {
2747 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
2748
2749 args.opcode = level_zero ? ac_image_store : ac_image_store_mip;
2750 args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
2751 args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
2752 get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
2753 args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
2754 if (!level_zero)
2755 args.lod = get_src(ctx, instr->src[4]);
2756 args.dmask = 15;
2757 args.d16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.data[0])) == 16;
2758
2759 ac_build_image_opcode(&ctx->ac, &args);
2760 }
2761
2762 exit_waterfall(ctx, &wctx, NULL);
2763 if (ctx->ac.postponed_kill)
2764 ac_build_endif(&ctx->ac, 7003);
2765 }
2766
2767 static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
2768 bool bindless)
2769 {
2770 if (ctx->ac.postponed_kill) {
2771 LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
2772 ac_build_ifcc(&ctx->ac, cond, 7004);
2773 }
2774
2775 LLVMValueRef params[7];
2776 int param_count = 0;
2777
2778 bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap ||
2779 instr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap;
2780 const char *atomic_name;
2781 char intrinsic_name[64];
2782 enum ac_atomic_op atomic_subop;
2783 ASSERTED int length;
2784
2785 enum glsl_sampler_dim dim;
2786 bool is_array;
2787 if (bindless) {
2788 if (instr->intrinsic == nir_intrinsic_bindless_image_atomic_imin ||
2789 instr->intrinsic == nir_intrinsic_bindless_image_atomic_umin ||
2790 instr->intrinsic == nir_intrinsic_bindless_image_atomic_imax ||
2791 instr->intrinsic == nir_intrinsic_bindless_image_atomic_umax) {
2792 ASSERTED const GLenum format = nir_intrinsic_format(instr);
2793 assert(format == GL_R32UI || format == GL_R32I);
2794 }
2795 dim = nir_intrinsic_image_dim(instr);
2796 is_array = nir_intrinsic_image_array(instr);
2797 } else {
2798 const struct glsl_type *type = get_image_deref(instr)->type;
2799 dim = glsl_get_sampler_dim(type);
2800 is_array = glsl_sampler_type_is_array(type);
2801 }
2802
2803 struct waterfall_context wctx;
2804 LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
2805
2806 switch (instr->intrinsic) {
2807 case nir_intrinsic_bindless_image_atomic_add:
2808 case nir_intrinsic_image_deref_atomic_add:
2809 atomic_name = "add";
2810 atomic_subop = ac_atomic_add;
2811 break;
2812 case nir_intrinsic_bindless_image_atomic_imin:
2813 case nir_intrinsic_image_deref_atomic_imin:
2814 atomic_name = "smin";
2815 atomic_subop = ac_atomic_smin;
2816 break;
2817 case nir_intrinsic_bindless_image_atomic_umin:
2818 case nir_intrinsic_image_deref_atomic_umin:
2819 atomic_name = "umin";
2820 atomic_subop = ac_atomic_umin;
2821 break;
2822 case nir_intrinsic_bindless_image_atomic_imax:
2823 case nir_intrinsic_image_deref_atomic_imax:
2824 atomic_name = "smax";
2825 atomic_subop = ac_atomic_smax;
2826 break;
2827 case nir_intrinsic_bindless_image_atomic_umax:
2828 case nir_intrinsic_image_deref_atomic_umax:
2829 atomic_name = "umax";
2830 atomic_subop = ac_atomic_umax;
2831 break;
2832 case nir_intrinsic_bindless_image_atomic_and:
2833 case nir_intrinsic_image_deref_atomic_and:
2834 atomic_name = "and";
2835 atomic_subop = ac_atomic_and;
2836 break;
2837 case nir_intrinsic_bindless_image_atomic_or:
2838 case nir_intrinsic_image_deref_atomic_or:
2839 atomic_name = "or";
2840 atomic_subop = ac_atomic_or;
2841 break;
2842 case nir_intrinsic_bindless_image_atomic_xor:
2843 case nir_intrinsic_image_deref_atomic_xor:
2844 atomic_name = "xor";
2845 atomic_subop = ac_atomic_xor;
2846 break;
2847 case nir_intrinsic_bindless_image_atomic_exchange:
2848 case nir_intrinsic_image_deref_atomic_exchange:
2849 atomic_name = "swap";
2850 atomic_subop = ac_atomic_swap;
2851 break;
2852 case nir_intrinsic_bindless_image_atomic_comp_swap:
2853 case nir_intrinsic_image_deref_atomic_comp_swap:
2854 atomic_name = "cmpswap";
2855 atomic_subop = 0; /* not used */
2856 break;
2857 case nir_intrinsic_bindless_image_atomic_inc_wrap:
2858 case nir_intrinsic_image_deref_atomic_inc_wrap: {
2859 atomic_name = "inc";
2860 atomic_subop = ac_atomic_inc_wrap;
2861 break;
2862 }
2863 case nir_intrinsic_bindless_image_atomic_dec_wrap:
2864 case nir_intrinsic_image_deref_atomic_dec_wrap:
2865 atomic_name = "dec";
2866 atomic_subop = ac_atomic_dec_wrap;
2867 break;
2868 default:
2869 abort();
2870 }
2871
2872 if (cmpswap)
2873 params[param_count++] = get_src(ctx, instr->src[4]);
2874 params[param_count++] = get_src(ctx, instr->src[3]);
2875
2876 LLVMValueRef result;
2877 if (dim == GLSL_SAMPLER_DIM_BUF) {
2878 params[param_count++] = get_image_buffer_descriptor(ctx, instr, dynamic_index, true, true);
2879 params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
2880 ctx->ac.i32_0, ""); /* vindex */
2881 params[param_count++] = ctx->ac.i32_0; /* voffset */
2882 if (LLVM_VERSION_MAJOR >= 9) {
2883 /* XXX: The new raw/struct atomic intrinsics are buggy
2884 * with LLVM 8, see r358579.
2885 */
2886 params[param_count++] = ctx->ac.i32_0; /* soffset */
2887 params[param_count++] = ctx->ac.i32_0; /* slc */
2888
2889 length = snprintf(intrinsic_name, sizeof(intrinsic_name),
2890 "llvm.amdgcn.struct.buffer.atomic.%s.i32", atomic_name);
2891 } else {
2892 params[param_count++] = ctx->ac.i1false; /* slc */
2893
2894 length = snprintf(intrinsic_name, sizeof(intrinsic_name), "llvm.amdgcn.buffer.atomic.%s",
2895 atomic_name);
2896 }
2897
2898 assert(length < sizeof(intrinsic_name));
2899 result = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, params, param_count, 0);
2900 } else {
2901 struct ac_image_args args = {};
2902 args.opcode = cmpswap ? ac_image_atomic_cmpswap : ac_image_atomic;
2903 args.atomic = atomic_subop;
2904 args.data[0] = params[0];
2905 if (cmpswap)
2906 args.data[1] = params[1];
2907 args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, true);
2908 get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
2909 args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
2910
2911 result = ac_build_image_opcode(&ctx->ac, &args);
2912 }
2913
2914 result = exit_waterfall(ctx, &wctx, result);
2915 if (ctx->ac.postponed_kill)
2916 ac_build_endif(&ctx->ac, 7004);
2917 return result;
2918 }
2919
2920 static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
2921 {
2922 struct waterfall_context wctx;
2923 LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
2924 LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
2925
2926 LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc);
2927
2928 return exit_waterfall(ctx, &wctx, ret);
2929 }
2930
2931 static LLVMValueRef visit_image_size(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
2932 bool bindless)
2933 {
2934 LLVMValueRef res;
2935
2936 enum glsl_sampler_dim dim;
2937 bool is_array;
2938 if (bindless) {
2939 dim = nir_intrinsic_image_dim(instr);
2940 is_array = nir_intrinsic_image_array(instr);
2941 } else {
2942 const struct glsl_type *type = get_image_deref(instr)->type;
2943 dim = glsl_get_sampler_dim(type);
2944 is_array = glsl_sampler_type_is_array(type);
2945 }
2946
2947 struct waterfall_context wctx;
2948 LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
2949
2950 if (dim == GLSL_SAMPLER_DIM_BUF) {
2951 res = get_buffer_size(
2952 ctx, get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, false), true);
2953 } else {
2954
2955 struct ac_image_args args = {0};
2956
2957 args.dim = ac_get_image_dim(ctx->ac.chip_class, dim, is_array);
2958 args.dmask = 0xf;
2959 args.resource = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
2960 args.opcode = ac_image_get_resinfo;
2961 assert(nir_src_as_uint(instr->src[1]) == 0);
2962 args.lod = ctx->ac.i32_0;
2963 args.attributes = AC_FUNC_ATTR_READNONE;
2964
2965 res = ac_build_image_opcode(&ctx->ac, &args);
2966
2967 LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
2968
2969 if (dim == GLSL_SAMPLER_DIM_CUBE && is_array) {
2970 LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
2971 LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
2972 z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
2973 res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
2974 }
2975
2976 if (ctx->ac.chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
2977 LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
2978 res = LLVMBuildInsertElement(ctx->ac.builder, res, layers, ctx->ac.i32_1, "");
2979 }
2980 }
2981 return exit_waterfall(ctx, &wctx, res);
2982 }
2983
2984 static void emit_membar(struct ac_llvm_context *ac, const nir_intrinsic_instr *instr)
2985 {
2986 unsigned wait_flags = 0;
2987
2988 switch (instr->intrinsic) {
2989 case nir_intrinsic_memory_barrier:
2990 case nir_intrinsic_group_memory_barrier:
2991 wait_flags = AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE;
2992 break;
2993 case nir_intrinsic_memory_barrier_buffer:
2994 case nir_intrinsic_memory_barrier_image:
2995 wait_flags = AC_WAIT_VLOAD | AC_WAIT_VSTORE;
2996 break;
2997 case nir_intrinsic_memory_barrier_shared:
2998 wait_flags = AC_WAIT_LGKM;
2999 break;
3000 default:
3001 break;
3002 }
3003
3004 ac_build_waitcnt(ac, wait_flags);
3005 }
3006
3007 void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage)
3008 {
3009 /* GFX6 only (thanks to a hw bug workaround):
3010     * The real barrier instruction isn't needed, because an entire patch
3011 * always fits into a single wave.
3012 */
3013 if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL) {
3014 ac_build_waitcnt(ac, AC_WAIT_LGKM | AC_WAIT_VLOAD | AC_WAIT_VSTORE);
3015 return;
3016 }
3017 ac_build_s_barrier(ac);
3018 }
3019
3020 static void emit_discard(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
3021 {
3022 LLVMValueRef cond;
3023
3024 if (instr->intrinsic == nir_intrinsic_discard_if) {
3025 cond =
3026 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, get_src(ctx, instr->src[0]), ctx->ac.i32_0, "");
3027 } else {
3028 assert(instr->intrinsic == nir_intrinsic_discard);
3029 cond = ctx->ac.i1false;
3030 }
3031
3032 ac_build_kill_if_false(&ctx->ac, cond);
3033 }
3034
3035 static void emit_demote(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
3036 {
3037 LLVMValueRef cond;
3038
3039 if (instr->intrinsic == nir_intrinsic_demote_if) {
3040 cond =
3041 LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, get_src(ctx, instr->src[0]), ctx->ac.i32_0, "");
3042 } else {
3043 assert(instr->intrinsic == nir_intrinsic_demote);
3044 cond = ctx->ac.i1false;
3045 }
3046
3047 /* Kill immediately while maintaining WQM. */
3048 ac_build_kill_if_false(&ctx->ac, ac_build_wqm_vote(&ctx->ac, cond));
3049
3050 LLVMValueRef mask = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
3051 mask = LLVMBuildAnd(ctx->ac.builder, mask, cond, "");
3052 LLVMBuildStore(ctx->ac.builder, mask, ctx->ac.postponed_kill);
3053 return;
3054 }
3055
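/* Reconstruct the flat local invocation index as
 *    subgroup_id * wave_size + lane_id.
 * The code below assumes tg_size packs the wave index within the group in
 * bits [11:6] (hence the 0xfc0 mask, already scaled by 64) and therefore
 * needs one extra right shift when running wave32.
 */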
3056 static LLVMValueRef visit_load_local_invocation_index(struct ac_nir_context *ctx)
3057 {
3058 LLVMValueRef result;
3059 LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac);
3060 result = LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size),
3061 LLVMConstInt(ctx->ac.i32, 0xfc0, false), "");
3062
3063 if (ctx->ac.wave_size == 32)
3064 result = LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 1, false), "");
3065
3066 return LLVMBuildAdd(ctx->ac.builder, result, thread_id, "");
3067 }
3068
3069 static LLVMValueRef visit_load_subgroup_id(struct ac_nir_context *ctx)
3070 {
3071 if (ctx->stage == MESA_SHADER_COMPUTE) {
3072 LLVMValueRef result;
3073 result = LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size),
3074 LLVMConstInt(ctx->ac.i32, 0xfc0, false), "");
3075 return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), "");
3076 } else {
3077 return LLVMConstInt(ctx->ac.i32, 0, false);
3078 }
3079 }
3080
3081 static LLVMValueRef visit_load_num_subgroups(struct ac_nir_context *ctx)
3082 {
3083 if (ctx->stage == MESA_SHADER_COMPUTE) {
3084 return LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size),
3085 LLVMConstInt(ctx->ac.i32, 0x3f, false), "");
3086 } else {
3087 return LLVMConstInt(ctx->ac.i32, 1, false);
3088 }
3089 }
3090
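/* A ballot over an all-true predicate yields the mask of active lanes;
 * cttz of that mask is then the index of the first active invocation.
 */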
3091 static LLVMValueRef visit_first_invocation(struct ac_nir_context *ctx)
3092 {
3093 LLVMValueRef active_set = ac_build_ballot(&ctx->ac, ctx->ac.i32_1);
3094 const char *intr = ctx->ac.wave_size == 32 ? "llvm.cttz.i32" : "llvm.cttz.i64";
3095
3096 /* The second argument is whether cttz(0) should be defined, but we do not care. */
3097 LLVMValueRef args[] = {active_set, ctx->ac.i1false};
3098 LLVMValueRef result = ac_build_intrinsic(&ctx->ac, intr, ctx->ac.iN_wavemask, args, 2,
3099 AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE);
3100
3101 return LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i32, "");
3102 }
3103
3104 static LLVMValueRef visit_load_shared(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
3105 {
3106 LLVMValueRef values[4], derived_ptr, index, ret;
3107
3108 LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->dest.ssa.bit_size);
3109
3110 for (int chan = 0; chan < instr->num_components; chan++) {
3111 index = LLVMConstInt(ctx->ac.i32, chan, 0);
3112 derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
3113 values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
3114 }
3115
3116 ret = ac_build_gather_values(&ctx->ac, values, instr->num_components);
3117 return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
3118 }
3119
3120 static void visit_store_shared(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)
3121 {
3122 LLVMValueRef derived_ptr, data, index;
3123 LLVMBuilderRef builder = ctx->ac.builder;
3124
3125 LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[1], instr->src[0].ssa->bit_size);
3126 LLVMValueRef src = get_src(ctx, instr->src[0]);
3127
3128 int writemask = nir_intrinsic_write_mask(instr);
3129 for (int chan = 0; chan < 4; chan++) {
3130 if (!(writemask & (1 << chan))) {
3131 continue;
3132 }
3133 data = ac_llvm_extract_elem(&ctx->ac, src, chan);
3134 index = LLVMConstInt(ctx->ac.i32, chan, 0);
3135 derived_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
3136 LLVMBuildStore(builder, data, derived_ptr);
3137 }
3138 }
3139
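/* Shared/deref atomics map almost directly onto LLVM atomicrmw/cmpxchg.
 * The "workgroup" sync scope is enough for LDS; global derefs instead use a
 * "singlethread" scope to get relaxed ordering (see below). When
 * postponed_kill is active, the atomic is wrapped in an if-block so demoted
 * lanes do not perform it.
 */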
3140 static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr,
3141 LLVMValueRef ptr, int src_idx)
3142 {
3143 if (ctx->ac.postponed_kill) {
3144 LLVMValueRef cond = LLVMBuildLoad(ctx->ac.builder, ctx->ac.postponed_kill, "");
3145 ac_build_ifcc(&ctx->ac, cond, 7005);
3146 }
3147
3148 LLVMValueRef result;
3149 LLVMValueRef src = get_src(ctx, instr->src[src_idx]);
3150
3151 const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
3152
3153 if (instr->src[0].ssa->parent_instr->type == nir_instr_type_deref) {
3154 nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr);
3155 if (deref->mode == nir_var_mem_global) {
3156 /* use "singlethread" sync scope to implement relaxed ordering */
3157 sync_scope = LLVM_VERSION_MAJOR >= 9 ? "singlethread-one-as" : "singlethread";
3158
3159 LLVMTypeRef ptr_type =
3160 LLVMPointerType(LLVMTypeOf(src), LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)));
3161 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, "");
3162 }
3163 }
3164
3165 if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap ||
3166 instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) {
3167 LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]);
3168 result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope);
3169 result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, "");
3170 } else {
3171 LLVMAtomicRMWBinOp op;
3172 switch (instr->intrinsic) {
3173 case nir_intrinsic_shared_atomic_add:
3174 case nir_intrinsic_deref_atomic_add:
3175 op = LLVMAtomicRMWBinOpAdd;
3176 break;
3177 case nir_intrinsic_shared_atomic_umin:
3178 case nir_intrinsic_deref_atomic_umin:
3179 op = LLVMAtomicRMWBinOpUMin;
3180 break;
3181 case nir_intrinsic_shared_atomic_umax:
3182 case nir_intrinsic_deref_atomic_umax:
3183 op = LLVMAtomicRMWBinOpUMax;
3184 break;
3185 case nir_intrinsic_shared_atomic_imin:
3186 case nir_intrinsic_deref_atomic_imin:
3187 op = LLVMAtomicRMWBinOpMin;
3188 break;
3189 case nir_intrinsic_shared_atomic_imax:
3190 case nir_intrinsic_deref_atomic_imax:
3191 op = LLVMAtomicRMWBinOpMax;
3192 break;
3193 case nir_intrinsic_shared_atomic_and:
3194 case nir_intrinsic_deref_atomic_and:
3195 op = LLVMAtomicRMWBinOpAnd;
3196 break;
3197 case nir_intrinsic_shared_atomic_or:
3198 case nir_intrinsic_deref_atomic_or:
3199 op = LLVMAtomicRMWBinOpOr;
3200 break;
3201 case nir_intrinsic_shared_atomic_xor:
3202 case nir_intrinsic_deref_atomic_xor:
3203 op = LLVMAtomicRMWBinOpXor;
3204 break;
3205 case nir_intrinsic_shared_atomic_exchange:
3206 case nir_intrinsic_deref_atomic_exchange:
3207 op = LLVMAtomicRMWBinOpXchg;
3208 break;
3209 #if LLVM_VERSION_MAJOR >= 10
3210 case nir_intrinsic_shared_atomic_fadd:
3211 case nir_intrinsic_deref_atomic_fadd:
3212 op = LLVMAtomicRMWBinOpFAdd;
3213 break;
3214 #endif
3215 default:
3216 return NULL;
3217 }
3218
3219 LLVMValueRef val;
3220
3221 if (instr->intrinsic == nir_intrinsic_shared_atomic_fadd ||
3222 instr->intrinsic == nir_intrinsic_deref_atomic_fadd) {
3223 val = ac_to_float(&ctx->ac, src);
3224 } else {
3225 val = ac_to_integer(&ctx->ac, src);
3226 }
3227
3228 result = ac_build_atomic_rmw(&ctx->ac, op, ptr, val, sync_scope);
3229 }
3230
3231 if (ctx->ac.postponed_kill)
3232 ac_build_endif(&ctx->ac, 7005);
3233 return result;
3234 }
3235
3236 static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx)
3237 {
3238 LLVMValueRef values[2];
3239 LLVMValueRef pos[2];
3240
3241 pos[0] = ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->frag_pos[0]));
3242 pos[1] = ac_to_float(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->frag_pos[1]));
3243
3244 values[0] = ac_build_fract(&ctx->ac, pos[0], 32);
3245 values[1] = ac_build_fract(&ctx->ac, pos[1], 32);
3246 return ac_build_gather_values(&ctx->ac, values, 2);
3247 }
3248
3249 static LLVMValueRef lookup_interp_param(struct ac_nir_context *ctx, enum glsl_interp_mode interp,
3250 unsigned location)
3251 {
3252 switch (interp) {
3253 case INTERP_MODE_FLAT:
3254 default:
3255 return NULL;
3256 case INTERP_MODE_SMOOTH:
3257 case INTERP_MODE_NONE:
3258 if (location == INTERP_CENTER)
3259 return ac_get_arg(&ctx->ac, ctx->args->persp_center);
3260 else if (location == INTERP_CENTROID)
3261 return ctx->abi->persp_centroid;
3262 else if (location == INTERP_SAMPLE)
3263 return ac_get_arg(&ctx->ac, ctx->args->persp_sample);
3264 break;
3265 case INTERP_MODE_NOPERSPECTIVE:
3266 if (location == INTERP_CENTER)
3267 return ac_get_arg(&ctx->ac, ctx->args->linear_center);
3268 else if (location == INTERP_CENTROID)
3269 return ctx->abi->linear_centroid;
3270 else if (location == INTERP_SAMPLE)
3271 return ac_get_arg(&ctx->ac, ctx->args->linear_sample);
3272 break;
3273 }
3274 return NULL;
3275 }
3276
3277 static LLVMValueRef barycentric_center(struct ac_nir_context *ctx, unsigned mode)
3278 {
3279 LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER);
3280 return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
3281 }
3282
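/* Extrapolate the barycentric (i, j) from the pixel center to an arbitrary
 * offset using their screen-space derivatives, roughly
 *    i' = i + ddx(i) * offset.x + ddy(i) * offset.y   (and likewise for j),
 * which is what the fmad sequence below computes per component.
 */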
3283 static LLVMValueRef barycentric_offset(struct ac_nir_context *ctx, unsigned mode,
3284 LLVMValueRef offset)
3285 {
3286 LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTER);
3287 LLVMValueRef src_c0 =
3288 ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_0, ""));
3289 LLVMValueRef src_c1 =
3290 ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, offset, ctx->ac.i32_1, ""));
3291
3292 LLVMValueRef ij_out[2];
3293 LLVMValueRef ddxy_out = ac_build_ddxy_interp(&ctx->ac, interp_param);
3294
3295 /*
3296 * take the I then J parameters, and the DDX/Y for it, and
3297 * calculate the IJ inputs for the interpolator.
3298 * temp1 = ddx * offset/sample.x + I;
3299 * interp_param.I = ddy * offset/sample.y + temp1;
3300 * temp1 = ddx * offset/sample.x + J;
3301 * interp_param.J = ddy * offset/sample.y + temp1;
3302 */
3303 for (unsigned i = 0; i < 2; i++) {
3304 LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false);
3305 LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false);
3306 LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, ix_ll, "");
3307 LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder, ddxy_out, iy_ll, "");
3308 LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ix_ll, "");
3309 LLVMValueRef temp1, temp2;
3310
3311 interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el, ctx->ac.f32, "");
3312
3313 temp1 = ac_build_fmad(&ctx->ac, ddx_el, src_c0, interp_el);
3314 temp2 = ac_build_fmad(&ctx->ac, ddy_el, src_c1, temp1);
3315
3316 ij_out[i] = LLVMBuildBitCast(ctx->ac.builder, temp2, ctx->ac.i32, "");
3317 }
3318 interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
3319 return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
3320 }
3321
3322 static LLVMValueRef barycentric_centroid(struct ac_nir_context *ctx, unsigned mode)
3323 {
3324 LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_CENTROID);
3325 return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
3326 }
3327
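/* load_sample_position is expected to return positions in [0, 1] relative
 * to the pixel corner, so subtracting 0.5 turns the sample position into an
 * offset from the pixel center that barycentric_offset can consume.
 */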
3328 static LLVMValueRef barycentric_at_sample(struct ac_nir_context *ctx, unsigned mode,
3329 LLVMValueRef sample_id)
3330 {
3331 if (ctx->abi->interp_at_sample_force_center)
3332 return barycentric_center(ctx, mode);
3333
3334 LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f);
3335
3336 /* fetch the position of the given sample ID */
3337 LLVMValueRef sample_pos = ctx->abi->load_sample_position(ctx->abi, sample_id);
3338
3339 LLVMValueRef src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_0, "");
3340 src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, "");
3341 LLVMValueRef src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_pos, ctx->ac.i32_1, "");
3342 src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, "");
3343 LLVMValueRef coords[] = {src_c0, src_c1};
3344 LLVMValueRef offset = ac_build_gather_values(&ctx->ac, coords, 2);
3345
3346 return barycentric_offset(ctx, mode, offset);
3347 }
3348
3349 static LLVMValueRef barycentric_sample(struct ac_nir_context *ctx, unsigned mode)
3350 {
3351 LLVMValueRef interp_param = lookup_interp_param(ctx, mode, INTERP_SAMPLE);
3352 return LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2i32, "");
3353 }
3354
3355 static LLVMValueRef barycentric_model(struct ac_nir_context *ctx)
3356 {
3357 return LLVMBuildBitCast(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->pull_model),
3358 ctx->ac.v3i32, "");
3359 }
3360
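/* Interpolate one FS input: split interp_param into the (i, j) barycentrics
 * and emit one fs_interp (or its f16 variant for 16-bit inputs) per
 * component, starting at comp_start within attribute 'index'.
 */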
3361 static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, LLVMValueRef interp_param,
3362 unsigned index, unsigned comp_start,
3363 unsigned num_components, unsigned bitsize)
3364 {
3365 LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, index, false);
3366 LLVMValueRef interp_param_f;
3367
3368 interp_param_f = LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, "");
3369 LLVMValueRef i = LLVMBuildExtractElement(ctx->ac.builder, interp_param_f, ctx->ac.i32_0, "");
3370 LLVMValueRef j = LLVMBuildExtractElement(ctx->ac.builder, interp_param_f, ctx->ac.i32_1, "");
3371
3372 /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */
3373 if (ctx->verified_interp && !_mesa_hash_table_search(ctx->verified_interp, interp_param)) {
3374 LLVMValueRef args[2];
3375 args[0] = i;
3376 args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false);
3377 LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1, args, 2,
3378 AC_FUNC_ATTR_READNONE);
3379 ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, ""));
3380 _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param);
3381 }
3382
3383 LLVMValueRef values[4];
3384 assert(bitsize == 16 || bitsize == 32);
3385 for (unsigned comp = 0; comp < num_components; comp++) {
3386 LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, comp_start + comp, false);
3387 if (bitsize == 16) {
3388 values[comp] = ac_build_fs_interp_f16(&ctx->ac, llvm_chan, attr_number,
3389 ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j);
3390 } else {
3391 values[comp] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number,
3392 ac_get_arg(&ctx->ac, ctx->args->prim_mask), i, j);
3393 }
3394 }
3395
3396 return ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, num_components));
3397 }
3398
3399 static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr *instr,
3400 bool is_output)
3401 {
3402 LLVMValueRef values[8];
3403 LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa);
3404 LLVMTypeRef component_type;
3405 unsigned base = nir_intrinsic_base(instr);
3406 unsigned component = nir_intrinsic_component(instr);
3407 unsigned count = instr->dest.ssa.num_components * (instr->dest.ssa.bit_size == 64 ? 2 : 1);
3408 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3409 LLVMValueRef vertex_index = vertex_index_src ? get_src(ctx, *vertex_index_src) : NULL;
3410 nir_src offset = *nir_get_io_offset_src(instr);
3411 LLVMValueRef indir_index = NULL;
3412
3413 if (LLVMGetTypeKind(dest_type) == LLVMVectorTypeKind)
3414 component_type = LLVMGetElementType(dest_type);
3415 else
3416 component_type = dest_type;
3417
3418 if (nir_src_is_const(offset))
3419 assert(nir_src_as_uint(offset) == 0);
3420 else
3421 indir_index = get_src(ctx, offset);
3422
3423 if (ctx->stage == MESA_SHADER_TESS_CTRL || (ctx->stage == MESA_SHADER_TESS_EVAL && !is_output)) {
3424 LLVMValueRef result = ctx->abi->load_tess_varyings(
3425 ctx->abi, component_type, vertex_index, indir_index, 0, 0, base * 4, component,
3426 instr->num_components, false, false, !is_output);
3427 if (instr->dest.ssa.bit_size == 16) {
3428 result = ac_to_integer(&ctx->ac, result);
3429 result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, "");
3430 }
3431 return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
3432 }
3433
3434 /* No indirect indexing is allowed after this point. */
3435 assert(!indir_index);
3436
3437 if (ctx->stage == MESA_SHADER_GEOMETRY) {
3438 LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
3439 assert(nir_src_is_const(*vertex_index_src));
3440
3441 return ctx->abi->load_inputs(ctx->abi, 0, base * 4, component, instr->num_components,
3442 nir_src_as_uint(*vertex_index_src), 0, type);
3443 }
3444
3445 if (ctx->stage == MESA_SHADER_FRAGMENT && is_output &&
3446 nir_intrinsic_io_semantics(instr).fb_fetch_output)
3447 return ctx->abi->emit_fbfetch(ctx->abi);
3448
3449 /* Other non-fragment cases have inputs and outputs in temporaries. */
3450 if (ctx->stage != MESA_SHADER_FRAGMENT) {
3451 for (unsigned chan = component; chan < count + component; chan++) {
3452 if (is_output) {
3453 values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->abi->outputs[base * 4 + chan], "");
3454 } else {
3455 values[chan] = ctx->abi->inputs[base * 4 + chan];
3456 if (!values[chan])
3457 values[chan] = LLVMGetUndef(ctx->ac.i32);
3458 }
3459 }
3460 LLVMValueRef result = ac_build_varying_gather_values(&ctx->ac, values, count, component);
3461 return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
3462 }
3463
3464 /* Fragment shader inputs. */
3465 unsigned vertex_id = 2; /* P0 */
3466
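/* For flat per-vertex loads, remap the NIR vertex index onto the parameter
 * slot selected by fs_interp_mov (presumably P0 = 2, P10 = 0, P20 = 1,
 * matching the mapping below); plain loads keep the default P0.
 */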
3467 if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
3468 nir_const_value *src0 = nir_src_as_const_value(instr->src[0]);
3469
3470 switch (src0[0].i32) {
3471 case 0:
3472 vertex_id = 2;
3473 break;
3474 case 1:
3475 vertex_id = 0;
3476 break;
3477 case 2:
3478 vertex_id = 1;
3479 break;
3480 default:
3481 unreachable("Invalid vertex index");
3482 }
3483 }
3484
3485 LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, base, false);
3486
3487 for (unsigned chan = 0; chan < count; chan++) {
3488 if (component + chan > 4)
3489 attr_number = LLVMConstInt(ctx->ac.i32, base + 1, false);
3490 LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, (component + chan) % 4, false);
3491 values[chan] =
3492 ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, vertex_id, false), llvm_chan,
3493 attr_number, ac_get_arg(&ctx->ac, ctx->args->prim_mask));
3494 values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i32, "");
3495 values[chan] =
3496 LLVMBuildTruncOrBitCast(ctx->ac.builder, values[chan],
3497 instr->dest.ssa.bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32, "");
3498 }
3499
3500 LLVMValueRef result = ac_build_gather_values(&ctx->ac, values, count);
3501 return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
3502 }
3503
3504 static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *instr)
3505 {
3506 LLVMValueRef result = NULL;
3507
3508 switch (instr->intrinsic) {
3509 case nir_intrinsic_ballot:
3510 result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0]));
3511 if (ctx->ac.ballot_mask_bits > ctx->ac.wave_size)
3512 result = LLVMBuildZExt(ctx->ac.builder, result, ctx->ac.iN_ballotmask, "");
3513 break;
3514 case nir_intrinsic_read_invocation:
3515 result =
3516 ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), get_src(ctx, instr->src[1]));
3517 break;
3518 case nir_intrinsic_read_first_invocation:
3519 result = ac_build_readlane(&ctx->ac, get_src(ctx, instr->src[0]), NULL);
3520 break;
3521 case nir_intrinsic_load_subgroup_invocation:
3522 result = ac_get_thread_id(&ctx->ac);
3523 break;
3524 case nir_intrinsic_load_work_group_id: {
3525 LLVMValueRef values[3];
3526
3527 for (int i = 0; i < 3; i++) {
3528 values[i] = ctx->args->workgroup_ids[i].used
3529 ? ac_get_arg(&ctx->ac, ctx->args->workgroup_ids[i])
3530 : ctx->ac.i32_0;
3531 }
3532
3533 result = ac_build_gather_values(&ctx->ac, values, 3);
3534 break;
3535 }
3536 case nir_intrinsic_load_base_vertex:
3537 case nir_intrinsic_load_first_vertex:
3538 result = ctx->abi->load_base_vertex(ctx->abi);
3539 break;
3540 case nir_intrinsic_load_local_group_size:
3541 result = ctx->abi->load_local_group_size(ctx->abi);
3542 break;
3543 case nir_intrinsic_load_vertex_id:
3544 result = LLVMBuildAdd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->vertex_id),
3545 ac_get_arg(&ctx->ac, ctx->args->base_vertex), "");
3546 break;
3547 case nir_intrinsic_load_vertex_id_zero_base: {
3548 result = ctx->abi->vertex_id;
3549 break;
3550 }
3551 case nir_intrinsic_load_local_invocation_id: {
3552 result = ac_get_arg(&ctx->ac, ctx->args->local_invocation_ids);
3553 break;
3554 }
3555 case nir_intrinsic_load_base_instance:
3556 result = ac_get_arg(&ctx->ac, ctx->args->start_instance);
3557 break;
3558 case nir_intrinsic_load_draw_id:
3559 result = ac_get_arg(&ctx->ac, ctx->args->draw_id);
3560 break;
3561 case nir_intrinsic_load_view_index:
3562 result = ac_get_arg(&ctx->ac, ctx->args->view_index);
3563 break;
3564 case nir_intrinsic_load_invocation_id:
3565 if (ctx->stage == MESA_SHADER_TESS_CTRL) {
3566 result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->tcs_rel_ids), 8, 5);
3567 } else {
3568 if (ctx->ac.chip_class >= GFX10) {
3569 result =
3570 LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id),
3571 LLVMConstInt(ctx->ac.i32, 127, 0), "");
3572 } else {
3573 result = ac_get_arg(&ctx->ac, ctx->args->gs_invocation_id);
3574 }
3575 }
3576 break;
3577 case nir_intrinsic_load_primitive_id:
3578 if (ctx->stage == MESA_SHADER_GEOMETRY) {
3579 result = ac_get_arg(&ctx->ac, ctx->args->gs_prim_id);
3580 } else if (ctx->stage == MESA_SHADER_TESS_CTRL) {
3581 result = ac_get_arg(&ctx->ac, ctx->args->tcs_patch_id);
3582 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
3583 result = ac_get_arg(&ctx->ac, ctx->args->tes_patch_id);
3584 } else
3585 fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage);
3586 break;
3587 case nir_intrinsic_load_sample_id:
3588 result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ancillary), 8, 4);
3589 break;
3590 case nir_intrinsic_load_sample_pos:
3591 result = load_sample_pos(ctx);
3592 break;
3593 case nir_intrinsic_load_sample_mask_in:
3594 result = ctx->abi->load_sample_mask_in(ctx->abi);
3595 break;
3596 case nir_intrinsic_load_frag_coord: {
3597 LLVMValueRef values[4] = {
3598 ac_get_arg(&ctx->ac, ctx->args->frag_pos[0]), ac_get_arg(&ctx->ac, ctx->args->frag_pos[1]),
3599 ac_get_arg(&ctx->ac, ctx->args->frag_pos[2]),
3600 ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ac_get_arg(&ctx->ac, ctx->args->frag_pos[3]))};
3601 result = ac_to_integer(&ctx->ac, ac_build_gather_values(&ctx->ac, values, 4));
3602 break;
3603 }
3604 case nir_intrinsic_load_layer_id:
3605 result = ctx->abi->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
3606 break;
3607 case nir_intrinsic_load_front_face:
3608 result = ac_get_arg(&ctx->ac, ctx->args->front_face);
3609 break;
3610 case nir_intrinsic_load_helper_invocation:
3611 result = ac_build_load_helper_invocation(&ctx->ac);
3612 break;
3613 case nir_intrinsic_is_helper_invocation:
3614 result = ac_build_is_helper_invocation(&ctx->ac);
3615 break;
3616 case nir_intrinsic_load_color0:
3617 result = ctx->abi->color0;
3618 break;
3619 case nir_intrinsic_load_color1:
3620 result = ctx->abi->color1;
3621 break;
3622 case nir_intrinsic_load_user_data_amd:
3623 assert(LLVMTypeOf(ctx->abi->user_data) == ctx->ac.v4i32);
3624 result = ctx->abi->user_data;
3625 break;
3626 case nir_intrinsic_load_instance_id:
3627 result = ctx->abi->instance_id;
3628 break;
3629 case nir_intrinsic_load_num_work_groups:
3630 result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups);
3631 break;
3632 case nir_intrinsic_load_local_invocation_index:
3633 result = visit_load_local_invocation_index(ctx);
3634 break;
3635 case nir_intrinsic_load_subgroup_id:
3636 result = visit_load_subgroup_id(ctx);
3637 break;
3638 case nir_intrinsic_load_num_subgroups:
3639 result = visit_load_num_subgroups(ctx);
3640 break;
3641 case nir_intrinsic_first_invocation:
3642 result = visit_first_invocation(ctx);
3643 break;
3644 case nir_intrinsic_load_push_constant:
3645 result = visit_load_push_constant(ctx, instr);
3646 break;
3647 case nir_intrinsic_vulkan_resource_index: {
3648 LLVMValueRef index = get_src(ctx, instr->src[0]);
3649 unsigned desc_set = nir_intrinsic_desc_set(instr);
3650 unsigned binding = nir_intrinsic_binding(instr);
3651
3652 result = ctx->abi->load_resource(ctx->abi, index, desc_set, binding);
3653 break;
3654 }
3655 case nir_intrinsic_vulkan_resource_reindex:
3656 result = visit_vulkan_resource_reindex(ctx, instr);
3657 break;
3658 case nir_intrinsic_store_ssbo:
3659 visit_store_ssbo(ctx, instr);
3660 break;
3661 case nir_intrinsic_load_ssbo:
3662 result = visit_load_buffer(ctx, instr);
3663 break;
3664 case nir_intrinsic_ssbo_atomic_add:
3665 case nir_intrinsic_ssbo_atomic_imin:
3666 case nir_intrinsic_ssbo_atomic_umin:
3667 case nir_intrinsic_ssbo_atomic_imax:
3668 case nir_intrinsic_ssbo_atomic_umax:
3669 case nir_intrinsic_ssbo_atomic_and:
3670 case nir_intrinsic_ssbo_atomic_or:
3671 case nir_intrinsic_ssbo_atomic_xor:
3672 case nir_intrinsic_ssbo_atomic_exchange:
3673 case nir_intrinsic_ssbo_atomic_comp_swap:
3674 result = visit_atomic_ssbo(ctx, instr);
3675 break;
3676 case nir_intrinsic_load_ubo:
3677 result = visit_load_ubo_buffer(ctx, instr);
3678 break;
3679 case nir_intrinsic_get_buffer_size:
3680 result = visit_get_buffer_size(ctx, instr);
3681 break;
3682 case nir_intrinsic_load_deref:
3683 result = visit_load_var(ctx, instr);
3684 break;
3685 case nir_intrinsic_store_deref:
3686 visit_store_var(ctx, instr);
3687 break;
3688 case nir_intrinsic_load_input:
3689 case nir_intrinsic_load_input_vertex:
3690 case nir_intrinsic_load_per_vertex_input:
3691 result = visit_load(ctx, instr, false);
3692 break;
3693 case nir_intrinsic_load_output:
3694 case nir_intrinsic_load_per_vertex_output:
3695 result = visit_load(ctx, instr, true);
3696 break;
3697 case nir_intrinsic_store_output:
3698 case nir_intrinsic_store_per_vertex_output:
3699 visit_store_output(ctx, instr);
3700 break;
3701 case nir_intrinsic_load_shared:
3702 result = visit_load_shared(ctx, instr);
3703 break;
3704 case nir_intrinsic_store_shared:
3705 visit_store_shared(ctx, instr);
3706 break;
3707 case nir_intrinsic_bindless_image_samples:
3708 case nir_intrinsic_image_deref_samples:
3709 result = visit_image_samples(ctx, instr);
3710 break;
3711 case nir_intrinsic_bindless_image_load:
3712 result = visit_image_load(ctx, instr, true);
3713 break;
3714 case nir_intrinsic_image_deref_load:
3715 result = visit_image_load(ctx, instr, false);
3716 break;
3717 case nir_intrinsic_bindless_image_store:
3718 visit_image_store(ctx, instr, true);
3719 break;
3720 case nir_intrinsic_image_deref_store:
3721 visit_image_store(ctx, instr, false);
3722 break;
3723 case nir_intrinsic_bindless_image_atomic_add:
3724 case nir_intrinsic_bindless_image_atomic_imin:
3725 case nir_intrinsic_bindless_image_atomic_umin:
3726 case nir_intrinsic_bindless_image_atomic_imax:
3727 case nir_intrinsic_bindless_image_atomic_umax:
3728 case nir_intrinsic_bindless_image_atomic_and:
3729 case nir_intrinsic_bindless_image_atomic_or:
3730 case nir_intrinsic_bindless_image_atomic_xor:
3731 case nir_intrinsic_bindless_image_atomic_exchange:
3732 case nir_intrinsic_bindless_image_atomic_comp_swap:
3733 case nir_intrinsic_bindless_image_atomic_inc_wrap:
3734 case nir_intrinsic_bindless_image_atomic_dec_wrap:
3735 result = visit_image_atomic(ctx, instr, true);
3736 break;
3737 case nir_intrinsic_image_deref_atomic_add:
3738 case nir_intrinsic_image_deref_atomic_imin:
3739 case nir_intrinsic_image_deref_atomic_umin:
3740 case nir_intrinsic_image_deref_atomic_imax:
3741 case nir_intrinsic_image_deref_atomic_umax:
3742 case nir_intrinsic_image_deref_atomic_and:
3743 case nir_intrinsic_image_deref_atomic_or:
3744 case nir_intrinsic_image_deref_atomic_xor:
3745 case nir_intrinsic_image_deref_atomic_exchange:
3746 case nir_intrinsic_image_deref_atomic_comp_swap:
3747 case nir_intrinsic_image_deref_atomic_inc_wrap:
3748 case nir_intrinsic_image_deref_atomic_dec_wrap:
3749 result = visit_image_atomic(ctx, instr, false);
3750 break;
3751 case nir_intrinsic_bindless_image_size:
3752 result = visit_image_size(ctx, instr, true);
3753 break;
3754 case nir_intrinsic_image_deref_size:
3755 result = visit_image_size(ctx, instr, false);
3756 break;
3757 case nir_intrinsic_shader_clock:
3758 result = ac_build_shader_clock(&ctx->ac, nir_intrinsic_memory_scope(instr));
3759 break;
3760 case nir_intrinsic_discard:
3761 case nir_intrinsic_discard_if:
3762 emit_discard(ctx, instr);
3763 break;
3764 case nir_intrinsic_demote:
3765 case nir_intrinsic_demote_if:
3766 emit_demote(ctx, instr);
3767 break;
3768 case nir_intrinsic_memory_barrier:
3769 case nir_intrinsic_group_memory_barrier:
3770 case nir_intrinsic_memory_barrier_buffer:
3771 case nir_intrinsic_memory_barrier_image:
3772 case nir_intrinsic_memory_barrier_shared:
3773 emit_membar(&ctx->ac, instr);
3774 break;
3775 case nir_intrinsic_scoped_barrier: {
3776 assert(!(nir_intrinsic_memory_semantics(instr) &
3777 (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
3778
3779 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
3780
3781 unsigned wait_flags = 0;
3782 if (modes & (nir_var_mem_global | nir_var_mem_ssbo))
3783 wait_flags |= AC_WAIT_VLOAD | AC_WAIT_VSTORE;
3784 if (modes & nir_var_mem_shared)
3785 wait_flags |= AC_WAIT_LGKM;
3786
3787 if (wait_flags)
3788 ac_build_waitcnt(&ctx->ac, wait_flags);
3789
3790 if (nir_intrinsic_execution_scope(instr) == NIR_SCOPE_WORKGROUP)
3791 ac_emit_barrier(&ctx->ac, ctx->stage);
3792 break;
3793 }
3794 case nir_intrinsic_memory_barrier_tcs_patch:
3795 break;
3796 case nir_intrinsic_control_barrier:
3797 ac_emit_barrier(&ctx->ac, ctx->stage);
3798 break;
3799 case nir_intrinsic_shared_atomic_add:
3800 case nir_intrinsic_shared_atomic_imin:
3801 case nir_intrinsic_shared_atomic_umin:
3802 case nir_intrinsic_shared_atomic_imax:
3803 case nir_intrinsic_shared_atomic_umax:
3804 case nir_intrinsic_shared_atomic_and:
3805 case nir_intrinsic_shared_atomic_or:
3806 case nir_intrinsic_shared_atomic_xor:
3807 case nir_intrinsic_shared_atomic_exchange:
3808 case nir_intrinsic_shared_atomic_comp_swap:
3809 case nir_intrinsic_shared_atomic_fadd: {
3810 LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->src[1].ssa->bit_size);
3811 result = visit_var_atomic(ctx, instr, ptr, 1);
3812 break;
3813 }
3814 case nir_intrinsic_deref_atomic_add:
3815 case nir_intrinsic_deref_atomic_imin:
3816 case nir_intrinsic_deref_atomic_umin:
3817 case nir_intrinsic_deref_atomic_imax:
3818 case nir_intrinsic_deref_atomic_umax:
3819 case nir_intrinsic_deref_atomic_and:
3820 case nir_intrinsic_deref_atomic_or:
3821 case nir_intrinsic_deref_atomic_xor:
3822 case nir_intrinsic_deref_atomic_exchange:
3823 case nir_intrinsic_deref_atomic_comp_swap:
3824 case nir_intrinsic_deref_atomic_fadd: {
3825 LLVMValueRef ptr = get_src(ctx, instr->src[0]);
3826 result = visit_var_atomic(ctx, instr, ptr, 1);
3827 break;
3828 }
3829 case nir_intrinsic_load_barycentric_pixel:
3830 result = barycentric_center(ctx, nir_intrinsic_interp_mode(instr));
3831 break;
3832 case nir_intrinsic_load_barycentric_centroid:
3833 result = barycentric_centroid(ctx, nir_intrinsic_interp_mode(instr));
3834 break;
3835 case nir_intrinsic_load_barycentric_sample:
3836 result = barycentric_sample(ctx, nir_intrinsic_interp_mode(instr));
3837 break;
3838 case nir_intrinsic_load_barycentric_model:
3839 result = barycentric_model(ctx);
3840 break;
3841 case nir_intrinsic_load_barycentric_at_offset: {
3842 LLVMValueRef offset = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
3843 result = barycentric_offset(ctx, nir_intrinsic_interp_mode(instr), offset);
3844 break;
3845 }
3846 case nir_intrinsic_load_barycentric_at_sample: {
3847 LLVMValueRef sample_id = get_src(ctx, instr->src[0]);
3848 result = barycentric_at_sample(ctx, nir_intrinsic_interp_mode(instr), sample_id);
3849 break;
3850 }
3851 case nir_intrinsic_load_interpolated_input: {
3852 /* We assume any indirect loads have been lowered away */
3853 ASSERTED nir_const_value *offset = nir_src_as_const_value(instr->src[1]);
3854 assert(offset);
3855 assert(offset[0].i32 == 0);
3856
3857 LLVMValueRef interp_param = get_src(ctx, instr->src[0]);
3858 unsigned index = nir_intrinsic_base(instr);
3859 unsigned component = nir_intrinsic_component(instr);
3860 result = load_interpolated_input(ctx, interp_param, index, component,
3861 instr->dest.ssa.num_components, instr->dest.ssa.bit_size);
3862 break;
3863 }
3864 case nir_intrinsic_emit_vertex:
3865 ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->abi->outputs);
3866 break;
3867 case nir_intrinsic_emit_vertex_with_counter: {
3868 unsigned stream = nir_intrinsic_stream_id(instr);
3869 LLVMValueRef next_vertex = get_src(ctx, instr->src[0]);
3870 ctx->abi->emit_vertex_with_counter(ctx->abi, stream, next_vertex, ctx->abi->outputs);
3871 break;
3872 }
3873 case nir_intrinsic_end_primitive:
3874 case nir_intrinsic_end_primitive_with_counter:
3875 ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr));
3876 break;
3877 case nir_intrinsic_load_tess_coord:
3878 result = ctx->abi->load_tess_coord(ctx->abi);
3879 break;
3880 case nir_intrinsic_load_tess_level_outer:
3881 result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false);
3882 break;
3883 case nir_intrinsic_load_tess_level_inner:
3884 result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, false);
3885 break;
3886 case nir_intrinsic_load_tess_level_outer_default:
3887 result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, true);
3888 break;
3889 case nir_intrinsic_load_tess_level_inner_default:
3890 result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER, true);
3891 break;
3892 case nir_intrinsic_load_patch_vertices_in:
3893 result = ctx->abi->load_patch_vertices_in(ctx->abi);
3894 break;
3895 case nir_intrinsic_vote_all: {
3896 LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0]));
3897 result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
3898 break;
3899 }
3900 case nir_intrinsic_vote_any: {
3901 LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0]));
3902 result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
3903 break;
3904 }
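/* Arbitrary shuffles: GFX8/GFX9 and wave32 GFX10+ can use ac_build_shuffle
 * (backed by ds_bpermute); otherwise fall back to a waterfall loop that
 * makes the index uniform and reads one lane per iteration.
 */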
3905 case nir_intrinsic_shuffle:
3906 if (ctx->ac.chip_class == GFX8 || ctx->ac.chip_class == GFX9 ||
3907 (ctx->ac.chip_class >= GFX10 && ctx->ac.wave_size == 32)) {
3908 result =
3909 ac_build_shuffle(&ctx->ac, get_src(ctx, instr->src[0]), get_src(ctx, instr->src[1]));
3910 } else {
3911 LLVMValueRef src = get_src(ctx, instr->src[0]);
3912 LLVMValueRef index = get_src(ctx, instr->src[1]);
3913 LLVMTypeRef type = LLVMTypeOf(src);
3914 struct waterfall_context wctx;
3915 LLVMValueRef index_val;
3916
3917 index_val = enter_waterfall(ctx, &wctx, index, true);
3918
3919 src = LLVMBuildZExt(ctx->ac.builder, src, ctx->ac.i32, "");
3920
3921 result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.readlane", ctx->ac.i32,
3922 (LLVMValueRef[]){src, index_val}, 2,
3923 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3924
3925 result = LLVMBuildTrunc(ctx->ac.builder, result, type, "");
3926
3927 result = exit_waterfall(ctx, &wctx, result);
3928 }
3929 break;
3930 case nir_intrinsic_reduce:
3931 result = ac_build_reduce(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0],
3932 instr->const_index[1]);
3933 break;
3934 case nir_intrinsic_inclusive_scan:
3935 result =
3936 ac_build_inclusive_scan(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0]);
3937 break;
3938 case nir_intrinsic_exclusive_scan:
3939 result =
3940 ac_build_exclusive_scan(&ctx->ac, get_src(ctx, instr->src[0]), instr->const_index[0]);
3941 break;
3942 case nir_intrinsic_quad_broadcast: {
3943 unsigned lane = nir_src_as_uint(instr->src[1]);
3944 result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), lane, lane, lane, lane);
3945 break;
3946 }
3947 case nir_intrinsic_quad_swap_horizontal:
3948 result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 1, 0, 3, 2);
3949 break;
3950 case nir_intrinsic_quad_swap_vertical:
3951 result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 2, 3, 0, 1);
3952 break;
3953 case nir_intrinsic_quad_swap_diagonal:
3954 result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), 3, 2, 1, 0);
3955 break;
3956 case nir_intrinsic_quad_swizzle_amd: {
3957 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
3958 result = ac_build_quad_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask & 0x3,
3959 (mask >> 2) & 0x3, (mask >> 4) & 0x3, (mask >> 6) & 0x3);
3960 break;
3961 }
3962 case nir_intrinsic_masked_swizzle_amd: {
3963 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
3964 result = ac_build_ds_swizzle(&ctx->ac, get_src(ctx, instr->src[0]), mask);
3965 break;
3966 }
3967 case nir_intrinsic_write_invocation_amd:
3968 result = ac_build_writelane(&ctx->ac, get_src(ctx, instr->src[0]),
3969 get_src(ctx, instr->src[1]), get_src(ctx, instr->src[2]));
3970 break;
3971 case nir_intrinsic_mbcnt_amd:
3972 result = ac_build_mbcnt(&ctx->ac, get_src(ctx, instr->src[0]));
3973 break;
3974 case nir_intrinsic_load_scratch: {
3975 LLVMValueRef offset = get_src(ctx, instr->src[0]);
3976 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, offset);
3977 LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
3978 LLVMTypeRef vec_type = instr->dest.ssa.num_components == 1
3979 ? comp_type
3980 : LLVMVectorType(comp_type, instr->dest.ssa.num_components);
3981 unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3982 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(vec_type, addr_space), "");
3983 result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
3984 break;
3985 }
3986 case nir_intrinsic_store_scratch: {
3987 LLVMValueRef offset = get_src(ctx, instr->src[1]);
3988 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->scratch, offset);
3989 LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->src[0].ssa->bit_size);
3990 unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3991 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(comp_type, addr_space), "");
3992 LLVMValueRef src = get_src(ctx, instr->src[0]);
3993 unsigned wrmask = nir_intrinsic_write_mask(instr);
3994 while (wrmask) {
3995 int start, count;
3996 u_bit_scan_consecutive_range(&wrmask, &start, &count);
3997
3998 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, start, false);
3999 LLVMValueRef offset_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &offset, 1, "");
4000 LLVMTypeRef vec_type = count == 1 ? comp_type : LLVMVectorType(comp_type, count);
4001 offset_ptr = LLVMBuildBitCast(ctx->ac.builder, offset_ptr,
4002 LLVMPointerType(vec_type, addr_space), "");
4003 LLVMValueRef offset_src = ac_extract_components(&ctx->ac, src, start, count);
4004 LLVMBuildStore(ctx->ac.builder, offset_src, offset_ptr);
4005 }
4006 break;
4007 }
4008 case nir_intrinsic_load_constant: {
4009 unsigned base = nir_intrinsic_base(instr);
4010 unsigned range = nir_intrinsic_range(instr);
4011
4012 LLVMValueRef offset = get_src(ctx, instr->src[0]);
4013 offset = LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, base, false), "");
4014
4015 /* Clamp the offset to avoid out-of-bounds accesses, because global
4016 * instructions can't handle them.
4017 */
4018 LLVMValueRef size = LLVMConstInt(ctx->ac.i32, base + range, false);
4019 LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, offset, size, "");
4020 offset = LLVMBuildSelect(ctx->ac.builder, cond, offset, size, "");
4021
4022 LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->constant_data, offset);
4023 LLVMTypeRef comp_type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
4024 LLVMTypeRef vec_type = instr->dest.ssa.num_components == 1
4025 ? comp_type
4026 : LLVMVectorType(comp_type, instr->dest.ssa.num_components);
4027 unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
4028 ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(vec_type, addr_space), "");
4029 result = LLVMBuildLoad(ctx->ac.builder, ptr, "");
4030 break;
4031 }
4032 default:
4033 fprintf(stderr, "Unknown intrinsic: ");
4034 nir_print_instr(&instr->instr, stderr);
4035 fprintf(stderr, "\n");
4036 break;
4037 }
4038 if (result) {
4039 ctx->ssa_defs[instr->dest.ssa.index] = result;
4040 }
4041 }
4042
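/* Fetch a bindless handle stored behind a uniform. The handles are assumed
 * to live in UBO 0, with driver_location giving a dword offset and each
 * array element occupying 8 bytes (64-bit handles), which is why the index
 * is scaled by 8 below.
 */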
4043 static LLVMValueRef get_bindless_index_from_uniform(struct ac_nir_context *ctx, unsigned base_index,
4044 unsigned constant_index,
4045 LLVMValueRef dynamic_index)
4046 {
4047 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, base_index * 4, 0);
4048 LLVMValueRef index = LLVMBuildAdd(ctx->ac.builder, dynamic_index,
4049 LLVMConstInt(ctx->ac.i32, constant_index, 0), "");
4050
4051 /* Bindless uniforms are 64 bit, so multiply the index by 8 */
4052 index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, 8, 0), "");
4053 offset = LLVMBuildAdd(ctx->ac.builder, offset, index, "");
4054
4055 LLVMValueRef ubo_index = ctx->abi->load_ubo(ctx->abi, ctx->ac.i32_0);
4056
4057 LLVMValueRef ret =
4058 ac_build_buffer_load(&ctx->ac, ubo_index, 1, NULL, offset, NULL, 0, 0, true, true);
4059
4060 return LLVMBuildBitCast(ctx->ac.builder, ret, ctx->ac.i32, "");
4061 }
4062
4063 struct sampler_desc_address {
4064 unsigned descriptor_set;
4065 unsigned base_index; /* binding in vulkan */
4066 unsigned constant_index;
4067 LLVMValueRef dynamic_index;
4068 bool image;
4069 bool bindless;
4070 };
4071
4072 static struct sampler_desc_address get_sampler_desc_internal(struct ac_nir_context *ctx,
4073 nir_deref_instr *deref_instr,
4074 const nir_instr *instr, bool image)
4075 {
4076 LLVMValueRef index = NULL;
4077 unsigned constant_index = 0;
4078 unsigned descriptor_set;
4079 unsigned base_index;
4080 bool bindless = false;
4081
4082 if (!deref_instr) {
4083 descriptor_set = 0;
4084 if (image) {
4085 nir_intrinsic_instr *img_instr = nir_instr_as_intrinsic(instr);
4086 base_index = 0;
4087 bindless = true;
4088 index = get_src(ctx, img_instr->src[0]);
4089 } else {
4090 nir_tex_instr *tex_instr = nir_instr_as_tex(instr);
4091 int sampSrcIdx = nir_tex_instr_src_index(tex_instr, nir_tex_src_sampler_handle);
4092 if (sampSrcIdx != -1) {
4093 base_index = 0;
4094 bindless = true;
4095 index = get_src(ctx, tex_instr->src[sampSrcIdx].src);
4096 } else {
4097 assert(tex_instr && !image);
4098 base_index = tex_instr->sampler_index;
4099 }
4100 }
4101 } else {
4102 while (deref_instr->deref_type != nir_deref_type_var) {
4103 if (deref_instr->deref_type == nir_deref_type_array) {
4104 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
4105 if (!array_size)
4106 array_size = 1;
4107
4108 if (nir_src_is_const(deref_instr->arr.index)) {
4109 constant_index += array_size * nir_src_as_uint(deref_instr->arr.index);
4110 } else {
4111 LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index);
4112
4113 indirect = LLVMBuildMul(ctx->ac.builder, indirect,
4114 LLVMConstInt(ctx->ac.i32, array_size, false), "");
4115
4116 if (!index)
4117 index = indirect;
4118 else
4119 index = LLVMBuildAdd(ctx->ac.builder, index, indirect, "");
4120 }
4121
4122 deref_instr = nir_src_as_deref(deref_instr->parent);
4123 } else if (deref_instr->deref_type == nir_deref_type_struct) {
4124 unsigned sidx = deref_instr->strct.index;
4125 deref_instr = nir_src_as_deref(deref_instr->parent);
4126 constant_index += glsl_get_struct_location_offset(deref_instr->type, sidx);
4127 } else {
4128 unreachable("Unsupported deref type");
4129 }
4130 }
4131 descriptor_set = deref_instr->var->data.descriptor_set;
4132
4133 if (deref_instr->var->data.bindless) {
4134 /* For now just assert on unhandled variable types */
4135 assert(deref_instr->var->data.mode == nir_var_uniform);
4136
4137 base_index = deref_instr->var->data.driver_location;
4138 bindless = true;
4139
4140 index = index ? index : ctx->ac.i32_0;
4141 index = get_bindless_index_from_uniform(ctx, base_index, constant_index, index);
4142 } else
4143 base_index = deref_instr->var->data.binding;
4144 }
4145 return (struct sampler_desc_address){
4146 .descriptor_set = descriptor_set,
4147 .base_index = base_index,
4148 .constant_index = constant_index,
4149 .dynamic_index = index,
4150 .image = image,
4151 .bindless = bindless,
4152 };
4153 }
4154
4155 /* Extract any possibly divergent index into a separate value that can be fed
4156 * into get_sampler_desc with the same arguments. */
4157 static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
4158 const nir_instr *instr, bool image)
4159 {
4160 struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
4161 return addr.dynamic_index;
4162 }
4163
4164 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx, nir_deref_instr *deref_instr,
4165 enum ac_descriptor_type desc_type, const nir_instr *instr,
4166 LLVMValueRef index, bool image, bool write)
4167 {
4168 struct sampler_desc_address addr = get_sampler_desc_internal(ctx, deref_instr, instr, image);
4169 return ctx->abi->load_sampler_desc(ctx->abi, addr.descriptor_set, addr.base_index,
4170 addr.constant_index, index, desc_type, addr.image, write,
4171 addr.bindless);
4172 }
4173
4174 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4175 *
4176 * GFX6-GFX7:
4177 * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4178 * filtering manually. The driver sets img7 to a mask clearing
4179 * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4180 * s_and_b32 samp0, samp0, img7
4181 *
4182 * GFX8:
4183 * The ANISO_OVERRIDE sampler field enables this fix in TA.
4184 */
4185 static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx, LLVMValueRef res,
4186 LLVMValueRef samp)
4187 {
4188 LLVMBuilderRef builder = ctx->ac.builder;
4189 LLVMValueRef img7, samp0;
4190
4191 if (ctx->ac.chip_class >= GFX8)
4192 return samp;
4193
4194 img7 = LLVMBuildExtractElement(builder, res, LLVMConstInt(ctx->ac.i32, 7, 0), "");
4195 samp0 = LLVMBuildExtractElement(builder, samp, LLVMConstInt(ctx->ac.i32, 0, 0), "");
4196 samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4197 return LLVMBuildInsertElement(builder, samp, samp0, LLVMConstInt(ctx->ac.i32, 0, 0), "");
4198 }
4199
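/* Gather the texture/sampler derefs from the instruction sources, enter
 * waterfall loops for non-uniform indices, and load the resource, sampler
 * and (for txf_ms/samples_identical) FMASK descriptors.
 */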
4200 static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr,
4201 struct waterfall_context *wctx, LLVMValueRef *res_ptr,
4202 LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr)
4203 {
4204 nir_deref_instr *texture_deref_instr = NULL;
4205 nir_deref_instr *sampler_deref_instr = NULL;
4206 int plane = -1;
4207
4208 for (unsigned i = 0; i < instr->num_srcs; i++) {
4209 switch (instr->src[i].src_type) {
4210 case nir_tex_src_texture_deref:
4211 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
4212 break;
4213 case nir_tex_src_sampler_deref:
4214 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
4215 break;
4216 case nir_tex_src_plane:
4217 plane = nir_src_as_int(instr->src[i].src);
4218 break;
4219 default:
4220 break;
4221 }
4222 }
4223
4224 LLVMValueRef texture_dynamic_index =
4225 get_sampler_desc_index(ctx, texture_deref_instr, &instr->instr, false);
4226 if (!sampler_deref_instr)
4227 sampler_deref_instr = texture_deref_instr;
4228
4229 LLVMValueRef sampler_dynamic_index =
4230 get_sampler_desc_index(ctx, sampler_deref_instr, &instr->instr, false);
4231 if (instr->texture_non_uniform)
4232 texture_dynamic_index = enter_waterfall(ctx, wctx + 0, texture_dynamic_index, true);
4233
4234 if (instr->sampler_non_uniform)
4235 sampler_dynamic_index = enter_waterfall(ctx, wctx + 1, sampler_dynamic_index, true);
4236
4237 enum ac_descriptor_type main_descriptor =
4238 instr->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE;
4239
4240 if (plane >= 0) {
4241 assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);
4242 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
4243
4244 main_descriptor = AC_DESC_PLANE_0 + plane;
4245 }
4246
4247 if (instr->op == nir_texop_fragment_mask_fetch) {
4248 /* The fragment mask is fetched from the compressed
4249 * multisampled surface.
4250 */
4251 main_descriptor = AC_DESC_FMASK;
4252 }
4253
4254 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, main_descriptor, &instr->instr,
4255 texture_dynamic_index, false, false);
4256
4257 if (samp_ptr) {
4258 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, &instr->instr,
4259 sampler_dynamic_index, false, false);
4260 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
4261 *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4262 }
4263 if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical))
4264 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, &instr->instr,
4265 texture_dynamic_index, false, false);
4266 }
4267
4268 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx, LLVMValueRef coord)
4269 {
4270 coord = ac_to_float(ctx, coord);
4271 coord = ac_build_round(ctx, coord);
4272 coord = ac_to_integer(ctx, coord);
4273 return coord;
4274 }
4275
4276 static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
4277 {
4278 LLVMValueRef result = NULL;
4279 struct ac_image_args args = {0};
4280 LLVMValueRef fmask_ptr = NULL, sample_index = NULL;
4281 LLVMValueRef ddx = NULL, ddy = NULL;
4282 unsigned offset_src = 0;
4283 struct waterfall_context wctx[2] = {{{0}}};
4284
4285 tex_fetch_ptrs(ctx, instr, wctx, &args.resource, &args.sampler, &fmask_ptr);
4286
4287 for (unsigned i = 0; i < instr->num_srcs; i++) {
4288 switch (instr->src[i].src_type) {
4289 case nir_tex_src_coord: {
4290 LLVMValueRef coord = get_src(ctx, instr->src[i].src);
4291 for (unsigned chan = 0; chan < instr->coord_components; ++chan)
4292 args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan);
4293 break;
4294 }
4295 case nir_tex_src_projector:
4296 break;
4297 case nir_tex_src_comparator:
4298 if (instr->is_shadow) {
4299 args.compare = get_src(ctx, instr->src[i].src);
4300 args.compare = ac_to_float(&ctx->ac, args.compare);
4301 }
4302 break;
4303 case nir_tex_src_offset:
4304 args.offset = get_src(ctx, instr->src[i].src);
4305 offset_src = i;
4306 break;
4307 case nir_tex_src_bias:
4308 args.bias = get_src(ctx, instr->src[i].src);
4309 break;
4310 case nir_tex_src_lod: {
4311 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0)
4312 args.level_zero = true;
4313 else
4314 args.lod = get_src(ctx, instr->src[i].src);
4315 break;
4316 }
4317 case nir_tex_src_ms_index:
4318 sample_index = get_src(ctx, instr->src[i].src);
4319 break;
4320 case nir_tex_src_ms_mcs:
4321 break;
4322 case nir_tex_src_ddx:
4323 ddx = get_src(ctx, instr->src[i].src);
4324 break;
4325 case nir_tex_src_ddy:
4326 ddy = get_src(ctx, instr->src[i].src);
4327 break;
4328 case nir_tex_src_min_lod:
4329 args.min_lod = get_src(ctx, instr->src[i].src);
4330 break;
4331 case nir_tex_src_texture_offset:
4332 case nir_tex_src_sampler_offset:
4333 case nir_tex_src_plane:
4334 default:
4335 break;
4336 }
4337 }
4338
4339 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
4340 result = get_buffer_size(ctx, args.resource, true);
4341 goto write_result;
4342 }
4343
4344 if (instr->op == nir_texop_texture_samples) {
4345 LLVMValueRef res, samples, is_msaa;
4346 LLVMValueRef default_sample;
4347
4348 res = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, "");
4349 samples =
4350 LLVMBuildExtractElement(ctx->ac.builder, res, LLVMConstInt(ctx->ac.i32, 3, false), "");
4351 is_msaa = LLVMBuildLShr(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 28, false), "");
4352 is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa, LLVMConstInt(ctx->ac.i32, 0xe, false), "");
4353 is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa,
4354 LLVMConstInt(ctx->ac.i32, 0xe, false), "");
4355
4356 samples = LLVMBuildLShr(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 16, false), "");
4357 samples = LLVMBuildAnd(ctx->ac.builder, samples, LLVMConstInt(ctx->ac.i32, 0xf, false), "");
4358 samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, samples, "");
4359
4360 if (ctx->abi->robust_buffer_access) {
4361 LLVMValueRef dword1, is_null_descriptor;
4362
4363 /* Extract the second dword of the descriptor; if it's
4364 * all zero, then it's a null descriptor.
4365 */
4366 dword1 =
4367 LLVMBuildExtractElement(ctx->ac.builder, res, LLVMConstInt(ctx->ac.i32, 1, false), "");
4368 is_null_descriptor = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1,
4369 LLVMConstInt(ctx->ac.i32, 0, false), "");
4370 default_sample =
4371 LLVMBuildSelect(ctx->ac.builder, is_null_descriptor, ctx->ac.i32_0, ctx->ac.i32_1, "");
4372 } else {
4373 default_sample = ctx->ac.i32_1;
4374 }
4375
4376 samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples, default_sample, "");
4377 result = samples;
4378 goto write_result;
4379 }
4380
4381 if (args.offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
4382 LLVMValueRef offset[3], pack;
4383 for (unsigned chan = 0; chan < 3; ++chan)
4384 offset[chan] = ctx->ac.i32_0;
4385
4386 unsigned num_components = ac_get_llvm_num_components(args.offset);
4387 for (unsigned chan = 0; chan < num_components; chan++) {
4388 offset[chan] = ac_llvm_extract_elem(&ctx->ac, args.offset, chan);
4389 offset[chan] =
4390 LLVMBuildAnd(ctx->ac.builder, offset[chan], LLVMConstInt(ctx->ac.i32, 0x3f, false), "");
4391 if (chan)
4392 offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
4393 LLVMConstInt(ctx->ac.i32, chan * 8, false), "");
4394 }
4395 pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
4396 pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
4397 args.offset = pack;
4398 }
4399
4400 /* Section 8.23.1 (Depth Texture Comparison Mode) of the
4401 * OpenGL 4.5 spec says:
4402 *
4403 * "If the texture’s internal format indicates a fixed-point
4404 * depth texture, then D_t and D_ref are clamped to the
4405 * range [0, 1]; otherwise no clamping is performed."
4406 *
4407 * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
4408 * so the depth comparison value isn't clamped for Z16 and
4409 * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
4410 * an explicitly clamped 32-bit float format.
4411 */
4412 if (args.compare && ctx->ac.chip_class >= GFX8 && ctx->ac.chip_class <= GFX9 &&
4413 ctx->abi->clamp_shadow_reference) {
4414 LLVMValueRef upgraded, clamped;
4415
4416 upgraded = LLVMBuildExtractElement(ctx->ac.builder, args.sampler,
4417 LLVMConstInt(ctx->ac.i32, 3, false), "");
4418 upgraded = LLVMBuildLShr(ctx->ac.builder, upgraded, LLVMConstInt(ctx->ac.i32, 29, false), "");
4419 upgraded = LLVMBuildTrunc(ctx->ac.builder, upgraded, ctx->ac.i1, "");
4420 clamped = ac_build_clamp(&ctx->ac, args.compare);
4421 args.compare = LLVMBuildSelect(ctx->ac.builder, upgraded, clamped, args.compare, "");
4422 }
4423
4424 /* pack derivatives */
4425 if (ddx || ddy) {
4426 int num_src_deriv_channels, num_dest_deriv_channels;
4427 switch (instr->sampler_dim) {
4428 case GLSL_SAMPLER_DIM_3D:
4429 case GLSL_SAMPLER_DIM_CUBE:
4430 num_src_deriv_channels = 3;
4431 num_dest_deriv_channels = 3;
4432 break;
4433 case GLSL_SAMPLER_DIM_2D:
4434 default:
4435 num_src_deriv_channels = 2;
4436 num_dest_deriv_channels = 2;
4437 break;
4438 case GLSL_SAMPLER_DIM_1D:
4439 num_src_deriv_channels = 1;
4440 if (ctx->ac.chip_class == GFX9) {
4441 num_dest_deriv_channels = 2;
4442 } else {
4443 num_dest_deriv_channels = 1;
4444 }
4445 break;
4446 }
4447
4448 for (unsigned i = 0; i < num_src_deriv_channels; i++) {
4449 args.derivs[i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddx, i));
4450 args.derivs[num_dest_deriv_channels + i] =
4451 ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddy, i));
4452 }
4453 for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) {
4454 args.derivs[i] = ctx->ac.f32_0;
4455 args.derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0;
4456 }
4457 }
4458
4459 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && args.coords[0]) {
4460 for (unsigned chan = 0; chan < instr->coord_components; chan++)
4461 args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]);
4462 if (instr->coord_components == 3)
4463 args.coords[3] = LLVMGetUndef(ctx->ac.f32);
4464 ac_prepare_cube_coords(&ctx->ac, instr->op == nir_texop_txd, instr->is_array,
4465 instr->op == nir_texop_lod, args.coords, args.derivs);
4466 }
4467
4468 /* Texture coordinates fixups */
4469 if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
4470 instr->is_array && instr->op != nir_texop_txf) {
4471 args.coords[1] = apply_round_slice(&ctx->ac, args.coords[1]);
4472 }
4473
4474 if (instr->coord_components > 2 &&
4475 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
4476 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
4477 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
4478 instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms &&
4479 instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch) {
4480 args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]);
4481 }
4482
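/* GFX9 has no dedicated 1D image instructions; 1D textures are sampled as
 * 2D, so supply a filler for the missing coordinate (0 for txf, 0.5 for
 * sampled ops) and move the array layer up one channel.
 */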
4483 if (ctx->ac.chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
4484 instr->op != nir_texop_lod) {
4485 LLVMValueRef filler;
4486 if (instr->op == nir_texop_txf)
4487 filler = ctx->ac.i32_0;
4488 else
4489 filler = LLVMConstReal(ctx->ac.f32, 0.5);
4490
4491 if (instr->is_array)
4492 args.coords[2] = args.coords[1];
4493 args.coords[1] = filler;
4494 }
4495
4496 /* Pack sample index */
4497 if (sample_index && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_fragment_fetch))
4498 args.coords[instr->coord_components] = sample_index;
4499
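/* nir_texop_samples_identical: fetch the FMASK word for this texel and
 * report true when it is 0, i.e. every sample maps to fragment 0.
 */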
4500 if (instr->op == nir_texop_samples_identical) {
4501 struct ac_image_args txf_args = {0};
4502 memcpy(txf_args.coords, args.coords, sizeof(txf_args.coords));
4503
4504 txf_args.dmask = 0xf;
4505 txf_args.resource = fmask_ptr;
4506 txf_args.dim = instr->is_array ? ac_image_2darray : ac_image_2d;
4507 result = build_tex_intrinsic(ctx, instr, &txf_args);
4508
4509 result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
4510 result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0);
4511 goto write_result;
4512 }
4513
4514 if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ||
4515 instr->sampler_dim == GLSL_SAMPLER_DIM_MS) &&
4516 instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch &&
4517 instr->op != nir_texop_fragment_mask_fetch) {
4518 unsigned sample_chan = instr->is_array ? 3 : 2;
4519 args.coords[sample_chan] = adjust_sample_index_using_fmask(
4520 &ctx->ac, args.coords[0], args.coords[1], instr->is_array ? args.coords[2] : NULL,
4521 args.coords[sample_chan], fmask_ptr);
4522 }
4523
4524 if (args.offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
4525 int num_offsets = instr->src[offset_src].src.ssa->num_components;
4526 num_offsets = MIN2(num_offsets, instr->coord_components);
4527 for (unsigned i = 0; i < num_offsets; ++i) {
4528 args.coords[i] = LLVMBuildAdd(
4529 ctx->ac.builder, args.coords[i],
4530 LLVMConstInt(ctx->ac.i32, nir_src_comp_as_uint(instr->src[offset_src].src, i), false),
4531 "");
4532 }
4533 args.offset = NULL;
4534 }
4535
4536 /* DMASK was repurposed for GATHER4. 4 components are always
4537 * returned and DMASK works like a swizzle - it selects
4538 * the component to fetch. The only valid DMASK values are
4539 * 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns
4540 * (red,red,red,red) etc.) The ISA document doesn't mention
4541 * this.
4542 */
4543 args.dmask = 0xf;
4544 if (instr->op == nir_texop_tg4) {
4545 if (instr->is_shadow)
4546 args.dmask = 1;
4547 else
4548 args.dmask = 1 << instr->component;
4549 }
4550
4551 if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
4552 args.dim = ac_get_sampler_dim(ctx->ac.chip_class, instr->sampler_dim, instr->is_array);
4553 args.unorm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
4554 }
4555
4556 /* Adjust the number of coordinates because we only need (x,y) for 2D
4557 * multisampled images and (x,y,layer) for 2D multisampled layered
4558 * images or for multisampled input attachments.
4559 */
4560 if (instr->op == nir_texop_fragment_mask_fetch) {
4561 if (args.dim == ac_image_2dmsaa) {
4562 args.dim = ac_image_2d;
4563 } else {
4564 assert(args.dim == ac_image_2darraymsaa);
4565 args.dim = ac_image_2darray;
4566 }
4567 }
4568
4569 assert(instr->dest.is_ssa);
4570 args.d16 = instr->dest.ssa.bit_size == 16;
4571
4572 result = build_tex_intrinsic(ctx, instr, &args);
4573
4574 if (instr->op == nir_texop_query_levels)
4575 result =
4576 LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), "");
4577 else if (instr->is_shadow && instr->is_new_style_shadow && instr->op != nir_texop_txs &&
4578 instr->op != nir_texop_lod && instr->op != nir_texop_tg4)
4579 result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
4580 else if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
4581 instr->is_array) {
4582 LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
4583 LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
4584 LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
4585 z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
4586 result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, "");
4587 } else if (ctx->ac.chip_class == GFX9 && instr->op == nir_texop_txs &&
4588 instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
4589 LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
4590 LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
4591 result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, ctx->ac.i32_1, "");
4592 } else if (instr->dest.ssa.num_components != 4)
4593 result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components);
4594
4595 write_result:
4596 if (result) {
4597 assert(instr->dest.is_ssa);
4598 result = ac_to_integer(&ctx->ac, result);
4599
4600 for (int i = ARRAY_SIZE(wctx); --i >= 0;) {
4601 result = exit_waterfall(ctx, wctx + i, result);
4602 }
4603
4604 ctx->ssa_defs[instr->dest.ssa.index] = result;
4605 }
4606 }
4607
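/* NIR phis are handled in two passes: visit_phi() creates an empty LLVM phi
 * so later instructions can reference it, and phi_post_pass() fills in the
 * incoming values once all predecessor blocks have been emitted.
 */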
4608 static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
4609 {
4610 LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
4611 LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, "");
4612
4613 ctx->ssa_defs[instr->dest.ssa.index] = result;
4614 _mesa_hash_table_insert(ctx->phis, instr, result);
4615 }
4616
4617 static void visit_post_phi(struct ac_nir_context *ctx, nir_phi_instr *instr, LLVMValueRef llvm_phi)
4618 {
4619 nir_foreach_phi_src (src, instr) {
4620 LLVMBasicBlockRef block = get_block(ctx, src->pred);
4621 LLVMValueRef llvm_src = get_src(ctx, src->src);
4622
4623 LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1);
4624 }
4625 }
4626
4627 static void phi_post_pass(struct ac_nir_context *ctx)
4628 {
4629 hash_table_foreach(ctx->phis, entry)
4630 {
4631 visit_post_phi(ctx, (nir_phi_instr *)entry->key, (LLVMValueRef)entry->data);
4632 }
4633 }
4634
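/* Returns true if this SSA def (possibly through a vec4) feeds a
 * store_deref and may therefore end up in an exported output; such undefs
 * are kept as real LLVM undefs instead of being folded to zero.
 */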
4635 static bool is_def_used_in_an_export(const nir_ssa_def *def)
4636 {
4637 nir_foreach_use (use_src, def) {
4638 if (use_src->parent_instr->type == nir_instr_type_intrinsic) {
4639 nir_intrinsic_instr *instr = nir_instr_as_intrinsic(use_src->parent_instr);
4640 if (instr->intrinsic == nir_intrinsic_store_deref)
4641 return true;
4642 } else if (use_src->parent_instr->type == nir_instr_type_alu) {
4643 nir_alu_instr *instr = nir_instr_as_alu(use_src->parent_instr);
4644 if (instr->op == nir_op_vec4 && is_def_used_in_an_export(&instr->dest.dest.ssa)) {
4645 return true;
4646 }
4647 }
4648 }
4649 return false;
4650 }
4651
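/* Undefined SSA values become LLVM undef unless the ABI requests
 * converting them to zero and the value does not reach an export.
 */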
4652 static void visit_ssa_undef(struct ac_nir_context *ctx, const nir_ssa_undef_instr *instr)
4653 {
4654 unsigned num_components = instr->def.num_components;
4655 LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
4656
4657 if (!ctx->abi->convert_undef_to_zero || is_def_used_in_an_export(&instr->def)) {
4658 LLVMValueRef undef;
4659
4660 if (num_components == 1)
4661 undef = LLVMGetUndef(type);
4662 else {
4663 undef = LLVMGetUndef(LLVMVectorType(type, num_components));
4664 }
4665 ctx->ssa_defs[instr->def.index] = undef;
4666 } else {
4667 LLVMValueRef zero = LLVMConstInt(type, 0, false);
4668 if (num_components > 1) {
4669 zero = ac_build_gather_values_extended(&ctx->ac, &zero, num_components, 0, false, false);
4670 }
4671 ctx->ssa_defs[instr->def.index] = zero;
4672 }
4673 }
4674
4675 static void visit_jump(struct ac_llvm_context *ctx, const nir_jump_instr *instr)
4676 {
4677 switch (instr->type) {
4678 case nir_jump_break:
4679 ac_build_break(ctx);
4680 break;
4681 case nir_jump_continue:
4682 ac_build_continue(ctx);
4683 break;
4684 default:
4685 fprintf(stderr, "Unknown NIR jump instr: ");
4686 nir_print_instr(&instr->instr, stderr);
4687 fprintf(stderr, "\n");
4688 abort();
4689 }
4690 }
4691
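/* Map a GLSL base type to the LLVM scalar type used by this backend;
 * booleans and subroutines are represented as i32.
 */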
4692 static LLVMTypeRef glsl_base_to_llvm_type(struct ac_llvm_context *ac, enum glsl_base_type type)
4693 {
4694 switch (type) {
4695 case GLSL_TYPE_INT:
4696 case GLSL_TYPE_UINT:
4697 case GLSL_TYPE_BOOL:
4698 case GLSL_TYPE_SUBROUTINE:
4699 return ac->i32;
4700 case GLSL_TYPE_INT8:
4701 case GLSL_TYPE_UINT8:
4702 return ac->i8;
4703 case GLSL_TYPE_INT16:
4704 case GLSL_TYPE_UINT16:
4705 return ac->i16;
4706 case GLSL_TYPE_FLOAT:
4707 return ac->f32;
4708 case GLSL_TYPE_FLOAT16:
4709 return ac->f16;
4710 case GLSL_TYPE_INT64:
4711 case GLSL_TYPE_UINT64:
4712 return ac->i64;
4713 case GLSL_TYPE_DOUBLE:
4714 return ac->f64;
4715 default:
4716 unreachable("unknown GLSL type");
4717 }
4718 }
4719
4720 static LLVMTypeRef glsl_to_llvm_type(struct ac_llvm_context *ac, const struct glsl_type *type)
4721 {
4722 if (glsl_type_is_scalar(type)) {
4723 return glsl_base_to_llvm_type(ac, glsl_get_base_type(type));
4724 }
4725
4726 if (glsl_type_is_vector(type)) {
4727 return LLVMVectorType(glsl_base_to_llvm_type(ac, glsl_get_base_type(type)),
4728 glsl_get_vector_elements(type));
4729 }
4730
4731 if (glsl_type_is_matrix(type)) {
4732 return LLVMArrayType(glsl_to_llvm_type(ac, glsl_get_column_type(type)),
4733 glsl_get_matrix_columns(type));
4734 }
4735
4736 if (glsl_type_is_array(type)) {
4737 return LLVMArrayType(glsl_to_llvm_type(ac, glsl_get_array_element(type)),
4738 glsl_get_length(type));
4739 }
4740
4741 assert(glsl_type_is_struct_or_ifc(type));
4742
4743 LLVMTypeRef member_types[glsl_get_length(type)];
4744
4745 for (unsigned i = 0; i < glsl_get_length(type); i++) {
4746 member_types[i] = glsl_to_llvm_type(ac, glsl_get_struct_field(type, i));
4747 }
4748
4749 return LLVMStructTypeInContext(ac->context, member_types, glsl_get_length(type), false);
4750 }
4751
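/* Derefs are only materialized as LLVM pointers for shared (LDS) and
 * global memory; all other variable modes are handled elsewhere and are
 * skipped here.
 */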
4752 static void visit_deref(struct ac_nir_context *ctx, nir_deref_instr *instr)
4753 {
4754 if (instr->mode != nir_var_mem_shared && instr->mode != nir_var_mem_global)
4755 return;
4756
4757 LLVMValueRef result = NULL;
4758 switch (instr->deref_type) {
4759 case nir_deref_type_var: {
4760 struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var);
4761 result = entry->data;
4762 break;
4763 }
4764 case nir_deref_type_struct:
4765 if (instr->mode == nir_var_mem_global) {
4766 nir_deref_instr *parent = nir_deref_instr_parent(instr);
4767 uint64_t offset = glsl_get_struct_field_offset(parent->type, instr->strct.index);
4768 result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent),
4769 LLVMConstInt(ctx->ac.i32, offset, 0));
4770 } else {
4771 result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent),
4772 LLVMConstInt(ctx->ac.i32, instr->strct.index, 0));
4773 }
4774 break;
4775 case nir_deref_type_array:
4776 if (instr->mode == nir_var_mem_global) {
4777 nir_deref_instr *parent = nir_deref_instr_parent(instr);
4778 unsigned stride = glsl_get_explicit_stride(parent->type);
4779
4780 if ((glsl_type_is_matrix(parent->type) && glsl_matrix_type_is_row_major(parent->type)) ||
4781 (glsl_type_is_vector(parent->type) && stride == 0))
4782 stride = type_scalar_size_bytes(parent->type);
4783
4784 assert(stride > 0);
4785 LLVMValueRef index = get_src(ctx, instr->arr.index);
4786 if (LLVMTypeOf(index) != ctx->ac.i64)
4787 index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, "");
4788
4789 LLVMValueRef offset =
4790 LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), "");
4791
4792 result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset);
4793 } else {
4794 result =
4795 ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index));
4796 }
4797 break;
4798 case nir_deref_type_ptr_as_array:
4799 if (instr->mode == nir_var_mem_global) {
4800 unsigned stride = nir_deref_instr_array_stride(instr);
4801
4802 LLVMValueRef index = get_src(ctx, instr->arr.index);
4803 if (LLVMTypeOf(index) != ctx->ac.i64)
4804 index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i64, "");
4805
4806 LLVMValueRef offset =
4807 LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i64, stride, 0), "");
4808
4809 result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), offset);
4810 } else {
4811 result =
4812 ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index));
4813 }
4814 break;
4815 case nir_deref_type_cast: {
4816 result = get_src(ctx, instr->parent);
4817
4818 /* We can't use the structs from LLVM because the shader
4819 * specifies its own offsets. */
4820 LLVMTypeRef pointee_type = ctx->ac.i8;
4821 if (instr->mode == nir_var_mem_shared)
4822 pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type);
4823
4824 unsigned address_space;
4825
4826 switch (instr->mode) {
4827 case nir_var_mem_shared:
4828 address_space = AC_ADDR_SPACE_LDS;
4829 break;
4830 case nir_var_mem_global:
4831 address_space = AC_ADDR_SPACE_GLOBAL;
4832 break;
4833 default:
4834 unreachable("Unhandled address space");
4835 }
4836
4837 LLVMTypeRef type = LLVMPointerType(pointee_type, address_space);
4838
4839 if (LLVMTypeOf(result) != type) {
4840 if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) {
4841 result = LLVMBuildBitCast(ctx->ac.builder, result, type, "");
4842 } else {
4843 result = LLVMBuildIntToPtr(ctx->ac.builder, result, type, "");
4844 }
4845 }
4846 break;
4847 }
4848 default:
4849 unreachable("Unhandled deref_instr deref type");
4850 }
4851
4852 ctx->ssa_defs[instr->dest.ssa.index] = result;
4853 }
4854
4855 static void visit_cf_list(struct ac_nir_context *ctx, struct exec_list *list);
4856
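/* Translate one NIR block by dispatching on instruction type, then record
 * the LLVM basic block the builder ends up in so phi incoming edges can be
 * resolved later.
 */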
4857 static void visit_block(struct ac_nir_context *ctx, nir_block *block)
4858 {
4859 nir_foreach_instr (instr, block) {
4860 switch (instr->type) {
4861 case nir_instr_type_alu:
4862 visit_alu(ctx, nir_instr_as_alu(instr));
4863 break;
4864 case nir_instr_type_load_const:
4865 visit_load_const(ctx, nir_instr_as_load_const(instr));
4866 break;
4867 case nir_instr_type_intrinsic:
4868 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
4869 break;
4870 case nir_instr_type_tex:
4871 visit_tex(ctx, nir_instr_as_tex(instr));
4872 break;
4873 case nir_instr_type_phi:
4874 visit_phi(ctx, nir_instr_as_phi(instr));
4875 break;
4876 case nir_instr_type_ssa_undef:
4877 visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr));
4878 break;
4879 case nir_instr_type_jump:
4880 visit_jump(&ctx->ac, nir_instr_as_jump(instr));
4881 break;
4882 case nir_instr_type_deref:
4883 visit_deref(ctx, nir_instr_as_deref(instr));
4884 break;
4885 default:
4886 fprintf(stderr, "Unknown NIR instr type: ");
4887 nir_print_instr(instr, stderr);
4888 fprintf(stderr, "\n");
4889 abort();
4890 }
4891 }
4892
4893 _mesa_hash_table_insert(ctx->defs, block, LLVMGetInsertBlock(ctx->ac.builder));
4894 }
4895
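/* Structured control flow: the then/else lists are emitted between
 * ac_build_uif()/ac_build_else()/ac_build_endif() markers keyed by the NIR
 * block index.
 */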
4896 static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt)
4897 {
4898 LLVMValueRef value = get_src(ctx, if_stmt->condition);
4899
4900 nir_block *then_block = (nir_block *)exec_list_get_head(&if_stmt->then_list);
4901
4902 ac_build_uif(&ctx->ac, value, then_block->index);
4903
4904 visit_cf_list(ctx, &if_stmt->then_list);
4905
4906 if (!exec_list_is_empty(&if_stmt->else_list)) {
4907 nir_block *else_block = (nir_block *)exec_list_get_head(&if_stmt->else_list);
4908
4909 ac_build_else(&ctx->ac, else_block->index);
4910 visit_cf_list(ctx, &if_stmt->else_list);
4911 }
4912
4913 ac_build_endif(&ctx->ac, then_block->index);
4914 }
4915
4916 static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop)
4917 {
4918 nir_block *first_loop_block = (nir_block *)exec_list_get_head(&loop->body);
4919
4920 ac_build_bgnloop(&ctx->ac, first_loop_block->index);
4921
4922 visit_cf_list(ctx, &loop->body);
4923
4924 ac_build_endloop(&ctx->ac, first_loop_block->index);
4925 }
4926
4927 static void visit_cf_list(struct ac_nir_context *ctx, struct exec_list *list)
4928 {
4929 foreach_list_typed(nir_cf_node, node, node, list)
4930 {
4931 switch (node->type) {
4932 case nir_cf_node_block:
4933 visit_block(ctx, nir_cf_node_as_block(node));
4934 break;
4935
4936 case nir_cf_node_if:
4937 visit_if(ctx, nir_cf_node_as_if(node));
4938 break;
4939
4940 case nir_cf_node_loop:
4941 visit_loop(ctx, nir_cf_node_as_loop(node));
4942 break;
4943
4944 default:
4945 assert(0);
4946 }
4947 }
4948 }
4949
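/* Reserve one alloca per channel (f16 or f32) for every output slot of the
 * variable. Clip and cull distances share VARYING_SLOT_CLIP_DIST0 and need
 * one or two slots depending on the combined array size.
 */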
4950 void ac_handle_shader_output_decl(struct ac_llvm_context *ctx, struct ac_shader_abi *abi,
4951 struct nir_shader *nir, struct nir_variable *variable,
4952 gl_shader_stage stage)
4953 {
4954 unsigned output_loc = variable->data.driver_location / 4;
4955 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
4956
4957 /* tess ctrl has its own load/store paths for outputs */
4958 if (stage == MESA_SHADER_TESS_CTRL)
4959 return;
4960
4961 if (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL ||
4962 stage == MESA_SHADER_GEOMETRY) {
4963 int idx = variable->data.location + variable->data.index;
4964 if (idx == VARYING_SLOT_CLIP_DIST0) {
4965 int length = nir->info.clip_distance_array_size + nir->info.cull_distance_array_size;
4966
4967 if (length > 4)
4968 attrib_count = 2;
4969 else
4970 attrib_count = 1;
4971 }
4972 }
4973
4974 bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
4975 LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32;
4976 for (unsigned i = 0; i < attrib_count; ++i) {
4977 for (unsigned chan = 0; chan < 4; chan++) {
4978 abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] =
4979 ac_build_alloca_undef(ctx, type, "");
4980 }
4981 }
4982 }
4983
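/* Function-temp variables are flattened into vec4 slots: each variable is
 * assigned a driver_location and gets a separate f32 alloca per component.
 */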
4984 static void setup_locals(struct ac_nir_context *ctx, struct nir_function *func)
4985 {
4986 int i, j;
4987 ctx->num_locals = 0;
4988 nir_foreach_function_temp_variable(variable, func->impl)
4989 {
4990 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
4991 variable->data.driver_location = ctx->num_locals * 4;
4992 variable->data.location_frac = 0;
4993 ctx->num_locals += attrib_count;
4994 }
4995 ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef));
4996 if (!ctx->locals)
4997 return;
4998
4999 for (i = 0; i < ctx->num_locals; i++) {
5000 for (j = 0; j < 4; j++) {
5001 ctx->locals[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp");
5002 }
5003 }
5004 }
5005
5006 static void setup_scratch(struct ac_nir_context *ctx, struct nir_shader *shader)
5007 {
5008 if (shader->scratch_size == 0)
5009 return;
5010
5011 ctx->scratch =
5012 ac_build_alloca_undef(&ctx->ac, LLVMArrayType(ctx->ac.i8, shader->scratch_size), "scratch");
5013 }
5014
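/* Embed the shader's immediate constant data as a hidden, read-only LLVM
 * global variable.
 */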
5015 static void setup_constant_data(struct ac_nir_context *ctx, struct nir_shader *shader)
5016 {
5017 if (!shader->constant_data)
5018 return;
5019
5020 LLVMValueRef data = LLVMConstStringInContext(ctx->ac.context, shader->constant_data,
5021 shader->constant_data_size, true);
5022 LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, shader->constant_data_size);
5023
5024 /* We want to put the constant data in the CONST address space so that
5025 * we can use scalar loads. However, LLVM versions before 10 put these
5026 * variables in the same section as the code, which is unacceptable
5027 * for RadeonSI as it needs to relocate all the data sections after
5028 * the code sections. See https://reviews.llvm.org/D65813.
5029 */
5030 unsigned address_space = LLVM_VERSION_MAJOR < 10 ? AC_ADDR_SPACE_GLOBAL : AC_ADDR_SPACE_CONST;
5031
5032 LLVMValueRef global =
5033 LLVMAddGlobalInAddressSpace(ctx->ac.module, type, "const_data", address_space);
5034
5035 LLVMSetInitializer(global, data);
5036 LLVMSetGlobalConstant(global, true);
5037 LLVMSetVisibility(global, LLVMHiddenVisibility);
5038 ctx->constant_data = global;
5039 }
5040
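/* Declare the LDS block backing compute shared memory, sized from the
 * shader info, and cast it to an i8 pointer in the LDS address space.
 */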
5041 static void setup_shared(struct ac_nir_context *ctx, struct nir_shader *nir)
5042 {
5043 if (ctx->ac.lds)
5044 return;
5045
5046 LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, nir->info.cs.shared_size);
5047
5048 LLVMValueRef lds =
5049 LLVMAddGlobalInAddressSpace(ctx->ac.module, type, "compute_lds", AC_ADDR_SPACE_LDS);
5050 LLVMSetAlignment(lds, 64 * 1024);
5051
5052 ctx->ac.lds =
5053 LLVMBuildBitCast(ctx->ac.builder, lds, LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS), "");
5054 }
5055
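/* Main entry point: translate the whole NIR shader into LLVM IR at the
 * current builder position inside the already-created main function.
 */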
5056 void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
5057 const struct ac_shader_args *args, struct nir_shader *nir)
5058 {
5059 struct ac_nir_context ctx = {};
5060 struct nir_function *func;
5061
5062 ctx.ac = *ac;
5063 ctx.abi = abi;
5064 ctx.args = args;
5065
5066 ctx.stage = nir->info.stage;
5067 ctx.info = &nir->info;
5068
5069 ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
5070
5071 /* TODO: remove this after RADV switches to lowered IO */
5072 if (!nir->info.io_lowered) {
5073 nir_foreach_shader_out_variable(variable, nir)
5074 {
5075 ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable, ctx.stage);
5076 }
5077 }
5078
5079 ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
5080 ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
5081 ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
5082
5083 if (ctx.abi->kill_ps_if_inf_interp)
5084 ctx.verified_interp =
5085 _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
5086
5087 func = (struct nir_function *)exec_list_get_head(&nir->functions);
5088
5089 nir_index_ssa_defs(func->impl);
5090 ctx.ssa_defs = calloc(func->impl->ssa_alloc, sizeof(LLVMValueRef));
5091
5092 setup_locals(&ctx, func);
5093 setup_scratch(&ctx, nir);
5094 setup_constant_data(&ctx, nir);
5095
5096 if (gl_shader_stage_is_compute(nir->info.stage))
5097 setup_shared(&ctx, nir);
5098
5099 if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->info.fs.uses_demote) {
5100 ctx.ac.postponed_kill = ac_build_alloca_undef(&ctx.ac, ac->i1, "");
5101 /* true = don't kill. */
5102 LLVMBuildStore(ctx.ac.builder, ctx.ac.i1true, ctx.ac.postponed_kill);
5103 }
5104
5105 visit_cf_list(&ctx, &func->impl->body);
5106 phi_post_pass(&ctx);
5107
5108 if (ctx.ac.postponed_kill)
5109 ac_build_kill_if_false(&ctx.ac, LLVMBuildLoad(ctx.ac.builder, ctx.ac.postponed_kill, ""));
5110
5111 if (!gl_shader_stage_is_compute(nir->info.stage))
5112 ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS, ctx.abi->outputs);
5113
5114 free(ctx.locals);
5115 free(ctx.ssa_defs);
5116 ralloc_free(ctx.defs);
5117 ralloc_free(ctx.phis);
5118 ralloc_free(ctx.vars);
5119 if (ctx.abi->kill_ps_if_inf_interp)
5120 ralloc_free(ctx.verified_interp);
5121 }
5122
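/* Decide which variable modes need indirect derefs lowered to if-ladders
 * on this chip, then run the NIR lowering passes.
 */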
5123 bool ac_lower_indirect_derefs(struct nir_shader *nir, enum chip_class chip_class)
5124 {
5125 bool progress = false;
5126
5127 /* Lower large variables to scratch first so that we won't bloat the
5128 * shader by generating large if ladders for them. We later lower
5129 * scratch to alloca's, assuming LLVM won't generate VGPR indexing.
5130 */
5131 NIR_PASS(progress, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
5132 glsl_get_natural_size_align_bytes);
5133
5134 /* While it would be nice not to have this flag, we are constrained
5135 * by the reality that LLVM 9.0 has buggy VGPR indexing on GFX9.
5136 */
5137 bool llvm_has_working_vgpr_indexing = chip_class != GFX9;
5138
5139 /* TODO: Indirect indexing of GS inputs is unimplemented.
5140 *
5141 * TCS and TES load inputs directly from LDS or offchip memory, so
5142 * indirect indexing is trivial.
5143 */
5144 nir_variable_mode indirect_mask = 0;
5145 if (nir->info.stage == MESA_SHADER_GEOMETRY ||
5146 (nir->info.stage != MESA_SHADER_TESS_CTRL && nir->info.stage != MESA_SHADER_TESS_EVAL &&
5147 !llvm_has_working_vgpr_indexing)) {
5148 indirect_mask |= nir_var_shader_in;
5149 }
5150 if (!llvm_has_working_vgpr_indexing && nir->info.stage != MESA_SHADER_TESS_CTRL)
5151 indirect_mask |= nir_var_shader_out;
5152
5153 /* TODO: We shouldn't need to do this, however LLVM isn't currently
5154 * smart enough to handle indirects without causing excess spilling,
5155 * which can cause the GPU to hang.
5156 *
5157 * See the following thread for more details of the problem:
5158 * https://lists.freedesktop.org/archives/mesa-dev/2017-July/162106.html
5159 */
5160 indirect_mask |= nir_var_function_temp;
5161
5162 progress |= nir_lower_indirect_derefs(nir, indirect_mask, UINT32_MAX);
5163 return progress;
5164 }
5165
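/* Return an 8-bit writemask for a tess factor store: bits 0-3 correspond
 * to gl_TessLevelInner and bits 4-7 to gl_TessLevelOuter. Other stores
 * return 0.
 */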
5166 static unsigned get_inst_tessfactor_writemask(nir_intrinsic_instr *intrin)
5167 {
5168 if (intrin->intrinsic != nir_intrinsic_store_output)
5169 return 0;
5170
5171 unsigned writemask = nir_intrinsic_write_mask(intrin) << nir_intrinsic_component(intrin);
5172 unsigned location = nir_intrinsic_io_semantics(intrin).location;
5173
5174 if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
5175 return writemask << 4;
5176 else if (location == VARYING_SLOT_TESS_LEVEL_INNER)
5177 return writemask;
5178
5179 return 0;
5180 }
5181
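/* Recursively walk the TCS control flow, tracking which tess factor
 * channels are written unconditionally (upper block) and which are written
 * only inside conditionals or loops (cond block).
 */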
5182 static void scan_tess_ctrl(nir_cf_node *cf_node, unsigned *upper_block_tf_writemask,
5183 unsigned *cond_block_tf_writemask,
5184 bool *tessfactors_are_def_in_all_invocs, bool is_nested_cf)
5185 {
5186 switch (cf_node->type) {
5187 case nir_cf_node_block: {
5188 nir_block *block = nir_cf_node_as_block(cf_node);
5189 nir_foreach_instr (instr, block) {
5190 if (instr->type != nir_instr_type_intrinsic)
5191 continue;
5192
5193 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
5194 if (intrin->intrinsic == nir_intrinsic_control_barrier) {
5195
5196 /* If we find a barrier in nested control flow, put this in the
5197 * too hard basket. In GLSL this is not possible but it is in
5198 * SPIR-V.
5199 */
5200 if (is_nested_cf) {
5201 *tessfactors_are_def_in_all_invocs = false;
5202 return;
5203 }
5204
5205 /* The following case must be prevented:
5206 * gl_TessLevelInner = ...;
5207 * barrier();
5208 * if (gl_InvocationID == 1)
5209 * gl_TessLevelInner = ...;
5210 *
5211 * If you consider disjoint code segments separated by barriers, each
5212 * such segment that writes tess factor channels should write the same
5213 * channels in all codepaths within that segment.
5214 */
5215 if (upper_block_tf_writemask || cond_block_tf_writemask) {
5216 /* Accumulate the result: */
5217 *tessfactors_are_def_in_all_invocs &=
5218 !(*cond_block_tf_writemask & ~(*upper_block_tf_writemask));
5219
5220 /* Analyze the next code segment from scratch. */
5221 *upper_block_tf_writemask = 0;
5222 *cond_block_tf_writemask = 0;
5223 }
5224 } else
5225 *upper_block_tf_writemask |= get_inst_tessfactor_writemask(intrin);
5226 }
5227
5228 break;
5229 }
5230 case nir_cf_node_if: {
5231 unsigned then_tessfactor_writemask = 0;
5232 unsigned else_tessfactor_writemask = 0;
5233
5234 nir_if *if_stmt = nir_cf_node_as_if(cf_node);
5235 foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->then_list)
5236 {
5237 scan_tess_ctrl(nested_node, &then_tessfactor_writemask, cond_block_tf_writemask,
5238 tessfactors_are_def_in_all_invocs, true);
5239 }
5240
5241 foreach_list_typed(nir_cf_node, nested_node, node, &if_stmt->else_list)
5242 {
5243 scan_tess_ctrl(nested_node, &else_tessfactor_writemask, cond_block_tf_writemask,
5244 tessfactors_are_def_in_all_invocs, true);
5245 }
5246
5247 if (then_tessfactor_writemask || else_tessfactor_writemask) {
5248 /* If both statements write the same tess factor channels,
5249 * we can say that the upper block writes them too.
5250 */
5251 *upper_block_tf_writemask |= then_tessfactor_writemask & else_tessfactor_writemask;
5252 *cond_block_tf_writemask |= then_tessfactor_writemask | else_tessfactor_writemask;
5253 }
5254
5255 break;
5256 }
5257 case nir_cf_node_loop: {
5258 nir_loop *loop = nir_cf_node_as_loop(cf_node);
5259 foreach_list_typed(nir_cf_node, nested_node, node, &loop->body)
5260 {
5261 scan_tess_ctrl(nested_node, cond_block_tf_writemask, cond_block_tf_writemask,
5262 tessfactors_are_def_in_all_invocs, true);
5263 }
5264
5265 break;
5266 }
5267 default:
5268 unreachable("unknown cf node type");
5269 }
5270 }
5271
5272 bool ac_are_tessfactors_def_in_all_invocs(const struct nir_shader *nir)
5273 {
5274 assert(nir->info.stage == MESA_SHADER_TESS_CTRL);
5275
5276 /* The pass works as follows:
5277 * If all codepaths write tess factors, we can say that all
5278 * invocations define tess factors.
5279 *
5280 * Each tess factor channel is tracked separately.
5281 */
5282 unsigned main_block_tf_writemask = 0; /* if main block writes tess factors */
5283 unsigned cond_block_tf_writemask = 0; /* if cond block writes tess factors */
5284
5285 /* Initial value = true. Here the pass will accumulate results from
5286 * multiple segments surrounded by barriers. If tess factors aren't
5287 * written at all, it's a shader bug and we don't care if this will be
5288 * true.
5289 */
5290 bool tessfactors_are_def_in_all_invocs = true;
5291
5292 nir_foreach_function (function, nir) {
5293 if (function->impl) {
5294 foreach_list_typed(nir_cf_node, node, node, &function->impl->body)
5295 {
5296 scan_tess_ctrl(node, &main_block_tf_writemask, &cond_block_tf_writemask,
5297 &tessfactors_are_def_in_all_invocs, false);
5298 }
5299 }
5300 }
5301
5302 /* Accumulate the result for the last code segment separated by a
5303 * barrier.
5304 */
5305 if (main_block_tf_writemask || cond_block_tf_writemask) {
5306 tessfactors_are_def_in_all_invocs &= !(cond_block_tf_writemask & ~main_block_tf_writemask);
5307 }
5308
5309 return tessfactors_are_def_in_all_invocs;
5310 }