amd/llvm: switch to 3-spaces style
[mesa.git] / src / amd / llvm / ac_llvm_build.c
/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 */
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
#include "ac_llvm_build.h"

#include "ac_exp_param.h"
#include "ac_llvm_util.h"
#include "ac_shader_util.h"
#include "c11/threads.h"
#include "shader_enums.h"
#include "sid.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include <llvm-c/Core.h>
#include <llvm/Config/llvm-config.h>

#include <assert.h>
#include <stdio.h>

#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   LLVMBasicBlockRef loop_entry_block;
};

/* Initialize the context, including ctx::module and ctx::builder, which are
 * created here from the given compiler and float mode.
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum chip_class chip_class, enum radeon_family family,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();

   ctx->chip_class = chip_class;
   ctx->family = family;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module =
      ac_create_module(wave_size == 32 ? compiler->tm_wave32 : compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}

void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
   free(ctx->flow->stack);
   free(ctx->flow);
   ctx->flow = NULL;
}

int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   unsigned num_components =
      LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
   return num_components;
}

LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
      assert(index == 0);
      return value;
   }

   return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
}

int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type);

   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}

unsigned ac_get_type_size(LLVMTypeRef type)
{
   LLVMTypeKind kind = LLVMGetTypeKind(type);

   switch (kind) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type) / 8;
   case LLVMHalfTypeKind:
      return 2;
   case LLVMFloatTypeKind:
      return 4;
   case LLVMDoubleTypeKind:
      return 8;
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
         return 4;
      return 8;
   case LLVMVectorTypeKind:
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   case LLVMArrayTypeKind:
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
   default:
      assert(0);
      return 0;
   }
}

static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   else if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   else if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;
   else
      unreachable("Unhandled integer size");
}

LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
         return ctx->i64;
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;
      default:
         unreachable("unhandled address space");
      }
   }
   return to_integer_type_scalar(ctx, t);
}

LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
      return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
   }
   return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}

LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
      return v;
   return ac_to_integer(ctx, v);
}

static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   else if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   else if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   else if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;
   else
      unreachable("Unhandled float size");
}

LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem_type = LLVMGetElementType(t);
      return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
   }
   return to_float_type_scalar(ctx, t);
}

LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef type = LLVMTypeOf(v);
   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
}

LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params,
                                unsigned param_count, unsigned attrib_mask)
{
   LLVMValueRef function, call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   function = LLVMGetNamedFunction(ctx->module, name);
   if (!function) {
      LLVMTypeRef param_types[32], function_type;
      unsigned i;

      assert(param_count <= 32);

      for (i = 0; i < param_count; ++i) {
         assert(params[i]);
         param_types[i] = LLVMTypeOf(params[i]);
      }
      function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}

/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names).
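 *
 * E.g. a <4 x i32> vector yields "v4i32" and a plain f32 yields "f32"
 * (f16/f64 element types are handled the same way below).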
 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
   LLVMTypeRef elem_type = type;

   assert(bufsize >= 8);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
      if (ret < 0) {
         char *type_name = LLVMPrintTypeToString(type);
         fprintf(stderr, "Error building type name for: %s\n", type_name);
         LLVMDisposeMessage(type_name);
         return;
      }
      elem_type = LLVMGetElementType(type);
      buf += ret;
      bufsize -= ret;
   }
   switch (LLVMGetTypeKind(elem_type)) {
   default:
      break;
   case LLVMIntegerTypeKind:
      snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
      break;
   case LLVMHalfTypeKind:
      snprintf(buf, bufsize, "f16");
      break;
   case LLVMFloatTypeKind:
      snprintf(buf, bufsize, "f32");
      break;
   case LLVMDoubleTypeKind:
      snprintf(buf, bufsize, "f64");
      break;
   }
}

/**
 * Helper function that builds an LLVM IR PHI node and immediately adds
 * incoming edges.
 */
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
{
   LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
   LLVMAddIncoming(phi, values, blocks, count_incoming);
   return phi;
}

void ac_build_s_barrier(struct ac_llvm_context *ctx)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}

/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
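 *
 * For reference, the no-value form emits IR roughly like (a sketch):
 *
 *    call void asm sideeffect "; 42", ""()
 *
 * where "; 42" is just a unique counter value used as the comment payload.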
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr)
{
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];

   snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

   if (!pvgpr) {
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall(builder, inlineasm, NULL, 0, "");
   } else {
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
      LLVMTypeRef type = LLVMTypeOf(*pvgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pvgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pvgpr = vgpr;
   }
}

LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
{
   const char *name =
      scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : "llvm.amdgcn.s.memtime";
   LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
   return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
}

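/* Return a bitmask (one bit per lane, in the wave mask type) of the lanes in
 * which \p value is non-zero, i.e. a ballot. Built on the llvm.amdgcn.icmp
 * intrinsic, comparing the value against 0 with the "not equal" predicate.
 */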
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (LLVM_VERSION_MAJOR >= 9) {
      if (ctx->wave_size == 64)
         name = "llvm.amdgcn.icmp.i64.i32";
      else
         name = "llvm.amdgcn.icmp.i32.i32";
   } else {
      name = "llvm.amdgcn.icmp.i32";
   }
   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0]);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   if (LLVM_VERSION_MAJOR >= 9) {
      if (ctx->wave_size == 64)
         name = "llvm.amdgcn.icmp.i64.i1";
      else
         name = "llvm.amdgcn.icmp.i32.i1";
   } else {
      name = "llvm.amdgcn.icmp.i1";
   }
   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
}

LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
                        "");
}

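/* Return true if all active lanes agree on \p value: the vote set must either
 * cover the whole active set or be empty.
 */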
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_set = ac_build_ballot(ctx, value);

   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
   LLVMValueRef none =
      LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
   return LLVMBuildOr(ctx->builder, all, none, "");
}

LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   LLVMValueRef vec = NULL;

   if (value_count == 1) {
      return values[component];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (unsigned i = component; i < value_count + component; i++) {
      LLVMValueRef value = values[i];

      if (i == component)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
   }
   return vec;
}

LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool load, bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef vec = NULL;
   unsigned i;

   if (value_count == 1 && !always_vector) {
      if (load)
         return LLVMBuildLoad(builder, values[0], "");
      return values[0];
   } else if (!value_count)
      unreachable("value_count is 0");

   for (i = 0; i < value_count; i++) {
      LLVMValueRef value = values[i * value_stride];
      if (load)
         value = LLVMBuildLoad(builder, value, "");

      if (!i)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(builder, vec, value, index, "");
   }
   return vec;
}

LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}

/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
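 *
 * E.g. expanding a v2f32 with src_channels = 2 and dst_channels = 4 yields
 * <x, y, undef, undef>.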
 */
static LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
                                    unsigned src_channels, unsigned dst_channels)
{
   LLVMTypeRef elemtype;
   LLVMValueRef chan[dst_channels];

   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

      if (src_channels == dst_channels && vec_size == dst_channels)
         return value;

      src_channels = MIN2(src_channels, vec_size);

      for (unsigned i = 0; i < src_channels; i++)
         chan[i] = ac_llvm_extract_elem(ctx, value, i);

      elemtype = LLVMGetElementType(LLVMTypeOf(value));
   } else {
      if (src_channels) {
         assert(src_channels == 1);
         chan[0] = value;
      }
      elemtype = LLVMTypeOf(value);
   }

   for (unsigned i = src_channels; i < dst_channels; i++)
      chan[i] = LLVMGetUndef(elemtype);

   return ac_build_gather_values(ctx, chan, dst_channels);
}

/* Extract components [start, start + channels) from a vector.
 */
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
                                   unsigned channels)
{
   LLVMValueRef chan[channels];

   for (unsigned i = 0; i < channels; i++)
      chan[i] = ac_llvm_extract_elem(ctx, value, i + start);

   return ac_build_gather_values(ctx, chan, channels);
}

/* Expand a scalar or vector to <4 x type> by filling the remaining channels
 * with undef. Extract at most num_channels components from the input.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   return ac_build_expand(ctx, value, num_channels, 4);
}

LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
   const char *name;

   if (type_size == 2)
      name = "llvm.rint.f16";
   else if (type_size == 4)
      name = "llvm.rint.f32";
   else
      name = "llvm.rint.f64";

   return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
   const char *name;

   /* For doubles, we need precise division to pass GLCTS. */
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   if (type_size == 2)
      name = "llvm.amdgcn.rcp.f16";
   else if (type_size == 4)
      name = "llvm.amdgcn.rcp.f32";
   else
      name = "llvm.amdgcn.rcp.f64";

   LLVMValueRef rcp =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);

   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}

/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
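/* In other words, this computes:
 *    (((uint64_t)(num >> pre_shift) * multiplier + increment) >> 32) >> post_shift
 */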
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildNUWAdd(builder, num, increment, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
{
   LLVMBuilderRef builder = ctx->builder;

   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}

/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
   LLVMValueRef stc[2];
   LLVMValueRef ma;
   LLVMValueRef id;
};

static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   LLVMTypeRef f32 = ctx->f32;

   out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE);
}

/**
 * Build a manual selection sequence for cube face sc/tc coordinates and
 * major axis vector (multiplied by 2 for consistency) for the given
 * vec3 \p coords, for the face implied by \p selcoords.
 *
 * For the major axis, we always adjust the sign to be in the direction of
 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 * the selcoords major axis.
 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma;
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}

void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{
   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *    where d is the depth of the texture array and layer
       *    comes from the component indicated in the tables below."
       *
       * The rounding is also a workaround for an issue where the layer is
       * taken from a helper invocation which happens to fall on a different
       * layer due to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->chip_class <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *    f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
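    *
    * Since ma is premultiplied by 2, stc * invma lands in [-0.5, 0.5], so
    * the +1.5 below yields coordinates in [1.0, 2.0] (presumably the range
    * in which the hardware samples cube faces).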
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}

LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];
   LLVMValueRef p1;

   args[0] = i;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = params;

   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

   args[0] = p1;
   args[1] = j;
   args[2] = llvm_chan;
   args[3] = attr_number;
   args[4] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, args, 5,
                             AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j)
{
   LLVMValueRef args[6];
   LLVMValueRef p1;

   args[0] = i;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = ctx->i1false;
   args[4] = params;

   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                           AC_FUNC_ATTR_READNONE);

   args[0] = p1;
   args[1] = j;
   args[2] = llvm_chan;
   args[3] = attr_number;
   args[4] = ctx->i1false;
   args[5] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                             AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef args[4];

   args[0] = parameter;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
                             AC_FUNC_ATTR_READNONE);
}

LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
}

LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef indices[2] = {
      ctx->i32_0,
      index,
   };
   return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
}

LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   return LLVMBuildPointerCast(ctx->builder, LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
                               LLVMTypeOf(ptr), "");
}

void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));
}

/**
 * Build an LLVM IR indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform (i.e. load to an SGPR)
 * \param invariant Whether the load is invariant (no other opcodes affect it)
 * \param no_unsigned_wraparound
 *    For all possible re-associations and re-distributions of an expression
 *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
 *    without inbounds in base_ptr), this parameter is true if "addr + offset"
 *    does not result in an unsigned integer wraparound. This is used for
 *    optimal code generation of 32-bit pointer arithmetic.
 *
 *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
 *    integer wraparound can't be an imm offset in s_load_dword, because
 *    the instruction performs "addr + offset" in 64 bits.
 *
 * Expected usage for bindless textures by chaining GEPs:
 *    // possible unsigned wraparound, don't use InBounds:
 *    ptr1 = LLVMBuildGEP(base_ptr, index);
 *    image = load(ptr1); // becomes "s_load ptr1, 0"
 *
 *    ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
 *    sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   return result;
}

LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
}

LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
}

/* This assumes that there is no unsigned integer wraparound during the address
 * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
}

/* See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}

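/* On GFX10+, a load that requests device coherence (GLC) also needs the DLC
 * bit, presumably so the additional cache level introduced on GFX10 is
 * bypassed as well.
 */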
static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
{
   return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
}

static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format, bool structurized)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}

void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true, true);
}

/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned inst_offset, unsigned cache_policy)
{
   /* Split 3-channel stores, because only LLVM 9+ supports 3-channel
    * intrinsics. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
      LLVMValueRef v[3], v01;

      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
                                  cache_policy);
      return;
   }

   /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
    * (voffset is swizzled, but soffset isn't swizzled).
    * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
    */
   if (!(cache_policy & ac_swizzled)) {
      LLVMValueRef offset = soffset;

      if (inst_offset)
         offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");

      ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
                                   cache_policy, false, false);
      return;
   }

   static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
   unsigned dfmt = dfmts[num_channels - 1];
   unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
   LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);

   ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
                              nfmt, cache_policy);
}

static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->chip_class >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned inst_offset, unsigned cache_policy, bool can_speculate,
                                  bool allow_smem)
{
   LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
   if (voffset)
      offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
   if (soffset)
      offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels,
                                      ctx->f32, cache_policy, can_speculate, false, false);
}

LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16)
{
   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}

static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate,
                                          bool structurized)
{
   voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, true);
}

LLVMValueRef ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       LLVMValueRef immoffset, unsigned num_channels,
                                       unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                       bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, false);
}

LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         LLVMValueRef immoffset, unsigned cache_policy)
{
   LLVMValueRef res;

   if (LLVM_VERSION_MAJOR >= 9) {
      voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
      res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
                                        cache_policy, false, false, false);
   } else {
      unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
      unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

      res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
                                      cache_policy, false);

      res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
   }

   return res;
}

LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        LLVMValueRef immoffset, unsigned cache_policy)
{
   LLVMValueRef res;

   if (LLVM_VERSION_MAJOR >= 9) {
      voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
      res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8,
                                        cache_policy, false, false, false);
   } else {
      unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
      unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

      res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
                                      cache_policy, false);

      res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
   }

   return res;
}

/**
 * Convert an 11- or 10-bit unsigned floating point number to an f32.
 *
 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
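 *
 * E.g. for the 11-bit format (exp_bits = 5, mant_bits = 6), the bias is
 * 2^4 - 1 = 15.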
 */
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
                                    unsigned exp_bits, unsigned mant_bits)
{
   assert(LLVMTypeOf(src) == ctx->i32);

   LLVMValueRef tmp;
   LLVMValueRef mantissa;
   mantissa =
      LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");

   /* Converting normal numbers is just a shift + correcting the exponent bias */
   unsigned normal_shift = 23 - mant_bits;
   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
   LLVMValueRef shifted, normal;

   shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
   normal =
      LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");

   /* Converting nan/inf numbers is the same, but with a different exponent update */
   LLVMValueRef naninf;
   naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");

   /* Converting denormals is the complex case: determine the leading zeros of the
    * mantissa to obtain the correct shift for the mantissa and exponent correction.
    */
   LLVMValueRef denormal;
   LLVMValueRef params[2] = {
      mantissa, ctx->i1true, /* result can be undef when arg is 0 */
   };
   LLVMValueRef ctlz =
      ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);

   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
   tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
   denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");

   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
   tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
   tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
   denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");

   /* Select the final result. */
   LLVMValueRef result;

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");

   return ac_to_float(ctx, result);
}

/**
 * Generate a fully general open coded buffer format fetch with all required
 * fixups suitable for vertex fetch, using non-format buffer loads.
 *
 * Some combinations of argument values have special interpretations:
 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
 *
 * \param log_size      log(size of channel in bytes)
 * \param num_channels  number of channels (1 to 4)
 * \param format        AC_FETCH_FORMAT_xxx value
 * \param reverse       whether XYZ channels are reversed
 * \param known_aligned whether the source is known to be aligned to hardware's
 *                      effective element size for loading the given format
 *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
 * \param rsrc          buffer resource descriptor
 * \return the resulting vector of floats or integers bitcast to <4 x i32>
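 *
 * E.g. an unaligned 16_16 fetch on GFX6 or GFX10+ is performed as four
 * single-byte loads that are then recombined into two shorts below.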
1417 */
1418 LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
1419 unsigned num_channels, unsigned format, bool reverse,
1420 bool known_aligned, LLVMValueRef rsrc,
1421 LLVMValueRef vindex, LLVMValueRef voffset,
1422 LLVMValueRef soffset, unsigned cache_policy,
1423 bool can_speculate)
1424 {
1425 LLVMValueRef tmp;
1426 unsigned load_log_size = log_size;
1427 unsigned load_num_channels = num_channels;
1428 if (log_size == 3) {
1429 load_log_size = 2;
1430 if (format == AC_FETCH_FORMAT_FLOAT) {
1431 load_num_channels = 2 * num_channels;
1432 } else {
1433 load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1434 }
1435 }
1436
1437 int log_recombine = 0;
1438 if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
1439 /* Avoid alignment restrictions by loading one byte at a time. */
1440 load_num_channels <<= load_log_size;
1441 log_recombine = load_log_size;
1442 load_log_size = 0;
1443 } else if (load_num_channels == 2 || load_num_channels == 4) {
1444 log_recombine = -util_logbase2(load_num_channels);
1445 load_num_channels = 1;
1446 load_log_size += -log_recombine;
1447 }
1448
1449 assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9);
1450
1451 LLVMValueRef loads[32]; /* up to 32 bytes */
1452 for (unsigned i = 0; i < load_num_channels; ++i) {
1453 tmp =
1454 LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1455 LLVMTypeRef channel_type =
1456 load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
1457 unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1458 loads[i] =
1459 ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
1460 cache_policy, can_speculate, false, true);
1461 if (load_log_size >= 2)
1462 loads[i] = ac_to_integer(ctx, loads[i]);
1463 }
1464
1465 if (log_recombine > 0) {
1466 /* Recombine bytes if necessary (GFX6 only) */
1467 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1468
1469 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1470 LLVMValueRef accum = NULL;
1471 for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1472 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1473 if (i == 0) {
1474 accum = tmp;
1475 } else {
1476 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
1477 accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1478 }
1479 }
1480 loads[dst] = accum;
1481 }
1482 } else if (log_recombine < 0) {
1483 /* Split vectors of dwords */
1484 if (load_log_size > 2) {
1485 assert(load_num_channels == 1);
1486 LLVMValueRef loaded = loads[0];
1487 unsigned log_split = load_log_size - 2;
1488 log_recombine += log_split;
1489 load_num_channels = 1 << log_split;
1490 load_log_size = 2;
1491 for (unsigned i = 0; i < load_num_channels; ++i) {
1492 tmp = LLVMConstInt(ctx->i32, i, false);
1493 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1494 }
1495 }
1496
1497 /* Further split dwords and shorts if required */
1498 if (log_recombine < 0) {
1499 for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
1500 --src) {
1501 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1502 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1503 LLVMValueRef loaded = loads[src - 1];
1504 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1505 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1506 tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1507 tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1508 loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1509 }
1510 }
1511 }
1512 }
1513
1514 if (log_size == 3) {
1515 if (format == AC_FETCH_FORMAT_FLOAT) {
1516 for (unsigned i = 0; i < num_channels; ++i) {
1517 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1518 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1519 }
1520 } else if (format == AC_FETCH_FORMAT_FIXED) {
1521 /* 10_11_11_FLOAT, passed in as FIXED with log_size == 3 */
1522 LLVMValueRef data = loads[0];
1523 LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1524 LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1525 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1526 LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1527 LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1528
1529 loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1530 loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1531 loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1532
1533 num_channels = 3;
1534 log_size = 2;
1535 format = AC_FETCH_FORMAT_FLOAT;
1536 } else {
1537 /* 2_10_10_10 data formats */
1538 LLVMValueRef data = loads[0];
1539 LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1540 LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1541 loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1542 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1543 loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1544 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1545 loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1546 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1547 loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1548
1549 num_channels = 4;
1550 }
1551 }
1552
1553 if (format == AC_FETCH_FORMAT_FLOAT) {
1554 if (log_size != 2) {
1555 for (unsigned chan = 0; chan < num_channels; ++chan) {
1556 tmp = ac_to_float(ctx, loads[chan]);
1557 if (log_size == 3)
1558 tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1559 else if (log_size == 1)
1560 tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1561 loads[chan] = ac_to_integer(ctx, tmp);
1562 }
1563 }
1564 } else if (format == AC_FETCH_FORMAT_UINT) {
1565 if (log_size != 2) {
1566 for (unsigned chan = 0; chan < num_channels; ++chan)
1567 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1568 }
1569 } else if (format == AC_FETCH_FORMAT_SINT) {
1570 if (log_size != 2) {
1571 for (unsigned chan = 0; chan < num_channels; ++chan)
1572 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1573 }
1574 } else {
1575 bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
1576 format == AC_FETCH_FORMAT_UINT;
1577
1578 for (unsigned chan = 0; chan < num_channels; ++chan) {
1579 if (unsign) {
1580 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1581 } else {
1582 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1583 }
1584
1585 LLVMValueRef scale = NULL;
1586 if (format == AC_FETCH_FORMAT_FIXED) {
1587 assert(log_size == 2);
1588 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1589 } else if (format == AC_FETCH_FORMAT_UNORM) {
1590 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1591 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1592 } else if (format == AC_FETCH_FORMAT_SNORM) {
1593 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1594 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1595 }
1596 if (scale)
1597 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1598
1599 if (format == AC_FETCH_FORMAT_SNORM) {
1600 /* Clamp to [-1, 1] */
1601 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1602 LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1603 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1604 }
1605
1606 loads[chan] = ac_to_integer(ctx, tmp);
1607 }
1608 }
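/* Example of the scaling above: 8-bit UNORM uses 1/255, mapping raw 0..255
 * to 0.0..1.0; 8-bit SNORM uses 1/127, so raw -128 scales to about -1.008
 * and the clamp brings it back to -1.0, matching the usual SNORM rule that
 * both -128 and -127 represent -1.0. */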
1609
1610 while (num_channels < 4) {
1611 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1612 loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1613 } else {
1614 loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1615 }
1616 num_channels++;
1617 }
1618
1619 if (reverse) {
1620 tmp = loads[0];
1621 loads[0] = loads[2];
1622 loads[2] = tmp;
1623 }
1624
1625 return ac_build_gather_values(ctx, loads, 4);
1626 }
1627
1628 static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1629 LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
1630 LLVMValueRef soffset, LLVMValueRef immoffset,
1631 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1632 unsigned cache_policy, bool structurized)
1633 {
1634 voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");
1635
1636 LLVMValueRef args[7];
1637 int idx = 0;
1638 args[idx++] = vdata;
1639 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1640 if (structurized)
1641 args[idx++] = vindex ? vindex : ctx->i32_0;
1642 args[idx++] = voffset ? voffset : ctx->i32_0;
1643 args[idx++] = soffset ? soffset : ctx->i32_0;
1644 args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
1645 args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1646 unsigned func =
1647 !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1648 const char *indexing_kind = structurized ? "struct" : "raw";
1649 char name[256], type_name[8];
1650
1651 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1652 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1653
1654 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);
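/* e.g. a structurized 4-channel store yields
 * "llvm.amdgcn.struct.tbuffer.store.v4i32" and a raw single-channel store
 * "llvm.amdgcn.raw.tbuffer.store.i32"; 3-channel stores fall back to the
 * v4 variant on chips without vec3 intrinsic support. */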
1655
1656 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1657 }
1658
1659 void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1660 LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
1661 LLVMValueRef soffset, LLVMValueRef immoffset,
1662 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1663 unsigned cache_policy)
1664 {
1665 ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt,
1666 nfmt, cache_policy, true);
1667 }
1668
1669 void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1670 LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
1671 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1672 unsigned cache_policy)
1673 {
1674 ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt,
1675 nfmt, cache_policy, false);
1676 }
1677
1678 void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1679 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
1680 unsigned cache_policy)
1681 {
1682 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
1683
1684 if (LLVM_VERSION_MAJOR >= 9) {
1685 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1686 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
1687 false);
1688 } else {
1689 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1690 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1691
1692 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1693
1694 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
1695 cache_policy);
1696 }
1697 }
1698
1699 void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1700 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
1701 {
1702 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
1703
1704 if (LLVM_VERSION_MAJOR >= 9) {
1705 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1706 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
1707 false);
1708 } else {
1709 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1710 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1711
1712 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1713
1714 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
1715 cache_policy);
1716 }
1717 }
1718 /**
1719 * Set range metadata on an instruction. This can only be used on load and
1720 * call instructions. If you know an instruction can only produce the values
1721 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1722 * \p lo is the minimum value inclusive.
1723 * \p hi is the maximum value exclusive.
1724 */
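/* The example above produces IR along these lines (illustrative):
 *    %val = load i32, i32* %p, !range !0
 *    !0 = !{i32 0, i32 3}
 * so LLVM may assume the loaded value lies in the half-open range [0, 3). */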
1725 static void set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1726 unsigned hi)
1727 {
1728 LLVMValueRef range_md, md_args[2];
1729 LLVMTypeRef type = LLVMTypeOf(value);
1730 LLVMContextRef context = LLVMGetTypeContext(type);
1731
1732 md_args[0] = LLVMConstInt(type, lo, false);
1733 md_args[1] = LLVMConstInt(type, hi, false);
1734 range_md = LLVMMDNodeInContext(context, md_args, 2);
1735 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1736 }
1737
1738 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1739 {
1740 LLVMValueRef tid;
1741
1742 LLVMValueRef tid_args[2];
1743 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1744 tid_args[1] = ctx->i32_0;
1745 tid_args[1] =
1746 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, tid_args, 2, AC_FUNC_ATTR_READNONE);
1747
1748 if (ctx->wave_size == 32) {
1749 tid = tid_args[1];
1750 } else {
1751 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, tid_args, 2,
1752 AC_FUNC_ATTR_READNONE);
1753 }
1754 set_range_metadata(ctx, tid, 0, ctx->wave_size);
1755 return tid;
1756 }
1757
1758 /*
1759 * AMD GCN implements derivatives using the local data store (LDS).
1760 * All writes to the LDS happen in all executing threads at
1761 * the same time. TID is the Thread ID for the current
1762 * thread and is a value between 0 and 63, representing
1763 * the thread's position in the wavefront.
1764 *
1765 * For the pixel shader, threads are grouped into quads of four pixels.
1766 * The TIDs of the pixels of a quad are:
1767 *
1768 * +------+------+
1769 * |4n + 0|4n + 1|
1770 * +------+------+
1771 * |4n + 2|4n + 3|
1772 * +------+------+
1773 *
1774 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1775 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1776 * the current pixel's column, and masking with 0xfffffffe yields the TID
1777 * of the left pixel of the current pixel's row.
1778 *
1779 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1780 * adding 2 yields the TID of the pixel below the top pixel.
1781 */
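/* Example (how callers are expected to use the mask/idx pair): ddx passes
 * mask = 0xfffffffe and idx = 1, making tl the left pixel of each row and
 * trbl the right one (per-quad lanes tl = {0,0,2,2}, trbl = {1,1,3,3});
 * ddy passes mask = 0xfffffffd and idx = 2, making tl the top pixel of
 * each column and trbl the bottom one. */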
1782 LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1783 {
1784 unsigned tl_lanes[4], trbl_lanes[4];
1785 char name[32], type[8];
1786 LLVMValueRef tl, trbl;
1787 LLVMTypeRef result_type;
1788 LLVMValueRef result;
1789
1790 result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1791
1792 if (result_type == ctx->f16)
1793 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1794 else if (result_type == ctx->v2f16)
1795 val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1796
1797 for (unsigned i = 0; i < 4; ++i) {
1798 tl_lanes[i] = i & mask;
1799 trbl_lanes[i] = (i & mask) + idx;
1800 }
1801
1802 tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
1803 trbl =
1804 ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);
1805
1806 if (result_type == ctx->f16) {
1807 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1808 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1809 }
1810
1811 tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1812 trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1813 result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1814
1815 ac_build_type_name_for_intr(result_type, type, sizeof(type));
1816 snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1817
1818 return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1819 }
1820
1821 void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
1822 {
1823 LLVMValueRef args[2];
1824 args[0] = LLVMConstInt(ctx->i32, msg, false);
1825 args[1] = wave_id;
1826 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1827 }
1828
1829 LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1830 {
1831 LLVMValueRef msb =
1832 ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);
1833
1834 /* The HW returns the bit index counted from the MSB, but NIR/TGSI wants
1835 * it counted from the LSB. Invert it by doing "31 - msb". */
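/* Worked example: for arg = 8 (binary ...01000) the first bit differing
 * from the sign bit sits 28 positions below the MSB, so sffbh returns 28
 * and 31 - 28 = 3 is the LSB-based index. arg = 0 and arg = -1 contain no
 * such bit, hence the special-cased -1 below. */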
1836 msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");
1837
1838 LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
1839 LLVMValueRef cond =
1840 LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),
1841 LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");
1842
1843 return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
1844 }
1845
1846 LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1847 {
1848 const char *intrin_name;
1849 LLVMTypeRef type;
1850 LLVMValueRef highest_bit;
1851 LLVMValueRef zero;
1852 unsigned bitsize;
1853
1854 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
1855 switch (bitsize) {
1856 case 64:
1857 intrin_name = "llvm.ctlz.i64";
1858 type = ctx->i64;
1859 highest_bit = LLVMConstInt(ctx->i64, 63, false);
1860 zero = ctx->i64_0;
1861 break;
1862 case 32:
1863 intrin_name = "llvm.ctlz.i32";
1864 type = ctx->i32;
1865 highest_bit = LLVMConstInt(ctx->i32, 31, false);
1866 zero = ctx->i32_0;
1867 break;
1868 case 16:
1869 intrin_name = "llvm.ctlz.i16";
1870 type = ctx->i16;
1871 highest_bit = LLVMConstInt(ctx->i16, 15, false);
1872 zero = ctx->i16_0;
1873 break;
1874 case 8:
1875 intrin_name = "llvm.ctlz.i8";
1876 type = ctx->i8;
1877 highest_bit = LLVMConstInt(ctx->i8, 7, false);
1878 zero = ctx->i8_0;
1879 break;
1880 default:
1881 unreachable("invalid bitsize");
1882 break;
1883 }
1884
1885 LLVMValueRef params[2] = {
1886 arg,
1887 ctx->i1true,
1888 };
1889
1890 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
1891
1892 /* The HW returns the bit index counted from the MSB, but TGSI/NIR wants
1893 * it counted from the LSB. Invert it by doing "31 - msb". */
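/* Worked example: for a 32-bit arg = 0x00f0, ctlz returns 24 and
 * 31 - 24 = 7, the index of the highest set bit. ctlz is called with
 * i1true ("zero is undef"), so arg = 0 relies on the final select below,
 * which returns -1 as GLSL findMSB expects. */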
1894 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
1895
1896 if (bitsize == 64) {
1897 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
1898 } else if (bitsize < 32) {
1899 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
1900 }
1901
1902 /* check for zero */
1903 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
1904 LLVMConstInt(ctx->i32, -1, true), msb, "");
1905 }
1906
1907 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1908 {
1909 char name[64], type[64];
1910
1911 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1912 snprintf(name, sizeof(name), "llvm.minnum.%s", type);
1913 LLVMValueRef args[2] = {a, b};
1914 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1915 }
1916
1917 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1918 {
1919 char name[64], type[64];
1920
1921 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1922 snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
1923 LLVMValueRef args[2] = {a, b};
1924 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1925 }
1926
1927 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1928 {
1929 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
1930 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1931 }
1932
1933 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1934 {
1935 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
1936 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1937 }
1938
1939 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1940 {
1941 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
1942 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1943 }
1944
1945 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1946 {
1947 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
1948 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1949 }
1950
1951 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
1952 {
1953 LLVMTypeRef t = LLVMTypeOf(value);
1954 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
1955 LLVMConstReal(t, 1.0));
1956 }
1957
1958 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1959 {
1960 LLVMValueRef args[9];
1961
1962 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1963 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1964
1965 if (a->compr) {
1966 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1967 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1968 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1969 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1970
1971 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1972 } else {
1973 args[2] = a->out[0];
1974 args[3] = a->out[1];
1975 args[4] = a->out[2];
1976 args[5] = a->out[3];
1977 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1978 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1979
1980 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1981 }
1982 }
1983
1984 void ac_build_export_null(struct ac_llvm_context *ctx)
1985 {
1986 struct ac_export_args args;
1987
1988 args.enabled_channels = 0x0; /* enabled channels */
1989 args.valid_mask = 1; /* whether the EXEC mask is valid */
1990 args.done = 1; /* DONE bit */
1991 args.target = V_008DFC_SQ_EXP_NULL;
1992 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
1993 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
1994 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
1995 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
1996 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
1997
1998 ac_build_export(ctx, &args);
1999 }
2000
2001 static unsigned ac_num_coords(enum ac_image_dim dim)
2002 {
2003 switch (dim) {
2004 case ac_image_1d:
2005 return 1;
2006 case ac_image_2d:
2007 case ac_image_1darray:
2008 return 2;
2009 case ac_image_3d:
2010 case ac_image_cube:
2011 case ac_image_2darray:
2012 case ac_image_2dmsaa:
2013 return 3;
2014 case ac_image_2darraymsaa:
2015 return 4;
2016 default:
2017 unreachable("ac_num_coords: bad dim");
2018 }
2019 }
2020
2021 static unsigned ac_num_derivs(enum ac_image_dim dim)
2022 {
2023 switch (dim) {
2024 case ac_image_1d:
2025 case ac_image_1darray:
2026 return 2;
2027 case ac_image_2d:
2028 case ac_image_2darray:
2029 case ac_image_cube:
2030 return 4;
2031 case ac_image_3d:
2032 return 6;
2033 case ac_image_2dmsaa:
2034 case ac_image_2darraymsaa:
2035 default:
2036 unreachable("derivatives not supported");
2037 }
2038 }
2039
2040 static const char *get_atomic_name(enum ac_atomic_op op)
2041 {
2042 switch (op) {
2043 case ac_atomic_swap:
2044 return "swap";
2045 case ac_atomic_add:
2046 return "add";
2047 case ac_atomic_sub:
2048 return "sub";
2049 case ac_atomic_smin:
2050 return "smin";
2051 case ac_atomic_umin:
2052 return "umin";
2053 case ac_atomic_smax:
2054 return "smax";
2055 case ac_atomic_umax:
2056 return "umax";
2057 case ac_atomic_and:
2058 return "and";
2059 case ac_atomic_or:
2060 return "or";
2061 case ac_atomic_xor:
2062 return "xor";
2063 case ac_atomic_inc_wrap:
2064 return "inc";
2065 case ac_atomic_dec_wrap:
2066 return "dec";
2067 }
2068 unreachable("bad atomic op");
2069 }
2070
2071 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
2072 {
2073 const char *overload[3] = {"", "", ""};
2074 unsigned num_overloads = 0;
2075 LLVMValueRef args[18];
2076 unsigned num_args = 0;
2077 enum ac_image_dim dim = a->dim;
2078
2079 assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
2080 assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2081 a->opcode != ac_image_store_mip) ||
2082 a->lod);
2083 assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2084 (!a->compare && !a->offset));
2085 assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2086 a->opcode == ac_image_get_lod) ||
2087 !a->bias);
2088 assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
2089 1);
2090 assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
2091 assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
2092 a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
2093 a->opcode != ac_image_get_resinfo));
2094
2095 if (a->opcode == ac_image_get_lod) {
2096 switch (dim) {
2097 case ac_image_1darray:
2098 dim = ac_image_1d;
2099 break;
2100 case ac_image_2darray:
2101 case ac_image_cube:
2102 dim = ac_image_2d;
2103 break;
2104 default:
2105 break;
2106 }
2107 }
2108
2109 bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2110 a->opcode == ac_image_get_lod;
2111 bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
2112 bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2113 a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
2114 LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2115
2116 if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2117 args[num_args++] = a->data[0];
2118 if (a->opcode == ac_image_atomic_cmpswap)
2119 args[num_args++] = a->data[1];
2120 }
2121
2122 if (!atomic)
2123 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2124
2125 if (a->offset)
2126 args[num_args++] = ac_to_integer(ctx, a->offset);
2127 if (a->bias) {
2128 args[num_args++] = ac_to_float(ctx, a->bias);
2129 overload[num_overloads++] = ".f32";
2130 }
2131 if (a->compare)
2132 args[num_args++] = ac_to_float(ctx, a->compare);
2133 if (a->derivs[0]) {
2134 unsigned count = ac_num_derivs(dim);
2135 for (unsigned i = 0; i < count; ++i)
2136 args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2137 overload[num_overloads++] = ".f32";
2138 }
2139 unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2140 for (unsigned i = 0; i < num_coords; ++i)
2141 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2142 if (a->lod)
2143 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2144 if (a->min_lod)
2145 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
2146
2147 overload[num_overloads++] = sample ? ".f32" : ".i32";
2148
2149 args[num_args++] = a->resource;
2150 if (sample) {
2151 args[num_args++] = a->sampler;
2152 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2153 }
2154
2155 args[num_args++] = ctx->i32_0; /* texfailctrl */
2156 args[num_args++] = LLVMConstInt(
2157 ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);
2158
2159 const char *name;
2160 const char *atomic_subop = "";
2161 switch (a->opcode) {
2162 case ac_image_sample:
2163 name = "sample";
2164 break;
2165 case ac_image_gather4:
2166 name = "gather4";
2167 break;
2168 case ac_image_load:
2169 name = "load";
2170 break;
2171 case ac_image_load_mip:
2172 name = "load.mip";
2173 break;
2174 case ac_image_store:
2175 name = "store";
2176 break;
2177 case ac_image_store_mip:
2178 name = "store.mip";
2179 break;
2180 case ac_image_atomic:
2181 name = "atomic.";
2182 atomic_subop = get_atomic_name(a->atomic);
2183 break;
2184 case ac_image_atomic_cmpswap:
2185 name = "atomic.";
2186 atomic_subop = "cmpswap";
2187 break;
2188 case ac_image_get_lod:
2189 name = "getlod";
2190 break;
2191 case ac_image_get_resinfo:
2192 name = "getresinfo";
2193 break;
2194 default:
2195 unreachable("invalid image opcode");
2196 }
2197
2198 const char *dimname;
2199 switch (dim) {
2200 case ac_image_1d:
2201 dimname = "1d";
2202 break;
2203 case ac_image_2d:
2204 dimname = "2d";
2205 break;
2206 case ac_image_3d:
2207 dimname = "3d";
2208 break;
2209 case ac_image_cube:
2210 dimname = "cube";
2211 break;
2212 case ac_image_1darray:
2213 dimname = "1darray";
2214 break;
2215 case ac_image_2darray:
2216 dimname = "2darray";
2217 break;
2218 case ac_image_2dmsaa:
2219 dimname = "2dmsaa";
2220 break;
2221 case ac_image_2darraymsaa:
2222 dimname = "2darraymsaa";
2223 break;
2224 default:
2225 unreachable("invalid dim");
2226 }
2227
2228 bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2229 char intr_name[96];
2230 snprintf(intr_name, sizeof(intr_name),
2231 "llvm.amdgcn.image.%s%s" /* base name */
2232 "%s%s%s%s" /* sample/gather modifiers */
2233 ".%s.%s%s%s%s", /* dimension and type overloads */
2234 name, atomic_subop, a->compare ? ".c" : "",
2235 a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
2236 a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
2237 atomic ? "i32" : (a->d16 ? "v4f16" : "v4f32"), overload[0], overload[1], overload[2]);
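/* e.g. a 2D sample with compare, explicit LOD and offset produces
 * "llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32". */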
2238
2239 LLVMTypeRef retty;
2240 if (atomic)
2241 retty = ctx->i32;
2242 else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2243 retty = ctx->voidt;
2244 else
2245 retty = a->d16 ? ctx->v4f16 : ctx->v4f32;
2246
2247 LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
2248 if (!sample && !atomic && retty != ctx->voidt)
2249 result = ac_to_integer(ctx, result);
2250
2251 return result;
2252 }
2253
2254 LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
2255 {
2256 LLVMValueRef samples;
2257
2258 /* Read the samples from the descriptor directly.
2259 * Hardware doesn't have any instruction for this.
2260 */
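/* For MSAA resources, bits [19:16] of dword 3 (the LAST_LEVEL field) hold
 * log2(samples), so e.g. a field value of 3 yields 1 << 3 = 8 samples. */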
2261 samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
2262 samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");
2263 samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
2264 samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");
2265 return samples;
2266 }
2267
2268 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2269 {
2270 return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
2271 AC_FUNC_ATTR_READNONE);
2272 }
2273
2274 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2275 {
2276 LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
2277 AC_FUNC_ATTR_READNONE);
2278 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2279 }
2280
2281 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
2282 {
2283 LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
2284 AC_FUNC_ATTR_READNONE);
2285 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2286 }
2287
2288 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2289 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2290 bool hi)
2291 {
2292 assert(bits == 8 || bits == 10 || bits == 16);
2293
2294 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2295 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2296 LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2297 LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2298
2299 /* Clamp. */
2300 if (bits != 16) {
2301 for (int i = 0; i < 2; i++) {
2302 bool alpha = hi && i == 1;
2303 args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2304 args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2305 }
2306 }
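/* Example: with bits == 10 and hi == true, the RGB sources are clamped to
 * [-512, 511] and the alpha source to [-2, 1], the representable range of
 * the 2-bit alpha channel of 2_10_10_10 formats. */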
2307
2308 LLVMValueRef res =
2309 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2310 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2311 }
2312
2313 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2314 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2315 bool hi)
2316 {
2317 assert(bits == 8 || bits == 10 || bits == 16);
2318
2319 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2320 LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2321
2322 /* Clamp. */
2323 if (bits != 16) {
2324 for (int i = 0; i < 2; i++) {
2325 bool alpha = hi && i == 1;
2326 args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2327 }
2328 }
2329
2330 LLVMValueRef res =
2331 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2332 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2333 }
2334
2335 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2336 {
2337 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
2338 }
2339
2340 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2341 {
2342 ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
2343 }
2344
2345 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
2346 LLVMValueRef width, bool is_signed)
2347 {
2348 LLVMValueRef args[] = {
2349 input,
2350 offset,
2351 width,
2352 };
2353
2354 return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",
2355 ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);
2356 }
2357
2358 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2359 LLVMValueRef s2)
2360 {
2361 return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2362 }
2363
2364 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2365 LLVMValueRef s2)
2366 {
2367 /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
2368 if (ctx->chip_class >= GFX10) {
2369 return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,
2370 AC_FUNC_ATTR_READNONE);
2371 }
2372
2373 return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2374 }
2375
2376 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2377 {
2378 if (!wait_flags)
2379 return;
2380
2381 unsigned lgkmcnt = 63;
2382 unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2383 unsigned vscnt = 63;
2384
2385 if (wait_flags & AC_WAIT_LGKM)
2386 lgkmcnt = 0;
2387 if (wait_flags & AC_WAIT_VLOAD)
2388 vmcnt = 0;
2389
2390 if (wait_flags & AC_WAIT_VSTORE) {
2391 if (ctx->chip_class >= GFX10)
2392 vscnt = 0;
2393 else
2394 vmcnt = 0;
2395 }
2396
2397 /* There is no intrinsic for vscnt(0), so use a fence. */
2398 if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
2399 vscnt == 0) {
2400 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2401 return;
2402 }
2403
2404 unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
2405 (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
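/* Example: AC_WAIT_LGKM alone on GFX9+ gives lgkmcnt = 0 and vmcnt = 63
 * (no VM wait), so simm16 = (0 << 8) | (7 << 4) | 0xf | (3 << 14) = 0xc07f,
 * i.e. s_waitcnt lgkmcnt(0). */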
2406
2407 LLVMValueRef args[1] = {
2408 LLVMConstInt(ctx->i32, simm16, false),
2409 };
2410 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
2411 }
2412
2413 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
2414 {
2415 LLVMTypeRef type;
2416 char *intr;
2417
2418 if (bitsize == 16) {
2419 intr = "llvm.amdgcn.fract.f16";
2420 type = ctx->f16;
2421 } else if (bitsize == 32) {
2422 intr = "llvm.amdgcn.fract.f32";
2423 type = ctx->f32;
2424 } else {
2425 intr = "llvm.amdgcn.fract.f64";
2426 type = ctx->f64;
2427 }
2428
2429 LLVMValueRef params[] = {
2430 src0,
2431 };
2432 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
2433 }
2434
2435 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2436 {
2437
2438 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2439 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2440 unsigned vec_size = LLVMGetVectorSize(type);
2441 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2442
2443 for (unsigned i = 0; i < vec_size; i++)
2444 scalars[i] = scalar;
2445 return LLVMConstVector(scalars, vec_size);
2446 }
2447 return LLVMConstInt(type, value, 0);
2448 }
2449
2450 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
2451 {
2452 LLVMTypeRef type = LLVMTypeOf(src0);
2453 LLVMValueRef val;
2454
2455 /* v_med3 is selected only when max is first. (LLVM bug?) */
2456 val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
2457 return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
2458 }
2459
2460 static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
2461 {
2462 ac_enable_signed_zeros(ctx);
2463 /* (val + 0) converts negative zero to positive zero. */
2464 val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
2465 ac_disable_signed_zeros(ctx);
2466 return val;
2467 }
2468
2469 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
2470 {
2471 LLVMTypeRef type = LLVMTypeOf(src);
2472 LLVMValueRef pos, neg, dw[2], val;
2473 unsigned bitsize = ac_get_elem_bits(ctx, type);
2474
2475 /* The standard version leads to this:
2476 * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004
2477 * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2
2478 * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880
2479 * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3
2480 *
2481 * The isign version:
2482 * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004
2483 * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304
2484 * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04
2485 *
2486 * (src0 + 0) converts negative zero to positive zero.
2487 * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
2488 *
2489 * For FP64, use the standard version, which doesn't suffer from the huge DP rate
2490 * reduction. (FP64 comparisons are as fast as int64 comparisons)
2491 */
2492 if (bitsize == 16 || bitsize == 32) {
2493 val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
2494 val = ac_build_isign(ctx, val);
2495 return LLVMBuildSIToFP(ctx->builder, val, type, "");
2496 }
2497
2498 assert(bitsize == 64);
2499 pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
2500 neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
2501 dw[0] = ctx->i32_0;
2502 dw[1] = LLVMBuildSelect(
2503 ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
2504 LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
2505 "");
2506 return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
2507 }
2508
2509 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2510 {
2511 LLVMValueRef result;
2512 unsigned bitsize;
2513
2514 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2515
2516 switch (bitsize) {
2517 case 128:
2518 result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
2519 AC_FUNC_ATTR_READNONE);
2520 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2521 break;
2522 case 64:
2523 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2524 AC_FUNC_ATTR_READNONE);
2525
2526 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2527 break;
2528 case 32:
2529 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2530 AC_FUNC_ATTR_READNONE);
2531 break;
2532 case 16:
2533 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2534 AC_FUNC_ATTR_READNONE);
2535
2536 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2537 break;
2538 case 8:
2539 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2540 AC_FUNC_ATTR_READNONE);
2541
2542 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2543 break;
2544 default:
2545 unreachable("invalid bitsize");
2546 break;
2547 }
2548
2549 return result;
2550 }
2551
2552 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
2553 {
2554 LLVMValueRef result;
2555 unsigned bitsize;
2556
2557 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2558
2559 switch (bitsize) {
2560 case 64:
2561 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2562 AC_FUNC_ATTR_READNONE);
2563
2564 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2565 break;
2566 case 32:
2567 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2568 AC_FUNC_ATTR_READNONE);
2569 break;
2570 case 16:
2571 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2572 AC_FUNC_ATTR_READNONE);
2573
2574 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2575 break;
2576 case 8:
2577 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2578 AC_FUNC_ATTR_READNONE);
2579
2580 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2581 break;
2582 default:
2583 unreachable("invalid bitsize");
2584 break;
2585 }
2586
2587 return result;
2588 }
2589
2590 #define AC_EXP_TARGET 0
2591 #define AC_EXP_ENABLED_CHANNELS 1
2592 #define AC_EXP_OUT0 2
2593
2594 enum ac_ir_type
2595 {
2596 AC_IR_UNDEF,
2597 AC_IR_CONST,
2598 AC_IR_VALUE,
2599 };
2600
2601 struct ac_vs_exp_chan {
2602 LLVMValueRef value;
2603 float const_float;
2604 enum ac_ir_type type;
2605 };
2606
2607 struct ac_vs_exp_inst {
2608 unsigned offset;
2609 LLVMValueRef inst;
2610 struct ac_vs_exp_chan chan[4];
2611 };
2612
2613 struct ac_vs_exports {
2614 unsigned num;
2615 struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
2616 };
2617
2618 /* Return true if the PARAM export has been eliminated. */
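/* Worked example: an export writing (0, 0, 0, 1) yields is_zero = {1,1,1,0}
 * and is_one = {0,0,0,1}, so DEFAULT_VAL 1 is chosen, the export instruction
 * is erased, and the SPI supplies (0, 0, 0, 1) for free. */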
2619 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
2620 struct ac_vs_exp_inst *exp)
2621 {
2622 unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2623 bool is_zero[4] = {}, is_one[4] = {};
2624
2625 for (i = 0; i < 4; i++) {
2626 /* It's a constant expression. Undef outputs are eliminated too. */
2627 if (exp->chan[i].type == AC_IR_UNDEF) {
2628 is_zero[i] = true;
2629 is_one[i] = true;
2630 } else if (exp->chan[i].type == AC_IR_CONST) {
2631 if (exp->chan[i].const_float == 0)
2632 is_zero[i] = true;
2633 else if (exp->chan[i].const_float == 1)
2634 is_one[i] = true;
2635 else
2636 return false; /* other constant */
2637 } else
2638 return false;
2639 }
2640
2641 /* Only certain combinations of 0 and 1 can be eliminated (DEFAULT_VAL: 0 = 0,0,0,0; 1 = 0,0,0,1; 2 = 1,1,1,0; 3 = 1,1,1,1). */
2642 if (is_zero[0] && is_zero[1] && is_zero[2])
2643 default_val = is_zero[3] ? 0 : 1;
2644 else if (is_one[0] && is_one[1] && is_one[2])
2645 default_val = is_zero[3] ? 2 : 3;
2646 else
2647 return false;
2648
2649 /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2650 LLVMInstructionEraseFromParent(exp->inst);
2651
2652 /* Change OFFSET to DEFAULT_VAL. */
2653 for (i = 0; i < num_outputs; i++) {
2654 if (vs_output_param_offset[i] == exp->offset) {
2655 vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2656 break;
2657 }
2658 }
2659 return true;
2660 }
2661
2662 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2663 uint8_t *vs_output_param_offset, uint32_t num_outputs,
2664 struct ac_vs_exports *processed,
2665 struct ac_vs_exp_inst *exp)
2666 {
2667 unsigned p, copy_back_channels = 0;
2668
2669 /* See if the output is already in the list of processed outputs.
2670 * The LLVMValueRef comparison relies on SSA.
2671 */
2672 for (p = 0; p < processed->num; p++) {
2673 bool different = false;
2674
2675 for (unsigned j = 0; j < 4; j++) {
2676 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2677 struct ac_vs_exp_chan *c2 = &exp->chan[j];
2678
2679 /* Treat undef as a match. */
2680 if (c2->type == AC_IR_UNDEF)
2681 continue;
2682
2683 /* If c1 is undef but c2 isn't, we can copy c2 to c1
2684 * and consider the instruction duplicated.
2685 */
2686 if (c1->type == AC_IR_UNDEF) {
2687 copy_back_channels |= 1 << j;
2688 continue;
2689 }
2690
2691 /* Test whether the channels are not equal. */
2692 if (c1->type != c2->type ||
2693 (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
2694 (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
2695 different = true;
2696 break;
2697 }
2698 }
2699 if (!different)
2700 break;
2701
2702 copy_back_channels = 0;
2703 }
2704 if (p == processed->num)
2705 return false;
2706
2707 /* If a match was found, but the matching export has undef where the new
2708 * one has a normal value, copy the normal value to the undef channel.
2709 */
2710 struct ac_vs_exp_inst *match = &processed->exp[p];
2711
2712 /* Get current enabled channels mask. */
2713 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2714 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2715
2716 while (copy_back_channels) {
2717 unsigned chan = u_bit_scan(&copy_back_channels);
2718
2719 assert(match->chan[chan].type == AC_IR_UNDEF);
2720 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
2721 match->chan[chan] = exp->chan[chan];
2722
2723 /* Update number of enabled channels because the original mask
2724 * is not always 0xf.
2725 */
2726 enabled_channels |= (1 << chan);
2727 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2728 LLVMConstInt(ctx->i32, enabled_channels, 0));
2729 }
2730
2731 /* The PARAM export is duplicated. Kill it. */
2732 LLVMInstructionEraseFromParent(exp->inst);
2733
2734 /* Change OFFSET to the matching export. */
2735 for (unsigned i = 0; i < num_outputs; i++) {
2736 if (vs_output_param_offset[i] == exp->offset) {
2737 vs_output_param_offset[i] = match->offset;
2738 break;
2739 }
2740 }
2741 return true;
2742 }
2743
2744 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
2745 uint8_t *vs_output_param_offset, uint32_t num_outputs,
2746 uint32_t skip_output_mask, uint8_t *num_param_exports)
2747 {
2748 LLVMBasicBlockRef bb;
2749 bool removed_any = false;
2750 struct ac_vs_exports exports;
2751
2752 exports.num = 0;
2753
2754 /* Process all LLVM instructions. */
2755 bb = LLVMGetFirstBasicBlock(main_fn);
2756 while (bb) {
2757 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
2758
2759 while (inst) {
2760 LLVMValueRef cur = inst;
2761 inst = LLVMGetNextInstruction(inst);
2762 struct ac_vs_exp_inst exp;
2763
2764 if (LLVMGetInstructionOpcode(cur) != LLVMCall)
2765 continue;
2766
2767 LLVMValueRef callee = ac_llvm_get_called_value(cur);
2768
2769 if (!ac_llvm_is_function(callee))
2770 continue;
2771
2772 const char *name = LLVMGetValueName(callee);
2773 unsigned num_args = LLVMCountParams(callee);
2774
2775 /* Check if this is an export instruction. */
2776 if ((num_args != 9 && num_args != 8) ||
2777 (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
2778 continue;
2779
2780 LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
2781 unsigned target = LLVMConstIntGetZExtValue(arg);
2782
2783 if (target < V_008DFC_SQ_EXP_PARAM)
2784 continue;
2785
2786 target -= V_008DFC_SQ_EXP_PARAM;
2787
2788 /* Parse the instruction. */
2789 memset(&exp, 0, sizeof(exp));
2790 exp.offset = target;
2791 exp.inst = cur;
2792
2793 for (unsigned i = 0; i < 4; i++) {
2794 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
2795
2796 exp.chan[i].value = v;
2797
2798 if (LLVMIsUndef(v)) {
2799 exp.chan[i].type = AC_IR_UNDEF;
2800 } else if (LLVMIsAConstantFP(v)) {
2801 LLVMBool loses_info;
2802 exp.chan[i].type = AC_IR_CONST;
2803 exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
2804 } else {
2805 exp.chan[i].type = AC_IR_VALUE;
2806 }
2807 }
2808
2809 /* Eliminate constant and duplicated PARAM exports. */
2810 if (!((1u << target) & skip_output_mask) &&
2811 (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
2812 ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
2813 &exp))) {
2814 removed_any = true;
2815 } else {
2816 exports.exp[exports.num++] = exp;
2817 }
2818 }
2819 bb = LLVMGetNextBasicBlock(bb);
2820 }
2821
2822 /* Remove holes in export memory due to removed PARAM exports.
2823 * This is done by renumbering all PARAM exports.
2824 */
2825 if (removed_any) {
2826 uint8_t old_offset[VARYING_SLOT_MAX];
2827 unsigned out, i;
2828
2829 /* Make a copy of the offsets. We need the old version while
2830 * we are modifying some of them. */
2831 memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));
2832
2833 for (i = 0; i < exports.num; i++) {
2834 unsigned offset = exports.exp[i].offset;
2835
2836 /* Update vs_output_param_offset. Multiple outputs can
2837 * have the same offset.
2838 */
2839 for (out = 0; out < num_outputs; out++) {
2840 if (old_offset[out] == offset)
2841 vs_output_param_offset[out] = i;
2842 }
2843
2844 /* Change the PARAM offset in the instruction. */
2845 LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
2846 LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
2847 }
2848 *num_param_exports = exports.num;
2849 }
2850 }
2851
2852 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2853 {
2854 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2855 ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2856 AC_FUNC_ATTR_CONVERGENT);
2857 }
2858
2859 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2860 {
2861 unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
2862 ctx->lds = LLVMBuildIntToPtr(
2863 ctx->builder, ctx->i32_0,
2864 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
2865 }
2866
2867 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
2868 {
2869 return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
2870 }
2871
2872 void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
2873 {
2874 value = ac_to_integer(ctx, value);
2875 ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
2876 }
2877
2878 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
2879 {
2880 unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2881 const char *intrin_name;
2882 LLVMTypeRef type;
2883 LLVMValueRef zero;
2884
2885 switch (src0_bitsize) {
2886 case 64:
2887 intrin_name = "llvm.cttz.i64";
2888 type = ctx->i64;
2889 zero = ctx->i64_0;
2890 break;
2891 case 32:
2892 intrin_name = "llvm.cttz.i32";
2893 type = ctx->i32;
2894 zero = ctx->i32_0;
2895 break;
2896 case 16:
2897 intrin_name = "llvm.cttz.i16";
2898 type = ctx->i16;
2899 zero = ctx->i16_0;
2900 break;
2901 case 8:
2902 intrin_name = "llvm.cttz.i8";
2903 type = ctx->i8;
2904 zero = ctx->i8_0;
2905 break;
2906 default:
2907 unreachable("invalid bitsize");
2908 }
2909
2910 LLVMValueRef params[2] = {
2911 src0,
2912
2913 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
2914 * add special code to check for x=0. The reason is that
2915 * the LLVM behavior for x=0 is different from what we
2916 * need here. However, LLVM also assumes that ffs(x) is
2917 * in [0, 31], but GLSL expects that ffs(0) = -1, so
2918 * a conditional assignment to handle 0 is still required.
2919 *
2920 * The hardware already implements the correct behavior.
2921 */
2922 ctx->i1true,
2923 };
2924
2925 LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
2926
2927 if (src0_bitsize == 64) {
2928 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
2929 } else if (src0_bitsize < 32) {
2930 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
2931 }
2932
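/* Worked example: src0 = 0b10100 returns 2, the index of the lowest set
 * bit; src0 = 0 takes the select below and returns -1, matching GLSL
 * findLSB(). */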
2933 /* TODO: We need an intrinsic to skip this conditional. */
2934 /* Check for zero: */
2935 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
2936 LLVMConstInt(ctx->i32, -1, 0), lsb, "");
2937 }
2938
2939 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
2940 {
2941 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
2942 }
2943
2944 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
2945 {
2946 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
2947 }
2948
2949 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2950 {
2951 if (ctx->flow->depth > 0)
2952 return &ctx->flow->stack[ctx->flow->depth - 1];
2953 return NULL;
2954 }
2955
2956 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2957 {
2958 for (unsigned i = ctx->flow->depth; i > 0; --i) {
2959 if (ctx->flow->stack[i - 1].loop_entry_block)
2960 return &ctx->flow->stack[i - 1];
2961 }
2962 return NULL;
2963 }
2964
2965 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2966 {
2967 struct ac_llvm_flow *flow;
2968
2969 if (ctx->flow->depth >= ctx->flow->depth_max) {
2970 unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2971
2972 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2973 ctx->flow->depth_max = new_max;
2974 }
2975
2976 flow = &ctx->flow->stack[ctx->flow->depth];
2977 ctx->flow->depth++;
2978
2979 flow->next_block = NULL;
2980 flow->loop_entry_block = NULL;
2981 return flow;
2982 }
2983
2984 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
2985 {
2986 char buf[32];
2987 snprintf(buf, sizeof(buf), "%s%d", base, label_id);
2988 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
2989 }
2990
2991 /* Append a basic block at the level of the parent flow.
2992 */
2993 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2994 {
2995 assert(ctx->flow->depth >= 1);
2996
2997 if (ctx->flow->depth >= 2) {
2998 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2999
3000 return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
3001 }
3002
3003 LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3004 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3005 }
3006
3007 /* Emit a branch to the given default target for the current block if
3008 * applicable -- that is, if the current block does not already contain a
3009 * branch from a break or continue.
3010 */
3011 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
3012 {
3013 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3014 LLVMBuildBr(builder, target);
3015 }
3016
3017 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3018 {
3019 struct ac_llvm_flow *flow = push_flow(ctx);
3020 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3021 flow->next_block = append_basic_block(ctx, "ENDLOOP");
3022 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3023 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3024 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3025 }
3026
3027 void ac_build_break(struct ac_llvm_context *ctx)
3028 {
3029 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3030 LLVMBuildBr(ctx->builder, flow->next_block);
3031 }
3032
3033 void ac_build_continue(struct ac_llvm_context *ctx)
3034 {
3035 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3036 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3037 }
3038
3039 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3040 {
3041 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3042 LLVMBasicBlockRef endif_block;
3043
3044 assert(!current_branch->loop_entry_block);
3045
3046 endif_block = append_basic_block(ctx, "ENDIF");
3047 emit_default_branch(ctx->builder, endif_block);
3048
3049 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3050 set_basicblock_name(current_branch->next_block, "else", label_id);
3051
3052 current_branch->next_block = endif_block;
3053 }
3054
3055 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3056 {
3057 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3058
3059 assert(!current_branch->loop_entry_block);
3060
3061 emit_default_branch(ctx->builder, current_branch->next_block);
3062 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3063 set_basicblock_name(current_branch->next_block, "endif", label_id);
3064
3065 ctx->flow->depth--;
3066 }
3067
3068 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3069 {
3070 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3071
3072 assert(current_loop->loop_entry_block);
3073
3074 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3075
3076 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3077 set_basicblock_name(current_loop->next_block, "endloop", label_id);
3078 ctx->flow->depth--;
3079 }
3080
3081 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3082 {
3083 struct ac_llvm_flow *flow = push_flow(ctx);
3084 LLVMBasicBlockRef if_block;
3085
3086 if_block = append_basic_block(ctx, "IF");
3087 flow->next_block = append_basic_block(ctx, "ELSE");
3088 set_basicblock_name(if_block, "if", label_id);
3089 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3090 LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3091 }
3092
3093 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value, int label_id)
3094 {
3095 LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE, value, ctx->f32_0, "");
3096 ac_build_ifcc(ctx, cond, label_id);
3097 }
3098
3099 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value, int label_id)
3100 {
3101 LLVMValueRef cond =
3102 LLVMBuildICmp(ctx->builder, LLVMIntNE, ac_to_integer(ctx, value), ctx->i32_0, "");
3103 ac_build_ifcc(ctx, cond, label_id);
3104 }
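
/* A minimal usage sketch of the structured-control-flow helpers above
 * (illustrative only; nothing in this file calls it). The helpers must be
 * used in properly nested pairs, and label_id merely names the emitted
 * basic blocks ("if9999", "else9999", "endif9999"). */
UNUSED static void ac_example_if_else(struct ac_llvm_context *ctx, LLVMValueRef cond,
                                      LLVMValueRef ptr, LLVMValueRef a, LLVMValueRef b)
{
   ac_build_ifcc(ctx, cond, 9999); /* branch on an i1 condition */
   LLVMBuildStore(ctx->builder, a, ptr); /* "then" side */
   ac_build_else(ctx, 9999);
   LLVMBuildStore(ctx->builder, b, ptr); /* "else" side */
   ac_build_endif(ctx, 9999); /* rejoin and pop the flow stack */
}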
3105
3106 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3107 {
3108 LLVMBuilderRef builder = ac->builder;
3109 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3110 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3111 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3112 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3113 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3114 LLVMValueRef res;
3115
3116 if (first_instr) {
3117 LLVMPositionBuilderBefore(first_builder, first_instr);
3118 } else {
3119 LLVMPositionBuilderAtEnd(first_builder, first_block);
3120 }
3121
3122 res = LLVMBuildAlloca(first_builder, type, name);
3123 LLVMDisposeBuilder(first_builder);
3124 return res;
3125 }
3126
3127 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3128 {
3129 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3130 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3131 return ptr;
3132 }
3133
3134 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
3135 {
3136 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3137 return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
3138 }
3139
3140 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
3141 {
3142 unsigned num_components = ac_get_llvm_num_components(value);
3143 if (count == num_components)
3144 return value;
3145
3146 LLVMValueRef masks[MAX2(count, 2)];
3147 masks[0] = ctx->i32_0;
3148 masks[1] = ctx->i32_1;
3149 for (unsigned i = 2; i < count; i++)
3150 masks[i] = LLVMConstInt(ctx->i32, i, false);
3151
3152 if (count == 1)
3153 return LLVMBuildExtractElement(ctx->builder, value, masks[0], "");
3154
3155 LLVMValueRef swizzle = LLVMConstVector(masks, count);
3156 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3157 }
3158
3159 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
3160 unsigned bitwidth)
3161 {
3162 LLVMValueRef value = param;
3163 if (rshift)
3164 value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(ctx->i32, rshift, false), "");
3165
3166 if (rshift + bitwidth < 32) {
3167 unsigned mask = (1 << bitwidth) - 1;
3168 value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(ctx->i32, mask, false), "");
3169 }
3170 return value;
3171 }
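
/* A scalar sketch of what ac_unpack_param emits (illustrative helper, not
 * driver code): extract 'bitwidth' bits of 'param' starting at bit 'rshift'.
 */
UNUSED static unsigned ac_unpack_param_scalar(unsigned param, unsigned rshift,
                                              unsigned bitwidth)
{
   unsigned value = param >> rshift;
   if (rshift + bitwidth < 32)
      value &= (1u << bitwidth) - 1;
   return value;
}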
3172
3173 /* Adjust the sample index according to FMASK.
3174 *
3175 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3176 * which is the identity mapping. Each nibble says which physical sample
3177 * should be fetched to get that sample.
3178 *
3179 * For example, 0x11111100 means there are only 2 samples stored and
3180 * the second sample covers 3/4 of the pixel. When reading samples 0
3181 * and 1, return physical sample 0 (determined by the first two 0s
3182 * in FMASK), otherwise return physical sample 1.
3183 *
3184 * The sample index should be adjusted as follows:
3185 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3186 */
3187 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3188 bool is_array_tex)
3189 {
3190 struct ac_image_args fmask_load = {};
3191 fmask_load.opcode = ac_image_load;
3192 fmask_load.resource = fmask;
3193 fmask_load.dmask = 0xf;
3194 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3195 fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3196
3197 fmask_load.coords[0] = addr[0];
3198 fmask_load.coords[1] = addr[1];
3199 if (is_array_tex)
3200 fmask_load.coords[2] = addr[2];
3201
3202 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3203 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3204
3205 /* Apply the formula. */
3206 unsigned sample_chan = is_array_tex ? 3 : 2;
3207 LLVMValueRef final_sample;
3208 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], LLVMConstInt(ac->i32, 4, 0), "");
3209 final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3210 /* Mask the sample index by 0x7, because 0x8 means an unknown value
3211 * with EQAA, so those will map to 0. */
3212 final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3213
3214 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3215 * resource descriptor is 0 (invalid).
3216 */
3217 LLVMValueRef tmp;
3218 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3219 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3220 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3221
3222 /* Replace the MSAA sample index. */
3223 addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], "");
3224 }
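
/* A scalar sketch of the remapping above (illustrative helper, not driver
 * code): apply the FMASK formula and the 0x7 EQAA mask to one sample index.
 */
UNUSED static unsigned ac_fmask_remap_scalar(uint32_t fmask, unsigned sample)
{
   unsigned physical = (fmask >> (sample * 4)) & 0xF;
   return physical & 0x7; /* 0x8 (unknown with EQAA) maps to 0 */
}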
3225
3226 static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
3227 LLVMValueRef lane, bool with_opt_barrier)
3228 {
3229 LLVMTypeRef type = LLVMTypeOf(src);
3230 LLVMValueRef result;
3231
3232 if (with_opt_barrier)
3233 ac_build_optimization_barrier(ctx, &src);
3234
3235 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3236 if (lane)
3237 lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");
3238
3239 result =
3240 ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3241 ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2,
3242 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3243
3244 return LLVMBuildTrunc(ctx->builder, result, type, "");
3245 }
3246
3247 static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
3248 LLVMValueRef lane, bool with_opt_barrier)
3249 {
3250 LLVMTypeRef src_type = LLVMTypeOf(src);
3251 src = ac_to_integer(ctx, src);
3252 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3253 LLVMValueRef ret;
3254
3255 if (bits > 32) {
3256 assert(bits % 32 == 0);
3257 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3258 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3259 ret = LLVMGetUndef(vec_type);
3260 for (unsigned i = 0; i < bits / 32; i++) {
3261 LLVMValueRef ret_comp;
3262
3263 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3264
3265 ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3266
3267 ret =
3268 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3269 }
3270 } else {
3271 ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
3272 }
3273
3274 if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
3275 return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
3276 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3277 }
3278
3279 /**
3280 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3281 *
3282 * The optimization barrier is not needed if the value is the same in all lanes
3283 * or if this is called in the outermost block.
3284 *
3285 * @param ctx - the LLVM context
3286 * @param src - the value to read from the lane
3287 * @param lane - id of the lane or NULL for the first active lane
3288 * @return value of the lane
3289 */
3290 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
3291 LLVMValueRef lane)
3292 {
3293 return ac_build_readlane_common(ctx, src, lane, false);
3294 }
3295
3296 LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3297 {
3298 return ac_build_readlane_common(ctx, src, lane, true);
3299 }
3300
3301 LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
3302 LLVMValueRef lane)
3303 {
3304 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3305 (LLVMValueRef[]){value, lane, src}, 3,
3306 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3307 }
3308
3309 LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3310 {
3311 if (ctx->wave_size == 32) {
3312 return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3313 (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3314 }
3315 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
3316 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
3317 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
3318 LLVMValueRef val =
3319 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3320 (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
3321 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
3322 2, AC_FUNC_ATTR_READNONE);
3323 return val;
3324 }
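
/* A scalar model of mbcnt (illustrative, not driver code): for lane 'tid',
 * count the set bits of 'mask' strictly below that lane.
 */
UNUSED static unsigned ac_mbcnt_scalar(uint64_t mask, unsigned tid)
{
   assert(tid < 64);
   return util_bitcount64(mask & (((uint64_t)1 << tid) - 1));
}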
3325
3326 enum dpp_ctrl
3327 {
3328 _dpp_quad_perm = 0x000,
3329 _dpp_row_sl = 0x100,
3330 _dpp_row_sr = 0x110,
3331 _dpp_row_rr = 0x120,
3332 dpp_wf_sl1 = 0x130,
3333 dpp_wf_rl1 = 0x134,
3334 dpp_wf_sr1 = 0x138,
3335 dpp_wf_rr1 = 0x13C,
3336 dpp_row_mirror = 0x140,
3337 dpp_row_half_mirror = 0x141,
3338 dpp_row_bcast15 = 0x142,
3339 dpp_row_bcast31 = 0x143
3340 };
3341
3342 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3343 unsigned lane3)
3344 {
3345 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3346 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3347 }
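
/* A scalar model of the encoding above (illustrative, based on the DPP
 * quad_perm semantics): within its quad, lane i reads from the lane selected
 * by the i-th 2-bit field of the control word.
 */
UNUSED static unsigned dpp_quad_perm_src_lane(unsigned ctrl, unsigned lane)
{
   unsigned quad_base = lane & ~0x3u;
   unsigned sel = (ctrl >> (2 * (lane & 0x3))) & 0x3;
   return quad_base + sel;
}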
3348
3349 static inline enum dpp_ctrl dpp_row_sl(unsigned amount)
3350 {
3351 assert(amount > 0 && amount < 16);
3352 return _dpp_row_sl | amount;
3353 }
3354
3355 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3356 {
3357 assert(amount > 0 && amount < 16);
3358 return _dpp_row_sr | amount;
3359 }
3360
3361 static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3362 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3363 bool bound_ctrl)
3364 {
3365 LLVMTypeRef type = LLVMTypeOf(src);
3366 LLVMValueRef res;
3367
3368 old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
3369 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3370
3371 res = ac_build_intrinsic(
3372 ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32,
3373 (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3374 LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0),
3375 LLVMConstInt(ctx->i1, bound_ctrl, 0)},
3376 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3377
3378 return LLVMBuildTrunc(ctx->builder, res, type, "");
3379 }
3380
3381 static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3382 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3383 bool bound_ctrl)
3384 {
3385 LLVMTypeRef src_type = LLVMTypeOf(src);
3386 src = ac_to_integer(ctx, src);
3387 old = ac_to_integer(ctx, old);
3388 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3389 LLVMValueRef ret;
3390 if (bits > 32) {
3391 assert(bits % 32 == 0);
3392 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3393 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3394 LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3395 ret = LLVMGetUndef(vec_type);
3396 for (unsigned i = 0; i < bits / 32; i++) {
3397 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3398 old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
3399 LLVMValueRef ret_comp =
3400 _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3401 ret =
3402 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3403 }
3404 } else {
3405 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
3406 }
3407 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3408 }
3409
3410 static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
3411 uint64_t sel, bool exchange_rows, bool bound_ctrl)
3412 {
3413 LLVMTypeRef type = LLVMTypeOf(src);
3414 LLVMValueRef result;
3415
3416 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3417
3418 LLVMValueRef args[6] = {
3419 src,
3420 src,
3421 LLVMConstInt(ctx->i32, sel, false),
3422 LLVMConstInt(ctx->i32, sel >> 32, false),
3423 ctx->i1true, /* fi (fetch inactive) */
3424 bound_ctrl ? ctx->i1true : ctx->i1false,
3425 };
3426
3427 result =
3428 ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
3429 ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3430
3431 return LLVMBuildTrunc(ctx->builder, result, type, "");
3432 }
3433
3434 static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3435 bool exchange_rows, bool bound_ctrl)
3436 {
3437 LLVMTypeRef src_type = LLVMTypeOf(src);
3438 src = ac_to_integer(ctx, src);
3439 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3440 LLVMValueRef ret;
3441 if (bits > 32) {
3442 assert(bits % 32 == 0);
3443 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3444 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3445 ret = LLVMGetUndef(vec_type);
3446 for (unsigned i = 0; i < bits / 32; i++) {
3447 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3448 LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3449 ret =
3450 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3451 }
3452 } else {
3453 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
3454 }
3455 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3456 }
3457
3458 static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
3459 {
3460 assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
3461 return and_mask | (or_mask << 5) | (xor_mask << 10);
3462 }
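
/* A scalar model of the bit-mode pattern (illustrative, based on the
 * ds_swizzle semantics): within each group of 32 lanes, lane i reads from
 * ((i & and_mask) | or_mask) ^ xor_mask.
 */
UNUSED static unsigned ds_bitmode_src_lane(unsigned lane, unsigned and_mask,
                                           unsigned or_mask, unsigned xor_mask)
{
   unsigned group_base = lane & ~0x1fu;
   return group_base + ((((lane & 0x1f) & and_mask) | or_mask) ^ xor_mask);
}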
3463
3464 static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
3465 unsigned mask)
3466 {
3467 LLVMTypeRef src_type = LLVMTypeOf(src);
3468 LLVMValueRef ret;
3469
3470 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3471
3472 ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
3473 (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,
3474 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3475
3476 return LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3477 }
3478
3479 LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3480 {
3481 LLVMTypeRef src_type = LLVMTypeOf(src);
3482 src = ac_to_integer(ctx, src);
3483 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3484 LLVMValueRef ret;
3485 if (bits > 32) {
3486 assert(bits % 32 == 0);
3487 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3488 LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3489 ret = LLVMGetUndef(vec_type);
3490 for (unsigned i = 0; i < bits / 32; i++) {
3491 src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
3492 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
3493 ret =
3494 LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
3495 }
3496 } else {
3497 ret = _ac_build_ds_swizzle(ctx, src, mask);
3498 }
3499 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3500 }
3501
3502 static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3503 {
3504 LLVMTypeRef src_type = LLVMTypeOf(src);
3505 unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3506 char name[32], type[8];
3507 LLVMValueRef ret;
3508
3509 src = ac_to_integer(ctx, src);
3510
3511 if (bitsize < 32)
3512 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3513
3514 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3515 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3516 ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
3517 AC_FUNC_ATTR_READNONE);
3518
3519 if (bitsize < 32)
3520 ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");
3521
3522 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3523 }
3524
3525 static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3526 LLVMValueRef inactive)
3527 {
3528 char name[33], type[8];
3529 LLVMTypeRef src_type = LLVMTypeOf(src);
3530 unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3531 src = ac_to_integer(ctx, src);
3532 inactive = ac_to_integer(ctx, inactive);
3533
3534 if (bitsize < 32) {
3535 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3536 inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
3537 }
3538
3539 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3540 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3541 LLVMValueRef ret =
3542 ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
3543 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3544 if (bitsize < 32)
3545 ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3546
3547 return ret;
3548 }
3549
3550 static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
3551 unsigned type_size)
3552 {
3553 if (type_size == 1) {
3554 switch (op) {
3555 case nir_op_iadd:
3556 return ctx->i8_0;
3557 case nir_op_imul:
3558 return ctx->i8_1;
3559 case nir_op_imin:
3560 return LLVMConstInt(ctx->i8, INT8_MAX, 0);
3561 case nir_op_umin:
3562 return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
3563 case nir_op_imax:
3564 return LLVMConstInt(ctx->i8, INT8_MIN, 0);
3565 case nir_op_umax:
3566 return ctx->i8_0;
3567 case nir_op_iand:
3568 return LLVMConstInt(ctx->i8, -1, 0);
3569 case nir_op_ior:
3570 return ctx->i8_0;
3571 case nir_op_ixor:
3572 return ctx->i8_0;
3573 default:
3574 unreachable("bad reduction intrinsic");
3575 }
3576 } else if (type_size == 2) {
3577 switch (op) {
3578 case nir_op_iadd:
3579 return ctx->i16_0;
3580 case nir_op_fadd:
3581 return ctx->f16_0;
3582 case nir_op_imul:
3583 return ctx->i16_1;
3584 case nir_op_fmul:
3585 return ctx->f16_1;
3586 case nir_op_imin:
3587 return LLVMConstInt(ctx->i16, INT16_MAX, 0);
3588 case nir_op_umin:
3589 return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
3590 case nir_op_fmin:
3591 return LLVMConstReal(ctx->f16, INFINITY);
3592 case nir_op_imax:
3593 return LLVMConstInt(ctx->i16, INT16_MIN, 0);
3594 case nir_op_umax:
3595 return ctx->i16_0;
3596 case nir_op_fmax:
3597 return LLVMConstReal(ctx->f16, -INFINITY);
3598 case nir_op_iand:
3599 return LLVMConstInt(ctx->i16, -1, 0);
3600 case nir_op_ior:
3601 return ctx->i16_0;
3602 case nir_op_ixor:
3603 return ctx->i16_0;
3604 default:
3605 unreachable("bad reduction intrinsic");
3606 }
3607 } else if (type_size == 4) {
3608 switch (op) {
3609 case nir_op_iadd:
3610 return ctx->i32_0;
3611 case nir_op_fadd:
3612 return ctx->f32_0;
3613 case nir_op_imul:
3614 return ctx->i32_1;
3615 case nir_op_fmul:
3616 return ctx->f32_1;
3617 case nir_op_imin:
3618 return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3619 case nir_op_umin:
3620 return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3621 case nir_op_fmin:
3622 return LLVMConstReal(ctx->f32, INFINITY);
3623 case nir_op_imax:
3624 return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3625 case nir_op_umax:
3626 return ctx->i32_0;
3627 case nir_op_fmax:
3628 return LLVMConstReal(ctx->f32, -INFINITY);
3629 case nir_op_iand:
3630 return LLVMConstInt(ctx->i32, -1, 0);
3631 case nir_op_ior:
3632 return ctx->i32_0;
3633 case nir_op_ixor:
3634 return ctx->i32_0;
3635 default:
3636 unreachable("bad reduction intrinsic");
3637 }
3638 } else { /* type_size == 8 (64-bit) */
3639 switch (op) {
3640 case nir_op_iadd:
3641 return ctx->i64_0;
3642 case nir_op_fadd:
3643 return ctx->f64_0;
3644 case nir_op_imul:
3645 return ctx->i64_1;
3646 case nir_op_fmul:
3647 return ctx->f64_1;
3648 case nir_op_imin:
3649 return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3650 case nir_op_umin:
3651 return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3652 case nir_op_fmin:
3653 return LLVMConstReal(ctx->f64, INFINITY);
3654 case nir_op_imax:
3655 return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3656 case nir_op_umax:
3657 return ctx->i64_0;
3658 case nir_op_fmax:
3659 return LLVMConstReal(ctx->f64, -INFINITY);
3660 case nir_op_iand:
3661 return LLVMConstInt(ctx->i64, -1, 0);
3662 case nir_op_ior:
3663 return ctx->i64_0;
3664 case nir_op_ixor:
3665 return ctx->i64_0;
3666 default:
3667 unreachable("bad reduction intrinsic");
3668 }
3669 }
3670 }
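
/* The identities above satisfy op(identity, x) == x for every x; a few
 * spot checks (illustrative, not driver code):
 */
UNUSED static void check_reduction_identities(int32_t x)
{
   assert(MIN2(INT32_MAX, x) == x); /* imin */
   assert(MAX2(INT32_MIN, x) == x); /* imax */
   assert((x + 0) == x);            /* iadd */
   assert((x | 0) == x);            /* ior */
   assert((x & -1) == x);           /* iand */
}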
3671
3672 static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
3673 nir_op op)
3674 {
3675 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3676 bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
3677 switch (op) {
3678 case nir_op_iadd:
3679 return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3680 case nir_op_fadd:
3681 return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3682 case nir_op_imul:
3683 return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3684 case nir_op_fmul:
3685 return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3686 case nir_op_imin:
3687 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3688 lhs, rhs, "");
3689 case nir_op_umin:
3690 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3691 lhs, rhs, "");
3692 case nir_op_fmin:
3693 return ac_build_intrinsic(
3694 ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
3695 _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3696 AC_FUNC_ATTR_READNONE);
3697 case nir_op_imax:
3698 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3699 lhs, rhs, "");
3700 case nir_op_umax:
3701 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3702 lhs, rhs, "");
3703 case nir_op_fmax:
3704 return ac_build_intrinsic(
3705 ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
3706 _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
3707 AC_FUNC_ATTR_READNONE);
3708 case nir_op_iand:
3709 return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3710 case nir_op_ior:
3711 return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3712 case nir_op_ixor:
3713 return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3714 default:
3715 unreachable("bad reduction intrinsic");
3716 }
3717 }
3718
3719 /**
3720 * \param src The value to shift.
3721 * \param identity The value to use for the first lane.
3722 * \param maxprefix specifies that the result only needs to be correct for a
3723 * prefix of this many threads
3724 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3725 */
3726 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3727 LLVMValueRef identity, unsigned maxprefix)
3728 {
3729 if (ctx->chip_class >= GFX10) {
3730 /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3731 LLVMValueRef active, tmp1, tmp2;
3732 LLVMValueRef tid = ac_get_thread_id(ctx);
3733
3734 tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3735
3736 tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3737
3738 if (maxprefix > 32) {
3739 active =
3740 LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3741
3742 tmp2 = LLVMBuildSelect(ctx->builder, active,
3743 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3744 tmp2, "");
3745
3746 active = LLVMBuildOr(
3747 ctx->builder, active,
3748 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3749 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3750 LLVMConstInt(ctx->i32, 0x10, false), ""),
3751 "");
3752 return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3753 } else if (maxprefix > 16) {
3754 active =
3755 LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3756
3757 return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3758 }
3759 } else if (ctx->chip_class >= GFX8) {
3760 return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3761 }
3762
3763 /* wavefront shift_right by 1 on SI/CI */
3764 LLVMValueRef active, tmp1, tmp2;
3765 LLVMValueRef tid = ac_get_thread_id(ctx);
3766 tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3767 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3768 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3769 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3770 LLVMConstInt(ctx->i32, 0x4, 0), "");
3771 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3772 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3773 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3774 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3775 LLVMConstInt(ctx->i32, 0x8, 0), "");
3776 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3777 tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3778 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3779 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3780 LLVMConstInt(ctx->i32, 0x10, 0), "");
3781 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3782 tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3783 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3784 tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3785 active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
3786 return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3787 }
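
/* A scalar model of the result (illustrative, not driver code): src moves
 * one lane up and 'identity' enters lane 0.
 */
UNUSED static void wf_shift_right_1_scalar(uint32_t *lanes, unsigned num_lanes,
                                           uint32_t identity)
{
   for (unsigned i = num_lanes; i-- > 1;)
      lanes[i] = lanes[i - 1];
   lanes[0] = identity;
}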
3788
3789 /**
3790 * \param maxprefix specifies that the result only needs to be correct for a
3791 * prefix of this many threads
3792 */
3793 static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3794 LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3795 {
3796 LLVMValueRef result, tmp;
3797
3798 if (!inclusive)
3799 src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3800
3801 result = src;
3802
3803 if (ctx->chip_class <= GFX7) {
3804 assert(maxprefix == 64);
3805 LLVMValueRef tid = ac_get_thread_id(ctx);
3806 LLVMValueRef active;
3807 tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3808 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3809 LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3810 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3811 result = ac_build_alu_op(ctx, result, tmp, op);
3812 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3813 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3814 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3815 ctx->i32_0, "");
3816 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3817 result = ac_build_alu_op(ctx, result, tmp, op);
3818 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3819 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3820 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3821 ctx->i32_0, "");
3822 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3823 result = ac_build_alu_op(ctx, result, tmp, op);
3824 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3825 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3826 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3827 ctx->i32_0, "");
3828 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3829 result = ac_build_alu_op(ctx, result, tmp, op);
3830 tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3831 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3832 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3833 ctx->i32_0, "");
3834 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3835 result = ac_build_alu_op(ctx, result, tmp, op);
3836 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
3837 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3838 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
3839 ctx->i32_0, "");
3840 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3841 result = ac_build_alu_op(ctx, result, tmp, op);
3842 return result;
3843 }
3844
3845 if (maxprefix <= 1)
3846 return result;
3847 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3848 result = ac_build_alu_op(ctx, result, tmp, op);
3849 if (maxprefix <= 2)
3850 return result;
3851 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3852 result = ac_build_alu_op(ctx, result, tmp, op);
3853 if (maxprefix <= 3)
3854 return result;
3855 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3856 result = ac_build_alu_op(ctx, result, tmp, op);
3857 if (maxprefix <= 4)
3858 return result;
3859 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3860 result = ac_build_alu_op(ctx, result, tmp, op);
3861 if (maxprefix <= 8)
3862 return result;
3863 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3864 result = ac_build_alu_op(ctx, result, tmp, op);
3865 if (maxprefix <= 16)
3866 return result;
3867
3868 if (ctx->chip_class >= GFX10) {
3869 LLVMValueRef tid = ac_get_thread_id(ctx);
3870 LLVMValueRef active;
3871
3872 tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
3873
3874 active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3875 LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
3876 ctx->i32_0, "");
3877
3878 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3879
3880 result = ac_build_alu_op(ctx, result, tmp, op);
3881
3882 if (maxprefix <= 32)
3883 return result;
3884
3885 tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3886
3887 active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
3888
3889 tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3890
3891 result = ac_build_alu_op(ctx, result, tmp, op);
3892 return result;
3893 }
3894
3895 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3896 result = ac_build_alu_op(ctx, result, tmp, op);
3897 if (maxprefix <= 32)
3898 return result;
3899 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3900 result = ac_build_alu_op(ctx, result, tmp, op);
3901 return result;
3902 }
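
/* A scalar model of the inclusive scan computed above (illustrative,
 * Hillis-Steele style with iadd as the example op; the DPP sequence uses
 * different step sizes but yields the same result[i] = op(src[0..i])).
 */
UNUSED static void inclusive_scan_scalar(uint32_t *lanes, unsigned num_lanes)
{
   for (unsigned shift = 1; shift < num_lanes; shift *= 2) {
      /* Walk downward so each read sees the previous round's value. */
      for (unsigned i = num_lanes; i-- > shift;)
         lanes[i] += lanes[i - shift];
   }
}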
3903
3904 LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3905 {
3906 LLVMValueRef result;
3907
3908 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3909 LLVMBuilderRef builder = ctx->builder;
3910 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3911 result = ac_build_ballot(ctx, src);
3912 result = ac_build_mbcnt(ctx, result);
3913 result = LLVMBuildAdd(builder, result, src, "");
3914 return result;
3915 }
3916
3917 ac_build_optimization_barrier(ctx, &src);
3918
3919 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3920 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3921 LLVMTypeOf(identity), "");
3922 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
3923
3924 return ac_build_wwm(ctx, result);
3925 }
3926
3927 LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3928 {
3929 LLVMValueRef result;
3930
3931 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
3932 LLVMBuilderRef builder = ctx->builder;
3933 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3934 result = ac_build_ballot(ctx, src);
3935 result = ac_build_mbcnt(ctx, result);
3936 return result;
3937 }
3938
3939 ac_build_optimization_barrier(ctx, &src);
3940
3941 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3942 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3943 LLVMTypeOf(identity), "");
3944 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
3945
3946 return ac_build_wwm(ctx, result);
3947 }
3948
3949 LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
3950 unsigned cluster_size)
3951 {
3952 if (cluster_size == 1)
3953 return src;
3954 ac_build_optimization_barrier(ctx, &src);
3955 LLVMValueRef result, swap;
3956 LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3957 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3958 LLVMTypeOf(identity), "");
3959 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
3960 result = ac_build_alu_op(ctx, result, swap, op);
3961 if (cluster_size == 2)
3962 return ac_build_wwm(ctx, result);
3963
3964 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
3965 result = ac_build_alu_op(ctx, result, swap, op);
3966 if (cluster_size == 4)
3967 return ac_build_wwm(ctx, result);
3968
3969 if (ctx->chip_class >= GFX8)
3970 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
3971 else
3972 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
3973 result = ac_build_alu_op(ctx, result, swap, op);
3974 if (cluster_size == 8)
3975 return ac_build_wwm(ctx, result);
3976
3977 if (ctx->chip_class >= GFX8)
3978 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
3979 else
3980 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
3981 result = ac_build_alu_op(ctx, result, swap, op);
3982 if (cluster_size == 16)
3983 return ac_build_wwm(ctx, result);
3984
3985 if (ctx->chip_class >= GFX10)
3986 swap = ac_build_permlane16(ctx, result, 0, true, false);
3987 else if (ctx->chip_class >= GFX8 && cluster_size != 32)
3988 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3989 else
3990 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
3991 result = ac_build_alu_op(ctx, result, swap, op);
3992 if (cluster_size == 32)
3993 return ac_build_wwm(ctx, result);
3994
3995 if (ctx->chip_class >= GFX8) {
3996 if (ctx->wave_size == 64) {
3997 if (ctx->chip_class >= GFX10)
3998 swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3999 else
4000 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4001 result = ac_build_alu_op(ctx, result, swap, op);
4002 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4003 }
4004
4005 return ac_build_wwm(ctx, result);
4006 } else {
4007 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4008 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4009 result = ac_build_alu_op(ctx, result, swap, op);
4010 return ac_build_wwm(ctx, result);
4011 }
4012 }
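
/* A scalar model of the cluster reduction above (illustrative, with iadd
 * as the example op): each round combines xor-partners, so every lane of a
 * cluster ends up holding the cluster's reduction.
 */
UNUSED static void cluster_reduce_scalar(uint32_t *lanes, unsigned num_lanes,
                                         unsigned cluster_size)
{
   uint32_t tmp[64];
   assert(num_lanes <= 64);
   for (unsigned s = 1; s < cluster_size; s *= 2) {
      for (unsigned i = 0; i < num_lanes; i++)
         tmp[i] = lanes[i] + lanes[i ^ s];
      memcpy(lanes, tmp, num_lanes * sizeof(*lanes));
   }
}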
4013
4014 /**
4015 * "Top half" of a scan that reduces per-wave values across an entire
4016 * workgroup.
4017 *
4018 * The source value must be present in the highest lane of the wave, and the
4019 * highest lane must be live.
4020 */
4021 void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4022 {
4023 if (ws->maxwaves <= 1)
4024 return;
4025
4026 const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4027 LLVMBuilderRef builder = ctx->builder;
4028 LLVMValueRef tid = ac_get_thread_id(ctx);
4029 LLVMValueRef tmp;
4030
4031 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4032 ac_build_ifcc(ctx, tmp, 1000);
4033 LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4034 ac_build_endif(ctx, 1000);
4035 }
4036
4037 /**
4038 * "Bottom half" of a scan that reduces per-wave values across an entire
4039 * workgroup.
4040 *
4041 * The caller must place a barrier between the top and bottom halves.
4042 */
4043 void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4044 {
4045 const LLVMTypeRef type = LLVMTypeOf(ws->src);
4046 const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4047
4048 if (ws->maxwaves <= 1) {
4049 ws->result_reduce = ws->src;
4050 ws->result_inclusive = ws->src;
4051 ws->result_exclusive = identity;
4052 return;
4053 }
4054 assert(ws->maxwaves <= 32);
4055
4056 LLVMBuilderRef builder = ctx->builder;
4057 LLVMValueRef tid = ac_get_thread_id(ctx);
4058 LLVMBasicBlockRef bbs[2];
4059 LLVMValueRef phivalues_scan[2];
4060 LLVMValueRef tmp, tmp2;
4061
4062 bbs[0] = LLVMGetInsertBlock(builder);
4063 phivalues_scan[0] = LLVMGetUndef(type);
4064
4065 if (ws->enable_reduce)
4066 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4067 else if (ws->enable_inclusive)
4068 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4069 else
4070 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4071 ac_build_ifcc(ctx, tmp, 1001);
4072 {
4073 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4074
4075 ac_build_optimization_barrier(ctx, &tmp);
4076
4077 bbs[1] = LLVMGetInsertBlock(builder);
4078 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4079 }
4080 ac_build_endif(ctx, 1001);
4081
4082 const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4083
4084 if (ws->enable_reduce) {
4085 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4086 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4087 }
4088 if (ws->enable_inclusive)
4089 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4090 if (ws->enable_exclusive) {
4091 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4092 tmp = ac_build_readlane(ctx, scan, tmp);
4093 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4094 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4095 }
4096 }
4097
4098 /**
4099 * Inclusive scan of a per-wave value across an entire workgroup.
4100 *
4101 * This implies an s_barrier instruction.
4102 *
4103 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
4104 * of the workgroup are live. (This requirement cannot easily be relaxed in a
4105 * useful manner because of the barrier in the algorithm.)
4106 */
4107 void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4108 {
4109 ac_build_wg_wavescan_top(ctx, ws);
4110 ac_build_s_barrier(ctx);
4111 ac_build_wg_wavescan_bottom(ctx, ws);
4112 }
4113
4114 /**
4115 * "Top half" of a scan that reduces per-thread values across an entire
4116 * workgroup.
4117 *
4118 * All lanes must be active when this code runs.
4119 */
4120 void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4121 {
4122 if (ws->enable_exclusive) {
4123 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4124 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4125 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4126 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4127 } else {
4128 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4129 }
4130
4131 bool enable_inclusive = ws->enable_inclusive;
4132 bool enable_exclusive = ws->enable_exclusive;
4133 ws->enable_inclusive = false;
4134 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4135 ac_build_wg_wavescan_top(ctx, ws);
4136 ws->enable_inclusive = enable_inclusive;
4137 ws->enable_exclusive = enable_exclusive;
4138 }
4139
4140 /**
4141 * "Bottom half" of a scan that reduces per-thread values across an entire
4142 * workgroup.
4143 *
4144 * The caller must place a barrier between the top and bottom halves.
4145 */
4146 void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4147 {
4148 bool enable_inclusive = ws->enable_inclusive;
4149 bool enable_exclusive = ws->enable_exclusive;
4150 ws->enable_inclusive = false;
4151 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4152 ac_build_wg_wavescan_bottom(ctx, ws);
4153 ws->enable_inclusive = enable_inclusive;
4154 ws->enable_exclusive = enable_exclusive;
4155
4156 /* ws->result_reduce is already the correct value */
4157 if (ws->enable_inclusive)
4158 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4159 if (ws->enable_exclusive)
4160 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4161 }
4162
4163 /**
4164 * A scan that reduces per-thread values across an entire workgroup.
4165 *
4166 * The caller must ensure that all lanes are active when this code runs
4167 * (WWM is insufficient!), because there is an implied barrier.
4168 */
4169 void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4170 {
4171 ac_build_wg_scan_top(ctx, ws);
4172 ac_build_s_barrier(ctx);
4173 ac_build_wg_scan_bottom(ctx, ws);
4174 }
4175
4176 LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
4177 unsigned lane1, unsigned lane2, unsigned lane3)
4178 {
4179 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4180 if (ctx->chip_class >= GFX8) {
4181 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4182 } else {
4183 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4184 }
4185 }
4186
4187 LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4188 {
4189 LLVMTypeRef type = LLVMTypeOf(src);
4190 LLVMValueRef result;
4191
4192 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4193 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
4194
4195 result =
4196 ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, (LLVMValueRef[]){index, src}, 2,
4197 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4198 return LLVMBuildTrunc(ctx->builder, result, type, "");
4199 }
4200
4201 LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4202 {
4203 LLVMTypeRef type;
4204 char *intr;
4205
4206 if (bitsize == 16) {
4207 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4208 type = ctx->i16;
4209 } else if (bitsize == 32) {
4210 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4211 type = ctx->i32;
4212 } else {
4213 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4214 type = ctx->i32;
4215 }
4216
4217 LLVMValueRef params[] = {
4218 src0,
4219 };
4220 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4221 }
4222 LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4223 {
4224 LLVMTypeRef type;
4225 char *intr;
4226
4227 if (bitsize == 16) {
4228 intr = "llvm.amdgcn.frexp.mant.f16";
4229 type = ctx->f16;
4230 } else if (bitsize == 32) {
4231 intr = "llvm.amdgcn.frexp.mant.f32";
4232 type = ctx->f32;
4233 } else {
4234 intr = "llvm.amdgcn.frexp.mant.f64";
4235 type = ctx->f64;
4236 }
4237
4238 LLVMValueRef params[] = {
4239 src0,
4240 };
4241 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4242 }
4243
4244 LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
4245 {
4246 LLVMTypeRef type;
4247 char *intr;
4248
4249 if (bitsize == 16) {
4250 intr = "llvm.canonicalize.f16";
4251 type = ctx->f16;
4252 } else if (bitsize == 32) {
4253 intr = "llvm.canonicalize.f32";
4254 type = ctx->f32;
4255 } else {
4256 intr = "llvm.canonicalize.f64";
4257 type = ctx->f64;
4258 }
4259
4260 LLVMValueRef params[] = {
4261 src0,
4262 };
4263 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
4264 }
4265
4266 /*
4267 * This takes an (I, J) coordinate pair
4268 * and works out the X and Y derivatives.
4269 * It returns DDX(I), DDX(J), DDY(I), DDY(J).
4270 */
4271 LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4272 {
4273 LLVMValueRef result[4], a;
4274 unsigned i;
4275
4276 for (i = 0; i < 2; i++) {
4277 a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
4278 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4279 result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4280 }
4281 return ac_build_gather_values(ctx, result, 4);
4282 }
4283
4284 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4285 {
4286 LLVMValueRef result =
4287 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4288 result = LLVMBuildNot(ctx->builder, result, "");
4289 return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4290 }
4291
4292 LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
4293 {
4294 if (!ctx->postponed_kill)
4295 return ac_build_load_helper_invocation(ctx);
4296
4297 /* !(exact && postponed) */
4298 LLVMValueRef exact =
4299 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4300
4301 LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
4302 LLVMValueRef result = LLVMBuildAnd(ctx->builder, exact, postponed, "");
4303
4304 return LLVMBuildSelect(ctx->builder, result, ctx->i32_0,
4305 LLVMConstInt(ctx->i32, 0xFFFFFFFF, false), "");
4306 }
4307
4308 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
4309 unsigned num_args)
4310 {
4311 LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4312 LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4313 return ret;
4314 }
4315
4316 void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
4317 LLVMValueRef samplemask, struct ac_export_args *args)
4318 {
4319 unsigned mask = 0;
4320 unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);
4321
4322 assert(depth || stencil || samplemask);
4323
4324 memset(args, 0, sizeof(*args));
4325
4326 args->valid_mask = 1; /* whether the EXEC mask is valid */
4327 args->done = 1; /* DONE bit */
4328
4329 /* Specify the target we are exporting */
4330 args->target = V_008DFC_SQ_EXP_MRTZ;
4331
4332 args->compr = 0; /* COMPR flag */
4333 args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
4334 args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
4335 args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
4336 args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */
4337
4338 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
4339 assert(!depth);
4340 args->compr = 1; /* COMPR flag */
4341
4342 if (stencil) {
4343 /* Stencil should be in X[23:16]. */
4344 stencil = ac_to_integer(ctx, stencil);
4345 stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
4346 args->out[0] = ac_to_float(ctx, stencil);
4347 mask |= 0x3;
4348 }
4349 if (samplemask) {
4350 /* SampleMask should be in Y[15:0]. */
4351 args->out[1] = samplemask;
4352 mask |= 0xc;
4353 }
4354 } else {
4355 if (depth) {
4356 args->out[0] = depth;
4357 mask |= 0x1;
4358 }
4359 if (stencil) {
4360 args->out[1] = stencil;
4361 mask |= 0x2;
4362 }
4363 if (samplemask) {
4364 args->out[2] = samplemask;
4365 mask |= 0x4;
4366 }
4367 }
4368
4369 /* GFX6 (except OLAND and HAINAN) has a bug where it only looks
4370 * at the X writemask component. */
4371 if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
4372 mask |= 0x1;
4373
4374 /* Specify which components to enable */
4375 args->enabled_channels = mask;
4376 }
4377
4378 /* Send GS Alloc Req message from the first wave of the group to SPI.
4379 * Message payload is:
4380 * - bits 0..10: vertices in group
4381 * - bits 12..22: primitives in group
4382 */
4383 void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
4384 LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
4385 {
4386 LLVMBuilderRef builder = ctx->builder;
4387 LLVMValueRef tmp;
4388 bool export_dummy_prim = false;
4389
4390 /* HW workaround for a GPU hang with 100% culling.
4391 * We always have to export at least 1 primitive.
4392 * Export a degenerate triangle using vertex 0 for all 3 vertices.
4393 */
4394 if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
4395 assert(vtx_cnt == ctx->i32_0);
4396 prim_cnt = ctx->i32_1;
4397 vtx_cnt = ctx->i32_1;
4398 export_dummy_prim = true;
4399 }
4400
4401 ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);
4402
4403 tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
4404 tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
4405 ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);
4406
4407 if (export_dummy_prim) {
4408 struct ac_ngg_prim prim = {};
4409 /* The vertex indices are 0,0,0. */
4410 prim.passthrough = ctx->i32_0;
4411
4412 struct ac_export_args pos = {};
4413 pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0;
4414 pos.target = V_008DFC_SQ_EXP_POS;
4415 pos.enabled_channels = 0xf;
4416 pos.done = true;
4417
4418 ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
4419 5021);
4420 ac_build_export_prim(ctx, &prim);
4421 ac_build_export(ctx, &pos);
4422 ac_build_endif(ctx, 5021);
4423 }
4424
4425 ac_build_endif(ctx, 5020);
4426 }
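
/* The message payload packing above, as plain arithmetic (illustrative
 * helper, not driver code):
 */
UNUSED static uint32_t gs_alloc_req_payload(uint32_t vtx_cnt, uint32_t prim_cnt)
{
   return (prim_cnt << 12) | vtx_cnt; /* prims in bits 12..22, verts in 0..10 */
}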
4427
4428 LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4429 {
4430 /* The prim export format is:
4431 * - bits 0..8: index 0
4432 * - bit 9: edge flag 0
4433 * - bits 10..18: index 1
4434 * - bit 19: edge flag 1
4435 * - bits 20..28: index 2
4436 * - bit 29: edge flag 2
4437 * - bit 31: null primitive (skip)
4438 */
4439 LLVMBuilderRef builder = ctx->builder;
4440 LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4441 LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4442
4443 for (unsigned i = 0; i < prim->num_vertices; ++i) {
4444 tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4445 result = LLVMBuildOr(builder, result, tmp, "");
4446 tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, "");
4447 tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), "");
4448 result = LLVMBuildOr(builder, result, tmp, "");
4449 }
4450 return result;
4451 }
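
/* A scalar model of the packing above for a triangle (illustrative, not
 * driver code):
 */
UNUSED static uint32_t pack_prim_export_scalar(bool isnull, const unsigned index[3],
                                               const bool edgeflag[3])
{
   uint32_t result = (uint32_t)isnull << 31;
   for (unsigned i = 0; i < 3; i++) {
      result |= (index[i] & 0x1ff) << (10 * i);
      result |= (uint32_t)edgeflag[i] << (10 * i + 9);
   }
   return result;
}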
4452
4453 void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4454 {
4455 struct ac_export_args args;
4456
4457 if (prim->passthrough) {
4458 args.out[0] = prim->passthrough;
4459 } else {
4460 args.out[0] = ac_pack_prim_export(ctx, prim);
4461 }
4462
4463 args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4464 args.out[1] = LLVMGetUndef(ctx->f32);
4465 args.out[2] = LLVMGetUndef(ctx->f32);
4466 args.out[3] = LLVMGetUndef(ctx->f32);
4467
4468 args.target = V_008DFC_SQ_EXP_PRIM;
4469 args.enabled_channels = 1;
4470 args.done = true;
4471 args.valid_mask = false;
4472 args.compr = false;
4473
4474 ac_build_export(ctx, &args);
4475 }
4476
4477 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4478 {
4479 if (type == AC_ARG_FLOAT) {
4480 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4481 } else if (type == AC_ARG_INT) {
4482 return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4483 } else {
4484 LLVMTypeRef ptr_type;
4485 switch (type) {
4486 case AC_ARG_CONST_PTR:
4487 ptr_type = ctx->i8;
4488 break;
4489 case AC_ARG_CONST_FLOAT_PTR:
4490 ptr_type = ctx->f32;
4491 break;
4492 case AC_ARG_CONST_PTR_PTR:
4493 ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4494 break;
4495 case AC_ARG_CONST_DESC_PTR:
4496 ptr_type = ctx->v4i32;
4497 break;
4498 case AC_ARG_CONST_IMAGE_PTR:
4499 ptr_type = ctx->v8i32;
4500 break;
4501 default:
4502 unreachable("unknown arg type");
4503 }
4504 if (size == 1) {
4505 return ac_array_in_const32_addr_space(ptr_type);
4506 } else {
4507 assert(size == 2);
4508 return ac_array_in_const_addr_space(ptr_type);
4509 }
4510 }
4511 }
4512
4513 LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
4514 enum ac_llvm_calling_convention convention, const char *name,
4515 LLVMTypeRef ret_type, LLVMModuleRef module)
4516 {
4517 LLVMTypeRef arg_types[AC_MAX_ARGS];
4518
4519 for (unsigned i = 0; i < args->arg_count; i++) {
4520 arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
4521 }
4522
4523 LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);
4524
4525 LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
4526 LLVMBasicBlockRef main_function_body =
4527 LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
4528 LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);
4529
4530 LLVMSetFunctionCallConv(main_function, convention);
4531 for (unsigned i = 0; i < args->arg_count; ++i) {
4532 LLVMValueRef P = LLVMGetParam(main_function, i);
4533
4534 if (args->args[i].file != AC_ARG_SGPR)
4535 continue;
4536
4537 ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);
4538
4539 if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
4540 ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
4541 ac_add_attr_dereferenceable(P, UINT64_MAX);
4542 ac_add_attr_alignment(P, 32);
4543 }
4544 }
4545
4546 ctx->main_function = main_function;
4547
4548 if (LLVM_VERSION_MAJOR >= 11) {
4549 /* Enable denormals for FP16 and FP64: */
4550 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
4551 /* Disable denormals for FP32: */
4552 LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
4553 "preserve-sign,preserve-sign");
4554 }
4555 return main_function;
4556 }
4557
4558 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4559 {
4560 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4561 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4562 LLVMBuildCall(ctx->builder, code, NULL, 0, "");
4563 }
4564
4565 LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef index)
4566 {
4567 LLVMBuilderRef builder = ctx->builder;
4568 LLVMTypeRef type = LLVMTypeOf(mask);
4569
4570 LLVMValueRef bit =
4571 LLVMBuildShl(builder, LLVMConstInt(type, 1, 0), LLVMBuildZExt(builder, index, type, ""), "");
4572 LLVMValueRef prefix_bits = LLVMBuildSub(builder, bit, LLVMConstInt(type, 1, 0), "");
4573 LLVMValueRef prefix_mask = LLVMBuildAnd(builder, mask, prefix_bits, "");
4574 return ac_build_bit_count(ctx, prefix_mask);
4575 }
4576
4577 /* Compute the prefix sum of the "mask" bit array with 128 elements (bits). */
4578 LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, LLVMValueRef mask[2],
4579 LLVMValueRef index)
4580 {
4581 LLVMBuilderRef builder = ctx->builder;
4582 #if 0
4583 /* Reference version using i128. */
4584 LLVMValueRef input_mask =
4585 LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, "");
4586
4587 return ac_prefix_bitcount(ctx, input_mask, index);
4588 #else
4589 /* Optimized version using 2 64-bit masks. */
4590 LLVMValueRef is_hi, is_0, c64, c128, all_bits;
4591 LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2];
4592
4593 /* Compute the 128-bit prefix mask. */
4594 c64 = LLVMConstInt(ctx->i32, 64, 0);
4595 c128 = LLVMConstInt(ctx->i32, 128, 0);
4596 all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0);
4597 /* The first index that can have non-zero high bits in the prefix mask is 65. */
4598 is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, "");
4599 is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, "");
4600 mask_bcnt0 = ac_build_bit_count(ctx, mask[0]);
4601
4602 for (unsigned i = 0; i < 2; i++) {
4603 shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, "");
4604 /* For i==0, index==0, the right shift by 64 doesn't give the desired result,
4605 * so we handle it by the is_0 select.
4606 * For i==1, index==64, same story, so we handle it by the last is_hi select.
4607 * For i==0, index==64, we shift by 0, which is what we want.
4608 */
4609 prefix_mask[i] =
4610 LLVMBuildLShr(builder, all_bits, LLVMBuildZExt(builder, shift[i], ctx->i64, ""), "");
4611 prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], "");
4612 prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]);
4613 }
4614
4615 prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], "");
4616 prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], "");
4617 prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, "");
4618
4619 return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], "");
4620 #endif
4621 }
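
/* A scalar reference of the same computation (illustrative, not driver
 * code): count the set bits of the 128-bit mask strictly below 'index'.
 */
UNUSED static unsigned prefix_bitcount_2x64_scalar(uint64_t mask_lo, uint64_t mask_hi,
                                                   unsigned index)
{
   if (index == 0)
      return 0;
   if (index <= 64)
      return util_bitcount64(mask_lo & (UINT64_MAX >> (64 - index)));
   return util_bitcount64(mask_lo) +
          util_bitcount64(mask_hi & (UINT64_MAX >> (128 - index)));
}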
4622
4623 /**
4624 * Convert triangle strip indices to triangle indices. This is used to decompose
4625 * triangle strips into triangles.
4626 */
4627 void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
4628 LLVMValueRef flatshade_first,
4629 LLVMValueRef index[3])
4630 {
4631 LLVMBuilderRef builder = ctx->builder;
4632 LLVMValueRef out[3];
4633
4634 /* We need to change the vertex order for odd triangles to get correct
4635 * front/back facing by swapping 2 vertex indices, but we also have to
4636 * keep the provoking vertex in the same place.
4637 *
4638 * If the first vertex is provoking, swap index 1 and 2.
4639 * If the last vertex is provoking, swap index 0 and 1.
4640 */
4641 out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
4642 LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
4643 out[1] = LLVMBuildSelect(builder, flatshade_first,
4644 LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
4645 LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
4646 out[2] = LLVMBuildSelect(builder, flatshade_first,
4647 LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
4648 memcpy(index, out, sizeof(out));
4649 }
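
/* A scalar model of the swap rules above (illustrative, not driver code):
 */
UNUSED static void strip_indices_to_triangle_scalar(bool is_odd, bool flatshade_first,
                                                    unsigned index[3])
{
   if (!is_odd)
      return;
   if (flatshade_first) {
      unsigned tmp = index[1]; /* first vertex provoking: swap 1 and 2 */
      index[1] = index[2];
      index[2] = tmp;
   } else {
      unsigned tmp = index[0]; /* last vertex provoking: swap 0 and 1 */
      index[0] = index[1];
      index[1] = tmp;
   }
}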