amd/llvm: switch to 3-spaces style
[mesa.git] / src / amd / llvm / ac_llvm_build.c
1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sub license, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18 * USE OR OTHER DEALINGS IN THE SOFTWARE.
19 *
20 * The above copyright notice and this permission notice (including the
21 * next paragraph) shall be included in all copies or substantial portions
22 * of the Software.
23 *
24 */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27
28 #include "ac_exp_param.h"
29 #include "ac_llvm_util.h"
30 #include "ac_shader_util.h"
31 #include "c11/threads.h"
32 #include "shader_enums.h"
33 #include "sid.h"
34 #include "util/bitscan.h"
35 #include "util/macros.h"
36 #include "util/u_atomic.h"
37 #include "util/u_math.h"
38 #include <llvm-c/Core.h>
39 #include <llvm/Config/llvm-config.h>
40
41 #include <assert.h>
42 #include <stdio.h>
43
44 #define AC_LLVM_INITIAL_CF_DEPTH 4
45
46 /* Data for if/else/endif and bgnloop/endloop control flow structures.
47 */
48 struct ac_llvm_flow {
49 /* Loop exit or next part of if/else/endif. */
50 LLVMBasicBlockRef next_block;
51 LLVMBasicBlockRef loop_entry_block;
52 };
53
54 /* Initialize module-independent parts of the context.
55 *
56 * The caller is responsible for initializing ctx::module and ctx::builder.
57 */
58 void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
59 enum chip_class chip_class, enum radeon_family family,
60 enum ac_float_mode float_mode, unsigned wave_size,
61 unsigned ballot_mask_bits)
62 {
63 ctx->context = LLVMContextCreate();
64
65 ctx->chip_class = chip_class;
66 ctx->family = family;
67 ctx->wave_size = wave_size;
68 ctx->ballot_mask_bits = ballot_mask_bits;
69 ctx->float_mode = float_mode;
70 ctx->module =
71 ac_create_module(wave_size == 32 ? compiler->tm_wave32 : compiler->tm, ctx->context);
72 ctx->builder = ac_create_builder(ctx->context, float_mode);
73
74 ctx->voidt = LLVMVoidTypeInContext(ctx->context);
75 ctx->i1 = LLVMInt1TypeInContext(ctx->context);
76 ctx->i8 = LLVMInt8TypeInContext(ctx->context);
77 ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
78 ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
79 ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
80 ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
81 ctx->intptr = ctx->i32;
82 ctx->f16 = LLVMHalfTypeInContext(ctx->context);
83 ctx->f32 = LLVMFloatTypeInContext(ctx->context);
84 ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
85 ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
86 ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
87 ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
88 ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
89 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
90 ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
91 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
92 ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
93 ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
94 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
95 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
96 ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
97 ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
98
99 ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
100 ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
101 ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
102 ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
103 ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
104 ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
105 ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
106 ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
107 ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
108 ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
109 ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
110 ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
111 ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
112 ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
113 ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
114 ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
115
116 ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
117 ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
118
119 ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
120
121 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
122
123 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
124
125 ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
126 ctx->flow = calloc(1, sizeof(*ctx->flow));
127 }
128
129 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
130 {
131 free(ctx->flow->stack);
132 free(ctx->flow);
133 ctx->flow = NULL;
134 }
135
136 int ac_get_llvm_num_components(LLVMValueRef value)
137 {
138 LLVMTypeRef type = LLVMTypeOf(value);
139 unsigned num_components =
140 LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetVectorSize(type) : 1;
141 return num_components;
142 }
143
144 LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
145 {
146 if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
147 assert(index == 0);
148 return value;
149 }
150
151 return LLVMBuildExtractElement(ac->builder, value, LLVMConstInt(ac->i32, index, false), "");
152 }
153
154 int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
155 {
156 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
157 type = LLVMGetElementType(type);
158
159 if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
160 return LLVMGetIntTypeWidth(type);
161
162 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
163 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
164 return 32;
165 }
166
167 if (type == ctx->f16)
168 return 16;
169 if (type == ctx->f32)
170 return 32;
171 if (type == ctx->f64)
172 return 64;
173
174 unreachable("Unhandled type kind in get_elem_bits");
175 }
176
177 unsigned ac_get_type_size(LLVMTypeRef type)
178 {
179 LLVMTypeKind kind = LLVMGetTypeKind(type);
180
181 switch (kind) {
182 case LLVMIntegerTypeKind:
183 return LLVMGetIntTypeWidth(type) / 8;
184 case LLVMHalfTypeKind:
185 return 2;
186 case LLVMFloatTypeKind:
187 return 4;
188 case LLVMDoubleTypeKind:
189 return 8;
190 case LLVMPointerTypeKind:
191 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
192 return 4;
193 return 8;
194 case LLVMVectorTypeKind:
195 return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
196 case LLVMArrayTypeKind:
197 return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
198 default:
199 assert(0);
200 return 0;
201 }
202 }
203
204 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
205 {
206 if (t == ctx->i8)
207 return ctx->i8;
208 else if (t == ctx->f16 || t == ctx->i16)
209 return ctx->i16;
210 else if (t == ctx->f32 || t == ctx->i32)
211 return ctx->i32;
212 else if (t == ctx->f64 || t == ctx->i64)
213 return ctx->i64;
214 else
215 unreachable("Unhandled integer size");
216 }
217
218 LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
219 {
220 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
221 LLVMTypeRef elem_type = LLVMGetElementType(t);
222 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
223 }
224 if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
225 switch (LLVMGetPointerAddressSpace(t)) {
226 case AC_ADDR_SPACE_GLOBAL:
227 return ctx->i64;
228 case AC_ADDR_SPACE_CONST_32BIT:
229 case AC_ADDR_SPACE_LDS:
230 return ctx->i32;
231 default:
232 unreachable("unhandled address space");
233 }
234 }
235 return to_integer_type_scalar(ctx, t);
236 }
237
238 LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
239 {
240 LLVMTypeRef type = LLVMTypeOf(v);
241 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
242 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
243 }
244 return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
245 }
246
247 LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
248 {
249 LLVMTypeRef type = LLVMTypeOf(v);
250 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
251 return v;
252 return ac_to_integer(ctx, v);
253 }
254
255 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
256 {
257 if (t == ctx->i8)
258 return ctx->i8;
259 else if (t == ctx->i16 || t == ctx->f16)
260 return ctx->f16;
261 else if (t == ctx->i32 || t == ctx->f32)
262 return ctx->f32;
263 else if (t == ctx->i64 || t == ctx->f64)
264 return ctx->f64;
265 else
266 unreachable("Unhandled float size");
267 }
268
269 LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
270 {
271 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
272 LLVMTypeRef elem_type = LLVMGetElementType(t);
273 return LLVMVectorType(to_float_type_scalar(ctx, elem_type), LLVMGetVectorSize(t));
274 }
275 return to_float_type_scalar(ctx, t);
276 }
277
278 LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
279 {
280 LLVMTypeRef type = LLVMTypeOf(v);
281 return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
282 }
283
284 LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
285 LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
286 unsigned attrib_mask)
287 {
288 LLVMValueRef function, call;
289 bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
290
291 function = LLVMGetNamedFunction(ctx->module, name);
292 if (!function) {
293 LLVMTypeRef param_types[32], function_type;
294 unsigned i;
295
296 assert(param_count <= 32);
297
298 for (i = 0; i < param_count; ++i) {
299 assert(params[i]);
300 param_types[i] = LLVMTypeOf(params[i]);
301 }
302 function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
303 function = LLVMAddFunction(ctx->module, name, function_type);
304
305 LLVMSetFunctionCallConv(function, LLVMCCallConv);
306 LLVMSetLinkage(function, LLVMExternalLinkage);
307
308 if (!set_callsite_attrs)
309 ac_add_func_attributes(ctx->context, function, attrib_mask);
310 }
311
312 call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
313 if (set_callsite_attrs)
314 ac_add_func_attributes(ctx->context, call, attrib_mask);
315 return call;
316 }
317
318 /**
319 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
320 * intrinsic names).
321 */
322 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
323 {
324 LLVMTypeRef elem_type = type;
325
326 assert(bufsize >= 8);
327
328 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
329 int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
330 if (ret < 0) {
331 char *type_name = LLVMPrintTypeToString(type);
332 fprintf(stderr, "Error building type name for: %s\n", type_name);
333 LLVMDisposeMessage(type_name);
334 return;
335 }
336 elem_type = LLVMGetElementType(type);
337 buf += ret;
338 bufsize -= ret;
339 }
340 switch (LLVMGetTypeKind(elem_type)) {
341 default:
342 break;
343 case LLVMIntegerTypeKind:
344 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
345 break;
346 case LLVMHalfTypeKind:
347 snprintf(buf, bufsize, "f16");
348 break;
349 case LLVMFloatTypeKind:
350 snprintf(buf, bufsize, "f32");
351 break;
352 case LLVMDoubleTypeKind:
353 snprintf(buf, bufsize, "f64");
354 break;
355 }
356 }
357
358 /**
359 * Helper function that builds an LLVM IR PHI node and immediately adds
360 * incoming edges.
361 */
362 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
363 LLVMValueRef *values, LLVMBasicBlockRef *blocks)
364 {
365 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
366 LLVMAddIncoming(phi, values, blocks, count_incoming);
367 return phi;
368 }
369
370 void ac_build_s_barrier(struct ac_llvm_context *ctx)
371 {
372 ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
373 }
374
375 /* Prevent optimizations (at least of memory accesses) across the current
376 * point in the program by emitting empty inline assembly that is marked as
377 * having side effects.
378 *
379 * Optionally, a value can be passed through the inline assembly to prevent
380 * LLVM from hoisting calls to ReadNone functions.
381 */
382 void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr)
383 {
384 static int counter = 0;
385
386 LLVMBuilderRef builder = ctx->builder;
387 char code[16];
388
389 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
390
391 if (!pvgpr) {
392 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
393 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
394 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
395 } else {
396 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
397 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
398 LLVMTypeRef type = LLVMTypeOf(*pvgpr);
399 unsigned bitsize = ac_get_elem_bits(ctx, type);
400 LLVMValueRef vgpr = *pvgpr;
401 LLVMTypeRef vgpr_type;
402 unsigned vgpr_size;
403 LLVMValueRef vgpr0;
404
405 if (bitsize < 32)
406 vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");
407
408 vgpr_type = LLVMTypeOf(vgpr);
409 vgpr_size = ac_get_type_size(vgpr_type);
410
411 assert(vgpr_size % 4 == 0);
412
413 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
414 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
415 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
416 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
417 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
418
419 if (bitsize < 32)
420 vgpr = LLVMBuildTrunc(builder, vgpr, type, "");
421
422 *pvgpr = vgpr;
423 }
424 }
425
426 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
427 {
428 const char *name =
429 scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : "llvm.amdgcn.s.memtime";
430 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
431 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
432 }
433
434 LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
435 {
436 const char *name;
437
438 if (LLVM_VERSION_MAJOR >= 9) {
439 if (ctx->wave_size == 64)
440 name = "llvm.amdgcn.icmp.i64.i32";
441 else
442 name = "llvm.amdgcn.icmp.i32.i32";
443 } else {
444 name = "llvm.amdgcn.icmp.i32";
445 }
446 LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};
447
448 /* We currently have no other way to prevent LLVM from lifting the icmp
449 * calls to a dominating basic block.
450 */
451 ac_build_optimization_barrier(ctx, &args[0]);
452
453 args[0] = ac_to_integer(ctx, args[0]);
454
455 return ac_build_intrinsic(
456 ctx, name, ctx->iN_wavemask, args, 3,
457 AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
458 }
459
460 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
461 {
462 const char *name;
463
464 if (LLVM_VERSION_MAJOR >= 9) {
465 if (ctx->wave_size == 64)
466 name = "llvm.amdgcn.icmp.i64.i1";
467 else
468 name = "llvm.amdgcn.icmp.i32.i1";
469 } else {
470 name = "llvm.amdgcn.icmp.i1";
471 }
472 LLVMValueRef args[3] = {
473 value,
474 ctx->i1false,
475 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
476 };
477
478 return ac_build_intrinsic(
479 ctx, name, ctx->iN_wavemask, args, 3,
480 AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
481 }
482
483 LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
484 {
485 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
486 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
487 return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
488 }
489
490 LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
491 {
492 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
493 return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0),
494 "");
495 }
496
497 LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
498 {
499 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
500 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
501
502 LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
503 LLVMValueRef none =
504 LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
505 return LLVMBuildOr(ctx->builder, all, none, "");
506 }
507
508 LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
509 unsigned value_count, unsigned component)
510 {
511 LLVMValueRef vec = NULL;
512
513 if (value_count == 1) {
514 return values[component];
515 } else if (!value_count)
516 unreachable("value_count is 0");
517
518 for (unsigned i = component; i < value_count + component; i++) {
519 LLVMValueRef value = values[i];
520
521 if (i == component)
522 vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
523 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
524 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
525 }
526 return vec;
527 }
528
529 LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
530 unsigned value_count, unsigned value_stride, bool load,
531 bool always_vector)
532 {
533 LLVMBuilderRef builder = ctx->builder;
534 LLVMValueRef vec = NULL;
535 unsigned i;
536
537 if (value_count == 1 && !always_vector) {
538 if (load)
539 return LLVMBuildLoad(builder, values[0], "");
540 return values[0];
541 } else if (!value_count)
542 unreachable("value_count is 0");
543
544 for (i = 0; i < value_count; i++) {
545 LLVMValueRef value = values[i * value_stride];
546 if (load)
547 value = LLVMBuildLoad(builder, value, "");
548
549 if (!i)
550 vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
551 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
552 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
553 }
554 return vec;
555 }
556
557 LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
558 unsigned value_count)
559 {
560 return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
561 }
562
563 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
564 * channels with undef. Extract at most src_channels components from the input.
565 */
566 static LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
567 unsigned src_channels, unsigned dst_channels)
568 {
569 LLVMTypeRef elemtype;
570 LLVMValueRef chan[dst_channels];
571
572 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
573 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
574
575 if (src_channels == dst_channels && vec_size == dst_channels)
576 return value;
577
578 src_channels = MIN2(src_channels, vec_size);
579
580 for (unsigned i = 0; i < src_channels; i++)
581 chan[i] = ac_llvm_extract_elem(ctx, value, i);
582
583 elemtype = LLVMGetElementType(LLVMTypeOf(value));
584 } else {
585 if (src_channels) {
586 assert(src_channels == 1);
587 chan[0] = value;
588 }
589 elemtype = LLVMTypeOf(value);
590 }
591
592 for (unsigned i = src_channels; i < dst_channels; i++)
593 chan[i] = LLVMGetUndef(elemtype);
594
595 return ac_build_gather_values(ctx, chan, dst_channels);
596 }
597
598 /* Extract components [start, start + channels) from a vector.
599 */
600 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
601 unsigned channels)
602 {
603 LLVMValueRef chan[channels];
604
605 for (unsigned i = 0; i < channels; i++)
606 chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
607
608 return ac_build_gather_values(ctx, chan, channels);
609 }
610
611 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
612 * with undef. Extract at most num_channels components from the input.
613 */
614 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
615 unsigned num_channels)
616 {
617 return ac_build_expand(ctx, value, num_channels, 4);
618 }
619
620 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
621 {
622 unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
623 const char *name;
624
625 if (type_size == 2)
626 name = "llvm.rint.f16";
627 else if (type_size == 4)
628 name = "llvm.rint.f32";
629 else
630 name = "llvm.rint.f64";
631
632 return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
633 }
634
635 LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
636 {
637 unsigned type_size = ac_get_type_size(LLVMTypeOf(den));
638 const char *name;
639
640 /* For doubles, we need precise division to pass GLCTS. */
641 if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)
642 return LLVMBuildFDiv(ctx->builder, num, den, "");
643
644 if (type_size == 2)
645 name = "llvm.amdgcn.rcp.f16";
646 else if (type_size == 4)
647 name = "llvm.amdgcn.rcp.f32";
648 else
649 name = "llvm.amdgcn.rcp.f64";
650
651 LLVMValueRef rcp =
652 ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);
653
654 return LLVMBuildFMul(ctx->builder, num, rcp, "");
655 }
656
657 /* See fast_idiv_by_const.h. */
658 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
659 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
660 LLVMValueRef multiplier, LLVMValueRef pre_shift,
661 LLVMValueRef post_shift, LLVMValueRef increment)
662 {
663 LLVMBuilderRef builder = ctx->builder;
664
665 num = LLVMBuildLShr(builder, num, pre_shift, "");
666 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
667 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
668 num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
669 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
670 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
671 return LLVMBuildLShr(builder, num, post_shift, "");
672 }
673
674 /* See fast_idiv_by_const.h. */
675 /* If num != UINT_MAX, this more efficient version can be used. */
676 /* Set: increment = util_fast_udiv_info::increment; */
677 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
678 LLVMValueRef multiplier, LLVMValueRef pre_shift,
679 LLVMValueRef post_shift, LLVMValueRef increment)
680 {
681 LLVMBuilderRef builder = ctx->builder;
682
683 num = LLVMBuildLShr(builder, num, pre_shift, "");
684 num = LLVMBuildNUWAdd(builder, num, increment, "");
685 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
686 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
687 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
688 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
689 return LLVMBuildLShr(builder, num, post_shift, "");
690 }
691
692 /* See fast_idiv_by_const.h. */
693 /* Both operands must fit in 31 bits and the divisor must not be 1. */
694 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
695 LLVMValueRef multiplier, LLVMValueRef post_shift)
696 {
697 LLVMBuilderRef builder = ctx->builder;
698
699 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
700 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
701 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
702 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
703 return LLVMBuildLShr(builder, num, post_shift, "");
704 }
705
706 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
707 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
708 * already multiplied by two. id is the cube face number.
709 */
710 struct cube_selection_coords {
711 LLVMValueRef stc[2]; /* sc (stc[0]) and tc (stc[1]) face-plane coordinates */
712 LLVMValueRef ma; /* major-axis coordinate, already multiplied by two */
713 LLVMValueRef id; /* cube face number */
714 };
715
716 static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
717 struct cube_selection_coords *out)
718 {
719 LLVMTypeRef f32 = ctx->f32;
720
721 out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", f32, in, 3, AC_FUNC_ATTR_READNONE);
722 out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", f32, in, 3, AC_FUNC_ATTR_READNONE);
723 out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", f32, in, 3, AC_FUNC_ATTR_READNONE);
724 out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", f32, in, 3, AC_FUNC_ATTR_READNONE);
725 }
726
727 /**
728 * Build a manual selection sequence for cube face sc/tc coordinates and
729 * major axis vector (multiplied by 2 for consistency) for the given
730 * vec3 \p coords, for the face implied by \p selcoords.
731 *
732 * For the major axis, we always adjust the sign to be in the direction of
733 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
734 * the selcoords major axis.
735 */
736 static void build_cube_select(struct ac_llvm_context *ctx,
737 const struct cube_selection_coords *selcoords,
738 const LLVMValueRef *coords, LLVMValueRef *out_st,
739 LLVMValueRef *out_ma)
740 {
741 LLVMBuilderRef builder = ctx->builder;
742 LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
743 LLVMValueRef is_ma_positive;
744 LLVMValueRef sgn_ma;
745 LLVMValueRef is_ma_z, is_not_ma_z;
746 LLVMValueRef is_ma_y;
747 LLVMValueRef is_ma_x;
748 LLVMValueRef sgn;
749 LLVMValueRef tmp;
750
/* sgn_ma = +1.0 if ma >= 0 (UGE: NaN counts as positive), else -1.0. */
751 is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
752 sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
753 LLVMConstReal(f32, -1.0), "");
754
/* Classify the (float) face id into its major axis: id >= 4 -> Z,
 * 2 <= id < 4 -> Y, otherwise X.
 * NOTE(review): assumes face numbering ±X=0/1, ±Y=2/3, ±Z=4/5 as produced
 * by the cubeid intrinsic — confirm against the hardware docs.
 */
755 is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
756 is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
757 is_ma_y = LLVMBuildAnd(
758 builder, is_not_ma_z,
759 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
760 is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
761
762 /* Select sc */
763 tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
764 sgn = LLVMBuildSelect(
765 builder, is_ma_y, LLVMConstReal(f32, 1.0),
766 LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
767 out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
768
769 /* Select tc */
770 tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
771 sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
772 out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
773
774 /* Select ma */
/* out_ma = 2 * |major-axis coordinate|, matching the pre-scaled ma. */
775 tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
776 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
777 tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
778 *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
779 }
780
/* Convert a cube map direction vector (plus layer in .w for arrays) in
 * coords_arg into the face coordinates the sampler expects: on return,
 * coords_arg holds { s, t (both biased by 1.5), face id (+ 8*layer for
 * arrays) }. With is_deriv, derivs_arg's two axes of cube-space derivatives
 * are rewritten as 2x2 face-space derivatives.
 */
781 void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
782 LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
783 {
784
785 LLVMBuilderRef builder = ctx->builder;
786 struct cube_selection_coords selcoords;
787 LLVMValueRef coords[3];
788 LLVMValueRef invma;
789
790 if (is_array && !is_lod) {
791 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
792
793 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
794 *
795 * "For Array forms, the array layer used will be
796 *
797 * max(0, min(d−1, floor(layer+0.5)))
798 *
799 * where d is the depth of the texture array and layer
800 * comes from the component indicated in the tables below.
801 * Workaround for an issue where the layer is taken from a
802 * helper invocation which happens to fall on a different
803 * layer due to extrapolation."
804 *
805 * GFX8 and earlier attempt to implement this in hardware by
806 * clamping the value of coords[2] = (8 * layer) + face.
807 * Unfortunately, this means that we end up with the wrong
808 * face when clamping occurs.
809 *
810 * Clamp the layer earlier to work around the issue.
811 */
812 if (ctx->chip_class <= GFX8) {
813 LLVMValueRef ge0;
814 ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
815 tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
816 }
817
818 coords_arg[3] = tmp;
819 }
820
821 build_cube_intrinsic(ctx, coords_arg, &selcoords);
822
/* invma = 1 / |ma|; used to project sc/tc (and derivatives) onto the face. */
823 invma =
824 ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
825 invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
826
827 for (int i = 0; i < 2; ++i)
828 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
829
830 coords[2] = selcoords.id;
831
832 if (is_deriv && derivs_arg) {
833 LLVMValueRef derivs[4];
834 int axis;
835
836 /* Convert cube derivatives to 2D derivatives. */
837 for (axis = 0; axis < 2; axis++) {
838 LLVMValueRef deriv_st[2];
839 LLVMValueRef deriv_ma;
840
841 /* Transform the derivative alongside the texture
842 * coordinate. Mathematically, the correct formula is
843 * as follows. Assume we're projecting onto the +Z face
844 * and denote by dx/dh the derivative of the (original)
845 * X texture coordinate with respect to horizontal
846 * window coordinates. The projection onto the +Z face
847 * plane is:
848 *
849 * f(x,z) = x/z
850 *
851 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
852 * = 1/z * dx/dh - x/z * 1/z * dz/dh.
853 *
854 * This motivates the implementation below.
855 *
856 * Whether this actually gives the expected results for
857 * apps that might feed in derivatives obtained via
858 * finite differences is anyone's guess. The OpenGL spec
859 * seems awfully quiet about how textureGrad for cube
860 * maps should be handled.
861 */
862 build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);
863
864 deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
865
866 for (int i = 0; i < 2; ++i)
867 derivs[axis * 2 + i] =
868 LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
869 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
870 }
871
872 memcpy(derivs_arg, derivs, sizeof(derivs));
873 }
874
875 /* Shift the texture coordinate. This must be applied after the
876 * derivative calculation.
877 */
878 for (int i = 0; i < 2; ++i)
879 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
880
881 if (is_array) {
882 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
883 /* coords_arg.w component - array_index for cube arrays */
884 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
885 }
886
887 memcpy(coords_arg, coords, sizeof(coords));
888 }
889
890 LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
891 LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
892 LLVMValueRef j)
893 {
894 LLVMValueRef args[5];
895 LLVMValueRef p1;
896
897 args[0] = i;
898 args[1] = llvm_chan;
899 args[2] = attr_number;
900 args[3] = params;
901
902 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
903
904 args[0] = p1;
905 args[1] = j;
906 args[2] = llvm_chan;
907 args[3] = attr_number;
908 args[4] = params;
909
910 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, args, 5,
911 AC_FUNC_ATTR_READNONE);
912 }
913
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j)
{
   /* 16-bit variant of ac_build_fs_interp: p1 accumulates in f32, p2 returns f16.
    * The extra i1 operand is the "high" operand of llvm.amdgcn.interp.*.f16
    * (presumably selects the high 16 bits of the attribute — confirm against
    * the LLVM AMDGPU intrinsic definition). */
   LLVMValueRef args[6];
   LLVMValueRef p1;

   args[0] = i;
   args[1] = llvm_chan;
   args[2] = attr_number;
   args[3] = ctx->i1false; /* high 16 bits: no */
   args[4] = params;

   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                           AC_FUNC_ATTR_READNONE);

   args[0] = p1;
   args[1] = j;
   args[2] = llvm_chan;
   args[3] = attr_number;
   args[4] = ctx->i1false; /* high 16 bits: no */
   args[5] = params;

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                             AC_FUNC_ATTR_READNONE);
}
940
941 LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
942 LLVMValueRef llvm_chan, LLVMValueRef attr_number,
943 LLVMValueRef params)
944 {
945 LLVMValueRef args[4];
946
947 args[0] = parameter;
948 args[1] = llvm_chan;
949 args[2] = attr_number;
950 args[3] = params;
951
952 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
953 AC_FUNC_ATTR_READNONE);
954 }
955
956 LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
957 LLVMValueRef index)
958 {
959 return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
960 }
961
962 LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
963 {
964 LLVMValueRef indices[2] = {
965 ctx->i32_0,
966 index,
967 };
968 return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
969 }
970
971 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
972 {
973 return LLVMBuildPointerCast(ctx->builder, LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
974 LLVMTypeOf(ptr), "");
975 }
976
977 void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
978 LLVMValueRef value)
979 {
980 LLVMBuildStore(ctx->builder, value, ac_build_gep0(ctx, base_ptr, index));
981 }
982
983 /**
984 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
985 * It's equivalent to doing a load from &base_ptr[index].
986 *
987 * \param base_ptr Where the array starts.
988 * \param index The element index into the array.
989 * \param uniform Whether the base_ptr and index can be assumed to be
990 * dynamically uniform (i.e. load to an SGPR)
991 * \param invariant Whether the load is invariant (no other opcodes affect it)
992 * \param no_unsigned_wraparound
993 * For all possible re-associations and re-distributions of an expression
994 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
995 * without inbounds in base_ptr), this parameter is true if "addr + offset"
996 * does not result in an unsigned integer wraparound. This is used for
997 * optimal code generation of 32-bit pointer arithmetic.
998 *
999 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
1000 * integer wraparound can't be an imm offset in s_load_dword, because
1001 * the instruction performs "addr + offset" in 64 bits.
1002 *
1003 * Expected usage for bindless textures by chaining GEPs:
1004 * // possible unsigned wraparound, don't use InBounds:
1005 * ptr1 = LLVMBuildGEP(base_ptr, index);
1006 * image = load(ptr1); // becomes "s_load ptr1, 0"
1007 *
1008 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1009 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1010 */
1011 static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1012 LLVMValueRef index, bool uniform, bool invariant,
1013 bool no_unsigned_wraparound)
1014 {
1015 LLVMValueRef pointer, result;
1016
1017 if (no_unsigned_wraparound &&
1018 LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1019 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1020 else
1021 pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1022
1023 if (uniform)
1024 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1025 result = LLVMBuildLoad(ctx->builder, pointer, "");
1026 if (invariant)
1027 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1028 return result;
1029 }
1030
1031 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
1032 {
1033 return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1034 }
1035
1036 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1037 LLVMValueRef index)
1038 {
1039 return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1040 }
1041
1042 /* This assumes that there is no unsigned integer wraparound during the address
1043 * computation, excluding all GEPs within base_ptr. */
1044 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1045 LLVMValueRef index)
1046 {
1047 return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1048 }
1049
1050 /* See ac_build_load_custom() documentation. */
1051 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1052 LLVMValueRef base_ptr, LLVMValueRef index)
1053 {
1054 return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1055 }
1056
1057 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1058 {
1059 return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1060 }
1061
/* Common code for the llvm.amdgcn.{raw,struct}.buffer.store[.format].* intrinsics.
 * NULL vindex/voffset/soffset default to 0; vindex is only emitted for the
 * structurized ("struct") variants. */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format, bool structurized)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* The intrinsic name is overloaded on the store data type (e.g. ".f32"). */
   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1090
1091 void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
1092 LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
1093 {
1094 ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true, true);
1095 }
1096
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned inst_offset, unsigned cache_policy)
{
   /* Split 3 channel stores, because only LLVM 9+ support 3-channel
    * intrinsics. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
      LLVMValueRef v[3], v01;

      /* Store channels 0-1 as a 2-channel value and channel 2 separately at
       * byte offset +8. */
      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
                                  cache_policy);
      return;
   }

   /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
    * (voffset is swizzled, but soffset isn't swizzled).
    * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
    */
   if (!(cache_policy & ac_swizzled)) {
      LLVMValueRef offset = soffset;

      if (inst_offset)
         offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");

      ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
                                   cache_policy, false, false);
      return;
   }

   /* Swizzled path: use a tbuffer store, which keeps voffset and soffset
    * separate. */
   static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
   unsigned dfmt = dfmts[num_channels - 1];
   unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
   LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);

   ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
                              nfmt, cache_policy);
}
1146
/* Common code for the llvm.amdgcn.{raw,struct}.buffer.load[.format].* intrinsics.
 * NULL vindex/voffset/soffset default to 0; vindex is only emitted for the
 * structurized ("struct") variants. */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   /* Pad 3-channel loads out to 4 when vec3 types aren't supported. */
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->chip_class >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1183
/* Load up to num_channels dwords from a buffer. Uses scalar (s_buffer_load)
 * loads when allow_smem and the cache policy permit it, otherwise falls back
 * to a vector buffer load. */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned inst_offset, unsigned cache_policy, bool can_speculate,
                                  bool allow_smem)
{
   /* Fold the immediate, vector, and scalar offsets into one value. */
   LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
   if (voffset)
      offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
   if (soffset)
      offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

   /* SMEM path: never with SLC; with GLC only on gfx8+ (NOTE(review):
    * inferred from this condition — confirm against the ISA docs). */
   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      /* One scalar dword load per channel, 4 bytes apart. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      /* Pad vec3 results to vec4 when 3-channel vectors aren't supported. */
      if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels, ctx->f32,
                                      cache_policy, can_speculate, false, false);
}
1224
1225 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1226 LLVMValueRef vindex, LLVMValueRef voffset,
1227 unsigned num_channels, unsigned cache_policy,
1228 bool can_speculate, bool d16)
1229 {
1230 return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
1231 d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
1232 true);
1233 }
1234
1235 static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1236 LLVMValueRef vindex, LLVMValueRef voffset,
1237 LLVMValueRef soffset, LLVMValueRef immoffset,
1238 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1239 unsigned cache_policy, bool can_speculate,
1240 bool structurized)
1241 {
1242 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1243
1244 LLVMValueRef args[6];
1245 int idx = 0;
1246 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1247 if (structurized)
1248 args[idx++] = vindex ? vindex : ctx->i32_0;
1249 args[idx++] = voffset ? voffset : ctx->i32_0;
1250 args[idx++] = soffset ? soffset : ctx->i32_0;
1251 args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
1252 args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1253 unsigned func =
1254 !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1255 const char *indexing_kind = structurized ? "struct" : "raw";
1256 char name[256], type_name[8];
1257
1258 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1259 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1260
1261 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);
1262
1263 return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
1264 }
1265
1266 LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1267 LLVMValueRef vindex, LLVMValueRef voffset,
1268 LLVMValueRef soffset, LLVMValueRef immoffset,
1269 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1270 unsigned cache_policy, bool can_speculate)
1271 {
1272 return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
1273 nfmt, cache_policy, can_speculate, true);
1274 }
1275
1276 LLVMValueRef ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1277 LLVMValueRef voffset, LLVMValueRef soffset,
1278 LLVMValueRef immoffset, unsigned num_channels, unsigned dfmt,
1279 unsigned nfmt, unsigned cache_policy, bool can_speculate)
1280 {
1281 return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset, immoffset, num_channels, dfmt,
1282 nfmt, cache_policy, can_speculate, false);
1283 }
1284
1285 LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1286 LLVMValueRef voffset, LLVMValueRef soffset,
1287 LLVMValueRef immoffset, unsigned cache_policy)
1288 {
1289 LLVMValueRef res;
1290
1291 if (LLVM_VERSION_MAJOR >= 9) {
1292 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1293
1294 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1295 res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
1296 cache_policy, false, false, false);
1297 } else {
1298 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1299 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1300
1301 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
1302 cache_policy, false);
1303
1304 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1305 }
1306
1307 return res;
1308 }
1309
1310 LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1311 LLVMValueRef voffset, LLVMValueRef soffset,
1312 LLVMValueRef immoffset, unsigned cache_policy)
1313 {
1314 LLVMValueRef res;
1315
1316 if (LLVM_VERSION_MAJOR >= 9) {
1317 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1318
1319 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1320 res = ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,
1321 false, false, false);
1322 } else {
1323 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1324 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1325
1326 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1, dfmt, nfmt,
1327 cache_policy, false);
1328
1329 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1330 }
1331
1332 return res;
1333 }
1334
1335 /**
1336 * Convert an 11- or 10-bit unsigned floating point number to an f32.
1337 *
1338 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1339 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1340 */
1341 static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
1342 unsigned exp_bits, unsigned mant_bits)
1343 {
1344 assert(LLVMTypeOf(src) == ctx->i32);
1345
1346 LLVMValueRef tmp;
1347 LLVMValueRef mantissa;
1348 mantissa =
1349 LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1350
1351 /* Converting normal numbers is just a shift + correcting the exponent bias */
1352 unsigned normal_shift = 23 - mant_bits;
1353 unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1354 LLVMValueRef shifted, normal;
1355
1356 shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1357 normal =
1358 LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1359
1360 /* Converting nan/inf numbers is the same, but with a different exponent update */
1361 LLVMValueRef naninf;
1362 naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1363
1364 /* Converting denormals is the complex case: determine the leading zeros of the
1365 * mantissa to obtain the correct shift for the mantissa and exponent correction.
1366 */
1367 LLVMValueRef denormal;
1368 LLVMValueRef params[2] = {
1369 mantissa, ctx->i1true, /* result can be undef when arg is 0 */
1370 };
1371 LLVMValueRef ctlz =
1372 ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);
1373
1374 /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1375 tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1376 denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1377
1378 unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1379 tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1380 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1381 denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1382
1383 /* Select the final result. */
1384 LLVMValueRef result;
1385
1386 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1387 LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1388 result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1389
1390 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, LLVMConstInt(ctx->i32, 1 << mant_bits, false),
1391 "");
1392 result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1393
1394 tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1395 result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1396
1397 return ac_to_float(ctx, result);
1398 }
1399
1400 /**
1401 * Generate a fully general open coded buffer format fetch with all required
1402 * fixups suitable for vertex fetch, using non-format buffer loads.
1403 *
1404 * Some combinations of argument values have special interpretations:
1405 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1406 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1407 *
1408 * \param log_size log(size of channel in bytes)
1409 * \param num_channels number of channels (1 to 4)
1410 * \param format AC_FETCH_FORMAT_xxx value
1411 * \param reverse whether XYZ channels are reversed
1412 * \param known_aligned whether the source is known to be aligned to hardware's
1413 * effective element size for loading the given format
1414 * (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1415 * \param rsrc buffer resource descriptor
1416 * \return the resulting vector of floats or integers bitcast to <4 x i32>
1417 */
1418 LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
1419 unsigned num_channels, unsigned format, bool reverse,
1420 bool known_aligned, LLVMValueRef rsrc,
1421 LLVMValueRef vindex, LLVMValueRef voffset,
1422 LLVMValueRef soffset, unsigned cache_policy,
1423 bool can_speculate)
1424 {
1425 LLVMValueRef tmp;
1426 unsigned load_log_size = log_size;
1427 unsigned load_num_channels = num_channels;
1428 if (log_size == 3) {
1429 load_log_size = 2;
1430 if (format == AC_FETCH_FORMAT_FLOAT) {
1431 load_num_channels = 2 * num_channels;
1432 } else {
1433 load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1434 }
1435 }
1436
1437 int log_recombine = 0;
1438 if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
1439 /* Avoid alignment restrictions by loading one byte at a time. */
1440 load_num_channels <<= load_log_size;
1441 log_recombine = load_log_size;
1442 load_log_size = 0;
1443 } else if (load_num_channels == 2 || load_num_channels == 4) {
1444 log_recombine = -util_logbase2(load_num_channels);
1445 load_num_channels = 1;
1446 load_log_size += -log_recombine;
1447 }
1448
1449 assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9);
1450
1451 LLVMValueRef loads[32]; /* up to 32 bytes */
1452 for (unsigned i = 0; i < load_num_channels; ++i) {
1453 tmp =
1454 LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1455 LLVMTypeRef channel_type =
1456 load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
1457 unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1458 loads[i] =
1459 ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
1460 cache_policy, can_speculate, false, true);
1461 if (load_log_size >= 2)
1462 loads[i] = ac_to_integer(ctx, loads[i]);
1463 }
1464
1465 if (log_recombine > 0) {
1466 /* Recombine bytes if necessary (GFX6 only) */
1467 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1468
1469 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1470 LLVMValueRef accum = NULL;
1471 for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1472 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1473 if (i == 0) {
1474 accum = tmp;
1475 } else {
1476 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
1477 accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1478 }
1479 }
1480 loads[dst] = accum;
1481 }
1482 } else if (log_recombine < 0) {
1483 /* Split vectors of dwords */
1484 if (load_log_size > 2) {
1485 assert(load_num_channels == 1);
1486 LLVMValueRef loaded = loads[0];
1487 unsigned log_split = load_log_size - 2;
1488 log_recombine += log_split;
1489 load_num_channels = 1 << log_split;
1490 load_log_size = 2;
1491 for (unsigned i = 0; i < load_num_channels; ++i) {
1492 tmp = LLVMConstInt(ctx->i32, i, false);
1493 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1494 }
1495 }
1496
1497 /* Further split dwords and shorts if required */
1498 if (log_recombine < 0) {
1499 for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
1500 --src) {
1501 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1502 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1503 LLVMValueRef loaded = loads[src - 1];
1504 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1505 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1506 tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1507 tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1508 loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1509 }
1510 }
1511 }
1512 }
1513
1514 if (log_size == 3) {
1515 if (format == AC_FETCH_FORMAT_FLOAT) {
1516 for (unsigned i = 0; i < num_channels; ++i) {
1517 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1518 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1519 }
1520 } else if (format == AC_FETCH_FORMAT_FIXED) {
1521 /* 10_11_11_FLOAT */
1522 LLVMValueRef data = loads[0];
1523 LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1524 LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1525 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1526 LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1527 LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1528
1529 loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1530 loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1531 loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1532
1533 num_channels = 3;
1534 log_size = 2;
1535 format = AC_FETCH_FORMAT_FLOAT;
1536 } else {
1537 /* 2_10_10_10 data formats */
1538 LLVMValueRef data = loads[0];
1539 LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1540 LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1541 loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1542 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1543 loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1544 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1545 loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1546 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1547 loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1548
1549 num_channels = 4;
1550 }
1551 }
1552
1553 if (format == AC_FETCH_FORMAT_FLOAT) {
1554 if (log_size != 2) {
1555 for (unsigned chan = 0; chan < num_channels; ++chan) {
1556 tmp = ac_to_float(ctx, loads[chan]);
1557 if (log_size == 3)
1558 tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1559 else if (log_size == 1)
1560 tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1561 loads[chan] = ac_to_integer(ctx, tmp);
1562 }
1563 }
1564 } else if (format == AC_FETCH_FORMAT_UINT) {
1565 if (log_size != 2) {
1566 for (unsigned chan = 0; chan < num_channels; ++chan)
1567 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1568 }
1569 } else if (format == AC_FETCH_FORMAT_SINT) {
1570 if (log_size != 2) {
1571 for (unsigned chan = 0; chan < num_channels; ++chan)
1572 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1573 }
1574 } else {
1575 bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
1576 format == AC_FETCH_FORMAT_UINT;
1577
1578 for (unsigned chan = 0; chan < num_channels; ++chan) {
1579 if (unsign) {
1580 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1581 } else {
1582 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1583 }
1584
1585 LLVMValueRef scale = NULL;
1586 if (format == AC_FETCH_FORMAT_FIXED) {
1587 assert(log_size == 2);
1588 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1589 } else if (format == AC_FETCH_FORMAT_UNORM) {
1590 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1591 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1592 } else if (format == AC_FETCH_FORMAT_SNORM) {
1593 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1594 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1595 }
1596 if (scale)
1597 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1598
1599 if (format == AC_FETCH_FORMAT_SNORM) {
1600 /* Clamp to [-1, 1] */
1601 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1602 LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1603 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1604 }
1605
1606 loads[chan] = ac_to_integer(ctx, tmp);
1607 }
1608 }
1609
1610 while (num_channels < 4) {
1611 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1612 loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1613 } else {
1614 loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1615 }
1616 num_channels++;
1617 }
1618
1619 if (reverse) {
1620 tmp = loads[0];
1621 loads[0] = loads[2];
1622 loads[2] = tmp;
1623 }
1624
1625 return ac_build_gather_values(ctx, loads, 4);
1626 }
1627
/* Common code for the llvm.amdgcn.{raw,struct}.tbuffer.store.* intrinsics.
 * NULL vindex/voffset/soffset default to 0; the immediate offset is folded
 * into voffset. */
static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy, bool structurized)
{
   voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");

   LLVMValueRef args[7];
   int idx = 0;
   args[idx++] = vdata;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
   /* Pad 3-channel stores out to 4 when vec3 types aren't supported. */
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1658
1659 void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
1660 LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
1661 LLVMValueRef soffset, LLVMValueRef immoffset,
1662 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1663 unsigned cache_policy)
1664 {
1665 ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels, dfmt,
1666 nfmt, cache_policy, true);
1667 }
1668
1669 void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1670 LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
1671 unsigned num_channels, unsigned dfmt, unsigned nfmt,
1672 unsigned cache_policy)
1673 {
1674 ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset, immoffset, num_channels, dfmt,
1675 nfmt, cache_policy, false);
1676 }
1677
/* Store a single 16-bit value to a buffer.
 *
 * On LLVM 9+ the buffer-store intrinsics accept i16 directly; on older LLVM
 * the value is zero-extended to i32 and stored through a tbuffer with a
 * 16-bit data format so only 2 bytes reach memory.
 */
void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                  LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned cache_policy)
{
   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");

   if (LLVM_VERSION_MAJOR >= 9) {
      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
      ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
                                   false);
   } else {
      unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
      unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

      vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");

      ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
                                 cache_policy);
   }
}
1698
/* Store a single byte to a buffer.
 *
 * Mirrors ac_build_tbuffer_store_short: native i8 stores on LLVM 9+,
 * otherwise zero-extend to i32 and use an 8-bit tbuffer data format.
 */
void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
{
   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");

   if (LLVM_VERSION_MAJOR >= 9) {
      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
      ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
                                   false);
   } else {
      unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
      unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

      vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");

      ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1, dfmt, nfmt,
                                 cache_policy);
   }
}
1718 /**
1719 * Set range metadata on an instruction. This can only be used on load and
1720 * call instructions. If you know an instruction can only produce the values
1721 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1722 * \p lo is the minimum value inclusive.
1723 * \p hi is the maximum value exclusive.
1724 */
1725 static void set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1726 unsigned hi)
1727 {
1728 LLVMValueRef range_md, md_args[2];
1729 LLVMTypeRef type = LLVMTypeOf(value);
1730 LLVMContextRef context = LLVMGetTypeContext(type);
1731
1732 md_args[0] = LLVMConstInt(type, lo, false);
1733 md_args[1] = LLVMConstInt(type, hi, false);
1734 range_md = LLVMMDNodeInContext(context, md_args, 2);
1735 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1736 }
1737
/* Return the current lane's index within the wave (0..wave_size-1).
 *
 * Built from mbcnt.lo (plus mbcnt.hi for wave64), counting set mask bits
 * below the current lane. Range metadata tells LLVM the result is bounded
 * by the wave size.
 */
LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
{
   LLVMValueRef tid;

   LLVMValueRef tid_args[2];
   tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
   tid_args[1] = ctx->i32_0;
   tid_args[1] =
      ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, tid_args, 2, AC_FUNC_ATTR_READNONE);

   if (ctx->wave_size == 32) {
      /* Wave32: the low half covers the whole wave. */
      tid = tid_args[1];
   } else {
      tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, tid_args, 2,
                               AC_FUNC_ATTR_READNONE);
   }
   set_range_metadata(ctx, tid, 0, ctx->wave_size);
   return tid;
}
1757
1758 /*
1759 * AMD GCN implements derivatives using the local data store (LDS)
1760 * All writes to the LDS happen in all executing threads at
1761 * the same time. TID is the Thread ID for the current
1762 * thread and is a value between 0 and 63, representing
1763 * the thread's position in the wavefront.
1764 *
1765 * For the pixel shader threads are grouped into quads of four pixels.
1766 * The TIDs of the pixels of a quad are:
1767 *
1768 * +------+------+
1769 * |4n + 0|4n + 1|
1770 * +------+------+
1771 * |4n + 2|4n + 3|
1772 * +------+------+
1773 *
1774 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1775 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1776 * the current pixel's column, and masking with 0xfffffffe yields the TID
1777 * of the left pixel of the current pixel's row.
1778 *
1779 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1780 * adding 2 yields the TID of the pixel below the top pixel.
1781 */
/* Compute a screen-space derivative of \p val via quad swizzles.
 *
 * \p mask selects the base (top-left/top/left) lane within the quad — see
 *         the quad-layout comment above.
 * \p idx  is the lane offset toward the neighbor (1 = right, 2 = down).
 *
 * 16-bit values travel through 32-bit lanes for the swizzle and are
 * truncated back afterwards. The result is passed through llvm.amdgcn.wqm,
 * presumably so it is well-defined in whole-quad mode — helper lanes
 * participate in the subtraction.
 */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   unsigned tl_lanes[4], trbl_lanes[4];
   char name[32], type[8];
   LLVMValueRef tl, trbl;
   LLVMTypeRef result_type;
   LLVMValueRef result;

   result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Widen 16-bit inputs to i32 for the lane swizzle. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   for (unsigned i = 0; i < 4; ++i) {
      tl_lanes[i] = i & mask;            /* base lane of each quad position */
      trbl_lanes[i] = (i & mask) + idx;  /* its right/bottom neighbor */
   }

   tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1820
1821 void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
1822 {
1823 LLVMValueRef args[2];
1824 args[0] = LLVMConstInt(ctx->i32, msg, false);
1825 args[1] = wave_id;
1826 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1827 }
1828
/* Find the most-significant non-sign bit of a signed i32 (LSB-based index).
 *
 * Uses the sffbh intrinsic, which reports the position counted from the MSB;
 * 31 - msb converts it to an LSB-based index. Inputs 0 and -1 have no such
 * bit and yield -1.
 */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef cond =
      LLVMBuildOr(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, ""),
                  LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, ""), "");

   return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
}
1845
1846 LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
1847 {
1848 const char *intrin_name;
1849 LLVMTypeRef type;
1850 LLVMValueRef highest_bit;
1851 LLVMValueRef zero;
1852 unsigned bitsize;
1853
1854 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
1855 switch (bitsize) {
1856 case 64:
1857 intrin_name = "llvm.ctlz.i64";
1858 type = ctx->i64;
1859 highest_bit = LLVMConstInt(ctx->i64, 63, false);
1860 zero = ctx->i64_0;
1861 break;
1862 case 32:
1863 intrin_name = "llvm.ctlz.i32";
1864 type = ctx->i32;
1865 highest_bit = LLVMConstInt(ctx->i32, 31, false);
1866 zero = ctx->i32_0;
1867 break;
1868 case 16:
1869 intrin_name = "llvm.ctlz.i16";
1870 type = ctx->i16;
1871 highest_bit = LLVMConstInt(ctx->i16, 15, false);
1872 zero = ctx->i16_0;
1873 break;
1874 case 8:
1875 intrin_name = "llvm.ctlz.i8";
1876 type = ctx->i8;
1877 highest_bit = LLVMConstInt(ctx->i8, 7, false);
1878 zero = ctx->i8_0;
1879 break;
1880 default:
1881 unreachable(!"invalid bitsize");
1882 break;
1883 }
1884
1885 LLVMValueRef params[2] = {
1886 arg,
1887 ctx->i1true,
1888 };
1889
1890 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);
1891
1892 /* The HW returns the last bit index from MSB, but TGSI/NIR wants
1893 * the index from LSB. Invert it by doing "31 - msb". */
1894 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
1895
1896 if (bitsize == 64) {
1897 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
1898 } else if (bitsize < 32) {
1899 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
1900 }
1901
1902 /* check for zero */
1903 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
1904 LLVMConstInt(ctx->i32, -1, true), msb, "");
1905 }
1906
1907 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1908 {
1909 char name[64], type[64];
1910
1911 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1912 snprintf(name, sizeof(name), "llvm.minnum.%s", type);
1913 LLVMValueRef args[2] = {a, b};
1914 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1915 }
1916
1917 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1918 {
1919 char name[64], type[64];
1920
1921 ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));
1922 snprintf(name, sizeof(name), "llvm.maxnum.%s", type);
1923 LLVMValueRef args[2] = {a, b};
1924 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
1925 }
1926
1927 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1928 {
1929 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
1930 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1931 }
1932
1933 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1934 {
1935 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
1936 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1937 }
1938
1939 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1940 {
1941 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
1942 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1943 }
1944
1945 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
1946 {
1947 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
1948 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1949 }
1950
1951 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
1952 {
1953 LLVMTypeRef t = LLVMTypeOf(value);
1954 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
1955 LLVMConstReal(t, 1.0));
1956 }
1957
/* Emit an export instruction.
 *
 * With a->compr, the four outputs are packed into two v2i16 operands and
 * emitted via exp.compr; otherwise four 32-bit channels go through exp.f32.
 * done and valid_mask are passed through as i1 flags.
 */
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
   LLVMValueRef args[9];

   args[0] = LLVMConstInt(ctx->i32, a->target, 0);
   args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);

   if (a->compr) {
      /* Compressed export: out[0]/out[1] each carry two 16-bit channels. */
      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
      args[4] = LLVMConstInt(ctx->i1, a->done, 0);
      args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
   } else {
      args[2] = a->out[0];
      args[3] = a->out[1];
      args[4] = a->out[2];
      args[5] = a->out[3];
      args[6] = LLVMConstInt(ctx->i1, a->done, 0);
      args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
   }
}
1983
1984 void ac_build_export_null(struct ac_llvm_context *ctx)
1985 {
1986 struct ac_export_args args;
1987
1988 args.enabled_channels = 0x0; /* enabled channels */
1989 args.valid_mask = 1; /* whether the EXEC mask is valid */
1990 args.done = 1; /* DONE bit */
1991 args.target = V_008DFC_SQ_EXP_NULL;
1992 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
1993 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
1994 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
1995 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
1996 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
1997
1998 ac_build_export(ctx, &args);
1999 }
2000
2001 static unsigned ac_num_coords(enum ac_image_dim dim)
2002 {
2003 switch (dim) {
2004 case ac_image_1d:
2005 return 1;
2006 case ac_image_2d:
2007 case ac_image_1darray:
2008 return 2;
2009 case ac_image_3d:
2010 case ac_image_cube:
2011 case ac_image_2darray:
2012 case ac_image_2dmsaa:
2013 return 3;
2014 case ac_image_2darraymsaa:
2015 return 4;
2016 default:
2017 unreachable("ac_num_coords: bad dim");
2018 }
2019 }
2020
2021 static unsigned ac_num_derivs(enum ac_image_dim dim)
2022 {
2023 switch (dim) {
2024 case ac_image_1d:
2025 case ac_image_1darray:
2026 return 2;
2027 case ac_image_2d:
2028 case ac_image_2darray:
2029 case ac_image_cube:
2030 return 4;
2031 case ac_image_3d:
2032 return 6;
2033 case ac_image_2dmsaa:
2034 case ac_image_2darraymsaa:
2035 default:
2036 unreachable("derivatives not supported");
2037 }
2038 }
2039
/* Map an ac_atomic_op to the sub-opcode string used when building
 * llvm.amdgcn.image.atomic.* intrinsic names.
 */
static const char *get_atomic_name(enum ac_atomic_op op)
{
   switch (op) {
   case ac_atomic_swap:
      return "swap";
   case ac_atomic_add:
      return "add";
   case ac_atomic_sub:
      return "sub";
   case ac_atomic_smin:
      return "smin";
   case ac_atomic_umin:
      return "umin";
   case ac_atomic_smax:
      return "smax";
   case ac_atomic_umax:
      return "umax";
   case ac_atomic_and:
      return "and";
   case ac_atomic_or:
      return "or";
   case ac_atomic_xor:
      return "xor";
   case ac_atomic_inc_wrap:
      return "inc";
   case ac_atomic_dec_wrap:
      return "dec";
   }
   unreachable("bad atomic op");
}
2070
/* Build any image intrinsic (sample, gather4, load, store, atomic, getlod,
 * getresinfo) from the descriptor in \p a.
 *
 * The argument list and the intrinsic-name suffixes are assembled to match
 * the llvm.amdgcn.image.* operand order: data (stores/atomics), dmask,
 * offset, bias, compare, derivatives, coordinates, lod/min_lod, resource,
 * sampler+unorm, texfailctrl, cache policy. NOTE(review): this order is
 * fixed by the LLVM AMDGPU intrinsic definitions — do not reorder.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Sanity-check mutually exclusive / opcode-dependent fields. */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));

   /* getlod is emitted with the base (non-array, non-cube) dimension. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Sampling ops take float coordinates, everything else integer. */
   LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;

   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = ".f32";
   }
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   overload[num_overloads++] = sample ? ".f32" : ".i32";

   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);

   /* Pick the intrinsic base name (and atomic sub-opcode). */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s" /* sample/gather modifiers */
            ".%s.%s%s%s%s", /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            atomic ? "i32" : (a->d16 ? "v4f16" : "v4f32"), overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (atomic)
      retty = ctx->i32;
   else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = a->d16 ? ctx->v4f16 : ctx->v4f32;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   /* Non-sampling loads are returned as integers. */
   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
2253
2254 LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
2255 {
2256 LLVMValueRef samples;
2257
2258 /* Read the samples from the descriptor directly.
2259 * Hardware doesn't have any instruction for this.
2260 */
2261 samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
2262 samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");
2263 samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
2264 samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");
2265 return samples;
2266 }
2267
/* Pack two f32 values into a v2f16 via cvt.pkrtz (round toward zero). */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
                             AC_FUNC_ATTR_READNONE);
}
2273
/* Pack two floats as signed normalized 16-bit values; result as one i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
                                         AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2280
/* Pack two floats as unsigned normalized 16-bit values; result as one i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
                                         AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2287
/* Pack two signed i32 values into two 16-bit lanes (cvt.pk.i16), returning
 * an i32. The 8-bit and 10-bit clamping is for HW workarounds.
 *
 * \p bits total bits per component in the target format (8, 10 or 16)
 * \p hi   whether this pair is the high half; for 10-bit formats the second
 *         value is then the alpha channel, clamped to its 2-bit signed
 *         range [-2, 1].
 */
LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
   LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
   LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);

   /* Clamp. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
         args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2312
/* Pack two unsigned i32 values into two 16-bit lanes (cvt.pk.u16), returning
 * an i32. The 8-bit and 10-bit clamping is for HW workarounds.
 *
 * \p bits total bits per component in the target format (8, 10 or 16)
 * \p hi   whether this pair is the high half; for 10-bit formats the second
 *         value is then the alpha channel, clamped to its 2-bit range [0, 3].
 */
LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);

   /* Clamp. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2334
/* Wrap an i1 in llvm.amdgcn.wqm.vote (whole-quad-mode vote). */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
}
2339
/* Kill the current invocation if \p i1 is false (llvm.amdgcn.kill). */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
}
2344
2345 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
2346 LLVMValueRef width, bool is_signed)
2347 {
2348 LLVMValueRef args[] = {
2349 input,
2350 offset,
2351 width,
2352 };
2353
2354 return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32",
2355 ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);
2356 }
2357
2358 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
2359 LLVMValueRef s2)
2360 {
2361 return LLVMBuildAdd(ctx->builder, LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2362 }
2363
/* Float multiply-add: s0 * s1 + s2 (fused on GFX10+, unfused otherwise). */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->chip_class >= GFX10) {
      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, (LLVMValueRef[]){s0, s1, s2}, 3,
                                AC_FUNC_ATTR_READNONE);
   }

   return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
}
2375
/* Emit an s_waitcnt (or a fence) for the requested AC_WAIT_* flags.
 *
 * A counter value of 0 waits for that counter to drain; the maximum value
 * means "don't wait".
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   /* Start at "no wait" values; vmcnt grew from 4 to 6 bits on GFX9. */
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   if (wait_flags & AC_WAIT_VSTORE) {
      /* GFX10 tracks stores in a separate vscnt counter. */
      if (ctx->chip_class >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   /* Pack the s_waitcnt immediate: lgkmcnt in [13:8], expcnt in [6:4]
    * (7 = don't wait), vmcnt split across [3:0] and [15:14]. */
   unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
                     (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
2412
2413 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
2414 {
2415 LLVMTypeRef type;
2416 char *intr;
2417
2418 if (bitsize == 16) {
2419 intr = "llvm.amdgcn.fract.f16";
2420 type = ctx->f16;
2421 } else if (bitsize == 32) {
2422 intr = "llvm.amdgcn.fract.f32";
2423 type = ctx->f32;
2424 } else {
2425 intr = "llvm.amdgcn.fract.f64";
2426 type = ctx->f64;
2427 }
2428
2429 LLVMValueRef params[] = {
2430 src0,
2431 };
2432 return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
2433 }
2434
2435 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2436 {
2437
2438 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2439 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2440 unsigned vec_size = LLVMGetVectorSize(type);
2441 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef *));
2442
2443 for (unsigned i = 0; i < vec_size; i++)
2444 scalars[i] = scalar;
2445 return LLVMConstVector(scalars, vec_size);
2446 }
2447 return LLVMConstInt(type, value, 0);
2448 }
2449
/* Integer sign: clamp src0 to [-1, 1], yielding -1, 0, or 1.
 * The max-then-min ordering lets LLVM select a single v_med3. */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);
   LLVMValueRef val;

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
   return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
}
2459
/* Turn -0.0 into +0.0 by adding 0 with signed zeros temporarily honored
 * (otherwise LLVM could fold the add away). */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2468
/* Float sign: return -1.0, 0.0, or 1.0 with the same type as \p src. */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004
    * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2
    * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880
    * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3
    *
    * The isign version:
    * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004
    * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304
    * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   dw[0] = ctx->i32_0;
   /* Assemble +1.0 / -1.0 / +0.0 from the high dword bit pattern. */
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2508
2509 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2510 {
2511 LLVMValueRef result;
2512 unsigned bitsize;
2513
2514 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2515
2516 switch (bitsize) {
2517 case 128:
2518 result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
2519 AC_FUNC_ATTR_READNONE);
2520 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2521 break;
2522 case 64:
2523 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2524 AC_FUNC_ATTR_READNONE);
2525
2526 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2527 break;
2528 case 32:
2529 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2530 AC_FUNC_ATTR_READNONE);
2531 break;
2532 case 16:
2533 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2534 AC_FUNC_ATTR_READNONE);
2535
2536 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2537 break;
2538 case 8:
2539 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2540 AC_FUNC_ATTR_READNONE);
2541
2542 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2543 break;
2544 default:
2545 unreachable(!"invalid bitsize");
2546 break;
2547 }
2548
2549 return result;
2550 }
2551
2552 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
2553 {
2554 LLVMValueRef result;
2555 unsigned bitsize;
2556
2557 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2558
2559 switch (bitsize) {
2560 case 64:
2561 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
2562 AC_FUNC_ATTR_READNONE);
2563
2564 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2565 break;
2566 case 32:
2567 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
2568 AC_FUNC_ATTR_READNONE);
2569 break;
2570 case 16:
2571 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
2572 AC_FUNC_ATTR_READNONE);
2573
2574 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2575 break;
2576 case 8:
2577 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
2578 AC_FUNC_ATTR_READNONE);
2579
2580 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2581 break;
2582 default:
2583 unreachable(!"invalid bitsize");
2584 break;
2585 }
2586
2587 return result;
2588 }
2589
/* Argument indices into an export intrinsic call (see ac_build_export:
 * arg 0 = target, arg 1 = enabled channels, args 2.. = outputs). */
#define AC_EXP_TARGET 0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0 2

/* How a single export channel's value was produced. */
enum ac_ir_type
{
   AC_IR_UNDEF,
   AC_IR_CONST,
   AC_IR_VALUE,
};

/* One channel of a parsed export instruction. */
struct ac_vs_exp_chan {
   LLVMValueRef value;
   float const_float; /* only meaningful when type == AC_IR_CONST — see ac_eliminate_const_output */
   enum ac_ir_type type;
};

/* A parsed PARAM export: its param offset, the call instruction, and the
 * classified channels. */
struct ac_vs_exp_inst {
   unsigned offset;
   LLVMValueRef inst;
   struct ac_vs_exp_chan chan[4];
};

/* All PARAM exports collected from a shader. */
struct ac_vs_exports {
   unsigned num;
   struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
2617
/* Return true if the PARAM export has been eliminated.
 *
 * An export whose channels are all 0/1 constants (undef matches either) can
 * be replaced by a SPI_PS_INPUT_CNTL DEFAULT_VAL code: the export
 * instruction is erased and the output's param offset rewritten.
 */
static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
                                      struct ac_vs_exp_inst *exp)
{
   unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
   bool is_zero[4] = {}, is_one[4] = {};

   for (i = 0; i < 4; i++) {
      /* It's a constant expression. Undef outputs are eliminated too. */
      if (exp->chan[i].type == AC_IR_UNDEF) {
         /* Undef can stand in for either 0 or 1. */
         is_zero[i] = true;
         is_one[i] = true;
      } else if (exp->chan[i].type == AC_IR_CONST) {
         if (exp->chan[i].const_float == 0)
            is_zero[i] = true;
         else if (exp->chan[i].const_float == 1)
            is_one[i] = true;
         else
            return false; /* other constant */
      } else
         return false;
   }

   /* Only certain combinations of 0 and 1 can be eliminated. */
   if (is_zero[0] && is_zero[1] && is_zero[2])
      default_val = is_zero[3] ? 0 : 1; /* xyzw = 000(0|1) */
   else if (is_one[0] && is_one[1] && is_one[2])
      default_val = is_zero[3] ? 2 : 3; /* xyzw = 111(0|1) */
   else
      return false;

   /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
   LLVMInstructionEraseFromParent(exp->inst);

   /* Change OFFSET to DEFAULT_VAL. */
   for (i = 0; i < num_outputs; i++) {
      if (vs_output_param_offset[i] == exp->offset) {
         vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
         break;
      }
   }
   return true;
}
2661
2662 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2663 uint8_t *vs_output_param_offset, uint32_t num_outputs,
2664 struct ac_vs_exports *processed,
2665 struct ac_vs_exp_inst *exp)
2666 {
2667 unsigned p, copy_back_channels = 0;
2668
2669 /* See if the output is already in the list of processed outputs.
2670 * The LLVMValueRef comparison relies on SSA.
2671 */
2672 for (p = 0; p < processed->num; p++) {
2673 bool different = false;
2674
2675 for (unsigned j = 0; j < 4; j++) {
2676 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2677 struct ac_vs_exp_chan *c2 = &exp->chan[j];
2678
2679 /* Treat undef as a match. */
2680 if (c2->type == AC_IR_UNDEF)
2681 continue;
2682
2683 /* If c1 is undef but c2 isn't, we can copy c2 to c1
2684 * and consider the instruction duplicated.
2685 */
2686 if (c1->type == AC_IR_UNDEF) {
2687 copy_back_channels |= 1 << j;
2688 continue;
2689 }
2690
2691 /* Test whether the channels are not equal. */
2692 if (c1->type != c2->type ||
2693 (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
2694 (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
2695 different = true;
2696 break;
2697 }
2698 }
2699 if (!different)
2700 break;
2701
2702 copy_back_channels = 0;
2703 }
2704 if (p == processed->num)
2705 return false;
2706
2707 /* If a match was found, but the matching export has undef where the new
2708 * one has a normal value, copy the normal value to the undef channel.
2709 */
2710 struct ac_vs_exp_inst *match = &processed->exp[p];
2711
2712 /* Get current enabled channels mask. */
2713 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2714 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2715
2716 while (copy_back_channels) {
2717 unsigned chan = u_bit_scan(&copy_back_channels);
2718
2719 assert(match->chan[chan].type == AC_IR_UNDEF);
2720 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
2721 match->chan[chan] = exp->chan[chan];
2722
2723 /* Update number of enabled channels because the original mask
2724 * is not always 0xf.
2725 */
2726 enabled_channels |= (1 << chan);
2727 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2728 LLVMConstInt(ctx->i32, enabled_channels, 0));
2729 }
2730
2731 /* The PARAM export is duplicated. Kill it. */
2732 LLVMInstructionEraseFromParent(exp->inst);
2733
2734 /* Change OFFSET to the matching export. */
2735 for (unsigned i = 0; i < num_outputs; i++) {
2736 if (vs_output_param_offset[i] == exp->offset) {
2737 vs_output_param_offset[i] = match->offset;
2738 break;
2739 }
2740 }
2741 return true;
2742 }
2743
/* Eliminate constant and duplicated PARAM exports in main_fn, then renumber
 * the surviving ones to remove holes.
 *
 * vs_output_param_offset is both read and rewritten: eliminated outputs get
 * an AC_EXP_PARAM_DEFAULT_VAL_* encoding or the offset of their duplicate,
 * and the remaining offsets are compacted. skip_output_mask is a bitmask of
 * PARAM offsets that must not be eliminated. num_param_exports receives the
 * number of PARAM exports that remain.
 */
void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
                            uint8_t *vs_output_param_offset, uint32_t num_outputs,
                            uint32_t skip_output_mask, uint8_t *num_param_exports)
{
   LLVMBasicBlockRef bb;
   bool removed_any = false;
   struct ac_vs_exports exports;

   exports.num = 0;

   /* Process all LLVM instructions. */
   bb = LLVMGetFirstBasicBlock(main_fn);
   while (bb) {
      LLVMValueRef inst = LLVMGetFirstInstruction(bb);

      while (inst) {
         /* Advance the iterator first: "cur" may be erased below. */
         LLVMValueRef cur = inst;
         inst = LLVMGetNextInstruction(inst);
         struct ac_vs_exp_inst exp;

         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
            continue;

         LLVMValueRef callee = ac_llvm_get_called_value(cur);

         if (!ac_llvm_is_function(callee))
            continue;

         const char *name = LLVMGetValueName(callee);
         unsigned num_args = LLVMCountParams(callee);

         /* Check if this is an export instruction. */
         if ((num_args != 9 && num_args != 8) ||
             (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
            continue;

         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
         unsigned target = LLVMConstIntGetZExtValue(arg);

         /* Only PARAM exports are optimized; skip POS/MRT/etc. */
         if (target < V_008DFC_SQ_EXP_PARAM)
            continue;

         target -= V_008DFC_SQ_EXP_PARAM;

         /* Parse the instruction. */
         memset(&exp, 0, sizeof(exp));
         exp.offset = target;
         exp.inst = cur;

         /* Classify each of the four data channels (undef/const/value). */
         for (unsigned i = 0; i < 4; i++) {
            LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

            exp.chan[i].value = v;

            if (LLVMIsUndef(v)) {
               exp.chan[i].type = AC_IR_UNDEF;
            } else if (LLVMIsAConstantFP(v)) {
               LLVMBool loses_info;
               exp.chan[i].type = AC_IR_CONST;
               exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
            } else {
               exp.chan[i].type = AC_IR_VALUE;
            }
         }

         /* Eliminate constant and duplicated PARAM exports. */
         if (!((1u << target) & skip_output_mask) &&
             (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
              ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
                                             &exp))) {
            removed_any = true;
         } else {
            exports.exp[exports.num++] = exp;
         }
      }
      bb = LLVMGetNextBasicBlock(bb);
   }

   /* Remove holes in export memory due to removed PARAM exports.
    * This is done by renumbering all PARAM exports.
    */
   if (removed_any) {
      uint8_t old_offset[VARYING_SLOT_MAX];
      unsigned out, i;

      /* Make a copy of the offsets. We need the old version while
       * we are modifying some of them. */
      memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));

      for (i = 0; i < exports.num; i++) {
         unsigned offset = exports.exp[i].offset;

         /* Update vs_output_param_offset. Multiple outputs can
          * have the same offset.
          */
         for (out = 0; out < num_outputs; out++) {
            if (old_offset[out] == offset)
               vs_output_param_offset[out] = i;
         }

         /* Change the PARAM offset in the instruction. */
         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
                        LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
      }
      *num_param_exports = exports.num;
   }
}
2851
2852 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2853 {
2854 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2855 ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2856 AC_FUNC_ATTR_CONVERGENT);
2857 }
2858
2859 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2860 {
2861 unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
2862 ctx->lds = LLVMBuildIntToPtr(
2863 ctx->builder, ctx->i32_0,
2864 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
2865 }
2866
2867 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
2868 {
2869 return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
2870 }
2871
2872 void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
2873 {
2874 value = ac_to_integer(ctx, value);
2875 ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
2876 }
2877
2878 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
2879 {
2880 unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2881 const char *intrin_name;
2882 LLVMTypeRef type;
2883 LLVMValueRef zero;
2884
2885 switch (src0_bitsize) {
2886 case 64:
2887 intrin_name = "llvm.cttz.i64";
2888 type = ctx->i64;
2889 zero = ctx->i64_0;
2890 break;
2891 case 32:
2892 intrin_name = "llvm.cttz.i32";
2893 type = ctx->i32;
2894 zero = ctx->i32_0;
2895 break;
2896 case 16:
2897 intrin_name = "llvm.cttz.i16";
2898 type = ctx->i16;
2899 zero = ctx->i16_0;
2900 break;
2901 case 8:
2902 intrin_name = "llvm.cttz.i8";
2903 type = ctx->i8;
2904 zero = ctx->i8_0;
2905 break;
2906 default:
2907 unreachable(!"invalid bitsize");
2908 }
2909
2910 LLVMValueRef params[2] = {
2911 src0,
2912
2913 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
2914 * add special code to check for x=0. The reason is that
2915 * the LLVM behavior for x=0 is different from what we
2916 * need here. However, LLVM also assumes that ffs(x) is
2917 * in [0, 31], but GLSL expects that ffs(0) = -1, so
2918 * a conditional assignment to handle 0 is still required.
2919 *
2920 * The hardware already implements the correct behavior.
2921 */
2922 ctx->i1true,
2923 };