1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sub license, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18 * USE OR OTHER DEALINGS IN THE SOFTWARE.
19 *
20 * The above copyright notice and this permission notice (including the
21 * next paragraph) shall be included in all copies or substantial portions
22 * of the Software.
23 *
24 */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27
28 #include <llvm-c/Core.h>
29
30 #include "c11/threads.h"
31
32 #include <assert.h>
33 #include <stdio.h>
34
35 #include "ac_llvm_util.h"
36 #include "ac_exp_param.h"
37 #include "util/bitscan.h"
38 #include "util/macros.h"
39 #include "util/u_atomic.h"
40 #include "util/u_math.h"
41 #include "sid.h"
42
43 #include "shader_enums.h"
44
45 #define AC_LLVM_INITIAL_CF_DEPTH 4
46
47 /* Data for if/else/endif and bgnloop/endloop control flow structures.
48 */
49 struct ac_llvm_flow {
50 /* Loop exit or next part of if/else/endif. */
51 LLVMBasicBlockRef next_block;
52 LLVMBasicBlockRef loop_entry_block;
53 };
54
55 /* Initialize module-independent parts of the context.
56 *
57  * The caller is responsible for initializing ctx::builder.
58 */
59 void
60 ac_llvm_context_init(struct ac_llvm_context *ctx,
61 struct ac_llvm_compiler *compiler,
62 enum chip_class chip_class, enum radeon_family family,
63 unsigned wave_size)
64 {
65 LLVMValueRef args[1];
66
67 ctx->context = LLVMContextCreate();
68
69 ctx->chip_class = chip_class;
70 ctx->family = family;
71 ctx->wave_size = wave_size;
72 ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32
73 : compiler->tm,
74 ctx->context);
75 ctx->builder = NULL;
76
77 ctx->voidt = LLVMVoidTypeInContext(ctx->context);
78 ctx->i1 = LLVMInt1TypeInContext(ctx->context);
79 ctx->i8 = LLVMInt8TypeInContext(ctx->context);
80 ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
81 ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
82 ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
83 ctx->intptr = ctx->i32;
84 ctx->f16 = LLVMHalfTypeInContext(ctx->context);
85 ctx->f32 = LLVMFloatTypeInContext(ctx->context);
86 ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
87 ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
88 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
89 ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
90 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
91 ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
92 ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
93 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
94 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
95
96 ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
97 ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
98 ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
99 ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
100 ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
101 ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
102 ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
103 ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
104 ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
105 ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
106 ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
107 ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
108 ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
109 ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
110
111 ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
112 ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
113
114 ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
115 "range", 5);
116
117 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
118 "invariant.load", 14);
119
120 ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
121
122 args[0] = LLVMConstReal(ctx->f32, 2.5);
123 ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
124
125 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
126 "amdgpu.uniform", 14);
127
128 ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
129 }
130
131 void
132 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
133 {
134 free(ctx->flow);
135 ctx->flow = NULL;
136 ctx->flow_depth_max = 0;
137 }
138
139 int
140 ac_get_llvm_num_components(LLVMValueRef value)
141 {
142 LLVMTypeRef type = LLVMTypeOf(value);
143 unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
144 ? LLVMGetVectorSize(type)
145 : 1;
146 return num_components;
147 }
148
149 LLVMValueRef
150 ac_llvm_extract_elem(struct ac_llvm_context *ac,
151 LLVMValueRef value,
152 int index)
153 {
154 if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
155 assert(index == 0);
156 return value;
157 }
158
159 return LLVMBuildExtractElement(ac->builder, value,
160 LLVMConstInt(ac->i32, index, false), "");
161 }
162
163 int
164 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
165 {
166 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
167 type = LLVMGetElementType(type);
168
169 if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
170 return LLVMGetIntTypeWidth(type);
171
172 if (type == ctx->f16)
173 return 16;
174 if (type == ctx->f32)
175 return 32;
176 if (type == ctx->f64)
177 return 64;
178
179 unreachable("Unhandled type kind in get_elem_bits");
180 }
181
182 unsigned
183 ac_get_type_size(LLVMTypeRef type)
184 {
185 LLVMTypeKind kind = LLVMGetTypeKind(type);
186
187 switch (kind) {
188 case LLVMIntegerTypeKind:
189 return LLVMGetIntTypeWidth(type) / 8;
190 case LLVMHalfTypeKind:
191 return 2;
192 case LLVMFloatTypeKind:
193 return 4;
194 case LLVMDoubleTypeKind:
195 return 8;
196 case LLVMPointerTypeKind:
197 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
198 return 4;
199 return 8;
200 case LLVMVectorTypeKind:
201 return LLVMGetVectorSize(type) *
202 ac_get_type_size(LLVMGetElementType(type));
203 case LLVMArrayTypeKind:
204 return LLVMGetArrayLength(type) *
205 ac_get_type_size(LLVMGetElementType(type));
206 default:
207 assert(0);
208 return 0;
209 }
210 }
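
/* Examples inferred from the cases above (a sketch, not exhaustive):
 * ctx->v4i32 -> 16 bytes, ctx->f16 -> 2 bytes, and a pointer in
 * AC_ADDR_SPACE_CONST_32BIT -> 4 bytes.
 */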
211
212 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
213 {
214 if (t == ctx->i8)
215 return ctx->i8;
216 else if (t == ctx->f16 || t == ctx->i16)
217 return ctx->i16;
218 else if (t == ctx->f32 || t == ctx->i32)
219 return ctx->i32;
220 else if (t == ctx->f64 || t == ctx->i64)
221 return ctx->i64;
222 else
223 unreachable("Unhandled integer size");
224 }
225
226 LLVMTypeRef
227 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
228 {
229 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
230 LLVMTypeRef elem_type = LLVMGetElementType(t);
231 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
232 LLVMGetVectorSize(t));
233 }
234 if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
235 switch (LLVMGetPointerAddressSpace(t)) {
236 case AC_ADDR_SPACE_GLOBAL:
237 return ctx->i64;
238 case AC_ADDR_SPACE_LDS:
239 return ctx->i32;
240 default:
241 unreachable("unhandled address space");
242 }
243 }
244 return to_integer_type_scalar(ctx, t);
245 }
246
247 LLVMValueRef
248 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
249 {
250 LLVMTypeRef type = LLVMTypeOf(v);
251 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
252 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
253 }
254 return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
255 }
256
257 LLVMValueRef
258 ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
259 {
260 LLVMTypeRef type = LLVMTypeOf(v);
261 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
262 return v;
263 return ac_to_integer(ctx, v);
264 }
265
266 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
267 {
268 if (t == ctx->i8)
269 return ctx->i8;
270 else if (t == ctx->i16 || t == ctx->f16)
271 return ctx->f16;
272 else if (t == ctx->i32 || t == ctx->f32)
273 return ctx->f32;
274 else if (t == ctx->i64 || t == ctx->f64)
275 return ctx->f64;
276 else
277 unreachable("Unhandled float size");
278 }
279
280 LLVMTypeRef
281 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
282 {
283 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
284 LLVMTypeRef elem_type = LLVMGetElementType(t);
285 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
286 LLVMGetVectorSize(t));
287 }
288 return to_float_type_scalar(ctx, t);
289 }
290
291 LLVMValueRef
292 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
293 {
294 LLVMTypeRef type = LLVMTypeOf(v);
295 return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
296 }
297
298
299 LLVMValueRef
300 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
301 LLVMTypeRef return_type, LLVMValueRef *params,
302 unsigned param_count, unsigned attrib_mask)
303 {
304 LLVMValueRef function, call;
305 bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
306
307 function = LLVMGetNamedFunction(ctx->module, name);
308 if (!function) {
309 LLVMTypeRef param_types[32], function_type;
310 unsigned i;
311
312 assert(param_count <= 32);
313
314 for (i = 0; i < param_count; ++i) {
315 assert(params[i]);
316 param_types[i] = LLVMTypeOf(params[i]);
317 }
318 function_type =
319 LLVMFunctionType(return_type, param_types, param_count, 0);
320 function = LLVMAddFunction(ctx->module, name, function_type);
321
322 LLVMSetFunctionCallConv(function, LLVMCCallConv);
323 LLVMSetLinkage(function, LLVMExternalLinkage);
324
325 if (!set_callsite_attrs)
326 ac_add_func_attributes(ctx->context, function, attrib_mask);
327 }
328
329 call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
330 if (set_callsite_attrs)
331 ac_add_func_attributes(ctx->context, call, attrib_mask);
332 return call;
333 }
334
335 /**
336 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
337 * intrinsic names).
338 */
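/* Illustrative results (inferred from the code below): i32 -> "i32",
 * <2 x half> -> "v2f16", <4 x float> -> "v4f32".
 */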
339 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
340 {
341 LLVMTypeRef elem_type = type;
342
343 assert(bufsize >= 8);
344
345 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
346 int ret = snprintf(buf, bufsize, "v%u",
347 LLVMGetVectorSize(type));
348 if (ret < 0) {
349 char *type_name = LLVMPrintTypeToString(type);
350 fprintf(stderr, "Error building type name for: %s\n",
351 type_name);
352 return;
353 }
354 elem_type = LLVMGetElementType(type);
355 buf += ret;
356 bufsize -= ret;
357 }
358 switch (LLVMGetTypeKind(elem_type)) {
359 default: break;
360 case LLVMIntegerTypeKind:
361 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
362 break;
363 case LLVMHalfTypeKind:
364 snprintf(buf, bufsize, "f16");
365 break;
366 case LLVMFloatTypeKind:
367 snprintf(buf, bufsize, "f32");
368 break;
369 case LLVMDoubleTypeKind:
370 snprintf(buf, bufsize, "f64");
371 break;
372 }
373 }
374
375 /**
376 * Helper function that builds an LLVM IR PHI node and immediately adds
377 * incoming edges.
378 */
379 LLVMValueRef
380 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
381 unsigned count_incoming, LLVMValueRef *values,
382 LLVMBasicBlockRef *blocks)
383 {
384 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
385 LLVMAddIncoming(phi, values, blocks, count_incoming);
386 return phi;
387 }
388
389 void ac_build_s_barrier(struct ac_llvm_context *ctx)
390 {
391 ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
392 0, AC_FUNC_ATTR_CONVERGENT);
393 }
394
395 /* Prevent optimizations (at least of memory accesses) across the current
396 * point in the program by emitting empty inline assembly that is marked as
397 * having side effects.
398 *
399 * Optionally, a value can be passed through the inline assembly to prevent
400 * LLVM from hoisting calls to ReadNone functions.
401 */
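/* A sketch of the IR emitted when pvgpr is non-NULL (value names and the
 * counter value are hypothetical):
 *
 *    %r = call i32 asm sideeffect "; 42", "=v,0"(i32 %vgpr0)
 *
 * The "=v,0" constraint ties the result to the input VGPR, so LLVM must
 * keep the value live across this point instead of hoisting it.
 */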
402 void
403 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
404 LLVMValueRef *pvgpr)
405 {
406 static int counter = 0;
407
408 LLVMBuilderRef builder = ctx->builder;
409 char code[16];
410
411 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
412
413 if (!pvgpr) {
414 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
415 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
416 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
417 } else {
418 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
419 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
420 LLVMValueRef vgpr = *pvgpr;
421 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
422 unsigned vgpr_size = ac_get_type_size(vgpr_type);
423 LLVMValueRef vgpr0;
424
425 assert(vgpr_size % 4 == 0);
426
427 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
428 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
429 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
430 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
431 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
432
433 *pvgpr = vgpr;
434 }
435 }
436
437 LLVMValueRef
438 ac_build_shader_clock(struct ac_llvm_context *ctx)
439 {
440 const char *intr = HAVE_LLVM >= 0x0900 && ctx->chip_class >= GFX8 ?
441 "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter";
442 LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0);
443 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
444 }
445
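/* Returns an i64 mask with one bit per lane, set for each active lane in
 * which value is non-zero (a sketch of the semantics, inferred from the
 * icmp-NE arguments below).
 */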
446 LLVMValueRef
447 ac_build_ballot(struct ac_llvm_context *ctx,
448 LLVMValueRef value)
449 {
450 const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32";
451 LLVMValueRef args[3] = {
452 value,
453 ctx->i32_0,
454 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
455 };
456
457 /* We currently have no other way to prevent LLVM from lifting the icmp
458 * calls to a dominating basic block.
459 */
460 ac_build_optimization_barrier(ctx, &args[0]);
461
462 args[0] = ac_to_integer(ctx, args[0]);
463
464 return ac_build_intrinsic(ctx, name,
465 ctx->i64, args, 3,
466 AC_FUNC_ATTR_NOUNWIND |
467 AC_FUNC_ATTR_READNONE |
468 AC_FUNC_ATTR_CONVERGENT);
469 }
470
471 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
472 LLVMValueRef value)
473 {
474 const char *name = HAVE_LLVM >= 0x900 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1";
475 LLVMValueRef args[3] = {
476 value,
477 ctx->i1false,
478 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
479 };
480
481 assert(HAVE_LLVM >= 0x0800);
482 return ac_build_intrinsic(ctx, name, ctx->i64, args, 3,
483 AC_FUNC_ATTR_NOUNWIND |
484 AC_FUNC_ATTR_READNONE |
485 AC_FUNC_ATTR_CONVERGENT);
486 }
487
488 LLVMValueRef
489 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
490 {
491 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
492 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
493 return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
494 }
495
496 LLVMValueRef
497 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
498 {
499 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
500 return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
501 LLVMConstInt(ctx->i64, 0, 0), "");
502 }
503
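/* Returns true when all active lanes agree on value, i.e. the vote set is
 * either equal to the active set or empty (semantics inferred from the
 * code below).
 */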
504 LLVMValueRef
505 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
506 {
507 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
508 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
509
510 LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
511 vote_set, active_set, "");
512 LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
513 vote_set,
514 LLVMConstInt(ctx->i64, 0, 0), "");
515 return LLVMBuildOr(ctx->builder, all, none, "");
516 }
517
518 LLVMValueRef
519 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
520 unsigned value_count, unsigned component)
521 {
522 LLVMValueRef vec = NULL;
523
524 if (value_count == 1) {
525 return values[component];
526 } else if (!value_count)
527 unreachable("value_count is 0");
528
529 for (unsigned i = component; i < value_count + component; i++) {
530 LLVMValueRef value = values[i];
531
532 if (i == component)
533 			vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
534 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
535 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
536 }
537 return vec;
538 }
539
540 LLVMValueRef
541 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
542 LLVMValueRef *values,
543 unsigned value_count,
544 unsigned value_stride,
545 bool load,
546 bool always_vector)
547 {
548 LLVMBuilderRef builder = ctx->builder;
549 LLVMValueRef vec = NULL;
550 unsigned i;
551
552 if (value_count == 1 && !always_vector) {
553 if (load)
554 return LLVMBuildLoad(builder, values[0], "");
555 return values[0];
556 } else if (!value_count)
557 unreachable("value_count is 0");
558
559 for (i = 0; i < value_count; i++) {
560 LLVMValueRef value = values[i * value_stride];
561 if (load)
562 value = LLVMBuildLoad(builder, value, "");
563
564 if (!i)
565 			vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
566 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
567 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
568 }
569 return vec;
570 }
571
572 LLVMValueRef
573 ac_build_gather_values(struct ac_llvm_context *ctx,
574 LLVMValueRef *values,
575 unsigned value_count)
576 {
577 return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
578 }
579
580 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
581 * channels with undef. Extract at most src_channels components from the input.
582 */
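/* For example (a sketch): expanding a v2f32 with src_channels = 2 and
 * dst_channels = 4 yields <float x, float y, float undef, float undef>.
 */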
583 static LLVMValueRef
584 ac_build_expand(struct ac_llvm_context *ctx,
585 LLVMValueRef value,
586 unsigned src_channels,
587 unsigned dst_channels)
588 {
589 LLVMTypeRef elemtype;
590 LLVMValueRef chan[dst_channels];
591
592 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
593 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
594
595 if (src_channels == dst_channels && vec_size == dst_channels)
596 return value;
597
598 src_channels = MIN2(src_channels, vec_size);
599
600 for (unsigned i = 0; i < src_channels; i++)
601 chan[i] = ac_llvm_extract_elem(ctx, value, i);
602
603 elemtype = LLVMGetElementType(LLVMTypeOf(value));
604 } else {
605 if (src_channels) {
606 assert(src_channels == 1);
607 chan[0] = value;
608 }
609 elemtype = LLVMTypeOf(value);
610 }
611
612 for (unsigned i = src_channels; i < dst_channels; i++)
613 chan[i] = LLVMGetUndef(elemtype);
614
615 return ac_build_gather_values(ctx, chan, dst_channels);
616 }
617
618 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
619 * with undef. Extract at most num_channels components from the input.
620 */
621 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
622 LLVMValueRef value,
623 unsigned num_channels)
624 {
625 return ac_build_expand(ctx, value, num_channels, 4);
626 }
627
628 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
629 {
630 unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
631 const char *name;
632
633 if (type_size == 2)
634 name = "llvm.rint.f16";
635 else if (type_size == 4)
636 name = "llvm.rint.f32";
637 else
638 name = "llvm.rint.f64";
639
640 return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
641 AC_FUNC_ATTR_READNONE);
642 }
643
644 LLVMValueRef
645 ac_build_fdiv(struct ac_llvm_context *ctx,
646 LLVMValueRef num,
647 LLVMValueRef den)
648 {
649 /* If we do (num / den), LLVM >= 7.0 does:
650 * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
651 *
652 * If we do (num * (1 / den)), LLVM does:
653 * return num * v_rcp_f32(den);
654 */
655 LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
656 LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
657 LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
658
659 /* Use v_rcp_f32 instead of precise division. */
660 if (!LLVMIsConstant(ret))
661 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
662 return ret;
663 }
664
665 /* See fast_idiv_by_const.h. */
666 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
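/* In scalar terms, the sequence below computes:
 *
 *    (((uint64_t)(num >> pre_shift) * multiplier + increment) >> 32) >> post_shift
 */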
667 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
668 LLVMValueRef num,
669 LLVMValueRef multiplier,
670 LLVMValueRef pre_shift,
671 LLVMValueRef post_shift,
672 LLVMValueRef increment)
673 {
674 LLVMBuilderRef builder = ctx->builder;
675
676 num = LLVMBuildLShr(builder, num, pre_shift, "");
677 num = LLVMBuildMul(builder,
678 LLVMBuildZExt(builder, num, ctx->i64, ""),
679 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
680 num = LLVMBuildAdd(builder, num,
681 LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
682 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
683 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
684 return LLVMBuildLShr(builder, num, post_shift, "");
685 }
686
687 /* See fast_idiv_by_const.h. */
688 /* If num != UINT_MAX, this more efficient version can be used. */
689 /* Set: increment = util_fast_udiv_info::increment; */
690 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
691 LLVMValueRef num,
692 LLVMValueRef multiplier,
693 LLVMValueRef pre_shift,
694 LLVMValueRef post_shift,
695 LLVMValueRef increment)
696 {
697 LLVMBuilderRef builder = ctx->builder;
698
699 num = LLVMBuildLShr(builder, num, pre_shift, "");
700 num = LLVMBuildNUWAdd(builder, num, increment, "");
701 num = LLVMBuildMul(builder,
702 LLVMBuildZExt(builder, num, ctx->i64, ""),
703 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
704 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
705 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
706 return LLVMBuildLShr(builder, num, post_shift, "");
707 }
708
709 /* See fast_idiv_by_const.h. */
710 /* Both operands must fit in 31 bits and the divisor must not be 1. */
711 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
712 LLVMValueRef num,
713 LLVMValueRef multiplier,
714 LLVMValueRef post_shift)
715 {
716 LLVMBuilderRef builder = ctx->builder;
717
718 num = LLVMBuildMul(builder,
719 LLVMBuildZExt(builder, num, ctx->i64, ""),
720 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
721 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
722 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
723 return LLVMBuildLShr(builder, num, post_shift, "");
724 }
725
726 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
727 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
728 * already multiplied by two. id is the cube face number.
729 */
730 struct cube_selection_coords {
731 LLVMValueRef stc[2];
732 LLVMValueRef ma;
733 LLVMValueRef id;
734 };
735
736 static void
737 build_cube_intrinsic(struct ac_llvm_context *ctx,
738 LLVMValueRef in[3],
739 struct cube_selection_coords *out)
740 {
741 LLVMTypeRef f32 = ctx->f32;
742
743 out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
744 f32, in, 3, AC_FUNC_ATTR_READNONE);
745 out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
746 f32, in, 3, AC_FUNC_ATTR_READNONE);
747 out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
748 f32, in, 3, AC_FUNC_ATTR_READNONE);
749 out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
750 f32, in, 3, AC_FUNC_ATTR_READNONE);
751 }
752
753 /**
754 * Build a manual selection sequence for cube face sc/tc coordinates and
755 * major axis vector (multiplied by 2 for consistency) for the given
756 * vec3 \p coords, for the face implied by \p selcoords.
757 *
758 * For the major axis, we always adjust the sign to be in the direction of
759 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
760 * the selcoords major axis.
761 */
762 static void build_cube_select(struct ac_llvm_context *ctx,
763 const struct cube_selection_coords *selcoords,
764 const LLVMValueRef *coords,
765 LLVMValueRef *out_st,
766 LLVMValueRef *out_ma)
767 {
768 LLVMBuilderRef builder = ctx->builder;
769 LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
770 LLVMValueRef is_ma_positive;
771 LLVMValueRef sgn_ma;
772 LLVMValueRef is_ma_z, is_not_ma_z;
773 LLVMValueRef is_ma_y;
774 LLVMValueRef is_ma_x;
775 LLVMValueRef sgn;
776 LLVMValueRef tmp;
777
778 is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
779 selcoords->ma, LLVMConstReal(f32, 0.0), "");
780 sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
781 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
782
783 is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
784 is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
785 is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
786 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
787 is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
788
789 /* Select sc */
790 tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
791 sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
792 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
793 LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
794 out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
795
796 /* Select tc */
797 tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
798 sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
799 LLVMConstReal(f32, -1.0), "");
800 out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
801
802 /* Select ma */
803 tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
804 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
805 tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
806 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
807 *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
808 }
809
810 void
811 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
812 bool is_deriv, bool is_array, bool is_lod,
813 LLVMValueRef *coords_arg,
814 LLVMValueRef *derivs_arg)
815 {
816
817 LLVMBuilderRef builder = ctx->builder;
818 struct cube_selection_coords selcoords;
819 LLVMValueRef coords[3];
820 LLVMValueRef invma;
821
822 if (is_array && !is_lod) {
823 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
824
825 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
826 *
827 * "For Array forms, the array layer used will be
828 *
829 * max(0, min(d−1, floor(layer+0.5)))
830 *
831 * where d is the depth of the texture array and layer
832 	 *	 comes from the component indicated in the tables below."
833 	 *
834 	 * The rounding also works around an issue where the layer is taken from
835 	 * a helper invocation which happens to fall on a different layer due to extrapolation.
836 *
837 * GFX8 and earlier attempt to implement this in hardware by
838 * clamping the value of coords[2] = (8 * layer) + face.
839 	 * Unfortunately, this means that we end up with the wrong
840 * face when clamping occurs.
841 *
842 * Clamp the layer earlier to work around the issue.
843 */
844 if (ctx->chip_class <= GFX8) {
845 LLVMValueRef ge0;
846 ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
847 tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
848 }
849
850 coords_arg[3] = tmp;
851 }
852
853 build_cube_intrinsic(ctx, coords_arg, &selcoords);
854
855 invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
856 ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
857 invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
858
859 for (int i = 0; i < 2; ++i)
860 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
861
862 coords[2] = selcoords.id;
863
864 if (is_deriv && derivs_arg) {
865 LLVMValueRef derivs[4];
866 int axis;
867
868 /* Convert cube derivatives to 2D derivatives. */
869 for (axis = 0; axis < 2; axis++) {
870 LLVMValueRef deriv_st[2];
871 LLVMValueRef deriv_ma;
872
873 /* Transform the derivative alongside the texture
874 * coordinate. Mathematically, the correct formula is
875 * as follows. Assume we're projecting onto the +Z face
876 * and denote by dx/dh the derivative of the (original)
877 * X texture coordinate with respect to horizontal
878 * window coordinates. The projection onto the +Z face
879 * plane is:
880 *
881 * f(x,z) = x/z
882 *
883 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
884 * = 1/z * dx/dh - x/z * 1/z * dz/dh.
885 *
886 			 * This motivates the implementation below.
887 *
888 * Whether this actually gives the expected results for
889 * apps that might feed in derivatives obtained via
890 * finite differences is anyone's guess. The OpenGL spec
891 * seems awfully quiet about how textureGrad for cube
892 * maps should be handled.
893 */
894 build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
895 deriv_st, &deriv_ma);
896
897 deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
898
899 for (int i = 0; i < 2; ++i)
900 derivs[axis * 2 + i] =
901 LLVMBuildFSub(builder,
902 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
903 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
904 }
905
906 memcpy(derivs_arg, derivs, sizeof(derivs));
907 }
908
909 /* Shift the texture coordinate. This must be applied after the
910 * derivative calculation.
911 */
912 for (int i = 0; i < 2; ++i)
913 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
914
915 if (is_array) {
916 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
917 /* coords_arg.w component - array_index for cube arrays */
918 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
919 }
920
921 memcpy(coords_arg, coords, sizeof(coords));
922 }
923
924
925 LLVMValueRef
926 ac_build_fs_interp(struct ac_llvm_context *ctx,
927 LLVMValueRef llvm_chan,
928 LLVMValueRef attr_number,
929 LLVMValueRef params,
930 LLVMValueRef i,
931 LLVMValueRef j)
932 {
933 LLVMValueRef args[5];
934 LLVMValueRef p1;
935
936 args[0] = i;
937 args[1] = llvm_chan;
938 args[2] = attr_number;
939 args[3] = params;
940
941 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
942 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
943
944 args[0] = p1;
945 args[1] = j;
946 args[2] = llvm_chan;
947 args[3] = attr_number;
948 args[4] = params;
949
950 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
951 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
952 }
953
954 LLVMValueRef
955 ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
956 LLVMValueRef llvm_chan,
957 LLVMValueRef attr_number,
958 LLVMValueRef params,
959 LLVMValueRef i,
960 LLVMValueRef j)
961 {
962 LLVMValueRef args[6];
963 LLVMValueRef p1;
964
965 args[0] = i;
966 args[1] = llvm_chan;
967 args[2] = attr_number;
968 args[3] = ctx->i1false;
969 args[4] = params;
970
971 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
972 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
973
974 args[0] = p1;
975 args[1] = j;
976 args[2] = llvm_chan;
977 args[3] = attr_number;
978 args[4] = ctx->i1false;
979 args[5] = params;
980
981 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
982 ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
983 }
984
985 LLVMValueRef
986 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
987 LLVMValueRef parameter,
988 LLVMValueRef llvm_chan,
989 LLVMValueRef attr_number,
990 LLVMValueRef params)
991 {
992 LLVMValueRef args[4];
993
994 args[0] = parameter;
995 args[1] = llvm_chan;
996 args[2] = attr_number;
997 args[3] = params;
998
999 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
1000 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
1001 }
1002
1003 LLVMValueRef
1004 ac_build_gep_ptr(struct ac_llvm_context *ctx,
1005 LLVMValueRef base_ptr,
1006 LLVMValueRef index)
1007 {
1008 return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1009 }
1010
1011 LLVMValueRef
1012 ac_build_gep0(struct ac_llvm_context *ctx,
1013 LLVMValueRef base_ptr,
1014 LLVMValueRef index)
1015 {
1016 LLVMValueRef indices[2] = {
1017 ctx->i32_0,
1018 index,
1019 };
1020 return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1021 }
1022
1023 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
1024 LLVMValueRef index)
1025 {
1026 return LLVMBuildPointerCast(ctx->builder,
1027 LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
1028 LLVMTypeOf(ptr), "");
1029 }
1030
1031 void
1032 ac_build_indexed_store(struct ac_llvm_context *ctx,
1033 LLVMValueRef base_ptr, LLVMValueRef index,
1034 LLVMValueRef value)
1035 {
1036 LLVMBuildStore(ctx->builder, value,
1037 ac_build_gep0(ctx, base_ptr, index));
1038 }
1039
1040 /**
1041 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1042 * It's equivalent to doing a load from &base_ptr[index].
1043 *
1044 * \param base_ptr Where the array starts.
1045 * \param index The element index into the array.
1046 * \param uniform Whether the base_ptr and index can be assumed to be
1047 * dynamically uniform (i.e. load to an SGPR)
1048 * \param invariant Whether the load is invariant (no other opcodes affect it)
1049 * \param no_unsigned_wraparound
1050 * For all possible re-associations and re-distributions of an expression
1051 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1052 * without inbounds in base_ptr), this parameter is true if "addr + offset"
1053 * does not result in an unsigned integer wraparound. This is used for
1054 * optimal code generation of 32-bit pointer arithmetic.
1055 *
1056 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
1057 * integer wraparound can't be an imm offset in s_load_dword, because
1058 * the instruction performs "addr + offset" in 64 bits.
1059 *
1060 * Expected usage for bindless textures by chaining GEPs:
1061 * // possible unsigned wraparound, don't use InBounds:
1062 * ptr1 = LLVMBuildGEP(base_ptr, index);
1063 * image = load(ptr1); // becomes "s_load ptr1, 0"
1064 *
1065 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1066 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1067 */
1068 static LLVMValueRef
1069 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1070 LLVMValueRef index, bool uniform, bool invariant,
1071 bool no_unsigned_wraparound)
1072 {
1073 LLVMValueRef pointer, result;
1074
1075 if (no_unsigned_wraparound &&
1076 LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1077 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1078 else
1079 pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1080
1081 if (uniform)
1082 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1083 result = LLVMBuildLoad(ctx->builder, pointer, "");
1084 if (invariant)
1085 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1086 return result;
1087 }
1088
1089 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1090 LLVMValueRef index)
1091 {
1092 return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1093 }
1094
1095 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
1096 LLVMValueRef base_ptr, LLVMValueRef index)
1097 {
1098 return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1099 }
1100
1101 /* This assumes that there is no unsigned integer wraparound during the address
1102 * computation, excluding all GEPs within base_ptr. */
1103 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
1104 LLVMValueRef base_ptr, LLVMValueRef index)
1105 {
1106 return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1107 }
1108
1109 /* See ac_build_load_custom() documentation. */
1110 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1111 LLVMValueRef base_ptr, LLVMValueRef index)
1112 {
1113 return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1114 }
1115
1116 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx,
1117 unsigned cache_policy)
1118 {
1119 return cache_policy |
1120 (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1121 }
1122
1123 static void
1124 ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
1125 LLVMValueRef rsrc,
1126 LLVMValueRef data,
1127 LLVMValueRef vindex,
1128 LLVMValueRef voffset,
1129 unsigned num_channels,
1130 unsigned cache_policy,
1131 bool use_format)
1132 {
1133 LLVMValueRef args[] = {
1134 data,
1135 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1136 vindex ? vindex : ctx->i32_0,
1137 voffset,
1138 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1139 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1140 };
1141 unsigned func = CLAMP(num_channels, 1, 3) - 1;
1142
1143 const char *type_names[] = {"f32", "v2f32", "v4f32"};
1144 char name[256];
1145
1146 if (use_format) {
1147 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
1148 type_names[func]);
1149 } else {
1150 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
1151 type_names[func]);
1152 }
1153
1154 ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
1155 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1156 }
1157
1158 static void
1159 ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
1160 LLVMValueRef rsrc,
1161 LLVMValueRef data,
1162 LLVMValueRef vindex,
1163 LLVMValueRef voffset,
1164 LLVMValueRef soffset,
1165 unsigned num_channels,
1166 LLVMTypeRef return_channel_type,
1167 unsigned cache_policy,
1168 bool use_format,
1169 bool structurized)
1170 {
1171 LLVMValueRef args[6];
1172 int idx = 0;
1173 args[idx++] = data;
1174 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1175 if (structurized)
1176 args[idx++] = vindex ? vindex : ctx->i32_0;
1177 args[idx++] = voffset ? voffset : ctx->i32_0;
1178 args[idx++] = soffset ? soffset : ctx->i32_0;
1179 args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1180 unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1181 const char *indexing_kind = structurized ? "struct" : "raw";
1182 char name[256], type_name[8];
1183
1184 LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
1185 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1186
1187 if (use_format) {
1188 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
1189 indexing_kind, type_name);
1190 } else {
1191 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
1192 indexing_kind, type_name);
1193 }
1194
1195 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1196 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1197 }
1198
1199 void
1200 ac_build_buffer_store_format(struct ac_llvm_context *ctx,
1201 LLVMValueRef rsrc,
1202 LLVMValueRef data,
1203 LLVMValueRef vindex,
1204 LLVMValueRef voffset,
1205 unsigned num_channels,
1206 unsigned cache_policy)
1207 {
1208 if (HAVE_LLVM >= 0x800) {
1209 ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
1210 voffset, NULL, num_channels,
1211 ctx->f32, cache_policy,
1212 true, true);
1213 } else {
1214 ac_build_llvm7_buffer_store_common(ctx, rsrc, data, vindex, voffset,
1215 num_channels, cache_policy,
1216 true);
1217 }
1218 }
1219
1220 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1221 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1222 * or v4i32 (num_channels=3,4).
1223 */
1224 void
1225 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
1226 LLVMValueRef rsrc,
1227 LLVMValueRef vdata,
1228 unsigned num_channels,
1229 LLVMValueRef voffset,
1230 LLVMValueRef soffset,
1231 unsigned inst_offset,
1232 unsigned cache_policy,
1233 bool swizzle_enable_hint)
1234 {
1235 	/* Split 3-channel stores, because only LLVM 9+ supports 3-channel
1236 * intrinsics. */
1237 if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
1238 LLVMValueRef v[3], v01;
1239
1240 for (int i = 0; i < 3; i++) {
1241 v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
1242 LLVMConstInt(ctx->i32, i, 0), "");
1243 }
1244 v01 = ac_build_gather_values(ctx, v, 2);
1245
1246 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
1247 soffset, inst_offset, cache_policy,
1248 swizzle_enable_hint);
1249 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
1250 soffset, inst_offset + 8,
1251 cache_policy,
1252 swizzle_enable_hint);
1253 return;
1254 }
1255
1256 /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1257 * (voffset is swizzled, but soffset isn't swizzled).
1258 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1259 */
1260 if (!swizzle_enable_hint) {
1261 LLVMValueRef offset = soffset;
1262
1263 if (inst_offset)
1264 offset = LLVMBuildAdd(ctx->builder, offset,
1265 LLVMConstInt(ctx->i32, inst_offset, 0), "");
1266
1267 if (HAVE_LLVM >= 0x800) {
1268 ac_build_llvm8_buffer_store_common(ctx, rsrc,
1269 ac_to_float(ctx, vdata),
1270 ctx->i32_0,
1271 voffset, offset,
1272 num_channels,
1273 ctx->f32,
1274 cache_policy,
1275 false, false);
1276 } else {
1277 if (voffset)
1278 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1279
1280 ac_build_llvm7_buffer_store_common(ctx, rsrc,
1281 ac_to_float(ctx, vdata),
1282 ctx->i32_0, offset,
1283 num_channels, cache_policy,
1284 false);
1285 }
1286 return;
1287 }
1288
1289 static const unsigned dfmts[] = {
1290 V_008F0C_BUF_DATA_FORMAT_32,
1291 V_008F0C_BUF_DATA_FORMAT_32_32,
1292 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1293 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1294 };
1295 unsigned dfmt = dfmts[num_channels - 1];
1296 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1297 LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1298
1299 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1300 immoffset, num_channels, dfmt, nfmt, cache_policy);
1301 }
1302
1303 static LLVMValueRef
1304 ac_build_llvm7_buffer_load_common(struct ac_llvm_context *ctx,
1305 LLVMValueRef rsrc,
1306 LLVMValueRef vindex,
1307 LLVMValueRef voffset,
1308 unsigned num_channels,
1309 unsigned cache_policy,
1310 bool can_speculate,
1311 bool use_format)
1312 {
1313 LLVMValueRef args[] = {
1314 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1315 vindex ? vindex : ctx->i32_0,
1316 voffset,
1317 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), 0),
1318 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), 0)
1319 };
1320 unsigned func = CLAMP(num_channels, 1, 3) - 1;
1321
1322 LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
1323 const char *type_names[] = {"f32", "v2f32", "v4f32"};
1324 char name[256];
1325
1326 if (use_format) {
1327 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
1328 type_names[func]);
1329 } else {
1330 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
1331 type_names[func]);
1332 }
1333
1334 return ac_build_intrinsic(ctx, name, types[func], args,
1335 ARRAY_SIZE(args),
1336 ac_get_load_intr_attribs(can_speculate));
1337 }
1338
1339 static LLVMValueRef
1340 ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
1341 LLVMValueRef rsrc,
1342 LLVMValueRef vindex,
1343 LLVMValueRef voffset,
1344 LLVMValueRef soffset,
1345 unsigned num_channels,
1346 LLVMTypeRef channel_type,
1347 unsigned cache_policy,
1348 bool can_speculate,
1349 bool use_format,
1350 bool structurized)
1351 {
1352 LLVMValueRef args[5];
1353 int idx = 0;
1354 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1355 if (structurized)
1356 args[idx++] = vindex ? vindex : ctx->i32_0;
1357 args[idx++] = voffset ? voffset : ctx->i32_0;
1358 args[idx++] = soffset ? soffset : ctx->i32_0;
1359 args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1360 unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1361 const char *indexing_kind = structurized ? "struct" : "raw";
1362 char name[256], type_name[8];
1363
1364 LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1365 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1366
1367 if (use_format) {
1368 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
1369 indexing_kind, type_name);
1370 } else {
1371 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
1372 indexing_kind, type_name);
1373 }
1374
1375 return ac_build_intrinsic(ctx, name, type, args, idx,
1376 ac_get_load_intr_attribs(can_speculate));
1377 }
1378
1379 LLVMValueRef
1380 ac_build_buffer_load(struct ac_llvm_context *ctx,
1381 LLVMValueRef rsrc,
1382 int num_channels,
1383 LLVMValueRef vindex,
1384 LLVMValueRef voffset,
1385 LLVMValueRef soffset,
1386 unsigned inst_offset,
1387 unsigned cache_policy,
1388 bool can_speculate,
1389 bool allow_smem)
1390 {
1391 LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1392 if (voffset)
1393 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1394 if (soffset)
1395 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1396
1397 if (allow_smem && !(cache_policy & ac_slc) &&
1398 (!(cache_policy & ac_glc) || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= GFX8))) {
1399 assert(vindex == NULL);
1400
1401 LLVMValueRef result[8];
1402
1403 for (int i = 0; i < num_channels; i++) {
1404 if (i) {
1405 offset = LLVMBuildAdd(ctx->builder, offset,
1406 LLVMConstInt(ctx->i32, 4, 0), "");
1407 }
1408 const char *intrname =
1409 HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
1410 : "llvm.SI.load.const.v4i32";
1411 unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
1412 LLVMValueRef args[3] = {
1413 rsrc,
1414 offset,
1415 LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
1416 };
1417 result[i] = ac_build_intrinsic(ctx, intrname,
1418 ctx->f32, args, num_args,
1419 AC_FUNC_ATTR_READNONE |
1420 (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
1421 }
1422 if (num_channels == 1)
1423 return result[0];
1424
1425 if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
1426 result[num_channels++] = LLVMGetUndef(ctx->f32);
1427 return ac_build_gather_values(ctx, result, num_channels);
1428 }
1429
1430 if (HAVE_LLVM >= 0x0800) {
1431 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
1432 offset, ctx->i32_0,
1433 num_channels, ctx->f32,
1434 cache_policy,
1435 can_speculate, false,
1436 false);
1437 }
1438
1439 return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, offset,
1440 num_channels, cache_policy,
1441 can_speculate, false);
1442 }
1443
1444 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1445 LLVMValueRef rsrc,
1446 LLVMValueRef vindex,
1447 LLVMValueRef voffset,
1448 unsigned num_channels,
1449 unsigned cache_policy,
1450 bool can_speculate)
1451 {
1452 if (HAVE_LLVM >= 0x800) {
1453 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1454 num_channels, ctx->f32,
1455 cache_policy, can_speculate, true, true);
1456 }
1457 return ac_build_llvm7_buffer_load_common(ctx, rsrc, vindex, voffset,
1458 num_channels, cache_policy,
1459 can_speculate, true);
1460 }
1461
1462 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
1463 LLVMValueRef rsrc,
1464 LLVMValueRef vindex,
1465 LLVMValueRef voffset,
1466 unsigned num_channels,
1467 unsigned cache_policy,
1468 bool can_speculate)
1469 {
1470 if (HAVE_LLVM >= 0x800) {
1471 return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
1472 num_channels, ctx->f32,
1473 cache_policy, can_speculate, true, true);
1474 }
1475
1476 LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
1477 LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
1478 stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
1479
1480 LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
1481 LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
1482 elem_count, stride, "");
1483
1484 LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
1485 LLVMConstInt(ctx->i32, 2, 0), "");
1486
1487 return ac_build_llvm7_buffer_load_common(ctx, new_rsrc, vindex, voffset,
1488 num_channels, cache_policy,
1489 can_speculate, true);
1490 }
1491
1492 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
1493 /// value for LLVM8+ tbuffer intrinsics.
1494 static unsigned
1495 ac_get_tbuffer_format(struct ac_llvm_context *ctx,
1496 unsigned dfmt, unsigned nfmt)
1497 {
1498 if (ctx->chip_class >= GFX10) {
1499 unsigned format;
1500 switch (dfmt) {
1501 default: unreachable("bad dfmt");
1502 case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
1503 case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
1504 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
1505 case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
1506 case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
1507 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
1508 case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
1509 case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
1510 case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break;
1511 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
1512 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break;
1513 }
1514
1515 // Use the regularity properties of the combined format enum.
1516 //
1517 // Note: float is incompatible with 8-bit data formats,
1518 		// [us]{norm,scaled} are incompatible with 32-bit data formats.
1519 // [us]scaled are not writable.
1520 switch (nfmt) {
1521 case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
1522 case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
1523 case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
1524 case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
1525 default: unreachable("bad nfmt");
1526 case V_008F0C_BUF_NUM_FORMAT_UINT: break;
1527 case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
1528 case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
1529 }
1530
1531 return format;
1532 } else {
1533 return dfmt | (nfmt << 4);
1534 }
1535 }
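/* Example derived from the switch above: on GFX10, dfmt = 16_16 with
 * nfmt = FLOAT starts from V_008F0C_IMG_FORMAT_16_16_UINT and adds 2,
 * i.e. the 16_16_FLOAT entry; before GFX10 the pair is simply packed as
 * dfmt | (nfmt << 4).
 */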
1536
1537 static LLVMValueRef
1538 ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
1539 LLVMValueRef rsrc,
1540 LLVMValueRef vindex,
1541 LLVMValueRef voffset,
1542 LLVMValueRef soffset,
1543 unsigned num_channels,
1544 unsigned dfmt,
1545 unsigned nfmt,
1546 unsigned cache_policy,
1547 bool can_speculate,
1548 bool structurized)
1549 {
1550 LLVMValueRef args[6];
1551 int idx = 0;
1552 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1553 if (structurized)
1554 args[idx++] = vindex ? vindex : ctx->i32_0;
1555 args[idx++] = voffset ? voffset : ctx->i32_0;
1556 args[idx++] = soffset ? soffset : ctx->i32_0;
1557 args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
1558 args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1559 unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1560 const char *indexing_kind = structurized ? "struct" : "raw";
1561 char name[256], type_name[8];
1562
1563 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1564 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1565
1566 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
1567 indexing_kind, type_name);
1568
1569 return ac_build_intrinsic(ctx, name, type, args, idx,
1570 ac_get_load_intr_attribs(can_speculate));
1571 }
1572
1573 static LLVMValueRef
1574 ac_build_tbuffer_load(struct ac_llvm_context *ctx,
1575 LLVMValueRef rsrc,
1576 LLVMValueRef vindex,
1577 LLVMValueRef voffset,
1578 LLVMValueRef soffset,
1579 LLVMValueRef immoffset,
1580 unsigned num_channels,
1581 unsigned dfmt,
1582 unsigned nfmt,
1583 unsigned cache_policy,
1584 bool can_speculate,
1585 bool structurized) /* only matters for LLVM 8+ */
1586 {
1587 if (HAVE_LLVM >= 0x800) {
1588 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1589
1590 return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
1591 soffset, num_channels,
1592 dfmt, nfmt, cache_policy,
1593 can_speculate, structurized);
1594 }
1595
1596 LLVMValueRef args[] = {
1597 rsrc,
1598 vindex ? vindex : ctx->i32_0,
1599 voffset,
1600 soffset,
1601 immoffset,
1602 LLVMConstInt(ctx->i32, dfmt, false),
1603 LLVMConstInt(ctx->i32, nfmt, false),
1604 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
1605 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
1606 };
1607 unsigned func = CLAMP(num_channels, 1, 3) - 1;
1608 LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
1609 const char *type_names[] = {"i32", "v2i32", "v4i32"};
1610 char name[256];
1611
1612 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
1613 type_names[func]);
1614
1615 return ac_build_intrinsic(ctx, name, types[func], args, 9,
1616 ac_get_load_intr_attribs(can_speculate));
1617 }
1618
1619 LLVMValueRef
1620 ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
1621 LLVMValueRef rsrc,
1622 LLVMValueRef vindex,
1623 LLVMValueRef voffset,
1624 LLVMValueRef soffset,
1625 LLVMValueRef immoffset,
1626 unsigned num_channels,
1627 unsigned dfmt,
1628 unsigned nfmt,
1629 unsigned cache_policy,
1630 bool can_speculate)
1631 {
1632 return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
1633 immoffset, num_channels, dfmt, nfmt,
1634 cache_policy, can_speculate, true);
1635 }
1636
1637 LLVMValueRef
1638 ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
1639 LLVMValueRef rsrc,
1640 LLVMValueRef voffset,
1641 LLVMValueRef soffset,
1642 LLVMValueRef immoffset,
1643 unsigned num_channels,
1644 unsigned dfmt,
1645 unsigned nfmt,
1646 unsigned cache_policy,
1647 bool can_speculate)
1648 {
1649 return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
1650 immoffset, num_channels, dfmt, nfmt,
1651 cache_policy, can_speculate, false);
1652 }
1653
1654 LLVMValueRef
1655 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1656 LLVMValueRef rsrc,
1657 LLVMValueRef voffset,
1658 LLVMValueRef soffset,
1659 LLVMValueRef immoffset,
1660 unsigned cache_policy)
1661 {
1662 LLVMValueRef res;
1663
1664 if (HAVE_LLVM >= 0x900) {
1665 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1666
1667 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1668 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1669 voffset, soffset,
1670 1, ctx->i16, cache_policy,
1671 false, false, false);
1672 } else {
1673 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1674 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1675
1676 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1677 immoffset, 1, dfmt, nfmt, cache_policy,
1678 false);
1679
1680 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1681 }
1682
1683 return res;
1684 }
1685
1686 LLVMValueRef
1687 ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
1688 LLVMValueRef rsrc,
1689 LLVMValueRef voffset,
1690 LLVMValueRef soffset,
1691 LLVMValueRef immoffset,
1692 unsigned cache_policy)
1693 {
1694 LLVMValueRef res;
1695
1696 if (HAVE_LLVM >= 0x900) {
1697 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1698
1699 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1700 res = ac_build_llvm8_buffer_load_common(ctx, rsrc, NULL,
1701 voffset, soffset,
1702 1, ctx->i8, cache_policy,
1703 false, false, false);
1704 } else {
1705 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1706 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1707
1708 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1709 immoffset, 1, dfmt, nfmt, cache_policy,
1710 false);
1711
1712 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1713 }
1714
1715 return res;
1716 }
1717
1718 /**
1719 * Convert an 11- or 10-bit unsigned floating point number to an f32.
1720 *
1721  * The input exponent is expected to be biased analogously to IEEE-754, i.e. by
1722 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1723 */
1724 static LLVMValueRef
1725 ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits)
1726 {
1727 assert(LLVMTypeOf(src) == ctx->i32);
1728
1729 LLVMValueRef tmp;
1730 LLVMValueRef mantissa;
1731 mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1732
1733 /* Converting normal numbers is just a shift + correcting the exponent bias */
1734 unsigned normal_shift = 23 - mant_bits;
1735 unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1736 LLVMValueRef shifted, normal;
1737
1738 shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1739 normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1740
1741 /* Converting nan/inf numbers is the same, but with a different exponent update */
1742 LLVMValueRef naninf;
1743 naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1744
1745 /* Converting denormals is the complex case: determine the leading zeros of the
1746 * mantissa to obtain the correct shift for the mantissa and exponent correction.
1747 */
1748 LLVMValueRef denormal;
1749 LLVMValueRef params[2] = {
1750 mantissa,
1751 ctx->i1true, /* result can be undef when arg is 0 */
1752 };
1753 LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32,
1754 params, 2, AC_FUNC_ATTR_READNONE);
1755
1756 /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1757 tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1758 denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1759
1760 unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1761 tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1762 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1763 denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1764
1765 /* Select the final result. */
1766 LLVMValueRef result;
1767
1768 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1769 LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1770 result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1771
1772 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1773 LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
1774 result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1775
1776 tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1777 result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1778
1779 return ac_to_float(ctx, result);
1780 }
1781
1782 /**
1783 * Generate a fully general open coded buffer format fetch with all required
1784 * fixups suitable for vertex fetch, using non-format buffer loads.
1785 *
1786 * Some combinations of argument values have special interpretations:
1787 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1788 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1789 *
1790 * \param log_size log2(size of channel in bytes)
1791 * \param num_channels number of channels (1 to 4)
1792 * \param format AC_FETCH_FORMAT_xxx value
1793 * \param reverse whether XYZ channels are reversed
1794 * \param known_aligned whether the source is known to be aligned to hardware's
1795 * effective element size for loading the given format
1796 * (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1797 * \param rsrc buffer resource descriptor
1798 * \return the resulting vector of floats or integers bitcast to <4 x i32>
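 *
 * For illustration: a PIPE_FORMAT_R16G16B16A16_USCALED vertex fetch would
 * use log_size = 1, num_channels = 4, format = AC_FETCH_FORMAT_USCALED.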
1799 */
1800 LLVMValueRef
1801 ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
1802 unsigned log_size,
1803 unsigned num_channels,
1804 unsigned format,
1805 bool reverse,
1806 bool known_aligned,
1807 LLVMValueRef rsrc,
1808 LLVMValueRef vindex,
1809 LLVMValueRef voffset,
1810 LLVMValueRef soffset,
1811 unsigned cache_policy,
1812 bool can_speculate)
1813 {
1814 LLVMValueRef tmp;
1815 unsigned load_log_size = log_size;
1816 unsigned load_num_channels = num_channels;
1817 if (log_size == 3) {
1818 load_log_size = 2;
1819 if (format == AC_FETCH_FORMAT_FLOAT) {
1820 load_num_channels = 2 * num_channels;
1821 } else {
1822 load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1823 }
1824 }
1825
1826 int log_recombine = 0;
1827 if (ctx->chip_class == GFX6 && !known_aligned) {
1828 /* Avoid alignment restrictions by loading one byte at a time. */
1829 load_num_channels <<= load_log_size;
1830 log_recombine = load_log_size;
1831 load_log_size = 0;
1832 } else if (load_num_channels == 2 || load_num_channels == 4) {
1833 log_recombine = -util_logbase2(load_num_channels);
1834 load_num_channels = 1;
1835 load_log_size += -log_recombine;
1836 }
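 /* For illustration: an unaligned 16_16_16_16 fetch on GFX6 now issues
  * load_num_channels = 8 single-byte loads with log_recombine = 1, while
  * an aligned 32_32 fetch becomes one 2-dword load (log_recombine = -1). */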
1837
1838 assert(load_log_size >= 2 || HAVE_LLVM >= 0x0900);
1839
1840 LLVMValueRef loads[32]; /* up to 32 bytes */
1841 for (unsigned i = 0; i < load_num_channels; ++i) {
1842 tmp = LLVMBuildAdd(ctx->builder, soffset,
1843 LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1844 if (HAVE_LLVM >= 0x0800) {
1845 LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 :
1846 load_log_size == 1 ? ctx->i16 : ctx->i32;
1847 unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1848 loads[i] = ac_build_llvm8_buffer_load_common(
1849 ctx, rsrc, vindex, voffset, tmp,
1850 num_channels, channel_type, cache_policy,
1851 can_speculate, false, true);
1852 } else {
1853 tmp = LLVMBuildAdd(ctx->builder, voffset, tmp, "");
1854 loads[i] = ac_build_llvm7_buffer_load_common(
1855 ctx, rsrc, vindex, tmp,
1856 1 << (load_log_size - 2), cache_policy, can_speculate, false);
1857 }
1858 if (load_log_size >= 2)
1859 loads[i] = ac_to_integer(ctx, loads[i]);
1860 }
1861
1862 if (log_recombine > 0) {
1863 /* Recombine bytes if necessary (GFX6 only) */
1864 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1865
1866 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1867 LLVMValueRef accum = NULL;
1868 for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1869 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1870 if (i == 0) {
1871 accum = tmp;
1872 } else {
1873 tmp = LLVMBuildShl(ctx->builder, tmp,
1874 LLVMConstInt(dst_type, 8 * i, false), "");
1875 accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1876 }
1877 }
1878 loads[dst] = accum;
1879 }
1880 } else if (log_recombine < 0) {
1881 /* Split vectors of dwords */
1882 if (load_log_size > 2) {
1883 assert(load_num_channels == 1);
1884 LLVMValueRef loaded = loads[0];
1885 unsigned log_split = load_log_size - 2;
1886 log_recombine += log_split;
1887 load_num_channels = 1 << log_split;
1888 load_log_size = 2;
1889 for (unsigned i = 0; i < load_num_channels; ++i) {
1890 tmp = LLVMConstInt(ctx->i32, i, false);
1891 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1892 }
1893 }
1894
1895 /* Further split dwords and shorts if required */
1896 if (log_recombine < 0) {
1897 for (unsigned src = load_num_channels,
1898 dst = load_num_channels << -log_recombine;
1899 src > 0; --src) {
1900 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1901 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1902 LLVMValueRef loaded = loads[src - 1];
1903 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1904 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1905 tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1906 tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1907 loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1908 }
1909 }
1910 }
1911 }
1912
1913 if (log_size == 3) {
1914 if (format == AC_FETCH_FORMAT_FLOAT) {
1915 for (unsigned i = 0; i < num_channels; ++i) {
1916 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1917 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1918 }
1919 } else if (format == AC_FETCH_FORMAT_FIXED) {
1920 /* 10_11_11_FLOAT */
1921 LLVMValueRef data = loads[0];
1922 LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1923 LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1924 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1925 LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1926 LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1927
1928 loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1929 loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1930 loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1931
1932 num_channels = 3;
1933 log_size = 2;
1934 format = AC_FETCH_FORMAT_FLOAT;
1935 } else {
1936 /* 2_10_10_10 data formats */
1937 LLVMValueRef data = loads[0];
1938 LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1939 LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1940 loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1941 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1942 loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1943 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1944 loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1945 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1946 loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1947
1948 num_channels = 4;
1949 }
1950 }
1951
1952 if (format == AC_FETCH_FORMAT_FLOAT) {
1953 if (log_size != 2) {
1954 for (unsigned chan = 0; chan < num_channels; ++chan) {
1955 tmp = ac_to_float(ctx, loads[chan]);
1956 if (log_size == 3)
1957 tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1958 else if (log_size == 1)
1959 tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1960 loads[chan] = ac_to_integer(ctx, tmp);
1961 }
1962 }
1963 } else if (format == AC_FETCH_FORMAT_UINT) {
1964 if (log_size != 2) {
1965 for (unsigned chan = 0; chan < num_channels; ++chan)
1966 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1967 }
1968 } else if (format == AC_FETCH_FORMAT_SINT) {
1969 if (log_size != 2) {
1970 for (unsigned chan = 0; chan < num_channels; ++chan)
1971 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1972 }
1973 } else {
1974 bool unsign = format == AC_FETCH_FORMAT_UNORM ||
1975 format == AC_FETCH_FORMAT_USCALED ||
1976 format == AC_FETCH_FORMAT_UINT;
1977
1978 for (unsigned chan = 0; chan < num_channels; ++chan) {
1979 if (unsign) {
1980 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1981 } else {
1982 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1983 }
1984
1985 LLVMValueRef scale = NULL;
1986 if (format == AC_FETCH_FORMAT_FIXED) {
1987 assert(log_size == 2);
1988 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1989 } else if (format == AC_FETCH_FORMAT_UNORM) {
1990 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1991 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1992 } else if (format == AC_FETCH_FORMAT_SNORM) {
1993 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1994 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1995 }
1996 if (scale)
1997 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1998
1999 if (format == AC_FETCH_FORMAT_SNORM) {
2000 /* Clamp to [-1, 1] */
2001 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
2002 LLVMValueRef clamp =
2003 LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
2004 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
2005 }
2006
2007 loads[chan] = ac_to_integer(ctx, tmp);
2008 }
2009 }
2010
2011 while (num_channels < 4) {
2012 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
2013 loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
2014 } else {
2015 loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
2016 }
2017 num_channels++;
2018 }
2019
2020 if (reverse) {
2021 tmp = loads[0];
2022 loads[0] = loads[2];
2023 loads[2] = tmp;
2024 }
2025
2026 return ac_build_gather_values(ctx, loads, 4);
2027 }
2028
2029 static void
2030 ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
2031 LLVMValueRef rsrc,
2032 LLVMValueRef vdata,
2033 LLVMValueRef vindex,
2034 LLVMValueRef voffset,
2035 LLVMValueRef soffset,
2036 unsigned num_channels,
2037 unsigned dfmt,
2038 unsigned nfmt,
2039 unsigned cache_policy,
2040 bool structurized)
2041 {
2042 LLVMValueRef args[7];
2043 int idx = 0;
2044 args[idx++] = vdata;
2045 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
2046 if (structurized)
2047 args[idx++] = vindex ? vindex : ctx->i32_0;
2048 args[idx++] = voffset ? voffset : ctx->i32_0;
2049 args[idx++] = soffset ? soffset : ctx->i32_0;
2050 args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
2051 args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
2052 unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
2053 const char *indexing_kind = structurized ? "struct" : "raw";
2054 char name[256], type_name[8];
2055
2056 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
2057 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
2058
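 /* e.g. this yields "llvm.amdgcn.struct.tbuffer.store.v4i32" for a
  * structurized 4-channel store. */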
2059 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
2060 indexing_kind, type_name);
2061
2062 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
2063 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2064 }
2065
2066 static void
2067 ac_build_tbuffer_store(struct ac_llvm_context *ctx,
2068 LLVMValueRef rsrc,
2069 LLVMValueRef vdata,
2070 LLVMValueRef vindex,
2071 LLVMValueRef voffset,
2072 LLVMValueRef soffset,
2073 LLVMValueRef immoffset,
2074 unsigned num_channels,
2075 unsigned dfmt,
2076 unsigned nfmt,
2077 unsigned cache_policy,
2078 bool structurized) /* only matters for LLVM 8+ */
2079 {
2080 if (HAVE_LLVM >= 0x800) {
2081 voffset = LLVMBuildAdd(ctx->builder,
2082 voffset ? voffset : ctx->i32_0,
2083 immoffset, "");
2084
2085 ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
2086 soffset, num_channels, dfmt, nfmt,
2087 cache_policy, structurized);
2088 } else {
2089 LLVMValueRef params[] = {
2090 vdata,
2091 rsrc,
2092 vindex ? vindex : ctx->i32_0,
2093 voffset ? voffset : ctx->i32_0,
2094 soffset ? soffset : ctx->i32_0,
2095 immoffset,
2096 LLVMConstInt(ctx->i32, dfmt, false),
2097 LLVMConstInt(ctx->i32, nfmt, false),
2098 LLVMConstInt(ctx->i1, !!(cache_policy & ac_glc), false),
2099 LLVMConstInt(ctx->i1, !!(cache_policy & ac_slc), false),
2100 };
2101 unsigned func = CLAMP(num_channels, 1, 3) - 1;
2102 const char *type_names[] = {"i32", "v2i32", "v4i32"};
2103 char name[256];
2104
2105 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
2106 type_names[func]);
2107
2108 ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
2109 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
2110 }
2111 }
2112
2113 void
2114 ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
2115 LLVMValueRef rsrc,
2116 LLVMValueRef vdata,
2117 LLVMValueRef vindex,
2118 LLVMValueRef voffset,
2119 LLVMValueRef soffset,
2120 LLVMValueRef immoffset,
2121 unsigned num_channels,
2122 unsigned dfmt,
2123 unsigned nfmt,
2124 unsigned cache_policy)
2125 {
2126 ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
2127 immoffset, num_channels, dfmt, nfmt, cache_policy,
2128 true);
2129 }
2130
2131 void
2132 ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
2133 LLVMValueRef rsrc,
2134 LLVMValueRef vdata,
2135 LLVMValueRef voffset,
2136 LLVMValueRef soffset,
2137 LLVMValueRef immoffset,
2138 unsigned num_channels,
2139 unsigned dfmt,
2140 unsigned nfmt,
2141 unsigned cache_policy)
2142 {
2143 ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
2144 immoffset, num_channels, dfmt, nfmt, cache_policy,
2145 false);
2146 }
2147
2148 void
2149 ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
2150 LLVMValueRef rsrc,
2151 LLVMValueRef vdata,
2152 LLVMValueRef voffset,
2153 LLVMValueRef soffset,
2154 unsigned cache_policy)
2155 {
2156 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
2157
2158 if (HAVE_LLVM >= 0x900) {
2159 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2160 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2161 voffset, soffset, 1,
2162 ctx->i16, cache_policy,
2163 false, false);
2164 } else {
2165 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
2166 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2167
2168 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2169
2170 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2171 ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2172 }
2173 }
2174
2175 void
2176 ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
2177 LLVMValueRef rsrc,
2178 LLVMValueRef vdata,
2179 LLVMValueRef voffset,
2180 LLVMValueRef soffset,
2181 unsigned cache_policy)
2182 {
2183 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
2184
2185 if (HAVE_LLVM >= 0x900) {
2186 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
2187 ac_build_llvm8_buffer_store_common(ctx, rsrc, vdata, NULL,
2188 voffset, soffset, 1,
2189 ctx->i8, cache_policy,
2190 false, false);
2191 } else {
2192 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
2193 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
2194
2195 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
2196
2197 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
2198 ctx->i32_0, 1, dfmt, nfmt, cache_policy);
2199 }
2200 }
2201 /**
2202 * Set range metadata on an instruction. This can only be used on load and
2203 * call instructions. If you know an instruction can only produce the values
2204 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
2205 * \p lo is the minimum value inclusive.
2206 * \p hi is the maximum value exclusive.
2207 */
2208 static void set_range_metadata(struct ac_llvm_context *ctx,
2209 LLVMValueRef value, unsigned lo, unsigned hi)
2210 {
2211 LLVMValueRef range_md, md_args[2];
2212 LLVMTypeRef type = LLVMTypeOf(value);
2213 LLVMContextRef context = LLVMGetTypeContext(type);
2214
2215 md_args[0] = LLVMConstInt(type, lo, false);
2216 md_args[1] = LLVMConstInt(type, hi, false);
2217 range_md = LLVMMDNodeInContext(context, md_args, 2);
2218 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
2219 }
2220
2221 LLVMValueRef
2222 ac_get_thread_id(struct ac_llvm_context *ctx)
2223 {
2224 LLVMValueRef tid;
2225
2226 LLVMValueRef tid_args[2];
2227 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
2228 tid_args[1] = ctx->i32_0;
2229 tid_args[1] = ac_build_intrinsic(ctx,
2230 "llvm.amdgcn.mbcnt.lo", ctx->i32,
2231 tid_args, 2, AC_FUNC_ATTR_READNONE);
2232
2233 if (ctx->wave_size == 32) {
2234 tid = tid_args[1];
2235 } else {
2236 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
2237 ctx->i32, tid_args,
2238 2, AC_FUNC_ATTR_READNONE);
2239 }
2240 set_range_metadata(ctx, tid, 0, ctx->wave_size);
2241 return tid;
2242 }
2243
2244 /*
2245 * AMD GCN implements derivatives using the local data store (LDS).
2246 * All writes to the LDS happen in all executing threads at
2247 * the same time. TID is the Thread ID for the current
2248 * thread and is a value between 0 and 63, representing
2249 * the thread's position in the wavefront.
2250 *
2251 * In the pixel shader, threads are grouped into quads of four pixels.
2252 * The TIDs of the pixels of a quad are:
2253 *
2254 * +------+------+
2255 * |4n + 0|4n + 1|
2256 * +------+------+
2257 * |4n + 2|4n + 3|
2258 * +------+------+
2259 *
2260 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
2261 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
2262 * the current pixel's column, and masking with 0xfffffffe yields the TID
2263 * of the left pixel of the current pixel's row.
2264 *
2265 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
2266 * adding 2 yields the TID of the pixel below the top pixel.
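 *
 * For illustration (this is how callers are expected to use the helper):
 * a fine ddx uses mask = 0xfffffffe and idx = 1, so lanes {0,1,2,3} read
 * tl = {0,0,2,2} and trbl = {1,1,3,3}, i.e. each pixel computes
 * right - left within its own row.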
2267 */
2268 LLVMValueRef
2269 ac_build_ddxy(struct ac_llvm_context *ctx,
2270 uint32_t mask,
2271 int idx,
2272 LLVMValueRef val)
2273 {
2274 unsigned tl_lanes[4], trbl_lanes[4];
2275 char name[32], type[8];
2276 LLVMValueRef tl, trbl;
2277 LLVMTypeRef result_type;
2278 LLVMValueRef result;
2279
2280 result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
2281
2282 if (result_type == ctx->f16)
2283 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
2284
2285 for (unsigned i = 0; i < 4; ++i) {
2286 tl_lanes[i] = i & mask;
2287 trbl_lanes[i] = (i & mask) + idx;
2288 }
2289
2290 tl = ac_build_quad_swizzle(ctx, val,
2291 tl_lanes[0], tl_lanes[1],
2292 tl_lanes[2], tl_lanes[3]);
2293 trbl = ac_build_quad_swizzle(ctx, val,
2294 trbl_lanes[0], trbl_lanes[1],
2295 trbl_lanes[2], trbl_lanes[3]);
2296
2297 if (result_type == ctx->f16) {
2298 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
2299 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
2300 }
2301
2302 tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
2303 trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
2304 result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
2305
2306 ac_build_type_name_for_intr(result_type, type, sizeof(type));
2307 snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
2308
2309 return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
2310 }
2311
2312 void
2313 ac_build_sendmsg(struct ac_llvm_context *ctx,
2314 uint32_t msg,
2315 LLVMValueRef wave_id)
2316 {
2317 LLVMValueRef args[2];
2318 args[0] = LLVMConstInt(ctx->i32, msg, false);
2319 args[1] = wave_id;
2320 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
2321 }
2322
2323 LLVMValueRef
2324 ac_build_imsb(struct ac_llvm_context *ctx,
2325 LLVMValueRef arg,
2326 LLVMTypeRef dst_type)
2327 {
2328 LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
2329 dst_type, &arg, 1,
2330 AC_FUNC_ATTR_READNONE);
2331
2332 /* The HW returns the bit index counting from the MSB, but NIR/TGSI
2333 * wants the index from the LSB. Invert it by computing "31 - msb". */
2334 msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
2335 msb, "");
2336
2337 LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
2338 LLVMValueRef cond = LLVMBuildOr(ctx->builder,
2339 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2340 arg, ctx->i32_0, ""),
2341 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2342 arg, all_ones, ""), "");
2343
2344 return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
2345 }
2346
2347 LLVMValueRef
2348 ac_build_umsb(struct ac_llvm_context *ctx,
2349 LLVMValueRef arg,
2350 LLVMTypeRef dst_type)
2351 {
2352 const char *intrin_name;
2353 LLVMTypeRef type;
2354 LLVMValueRef highest_bit;
2355 LLVMValueRef zero;
2356 unsigned bitsize;
2357
2358 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
2359 switch (bitsize) {
2360 case 64:
2361 intrin_name = "llvm.ctlz.i64";
2362 type = ctx->i64;
2363 highest_bit = LLVMConstInt(ctx->i64, 63, false);
2364 zero = ctx->i64_0;
2365 break;
2366 case 32:
2367 intrin_name = "llvm.ctlz.i32";
2368 type = ctx->i32;
2369 highest_bit = LLVMConstInt(ctx->i32, 31, false);
2370 zero = ctx->i32_0;
2371 break;
2372 case 16:
2373 intrin_name = "llvm.ctlz.i16";
2374 type = ctx->i16;
2375 highest_bit = LLVMConstInt(ctx->i16, 15, false);
2376 zero = ctx->i16_0;
2377 break;
2378 case 8:
2379 intrin_name = "llvm.ctlz.i8";
2380 type = ctx->i8;
2381 highest_bit = LLVMConstInt(ctx->i8, 7, false);
2382 zero = ctx->i8_0;
2383 break;
2384 default:
2385 unreachable("invalid bitsize");
2386 break;
2387 }
2388
2389 LLVMValueRef params[2] = {
2390 arg,
2391 ctx->i1true,
2392 };
2393
2394 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
2395 params, 2,
2396 AC_FUNC_ATTR_READNONE);
2397
2398 /* llvm.ctlz returns the bit index counting from the MSB, but TGSI/NIR
2399 * wants the index from the LSB. Invert it by computing "highest_bit - msb". */
2400 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
2401
2402 if (bitsize == 64) {
2403 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
2404 } else if (bitsize < 32) {
2405 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
2406 }
2407
2408 /* check for zero */
2409 return LLVMBuildSelect(ctx->builder,
2410 LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
2411 LLVMConstInt(ctx->i32, -1, true), msb, "");
2412 }
2413
2414 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
2415 LLVMValueRef b)
2416 {
2417 char name[64];
2418 snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2419 LLVMValueRef args[2] = {a, b};
2420 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2421 AC_FUNC_ATTR_READNONE);
2422 }
2423
2424 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
2425 LLVMValueRef b)
2426 {
2427 char name[64];
2428 snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2429 LLVMValueRef args[2] = {a, b};
2430 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2431 AC_FUNC_ATTR_READNONE);
2432 }
2433
2434 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2435 LLVMValueRef b)
2436 {
2437 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2438 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2439 }
2440
2441 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2442 LLVMValueRef b)
2443 {
2444 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2445 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2446 }
2447
2448 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2449 LLVMValueRef b)
2450 {
2451 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2452 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2453 }
2454
2455 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a,
2456 LLVMValueRef b)
2457 {
2458 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
2459 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2460 }
2461
2462 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2463 {
2464 LLVMTypeRef t = LLVMTypeOf(value);
2465 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2466 LLVMConstReal(t, 1.0));
2467 }
2468
2469 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2470 {
2471 LLVMValueRef args[9];
2472
2473 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2474 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2475
2476 if (a->compr) {
2477 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2478 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2479
2480 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2481 v2i16, "");
2482 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2483 v2i16, "");
2484 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2485 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2486
2487 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2488 ctx->voidt, args, 6, 0);
2489 } else {
2490 args[2] = a->out[0];
2491 args[3] = a->out[1];
2492 args[4] = a->out[2];
2493 args[5] = a->out[3];
2494 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2495 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2496
2497 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2498 ctx->voidt, args, 8, 0);
2499 }
2500 }
2501
2502 void ac_build_export_null(struct ac_llvm_context *ctx)
2503 {
2504 struct ac_export_args args;
2505
2506 args.enabled_channels = 0x0; /* enabled channels */
2507 args.valid_mask = 1; /* whether the EXEC mask is valid */
2508 args.done = 1; /* DONE bit */
2509 args.target = V_008DFC_SQ_EXP_NULL;
2510 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2511 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2512 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2513 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2514 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2515
2516 ac_build_export(ctx, &args);
2517 }
2518
2519 static unsigned ac_num_coords(enum ac_image_dim dim)
2520 {
2521 switch (dim) {
2522 case ac_image_1d:
2523 return 1;
2524 case ac_image_2d:
2525 case ac_image_1darray:
2526 return 2;
2527 case ac_image_3d:
2528 case ac_image_cube:
2529 case ac_image_2darray:
2530 case ac_image_2dmsaa:
2531 return 3;
2532 case ac_image_2darraymsaa:
2533 return 4;
2534 default:
2535 unreachable("ac_num_coords: bad dim");
2536 }
2537 }
2538
2539 static unsigned ac_num_derivs(enum ac_image_dim dim)
2540 {
2541 switch (dim) {
2542 case ac_image_1d:
2543 case ac_image_1darray:
2544 return 2;
2545 case ac_image_2d:
2546 case ac_image_2darray:
2547 case ac_image_cube:
2548 return 4;
2549 case ac_image_3d:
2550 return 6;
2551 case ac_image_2dmsaa:
2552 case ac_image_2darraymsaa:
2553 default:
2554 unreachable("derivatives not supported");
2555 }
2556 }
2557
2558 static const char *get_atomic_name(enum ac_atomic_op op)
2559 {
2560 switch (op) {
2561 case ac_atomic_swap: return "swap";
2562 case ac_atomic_add: return "add";
2563 case ac_atomic_sub: return "sub";
2564 case ac_atomic_smin: return "smin";
2565 case ac_atomic_umin: return "umin";
2566 case ac_atomic_smax: return "smax";
2567 case ac_atomic_umax: return "umax";
2568 case ac_atomic_and: return "and";
2569 case ac_atomic_or: return "or";
2570 case ac_atomic_xor: return "xor";
2571 }
2572 unreachable("bad atomic op");
2573 }
2574
2575 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
2576 struct ac_image_args *a)
2577 {
2578 const char *overload[3] = { "", "", "" };
2579 unsigned num_overloads = 0;
2580 LLVMValueRef args[18];
2581 unsigned num_args = 0;
2582 enum ac_image_dim dim = a->dim;
2583
2584 assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
2585 !a->level_zero);
2586 assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2587 a->opcode != ac_image_store_mip) ||
2588 a->lod);
2589 assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2590 (!a->compare && !a->offset));
2591 assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2592 a->opcode == ac_image_get_lod) ||
2593 !a->bias);
2594 assert((a->bias ? 1 : 0) +
2595 (a->lod ? 1 : 0) +
2596 (a->level_zero ? 1 : 0) +
2597 (a->derivs[0] ? 1 : 0) <= 1);
2598
2599 if (a->opcode == ac_image_get_lod) {
2600 switch (dim) {
2601 case ac_image_1darray:
2602 dim = ac_image_1d;
2603 break;
2604 case ac_image_2darray:
2605 case ac_image_cube:
2606 dim = ac_image_2d;
2607 break;
2608 default:
2609 break;
2610 }
2611 }
2612
2613 bool sample = a->opcode == ac_image_sample ||
2614 a->opcode == ac_image_gather4 ||
2615 a->opcode == ac_image_get_lod;
2616 bool atomic = a->opcode == ac_image_atomic ||
2617 a->opcode == ac_image_atomic_cmpswap;
2618 bool load = a->opcode == ac_image_sample ||
2619 a->opcode == ac_image_gather4 ||
2620 a->opcode == ac_image_load ||
2621 a->opcode == ac_image_load_mip;
2622 LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2623
2624 if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2625 args[num_args++] = a->data[0];
2626 if (a->opcode == ac_image_atomic_cmpswap)
2627 args[num_args++] = a->data[1];
2628 }
2629
2630 if (!atomic)
2631 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2632
2633 if (a->offset)
2634 args[num_args++] = ac_to_integer(ctx, a->offset);
2635 if (a->bias) {
2636 args[num_args++] = ac_to_float(ctx, a->bias);
2637 overload[num_overloads++] = ".f32";
2638 }
2639 if (a->compare)
2640 args[num_args++] = ac_to_float(ctx, a->compare);
2641 if (a->derivs[0]) {
2642 unsigned count = ac_num_derivs(dim);
2643 for (unsigned i = 0; i < count; ++i)
2644 args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2645 overload[num_overloads++] = ".f32";
2646 }
2647 unsigned num_coords =
2648 a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2649 for (unsigned i = 0; i < num_coords; ++i)
2650 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2651 if (a->lod)
2652 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2653 overload[num_overloads++] = sample ? ".f32" : ".i32";
2654
2655 args[num_args++] = a->resource;
2656 if (sample) {
2657 args[num_args++] = a->sampler;
2658 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2659 }
2660
2661 args[num_args++] = ctx->i32_0; /* texfailctrl */
2662 args[num_args++] = LLVMConstInt(ctx->i32,
2663 load ? get_load_cache_policy(ctx, a->cache_policy) :
2664 a->cache_policy, false);
2665
2666 const char *name;
2667 const char *atomic_subop = "";
2668 switch (a->opcode) {
2669 case ac_image_sample: name = "sample"; break;
2670 case ac_image_gather4: name = "gather4"; break;
2671 case ac_image_load: name = "load"; break;
2672 case ac_image_load_mip: name = "load.mip"; break;
2673 case ac_image_store: name = "store"; break;
2674 case ac_image_store_mip: name = "store.mip"; break;
2675 case ac_image_atomic:
2676 name = "atomic.";
2677 atomic_subop = get_atomic_name(a->atomic);
2678 break;
2679 case ac_image_atomic_cmpswap:
2680 name = "atomic.";
2681 atomic_subop = "cmpswap";
2682 break;
2683 case ac_image_get_lod: name = "getlod"; break;
2684 case ac_image_get_resinfo: name = "getresinfo"; break;
2685 default: unreachable("invalid image opcode");
2686 }
2687
2688 const char *dimname;
2689 switch (dim) {
2690 case ac_image_1d: dimname = "1d"; break;
2691 case ac_image_2d: dimname = "2d"; break;
2692 case ac_image_3d: dimname = "3d"; break;
2693 case ac_image_cube: dimname = "cube"; break;
2694 case ac_image_1darray: dimname = "1darray"; break;
2695 case ac_image_2darray: dimname = "2darray"; break;
2696 case ac_image_2dmsaa: dimname = "2dmsaa"; break;
2697 case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
2698 default: unreachable("invalid dim");
2699 }
2700
2701 bool lod_suffix =
2702 a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2703 char intr_name[96];
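 /* e.g. a compare + explicit-LOD 2D sample yields
  * "llvm.amdgcn.image.sample.c.l.2d.v4f32.f32". */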
2704 snprintf(intr_name, sizeof(intr_name),
2705 "llvm.amdgcn.image.%s%s" /* base name */
2706 "%s%s%s" /* sample/gather modifiers */
2707 ".%s.%s%s%s%s", /* dimension and type overloads */
2708 name, atomic_subop,
2709 a->compare ? ".c" : "",
2710 a->bias ? ".b" :
2711 lod_suffix ? ".l" :
2712 a->derivs[0] ? ".d" :
2713 a->level_zero ? ".lz" : "",
2714 a->offset ? ".o" : "",
2715 dimname,
2716 atomic ? "i32" : "v4f32",
2717 overload[0], overload[1], overload[2]);
2718
2719 LLVMTypeRef retty;
2720 if (atomic)
2721 retty = ctx->i32;
2722 else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2723 retty = ctx->voidt;
2724 else
2725 retty = ctx->v4f32;
2726
2727 LLVMValueRef result =
2728 ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
2729 a->attributes);
2730 if (!sample && retty == ctx->v4f32) {
2731 result = LLVMBuildBitCast(ctx->builder, result,
2732 ctx->v4i32, "");
2733 }
2734 return result;
2735 }
2736
2737 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2738 LLVMValueRef args[2])
2739 {
2740 LLVMTypeRef v2f16 =
2741 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2742
2743 return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2744 args, 2, AC_FUNC_ATTR_READNONE);
2745 }
2746
2747 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2748 LLVMValueRef args[2])
2749 {
2750 LLVMValueRef res =
2751 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2752 ctx->v2i16, args, 2,
2753 AC_FUNC_ATTR_READNONE);
2754 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2755 }
2756
2757 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2758 LLVMValueRef args[2])
2759 {
2760 LLVMValueRef res =
2761 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2762 ctx->v2i16, args, 2,
2763 AC_FUNC_ATTR_READNONE);
2764 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2765 }
2766
2767 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2768 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2769 LLVMValueRef args[2], unsigned bits, bool hi)
2770 {
2771 assert(bits == 8 || bits == 10 || bits == 16);
2772
2773 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2774 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2775 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2776 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2777 LLVMValueRef max_alpha =
2778 bits != 10 ? max_rgb : ctx->i32_1;
2779 LLVMValueRef min_alpha =
2780 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
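 /* For 2_10_10_10 formats the alpha channel is only 2 bits, hence the
  * [-2, 1] signed range used above. */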
2781
2782 /* Clamp. */
2783 if (bits != 16) {
2784 for (int i = 0; i < 2; i++) {
2785 bool alpha = hi && i == 1;
2786 args[i] = ac_build_imin(ctx, args[i],
2787 alpha ? max_alpha : max_rgb);
2788 args[i] = ac_build_imax(ctx, args[i],
2789 alpha ? min_alpha : min_rgb);
2790 }
2791 }
2792
2793 LLVMValueRef res =
2794 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2795 ctx->v2i16, args, 2,
2796 AC_FUNC_ATTR_READNONE);
2797 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2798 }
2799
2800 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2801 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2802 LLVMValueRef args[2], unsigned bits, bool hi)
2803 {
2804 assert(bits == 8 || bits == 10 || bits == 16);
2805
2806 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2807 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2808 LLVMValueRef max_alpha =
2809 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2810
2811 /* Clamp. */
2812 if (bits != 16) {
2813 for (int i = 0; i < 2; i++) {
2814 bool alpha = hi && i == 1;
2815 args[i] = ac_build_umin(ctx, args[i],
2816 alpha ? max_alpha : max_rgb);
2817 }
2818 }
2819
2820 LLVMValueRef res =
2821 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2822 ctx->v2i16, args, 2,
2823 AC_FUNC_ATTR_READNONE);
2824 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2825 }
2826
2827 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2828 {
2829 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2830 &i1, 1, AC_FUNC_ATTR_READNONE);
2831 }
2832
2833 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2834 {
2835 ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2836 &i1, 1, 0);
2837 }
2838
2839 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2840 LLVMValueRef offset, LLVMValueRef width,
2841 bool is_signed)
2842 {
2843 LLVMValueRef args[] = {
2844 input,
2845 offset,
2846 width,
2847 };
2848
2849 LLVMValueRef result = ac_build_intrinsic(ctx,
2850 is_signed ? "llvm.amdgcn.sbfe.i32" :
2851 "llvm.amdgcn.ubfe.i32",
2852 ctx->i32, args, 3,
2853 AC_FUNC_ATTR_READNONE);
2854
2855 if (HAVE_LLVM < 0x0800) {
2856 /* FIXME: LLVM 7 returns an incorrect result when the width is 0.
2857 * https://bugs.freedesktop.org/show_bug.cgi?id=107276
2858 */
2859 LLVMValueRef zero = ctx->i32_0;
2860 LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, width, zero, "");
2861 result = LLVMBuildSelect(ctx->builder, icond, zero, result, "");
2862 }
2863
2864 return result;
2865 }
2866
2867 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2868 LLVMValueRef s1, LLVMValueRef s2)
2869 {
2870 return LLVMBuildAdd(ctx->builder,
2871 LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2872 }
2873
2874 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2875 LLVMValueRef s1, LLVMValueRef s2)
2876 {
2877 return LLVMBuildFAdd(ctx->builder,
2878 LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2879 }
2880
2881 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2882 {
2883 if (!wait_flags)
2884 return;
2885
2886 unsigned lgkmcnt = 63;
2887 unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2888 unsigned vscnt = 63;
2889
2890 if (wait_flags & AC_WAIT_LGKM)
2891 lgkmcnt = 0;
2892 if (wait_flags & AC_WAIT_VLOAD)
2893 vmcnt = 0;
2894
2895 if (wait_flags & AC_WAIT_VSTORE) {
2896 if (ctx->chip_class >= GFX10)
2897 vscnt = 0;
2898 else
2899 vmcnt = 0;
2900 }
2901
2902 /* There is no intrinsic for vscnt(0), so use a fence. */
2903 if ((wait_flags & AC_WAIT_LGKM &&
2904 wait_flags & AC_WAIT_VLOAD &&
2905 wait_flags & AC_WAIT_VSTORE) ||
2906 vscnt == 0) {
2907 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2908 return;
2909 }
2910
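 /* e.g. waiting only on LGKM (lgkmcnt = 0, vmcnt = 63) encodes
  * simm16 = (0 << 8) | (7 << 4) | 0xf | (3 << 14) = 0xc07f. */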
2911 unsigned simm16 = (lgkmcnt << 8) |
2912 (7 << 4) | /* expcnt */
2913 (vmcnt & 0xf) |
2914 ((vmcnt >> 4) << 14);
2915
2916 LLVMValueRef args[1] = {
2917 LLVMConstInt(ctx->i32, simm16, false),
2918 };
2919 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2920 ctx->voidt, args, 1, 0);
2921 }
2922
2923 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
2924 LLVMValueRef src1, LLVMValueRef src2,
2925 unsigned bitsize)
2926 {
2927 LLVMTypeRef type;
2928 const char *intr;
2929
2930 if (bitsize == 16) {
2931 intr = "llvm.amdgcn.fmed3.f16";
2932 type = ctx->f16;
2933 } else if (bitsize == 32) {
2934 intr = "llvm.amdgcn.fmed3.f32";
2935 type = ctx->f32;
2936 } else {
2937 intr = "llvm.amdgcn.fmed3.f64";
2938 type = ctx->f64;
2939 }
2940
2941 LLVMValueRef params[] = {
2942 src0,
2943 src1,
2944 src2,
2945 };
2946 return ac_build_intrinsic(ctx, intr, type, params, 3,
2947 AC_FUNC_ATTR_READNONE);
2948 }
2949
2950 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2951 unsigned bitsize)
2952 {
2953 LLVMTypeRef type;
2954 const char *intr;
2955
2956 if (bitsize == 16) {
2957 intr = "llvm.amdgcn.fract.f16";
2958 type = ctx->f16;
2959 } else if (bitsize == 32) {
2960 intr = "llvm.amdgcn.fract.f32";
2961 type = ctx->f32;
2962 } else {
2963 intr = "llvm.amdgcn.fract.f64";
2964 type = ctx->f64;
2965 }
2966
2967 LLVMValueRef params[] = {
2968 src0,
2969 };
2970 return ac_build_intrinsic(ctx, intr, type, params, 1,
2971 AC_FUNC_ATTR_READNONE);
2972 }
2973
2974 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2975 unsigned bitsize)
2976 {
2977 LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2978 LLVMValueRef zero = LLVMConstInt(type, 0, false);
2979 LLVMValueRef one = LLVMConstInt(type, 1, false);
2980
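 /* Two selects implement integer sign(), e.g. isign(-5) = -1,
  * isign(0) = 0, isign(7) = 1. */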
2981 LLVMValueRef cmp, val;
2982 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2983 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2984 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2985 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2986 return val;
2987 }
2988
2989 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2990 unsigned bitsize)
2991 {
2992 LLVMValueRef cmp, val, zero, one;
2993 LLVMTypeRef type;
2994
2995 if (bitsize == 16) {
2996 type = ctx->f16;
2997 zero = ctx->f16_0;
2998 one = ctx->f16_1;
2999 } else if (bitsize == 32) {
3000 type = ctx->f32;
3001 zero = ctx->f32_0;
3002 one = ctx->f32_1;
3003 } else {
3004 type = ctx->f64;
3005 zero = ctx->f64_0;
3006 one = ctx->f64_1;
3007 }
3008
3009 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
3010 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
3011 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
3012 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
3013 return val;
3014 }
3015
3016 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
3017 {
3018 LLVMValueRef result;
3019 unsigned bitsize;
3020
3021 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3022
3023 switch (bitsize) {
3024 case 64:
3025 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
3026 (LLVMValueRef []) { src0 }, 1,
3027 AC_FUNC_ATTR_READNONE);
3028
3029 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3030 break;
3031 case 32:
3032 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
3033 (LLVMValueRef []) { src0 }, 1,
3034 AC_FUNC_ATTR_READNONE);
3035 break;
3036 case 16:
3037 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
3038 (LLVMValueRef []) { src0 }, 1,
3039 AC_FUNC_ATTR_READNONE);
3040
3041 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3042 break;
3043 case 8:
3044 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8,
3045 (LLVMValueRef []) { src0 }, 1,
3046 AC_FUNC_ATTR_READNONE);
3047
3048 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3049 break;
3050 default:
3051 unreachable("invalid bitsize");
3052 break;
3053 }
3054
3055 return result;
3056 }
3057
3058 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
3059 LLVMValueRef src0)
3060 {
3061 LLVMValueRef result;
3062 unsigned bitsize;
3063
3064 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3065
3066 switch (bitsize) {
3067 case 64:
3068 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64,
3069 (LLVMValueRef []) { src0 }, 1,
3070 AC_FUNC_ATTR_READNONE);
3071
3072 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
3073 break;
3074 case 32:
3075 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
3076 (LLVMValueRef []) { src0 }, 1,
3077 AC_FUNC_ATTR_READNONE);
3078 break;
3079 case 16:
3080 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
3081 (LLVMValueRef []) { src0 }, 1,
3082 AC_FUNC_ATTR_READNONE);
3083
3084 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3085 break;
3086 case 8:
3087 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8,
3088 (LLVMValueRef []) { src0 }, 1,
3089 AC_FUNC_ATTR_READNONE);
3090
3091 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
3092 break;
3093 default:
3094 unreachable("invalid bitsize");
3095 break;
3096 }
3097
3098 return result;
3099 }
3100
3101 #define AC_EXP_TARGET 0
3102 #define AC_EXP_ENABLED_CHANNELS 1
3103 #define AC_EXP_OUT0 2
3104
3105 enum ac_ir_type {
3106 AC_IR_UNDEF,
3107 AC_IR_CONST,
3108 AC_IR_VALUE,
3109 };
3110
3111 struct ac_vs_exp_chan
3112 {
3113 LLVMValueRef value;
3114 float const_float;
3115 enum ac_ir_type type;
3116 };
3117
3118 struct ac_vs_exp_inst {
3119 unsigned offset;
3120 LLVMValueRef inst;
3121 struct ac_vs_exp_chan chan[4];
3122 };
3123
3124 struct ac_vs_exports {
3125 unsigned num;
3126 struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
3127 };
3128
3129 /* Return true if the PARAM export has been eliminated. */
3130 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
3131 uint32_t num_outputs,
3132 struct ac_vs_exp_inst *exp)
3133 {
3134 unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
3135 bool is_zero[4] = {}, is_one[4] = {};
3136
3137 for (i = 0; i < 4; i++) {
3138 /* It's a constant expression. Undef outputs are eliminated too. */
3139 if (exp->chan[i].type == AC_IR_UNDEF) {
3140 is_zero[i] = true;
3141 is_one[i] = true;
3142 } else if (exp->chan[i].type == AC_IR_CONST) {
3143 if (exp->chan[i].const_float == 0)
3144 is_zero[i] = true;
3145 else if (exp->chan[i].const_float == 1)
3146 is_one[i] = true;
3147 else
3148 return false; /* other constant */
3149 } else
3150 return false;
3151 }
3152
3153 /* Only certain combinations of 0 and 1 can be eliminated. */
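 /* e.g. a constant (0, 0, 0, 1) output becomes DEFAULT_VAL = 1 and
  * (1, 1, 1, 1) becomes DEFAULT_VAL = 3. */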
3154 if (is_zero[0] && is_zero[1] && is_zero[2])
3155 default_val = is_zero[3] ? 0 : 1;
3156 else if (is_one[0] && is_one[1] && is_one[2])
3157 default_val = is_zero[3] ? 2 : 3;
3158 else
3159 return false;
3160
3161 /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
3162 LLVMInstructionEraseFromParent(exp->inst);
3163
3164 /* Change OFFSET to DEFAULT_VAL. */
3165 for (i = 0; i < num_outputs; i++) {
3166 if (vs_output_param_offset[i] == exp->offset) {
3167 vs_output_param_offset[i] =
3168 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
3169 break;
3170 }
3171 }
3172 return true;
3173 }
3174
3175 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
3176 uint8_t *vs_output_param_offset,
3177 uint32_t num_outputs,
3178 struct ac_vs_exports *processed,
3179 struct ac_vs_exp_inst *exp)
3180 {
3181 unsigned p, copy_back_channels = 0;
3182
3183 /* See if the output is already in the list of processed outputs.
3184 * The LLVMValueRef comparison relies on SSA.
3185 */
3186 for (p = 0; p < processed->num; p++) {
3187 bool different = false;
3188
3189 for (unsigned j = 0; j < 4; j++) {
3190 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
3191 struct ac_vs_exp_chan *c2 = &exp->chan[j];
3192
3193 /* Treat undef as a match. */
3194 if (c2->type == AC_IR_UNDEF)
3195 continue;
3196
3197 /* If c1 is undef but c2 isn't, we can copy c2 to c1
3198 * and consider the instruction duplicated.
3199 */
3200 if (c1->type == AC_IR_UNDEF) {
3201 copy_back_channels |= 1 << j;
3202 continue;
3203 }
3204
3205 /* Test whether the channels are not equal. */
3206 if (c1->type != c2->type ||
3207 (c1->type == AC_IR_CONST &&
3208 c1->const_float != c2->const_float) ||
3209 (c1->type == AC_IR_VALUE &&
3210 c1->value != c2->value)) {
3211 different = true;
3212 break;
3213 }
3214 }
3215 if (!different)
3216 break;
3217
3218 copy_back_channels = 0;
3219 }
3220 if (p == processed->num)
3221 return false;
3222
3223 /* If a match was found, but the matching export has undef where the new
3224 * one has a normal value, copy the normal value to the undef channel.
3225 */
3226 struct ac_vs_exp_inst *match = &processed->exp[p];
3227
3228 /* Get current enabled channels mask. */
3229 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
3230 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
3231
3232 while (copy_back_channels) {
3233 unsigned chan = u_bit_scan(&copy_back_channels);
3234
3235 assert(match->chan[chan].type == AC_IR_UNDEF);
3236 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
3237 exp->chan[chan].value);
3238 match->chan[chan] = exp->chan[chan];
3239
3240 /* Update number of enabled channels because the original mask
3241 * is not always 0xf.
3242 */
3243 enabled_channels |= (1 << chan);
3244 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
3245 LLVMConstInt(ctx->i32, enabled_channels, 0));
3246 }
3247
3248 /* The PARAM export is duplicated. Kill it. */
3249 LLVMInstructionEraseFromParent(exp->inst);
3250
3251 /* Change OFFSET to the matching export. */
3252 for (unsigned i = 0; i < num_outputs; i++) {
3253 if (vs_output_param_offset[i] == exp->offset) {
3254 vs_output_param_offset[i] = match->offset;
3255 break;
3256 }
3257 }
3258 return true;
3259 }
3260
3261 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
3262 LLVMValueRef main_fn,
3263 uint8_t *vs_output_param_offset,
3264 uint32_t num_outputs,
3265 uint8_t *num_param_exports)
3266 {
3267 LLVMBasicBlockRef bb;
3268 bool removed_any = false;
3269 struct ac_vs_exports exports;
3270
3271 exports.num = 0;
3272
3273 /* Process all LLVM instructions. */
3274 bb = LLVMGetFirstBasicBlock(main_fn);
3275 while (bb) {
3276 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
3277
3278 while (inst) {
3279 LLVMValueRef cur = inst;
3280 inst = LLVMGetNextInstruction(inst);
3281 struct ac_vs_exp_inst exp;
3282
3283 if (LLVMGetInstructionOpcode(cur) != LLVMCall)
3284 continue;
3285
3286 LLVMValueRef callee = ac_llvm_get_called_value(cur);
3287
3288 if (!ac_llvm_is_function(callee))
3289 continue;
3290
3291 const char *name = LLVMGetValueName(callee);
3292 unsigned num_args = LLVMCountParams(callee);
3293
3294 /* Check if this is an export instruction. */
3295 if ((num_args != 9 && num_args != 8) ||
3296 (strcmp(name, "llvm.SI.export") &&
3297 strcmp(name, "llvm.amdgcn.exp.f32")))
3298 continue;
3299
3300 LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
3301 unsigned target = LLVMConstIntGetZExtValue(arg);
3302
3303 if (target < V_008DFC_SQ_EXP_PARAM)
3304 continue;
3305
3306 target -= V_008DFC_SQ_EXP_PARAM;
3307
3308 /* Parse the instruction. */
3309 memset(&exp, 0, sizeof(exp));
3310 exp.offset = target;
3311 exp.inst = cur;
3312
3313 for (unsigned i = 0; i < 4; i++) {
3314 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
3315
3316 exp.chan[i].value = v;
3317
3318 if (LLVMIsUndef(v)) {
3319 exp.chan[i].type = AC_IR_UNDEF;
3320 } else if (LLVMIsAConstantFP(v)) {
3321 LLVMBool loses_info;
3322 exp.chan[i].type = AC_IR_CONST;
3323 exp.chan[i].const_float =
3324 LLVMConstRealGetDouble(v, &loses_info);
3325 } else {
3326 exp.chan[i].type = AC_IR_VALUE;
3327 }
3328 }
3329
3330 /* Eliminate constant and duplicated PARAM exports. */
3331 if (ac_eliminate_const_output(vs_output_param_offset,
3332 num_outputs, &exp) ||
3333 ac_eliminate_duplicated_output(ctx,
3334 vs_output_param_offset,
3335 num_outputs, &exports,
3336 &exp)) {
3337 removed_any = true;
3338 } else {
3339 exports.exp[exports.num++] = exp;
3340 }
3341 }
3342 bb = LLVMGetNextBasicBlock(bb);
3343 }
3344
3345 /* Remove holes in export memory due to removed PARAM exports.
3346 * This is done by renumbering all PARAM exports.
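 * For example, if PARAM exports 0..3 were emitted and export 1 was
 * eliminated, the remaining exports are renumbered 0..2 and
 * vs_output_param_offset is rewritten to match.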
3347 */
3348 if (removed_any) {
3349 uint8_t old_offset[VARYING_SLOT_MAX];
3350 unsigned out, i;
3351
3352 /* Make a copy of the offsets. We need the old version while
3353 * we are modifying some of them. */
3354 memcpy(old_offset, vs_output_param_offset,
3355 sizeof(old_offset));
3356
3357 for (i = 0; i < exports.num; i++) {
3358 unsigned offset = exports.exp[i].offset;
3359
3360 /* Update vs_output_param_offset. Multiple outputs can
3361 * have the same offset.
3362 */
3363 for (out = 0; out < num_outputs; out++) {
3364 if (old_offset[out] == offset)
3365 vs_output_param_offset[out] = i;
3366 }
3367
3368 /* Change the PARAM offset in the instruction. */
3369 LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
3370 LLVMConstInt(ctx->i32,
3371 V_008DFC_SQ_EXP_PARAM + i, 0));
3372 }
3373 *num_param_exports = exports.num;
3374 }
3375 }
3376
3377 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
3378 {
3379 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
3380 ac_build_intrinsic(ctx,
3381 "llvm.amdgcn.init.exec", ctx->voidt,
3382 &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
3383 }
3384
3385 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
3386 {
3387 unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
3388 ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
3389 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
3390 "lds");
3391 }
3392
3393 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
3394 LLVMValueRef dw_addr)
3395 {
3396 return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
3397 }
3398
3399 void ac_lds_store(struct ac_llvm_context *ctx,
3400 LLVMValueRef dw_addr,
3401 LLVMValueRef value)
3402 {
3403 value = ac_to_integer(ctx, value);
3404 ac_build_indexed_store(ctx, ctx->lds,
3405 dw_addr, value);
3406 }
3407
3408 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
3409 LLVMTypeRef dst_type,
3410 LLVMValueRef src0)
3411 {
3412 unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3413 const char *intrin_name;
3414 LLVMTypeRef type;
3415 LLVMValueRef zero;
3416
3417 switch (src0_bitsize) {
3418 case 64:
3419 intrin_name = "llvm.cttz.i64";
3420 type = ctx->i64;
3421 zero = ctx->i64_0;
3422 break;
3423 case 32:
3424 intrin_name = "llvm.cttz.i32";
3425 type = ctx->i32;
3426 zero = ctx->i32_0;
3427 break;
3428 case 16:
3429 intrin_name = "llvm.cttz.i16";
3430 type = ctx->i16;
3431 zero = ctx->i16_0;
3432 break;
3433 case 8:
3434 intrin_name = "llvm.cttz.i8";
3435 type = ctx->i8;
3436 zero = ctx->i8_0;
3437 break;
3438 default:
3439 unreachable("invalid bitsize");
3440 }
3441
3442 LLVMValueRef params[2] = {
3443 src0,
3444
3445 /* Passing true here means that ffs(x=0) = undef, so LLVM won't
3446 * add special code to check for x=0. The reason is that
3447 * the LLVM behavior for x=0 is different from what we
3448 * need here. However, LLVM also assumes that ffs(x) is
3449 * in [0, 31], but GLSL expects that ffs(0) = -1, so
3450 * a conditional assignment to handle 0 is still required.
3451 *
3452 * The hardware already implements the correct behavior.
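 *
 * e.g. the final select below yields 2 for src0 = 0b100 and -1
 * for src0 = 0.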
3453 */
3454 ctx->i1true,
3455 };
3456
3457 LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
3458 params, 2,
3459 AC_FUNC_ATTR_READNONE);
3460
3461 if (src0_bitsize == 64) {
3462 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3463 } else if (src0_bitsize < 32) {
3464 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3465 }
3466
3467 /* TODO: We need an intrinsic to skip this conditional. */
3468 /* Check for zero: */
3469 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
3470 LLVMIntEQ, src0,
3471 zero, ""),
3472 LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3473 }
3474
3475 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3476 {
3477 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
3478 }
3479
3480 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3481 {
3482 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
3483 }
3484
3485 static struct ac_llvm_flow *
3486 get_current_flow(struct ac_llvm_context *ctx)
3487 {
3488 if (ctx->flow_depth > 0)
3489 return &ctx->flow[ctx->flow_depth - 1];
3490 return NULL;
3491 }
3492
3493 static struct ac_llvm_flow *
3494 get_innermost_loop(struct ac_llvm_context *ctx)
3495 {
3496 for (unsigned i = ctx->flow_depth; i > 0; --i) {
3497 if (ctx->flow[i - 1].loop_entry_block)
3498 return &ctx->flow[i - 1];
3499 }
3500 return NULL;
3501 }
3502
3503 static struct ac_llvm_flow *
3504 push_flow(struct ac_llvm_context *ctx)
3505 {
3506 struct ac_llvm_flow *flow;
3507
3508 if (ctx->flow_depth >= ctx->flow_depth_max) {
3509 unsigned new_max = MAX2(ctx->flow_depth << 1,
3510 AC_LLVM_INITIAL_CF_DEPTH);
3511
3512 ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
3513 ctx->flow_depth_max = new_max;
3514 }
3515
3516 flow = &ctx->flow[ctx->flow_depth];
3517 ctx->flow_depth++;
3518
3519 flow->next_block = NULL;
3520 flow->loop_entry_block = NULL;
3521 return flow;
3522 }
3523
3524 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
3525 int label_id)
3526 {
3527 char buf[32];
3528 snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3529 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3530 }
3531
3532 /* Append a basic block at the level of the parent flow.
3533 */
3534 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
3535 const char *name)
3536 {
3537 assert(ctx->flow_depth >= 1);
3538
3539 if (ctx->flow_depth >= 2) {
3540 struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
3541
3542 return LLVMInsertBasicBlockInContext(ctx->context,
3543 flow->next_block, name);
3544 }
3545
3546 LLVMValueRef main_fn =
3547 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3548 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3549 }
3550
3551 /* Emit a branch to the given default target for the current block if
3552 * applicable -- that is, if the current block does not already contain a
3553 * branch from a break or continue.
3554 */
3555 static void emit_default_branch(LLVMBuilderRef builder,
3556 LLVMBasicBlockRef target)
3557 {
3558 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3559 LLVMBuildBr(builder, target);
3560 }
3561
3562 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3563 {
3564 struct ac_llvm_flow *flow = push_flow(ctx);
3565 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3566 flow->next_block = append_basic_block(ctx, "ENDLOOP");
3567 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3568 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3569 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3570 }
3571
3572 void ac_build_break(struct ac_llvm_context *ctx)
3573 {
3574 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3575 LLVMBuildBr(ctx->builder, flow->next_block);
3576 }
3577
3578 void ac_build_continue(struct ac_llvm_context *ctx)
3579 {
3580 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3581 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3582 }
3583
3584 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3585 {
3586 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3587 LLVMBasicBlockRef endif_block;
3588
3589 assert(!current_branch->loop_entry_block);
3590
3591 endif_block = append_basic_block(ctx, "ENDIF");
3592 emit_default_branch(ctx->builder, endif_block);
3593
3594 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3595 set_basicblock_name(current_branch->next_block, "else", label_id);
3596
3597 current_branch->next_block = endif_block;
3598 }
3599
3600 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3601 {
3602 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3603
3604 assert(!current_branch->loop_entry_block);
3605
3606 emit_default_branch(ctx->builder, current_branch->next_block);
3607 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3608 set_basicblock_name(current_branch->next_block, "endif", label_id);
3609
3610 ctx->flow_depth--;
3611 }
3612
3613 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3614 {
3615 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3616
3617 assert(current_loop->loop_entry_block);
3618
3619 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3620
3621 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3622 set_basicblock_name(current_loop->next_block, "endloop", label_id);
3623 ctx->flow_depth--;
3624 }
3625
3626 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3627 {
3628 struct ac_llvm_flow *flow = push_flow(ctx);
3629 LLVMBasicBlockRef if_block;
3630
3631 if_block = append_basic_block(ctx, "IF");
3632 flow->next_block = append_basic_block(ctx, "ELSE");
3633 set_basicblock_name(if_block, "if", label_id);
3634 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3635 LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3636 }
3637
3638 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3639 int label_id)
3640 {
3641 LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3642 value, ctx->f32_0, "");
3643 ac_build_ifcc(ctx, cond, label_id);
3644 }
3645
3646 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3647 int label_id)
3648 {
3649 LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3650 ac_to_integer(ctx, value),
3651 ctx->i32_0, "");
3652 ac_build_ifcc(ctx, cond, label_id);
3653 }
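
/* Usage sketch (hypothetical caller): the control-flow helpers above pair
 * up by label_id (which is only used to name the basic blocks) and must
 * be strictly nested:
 *
 *    ac_build_bgnloop(ctx, 100);
 *       ac_build_ifcc(ctx, exit_cond, 101);
 *          ac_build_break(ctx);
 *       ac_build_endif(ctx, 101);
 *       ... loop body ...
 *    ac_build_endloop(ctx, 100);
 *
 * where "exit_cond" is an illustrative i1 value.
 */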
3654
3655 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3656 const char *name)
3657 {
3658 LLVMBuilderRef builder = ac->builder;
3659 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3660 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3661 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3662 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3663 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3664 LLVMValueRef res;
3665
3666 if (first_instr) {
3667 LLVMPositionBuilderBefore(first_builder, first_instr);
3668 } else {
3669 LLVMPositionBuilderAtEnd(first_builder, first_block);
3670 }
3671
3672 res = LLVMBuildAlloca(first_builder, type, name);
3673 LLVMDisposeBuilder(first_builder);
3674 return res;
3675 }
3676
3677 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3678 LLVMTypeRef type, const char *name)
3679 {
3680 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3681 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3682 return ptr;
3683 }
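
/* Note: the alloca is emitted into the function's entry block regardless
 * of where the builder currently sits; an alloca inside a loop body would
 * be re-executed on every iteration and defeat LLVM's mem2reg/SROA passes.
 * A minimal sketch (hypothetical caller):
 *
 *    LLVMValueRef tmp = ac_build_alloca(ctx, ctx->i32, "tmp"); // zeroed
 *    LLVMBuildStore(ctx->builder, some_value, tmp);
 */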
3684
3685 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3686 LLVMTypeRef type)
3687 {
3688 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3689 return LLVMBuildBitCast(ctx->builder, ptr,
3690 LLVMPointerType(type, addr_space), "");
3691 }
3692
3693 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3694 unsigned count)
3695 {
3696 unsigned num_components = ac_get_llvm_num_components(value);
3697 if (count == num_components)
3698 return value;
3699
3700 LLVMValueRef masks[MAX2(count, 2)];
3701 masks[0] = ctx->i32_0;
3702 masks[1] = ctx->i32_1;
3703 for (unsigned i = 2; i < count; i++)
3704 masks[i] = LLVMConstInt(ctx->i32, i, false);
3705
3706 if (count == 1)
3707 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3708 "");
3709
3710 LLVMValueRef swizzle = LLVMConstVector(masks, count);
3711 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3712 }
3713
3714 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3715 unsigned rshift, unsigned bitwidth)
3716 {
3717 LLVMValueRef value = param;
3718 if (rshift)
3719 value = LLVMBuildLShr(ctx->builder, value,
3720 LLVMConstInt(ctx->i32, rshift, false), "");
3721
3722 if (rshift + bitwidth < 32) {
3723 unsigned mask = (1 << bitwidth) - 1;
3724 value = LLVMBuildAnd(ctx->builder, value,
3725 LLVMConstInt(ctx->i32, mask, false), "");
3726 }
3727 return value;
3728 }
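
/* Example: ac_unpack_param(ctx, param, 8, 8) computes (param >> 8) & 0xff,
 * i.e. it extracts an 8-bit field starting at bit 8. When the field ends
 * at bit 31 (rshift + bitwidth == 32), the AND is skipped because the
 * shift already cleared the high bits.
 */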
3729
3730 /* Adjust the sample index according to FMASK.
3731 *
3732 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3733 * which is the identity mapping. Each nibble says which physical sample
3734 * should be fetched to get that sample.
3735 *
3736 * For example, 0x11111100 means there are only 2 samples stored and
3737 * the second sample covers 3/4 of the pixel. When reading samples 0
3738 * and 1, return physical sample 0 (determined by the first two 0s
3739 * in FMASK), otherwise return physical sample 1.
3740 *
3741 * The sample index should be adjusted as follows:
3742 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3743 */
3744 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3745 LLVMValueRef *addr, bool is_array_tex)
3746 {
3747 struct ac_image_args fmask_load = {};
3748 fmask_load.opcode = ac_image_load;
3749 fmask_load.resource = fmask;
3750 fmask_load.dmask = 0xf;
3751 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3752 fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3753
3754 fmask_load.coords[0] = addr[0];
3755 fmask_load.coords[1] = addr[1];
3756 if (is_array_tex)
3757 fmask_load.coords[2] = addr[2];
3758
3759 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3760 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3761 ac->i32_0, "");
3762
3763 /* Apply the formula. */
3764 unsigned sample_chan = is_array_tex ? 3 : 2;
3765 LLVMValueRef final_sample;
3766 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3767 LLVMConstInt(ac->i32, 4, 0), "");
3768 final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3769 /* Mask the sample index by 0x7, because 0x8 means an unknown value
3770 * with EQAA, so those will map to 0. */
3771 final_sample = LLVMBuildAnd(ac->builder, final_sample,
3772 LLVMConstInt(ac->i32, 0x7, 0), "");
3773
3774 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3775 * resource descriptor is 0 (invalid).
3776 */
3777 LLVMValueRef tmp;
3778 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3779 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3780 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3781
3782 /* Replace the MSAA sample index. */
3783 addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3784 addr[sample_chan], "");
3785 }
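
/* Worked example for ac_apply_fmask_to_sample(), using the FMASK value
 * 0x11111100 from the comment above: sample_index = 1 yields
 * (0x11111100 >> 4) & 0xF = 0, i.e. physical sample 0, while
 * sample_index = 2 yields (0x11111100 >> 8) & 0xF = 1.
 */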
3786
3787 static LLVMValueRef
3788 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3789 {
3790 ac_build_optimization_barrier(ctx, &src);
3791 return ac_build_intrinsic(ctx,
3792 lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3793 LLVMTypeOf(src), (LLVMValueRef []) {
3794 src, lane },
3795 lane == NULL ? 1 : 2,
3796 AC_FUNC_ATTR_READNONE |
3797 AC_FUNC_ATTR_CONVERGENT);
3798 }
3799
3800 /**
3801 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3802  * @param ctx the LLVM build context
3803  * @param src the value to read from the lane
3804 * @param lane - id of the lane or NULL for the first active lane
3805 * @return value of the lane
3806 */
3807 LLVMValueRef
3808 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3809 {
3810 LLVMTypeRef src_type = LLVMTypeOf(src);
3811 src = ac_to_integer(ctx, src);
3812 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3813 LLVMValueRef ret;
3814
3815 if (bits == 32) {
3816 ret = _ac_build_readlane(ctx, src, lane);
3817 } else {
3818 assert(bits % 32 == 0);
3819 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3820 LLVMValueRef src_vector =
3821 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3822 ret = LLVMGetUndef(vec_type);
3823 for (unsigned i = 0; i < bits / 32; i++) {
3824 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3825 LLVMConstInt(ctx->i32, i, 0), "");
3826 LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3827 ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3828 LLVMConstInt(ctx->i32, i, 0), "");
3829 }
3830 }
3831 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3832 }
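
/* Usage sketch (hypothetical caller): broadcast the first active lane's
 * value to the whole wave, or read one specific lane; values wider than
 * 32 bits are handled by the per-dword loop above:
 *
 *    LLVMValueRef uniform = ac_build_readlane(ctx, value, NULL);
 *    LLVMValueRef lane7 = ac_build_readlane(ctx, value,
 *                                           LLVMConstInt(ctx->i32, 7, 0));
 */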
3833
3834 LLVMValueRef
3835 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3836 {
3837 if (HAVE_LLVM >= 0x0800) {
3838 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3839 (LLVMValueRef []) {value, lane, src}, 3,
3840 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3841 }
3842
3843 LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
3844 ac_get_thread_id(ctx), "");
3845 return LLVMBuildSelect(ctx->builder, pred, value, src, "");
3846 }
3847
3848 LLVMValueRef
3849 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3850 {
3851 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3852 LLVMVectorType(ctx->i32, 2),
3853 "");
3854 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3855 ctx->i32_0, "");
3856 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3857 ctx->i32_1, "");
3858 LLVMValueRef val =
3859 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3860 (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3861 2, AC_FUNC_ATTR_READNONE);
3862 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3863 (LLVMValueRef []) { mask_hi, val },
3864 2, AC_FUNC_ATTR_READNONE);
3865 return val;
3866 }
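
/* Example: with mask = ac_build_ballot(ctx, cond), each lane receives the
 * number of lanes below it for which cond is set, i.e. its rank among the
 * lanes where cond holds; the i1 fast path of ac_build_exclusive_scan()
 * below uses exactly this to build a prefix sum.
 */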
3867
3868 enum dpp_ctrl {
3869 _dpp_quad_perm = 0x000,
3870 _dpp_row_sl = 0x100,
3871 _dpp_row_sr = 0x110,
3872 _dpp_row_rr = 0x120,
3873 dpp_wf_sl1 = 0x130,
3874 dpp_wf_rl1 = 0x134,
3875 dpp_wf_sr1 = 0x138,
3876 dpp_wf_rr1 = 0x13C,
3877 dpp_row_mirror = 0x140,
3878 dpp_row_half_mirror = 0x141,
3879 dpp_row_bcast15 = 0x142,
3880 dpp_row_bcast31 = 0x143
3881 };
3882
3883 static inline enum dpp_ctrl
3884 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3885 {
3886 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3887 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3888 }
3889
3890 static inline enum dpp_ctrl
3891 dpp_row_sl(unsigned amount)
3892 {
3893 assert(amount > 0 && amount < 16);
3894 return _dpp_row_sl | amount;
3895 }
3896
3897 static inline enum dpp_ctrl
3898 dpp_row_sr(unsigned amount)
3899 {
3900 assert(amount > 0 && amount < 16);
3901 return _dpp_row_sr | amount;
3902 }
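
/* Example: dpp_quad_perm(1, 0, 3, 2) encodes "swap adjacent lane pairs
 * within each quad" (this is what ac_build_quad_swizzle() below uses for
 * the first reduction step), and dpp_row_sr(1) shifts each 16-lane row
 * right by one lane; both values end up as the DPP control immediate
 * passed to llvm.amdgcn.update.dpp.
 */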
3903
3904 static LLVMValueRef
3905 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3906 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3907 bool bound_ctrl)
3908 {
3909 return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3910 LLVMTypeOf(old),
3911 (LLVMValueRef[]) {
3912 old, src,
3913 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3914 LLVMConstInt(ctx->i32, row_mask, 0),
3915 LLVMConstInt(ctx->i32, bank_mask, 0),
3916 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3917 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3918 }
3919
3920 static LLVMValueRef
3921 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3922 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3923 bool bound_ctrl)
3924 {
3925 LLVMTypeRef src_type = LLVMTypeOf(src);
3926 src = ac_to_integer(ctx, src);
3927 old = ac_to_integer(ctx, old);
3928 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3929 LLVMValueRef ret;
3930 if (bits == 32) {
3931 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3932 bank_mask, bound_ctrl);
3933 } else {
3934 assert(bits % 32 == 0);
3935 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3936 LLVMValueRef src_vector =
3937 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3938 LLVMValueRef old_vector =
3939 LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3940 ret = LLVMGetUndef(vec_type);
3941 for (unsigned i = 0; i < bits / 32; i++) {
3942 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3943 LLVMConstInt(ctx->i32, i,
3944 0), "");
3945 old = LLVMBuildExtractElement(ctx->builder, old_vector,
3946 LLVMConstInt(ctx->i32, i,
3947 0), "");
3948 LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3949 dpp_ctrl,
3950 row_mask,
3951 bank_mask,
3952 bound_ctrl);
3953 ret = LLVMBuildInsertElement(ctx->builder, ret,
3954 ret_comp,
3955 LLVMConstInt(ctx->i32, i,
3956 0), "");
3957 }
3958 }
3959 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3960 }
3961
3962 static LLVMValueRef
3963 _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3964 bool exchange_rows, bool bound_ctrl)
3965 {
3966 LLVMValueRef args[6] = {
3967 src,
3968 src,
3969 LLVMConstInt(ctx->i32, sel, false),
3970 LLVMConstInt(ctx->i32, sel >> 32, false),
3971 ctx->i1true, /* fi */
3972 bound_ctrl ? ctx->i1true : ctx->i1false,
3973 };
3974 return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16"
3975 : "llvm.amdgcn.permlane16",
3976 ctx->i32, args, 6,
3977 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3978 }
3979
3980 static LLVMValueRef
3981 ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3982 bool exchange_rows, bool bound_ctrl)
3983 {
3984 LLVMTypeRef src_type = LLVMTypeOf(src);
3985 src = ac_to_integer(ctx, src);
3986 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3987 LLVMValueRef ret;
3988 if (bits == 32) {
3989 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows,
3990 bound_ctrl);
3991 } else {
3992 assert(bits % 32 == 0);
3993 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3994 LLVMValueRef src_vector =
3995 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3996 ret = LLVMGetUndef(vec_type);
3997 for (unsigned i = 0; i < bits / 32; i++) {
3998 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3999 LLVMConstInt(ctx->i32, i,
4000 0), "");
4001 LLVMValueRef ret_comp =
4002 _ac_build_permlane16(ctx, src, sel,
4003 exchange_rows,
4004 bound_ctrl);
4005 ret = LLVMBuildInsertElement(ctx->builder, ret,
4006 ret_comp,
4007 LLVMConstInt(ctx->i32, i,
4008 0), "");
4009 }
4010 }
4011 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4012 }
4013
4014 static inline unsigned
4015 ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
4016 {
4017 assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
4018 return and_mask | (or_mask << 5) | (xor_mask << 10);
4019 }
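
/* Example: in bit mode, lane i reads from lane
 * ((i & and_mask) | or_mask) ^ xor_mask within each group of 32 lanes,
 * so ds_pattern_bitmode(0x1f, 0, 0x04) makes lane i read lane i ^ 4,
 * the butterfly step ac_build_reduce() uses for cluster_size == 8 on
 * pre-GFX8 chips.
 */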
4020
4021 static LLVMValueRef
4022 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4023 {
4024 return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
4025 LLVMTypeOf(src), (LLVMValueRef []) {
4026 src, LLVMConstInt(ctx->i32, mask, 0) },
4027 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
4028 }
4029
4030 LLVMValueRef
4031 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
4032 {
4033 LLVMTypeRef src_type = LLVMTypeOf(src);
4034 src = ac_to_integer(ctx, src);
4035 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
4036 LLVMValueRef ret;
4037 if (bits == 32) {
4038 ret = _ac_build_ds_swizzle(ctx, src, mask);
4039 } else {
4040 assert(bits % 32 == 0);
4041 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
4042 LLVMValueRef src_vector =
4043 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
4044 ret = LLVMGetUndef(vec_type);
4045 for (unsigned i = 0; i < bits / 32; i++) {
4046 src = LLVMBuildExtractElement(ctx->builder, src_vector,
4047 LLVMConstInt(ctx->i32, i,
4048 0), "");
4049 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
4050 mask);
4051 ret = LLVMBuildInsertElement(ctx->builder, ret,
4052 ret_comp,
4053 LLVMConstInt(ctx->i32, i,
4054 0), "");
4055 }
4056 }
4057 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4058 }
4059
4060 static LLVMValueRef
4061 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
4062 {
4063 char name[32], type[8];
4064 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4065 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
4066 return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
4067 (LLVMValueRef []) { src }, 1,
4068 AC_FUNC_ATTR_READNONE);
4069 }
4070
4071 static LLVMValueRef
4072 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
4073 LLVMValueRef inactive)
4074 {
4075 char name[33], type[8];
4076 LLVMTypeRef src_type = LLVMTypeOf(src);
4077 src = ac_to_integer(ctx, src);
4078 inactive = ac_to_integer(ctx, inactive);
4079 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
4080 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
4081 LLVMValueRef ret =
4082 ac_build_intrinsic(ctx, name,
4083 LLVMTypeOf(src), (LLVMValueRef []) {
4084 src, inactive }, 2,
4085 AC_FUNC_ATTR_READNONE |
4086 AC_FUNC_ATTR_CONVERGENT);
4087 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
4088 }
4089
4090 static LLVMValueRef
4091 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
4092 {
4093 if (type_size == 4) {
4094 switch (op) {
4095 case nir_op_iadd: return ctx->i32_0;
4096 case nir_op_fadd: return ctx->f32_0;
4097 case nir_op_imul: return ctx->i32_1;
4098 case nir_op_fmul: return ctx->f32_1;
4099 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
4100 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
4101 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
4102 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
4103 case nir_op_umax: return ctx->i32_0;
4104 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
4105 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
4106 case nir_op_ior: return ctx->i32_0;
4107 case nir_op_ixor: return ctx->i32_0;
4108 default:
4109 unreachable("bad reduction intrinsic");
4110 }
4111 	} else { /* type_size == 8, i.e. 64-bit */
4112 switch (op) {
4113 case nir_op_iadd: return ctx->i64_0;
4114 case nir_op_fadd: return ctx->f64_0;
4115 case nir_op_imul: return ctx->i64_1;
4116 case nir_op_fmul: return ctx->f64_1;
4117 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
4118 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
4119 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
4120 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
4121 case nir_op_umax: return ctx->i64_0;
4122 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
4123 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
4124 case nir_op_ior: return ctx->i64_0;
4125 case nir_op_ixor: return ctx->i64_0;
4126 default:
4127 unreachable("bad reduction intrinsic");
4128 }
4129 }
4130 }
4131
4132 static LLVMValueRef
4133 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
4134 {
4135 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
4136 switch (op) {
4137 case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
4138 case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
4139 case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
4140 case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
4141 case nir_op_imin: return LLVMBuildSelect(ctx->builder,
4142 LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
4143 lhs, rhs, "");
4144 case nir_op_umin: return LLVMBuildSelect(ctx->builder,
4145 LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
4146 lhs, rhs, "");
4147 case nir_op_fmin: return ac_build_intrinsic(ctx,
4148 _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
4149 _64bit ? ctx->f64 : ctx->f32,
4150 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4151 case nir_op_imax: return LLVMBuildSelect(ctx->builder,
4152 LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
4153 lhs, rhs, "");
4154 case nir_op_umax: return LLVMBuildSelect(ctx->builder,
4155 LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
4156 lhs, rhs, "");
4157 case nir_op_fmax: return ac_build_intrinsic(ctx,
4158 _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
4159 _64bit ? ctx->f64 : ctx->f32,
4160 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
4161 case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
4162 case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
4163 case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
4164 default:
4165 unreachable("bad reduction intrinsic");
4166 }
4167 }
4168
4169 /**
4170 * \param maxprefix specifies that the result only needs to be correct for a
4171 * prefix of this many threads
4172 *
4173  * TODO: add inclusive and exclusive scan functions for GFX6.
4174 */
4175 static LLVMValueRef
4176 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
4177 unsigned maxprefix, bool inclusive)
4178 {
4179 LLVMValueRef result, tmp;
4180
4181 if (ctx->chip_class >= GFX10) {
4182 result = inclusive ? src : identity;
4183 } else {
4184 if (inclusive)
4185 result = src;
4186 else
4187 result = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
4188 }
4189 if (maxprefix <= 1)
4190 return result;
4191 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
4192 result = ac_build_alu_op(ctx, result, tmp, op);
4193 if (maxprefix <= 2)
4194 return result;
4195 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
4196 result = ac_build_alu_op(ctx, result, tmp, op);
4197 if (maxprefix <= 3)
4198 return result;
4199 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
4200 result = ac_build_alu_op(ctx, result, tmp, op);
4201 if (maxprefix <= 4)
4202 return result;
4203 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4204 result = ac_build_alu_op(ctx, result, tmp, op);
4205 if (maxprefix <= 8)
4206 return result;
4207 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4208 result = ac_build_alu_op(ctx, result, tmp, op);
4209 if (maxprefix <= 16)
4210 return result;
4211
4212 if (ctx->chip_class >= GFX10) {
4213 /* dpp_row_bcast{15,31} are not supported on gfx10. */
4214 LLVMBuilderRef builder = ctx->builder;
4215 LLVMValueRef tid = ac_get_thread_id(ctx);
4216 LLVMValueRef cc;
4217 /* TODO-GFX10: Can we get better code-gen by putting this into
4218 * a branch so that LLVM generates EXEC mask manipulations? */
4219 if (inclusive)
4220 tmp = result;
4221 else
4222 tmp = ac_build_alu_op(ctx, result, src, op);
4223 tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
4224 tmp = ac_build_alu_op(ctx, result, tmp, op);
4225 cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
4226 cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
4227 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4228 if (maxprefix <= 32)
4229 return result;
4230
4231 if (inclusive)
4232 tmp = result;
4233 else
4234 tmp = ac_build_alu_op(ctx, result, src, op);
4235 tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
4236 tmp = ac_build_alu_op(ctx, result, tmp, op);
4237 cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
4238 LLVMConstInt(ctx->i32, 32, false), "");
4239 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4240 return result;
4241 }
4242
4243 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4244 result = ac_build_alu_op(ctx, result, tmp, op);
4245 if (maxprefix <= 32)
4246 return result;
4247 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4248 result = ac_build_alu_op(ctx, result, tmp, op);
4249 return result;
4250 }
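
/* Worked example (assuming nir_op_iadd and every lane holding 1): the
 * shift-and-combine ladder above accumulates progressively larger
 * prefixes, so an inclusive scan leaves lane i holding i + 1, and an
 * exclusive scan leaves lane i holding i.
 */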
4251
4252 LLVMValueRef
4253 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4254 {
4255 LLVMValueRef result;
4256
4257 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4258 LLVMBuilderRef builder = ctx->builder;
4259 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4260 result = ac_build_ballot(ctx, src);
4261 result = ac_build_mbcnt(ctx, result);
4262 result = LLVMBuildAdd(builder, result, src, "");
4263 return result;
4264 }
4265
4266 ac_build_optimization_barrier(ctx, &src);
4267
4268 LLVMValueRef identity =
4269 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4270 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4271 LLVMTypeOf(identity), "");
4272 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
4273
4274 return ac_build_wwm(ctx, result);
4275 }
4276
4277 LLVMValueRef
4278 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4279 {
4280 LLVMValueRef result;
4281
4282 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4283 LLVMBuilderRef builder = ctx->builder;
4284 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4285 result = ac_build_ballot(ctx, src);
4286 result = ac_build_mbcnt(ctx, result);
4287 return result;
4288 }
4289
4290 ac_build_optimization_barrier(ctx, &src);
4291
4292 LLVMValueRef identity =
4293 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4294 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4295 LLVMTypeOf(identity), "");
4296 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
4297
4298 return ac_build_wwm(ctx, result);
4299 }
4300
4301 LLVMValueRef
4302 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
4303 {
4304 if (cluster_size == 1) return src;
4305 ac_build_optimization_barrier(ctx, &src);
4306 LLVMValueRef result, swap;
4307 LLVMValueRef identity = get_reduction_identity(ctx, op,
4308 ac_get_type_size(LLVMTypeOf(src)));
4309 result = LLVMBuildBitCast(ctx->builder,
4310 ac_build_set_inactive(ctx, src, identity),
4311 LLVMTypeOf(identity), "");
4312 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
4313 result = ac_build_alu_op(ctx, result, swap, op);
4314 if (cluster_size == 2) return ac_build_wwm(ctx, result);
4315
4316 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
4317 result = ac_build_alu_op(ctx, result, swap, op);
4318 if (cluster_size == 4) return ac_build_wwm(ctx, result);
4319
4320 if (ctx->chip_class >= GFX8)
4321 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
4322 else
4323 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
4324 result = ac_build_alu_op(ctx, result, swap, op);
4325 if (cluster_size == 8) return ac_build_wwm(ctx, result);
4326
4327 if (ctx->chip_class >= GFX8)
4328 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
4329 else
4330 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
4331 result = ac_build_alu_op(ctx, result, swap, op);
4332 if (cluster_size == 16) return ac_build_wwm(ctx, result);
4333
4334 if (ctx->chip_class >= GFX10)
4335 swap = ac_build_permlane16(ctx, result, 0, true, false);
4336 else if (ctx->chip_class >= GFX8 && cluster_size != 32)
4337 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4338 else
4339 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
4340 result = ac_build_alu_op(ctx, result, swap, op);
4341 if (cluster_size == 32) return ac_build_wwm(ctx, result);
4342
4343 if (ctx->chip_class >= GFX8) {
4344 if (ctx->chip_class >= GFX10)
4345 swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4346 else
4347 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4348 result = ac_build_alu_op(ctx, result, swap, op);
4349 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4350 return ac_build_wwm(ctx, result);
4351 } else {
4352 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4353 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4354 result = ac_build_alu_op(ctx, result, swap, op);
4355 return ac_build_wwm(ctx, result);
4356 }
4357 }
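
/* Usage sketch (hypothetical caller): cluster_size is a power of two up
 * to the wave size and selects the reduction granularity, e.g.
 *
 *    LLVMValueRef quad_sum = ac_build_reduce(ctx, v, nir_op_iadd, 4);
 *
 * leaves every lane holding the sum over its quad, while passing
 * ctx->wave_size reduces across the whole wave.
 */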
4358
4359 /**
4360 * "Top half" of a scan that reduces per-wave values across an entire
4361 * workgroup.
4362 *
4363 * The source value must be present in the highest lane of the wave, and the
4364 * highest lane must be live.
4365 */
4366 void
4367 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4368 {
4369 if (ws->maxwaves <= 1)
4370 return;
4371
4372 const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4373 LLVMBuilderRef builder = ctx->builder;
4374 LLVMValueRef tid = ac_get_thread_id(ctx);
4375 LLVMValueRef tmp;
4376
4377 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4378 ac_build_ifcc(ctx, tmp, 1000);
4379 LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4380 ac_build_endif(ctx, 1000);
4381 }
4382
4383 /**
4384 * "Bottom half" of a scan that reduces per-wave values across an entire
4385 * workgroup.
4386 *
4387 * The caller must place a barrier between the top and bottom halves.
4388 */
4389 void
4390 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4391 {
4392 const LLVMTypeRef type = LLVMTypeOf(ws->src);
4393 const LLVMValueRef identity =
4394 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4395
4396 if (ws->maxwaves <= 1) {
4397 ws->result_reduce = ws->src;
4398 ws->result_inclusive = ws->src;
4399 ws->result_exclusive = identity;
4400 return;
4401 }
4402 assert(ws->maxwaves <= 32);
4403
4404 LLVMBuilderRef builder = ctx->builder;
4405 LLVMValueRef tid = ac_get_thread_id(ctx);
4406 LLVMBasicBlockRef bbs[2];
4407 LLVMValueRef phivalues_scan[2];
4408 LLVMValueRef tmp, tmp2;
4409
4410 bbs[0] = LLVMGetInsertBlock(builder);
4411 phivalues_scan[0] = LLVMGetUndef(type);
4412
4413 if (ws->enable_reduce)
4414 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4415 else if (ws->enable_inclusive)
4416 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4417 else
4418 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4419 ac_build_ifcc(ctx, tmp, 1001);
4420 {
4421 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4422
4423 ac_build_optimization_barrier(ctx, &tmp);
4424
4425 bbs[1] = LLVMGetInsertBlock(builder);
4426 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4427 }
4428 ac_build_endif(ctx, 1001);
4429
4430 const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4431
4432 if (ws->enable_reduce) {
4433 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4434 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4435 }
4436 if (ws->enable_inclusive)
4437 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4438 if (ws->enable_exclusive) {
4439 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4440 tmp = ac_build_readlane(ctx, scan, tmp);
4441 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4442 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4443 }
4444 }
4445
4446 /**
4447 * Inclusive scan of a per-wave value across an entire workgroup.
4448 *
4449 * This implies an s_barrier instruction.
4450 *
4451 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
4452 * of the workgroup are live. (This requirement cannot easily be relaxed in a
4453 * useful manner because of the barrier in the algorithm.)
4454 */
4455 void
4456 ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4457 {
4458 ac_build_wg_wavescan_top(ctx, ws);
4459 ac_build_s_barrier(ctx);
4460 ac_build_wg_wavescan_bottom(ctx, ws);
4461 }
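
/* Usage sketch (hypothetical caller; see struct ac_wg_scan in
 * ac_llvm_build.h for the full field list):
 *
 *    struct ac_wg_scan ws = {};
 *    ws.op = nir_op_iadd;
 *    ws.src = per_wave_value;
 *    ws.scratch = lds_base;          // LDS space, one slot per wave
 *    ws.waveidx = wave_id;
 *    ws.numwaves = num_waves;
 *    ws.maxwaves = 16;
 *    ws.enable_inclusive = true;
 *    ac_build_wg_wavescan(ctx, &ws); // includes the s_barrier
 *    ... use ws.result_inclusive ...
 *
 * where per_wave_value, lds_base, wave_id and num_waves are illustrative.
 */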
4462
4463 /**
4464 * "Top half" of a scan that reduces per-thread values across an entire
4465 * workgroup.
4466 *
4467 * All lanes must be active when this code runs.
4468 */
4469 void
4470 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4471 {
4472 if (ws->enable_exclusive) {
4473 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4474 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4475 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4476 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4477 } else {
4478 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4479 }
4480
4481 bool enable_inclusive = ws->enable_inclusive;
4482 bool enable_exclusive = ws->enable_exclusive;
4483 ws->enable_inclusive = false;
4484 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4485 ac_build_wg_wavescan_top(ctx, ws);
4486 ws->enable_inclusive = enable_inclusive;
4487 ws->enable_exclusive = enable_exclusive;
4488 }
4489
4490 /**
4491 * "Bottom half" of a scan that reduces per-thread values across an entire
4492 * workgroup.
4493 *
4494 * The caller must place a barrier between the top and bottom halves.
4495 */
4496 void
4497 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4498 {
4499 bool enable_inclusive = ws->enable_inclusive;
4500 bool enable_exclusive = ws->enable_exclusive;
4501 ws->enable_inclusive = false;
4502 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4503 ac_build_wg_wavescan_bottom(ctx, ws);
4504 ws->enable_inclusive = enable_inclusive;
4505 ws->enable_exclusive = enable_exclusive;
4506
4507 /* ws->result_reduce is already the correct value */
4508 if (ws->enable_inclusive)
4509 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4510 if (ws->enable_exclusive)
4511 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4512 }
4513
4514 /**
4515 * A scan that reduces per-thread values across an entire workgroup.
4516 *
4517 * The caller must ensure that all lanes are active when this code runs
4518 * (WWM is insufficient!), because there is an implied barrier.
4519 */
4520 void
4521 ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4522 {
4523 ac_build_wg_scan_top(ctx, ws);
4524 ac_build_s_barrier(ctx);
4525 ac_build_wg_scan_bottom(ctx, ws);
4526 }
4527
4528 LLVMValueRef
4529 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
4530 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
4531 {
4532 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4533 if (ctx->chip_class >= GFX8) {
4534 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4535 } else {
4536 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4537 }
4538 }
4539
4540 LLVMValueRef
4541 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4542 {
4543 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4544 return ac_build_intrinsic(ctx,
4545 "llvm.amdgcn.ds.bpermute", ctx->i32,
4546 (LLVMValueRef []) {index, src}, 2,
4547 AC_FUNC_ATTR_READNONE |
4548 AC_FUNC_ATTR_CONVERGENT);
4549 }
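
/* Note: ds_bpermute addresses lanes in bytes, hence the multiplication by
 * 4 above; ac_build_shuffle(ctx, src, index) therefore makes each lane
 * read src from the lane whose id is given by its index value.
 */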
4550
4551 LLVMValueRef
4552 ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0,
4553 unsigned bitsize)
4554 {
4555 LLVMTypeRef type;
4556 	const char *intr;
4557
4558 if (bitsize == 16) {
4559 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4560 type = ctx->i16;
4561 } else if (bitsize == 32) {
4562 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4563 type = ctx->i32;
4564 } else {
4565 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4566 type = ctx->i32;
4567 }
4568
4569 LLVMValueRef params[] = {
4570 src0,
4571 };
4572 return ac_build_intrinsic(ctx, intr, type, params, 1,
4573 AC_FUNC_ATTR_READNONE);
4574 }

4575 LLVMValueRef
4576 ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0,
4577 unsigned bitsize)
4578 {
4579 LLVMTypeRef type;
4580 	const char *intr;
4581
4582 if (bitsize == 16) {
4583 intr = "llvm.amdgcn.frexp.mant.f16";
4584 type = ctx->f16;
4585 } else if (bitsize == 32) {
4586 intr = "llvm.amdgcn.frexp.mant.f32";
4587 type = ctx->f32;
4588 } else {
4589 intr = "llvm.amdgcn.frexp.mant.f64";
4590 type = ctx->f64;
4591 }
4592
4593 LLVMValueRef params[] = {
4594 src0,
4595 };
4596 return ac_build_intrinsic(ctx, intr, type, params, 1,
4597 AC_FUNC_ATTR_READNONE);
4598 }
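
/* Worked example: frexp splits a float into mantissa and exponent with
 * x = mant * 2^exp and mant in [0.5, 1.0), so for src0 = 8.0,
 * ac_build_frexp_mant() returns 0.5 and ac_build_frexp_exp() returns 4.
 */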
4599
4600 /*
4601  * Take an (I, J) interpolation coordinate pair and work out
4602  * the X and Y derivatives, returning
4603  * (DDX(I), DDX(J), DDY(I), DDY(J)).
4604 */
4605 LLVMValueRef
4606 ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4607 {
4608 LLVMValueRef result[4], a;
4609 unsigned i;
4610
4611 for (i = 0; i < 2; i++) {
4612 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
4613 LLVMConstInt(ctx->i32, i, false), "");
4614 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4615 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4616 }
4617 return ac_build_gather_values(ctx, result, 4);
4618 }
4619
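/* Returns -1 (all bits set) in lanes that are helper invocations and 0
 * otherwise: llvm.amdgcn.ps.live is true only for pixels that are really
 * covered, so its negation identifies helper lanes, and the sign-extend
 * widens the i1 to a full 32-bit boolean.
 */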
4620 LLVMValueRef
4621 ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4622 {
4623 LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
4624 ctx->i1, NULL, 0,
4625 AC_FUNC_ATTR_READNONE);
4626 result = LLVMBuildNot(ctx->builder, result, "");
4627 return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4628 }
4629
4630 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
4631 LLVMValueRef *args, unsigned num_args)
4632 {
4633 LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4634 LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4635 return ret;
4636 }