/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 */
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
#include "ac_llvm_build.h"

#include <llvm-c/Core.h>

#include "c11/threads.h"

#include <assert.h>
#include <stdio.h>

#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_math.h"
#include "sid.h"

#include "shader_enums.h"

#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
        /* Loop exit or next part of if/else/endif. */
        LLVMBasicBlockRef next_block;
        LLVMBasicBlockRef loop_entry_block;
};

/* Initialize module-independent parts of the context.
 *
 * The caller is responsible for initializing ctx::module and ctx::builder.
 */
void
ac_llvm_context_init(struct ac_llvm_context *ctx,
                     enum chip_class chip_class, enum radeon_family family)
{
        LLVMValueRef args[1];

        ctx->context = LLVMContextCreate();

        ctx->chip_class = chip_class;
        ctx->family = family;
        ctx->module = NULL;
        ctx->builder = NULL;

        ctx->voidt = LLVMVoidTypeInContext(ctx->context);
        ctx->i1 = LLVMInt1TypeInContext(ctx->context);
        ctx->i8 = LLVMInt8TypeInContext(ctx->context);
        ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
        ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
        ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
        ctx->intptr = ctx->i32;
        ctx->f16 = LLVMHalfTypeInContext(ctx->context);
        ctx->f32 = LLVMFloatTypeInContext(ctx->context);
        ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
        ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
        ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
        ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
        ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
        ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
        ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
        ctx->v8i32 = LLVMVectorType(ctx->i32, 8);

        ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
        ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
        ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
        ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
        ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
        ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
        ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
        ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
        ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
        ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
        ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
        ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

        ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
        ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

        ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
                                                      "range", 5);

        ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
                                                               "invariant.load", 14);

        ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);

        args[0] = LLVMConstReal(ctx->f32, 2.5);
        ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);

        ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
                                                        "amdgpu.uniform", 14);

        ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
}

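/* Usage sketch (illustrative, not part of the original file): a driver
 * typically initializes the context once per compile, then provides its own
 * module and builder before emitting IR. The enum values here are
 * hypothetical examples.
 *
 *    struct ac_llvm_context ac;
 *    ac_llvm_context_init(&ac, GFX9, CHIP_VEGA10);
 *    ac.module = LLVMModuleCreateWithNameInContext("shader", ac.context);
 *    ac.builder = LLVMCreateBuilderInContext(ac.context);
 *    ... emit IR ...
 *    ac_llvm_context_dispose(&ac);
 */
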
void
ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
        free(ctx->flow);
        ctx->flow = NULL;
        ctx->flow_depth_max = 0;
}

int
ac_get_llvm_num_components(LLVMValueRef value)
{
        LLVMTypeRef type = LLVMTypeOf(value);
        unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
                                          ? LLVMGetVectorSize(type)
                                          : 1;
        return num_components;
}

LLVMValueRef
ac_llvm_extract_elem(struct ac_llvm_context *ac,
                     LLVMValueRef value,
                     int index)
{
        if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
                assert(index == 0);
                return value;
        }

        return LLVMBuildExtractElement(ac->builder, value,
                                       LLVMConstInt(ac->i32, index, false), "");
}

int
ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
        if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
                type = LLVMGetElementType(type);

        if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
                return LLVMGetIntTypeWidth(type);

        if (type == ctx->f16)
                return 16;
        if (type == ctx->f32)
                return 32;
        if (type == ctx->f64)
                return 64;

        unreachable("Unhandled type kind in get_elem_bits");
}

unsigned
ac_get_type_size(LLVMTypeRef type)
{
        LLVMTypeKind kind = LLVMGetTypeKind(type);

        switch (kind) {
        case LLVMIntegerTypeKind:
                return LLVMGetIntTypeWidth(type) / 8;
        case LLVMHalfTypeKind:
                return 2;
        case LLVMFloatTypeKind:
                return 4;
        case LLVMDoubleTypeKind:
                return 8;
        case LLVMPointerTypeKind:
                if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
                        return 4;
                return 8;
        case LLVMVectorTypeKind:
                return LLVMGetVectorSize(type) *
                       ac_get_type_size(LLVMGetElementType(type));
        case LLVMArrayTypeKind:
                return LLVMGetArrayLength(type) *
                       ac_get_type_size(LLVMGetElementType(type));
        default:
                assert(0);
                return 0;
        }
}

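/* For example (illustrative): ac_get_type_size(ctx->v4f32) == 16 and
 * ac_get_type_size(ctx->i64) == 8, while a pointer in the 32-bit constant
 * address space reports 4 bytes and all other pointers report 8.
 */
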
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
        if (t == ctx->i8)
                return ctx->i8;
        else if (t == ctx->f16 || t == ctx->i16)
                return ctx->i16;
        else if (t == ctx->f32 || t == ctx->i32)
                return ctx->i32;
        else if (t == ctx->f64 || t == ctx->i64)
                return ctx->i64;
        else
                unreachable("Unhandled integer size");
}

LLVMTypeRef
ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
        if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
                LLVMTypeRef elem_type = LLVMGetElementType(t);
                return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
                                      LLVMGetVectorSize(t));
        }
        if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
                switch (LLVMGetPointerAddressSpace(t)) {
                case AC_ADDR_SPACE_GLOBAL:
                        return ctx->i64;
                case AC_ADDR_SPACE_LDS:
                        return ctx->i32;
                default:
                        unreachable("unhandled address space");
                }
        }
        return to_integer_type_scalar(ctx, t);
}

LLVMValueRef
ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
        LLVMTypeRef type = LLVMTypeOf(v);
        if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
                return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
        }
        return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}

LLVMValueRef
ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
        LLVMTypeRef type = LLVMTypeOf(v);
        if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
                return v;
        return ac_to_integer(ctx, v);
}

static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
        if (t == ctx->i8)
                return ctx->i8;
        else if (t == ctx->i16 || t == ctx->f16)
                return ctx->f16;
        else if (t == ctx->i32 || t == ctx->f32)
                return ctx->f32;
        else if (t == ctx->i64 || t == ctx->f64)
                return ctx->f64;
        else
                unreachable("Unhandled float size");
}

LLVMTypeRef
ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
        if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
                LLVMTypeRef elem_type = LLVMGetElementType(t);
                return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
                                      LLVMGetVectorSize(t));
        }
        return to_float_type_scalar(ctx, t);
}

LLVMValueRef
ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
        LLVMTypeRef type = LLVMTypeOf(v);
        return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
}


LLVMValueRef
ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                   LLVMTypeRef return_type, LLVMValueRef *params,
                   unsigned param_count, unsigned attrib_mask)
{
        LLVMValueRef function, call;
        bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

        function = LLVMGetNamedFunction(ctx->module, name);
        if (!function) {
                LLVMTypeRef param_types[32], function_type;
                unsigned i;

                assert(param_count <= 32);

                for (i = 0; i < param_count; ++i) {
                        assert(params[i]);
                        param_types[i] = LLVMTypeOf(params[i]);
                }
                function_type =
                        LLVMFunctionType(return_type, param_types, param_count, 0);
                function = LLVMAddFunction(ctx->module, name, function_type);

                LLVMSetFunctionCallConv(function, LLVMCCallConv);
                LLVMSetLinkage(function, LLVMExternalLinkage);

                if (!set_callsite_attrs)
                        ac_add_func_attributes(ctx->context, function, attrib_mask);
        }

        call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
        if (set_callsite_attrs)
                ac_add_func_attributes(ctx->context, call, attrib_mask);
        return call;
}

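/* Usage sketch (illustrative): emit a call to an AMDGPU intrinsic by name;
 * the declaration is created on first use and reused afterwards.
 *
 *    LLVMValueRef x = ...;
 *    LLVMValueRef r = ac_build_intrinsic(ctx, "llvm.amdgcn.fract.f32",
 *                                        ctx->f32, &x, 1,
 *                                        AC_FUNC_ATTR_READNONE);
 */
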
/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names).
 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
        LLVMTypeRef elem_type = type;

        assert(bufsize >= 8);

        if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
                int ret = snprintf(buf, bufsize, "v%u",
                                   LLVMGetVectorSize(type));
                if (ret < 0) {
                        char *type_name = LLVMPrintTypeToString(type);
                        fprintf(stderr, "Error building type name for: %s\n",
                                type_name);
                        return;
                }
                elem_type = LLVMGetElementType(type);
                buf += ret;
                bufsize -= ret;
        }
        switch (LLVMGetTypeKind(elem_type)) {
        default: break;
        case LLVMIntegerTypeKind:
                snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
                break;
        case LLVMHalfTypeKind:
                snprintf(buf, bufsize, "f16");
                break;
        case LLVMFloatTypeKind:
                snprintf(buf, bufsize, "f32");
                break;
        case LLVMDoubleTypeKind:
                snprintf(buf, bufsize, "f64");
                break;
        }
}

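/* For example (illustrative): passing ctx->v4f32 writes "v4f32" into buf and
 * ctx->i32 writes "i32", so the result can be appended to an intrinsic prefix
 * such as "llvm.amdgcn.buffer.load." to form an overload name.
 */
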
/**
 * Helper function that builds an LLVM IR PHI node and immediately adds
 * incoming edges.
 */
LLVMValueRef
ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
             unsigned count_incoming, LLVMValueRef *values,
             LLVMBasicBlockRef *blocks)
{
        LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
        LLVMAddIncoming(phi, values, blocks, count_incoming);
        return phi;
}

void ac_build_s_barrier(struct ac_llvm_context *ctx)
{
        ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
                           0, AC_FUNC_ATTR_CONVERGENT);
}

/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
void
ac_build_optimization_barrier(struct ac_llvm_context *ctx,
                              LLVMValueRef *pvgpr)
{
        static int counter = 0;

        LLVMBuilderRef builder = ctx->builder;
        char code[16];

        snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

        if (!pvgpr) {
                LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
                LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
                LLVMBuildCall(builder, inlineasm, NULL, 0, "");
        } else {
                LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
                LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
                LLVMValueRef vgpr = *pvgpr;
                LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
                unsigned vgpr_size = ac_get_type_size(vgpr_type);
                LLVMValueRef vgpr0;

                assert(vgpr_size % 4 == 0);

                vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
                vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
                vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
                vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
                vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

                *pvgpr = vgpr;
        }
}

LLVMValueRef
ac_build_shader_clock(struct ac_llvm_context *ctx)
{
        LLVMValueRef tmp = ac_build_intrinsic(ctx, "llvm.readcyclecounter",
                                              ctx->i64, NULL, 0, 0);
        return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
}

LLVMValueRef
ac_build_ballot(struct ac_llvm_context *ctx,
                LLVMValueRef value)
{
        LLVMValueRef args[3] = {
                value,
                ctx->i32_0,
                LLVMConstInt(ctx->i32, LLVMIntNE, 0)
        };

        /* We currently have no other way to prevent LLVM from lifting the icmp
         * calls to a dominating basic block.
         */
        ac_build_optimization_barrier(ctx, &args[0]);

        args[0] = ac_to_integer(ctx, args[0]);

        return ac_build_intrinsic(ctx,
                                  "llvm.amdgcn.icmp.i32",
                                  ctx->i64, args, 3,
                                  AC_FUNC_ATTR_NOUNWIND |
                                  AC_FUNC_ATTR_READNONE |
                                  AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef
ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
        LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
        LLVMValueRef vote_set = ac_build_ballot(ctx, value);
        return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
}

LLVMValueRef
ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
        LLVMValueRef vote_set = ac_build_ballot(ctx, value);
        return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
                             LLVMConstInt(ctx->i64, 0, 0), "");
}

LLVMValueRef
ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
        LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
        LLVMValueRef vote_set = ac_build_ballot(ctx, value);

        LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                                         vote_set, active_set, "");
        LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                                          vote_set,
                                          LLVMConstInt(ctx->i64, 0, 0), "");
        return LLVMBuildOr(ctx->builder, all, none, "");
}

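/* Semantics sketch (illustrative): ac_build_ballot returns a 64-bit mask with
 * one bit per active lane for which \p value is non-zero. The vote helpers
 * reduce that mask:
 *
 *    vote_all(v)  <=>  ballot(v) == ballot(true)   // every active lane set
 *    vote_any(v)  <=>  ballot(v) != 0              // at least one lane set
 *    vote_eq(v)   <=>  all lanes agree (all set or none set)
 */
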
LLVMValueRef
ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                               unsigned value_count, unsigned component)
{
        LLVMValueRef vec = NULL;

        if (value_count == 1) {
                return values[component];
        } else if (!value_count)
                unreachable("value_count is 0");

        for (unsigned i = component; i < value_count + component; i++) {
                LLVMValueRef value = values[i];

                if (i == component)
                        vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
                LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
                vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
        }
        return vec;
}

LLVMValueRef
ac_build_gather_values_extended(struct ac_llvm_context *ctx,
                                LLVMValueRef *values,
                                unsigned value_count,
                                unsigned value_stride,
                                bool load,
                                bool always_vector)
{
        LLVMBuilderRef builder = ctx->builder;
        LLVMValueRef vec = NULL;
        unsigned i;

        if (value_count == 1 && !always_vector) {
                if (load)
                        return LLVMBuildLoad(builder, values[0], "");
                return values[0];
        } else if (!value_count)
                unreachable("value_count is 0");

        for (i = 0; i < value_count; i++) {
                LLVMValueRef value = values[i * value_stride];
                if (load)
                        value = LLVMBuildLoad(builder, value, "");

                if (!i)
                        vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
                LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
                vec = LLVMBuildInsertElement(builder, vec, value, index, "");
        }
        return vec;
}

LLVMValueRef
ac_build_gather_values(struct ac_llvm_context *ctx,
                       LLVMValueRef *values,
                       unsigned value_count)
{
        return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}

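/* Usage sketch (illustrative): pack four scalar f32 values into a v4f32.
 *
 *    LLVMValueRef comps[4] = {r, g, b, a};   // hypothetical scalars
 *    LLVMValueRef color = ac_build_gather_values(ctx, comps, 4);
 */
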
/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
 */
static LLVMValueRef
ac_build_expand(struct ac_llvm_context *ctx,
                LLVMValueRef value,
                unsigned src_channels,
                unsigned dst_channels)
{
        LLVMTypeRef elemtype;
        LLVMValueRef chan[dst_channels];

        if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
                unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));

                if (src_channels == dst_channels && vec_size == dst_channels)
                        return value;

                src_channels = MIN2(src_channels, vec_size);

                for (unsigned i = 0; i < src_channels; i++)
                        chan[i] = ac_llvm_extract_elem(ctx, value, i);

                elemtype = LLVMGetElementType(LLVMTypeOf(value));
        } else {
                if (src_channels) {
                        assert(src_channels == 1);
                        chan[0] = value;
                }
                elemtype = LLVMTypeOf(value);
        }

        for (unsigned i = src_channels; i < dst_channels; i++)
                chan[i] = LLVMGetUndef(elemtype);

        return ac_build_gather_values(ctx, chan, dst_channels);
}

/* Expand a scalar or vector to <4 x type> by filling the remaining channels
 * with undef. Extract at most num_channels components from the input.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
                                     LLVMValueRef value,
                                     unsigned num_channels)
{
        return ac_build_expand(ctx, value, num_channels, 4);
}

LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
        unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
        const char *name;

        if (type_size == 2)
                name = "llvm.rint.f16";
        else if (type_size == 4)
                name = "llvm.rint.f32";
        else
                name = "llvm.rint.f64";

        return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
                                  AC_FUNC_ATTR_READNONE);
}

LLVMValueRef
ac_build_fdiv(struct ac_llvm_context *ctx,
              LLVMValueRef num,
              LLVMValueRef den)
{
        /* If we do (num / den), LLVM >= 7.0 does:
         *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
         *
         * If we do (num * (1 / den)), LLVM does:
         *    return num * v_rcp_f32(den);
         */
        LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
        LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
        LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");

        /* Use v_rcp_f32 instead of precise division. */
        if (!LLVMIsConstant(ret))
                LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
        return ret;
}

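/* Note (illustrative): the !fpmath metadata attached above corresponds to IR
 * roughly like the following, which permits LLVM to select v_rcp_f32:
 *
 *    %rcp = fdiv float 1.0, %den, !fpmath !0
 *    ; !0 = !{float 2.500000e+00}   ; allowed error in ULPs
 */
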
/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
                                LLVMValueRef num,
                                LLVMValueRef multiplier,
                                LLVMValueRef pre_shift,
                                LLVMValueRef post_shift,
                                LLVMValueRef increment)
{
        LLVMBuilderRef builder = ctx->builder;

        num = LLVMBuildLShr(builder, num, pre_shift, "");
        num = LLVMBuildMul(builder,
                           LLVMBuildZExt(builder, num, ctx->i64, ""),
                           LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
        num = LLVMBuildAdd(builder, num,
                           LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
        num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
        num = LLVMBuildTrunc(builder, num, ctx->i32, "");
        return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
                                    LLVMValueRef num,
                                    LLVMValueRef multiplier,
                                    LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift,
                                    LLVMValueRef increment)
{
        LLVMBuilderRef builder = ctx->builder;

        num = LLVMBuildLShr(builder, num, pre_shift, "");
        num = LLVMBuildNUWAdd(builder, num, increment, "");
        num = LLVMBuildMul(builder,
                           LLVMBuildZExt(builder, num, ctx->i64, ""),
                           LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
        num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
        num = LLVMBuildTrunc(builder, num, ctx->i32, "");
        return LLVMBuildLShr(builder, num, post_shift, "");
}

/* See fast_idiv_by_const.h. */
/* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
                                              LLVMValueRef num,
                                              LLVMValueRef multiplier,
                                              LLVMValueRef post_shift)
{
        LLVMBuilderRef builder = ctx->builder;

        num = LLVMBuildMul(builder,
                           LLVMBuildZExt(builder, num, ctx->i64, ""),
                           LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
        num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
        num = LLVMBuildTrunc(builder, num, ctx->i32, "");
        return LLVMBuildLShr(builder, num, post_shift, "");
}

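/* The underlying identity (a sketch of the method; see fast_idiv_by_const.h
 * for the precise derivation): for a fixed divisor d, precompute a
 * multiplier m and shift s such that
 *
 *    n / d == (uint64_t)n * m >> (32 + s)
 *
 * holds for all n in range. The pre_shift/increment variants handle divisors
 * for which no exact (m, s) pair fits in 32 bits.
 */
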
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
        LLVMValueRef stc[2];
        LLVMValueRef ma;
        LLVMValueRef id;
};

static void
build_cube_intrinsic(struct ac_llvm_context *ctx,
                     LLVMValueRef in[3],
                     struct cube_selection_coords *out)
{
        LLVMTypeRef f32 = ctx->f32;

        out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
                                         f32, in, 3, AC_FUNC_ATTR_READNONE);
        out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
                                         f32, in, 3, AC_FUNC_ATTR_READNONE);
        out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
                                     f32, in, 3, AC_FUNC_ATTR_READNONE);
        out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
                                     f32, in, 3, AC_FUNC_ATTR_READNONE);
}

/**
 * Build a manual selection sequence for cube face sc/tc coordinates and
 * major axis vector (multiplied by 2 for consistency) for the given
 * vec3 \p coords, for the face implied by \p selcoords.
 *
 * For the major axis, we always adjust the sign to be in the direction of
 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 * the selcoords major axis.
 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords,
                              LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
        LLVMBuilderRef builder = ctx->builder;
        LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
        LLVMValueRef is_ma_positive;
        LLVMValueRef sgn_ma;
        LLVMValueRef is_ma_z, is_not_ma_z;
        LLVMValueRef is_ma_y;
        LLVMValueRef is_ma_x;
        LLVMValueRef sgn;
        LLVMValueRef tmp;

        is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
                                       selcoords->ma, LLVMConstReal(f32, 0.0), "");
        sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
                                 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");

        is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
        is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
        is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
                               LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
        is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

        /* Select sc */
        tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
        sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
                              LLVMBuildSelect(builder, is_ma_z, sgn_ma,
                                              LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
        out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

        /* Select tc */
        tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
        sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
                              LLVMConstReal(f32, -1.0), "");
        out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

        /* Select ma */
        tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                              LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
        tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
                                 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
        *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}

void
ac_prepare_cube_coords(struct ac_llvm_context *ctx,
                       bool is_deriv, bool is_array, bool is_lod,
                       LLVMValueRef *coords_arg,
                       LLVMValueRef *derivs_arg)
{
        LLVMBuilderRef builder = ctx->builder;
        struct cube_selection_coords selcoords;
        LLVMValueRef coords[3];
        LLVMValueRef invma;

        if (is_array && !is_lod) {
                LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

                /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
                 *
                 *    "For Array forms, the array layer used will be
                 *
                 *       max(0, min(d−1, floor(layer+0.5)))
                 *
                 *     where d is the depth of the texture array and layer
                 *     comes from the component indicated in the tables below."
                 *
                 * This is also a workaround for an issue where the layer is
                 * taken from a helper invocation which happens to fall on a
                 * different layer due to extrapolation.
                 *
                 * VI and earlier attempt to implement this in hardware by
                 * clamping the value of coords[2] = (8 * layer) + face.
                 * Unfortunately, this means that we end up with the wrong
                 * face when clamping occurs.
                 *
                 * Clamp the layer earlier to work around the issue.
                 */
                if (ctx->chip_class <= VI) {
                        LLVMValueRef ge0;
                        ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
                        tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
                }

                coords_arg[3] = tmp;
        }

        build_cube_intrinsic(ctx, coords_arg, &selcoords);

        invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
                                   ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
        invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

        for (int i = 0; i < 2; ++i)
                coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

        coords[2] = selcoords.id;

        if (is_deriv && derivs_arg) {
                LLVMValueRef derivs[4];
                int axis;

                /* Convert cube derivatives to 2D derivatives. */
                for (axis = 0; axis < 2; axis++) {
                        LLVMValueRef deriv_st[2];
                        LLVMValueRef deriv_ma;

                        /* Transform the derivative alongside the texture
                         * coordinate. Mathematically, the correct formula is
                         * as follows. Assume we're projecting onto the +Z face
                         * and denote by dx/dh the derivative of the (original)
                         * X texture coordinate with respect to horizontal
                         * window coordinates. The projection onto the +Z face
                         * plane is:
                         *
                         *    f(x,z) = x/z
                         *
                         * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
                         *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
                         *
                         * This motivates the implementation below.
                         *
                         * Whether this actually gives the expected results for
                         * apps that might feed in derivatives obtained via
                         * finite differences is anyone's guess. The OpenGL spec
                         * seems awfully quiet about how textureGrad for cube
                         * maps should be handled.
                         */
                        build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
                                          deriv_st, &deriv_ma);

                        deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

                        for (int i = 0; i < 2; ++i)
                                derivs[axis * 2 + i] =
                                        LLVMBuildFSub(builder,
                                                      LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                                                      LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
                }

                memcpy(derivs_arg, derivs, sizeof(derivs));
        }

        /* Shift the texture coordinate. This must be applied after the
         * derivative calculation.
         */
        for (int i = 0; i < 2; ++i)
                coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

        if (is_array) {
                /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
                /* coords_arg.w component - array_index for cube arrays */
                coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
        }

        memcpy(coords_arg, coords, sizeof(coords));
}


LLVMValueRef
ac_build_fs_interp(struct ac_llvm_context *ctx,
                   LLVMValueRef llvm_chan,
                   LLVMValueRef attr_number,
                   LLVMValueRef params,
                   LLVMValueRef i,
                   LLVMValueRef j)
{
        LLVMValueRef args[5];
        LLVMValueRef p1;

        args[0] = i;
        args[1] = llvm_chan;
        args[2] = attr_number;
        args[3] = params;

        p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
                                ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

        args[0] = p1;
        args[1] = j;
        args[2] = llvm_chan;
        args[3] = attr_number;
        args[4] = params;

        return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
                                  ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
}

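/* Usage sketch (illustrative): interpolate one channel of a fragment shader
 * input at the barycentric coordinates (i, j); llvm_chan and attr_number are
 * i32 constants, params is the PRIM_MASK argument, and `attr`/`prim_mask`
 * below are hypothetical.
 *
 *    LLVMValueRef v = ac_build_fs_interp(ctx,
 *                                        ctx->i32_0, // channel 0
 *                                        LLVMConstInt(ctx->i32, attr, false),
 *                                        prim_mask, i, j);
 */
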
LLVMValueRef
ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
                       LLVMValueRef llvm_chan,
                       LLVMValueRef attr_number,
                       LLVMValueRef params,
                       LLVMValueRef i,
                       LLVMValueRef j)
{
        LLVMValueRef args[6];
        LLVMValueRef p1;

        args[0] = i;
        args[1] = llvm_chan;
        args[2] = attr_number;
        args[3] = ctx->i1false;
        args[4] = params;

        p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
                                ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);

        args[0] = p1;
        args[1] = j;
        args[2] = llvm_chan;
        args[3] = attr_number;
        args[4] = ctx->i1false;
        args[5] = params;

        return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
                                  ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef
ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
                       LLVMValueRef parameter,
                       LLVMValueRef llvm_chan,
                       LLVMValueRef attr_number,
                       LLVMValueRef params)
{
        LLVMValueRef args[4];

        args[0] = parameter;
        args[1] = llvm_chan;
        args[2] = attr_number;
        args[3] = params;

        return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
                                  ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef
ac_build_gep_ptr(struct ac_llvm_context *ctx,
                 LLVMValueRef base_ptr,
                 LLVMValueRef index)
{
        return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
}

LLVMValueRef
ac_build_gep0(struct ac_llvm_context *ctx,
              LLVMValueRef base_ptr,
              LLVMValueRef index)
{
        LLVMValueRef indices[2] = {
                ctx->i32_0,
                index,
        };
        return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
}

LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
                                  LLVMValueRef index)
{
        return LLVMBuildPointerCast(ctx->builder,
                                    ac_build_gep0(ctx, ptr, index),
                                    LLVMTypeOf(ptr), "");
}

void
ac_build_indexed_store(struct ac_llvm_context *ctx,
                       LLVMValueRef base_ptr, LLVMValueRef index,
                       LLVMValueRef value)
{
        LLVMBuildStore(ctx->builder, value,
                       ac_build_gep0(ctx, base_ptr, index));
}

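/* Usage sketch (illustrative): these helpers wrap &base_ptr[index]-style
 * addressing, e.g. storing into slot `idx` of an array (array_ptr and idx
 * are hypothetical):
 *
 *    ac_build_indexed_store(ctx, array_ptr, idx, value);
 *    LLVMValueRef loaded =
 *            LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, array_ptr, idx), "");
 */
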
/**
 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform (i.e. load to an SGPR)
 * \param invariant Whether the load is invariant (no other opcodes affect it)
 * \param no_unsigned_wraparound
 *    For all possible re-associations and re-distributions of an expression
 *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
 *    without inbounds in base_ptr), this parameter is true if "addr + offset"
 *    does not result in an unsigned integer wraparound. This is used for
 *    optimal code generation of 32-bit pointer arithmetic.
 *
 *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
 *    integer wraparound can't be an imm offset in s_load_dword, because
 *    the instruction performs "addr + offset" in 64 bits.
 *
 *    Expected usage for bindless textures by chaining GEPs:
 *      // possible unsigned wraparound, don't use InBounds:
 *      ptr1 = LLVMBuildGEP(base_ptr, index);
 *      image = load(ptr1); // becomes "s_load ptr1, 0"
 *
 *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
 *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
 */
static LLVMValueRef
ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                     LLVMValueRef index, bool uniform, bool invariant,
                     bool no_unsigned_wraparound)
{
        LLVMValueRef pointer, result;
        LLVMValueRef indices[2] = {ctx->i32_0, index};

        if (no_unsigned_wraparound &&
            LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
                pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, "");
        else
                pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");

        if (uniform)
                LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
        result = LLVMBuildLoad(ctx->builder, pointer, "");
        if (invariant)
                LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
        return result;
}

LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                           LLVMValueRef index)
{
        return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
}

LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
                                     LLVMValueRef base_ptr, LLVMValueRef index)
{
        return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
}

/* This assumes that there is no unsigned integer wraparound during the address
 * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
        return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
}

/* See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
        return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
}

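/* Picking a variant (illustrative summary of the flags above): ac_build_load
 * for plain loads, ac_build_load_invariant when no store can affect the
 * loaded data, and ac_build_load_to_sgpr when the address is dynamically
 * uniform so the result may live in an SGPR.
 */
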
static void
ac_build_buffer_store_common(struct ac_llvm_context *ctx,
                             LLVMValueRef rsrc,
                             LLVMValueRef data,
                             LLVMValueRef vindex,
                             LLVMValueRef voffset,
                             unsigned num_channels,
                             bool glc,
                             bool slc,
                             bool writeonly_memory,
                             bool use_format)
{
        LLVMValueRef args[] = {
                data,
                LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
                vindex ? vindex : ctx->i32_0,
                voffset,
                LLVMConstInt(ctx->i1, glc, 0),
                LLVMConstInt(ctx->i1, slc, 0)
        };
        unsigned func = CLAMP(num_channels, 1, 3) - 1;

        const char *type_names[] = {"f32", "v2f32", "v4f32"};
        char name[256];

        if (use_format) {
                snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.format.%s",
                         type_names[func]);
        } else {
                snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
                         type_names[func]);
        }

        ac_build_intrinsic(ctx, name, ctx->voidt, args, ARRAY_SIZE(args),
                           ac_get_store_intr_attribs(writeonly_memory));
}

static void
ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
                                   LLVMValueRef rsrc,
                                   LLVMValueRef data,
                                   LLVMValueRef vindex,
                                   LLVMValueRef voffset,
                                   LLVMValueRef soffset,
                                   unsigned num_channels,
                                   bool glc,
                                   bool slc,
                                   bool writeonly_memory,
                                   bool use_format,
                                   bool structurized)
{
        LLVMValueRef args[6];
        int idx = 0;
        args[idx++] = data;
        args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
        if (structurized)
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
        args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
        unsigned func = CLAMP(num_channels, 1, 3) - 1;

        const char *type_names[] = {"f32", "v2f32", "v4f32"};
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256];

        if (use_format) {
                snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
                         indexing_kind, type_names[func]);
        } else {
                snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
                         indexing_kind, type_names[func]);
        }

        ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
                           ac_get_store_intr_attribs(writeonly_memory));
}

void
ac_build_buffer_store_format(struct ac_llvm_context *ctx,
                             LLVMValueRef rsrc,
                             LLVMValueRef data,
                             LLVMValueRef vindex,
                             LLVMValueRef voffset,
                             unsigned num_channels,
                             bool glc,
                             bool writeonly_memory)
{
        if (HAVE_LLVM >= 0x800) {
                ac_build_llvm8_buffer_store_common(ctx, rsrc, data, vindex,
                                                   voffset, NULL, num_channels,
                                                   glc, false, writeonly_memory,
                                                   true, true);
        } else {
                ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset,
                                             num_channels, glc, false,
                                             writeonly_memory, true);
        }
}

/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 */
void
ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
                            LLVMValueRef rsrc,
                            LLVMValueRef vdata,
                            unsigned num_channels,
                            LLVMValueRef voffset,
                            LLVMValueRef soffset,
                            unsigned inst_offset,
                            bool glc,
                            bool slc,
                            bool writeonly_memory,
                            bool swizzle_enable_hint)
{
        /* Split 3-channel stores, because LLVM doesn't support 3-channel
         * intrinsics. */
        if (num_channels == 3) {
                LLVMValueRef v[3], v01;

                for (int i = 0; i < 3; i++) {
                        v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
                                                       LLVMConstInt(ctx->i32, i, 0), "");
                }
                v01 = ac_build_gather_values(ctx, v, 2);

                ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
                                            soffset, inst_offset, glc, slc,
                                            writeonly_memory, swizzle_enable_hint);
                ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
                                            soffset, inst_offset + 8,
                                            glc, slc,
                                            writeonly_memory, swizzle_enable_hint);
                return;
        }

        /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
         * (voffset is swizzled, but soffset isn't swizzled).
         * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
         */
        if (!swizzle_enable_hint) {
                LLVMValueRef offset = soffset;

                if (inst_offset)
                        offset = LLVMBuildAdd(ctx->builder, offset,
                                              LLVMConstInt(ctx->i32, inst_offset, 0), "");

                if (HAVE_LLVM >= 0x800) {
                        ac_build_llvm8_buffer_store_common(ctx, rsrc,
                                                           ac_to_float(ctx, vdata),
                                                           ctx->i32_0,
                                                           voffset, offset,
                                                           num_channels,
                                                           glc, slc,
                                                           writeonly_memory,
                                                           false, false);
                } else {
                        if (voffset)
                                offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");

                        ac_build_buffer_store_common(ctx, rsrc,
                                                     ac_to_float(ctx, vdata),
                                                     ctx->i32_0, offset,
                                                     num_channels, glc, slc,
                                                     writeonly_memory, false);
                }
                return;
        }

        static const unsigned dfmts[] = {
                V_008F0C_BUF_DATA_FORMAT_32,
                V_008F0C_BUF_DATA_FORMAT_32_32,
                V_008F0C_BUF_DATA_FORMAT_32_32_32,
                V_008F0C_BUF_DATA_FORMAT_32_32_32_32
        };
        unsigned dfmt = dfmts[num_channels - 1];
        unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
        LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);

        ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
                                   immoffset, num_channels, dfmt, nfmt, glc,
                                   slc, writeonly_memory);
}

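/* Usage sketch (illustrative): store a v4i32 at a byte offset; the helper
 * picks buffer.store vs tbuffer.store based on the swizzle hint. rsrc,
 * vdata, and voffset are hypothetical values built elsewhere.
 *
 *    ac_build_buffer_store_dword(ctx, rsrc, vdata, 4, voffset, ctx->i32_0,
 *                                0, false, false, false, false);
 */
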
static LLVMValueRef
ac_build_buffer_load_common(struct ac_llvm_context *ctx,
                            LLVMValueRef rsrc,
                            LLVMValueRef vindex,
                            LLVMValueRef voffset,
                            unsigned num_channels,
                            bool glc,
                            bool slc,
                            bool can_speculate,
                            bool use_format)
{
        LLVMValueRef args[] = {
                LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
                vindex ? vindex : ctx->i32_0,
                voffset,
                LLVMConstInt(ctx->i1, glc, 0),
                LLVMConstInt(ctx->i1, slc, 0)
        };
        unsigned func = CLAMP(num_channels, 1, 3) - 1;

        LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
        const char *type_names[] = {"f32", "v2f32", "v4f32"};
        char name[256];

        if (use_format) {
                snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
                         type_names[func]);
        } else {
                snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
                         type_names[func]);
        }

        return ac_build_intrinsic(ctx, name, types[func], args,
                                  ARRAY_SIZE(args),
                                  ac_get_load_intr_attribs(can_speculate));
}

static LLVMValueRef
ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
                                  LLVMValueRef rsrc,
                                  LLVMValueRef vindex,
                                  LLVMValueRef voffset,
                                  LLVMValueRef soffset,
                                  unsigned num_channels,
                                  bool glc,
                                  bool slc,
                                  bool can_speculate,
                                  bool use_format,
                                  bool structurized)
{
        LLVMValueRef args[5];
        int idx = 0;
        args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
        if (structurized)
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
        args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
        unsigned func = CLAMP(num_channels, 1, 3) - 1;

        LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
        const char *type_names[] = {"f32", "v2f32", "v4f32"};
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256];

        if (use_format) {
                snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
                         indexing_kind, type_names[func]);
        } else {
                snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
                         indexing_kind, type_names[func]);
        }

        return ac_build_intrinsic(ctx, name, types[func], args,
                                  idx,
                                  ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef
ac_build_buffer_load(struct ac_llvm_context *ctx,
                     LLVMValueRef rsrc,
                     int num_channels,
                     LLVMValueRef vindex,
                     LLVMValueRef voffset,
                     LLVMValueRef soffset,
                     unsigned inst_offset,
                     unsigned glc,
                     unsigned slc,
                     bool can_speculate,
                     bool allow_smem)
{
        LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
        if (voffset)
                offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
        if (soffset)
                offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

        if (allow_smem && !slc &&
            (!glc || (HAVE_LLVM >= 0x0800 && ctx->chip_class >= VI))) {
                assert(vindex == NULL);

                LLVMValueRef result[8];

                for (int i = 0; i < num_channels; i++) {
                        if (i) {
                                offset = LLVMBuildAdd(ctx->builder, offset,
                                                      LLVMConstInt(ctx->i32, 4, 0), "");
                        }
                        const char *intrname =
                                HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32"
                                                    : "llvm.SI.load.const.v4i32";
                        unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
                        LLVMValueRef args[3] = {
                                rsrc,
                                offset,
                                glc ? ctx->i32_1 : ctx->i32_0,
                        };
                        result[i] = ac_build_intrinsic(ctx, intrname,
                                                       ctx->f32, args, num_args,
                                                       AC_FUNC_ATTR_READNONE |
                                                       (HAVE_LLVM < 0x0800 ? AC_FUNC_ATTR_LEGACY : 0));
                }
                if (num_channels == 1)
                        return result[0];

                if (num_channels == 3)
                        result[num_channels++] = LLVMGetUndef(ctx->f32);
                return ac_build_gather_values(ctx, result, num_channels);
        }

        if (HAVE_LLVM >= 0x0800) {
                return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex,
                                                         offset, ctx->i32_0,
                                                         num_channels, glc, slc,
                                                         can_speculate, false,
                                                         false);
        }

        return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
                                           num_channels, glc, slc,
                                           can_speculate, false);
}

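/* Usage sketch (illustrative): load 4 dwords from a buffer; with
 * allow_smem=true and a uniform address this can be selected as a scalar
 * (SMEM) load. rsrc and voffset are hypothetical values built elsewhere.
 *
 *    LLVMValueRef v = ac_build_buffer_load(ctx, rsrc, 4, NULL, voffset,
 *                                          NULL, 0, 0, 0, true, true);
 */
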
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
                                         LLVMValueRef rsrc,
                                         LLVMValueRef vindex,
                                         LLVMValueRef voffset,
                                         unsigned num_channels,
                                         bool glc,
                                         bool can_speculate)
{
        if (HAVE_LLVM >= 0x800) {
                return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
                                                         num_channels, glc, false,
                                                         can_speculate, true, true);
        }
        return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
                                           num_channels, glc, false,
                                           can_speculate, true);
}

LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
                                                   LLVMValueRef rsrc,
                                                   LLVMValueRef vindex,
                                                   LLVMValueRef voffset,
                                                   unsigned num_channels,
                                                   bool glc,
                                                   bool can_speculate)
{
        if (HAVE_LLVM >= 0x800) {
                return ac_build_llvm8_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
                                                         num_channels, glc, false,
                                                         can_speculate, true, true);
        }

        LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
        LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, ctx->i32_1, "");
        stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");

        LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
                                                      LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
                                                      elem_count, stride, "");

        LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
                                                       LLVMConstInt(ctx->i32, 2, 0), "");

        return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
                                           num_channels, glc, false,
                                           can_speculate, true);
}

static LLVMValueRef
ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
                            LLVMValueRef rsrc,
                            LLVMValueRef vindex,
                            LLVMValueRef voffset,
                            LLVMValueRef soffset,
                            unsigned num_channels,
                            unsigned dfmt,
                            unsigned nfmt,
                            bool glc,
                            bool slc,
                            bool can_speculate,
                            bool structurized)
{
        LLVMValueRef args[6];
        int idx = 0;
        args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
        if (structurized)
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
        args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
        args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
        unsigned func = CLAMP(num_channels, 1, 3) - 1;

        LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
        const char *type_names[] = {"i32", "v2i32", "v4i32"};
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256];

        snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
                 indexing_kind, type_names[func]);

        return ac_build_intrinsic(ctx, name, types[func], args,
                                  idx,
                                  ac_get_load_intr_attribs(can_speculate));
}

static LLVMValueRef
ac_build_tbuffer_load(struct ac_llvm_context *ctx,
                      LLVMValueRef rsrc,
                      LLVMValueRef vindex,
                      LLVMValueRef voffset,
                      LLVMValueRef soffset,
                      LLVMValueRef immoffset,
                      unsigned num_channels,
                      unsigned dfmt,
                      unsigned nfmt,
                      bool glc,
                      bool slc,
                      bool can_speculate,
                      bool structurized) /* only matters for LLVM 8+ */
{
        if (HAVE_LLVM >= 0x800) {
                voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

                return ac_build_llvm8_tbuffer_load(ctx, rsrc, vindex, voffset,
                                                   soffset, num_channels,
                                                   dfmt, nfmt, glc, slc,
                                                   can_speculate, structurized);
        }

        LLVMValueRef args[] = {
                rsrc,
                vindex ? vindex : ctx->i32_0,
                voffset,
                soffset,
                immoffset,
                LLVMConstInt(ctx->i32, dfmt, false),
                LLVMConstInt(ctx->i32, nfmt, false),
                LLVMConstInt(ctx->i32, glc, false),
                LLVMConstInt(ctx->i32, slc, false),
        };
        unsigned func = CLAMP(num_channels, 1, 3) - 1;
        LLVMTypeRef types[] = {ctx->i32, ctx->v2i32, ctx->v4i32};
        const char *type_names[] = {"i32", "v2i32", "v4i32"};
        char name[256];

        snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.load.%s",
                 type_names[func]);

        return ac_build_intrinsic(ctx, name, types[func], args, 9,
                                  ac_get_load_intr_attribs(can_speculate));
}

LLVMValueRef
ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
                             LLVMValueRef rsrc,
                             LLVMValueRef vindex,
                             LLVMValueRef voffset,
                             LLVMValueRef soffset,
                             LLVMValueRef immoffset,
                             unsigned num_channels,
                             unsigned dfmt,
                             unsigned nfmt,
                             bool glc,
                             bool slc,
                             bool can_speculate)
{
        return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
                                     immoffset, num_channels, dfmt, nfmt, glc,
                                     slc, can_speculate, true);
}

LLVMValueRef
ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
                          LLVMValueRef rsrc,
                          LLVMValueRef voffset,
                          LLVMValueRef soffset,
                          LLVMValueRef immoffset,
                          unsigned num_channels,
                          unsigned dfmt,
                          unsigned nfmt,
                          bool glc,
                          bool slc,
                          bool can_speculate)
{
        return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
                                     immoffset, num_channels, dfmt, nfmt, glc,
                                     slc, can_speculate, false);
}

LLVMValueRef
ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
                            LLVMValueRef rsrc,
                            LLVMValueRef voffset,
                            LLVMValueRef soffset,
                            LLVMValueRef immoffset,
                            bool glc)
{
        unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
        unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
        LLVMValueRef res;

        res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
                                        immoffset, 1, dfmt, nfmt, glc, false,
                                        false);

        return LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
}

LLVMValueRef
ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
                           LLVMValueRef rsrc,
                           LLVMValueRef voffset,
                           LLVMValueRef soffset,
                           LLVMValueRef immoffset,
                           bool glc)
{
        unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
        unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
        LLVMValueRef res;

        res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
                                        immoffset, 1, dfmt, nfmt, glc, false,
                                        false);

        return LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
}
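
/* Sub-dword loads (illustrative note): the 16-bit and 8-bit variants above
 * issue a one-channel tbuffer load with dfmt 16/8 and truncate the i32
 * result, e.g.:
 *
 *    LLVMValueRef half = ac_build_tbuffer_load_short(ctx, rsrc, voffset,
 *                                                    ctx->i32_0, ctx->i32_0,
 *                                                    false);  // yields an i16
 */
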
static void
ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
                             LLVMValueRef rsrc,
                             LLVMValueRef vdata,
                             LLVMValueRef vindex,
                             LLVMValueRef voffset,
                             LLVMValueRef soffset,
                             unsigned num_channels,
                             unsigned dfmt,
                             unsigned nfmt,
                             bool glc,
                             bool slc,
                             bool writeonly_memory,
                             bool structurized)
{
        LLVMValueRef args[7];
        int idx = 0;
        args[idx++] = vdata;
        args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
        if (structurized)
                args[idx++] = vindex ? vindex : ctx->i32_0;
        args[idx++] = voffset ? voffset : ctx->i32_0;
        args[idx++] = soffset ? soffset : ctx->i32_0;
        args[idx++] = LLVMConstInt(ctx->i32, dfmt | (nfmt << 4), 0);
        args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
        unsigned func = CLAMP(num_channels, 1, 3) - 1;

        const char *type_names[] = {"i32", "v2i32", "v4i32"};
        const char *indexing_kind = structurized ? "struct" : "raw";
        char name[256];

        snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
                 indexing_kind, type_names[func]);

        ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
                           ac_get_store_intr_attribs(writeonly_memory));
}

static void
ac_build_tbuffer_store(struct ac_llvm_context *ctx,
                       LLVMValueRef rsrc,
                       LLVMValueRef vdata,
                       LLVMValueRef vindex,
                       LLVMValueRef voffset,
                       LLVMValueRef soffset,
                       LLVMValueRef immoffset,
                       unsigned num_channels,
                       unsigned dfmt,
                       unsigned nfmt,
                       bool glc,
                       bool slc,
                       bool writeonly_memory,
                       bool structurized) /* only matters for LLVM 8+ */
{
        if (HAVE_LLVM >= 0x800) {
                voffset = LLVMBuildAdd(ctx->builder,
                                       voffset ? voffset : ctx->i32_0,
                                       immoffset, "");

                ac_build_llvm8_tbuffer_store(ctx, rsrc, vdata, vindex, voffset,
                                             soffset, num_channels, dfmt, nfmt,
                                             glc, slc, writeonly_memory,
                                             structurized);
        } else {
                LLVMValueRef params[] = {
                        vdata,
                        rsrc,
                        vindex ? vindex : ctx->i32_0,
                        voffset ? voffset : ctx->i32_0,
                        soffset ? soffset : ctx->i32_0,
                        immoffset,
                        LLVMConstInt(ctx->i32, dfmt, false),
                        LLVMConstInt(ctx->i32, nfmt, false),
                        LLVMConstInt(ctx->i32, glc, false),
                        LLVMConstInt(ctx->i32, slc, false),
                };
                unsigned func = CLAMP(num_channels, 1, 3) - 1;
                const char *type_names[] = {"i32", "v2i32", "v4i32"};
                char name[256];

                snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
                         type_names[func]);

                ac_build_intrinsic(ctx, name, ctx->voidt, params, 10,
                                   ac_get_store_intr_attribs(writeonly_memory));
        }
}

void
ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
                              LLVMValueRef rsrc,
                              LLVMValueRef vdata,
                              LLVMValueRef vindex,
                              LLVMValueRef voffset,
                              LLVMValueRef soffset,
                              LLVMValueRef immoffset,
                              unsigned num_channels,
                              unsigned dfmt,
                              unsigned nfmt,
                              bool glc,
                              bool slc,
                              bool writeonly_memory)
{
        ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
                               immoffset, num_channels, dfmt, nfmt, glc, slc,
                               writeonly_memory, true);
}

void
ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
                           LLVMValueRef rsrc,
                           LLVMValueRef vdata,
                           LLVMValueRef voffset,
                           LLVMValueRef soffset,
                           LLVMValueRef immoffset,
                           unsigned num_channels,
                           unsigned dfmt,
                           unsigned nfmt,
                           bool glc,
                           bool slc,
                           bool writeonly_memory)
{
        ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
                               immoffset, num_channels, dfmt, nfmt, glc, slc,
                               writeonly_memory, false);
}

void
ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
                             LLVMValueRef rsrc,
                             LLVMValueRef vdata,
                             LLVMValueRef voffset,
                             LLVMValueRef soffset,
                             bool glc,
                             bool writeonly_memory)
{
        unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
        unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

        vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
        vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");

        ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
                                   ctx->i32_0, 1, dfmt, nfmt, glc, false,
                                   writeonly_memory);
}

void
ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
                            LLVMValueRef rsrc,
                            LLVMValueRef vdata,
                            LLVMValueRef voffset,
                            LLVMValueRef soffset,
                            bool glc,
                            bool writeonly_memory)
{
        unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
        unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;

        vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
        vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");

        ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
                                   ctx->i32_0, 1, dfmt, nfmt, glc, false,
                                   writeonly_memory);
}

1793 /**
1794 * Set range metadata on an instruction. This can only be used on load and
1795 * call instructions. If you know an instruction can only produce the values
1796 * 0, 1, 2, you would call set_range_metadata(ctx, value, 0, 3);
1797 * \p lo is the minimum value inclusive.
1798 * \p hi is the maximum value exclusive.
1799 */
1800 static void set_range_metadata(struct ac_llvm_context *ctx,
1801 LLVMValueRef value, unsigned lo, unsigned hi)
1802 {
1803 LLVMValueRef range_md, md_args[2];
1804 LLVMTypeRef type = LLVMTypeOf(value);
1805 LLVMContextRef context = LLVMGetTypeContext(type);
1806
1807 md_args[0] = LLVMConstInt(type, lo, false);
1808 md_args[1] = LLVMConstInt(type, hi, false);
1809 range_md = LLVMMDNodeInContext(context, md_args, 2);
1810 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1811 }
1812
1813 LLVMValueRef
1814 ac_get_thread_id(struct ac_llvm_context *ctx)
1815 {
1816 LLVMValueRef tid;
1817
1818 LLVMValueRef tid_args[2];
1819 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1820 tid_args[1] = ctx->i32_0;
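/* mbcnt.lo counts the set bits below this lane in the low half of
 * the mask; its result becomes the accumulator operand of mbcnt.hi
 * below, so overwriting tid_args[1] here is intentional. */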
1821 tid_args[1] = ac_build_intrinsic(ctx,
1822 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1823 tid_args, 2, AC_FUNC_ATTR_READNONE);
1824
1825 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
1826 ctx->i32, tid_args,
1827 2, AC_FUNC_ATTR_READNONE);
1828 set_range_metadata(ctx, tid, 0, 64);
1829 return tid;
1830 }
1831
1832 /*
1833 * SI implements derivatives using the local data store (LDS).
1834 * All writes to the LDS happen in all executing threads at
1835 * the same time. TID is the Thread ID for the current
1836 * thread and is a value between 0 and 63, representing
1837 * the thread's position in the wavefront.
1838 *
1839 * For the pixel shader, threads are grouped into quads of four pixels.
1840 * The TIDs of the pixels of a quad are:
1841 *
1842 * +------+------+
1843 * |4n + 0|4n + 1|
1844 * +------+------+
1845 * |4n + 2|4n + 3|
1846 * +------+------+
1847 *
1848 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1849 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1850 * the current pixel's column, and masking with 0xfffffffe yields the TID
1851 * of the left pixel of the current pixel's row.
1852 *
1853 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1854 * adding 2 yields the TID of the pixel below the top pixel.
1855 */
1856 LLVMValueRef
1857 ac_build_ddxy(struct ac_llvm_context *ctx,
1858 uint32_t mask,
1859 int idx,
1860 LLVMValueRef val)
1861 {
1862 unsigned tl_lanes[4], trbl_lanes[4];
1863 LLVMValueRef tl, trbl;
1864 LLVMValueRef result;
1865
1866 for (unsigned i = 0; i < 4; ++i) {
1867 tl_lanes[i] = i & mask;
1868 trbl_lanes[i] = (i & mask) + idx;
1869 }
1870
1871 tl = ac_build_quad_swizzle(ctx, val,
1872 tl_lanes[0], tl_lanes[1],
1873 tl_lanes[2], tl_lanes[3]);
1874 trbl = ac_build_quad_swizzle(ctx, val,
1875 trbl_lanes[0], trbl_lanes[1],
1876 trbl_lanes[2], trbl_lanes[3]);
1877
1878 tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
1879 trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
1880 result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1881
1882 result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32,
1883 &result, 1, 0);
1884
1885 return result;
1886 }
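
/* A sketch of how callers pick (mask, idx) per the quad layout above
 * (mirroring the typical NIR-path usage; "val" is hypothetical):
 *
 *    ddx_fine   = ac_build_ddxy(ctx, 0xfffffffe, 1, val);
 *    ddy_fine   = ac_build_ddxy(ctx, 0xfffffffd, 2, val);
 *    ddx_coarse = ac_build_ddxy(ctx, 0xfffffffc, 1, val);
 */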
1887
1888 void
1889 ac_build_sendmsg(struct ac_llvm_context *ctx,
1890 uint32_t msg,
1891 LLVMValueRef wave_id)
1892 {
1893 LLVMValueRef args[2];
1894 args[0] = LLVMConstInt(ctx->i32, msg, false);
1895 args[1] = wave_id;
1896 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
1897 }
1898
1899 LLVMValueRef
1900 ac_build_imsb(struct ac_llvm_context *ctx,
1901 LLVMValueRef arg,
1902 LLVMTypeRef dst_type)
1903 {
1904 LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
1905 dst_type, &arg, 1,
1906 AC_FUNC_ATTR_READNONE);
1907
1908 /* The HW returns the last bit index from MSB, but NIR/TGSI wants
1909 * the index from LSB. Invert it by doing "31 - msb". */
1910 msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
1911 msb, "");
1912
1913 LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
1914 LLVMValueRef cond = LLVMBuildOr(ctx->builder,
1915 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
1916 arg, ctx->i32_0, ""),
1917 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
1918 arg, all_ones, ""), "");
1919
1920 return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
1921 }
1922
1923 LLVMValueRef
1924 ac_build_umsb(struct ac_llvm_context *ctx,
1925 LLVMValueRef arg,
1926 LLVMTypeRef dst_type)
1927 {
1928 const char *intrin_name;
1929 LLVMTypeRef type;
1930 LLVMValueRef highest_bit;
1931 LLVMValueRef zero;
1932 unsigned bitsize;
1933
1934 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
1935 switch (bitsize) {
1936 case 64:
1937 intrin_name = "llvm.ctlz.i64";
1938 type = ctx->i64;
1939 highest_bit = LLVMConstInt(ctx->i64, 63, false);
1940 zero = ctx->i64_0;
1941 break;
1942 case 32:
1943 intrin_name = "llvm.ctlz.i32";
1944 type = ctx->i32;
1945 highest_bit = LLVMConstInt(ctx->i32, 31, false);
1946 zero = ctx->i32_0;
1947 break;
1948 case 16:
1949 intrin_name = "llvm.ctlz.i16";
1950 type = ctx->i16;
1951 highest_bit = LLVMConstInt(ctx->i16, 15, false);
1952 zero = ctx->i16_0;
1953 break;
1954 default:
1955 unreachable("invalid bitsize");
1956 break;
1957 }
1958
1959 LLVMValueRef params[2] = {
1960 arg,
1961 ctx->i1true,
1962 };
1963
1964 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
1965 params, 2,
1966 AC_FUNC_ATTR_READNONE);
1967
1968 /* The HW returns the last bit index from MSB, but TGSI/NIR wants
1969 * the index from LSB. Invert it by doing "(bitsize - 1) - msb". */
1970 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
1971 msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, "");
1972
1973 /* check for zero */
1974 return LLVMBuildSelect(ctx->builder,
1975 LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
1976 LLVMConstInt(ctx->i32, -1, true), msb, "");
1977 }
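
/* Both helpers match GLSL findMSB() (illustrative values):
 * ac_build_umsb on 0x00000010 yields 4 and on 0 yields -1;
 * ac_build_imsb also yields -1 for ~0, since searching for a
 * sign-bit edge in all-ones finds nothing. */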
1978
1979 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
1980 LLVMValueRef b)
1981 {
1982 char name[64];
1983 snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
1984 LLVMValueRef args[2] = {a, b};
1985 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
1986 AC_FUNC_ATTR_READNONE);
1987 }
1988
1989 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
1990 LLVMValueRef b)
1991 {
1992 char name[64];
1993 snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
1994 LLVMValueRef args[2] = {a, b};
1995 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
1996 AC_FUNC_ATTR_READNONE);
1997 }
1998
1999 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2000 LLVMValueRef b)
2001 {
2002 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2003 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2004 }
2005
2006 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2007 LLVMValueRef b)
2008 {
2009 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2010 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2011 }
2012
2013 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2014 LLVMValueRef b)
2015 {
2016 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2017 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2018 }
2019
2020 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2021 {
2022 LLVMTypeRef t = LLVMTypeOf(value);
2023 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2024 LLVMConstReal(t, 1.0));
2025 }
2026
2027 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2028 {
2029 LLVMValueRef args[9];
2030
2031 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2032 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2033
2034 if (a->compr) {
2035 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2036 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2037
2038 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2039 v2i16, "");
2040 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2041 v2i16, "");
2042 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2043 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2044
2045 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2046 ctx->voidt, args, 6, 0);
2047 } else {
2048 args[2] = a->out[0];
2049 args[3] = a->out[1];
2050 args[4] = a->out[2];
2051 args[5] = a->out[3];
2052 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2053 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2054
2055 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2056 ctx->voidt, args, 8, 0);
2057 }
2058 }
2059
2060 void ac_build_export_null(struct ac_llvm_context *ctx)
2061 {
2062 struct ac_export_args args;
2063
2064 args.enabled_channels = 0x0; /* enabled channels */
2065 args.valid_mask = 1; /* whether the EXEC mask is valid */
2066 args.done = 1; /* DONE bit */
2067 args.target = V_008DFC_SQ_EXP_NULL;
2068 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2069 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2070 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2071 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2072 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2073
2074 ac_build_export(ctx, &args);
2075 }
2076
2077 static unsigned ac_num_coords(enum ac_image_dim dim)
2078 {
2079 switch (dim) {
2080 case ac_image_1d:
2081 return 1;
2082 case ac_image_2d:
2083 case ac_image_1darray:
2084 return 2;
2085 case ac_image_3d:
2086 case ac_image_cube:
2087 case ac_image_2darray:
2088 case ac_image_2dmsaa:
2089 return 3;
2090 case ac_image_2darraymsaa:
2091 return 4;
2092 default:
2093 unreachable("ac_num_coords: bad dim");
2094 }
2095 }
2096
2097 static unsigned ac_num_derivs(enum ac_image_dim dim)
2098 {
2099 switch (dim) {
2100 case ac_image_1d:
2101 case ac_image_1darray:
2102 return 2;
2103 case ac_image_2d:
2104 case ac_image_2darray:
2105 case ac_image_cube:
2106 return 4;
2107 case ac_image_3d:
2108 return 6;
2109 case ac_image_2dmsaa:
2110 case ac_image_2darraymsaa:
2111 default:
2112 unreachable("derivatives not supported");
2113 }
2114 }
2115
2116 static const char *get_atomic_name(enum ac_atomic_op op)
2117 {
2118 switch (op) {
2119 case ac_atomic_swap: return "swap";
2120 case ac_atomic_add: return "add";
2121 case ac_atomic_sub: return "sub";
2122 case ac_atomic_smin: return "smin";
2123 case ac_atomic_umin: return "umin";
2124 case ac_atomic_smax: return "smax";
2125 case ac_atomic_umax: return "umax";
2126 case ac_atomic_and: return "and";
2127 case ac_atomic_or: return "or";
2128 case ac_atomic_xor: return "xor";
2129 }
2130 unreachable("bad atomic op");
2131 }
2132
2133 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
2134 struct ac_image_args *a)
2135 {
2136 const char *overload[3] = { "", "", "" };
2137 unsigned num_overloads = 0;
2138 LLVMValueRef args[18];
2139 unsigned num_args = 0;
2140 enum ac_image_dim dim = a->dim;
2141
2142 assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
2143 !a->level_zero);
2144 assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
2145 a->opcode != ac_image_store_mip) ||
2146 a->lod);
2147 assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2148 (!a->compare && !a->offset));
2149 assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
2150 a->opcode == ac_image_get_lod) ||
2151 !a->bias);
2152 assert((a->bias ? 1 : 0) +
2153 (a->lod ? 1 : 0) +
2154 (a->level_zero ? 1 : 0) +
2155 (a->derivs[0] ? 1 : 0) <= 1);
2156
2157 if (a->opcode == ac_image_get_lod) {
2158 switch (dim) {
2159 case ac_image_1darray:
2160 dim = ac_image_1d;
2161 break;
2162 case ac_image_2darray:
2163 case ac_image_cube:
2164 dim = ac_image_2d;
2165 break;
2166 default:
2167 break;
2168 }
2169 }
2170
2171 bool sample = a->opcode == ac_image_sample ||
2172 a->opcode == ac_image_gather4 ||
2173 a->opcode == ac_image_get_lod;
2174 bool atomic = a->opcode == ac_image_atomic ||
2175 a->opcode == ac_image_atomic_cmpswap;
2176 LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
2177
2178 if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
2179 args[num_args++] = a->data[0];
2180 if (a->opcode == ac_image_atomic_cmpswap)
2181 args[num_args++] = a->data[1];
2182 }
2183
2184 if (!atomic)
2185 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
2186
2187 if (a->offset)
2188 args[num_args++] = ac_to_integer(ctx, a->offset);
2189 if (a->bias) {
2190 args[num_args++] = ac_to_float(ctx, a->bias);
2191 overload[num_overloads++] = ".f32";
2192 }
2193 if (a->compare)
2194 args[num_args++] = ac_to_float(ctx, a->compare);
2195 if (a->derivs[0]) {
2196 unsigned count = ac_num_derivs(dim);
2197 for (unsigned i = 0; i < count; ++i)
2198 args[num_args++] = ac_to_float(ctx, a->derivs[i]);
2199 overload[num_overloads++] = ".f32";
2200 }
2201 unsigned num_coords =
2202 a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
2203 for (unsigned i = 0; i < num_coords; ++i)
2204 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
2205 if (a->lod)
2206 args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
2207 overload[num_overloads++] = sample ? ".f32" : ".i32";
2208
2209 args[num_args++] = a->resource;
2210 if (sample) {
2211 args[num_args++] = a->sampler;
2212 args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
2213 }
2214
2215 args[num_args++] = ctx->i32_0; /* texfailctrl */
2216 args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
2217
2218 const char *name;
2219 const char *atomic_subop = "";
2220 switch (a->opcode) {
2221 case ac_image_sample: name = "sample"; break;
2222 case ac_image_gather4: name = "gather4"; break;
2223 case ac_image_load: name = "load"; break;
2224 case ac_image_load_mip: name = "load.mip"; break;
2225 case ac_image_store: name = "store"; break;
2226 case ac_image_store_mip: name = "store.mip"; break;
2227 case ac_image_atomic:
2228 name = "atomic.";
2229 atomic_subop = get_atomic_name(a->atomic);
2230 break;
2231 case ac_image_atomic_cmpswap:
2232 name = "atomic.";
2233 atomic_subop = "cmpswap";
2234 break;
2235 case ac_image_get_lod: name = "getlod"; break;
2236 case ac_image_get_resinfo: name = "getresinfo"; break;
2237 default: unreachable("invalid image opcode");
2238 }
2239
2240 const char *dimname;
2241 switch (dim) {
2242 case ac_image_1d: dimname = "1d"; break;
2243 case ac_image_2d: dimname = "2d"; break;
2244 case ac_image_3d: dimname = "3d"; break;
2245 case ac_image_cube: dimname = "cube"; break;
2246 case ac_image_1darray: dimname = "1darray"; break;
2247 case ac_image_2darray: dimname = "2darray"; break;
2248 case ac_image_2dmsaa: dimname = "2dmsaa"; break;
2249 case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
2250 default: unreachable("invalid dim");
2251 }
2252
2253 bool lod_suffix =
2254 a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
2255 char intr_name[96];
2256 snprintf(intr_name, sizeof(intr_name),
2257 "llvm.amdgcn.image.%s%s" /* base name */
2258 "%s%s%s" /* sample/gather modifiers */
2259 ".%s.%s%s%s%s", /* dimension and type overloads */
2260 name, atomic_subop,
2261 a->compare ? ".c" : "",
2262 a->bias ? ".b" :
2263 lod_suffix ? ".l" :
2264 a->derivs[0] ? ".d" :
2265 a->level_zero ? ".lz" : "",
2266 a->offset ? ".o" : "",
2267 dimname,
2268 atomic ? "i32" : "v4f32",
2269 overload[0], overload[1], overload[2]);
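
/* For example (illustrative), a 2D shadow sample with bias and offset
 * becomes "llvm.amdgcn.image.sample.c.b.o.2d.v4f32.f32.f32":
 * v4f32 result, f32 bias overload, f32 coordinate overload. */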
2270
2271 LLVMTypeRef retty;
2272 if (atomic)
2273 retty = ctx->i32;
2274 else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
2275 retty = ctx->voidt;
2276 else
2277 retty = ctx->v4f32;
2278
2279 LLVMValueRef result =
2280 ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
2281 a->attributes);
2282 if (!sample && retty == ctx->v4f32) {
2283 result = LLVMBuildBitCast(ctx->builder, result,
2284 ctx->v4i32, "");
2285 }
2286 return result;
2287 }
2288
2289 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2290 LLVMValueRef args[2])
2291 {
2292 LLVMTypeRef v2f16 =
2293 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2294
2295 return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2296 args, 2, AC_FUNC_ATTR_READNONE);
2297 }
2298
2299 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2300 LLVMValueRef args[2])
2301 {
2302 LLVMValueRef res =
2303 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2304 ctx->v2i16, args, 2,
2305 AC_FUNC_ATTR_READNONE);
2306 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2307 }
2308
2309 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2310 LLVMValueRef args[2])
2311 {
2312 LLVMValueRef res =
2313 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2314 ctx->v2i16, args, 2,
2315 AC_FUNC_ATTR_READNONE);
2316 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2317 }
2318
2319 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2320 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2321 LLVMValueRef args[2], unsigned bits, bool hi)
2322 {
2323 assert(bits == 8 || bits == 10 || bits == 16);
2324
2325 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2326 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2327 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2328 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2329 LLVMValueRef max_alpha =
2330 bits != 10 ? max_rgb : ctx->i32_1;
2331 LLVMValueRef min_alpha =
2332 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2333
2334 /* Clamp. */
2335 if (bits != 16) {
2336 for (int i = 0; i < 2; i++) {
2337 bool alpha = hi && i == 1;
2338 args[i] = ac_build_imin(ctx, args[i],
2339 alpha ? max_alpha : max_rgb);
2340 args[i] = ac_build_imax(ctx, args[i],
2341 alpha ? min_alpha : min_rgb);
2342 }
2343 }
2344
2345 LLVMValueRef res =
2346 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2347 ctx->v2i16, args, 2,
2348 AC_FUNC_ATTR_READNONE);
2349 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2350 }
2351
2352 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2353 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2354 LLVMValueRef args[2], unsigned bits, bool hi)
2355 {
2356 assert(bits == 8 || bits == 10 || bits == 16);
2357
2358 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2359 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2360 LLVMValueRef max_alpha =
2361 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2362
2363 /* Clamp. */
2364 if (bits != 16) {
2365 for (int i = 0; i < 2; i++) {
2366 bool alpha = hi && i == 1;
2367 args[i] = ac_build_umin(ctx, args[i],
2368 alpha ? max_alpha : max_rgb);
2369 }
2370 }
2371
2372 LLVMValueRef res =
2373 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2374 ctx->v2i16, args, 2,
2375 AC_FUNC_ATTR_READNONE);
2376 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2377 }
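
/* Usage sketch (hypothetical values) for a 2_10_10_10 destination:
 *
 *    LLVMValueRef ba[2] = { b, a };
 *    packed = ac_build_cvt_pk_u16(ctx, ba, 10, true);
 *
 * clamps b to [0, 1023] and the 2-bit alpha a to [0, 3] before
 * packing both into one dword. */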
2378
2379 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2380 {
2381 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2382 &i1, 1, AC_FUNC_ATTR_READNONE);
2383 }
2384
2385 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2386 {
2387 ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2388 &i1, 1, 0);
2389 }
2390
2391 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2392 LLVMValueRef offset, LLVMValueRef width,
2393 bool is_signed)
2394 {
2395 LLVMValueRef args[] = {
2396 input,
2397 offset,
2398 width,
2399 };
2400
2401 return ac_build_intrinsic(ctx,
2402 is_signed ? "llvm.amdgcn.sbfe.i32" :
2403 "llvm.amdgcn.ubfe.i32",
2404 ctx->i32, args, 3,
2405 AC_FUNC_ATTR_READNONE);
2406 }
2407
2408 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2409 LLVMValueRef s1, LLVMValueRef s2)
2410 {
2411 return LLVMBuildAdd(ctx->builder,
2412 LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2413 }
2414
2415 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2416 LLVMValueRef s1, LLVMValueRef s2)
2417 {
2418 return LLVMBuildFAdd(ctx->builder,
2419 LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2420 }
2421
2422 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
2423 {
2424 LLVMValueRef args[1] = {
2425 LLVMConstInt(ctx->i32, simm16, false),
2426 };
2427 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2428 ctx->voidt, args, 1, 0);
2429 }
2430
2431 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2432 unsigned bitsize)
2433 {
2434 LLVMTypeRef type;
2435 char *intr;
2436
2437 if (bitsize == 32) {
2438 intr = "llvm.amdgcn.fract.f32";
2439 type = ctx->f32;
2440 } else {
2441 intr = "llvm.amdgcn.fract.f64";
2442 type = ctx->f64;
2443 }
2444
2445 LLVMValueRef params[] = {
2446 src0,
2447 };
2448 return ac_build_intrinsic(ctx, intr, type, params, 1,
2449 AC_FUNC_ATTR_READNONE);
2450 }
2451
2452 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2453 unsigned bitsize)
2454 {
2455 LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2456 LLVMValueRef zero = LLVMConstInt(type, 0, false);
2457 LLVMValueRef one = LLVMConstInt(type, 1, false);
2458
2459 LLVMValueRef cmp, val;
2460 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2461 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2462 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2463 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2464 return val;
2465 }
2466
2467 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2468 unsigned bitsize)
2469 {
2470 LLVMValueRef cmp, val, zero, one;
2471 LLVMTypeRef type;
2472
2473 if (bitsize == 32) {
2474 type = ctx->f32;
2475 zero = ctx->f32_0;
2476 one = ctx->f32_1;
2477 } else {
2478 type = ctx->f64;
2479 zero = ctx->f64_0;
2480 one = ctx->f64_1;
2481 }
2482
2483 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
2484 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2485 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
2486 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
2487 return val;
2488 }
2489
2490 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2491 {
2492 LLVMValueRef result;
2493 unsigned bitsize;
2494
2495 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2496
2497 switch (bitsize) {
2498 case 64:
2499 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
2500 (LLVMValueRef []) { src0 }, 1,
2501 AC_FUNC_ATTR_READNONE);
2502
2503 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2504 break;
2505 case 32:
2506 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
2507 (LLVMValueRef []) { src0 }, 1,
2508 AC_FUNC_ATTR_READNONE);
2509 break;
2510 case 16:
2511 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
2512 (LLVMValueRef []) { src0 }, 1,
2513 AC_FUNC_ATTR_READNONE);
2514 break;
2515 default:
2516 unreachable("invalid bitsize");
2517 break;
2518 }
2519
2520 return result;
2521 }
2522
2523 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
2524 LLVMValueRef src0)
2525 {
2526 LLVMValueRef result;
2527 unsigned bitsize;
2528
2529 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2530
2531 switch (bitsize) {
2532 case 32:
2533 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
2534 (LLVMValueRef []) { src0 }, 1,
2535 AC_FUNC_ATTR_READNONE);
2536 break;
2537 case 16:
2538 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
2539 (LLVMValueRef []) { src0 }, 1,
2540 AC_FUNC_ATTR_READNONE);
2541 break;
2542 default:
2543 unreachable("invalid bitsize");
2544 break;
2545 }
2546
2547 return result;
2548 }
2549
2550 #define AC_EXP_TARGET 0
2551 #define AC_EXP_ENABLED_CHANNELS 1
2552 #define AC_EXP_OUT0 2
2553
2554 enum ac_ir_type {
2555 AC_IR_UNDEF,
2556 AC_IR_CONST,
2557 AC_IR_VALUE,
2558 };
2559
2560 struct ac_vs_exp_chan
2561 {
2562 LLVMValueRef value;
2563 float const_float;
2564 enum ac_ir_type type;
2565 };
2566
2567 struct ac_vs_exp_inst {
2568 unsigned offset;
2569 LLVMValueRef inst;
2570 struct ac_vs_exp_chan chan[4];
2571 };
2572
2573 struct ac_vs_exports {
2574 unsigned num;
2575 struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
2576 };
2577
2578 /* Return true if the PARAM export has been eliminated. */
2579 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
2580 uint32_t num_outputs,
2581 struct ac_vs_exp_inst *exp)
2582 {
2583 unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2584 bool is_zero[4] = {}, is_one[4] = {};
2585
2586 for (i = 0; i < 4; i++) {
2587 /* It's a constant expression. Undef outputs are eliminated too. */
2588 if (exp->chan[i].type == AC_IR_UNDEF) {
2589 is_zero[i] = true;
2590 is_one[i] = true;
2591 } else if (exp->chan[i].type == AC_IR_CONST) {
2592 if (exp->chan[i].const_float == 0)
2593 is_zero[i] = true;
2594 else if (exp->chan[i].const_float == 1)
2595 is_one[i] = true;
2596 else
2597 return false; /* other constant */
2598 } else
2599 return false;
2600 }
2601
2602 /* Only combinations encodable as SPI_PS_INPUT_CNTL DEFAULT_VAL can
 * be eliminated: (0,0,0,0), (0,0,0,1), (1,1,1,0) and (1,1,1,1). */
2603 if (is_zero[0] && is_zero[1] && is_zero[2])
2604 default_val = is_zero[3] ? 0 : 1;
2605 else if (is_one[0] && is_one[1] && is_one[2])
2606 default_val = is_zero[3] ? 2 : 3;
2607 else
2608 return false;
2609
2610 /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2611 LLVMInstructionEraseFromParent(exp->inst);
2612
2613 /* Change OFFSET to DEFAULT_VAL. */
2614 for (i = 0; i < num_outputs; i++) {
2615 if (vs_output_param_offset[i] == exp->offset) {
2616 vs_output_param_offset[i] =
2617 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2618 break;
2619 }
2620 }
2621 return true;
2622 }
2623
2624 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2625 uint8_t *vs_output_param_offset,
2626 uint32_t num_outputs,
2627 struct ac_vs_exports *processed,
2628 struct ac_vs_exp_inst *exp)
2629 {
2630 unsigned p, copy_back_channels = 0;
2631
2632 /* See if the output is already in the list of processed outputs.
2633 * The LLVMValueRef comparison relies on SSA.
2634 */
2635 for (p = 0; p < processed->num; p++) {
2636 bool different = false;
2637
2638 for (unsigned j = 0; j < 4; j++) {
2639 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2640 struct ac_vs_exp_chan *c2 = &exp->chan[j];
2641
2642 /* Treat undef as a match. */
2643 if (c2->type == AC_IR_UNDEF)
2644 continue;
2645
2646 /* If c1 is undef but c2 isn't, we can copy c2 to c1
2647 * and consider the instruction duplicated.
2648 */
2649 if (c1->type == AC_IR_UNDEF) {
2650 copy_back_channels |= 1 << j;
2651 continue;
2652 }
2653
2654 /* Test whether the channels are not equal. */
2655 if (c1->type != c2->type ||
2656 (c1->type == AC_IR_CONST &&
2657 c1->const_float != c2->const_float) ||
2658 (c1->type == AC_IR_VALUE &&
2659 c1->value != c2->value)) {
2660 different = true;
2661 break;
2662 }
2663 }
2664 if (!different)
2665 break;
2666
2667 copy_back_channels = 0;
2668 }
2669 if (p == processed->num)
2670 return false;
2671
2672 /* If a match was found, but the matching export has undef where the new
2673 * one has a normal value, copy the normal value to the undef channel.
2674 */
2675 struct ac_vs_exp_inst *match = &processed->exp[p];
2676
2677 /* Get current enabled channels mask. */
2678 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2679 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2680
2681 while (copy_back_channels) {
2682 unsigned chan = u_bit_scan(&copy_back_channels);
2683
2684 assert(match->chan[chan].type == AC_IR_UNDEF);
2685 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
2686 exp->chan[chan].value);
2687 match->chan[chan] = exp->chan[chan];
2688
2689 /* Update number of enabled channels because the original mask
2690 * is not always 0xf.
2691 */
2692 enabled_channels |= (1 << chan);
2693 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2694 LLVMConstInt(ctx->i32, enabled_channels, 0));
2695 }
2696
2697 /* The PARAM export is duplicated. Kill it. */
2698 LLVMInstructionEraseFromParent(exp->inst);
2699
2700 /* Change OFFSET to the matching export. */
2701 for (unsigned i = 0; i < num_outputs; i++) {
2702 if (vs_output_param_offset[i] == exp->offset) {
2703 vs_output_param_offset[i] = match->offset;
2704 break;
2705 }
2706 }
2707 return true;
2708 }
2709
2710 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
2711 LLVMValueRef main_fn,
2712 uint8_t *vs_output_param_offset,
2713 uint32_t num_outputs,
2714 uint8_t *num_param_exports)
2715 {
2716 LLVMBasicBlockRef bb;
2717 bool removed_any = false;
2718 struct ac_vs_exports exports;
2719
2720 exports.num = 0;
2721
2722 /* Process all LLVM instructions. */
2723 bb = LLVMGetFirstBasicBlock(main_fn);
2724 while (bb) {
2725 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
2726
2727 while (inst) {
2728 LLVMValueRef cur = inst;
2729 inst = LLVMGetNextInstruction(inst);
2730 struct ac_vs_exp_inst exp;
2731
2732 if (LLVMGetInstructionOpcode(cur) != LLVMCall)
2733 continue;
2734
2735 LLVMValueRef callee = ac_llvm_get_called_value(cur);
2736
2737 if (!ac_llvm_is_function(callee))
2738 continue;
2739
2740 const char *name = LLVMGetValueName(callee);
2741 unsigned num_args = LLVMCountParams(callee);
2742
2743 /* Check if this is an export instruction. */
2744 if ((num_args != 9 && num_args != 8) ||
2745 (strcmp(name, "llvm.SI.export") &&
2746 strcmp(name, "llvm.amdgcn.exp.f32")))
2747 continue;
2748
2749 LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
2750 unsigned target = LLVMConstIntGetZExtValue(arg);
2751
2752 if (target < V_008DFC_SQ_EXP_PARAM)
2753 continue;
2754
2755 target -= V_008DFC_SQ_EXP_PARAM;
2756
2757 /* Parse the instruction. */
2758 memset(&exp, 0, sizeof(exp));
2759 exp.offset = target;
2760 exp.inst = cur;
2761
2762 for (unsigned i = 0; i < 4; i++) {
2763 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
2764
2765 exp.chan[i].value = v;
2766
2767 if (LLVMIsUndef(v)) {
2768 exp.chan[i].type = AC_IR_UNDEF;
2769 } else if (LLVMIsAConstantFP(v)) {
2770 LLVMBool loses_info;
2771 exp.chan[i].type = AC_IR_CONST;
2772 exp.chan[i].const_float =
2773 LLVMConstRealGetDouble(v, &loses_info);
2774 } else {
2775 exp.chan[i].type = AC_IR_VALUE;
2776 }
2777 }
2778
2779 /* Eliminate constant and duplicated PARAM exports. */
2780 if (ac_eliminate_const_output(vs_output_param_offset,
2781 num_outputs, &exp) ||
2782 ac_eliminate_duplicated_output(ctx,
2783 vs_output_param_offset,
2784 num_outputs, &exports,
2785 &exp)) {
2786 removed_any = true;
2787 } else {
2788 exports.exp[exports.num++] = exp;
2789 }
2790 }
2791 bb = LLVMGetNextBasicBlock(bb);
2792 }
2793
2794 /* Remove holes in export memory due to removed PARAM exports.
2795 * This is done by renumbering all PARAM exports.
2796 */
2797 if (removed_any) {
2798 uint8_t old_offset[VARYING_SLOT_MAX];
2799 unsigned out, i;
2800
2801 /* Make a copy of the offsets. We need the old version while
2802 * we are modifying some of them. */
2803 memcpy(old_offset, vs_output_param_offset,
2804 sizeof(old_offset));
2805
2806 for (i = 0; i < exports.num; i++) {
2807 unsigned offset = exports.exp[i].offset;
2808
2809 /* Update vs_output_param_offset. Multiple outputs can
2810 * have the same offset.
2811 */
2812 for (out = 0; out < num_outputs; out++) {
2813 if (old_offset[out] == offset)
2814 vs_output_param_offset[out] = i;
2815 }
2816
2817 /* Change the PARAM offset in the instruction. */
2818 LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
2819 LLVMConstInt(ctx->i32,
2820 V_008DFC_SQ_EXP_PARAM + i, 0));
2821 }
2822 *num_param_exports = exports.num;
2823 }
2824 }
2825
2826 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2827 {
2828 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2829 ac_build_intrinsic(ctx,
2830 "llvm.amdgcn.init.exec", ctx->voidt,
2831 &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
2832 }
2833
2834 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2835 {
2836 unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
2837 ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
2838 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
2839 "lds");
2840 }
2841
2842 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
2843 LLVMValueRef dw_addr)
2844 {
2845 return ac_build_load(ctx, ctx->lds, dw_addr);
2846 }
2847
2848 void ac_lds_store(struct ac_llvm_context *ctx,
2849 LLVMValueRef dw_addr,
2850 LLVMValueRef value)
2851 {
2852 value = ac_to_integer(ctx, value);
2853 ac_build_indexed_store(ctx, ctx->lds,
2854 dw_addr, value);
2855 }
2856
2857 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
2858 LLVMTypeRef dst_type,
2859 LLVMValueRef src0)
2860 {
2861 unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2862 const char *intrin_name;
2863 LLVMTypeRef type;
2864 LLVMValueRef zero;
2865
2866 switch (src0_bitsize) {
2867 case 64:
2868 intrin_name = "llvm.cttz.i64";
2869 type = ctx->i64;
2870 zero = ctx->i64_0;
2871 break;
2872 case 32:
2873 intrin_name = "llvm.cttz.i32";
2874 type = ctx->i32;
2875 zero = ctx->i32_0;
2876 break;
2877 case 16:
2878 intrin_name = "llvm.cttz.i16";
2879 type = ctx->i16;
2880 zero = ctx->i16_0;
2881 break;
2882 default:
2883 unreachable("invalid bitsize");
2884 }
2885
2886 LLVMValueRef params[2] = {
2887 src0,
2888
2889 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
2890 * add special code to check for x=0. The reason is that
2891 * the LLVM behavior for x=0 is different from what we
2892 * need here. However, LLVM also assumes that ffs(x) is
2893 * in [0, bitsize - 1], but GLSL expects that ffs(0) = -1, so
2894 * a conditional assignment to handle 0 is still required.
2895 *
2896 * The hardware already implements the correct behavior.
2897 */
2898 ctx->i1true,
2899 };
2900
2901 LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
2902 params, 2,
2903 AC_FUNC_ATTR_READNONE);
2904
2905 if (src0_bitsize == 64) {
2906 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
2907 }
2908
2909 /* TODO: We need an intrinsic to skip this conditional. */
2910 /* Check for zero: */
2911 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
2912 LLVMIntEQ, src0,
2913 zero, ""),
2914 LLVMConstInt(ctx->i32, -1, 0), lsb, "");
2915 }
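
/* The semantics match GLSL findLSB() (illustrative values):
 * src0 = 0x00000018 returns 3, src0 = 0 returns -1. */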
2916
2917 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
2918 {
2919 return LLVMPointerType(LLVMArrayType(elem_type, 0),
2920 AC_ADDR_SPACE_CONST);
2921 }
2922
2923 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
2924 {
2925 return LLVMPointerType(LLVMArrayType(elem_type, 0),
2926 AC_ADDR_SPACE_CONST_32BIT);
2927 }
2928
2929 static struct ac_llvm_flow *
2930 get_current_flow(struct ac_llvm_context *ctx)
2931 {
2932 if (ctx->flow_depth > 0)
2933 return &ctx->flow[ctx->flow_depth - 1];
2934 return NULL;
2935 }
2936
2937 static struct ac_llvm_flow *
2938 get_innermost_loop(struct ac_llvm_context *ctx)
2939 {
2940 for (unsigned i = ctx->flow_depth; i > 0; --i) {
2941 if (ctx->flow[i - 1].loop_entry_block)
2942 return &ctx->flow[i - 1];
2943 }
2944 return NULL;
2945 }
2946
2947 static struct ac_llvm_flow *
2948 push_flow(struct ac_llvm_context *ctx)
2949 {
2950 struct ac_llvm_flow *flow;
2951
2952 if (ctx->flow_depth >= ctx->flow_depth_max) {
2953 unsigned new_max = MAX2(ctx->flow_depth << 1,
2954 AC_LLVM_INITIAL_CF_DEPTH);
2955
2956 ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
2957 ctx->flow_depth_max = new_max;
2958 }
2959
2960 flow = &ctx->flow[ctx->flow_depth];
2961 ctx->flow_depth++;
2962
2963 flow->next_block = NULL;
2964 flow->loop_entry_block = NULL;
2965 return flow;
2966 }
2967
2968 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
2969 int label_id)
2970 {
2971 char buf[32];
2972 snprintf(buf, sizeof(buf), "%s%d", base, label_id);
2973 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
2974 }
2975
2976 /* Append a basic block at the level of the parent flow.
2977 */
2978 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
2979 const char *name)
2980 {
2981 assert(ctx->flow_depth >= 1);
2982
2983 if (ctx->flow_depth >= 2) {
2984 struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
2985
2986 return LLVMInsertBasicBlockInContext(ctx->context,
2987 flow->next_block, name);
2988 }
2989
2990 LLVMValueRef main_fn =
2991 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2992 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2993 }
2994
2995 /* Emit a branch to the given default target for the current block if
2996 * applicable -- that is, if the current block does not already contain a
2997 * branch from a break or continue.
2998 */
2999 static void emit_default_branch(LLVMBuilderRef builder,
3000 LLVMBasicBlockRef target)
3001 {
3002 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3003 LLVMBuildBr(builder, target);
3004 }
3005
3006 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3007 {
3008 struct ac_llvm_flow *flow = push_flow(ctx);
3009 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3010 flow->next_block = append_basic_block(ctx, "ENDLOOP");
3011 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3012 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3013 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3014 }
3015
3016 void ac_build_break(struct ac_llvm_context *ctx)
3017 {
3018 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3019 LLVMBuildBr(ctx->builder, flow->next_block);
3020 }
3021
3022 void ac_build_continue(struct ac_llvm_context *ctx)
3023 {
3024 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3025 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3026 }
3027
3028 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3029 {
3030 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3031 LLVMBasicBlockRef endif_block;
3032
3033 assert(!current_branch->loop_entry_block);
3034
3035 endif_block = append_basic_block(ctx, "ENDIF");
3036 emit_default_branch(ctx->builder, endif_block);
3037
3038 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3039 set_basicblock_name(current_branch->next_block, "else", label_id);
3040
3041 current_branch->next_block = endif_block;
3042 }
3043
3044 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3045 {
3046 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3047
3048 assert(!current_branch->loop_entry_block);
3049
3050 emit_default_branch(ctx->builder, current_branch->next_block);
3051 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3052 set_basicblock_name(current_branch->next_block, "endif", label_id);
3053
3054 ctx->flow_depth--;
3055 }
3056
3057 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3058 {
3059 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3060
3061 assert(current_loop->loop_entry_block);
3062
3063 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3064
3065 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3066 set_basicblock_name(current_loop->next_block, "endloop", label_id);
3067 ctx->flow_depth--;
3068 }
3069
3070 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3071 {
3072 struct ac_llvm_flow *flow = push_flow(ctx);
3073 LLVMBasicBlockRef if_block;
3074
3075 if_block = append_basic_block(ctx, "IF");
3076 flow->next_block = append_basic_block(ctx, "ELSE");
3077 set_basicblock_name(if_block, "if", label_id);
3078 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3079 LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3080 }
3081
3082 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3083 int label_id)
3084 {
3085 LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3086 value, ctx->f32_0, "");
3087 ac_build_ifcc(ctx, cond, label_id);
3088 }
3089
3090 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3091 int label_id)
3092 {
3093 LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3094 ac_to_integer(ctx, value),
3095 ctx->i32_0, "");
3096 ac_build_ifcc(ctx, cond, label_id);
3097 }
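
/* Typical control-flow emission (a sketch; label_id is only used to
 * name the basic blocks for IR debugging):
 *
 *    ac_build_ifcc(ctx, cond, 1000);
 *    ... emit the then-side IR ...
 *    ac_build_else(ctx, 1000);
 *    ... emit the else-side IR ...
 *    ac_build_endif(ctx, 1000);
 */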
3098
3099 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3100 const char *name)
3101 {
3102 LLVMBuilderRef builder = ac->builder;
3103 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3104 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3105 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3106 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3107 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3108 LLVMValueRef res;
3109
3110 if (first_instr) {
3111 LLVMPositionBuilderBefore(first_builder, first_instr);
3112 } else {
3113 LLVMPositionBuilderAtEnd(first_builder, first_block);
3114 }
3115
3116 res = LLVMBuildAlloca(first_builder, type, name);
3117 LLVMDisposeBuilder(first_builder);
3118 return res;
3119 }
3120
3121 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3122 LLVMTypeRef type, const char *name)
3123 {
3124 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3125 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3126 return ptr;
3127 }
3128
3129 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3130 LLVMTypeRef type)
3131 {
3132 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3133 return LLVMBuildBitCast(ctx->builder, ptr,
3134 LLVMPointerType(type, addr_space), "");
3135 }
3136
3137 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3138 unsigned count)
3139 {
3140 unsigned num_components = ac_get_llvm_num_components(value);
3141 if (count == num_components)
3142 return value;
3143
3144 LLVMValueRef masks[MAX2(count, 2)];
3145 masks[0] = ctx->i32_0;
3146 masks[1] = ctx->i32_1;
3147 for (unsigned i = 2; i < count; i++)
3148 masks[i] = LLVMConstInt(ctx->i32, i, false);
3149
3150 if (count == 1)
3151 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3152 "");
3153
3154 LLVMValueRef swizzle = LLVMConstVector(masks, count);
3155 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3156 }
3157
3158 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3159 unsigned rshift, unsigned bitwidth)
3160 {
3161 LLVMValueRef value = param;
3162 if (rshift)
3163 value = LLVMBuildLShr(ctx->builder, value,
3164 LLVMConstInt(ctx->i32, rshift, false), "");
3165
3166 if (rshift + bitwidth < 32) {
3167 unsigned mask = (1 << bitwidth) - 1;
3168 value = LLVMBuildAnd(ctx->builder, value,
3169 LLVMConstInt(ctx->i32, mask, false), "");
3170 }
3171 return value;
3172 }
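
/* Usage sketch (hypothetical field layout): extracting a 5-bit field
 * stored at bits [12:8] of an SGPR parameter:
 *
 *    LLVMValueRef field = ac_unpack_param(ctx, param, 8, 5);
 */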
3173
3174 /* Adjust the sample index according to FMASK.
3175 *
3176 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3177 * which is the identity mapping. Each nibble says which physical sample
3178 * should be fetched to get that sample.
3179 *
3180 * For example, 0x11111100 means there are only 2 samples stored and
3181 * the second sample covers 3/4 of the pixel. When reading samples 0
3182 * and 1, return physical sample 0 (determined by the first two 0s
3183 * in FMASK), otherwise return physical sample 1.
3184 *
3185 * The sample index should be adjusted as follows:
3186 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3187 */
3188 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3189 LLVMValueRef *addr, bool is_array_tex)
3190 {
3191 struct ac_image_args fmask_load = {};
3192 fmask_load.opcode = ac_image_load;
3193 fmask_load.resource = fmask;
3194 fmask_load.dmask = 0xf;
3195 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3196
3197 fmask_load.coords[0] = addr[0];
3198 fmask_load.coords[1] = addr[1];
3199 if (is_array_tex)
3200 fmask_load.coords[2] = addr[2];
3201
3202 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3203 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3204 ac->i32_0, "");
3205
3206 /* Apply the formula. */
3207 unsigned sample_chan = is_array_tex ? 3 : 2;
3208 LLVMValueRef final_sample;
3209 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3210 LLVMConstInt(ac->i32, 4, 0), "");
3211 final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3212 /* Mask the sample index by 0x7, because 0x8 means an unknown value
3213 * with EQAA, so those will map to 0. */
3214 final_sample = LLVMBuildAnd(ac->builder, final_sample,
3215 LLVMConstInt(ac->i32, 0x7, 0), "");
3216
3217 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3218 * resource descriptor is 0 (invalid).
3219 */
3220 LLVMValueRef tmp;
3221 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3222 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3223 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3224
3225 /* Replace the MSAA sample index. */
3226 addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3227 addr[sample_chan], "");
3228 }
3229
3230 static LLVMValueRef
3231 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3232 {
3233 ac_build_optimization_barrier(ctx, &src);
3234 return ac_build_intrinsic(ctx,
3235 lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3236 LLVMTypeOf(src), (LLVMValueRef []) {
3237 src, lane },
3238 lane == NULL ? 1 : 2,
3239 AC_FUNC_ATTR_READNONE |
3240 AC_FUNC_ATTR_CONVERGENT);
3241 }
3242
3243 /**
3244 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3245 * @param ctx the ac_llvm_context
3246 * @param src the value to read from the selected lane
3247 * @param lane - id of the lane or NULL for the first active lane
3248 * @return value of the lane
3249 */
3250 LLVMValueRef
3251 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3252 {
3253 LLVMTypeRef src_type = LLVMTypeOf(src);
3254 src = ac_to_integer(ctx, src);
3255 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3256 LLVMValueRef ret;
3257
3258 if (bits == 32) {
3259 ret = _ac_build_readlane(ctx, src, lane);
3260 } else {
3261 assert(bits % 32 == 0);
3262 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3263 LLVMValueRef src_vector =
3264 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3265 ret = LLVMGetUndef(vec_type);
3266 for (unsigned i = 0; i < bits / 32; i++) {
3267 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3268 LLVMConstInt(ctx->i32, i, 0), "");
3269 LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3270 ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3271 LLVMConstInt(ctx->i32, i, 0), "");
3272 }
3273 }
3274 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3275 }
3276
3277 LLVMValueRef
3278 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3279 {
3280 /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
3281 */
3282 LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
3283 ac_get_thread_id(ctx), "");
3284 return LLVMBuildSelect(ctx->builder, pred, value, src, "");
3285 }
3286
3287 LLVMValueRef
3288 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3289 {
3290 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3291 LLVMVectorType(ctx->i32, 2),
3292 "");
3293 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3294 ctx->i32_0, "");
3295 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3296 ctx->i32_1, "");
3297 LLVMValueRef val =
3298 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3299 (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3300 2, AC_FUNC_ATTR_READNONE);
3301 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3302 (LLVMValueRef []) { mask_hi, val },
3303 2, AC_FUNC_ATTR_READNONE);
3304 return val;
3305 }
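
/* A common use (sketch): a lane's index among the active lanes, e.g.
 * its slot in a compacted output:
 *
 *    LLVMValueRef exec = ac_build_ballot(ctx, ctx->i32_1);
 *    LLVMValueRef slot = ac_build_mbcnt(ctx, exec);
 */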
3306
3307 enum dpp_ctrl {
3308 _dpp_quad_perm = 0x000,
3309 _dpp_row_sl = 0x100,
3310 _dpp_row_sr = 0x110,
3311 _dpp_row_rr = 0x120,
3312 dpp_wf_sl1 = 0x130,
3313 dpp_wf_rl1 = 0x134,
3314 dpp_wf_sr1 = 0x138,
3315 dpp_wf_rr1 = 0x13C,
3316 dpp_row_mirror = 0x140,
3317 dpp_row_half_mirror = 0x141,
3318 dpp_row_bcast15 = 0x142,
3319 dpp_row_bcast31 = 0x143
3320 };
3321
3322 static inline enum dpp_ctrl
3323 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3324 {
3325 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3326 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3327 }
3328
3329 static inline enum dpp_ctrl
3330 dpp_row_sl(unsigned amount)
3331 {
3332 assert(amount > 0 && amount < 16);
3333 return _dpp_row_sl | amount;
3334 }
3335
3336 static inline enum dpp_ctrl
3337 dpp_row_sr(unsigned amount)
3338 {
3339 assert(amount > 0 && amount < 16);
3340 return _dpp_row_sr | amount;
3341 }
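
/* Examples of the encodings above (illustrative):
 *    dpp_quad_perm(0, 0, 0, 0) - every lane reads lane 0 of its quad
 *    dpp_row_sl(1)             - each row of 16 lanes shifts left by 1
 *    dpp_row_sr(4)             - each row shifts right by 4 lanes
 */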
3342
3343 static LLVMValueRef
3344 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3345 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3346 bool bound_ctrl)
3347 {
3348 return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3349 LLVMTypeOf(old),
3350 (LLVMValueRef[]) {
3351 old, src,
3352 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3353 LLVMConstInt(ctx->i32, row_mask, 0),
3354 LLVMConstInt(ctx->i32, bank_mask, 0),
3355 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3356 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3357 }
3358
3359 static LLVMValueRef
3360 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3361 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3362 bool bound_ctrl)
3363 {
3364 LLVMTypeRef src_type = LLVMTypeOf(src);
3365 src = ac_to_integer(ctx, src);
3366 old = ac_to_integer(ctx, old);
3367 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3368 LLVMValueRef ret;
3369 if (bits == 32) {
3370 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3371 bank_mask, bound_ctrl);
3372 } else {
3373 assert(bits % 32 == 0);
3374 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3375 LLVMValueRef src_vector =
3376 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3377 LLVMValueRef old_vector =
3378 LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3379 ret = LLVMGetUndef(vec_type);
3380 for (unsigned i = 0; i < bits / 32; i++) {
3381 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3382 LLVMConstInt(ctx->i32, i,
3383 0), "");
3384 old = LLVMBuildExtractElement(ctx->builder, old_vector,
3385 LLVMConstInt(ctx->i32, i,
3386 0), "");
3387 LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3388 dpp_ctrl,
3389 row_mask,
3390 bank_mask,
3391 bound_ctrl);
3392 ret = LLVMBuildInsertElement(ctx->builder, ret,
3393 ret_comp,
3394 LLVMConstInt(ctx->i32, i,
3395 0), "");
3396 }
3397 }
3398 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3399 }
3400
3401 static inline unsigned
3402 ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
3403 {
3404 assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
3405 return and_mask | (or_mask << 5) | (xor_mask << 10);
3406 }
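
/* For instance (illustrative), ds_pattern_bitmode(0x1f, 0x00, 0x07)
 * makes lane i read lane ((i & 0x1f) | 0x00) ^ 0x07 = i ^ 7 within
 * each group of 32, i.e. it reverses the lanes of every group of 8. */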
3407
3408 static LLVMValueRef
3409 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3410 {
3411 return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
3412 LLVMTypeOf(src), (LLVMValueRef []) {
3413 src, LLVMConstInt(ctx->i32, mask, 0) },
3414 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3415 }
3416
3417 LLVMValueRef
3418 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3419 {
3420 LLVMTypeRef src_type = LLVMTypeOf(src);
3421 src = ac_to_integer(ctx, src);
3422 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3423 LLVMValueRef ret;
3424 if (bits == 32) {
3425 ret = _ac_build_ds_swizzle(ctx, src, mask);
3426 } else {
3427 assert(bits % 32 == 0);
3428 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3429 LLVMValueRef src_vector =
3430 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3431 ret = LLVMGetUndef(vec_type);
3432 for (unsigned i = 0; i < bits / 32; i++) {
3433 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3434 LLVMConstInt(ctx->i32, i,
3435 0), "");
3436 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
3437 mask);
3438 ret = LLVMBuildInsertElement(ctx->builder, ret,
3439 ret_comp,
3440 LLVMConstInt(ctx->i32, i,
3441 0), "");
3442 }
3443 }
3444 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3445 }
3446
3447 static LLVMValueRef
3448 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3449 {
3450 char name[32], type[8];
3451 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3452 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3453 return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
3454 (LLVMValueRef []) { src }, 1,
3455 AC_FUNC_ATTR_READNONE);
3456 }
3457
3458 static LLVMValueRef
3459 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3460 LLVMValueRef inactive)
3461 {
3462 char name[33], type[8];
3463 LLVMTypeRef src_type = LLVMTypeOf(src);
3464 src = ac_to_integer(ctx, src);
3465 inactive = ac_to_integer(ctx, inactive);
3466 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3467 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3468 LLVMValueRef ret =
3469 ac_build_intrinsic(ctx, name,
3470 LLVMTypeOf(src), (LLVMValueRef []) {
3471 src, inactive }, 2,
3472 AC_FUNC_ATTR_READNONE |
3473 AC_FUNC_ATTR_CONVERGENT);
3474 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3475 }
3476
3477 static LLVMValueRef
3478 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
3479 {
3480 if (type_size == 4) {
3481 switch (op) {
3482 case nir_op_iadd: return ctx->i32_0;
3483 case nir_op_fadd: return ctx->f32_0;
3484 case nir_op_imul: return ctx->i32_1;
3485 case nir_op_fmul: return ctx->f32_1;
3486 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3487 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3488 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
3489 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3490 case nir_op_umax: return ctx->i32_0;
3491 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
3492 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
3493 case nir_op_ior: return ctx->i32_0;
3494 case nir_op_ixor: return ctx->i32_0;
3495 default:
3496 unreachable("bad reduction intrinsic");
3497 }
3498 } else { /* type_size == 8, i.e. 64-bit */
3499 switch (op) {
3500 case nir_op_iadd: return ctx->i64_0;
3501 case nir_op_fadd: return ctx->f64_0;
3502 case nir_op_imul: return ctx->i64_1;
3503 case nir_op_fmul: return ctx->f64_1;
3504 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3505 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3506 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
3507 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3508 case nir_op_umax: return ctx->i64_0;
3509 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
3510 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
3511 case nir_op_ior: return ctx->i64_0;
3512 case nir_op_ixor: return ctx->i64_0;
3513 default:
3514 unreachable("bad reduction intrinsic");
3515 }
3516 }
3517 }
3518
3519 static LLVMValueRef
3520 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
3521 {
3522 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3523 switch (op) {
3524 case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3525 case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3526 case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3527 case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3528 case nir_op_imin: return LLVMBuildSelect(ctx->builder,
3529 LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3530 lhs, rhs, "");
3531 case nir_op_umin: return LLVMBuildSelect(ctx->builder,
3532 LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3533 lhs, rhs, "");
3534 case nir_op_fmin: return ac_build_intrinsic(ctx,
3535 _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
3536 _64bit ? ctx->f64 : ctx->f32,
3537 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3538 case nir_op_imax: return LLVMBuildSelect(ctx->builder,
3539 LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3540 lhs, rhs, "");
3541 case nir_op_umax: return LLVMBuildSelect(ctx->builder,
3542 LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3543 lhs, rhs, "");
3544 case nir_op_fmax: return ac_build_intrinsic(ctx,
3545 _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
3546 _64bit ? ctx->f64 : ctx->f32,
3547 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3548 case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3549 case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3550 case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3551 default:
3552 unreachable("bad reduction intrinsic");
3553 }
3554 }
3555
3556 /**
3557 * \param maxprefix specifies that the result only needs to be correct for a
3558 * prefix of this many threads
3559 *
3560 * TODO: add inclusive and exclusive scan functions for SI chip class.
3561 */
3562 static LLVMValueRef
3563 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
3564 unsigned maxprefix)
3565 {
3566 LLVMValueRef result, tmp;
3567 result = src;
3568 if (maxprefix <= 1)
3569 return result;
3570 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3571 result = ac_build_alu_op(ctx, result, tmp, op);
3572 if (maxprefix <= 2)
3573 return result;
3574 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3575 result = ac_build_alu_op(ctx, result, tmp, op);
3576 if (maxprefix <= 3)
3577 return result;
3578 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3579 result = ac_build_alu_op(ctx, result, tmp, op);
3580 if (maxprefix <= 4)
3581 return result;
3582 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3583 result = ac_build_alu_op(ctx, result, tmp, op);
3584 if (maxprefix <= 8)
3585 return result;
3586 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3587 result = ac_build_alu_op(ctx, result, tmp, op);
3588 if (maxprefix <= 16)
3589 return result;
3590 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3591 result = ac_build_alu_op(ctx, result, tmp, op);
3592 if (maxprefix <= 32)
3593 return result;
3594 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3595 result = ac_build_alu_op(ctx, result, tmp, op);
3596 return result;
3597 }
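
/* Worked example for ac_build_scan (nir_op_iadd, all inputs 1):
 *
 *   source:              1 1 1 1 1 1 1 1 ...
 *   after row_sr(1..3):  1 2 3 4 4 4 4 4 ...  (lane N sums at most 4 sources)
 *   after row_sr(4):     1 2 3 4 5 6 7 8 ...  (prefixes of up to 8 correct)
 *
 * row_sr(8) extends correctness to 16 lanes; dpp_row_bcast15 then folds each
 * 16-lane block's total into the following row, and dpp_row_bcast31 folds the
 * lower half's total into the upper 32 lanes, completing the 64-lane
 * inclusive scan.
 */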
3598
3599 LLVMValueRef
3600 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3601 {
3602 LLVMValueRef result;
3603
3604 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
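/* i1 iadd is a prefix popcount: ballot gathers the bits of all active
 * lanes, mbcnt counts the set bits in the lanes below the current one
 * (the exclusive count), and adding the lane's own bit makes it
 * inclusive. */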
3605 LLVMBuilderRef builder = ctx->builder;
3606 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3607 result = ac_build_ballot(ctx, src);
3608 result = ac_build_mbcnt(ctx, result);
3609 result = LLVMBuildAdd(builder, result, src, "");
3610 return result;
3611 }
3612
3613 ac_build_optimization_barrier(ctx, &src);
3614
3615 LLVMValueRef identity =
3616 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3617 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3618 LLVMTypeOf(identity), "");
3619 result = ac_build_scan(ctx, op, result, identity, 64);
3620
3621 return ac_build_wwm(ctx, result);
3622 }
3623
3624 LLVMValueRef
3625 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3626 {
3627 LLVMValueRef result;
3628
3629 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
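/* For i1 iadd, mbcnt of the ballot directly yields the exclusive prefix
 * popcount. */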
3630 LLVMBuilderRef builder = ctx->builder;
3631 src = LLVMBuildZExt(builder, src, ctx->i32, "");
3632 result = ac_build_ballot(ctx, src);
3633 result = ac_build_mbcnt(ctx, result);
3634 return result;
3635 }
3636
3637 ac_build_optimization_barrier(ctx, &src);
3638
3639 LLVMValueRef identity =
3640 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
3641 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
3642 LLVMTypeOf(identity), "");
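/* Shift the whole wave right by one lane, shifting in the identity; an
 * inclusive scan of the shifted values is the exclusive scan of src. */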
3643 result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
3644 result = ac_build_scan(ctx, op, result, identity, 64);
3645
3646 return ac_build_wwm(ctx, result);
3647 }
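
/* Typical use: ac_build_exclusive_scan(ctx, v, nir_op_iadd) gives each lane
 * the sum of v over the active lanes below it, e.g. as a per-lane output
 * offset for stream compaction within a wave. */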
3648
3649 LLVMValueRef
3650 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
3651 {
3652 if (cluster_size == 1) return src;
3653 ac_build_optimization_barrier(ctx, &src);
3654 LLVMValueRef result, swap;
3655 LLVMValueRef identity = get_reduction_identity(ctx, op,
3656 ac_get_type_size(LLVMTypeOf(src)));
3657 result = LLVMBuildBitCast(ctx->builder,
3658 ac_build_set_inactive(ctx, src, identity),
3659 LLVMTypeOf(identity), "");
3660 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
3661 result = ac_build_alu_op(ctx, result, swap, op);
3662 if (cluster_size == 2) return ac_build_wwm(ctx, result);
3663
3664 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
3665 result = ac_build_alu_op(ctx, result, swap, op);
3666 if (cluster_size == 4) return ac_build_wwm(ctx, result);
3667
3668 if (ctx->chip_class >= VI)
3669 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
3670 else
3671 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
3672 result = ac_build_alu_op(ctx, result, swap, op);
3673 if (cluster_size == 8) return ac_build_wwm(ctx, result);
3674
3675 if (ctx->chip_class >= VI)
3676 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
3677 else
3678 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
3679 result = ac_build_alu_op(ctx, result, swap, op);
3680 if (cluster_size == 16) return ac_build_wwm(ctx, result);
3681
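/* dpp_row_bcast15 only makes the upper half of each 32-lane group correct,
 * which is fine as an intermediate step of the full-wave reduction, but for
 * cluster_size == 32 every lane must see its cluster's result, so use the
 * ds_swizzle (xor-16) path instead. */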
3682 if (ctx->chip_class >= VI && cluster_size != 32)
3683 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3684 else
3685 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
3686 result = ac_build_alu_op(ctx, result, swap, op);
3687 if (cluster_size == 32) return ac_build_wwm(ctx, result);
3688
3689 if (ctx->chip_class >= VI) {
3690 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3691 result = ac_build_alu_op(ctx, result, swap, op);
3692 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
3693 return ac_build_wwm(ctx, result);
3694 } else {
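/* Pre-VI: every lane already holds its 32-lane group's result, so combine
 * the values read back from lane 0 and lane 32. */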
3695 swap = ac_build_readlane(ctx, result, ctx->i32_0);
3696 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
3697 result = ac_build_alu_op(ctx, result, swap, op);
3698 return ac_build_wwm(ctx, result);
3699 }
3700 }
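
/* Example: ac_build_reduce(ctx, v, nir_op_fmax, 4) leaves each lane holding
 * the maximum of v over its quad, while cluster_size == 64 reduces across the
 * whole wave. Cluster sizes are expected to be powers of two, matching NIR's
 * reduce intrinsic. */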
3701
3702 /**
3703 * "Top half" of a scan that reduces per-wave values across an entire
3704 * workgroup.
3705 *
3706 * The source value must be present in the highest lane of the wave, and the
3707 * highest lane must be live.
3708 */
3709 void
3710 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3711 {
3712 if (ws->maxwaves <= 1)
3713 return;
3714
3715 const LLVMValueRef i32_63 = LLVMConstInt(ctx->i32, 63, false);
3716 LLVMBuilderRef builder = ctx->builder;
3717 LLVMValueRef tid = ac_get_thread_id(ctx);
3718 LLVMValueRef tmp;
3719
3720 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, i32_63, "");
3721 ac_build_ifcc(ctx, tmp, 1000);
3722 LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
3723 ac_build_endif(ctx, 1000);
3724 }
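
/* Note: ws->scratch must provide at least ws->maxwaves elements of the
 * scanned type (typically in LDS); lane 63 of each wave stores its value to
 * slot ws->waveidx. */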
3725
3726 /**
3727 * "Bottom half" of a scan that reduces per-wave values across an entire
3728 * workgroup.
3729 *
3730 * The caller must place a barrier between the top and bottom halves.
3731 */
3732 void
3733 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3734 {
3735 const LLVMTypeRef type = LLVMTypeOf(ws->src);
3736 const LLVMValueRef identity =
3737 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
3738
3739 if (ws->maxwaves <= 1) {
3740 ws->result_reduce = ws->src;
3741 ws->result_inclusive = ws->src;
3742 ws->result_exclusive = identity;
3743 return;
3744 }
3745 assert(ws->maxwaves <= 32);
3746
3747 LLVMBuilderRef builder = ctx->builder;
3748 LLVMValueRef tid = ac_get_thread_id(ctx);
3749 LLVMBasicBlockRef bbs[2];
3750 LLVMValueRef phivalues_scan[2];
3751 LLVMValueRef tmp, tmp2;
3752
3753 bbs[0] = LLVMGetInsertBlock(builder);
3754 phivalues_scan[0] = LLVMGetUndef(type);
3755
3756 if (ws->enable_reduce)
3757 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
3758 else if (ws->enable_inclusive)
3759 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
3760 else
3761 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
3762 ac_build_ifcc(ctx, tmp, 1001);
3763 {
3764 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
3765
3766 ac_build_optimization_barrier(ctx, &tmp);
3767
3768 bbs[1] = LLVMGetInsertBlock(builder);
3769 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves);
3770 }
3771 ac_build_endif(ctx, 1001);
3772
3773 const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
3774
3775 if (ws->enable_reduce) {
3776 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
3777 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
3778 }
3779 if (ws->enable_inclusive)
3780 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
3781 if (ws->enable_exclusive) {
3782 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
3783 tmp = ac_build_readlane(ctx, scan, tmp);
3784 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
3785 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
3786 }
3787 }
3788
3789 /**
3790 * Inclusive scan of a per-wave value across an entire workgroup.
3791 *
3792 * This implies an s_barrier instruction.
3793 *
3794 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
3795 * of the workgroup are live. (This requirement cannot easily be relaxed in a
3796 * useful manner because of the barrier in the algorithm.)
3797 */
3798 void
3799 ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3800 {
3801 ac_build_wg_wavescan_top(ctx, ws);
3802 ac_build_s_barrier(ctx);
3803 ac_build_wg_wavescan_bottom(ctx, ws);
3804 }
3805
3806 /**
3807 * "Top half" of a scan that reduces per-thread values across an entire
3808 * workgroup.
3809 *
3810 * All lanes must be active when this code runs.
3811 */
3812 void
3813 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3814 {
3815 if (ws->enable_exclusive) {
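/* Compute the per-thread exclusive scan first, then derive the inclusive
 * value by folding the thread's own source back in (zero-extended in the
 * i1 popcount case so the iadd operates on i32). */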
3816 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
3817 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
3818 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
3819 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
3820 } else {
3821 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
3822 }
3823
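/* Only the exclusive per-wave totals are needed to seed the per-thread
 * results in ac_build_wg_scan_bottom, so temporarily fold enable_inclusive
 * into enable_exclusive for the wave-level scan. */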
3824 bool enable_inclusive = ws->enable_inclusive;
3825 bool enable_exclusive = ws->enable_exclusive;
3826 ws->enable_inclusive = false;
3827 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
3828 ac_build_wg_wavescan_top(ctx, ws);
3829 ws->enable_inclusive = enable_inclusive;
3830 ws->enable_exclusive = enable_exclusive;
3831 }
3832
3833 /**
3834 * "Bottom half" of a scan that reduces per-thread values across an entire
3835 * workgroup.
3836 *
3837 * The caller must place a barrier between the top and bottom halves.
3838 */
3839 void
3840 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3841 {
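/* Same flag adjustment as in ac_build_wg_scan_top: the wave-level scan must
 * produce the exclusive result whenever inclusive results are requested. */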
3842 bool enable_inclusive = ws->enable_inclusive;
3843 bool enable_exclusive = ws->enable_exclusive;
3844 ws->enable_inclusive = false;
3845 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
3846 ac_build_wg_wavescan_bottom(ctx, ws);
3847 ws->enable_inclusive = enable_inclusive;
3848 ws->enable_exclusive = enable_exclusive;
3849
3850 /* ws->result_reduce is already the correct value */
3851 if (ws->enable_inclusive)
3852 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->src, ws->op);
3853 if (ws->enable_exclusive)
3854 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
3855 }
3856
3857 /**
3858 * A scan that reduces per-thread values across an entire workgroup.
3859 *
3860 * The caller must ensure that all lanes are active when this code runs
3861 * (WWM is insufficient!), because there is an implied barrier.
3862 */
3863 void
3864 ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3865 {
3866 ac_build_wg_scan_top(ctx, ws);
3867 ac_build_s_barrier(ctx);
3868 ac_build_wg_scan_bottom(ctx, ws);
3869 }
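
/* Illustrative sketch (hypothetical helper, not part of this file's API):
 * how a caller might drive a workgroup-wide inclusive iadd scan. The
 * ac_wg_scan fields match those consumed above; the scratch buffer is
 * assumed to be an LDS array with one slot per wave.
 */
#if 0
static LLVMValueRef
example_wg_inclusive_iadd(struct ac_llvm_context *ctx, LLVMValueRef value,
			  LLVMValueRef waveidx, LLVMValueRef numwaves,
			  LLVMValueRef scratch_lds)
{
	struct ac_wg_scan ws = {0};

	ws.op = nir_op_iadd;
	ws.src = value;			/* per-thread input */
	ws.scratch = scratch_lds;	/* one slot per wave */
	ws.waveidx = waveidx;		/* this wave's index in the workgroup */
	ws.numwaves = numwaves;		/* number of waves actually running */
	ws.maxwaves = 16;		/* compile-time upper bound (<= 32) */
	ws.enable_inclusive = true;

	/* All lanes must be active here; this emits an s_barrier. */
	ac_build_wg_scan(ctx, &ws);
	return ws.result_inclusive;
}
#endif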
3870
3871 LLVMValueRef
3872 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
3873 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3874 {
3875 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
3876 if (ctx->chip_class >= VI) {
3877 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
3878 } else {
3879 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
3880 }
3881 }
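
/* Example: ac_build_quad_swizzle(ctx, v, 1, 0, 3, 2) swaps horizontal
 * neighbours within each quad (the first step of ac_build_reduce above), and
 * (2, 3, 0, 1) swaps the vertical pairs. */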
3882
3883 LLVMValueRef
3884 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
3885 {
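/* ds_bpermute selects the source lane by byte address (each lane's dword is
 * 4 bytes apart), hence the multiply by 4. */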
3886 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
3887 return ac_build_intrinsic(ctx,
3888 "llvm.amdgcn.ds.bpermute", ctx->i32,
3889 (LLVMValueRef []) {index, src}, 2,
3890 AC_FUNC_ATTR_READNONE |
3891 AC_FUNC_ATTR_CONVERGENT);
3892 }