ac: add 8-bit and 16-bit supports to ac_build_set_inactive()
[mesa.git] / src / amd / llvm / ac_llvm_build.c
1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sub license, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18 * USE OR OTHER DEALINGS IN THE SOFTWARE.
19 *
20 * The above copyright notice and this permission notice (including the
21 * next paragraph) shall be included in all copies or substantial portions
22 * of the Software.
23 *
24 */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27
28 #include <llvm-c/Core.h>
29 #include <llvm/Config/llvm-config.h>
30
31 #include "c11/threads.h"
32
33 #include <assert.h>
34 #include <stdio.h>
35
36 #include "ac_llvm_util.h"
37 #include "ac_shader_util.h"
38 #include "ac_exp_param.h"
39 #include "util/bitscan.h"
40 #include "util/macros.h"
41 #include "util/u_atomic.h"
42 #include "util/u_math.h"
43 #include "sid.h"
44
45 #include "shader_enums.h"
46
47 #define AC_LLVM_INITIAL_CF_DEPTH 4
48
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
	/* Loop exit or next part of if/else/endif. */
	LLVMBasicBlockRef next_block;
	/* Entry block of a loop. NOTE(review): presumably only meaningful
	 * for bgnloop/endloop entries -- confirm against the flow helpers. */
	LLVMBasicBlockRef loop_entry_block;
};
56
57 /* Initialize module-independent parts of the context.
58 *
59 * The caller is responsible for initializing ctx::module and ctx::builder.
60 */
61 void
62 ac_llvm_context_init(struct ac_llvm_context *ctx,
63 struct ac_llvm_compiler *compiler,
64 enum chip_class chip_class, enum radeon_family family,
65 enum ac_float_mode float_mode, unsigned wave_size,
66 unsigned ballot_mask_bits)
67 {
68 LLVMValueRef args[1];
69
70 ctx->context = LLVMContextCreate();
71
72 ctx->chip_class = chip_class;
73 ctx->family = family;
74 ctx->wave_size = wave_size;
75 ctx->ballot_mask_bits = ballot_mask_bits;
76 ctx->float_mode = float_mode;
77 ctx->module = ac_create_module(wave_size == 32 ? compiler->tm_wave32
78 : compiler->tm,
79 ctx->context);
80 ctx->builder = ac_create_builder(ctx->context, float_mode);
81
82 ctx->voidt = LLVMVoidTypeInContext(ctx->context);
83 ctx->i1 = LLVMInt1TypeInContext(ctx->context);
84 ctx->i8 = LLVMInt8TypeInContext(ctx->context);
85 ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
86 ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
87 ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
88 ctx->intptr = ctx->i32;
89 ctx->f16 = LLVMHalfTypeInContext(ctx->context);
90 ctx->f32 = LLVMFloatTypeInContext(ctx->context);
91 ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
92 ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
93 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
94 ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
95 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
96 ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
97 ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
98 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
99 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
100 ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
101 ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
102
103 ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
104 ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
105 ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
106 ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
107 ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
108 ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
109 ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
110 ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
111 ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
112 ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
113 ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
114 ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
115 ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
116 ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
117
118 ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
119 ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
120
121 ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
122 "range", 5);
123
124 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
125 "invariant.load", 14);
126
127 ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
128
129 args[0] = LLVMConstReal(ctx->f32, 2.5);
130 ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
131
132 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
133 "amdgpu.uniform", 14);
134
135 ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
136 ctx->flow = calloc(1, sizeof(*ctx->flow));
137 }
138
139 void
140 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
141 {
142 free(ctx->flow->stack);
143 free(ctx->flow);
144 ctx->flow = NULL;
145 }
146
147 int
148 ac_get_llvm_num_components(LLVMValueRef value)
149 {
150 LLVMTypeRef type = LLVMTypeOf(value);
151 unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
152 ? LLVMGetVectorSize(type)
153 : 1;
154 return num_components;
155 }
156
157 LLVMValueRef
158 ac_llvm_extract_elem(struct ac_llvm_context *ac,
159 LLVMValueRef value,
160 int index)
161 {
162 if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
163 assert(index == 0);
164 return value;
165 }
166
167 return LLVMBuildExtractElement(ac->builder, value,
168 LLVMConstInt(ac->i32, index, false), "");
169 }
170
171 int
172 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
173 {
174 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
175 type = LLVMGetElementType(type);
176
177 if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
178 return LLVMGetIntTypeWidth(type);
179
180 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
181 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
182 return 32;
183 }
184
185 if (type == ctx->f16)
186 return 16;
187 if (type == ctx->f32)
188 return 32;
189 if (type == ctx->f64)
190 return 64;
191
192 unreachable("Unhandled type kind in get_elem_bits");
193 }
194
195 unsigned
196 ac_get_type_size(LLVMTypeRef type)
197 {
198 LLVMTypeKind kind = LLVMGetTypeKind(type);
199
200 switch (kind) {
201 case LLVMIntegerTypeKind:
202 return LLVMGetIntTypeWidth(type) / 8;
203 case LLVMHalfTypeKind:
204 return 2;
205 case LLVMFloatTypeKind:
206 return 4;
207 case LLVMDoubleTypeKind:
208 return 8;
209 case LLVMPointerTypeKind:
210 if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT)
211 return 4;
212 return 8;
213 case LLVMVectorTypeKind:
214 return LLVMGetVectorSize(type) *
215 ac_get_type_size(LLVMGetElementType(type));
216 case LLVMArrayTypeKind:
217 return LLVMGetArrayLength(type) *
218 ac_get_type_size(LLVMGetElementType(type));
219 default:
220 assert(0);
221 return 0;
222 }
223 }
224
225 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
226 {
227 if (t == ctx->i8)
228 return ctx->i8;
229 else if (t == ctx->f16 || t == ctx->i16)
230 return ctx->i16;
231 else if (t == ctx->f32 || t == ctx->i32)
232 return ctx->i32;
233 else if (t == ctx->f64 || t == ctx->i64)
234 return ctx->i64;
235 else
236 unreachable("Unhandled integer size");
237 }
238
239 LLVMTypeRef
240 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
241 {
242 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
243 LLVMTypeRef elem_type = LLVMGetElementType(t);
244 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
245 LLVMGetVectorSize(t));
246 }
247 if (LLVMGetTypeKind(t) == LLVMPointerTypeKind) {
248 switch (LLVMGetPointerAddressSpace(t)) {
249 case AC_ADDR_SPACE_GLOBAL:
250 return ctx->i64;
251 case AC_ADDR_SPACE_CONST_32BIT:
252 case AC_ADDR_SPACE_LDS:
253 return ctx->i32;
254 default:
255 unreachable("unhandled address space");
256 }
257 }
258 return to_integer_type_scalar(ctx, t);
259 }
260
261 LLVMValueRef
262 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
263 {
264 LLVMTypeRef type = LLVMTypeOf(v);
265 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind) {
266 return LLVMBuildPtrToInt(ctx->builder, v, ac_to_integer_type(ctx, type), "");
267 }
268 return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
269 }
270
271 LLVMValueRef
272 ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
273 {
274 LLVMTypeRef type = LLVMTypeOf(v);
275 if (LLVMGetTypeKind(type) == LLVMPointerTypeKind)
276 return v;
277 return ac_to_integer(ctx, v);
278 }
279
280 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
281 {
282 if (t == ctx->i8)
283 return ctx->i8;
284 else if (t == ctx->i16 || t == ctx->f16)
285 return ctx->f16;
286 else if (t == ctx->i32 || t == ctx->f32)
287 return ctx->f32;
288 else if (t == ctx->i64 || t == ctx->f64)
289 return ctx->f64;
290 else
291 unreachable("Unhandled float size");
292 }
293
294 LLVMTypeRef
295 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
296 {
297 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
298 LLVMTypeRef elem_type = LLVMGetElementType(t);
299 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
300 LLVMGetVectorSize(t));
301 }
302 return to_float_type_scalar(ctx, t);
303 }
304
305 LLVMValueRef
306 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
307 {
308 LLVMTypeRef type = LLVMTypeOf(v);
309 return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
310 }
311
312
313 LLVMValueRef
314 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
315 LLVMTypeRef return_type, LLVMValueRef *params,
316 unsigned param_count, unsigned attrib_mask)
317 {
318 LLVMValueRef function, call;
319 bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
320
321 function = LLVMGetNamedFunction(ctx->module, name);
322 if (!function) {
323 LLVMTypeRef param_types[32], function_type;
324 unsigned i;
325
326 assert(param_count <= 32);
327
328 for (i = 0; i < param_count; ++i) {
329 assert(params[i]);
330 param_types[i] = LLVMTypeOf(params[i]);
331 }
332 function_type =
333 LLVMFunctionType(return_type, param_types, param_count, 0);
334 function = LLVMAddFunction(ctx->module, name, function_type);
335
336 LLVMSetFunctionCallConv(function, LLVMCCallConv);
337 LLVMSetLinkage(function, LLVMExternalLinkage);
338
339 if (!set_callsite_attrs)
340 ac_add_func_attributes(ctx->context, function, attrib_mask);
341 }
342
343 call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
344 if (set_callsite_attrs)
345 ac_add_func_attributes(ctx->context, call, attrib_mask);
346 return call;
347 }
348
349 /**
350 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
351 * intrinsic names).
352 */
353 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
354 {
355 LLVMTypeRef elem_type = type;
356
357 assert(bufsize >= 8);
358
359 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
360 int ret = snprintf(buf, bufsize, "v%u",
361 LLVMGetVectorSize(type));
362 if (ret < 0) {
363 char *type_name = LLVMPrintTypeToString(type);
364 fprintf(stderr, "Error building type name for: %s\n",
365 type_name);
366 LLVMDisposeMessage(type_name);
367 return;
368 }
369 elem_type = LLVMGetElementType(type);
370 buf += ret;
371 bufsize -= ret;
372 }
373 switch (LLVMGetTypeKind(elem_type)) {
374 default: break;
375 case LLVMIntegerTypeKind:
376 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
377 break;
378 case LLVMHalfTypeKind:
379 snprintf(buf, bufsize, "f16");
380 break;
381 case LLVMFloatTypeKind:
382 snprintf(buf, bufsize, "f32");
383 break;
384 case LLVMDoubleTypeKind:
385 snprintf(buf, bufsize, "f64");
386 break;
387 }
388 }
389
390 /**
391 * Helper function that builds an LLVM IR PHI node and immediately adds
392 * incoming edges.
393 */
394 LLVMValueRef
395 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
396 unsigned count_incoming, LLVMValueRef *values,
397 LLVMBasicBlockRef *blocks)
398 {
399 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
400 LLVMAddIncoming(phi, values, blocks, count_incoming);
401 return phi;
402 }
403
404 void ac_build_s_barrier(struct ac_llvm_context *ctx)
405 {
406 ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
407 0, AC_FUNC_ATTR_CONVERGENT);
408 }
409
410 /* Prevent optimizations (at least of memory accesses) across the current
411 * point in the program by emitting empty inline assembly that is marked as
412 * having side effects.
413 *
414 * Optionally, a value can be passed through the inline assembly to prevent
415 * LLVM from hoisting calls to ReadNone functions.
416 */
417 void
418 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
419 LLVMValueRef *pvgpr)
420 {
421 static int counter = 0;
422
423 LLVMBuilderRef builder = ctx->builder;
424 char code[16];
425
426 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
427
428 if (!pvgpr) {
429 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
430 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
431 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
432 } else {
433 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
434 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
435 LLVMValueRef vgpr = *pvgpr;
436 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
437 unsigned vgpr_size = ac_get_type_size(vgpr_type);
438 LLVMValueRef vgpr0;
439
440 assert(vgpr_size % 4 == 0);
441
442 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
443 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
444 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
445 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
446 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
447
448 *pvgpr = vgpr;
449 }
450 }
451
452 LLVMValueRef
453 ac_build_shader_clock(struct ac_llvm_context *ctx)
454 {
455 const char *intr = LLVM_VERSION_MAJOR >= 9 && ctx->chip_class >= GFX8 ?
456 "llvm.amdgcn.s.memrealtime" : "llvm.readcyclecounter";
457 LLVMValueRef tmp = ac_build_intrinsic(ctx, intr, ctx->i64, NULL, 0, 0);
458 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
459 }
460
461 LLVMValueRef
462 ac_build_ballot(struct ac_llvm_context *ctx,
463 LLVMValueRef value)
464 {
465 const char *name;
466
467 if (LLVM_VERSION_MAJOR >= 9) {
468 if (ctx->wave_size == 64)
469 name = "llvm.amdgcn.icmp.i64.i32";
470 else
471 name = "llvm.amdgcn.icmp.i32.i32";
472 } else {
473 name = "llvm.amdgcn.icmp.i32";
474 }
475 LLVMValueRef args[3] = {
476 value,
477 ctx->i32_0,
478 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
479 };
480
481 /* We currently have no other way to prevent LLVM from lifting the icmp
482 * calls to a dominating basic block.
483 */
484 ac_build_optimization_barrier(ctx, &args[0]);
485
486 args[0] = ac_to_integer(ctx, args[0]);
487
488 return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3,
489 AC_FUNC_ATTR_NOUNWIND |
490 AC_FUNC_ATTR_READNONE |
491 AC_FUNC_ATTR_CONVERGENT);
492 }
493
494 LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx,
495 LLVMValueRef value)
496 {
497 const char *name = LLVM_VERSION_MAJOR >= 9 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i1";
498 LLVMValueRef args[3] = {
499 value,
500 ctx->i1false,
501 LLVMConstInt(ctx->i32, LLVMIntNE, 0),
502 };
503
504 return ac_build_intrinsic(ctx, name, ctx->i64, args, 3,
505 AC_FUNC_ATTR_NOUNWIND |
506 AC_FUNC_ATTR_READNONE |
507 AC_FUNC_ATTR_CONVERGENT);
508 }
509
510 LLVMValueRef
511 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
512 {
513 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
514 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
515 return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
516 }
517
518 LLVMValueRef
519 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
520 {
521 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
522 return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
523 LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
524 }
525
526 LLVMValueRef
527 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
528 {
529 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
530 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
531
532 LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
533 vote_set, active_set, "");
534 LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
535 vote_set,
536 LLVMConstInt(ctx->iN_wavemask, 0, 0), "");
537 return LLVMBuildOr(ctx->builder, all, none, "");
538 }
539
540 LLVMValueRef
541 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
542 unsigned value_count, unsigned component)
543 {
544 LLVMValueRef vec = NULL;
545
546 if (value_count == 1) {
547 return values[component];
548 } else if (!value_count)
549 unreachable("value_count is 0");
550
551 for (unsigned i = component; i < value_count + component; i++) {
552 LLVMValueRef value = values[i];
553
554 if (i == component)
555 vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
556 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
557 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
558 }
559 return vec;
560 }
561
562 LLVMValueRef
563 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
564 LLVMValueRef *values,
565 unsigned value_count,
566 unsigned value_stride,
567 bool load,
568 bool always_vector)
569 {
570 LLVMBuilderRef builder = ctx->builder;
571 LLVMValueRef vec = NULL;
572 unsigned i;
573
574 if (value_count == 1 && !always_vector) {
575 if (load)
576 return LLVMBuildLoad(builder, values[0], "");
577 return values[0];
578 } else if (!value_count)
579 unreachable("value_count is 0");
580
581 for (i = 0; i < value_count; i++) {
582 LLVMValueRef value = values[i * value_stride];
583 if (load)
584 value = LLVMBuildLoad(builder, value, "");
585
586 if (!i)
587 vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
588 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
589 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
590 }
591 return vec;
592 }
593
594 LLVMValueRef
595 ac_build_gather_values(struct ac_llvm_context *ctx,
596 LLVMValueRef *values,
597 unsigned value_count)
598 {
599 return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
600 }
601
602 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
603 * channels with undef. Extract at most src_channels components from the input.
604 */
605 static LLVMValueRef
606 ac_build_expand(struct ac_llvm_context *ctx,
607 LLVMValueRef value,
608 unsigned src_channels,
609 unsigned dst_channels)
610 {
611 LLVMTypeRef elemtype;
612 LLVMValueRef chan[dst_channels];
613
614 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
615 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
616
617 if (src_channels == dst_channels && vec_size == dst_channels)
618 return value;
619
620 src_channels = MIN2(src_channels, vec_size);
621
622 for (unsigned i = 0; i < src_channels; i++)
623 chan[i] = ac_llvm_extract_elem(ctx, value, i);
624
625 elemtype = LLVMGetElementType(LLVMTypeOf(value));
626 } else {
627 if (src_channels) {
628 assert(src_channels == 1);
629 chan[0] = value;
630 }
631 elemtype = LLVMTypeOf(value);
632 }
633
634 for (unsigned i = src_channels; i < dst_channels; i++)
635 chan[i] = LLVMGetUndef(elemtype);
636
637 return ac_build_gather_values(ctx, chan, dst_channels);
638 }
639
640 /* Extract components [start, start + channels) from a vector.
641 */
642 LLVMValueRef
643 ac_extract_components(struct ac_llvm_context *ctx,
644 LLVMValueRef value,
645 unsigned start,
646 unsigned channels)
647 {
648 LLVMValueRef chan[channels];
649
650 for (unsigned i = 0; i < channels; i++)
651 chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
652
653 return ac_build_gather_values(ctx, chan, channels);
654 }
655
656 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
657 * with undef. Extract at most num_channels components from the input.
658 */
659 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
660 LLVMValueRef value,
661 unsigned num_channels)
662 {
663 return ac_build_expand(ctx, value, num_channels, 4);
664 }
665
666 LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
667 {
668 unsigned type_size = ac_get_type_size(LLVMTypeOf(value));
669 const char *name;
670
671 if (type_size == 2)
672 name = "llvm.rint.f16";
673 else if (type_size == 4)
674 name = "llvm.rint.f32";
675 else
676 name = "llvm.rint.f64";
677
678 return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1,
679 AC_FUNC_ATTR_READNONE);
680 }
681
682 LLVMValueRef
683 ac_build_fdiv(struct ac_llvm_context *ctx,
684 LLVMValueRef num,
685 LLVMValueRef den)
686 {
687 /* If we do (num / den), LLVM >= 7.0 does:
688 * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
689 *
690 * If we do (num * (1 / den)), LLVM does:
691 * return num * v_rcp_f32(den);
692 */
693 LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
694 LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
695 LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
696
697 /* Use v_rcp_f32 instead of precise division. */
698 if (!LLVMIsConstant(ret))
699 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
700 return ret;
701 }
702
703 /* See fast_idiv_by_const.h. */
704 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
705 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx,
706 LLVMValueRef num,
707 LLVMValueRef multiplier,
708 LLVMValueRef pre_shift,
709 LLVMValueRef post_shift,
710 LLVMValueRef increment)
711 {
712 LLVMBuilderRef builder = ctx->builder;
713
714 num = LLVMBuildLShr(builder, num, pre_shift, "");
715 num = LLVMBuildMul(builder,
716 LLVMBuildZExt(builder, num, ctx->i64, ""),
717 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
718 num = LLVMBuildAdd(builder, num,
719 LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
720 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
721 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
722 return LLVMBuildLShr(builder, num, post_shift, "");
723 }
724
725 /* See fast_idiv_by_const.h. */
726 /* If num != UINT_MAX, this more efficient version can be used. */
727 /* Set: increment = util_fast_udiv_info::increment; */
728 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx,
729 LLVMValueRef num,
730 LLVMValueRef multiplier,
731 LLVMValueRef pre_shift,
732 LLVMValueRef post_shift,
733 LLVMValueRef increment)
734 {
735 LLVMBuilderRef builder = ctx->builder;
736
737 num = LLVMBuildLShr(builder, num, pre_shift, "");
738 num = LLVMBuildNUWAdd(builder, num, increment, "");
739 num = LLVMBuildMul(builder,
740 LLVMBuildZExt(builder, num, ctx->i64, ""),
741 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
742 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
743 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
744 return LLVMBuildLShr(builder, num, post_shift, "");
745 }
746
747 /* See fast_idiv_by_const.h. */
748 /* Both operands must fit in 31 bits and the divisor must not be 1. */
749 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx,
750 LLVMValueRef num,
751 LLVMValueRef multiplier,
752 LLVMValueRef post_shift)
753 {
754 LLVMBuilderRef builder = ctx->builder;
755
756 num = LLVMBuildMul(builder,
757 LLVMBuildZExt(builder, num, ctx->i64, ""),
758 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
759 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
760 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
761 return LLVMBuildLShr(builder, num, post_shift, "");
762 }
763
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
	/* stc[0] holds sc and stc[1] holds tc. */
	LLVMValueRef stc[2];
	/* Major axis value, already multiplied by two. */
	LLVMValueRef ma;
	/* Cube face number. */
	LLVMValueRef id;
};
773
774 static void
775 build_cube_intrinsic(struct ac_llvm_context *ctx,
776 LLVMValueRef in[3],
777 struct cube_selection_coords *out)
778 {
779 LLVMTypeRef f32 = ctx->f32;
780
781 out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
782 f32, in, 3, AC_FUNC_ATTR_READNONE);
783 out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
784 f32, in, 3, AC_FUNC_ATTR_READNONE);
785 out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
786 f32, in, 3, AC_FUNC_ATTR_READNONE);
787 out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
788 f32, in, 3, AC_FUNC_ATTR_READNONE);
789 }
790
791 /**
792 * Build a manual selection sequence for cube face sc/tc coordinates and
793 * major axis vector (multiplied by 2 for consistency) for the given
794 * vec3 \p coords, for the face implied by \p selcoords.
795 *
796 * For the major axis, we always adjust the sign to be in the direction of
797 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
798 * the selcoords major axis.
799 */
800 static void build_cube_select(struct ac_llvm_context *ctx,
801 const struct cube_selection_coords *selcoords,
802 const LLVMValueRef *coords,
803 LLVMValueRef *out_st,
804 LLVMValueRef *out_ma)
805 {
806 LLVMBuilderRef builder = ctx->builder;
807 LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
808 LLVMValueRef is_ma_positive;
809 LLVMValueRef sgn_ma;
810 LLVMValueRef is_ma_z, is_not_ma_z;
811 LLVMValueRef is_ma_y;
812 LLVMValueRef is_ma_x;
813 LLVMValueRef sgn;
814 LLVMValueRef tmp;
815
816 is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
817 selcoords->ma, LLVMConstReal(f32, 0.0), "");
818 sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
819 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
820
821 is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
822 is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
823 is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
824 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
825 is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
826
827 /* Select sc */
828 tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
829 sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
830 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
831 LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
832 out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
833
834 /* Select tc */
835 tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
836 sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
837 LLVMConstReal(f32, -1.0), "");
838 out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
839
840 /* Select ma */
841 tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
842 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
843 tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
844 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
845 *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
846 }
847
848 void
849 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
850 bool is_deriv, bool is_array, bool is_lod,
851 LLVMValueRef *coords_arg,
852 LLVMValueRef *derivs_arg)
853 {
854
855 LLVMBuilderRef builder = ctx->builder;
856 struct cube_selection_coords selcoords;
857 LLVMValueRef coords[3];
858 LLVMValueRef invma;
859
860 if (is_array && !is_lod) {
861 LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);
862
863 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
864 *
865 * "For Array forms, the array layer used will be
866 *
867 * max(0, min(d−1, floor(layer+0.5)))
868 *
869 * where d is the depth of the texture array and layer
870 * comes from the component indicated in the tables below.
871 * Workaroudn for an issue where the layer is taken from a
872 * helper invocation which happens to fall on a different
873 * layer due to extrapolation."
874 *
875 * GFX8 and earlier attempt to implement this in hardware by
876 * clamping the value of coords[2] = (8 * layer) + face.
877 * Unfortunately, this means that the we end up with the wrong
878 * face when clamping occurs.
879 *
880 * Clamp the layer earlier to work around the issue.
881 */
882 if (ctx->chip_class <= GFX8) {
883 LLVMValueRef ge0;
884 ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
885 tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
886 }
887
888 coords_arg[3] = tmp;
889 }
890
891 build_cube_intrinsic(ctx, coords_arg, &selcoords);
892
893 invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
894 ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
895 invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
896
897 for (int i = 0; i < 2; ++i)
898 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
899
900 coords[2] = selcoords.id;
901
902 if (is_deriv && derivs_arg) {
903 LLVMValueRef derivs[4];
904 int axis;
905
906 /* Convert cube derivatives to 2D derivatives. */
907 for (axis = 0; axis < 2; axis++) {
908 LLVMValueRef deriv_st[2];
909 LLVMValueRef deriv_ma;
910
911 /* Transform the derivative alongside the texture
912 * coordinate. Mathematically, the correct formula is
913 * as follows. Assume we're projecting onto the +Z face
914 * and denote by dx/dh the derivative of the (original)
915 * X texture coordinate with respect to horizontal
916 * window coordinates. The projection onto the +Z face
917 * plane is:
918 *
919 * f(x,z) = x/z
920 *
921 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
922 * = 1/z * dx/dh - x/z * 1/z * dz/dh.
923 *
924 * This motivatives the implementation below.
925 *
926 * Whether this actually gives the expected results for
927 * apps that might feed in derivatives obtained via
928 * finite differences is anyone's guess. The OpenGL spec
929 * seems awfully quiet about how textureGrad for cube
930 * maps should be handled.
931 */
932 build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
933 deriv_st, &deriv_ma);
934
935 deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
936
937 for (int i = 0; i < 2; ++i)
938 derivs[axis * 2 + i] =
939 LLVMBuildFSub(builder,
940 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
941 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
942 }
943
944 memcpy(derivs_arg, derivs, sizeof(derivs));
945 }
946
947 /* Shift the texture coordinate. This must be applied after the
948 * derivative calculation.
949 */
950 for (int i = 0; i < 2; ++i)
951 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
952
953 if (is_array) {
954 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
955 /* coords_arg.w component - array_index for cube arrays */
956 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
957 }
958
959 memcpy(coords_arg, coords, sizeof(coords));
960 }
961
962
963 LLVMValueRef
964 ac_build_fs_interp(struct ac_llvm_context *ctx,
965 LLVMValueRef llvm_chan,
966 LLVMValueRef attr_number,
967 LLVMValueRef params,
968 LLVMValueRef i,
969 LLVMValueRef j)
970 {
971 LLVMValueRef args[5];
972 LLVMValueRef p1;
973
974 args[0] = i;
975 args[1] = llvm_chan;
976 args[2] = attr_number;
977 args[3] = params;
978
979 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
980 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
981
982 args[0] = p1;
983 args[1] = j;
984 args[2] = llvm_chan;
985 args[3] = attr_number;
986 args[4] = params;
987
988 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
989 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
990 }
991
992 LLVMValueRef
993 ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
994 LLVMValueRef llvm_chan,
995 LLVMValueRef attr_number,
996 LLVMValueRef params,
997 LLVMValueRef i,
998 LLVMValueRef j)
999 {
1000 LLVMValueRef args[6];
1001 LLVMValueRef p1;
1002
1003 args[0] = i;
1004 args[1] = llvm_chan;
1005 args[2] = attr_number;
1006 args[3] = ctx->i1false;
1007 args[4] = params;
1008
1009 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
1010 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
1011
1012 args[0] = p1;
1013 args[1] = j;
1014 args[2] = llvm_chan;
1015 args[3] = attr_number;
1016 args[4] = ctx->i1false;
1017 args[5] = params;
1018
1019 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
1020 ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
1021 }
1022
1023 LLVMValueRef
1024 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
1025 LLVMValueRef parameter,
1026 LLVMValueRef llvm_chan,
1027 LLVMValueRef attr_number,
1028 LLVMValueRef params)
1029 {
1030 LLVMValueRef args[4];
1031
1032 args[0] = parameter;
1033 args[1] = llvm_chan;
1034 args[2] = attr_number;
1035 args[3] = params;
1036
1037 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
1038 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
1039 }
1040
1041 LLVMValueRef
1042 ac_build_gep_ptr(struct ac_llvm_context *ctx,
1043 LLVMValueRef base_ptr,
1044 LLVMValueRef index)
1045 {
1046 return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1047 }
1048
1049 LLVMValueRef
1050 ac_build_gep0(struct ac_llvm_context *ctx,
1051 LLVMValueRef base_ptr,
1052 LLVMValueRef index)
1053 {
1054 LLVMValueRef indices[2] = {
1055 ctx->i32_0,
1056 index,
1057 };
1058 return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
1059 }
1060
1061 LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
1062 LLVMValueRef index)
1063 {
1064 return LLVMBuildPointerCast(ctx->builder,
1065 LLVMBuildGEP(ctx->builder, ptr, &index, 1, ""),
1066 LLVMTypeOf(ptr), "");
1067 }
1068
1069 void
1070 ac_build_indexed_store(struct ac_llvm_context *ctx,
1071 LLVMValueRef base_ptr, LLVMValueRef index,
1072 LLVMValueRef value)
1073 {
1074 LLVMBuildStore(ctx->builder, value,
1075 ac_build_gep0(ctx, base_ptr, index));
1076 }
1077
1078 /**
1079 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1080 * It's equivalent to doing a load from &base_ptr[index].
1081 *
1082 * \param base_ptr Where the array starts.
1083 * \param index The element index into the array.
1084 * \param uniform Whether the base_ptr and index can be assumed to be
1085 * dynamically uniform (i.e. load to an SGPR)
1086 * \param invariant Whether the load is invariant (no other opcodes affect it)
1087 * \param no_unsigned_wraparound
1088 * For all possible re-associations and re-distributions of an expression
1089 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1090 * without inbounds in base_ptr), this parameter is true if "addr + offset"
1091 * does not result in an unsigned integer wraparound. This is used for
1092 * optimal code generation of 32-bit pointer arithmetic.
1093 *
1094 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
1095 * integer wraparound can't be an imm offset in s_load_dword, because
1096 * the instruction performs "addr + offset" in 64 bits.
1097 *
1098 * Expected usage for bindless textures by chaining GEPs:
1099 * // possible unsigned wraparound, don't use InBounds:
1100 * ptr1 = LLVMBuildGEP(base_ptr, index);
1101 * image = load(ptr1); // becomes "s_load ptr1, 0"
1102 *
1103 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1104 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1105 */
1106 static LLVMValueRef
1107 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1108 LLVMValueRef index, bool uniform, bool invariant,
1109 bool no_unsigned_wraparound)
1110 {
1111 LLVMValueRef pointer, result;
1112
1113 if (no_unsigned_wraparound &&
1114 LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
1115 pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
1116 else
1117 pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");
1118
1119 if (uniform)
1120 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
1121 result = LLVMBuildLoad(ctx->builder, pointer, "");
1122 if (invariant)
1123 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
1124 return result;
1125 }
1126
1127 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1128 LLVMValueRef index)
1129 {
1130 return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
1131 }
1132
1133 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
1134 LLVMValueRef base_ptr, LLVMValueRef index)
1135 {
1136 return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
1137 }
1138
1139 /* This assumes that there is no unsigned integer wraparound during the address
1140 * computation, excluding all GEPs within base_ptr. */
1141 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
1142 LLVMValueRef base_ptr, LLVMValueRef index)
1143 {
1144 return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1145 }
1146
1147 /* See ac_build_load_custom() documentation. */
1148 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1149 LLVMValueRef base_ptr, LLVMValueRef index)
1150 {
1151 return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1152 }
1153
1154 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx,
1155 unsigned cache_policy)
1156 {
1157 return cache_policy |
1158 (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1159 }
1160
1161 static void
1162 ac_build_buffer_store_common(struct ac_llvm_context *ctx,
1163 LLVMValueRef rsrc,
1164 LLVMValueRef data,
1165 LLVMValueRef vindex,
1166 LLVMValueRef voffset,
1167 LLVMValueRef soffset,
1168 unsigned num_channels,
1169 LLVMTypeRef return_channel_type,
1170 unsigned cache_policy,
1171 bool use_format,
1172 bool structurized)
1173 {
1174 LLVMValueRef args[6];
1175 int idx = 0;
1176 args[idx++] = data;
1177 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1178 if (structurized)
1179 args[idx++] = vindex ? vindex : ctx->i32_0;
1180 args[idx++] = voffset ? voffset : ctx->i32_0;
1181 args[idx++] = soffset ? soffset : ctx->i32_0;
1182 args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1183 unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1184 const char *indexing_kind = structurized ? "struct" : "raw";
1185 char name[256], type_name[8];
1186
1187 LLVMTypeRef type = func > 1 ? LLVMVectorType(return_channel_type, func) : return_channel_type;
1188 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1189
1190 if (use_format) {
1191 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s",
1192 indexing_kind, type_name);
1193 } else {
1194 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s",
1195 indexing_kind, type_name);
1196 }
1197
1198 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1199 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1200 }
1201
1202 void
1203 ac_build_buffer_store_format(struct ac_llvm_context *ctx,
1204 LLVMValueRef rsrc,
1205 LLVMValueRef data,
1206 LLVMValueRef vindex,
1207 LLVMValueRef voffset,
1208 unsigned num_channels,
1209 unsigned cache_policy)
1210 {
1211 ac_build_buffer_store_common(ctx, rsrc, data, vindex,
1212 voffset, NULL, num_channels,
1213 ctx->f32, cache_policy,
1214 true, true);
1215 }
1216
1217 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
1218 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
1219 * or v4i32 (num_channels=3,4).
1220 */
1221 void
1222 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
1223 LLVMValueRef rsrc,
1224 LLVMValueRef vdata,
1225 unsigned num_channels,
1226 LLVMValueRef voffset,
1227 LLVMValueRef soffset,
1228 unsigned inst_offset,
1229 unsigned cache_policy,
1230 bool swizzle_enable_hint)
1231 {
1232 /* Split 3 channel stores, because only LLVM 9+ support 3-channel
1233 * intrinsics. */
1234 if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
1235 LLVMValueRef v[3], v01;
1236
1237 for (int i = 0; i < 3; i++) {
1238 v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
1239 LLVMConstInt(ctx->i32, i, 0), "");
1240 }
1241 v01 = ac_build_gather_values(ctx, v, 2);
1242
1243 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
1244 soffset, inst_offset, cache_policy,
1245 swizzle_enable_hint);
1246 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
1247 soffset, inst_offset + 8,
1248 cache_policy,
1249 swizzle_enable_hint);
1250 return;
1251 }
1252
1253 /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
1254 * (voffset is swizzled, but soffset isn't swizzled).
1255 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
1256 */
1257 if (!swizzle_enable_hint) {
1258 LLVMValueRef offset = soffset;
1259
1260 if (inst_offset)
1261 offset = LLVMBuildAdd(ctx->builder, offset,
1262 LLVMConstInt(ctx->i32, inst_offset, 0), "");
1263
1264 ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata),
1265 ctx->i32_0, voffset, offset,
1266 num_channels, ctx->f32,
1267 cache_policy, false, false);
1268 return;
1269 }
1270
1271 static const unsigned dfmts[] = {
1272 V_008F0C_BUF_DATA_FORMAT_32,
1273 V_008F0C_BUF_DATA_FORMAT_32_32,
1274 V_008F0C_BUF_DATA_FORMAT_32_32_32,
1275 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
1276 };
1277 unsigned dfmt = dfmts[num_channels - 1];
1278 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1279 LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);
1280
1281 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1282 immoffset, num_channels, dfmt, nfmt, cache_policy);
1283 }
1284
1285 static LLVMValueRef
1286 ac_build_buffer_load_common(struct ac_llvm_context *ctx,
1287 LLVMValueRef rsrc,
1288 LLVMValueRef vindex,
1289 LLVMValueRef voffset,
1290 LLVMValueRef soffset,
1291 unsigned num_channels,
1292 LLVMTypeRef channel_type,
1293 unsigned cache_policy,
1294 bool can_speculate,
1295 bool use_format,
1296 bool structurized)
1297 {
1298 LLVMValueRef args[5];
1299 int idx = 0;
1300 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1301 if (structurized)
1302 args[idx++] = vindex ? vindex : ctx->i32_0;
1303 args[idx++] = voffset ? voffset : ctx->i32_0;
1304 args[idx++] = soffset ? soffset : ctx->i32_0;
1305 args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1306 unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
1307 const char *indexing_kind = structurized ? "struct" : "raw";
1308 char name[256], type_name[8];
1309
1310 LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
1311 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1312
1313 if (use_format) {
1314 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s",
1315 indexing_kind, type_name);
1316 } else {
1317 snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s",
1318 indexing_kind, type_name);
1319 }
1320
1321 return ac_build_intrinsic(ctx, name, type, args, idx,
1322 ac_get_load_intr_attribs(can_speculate));
1323 }
1324
1325 LLVMValueRef
1326 ac_build_buffer_load(struct ac_llvm_context *ctx,
1327 LLVMValueRef rsrc,
1328 int num_channels,
1329 LLVMValueRef vindex,
1330 LLVMValueRef voffset,
1331 LLVMValueRef soffset,
1332 unsigned inst_offset,
1333 unsigned cache_policy,
1334 bool can_speculate,
1335 bool allow_smem)
1336 {
1337 LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1338 if (voffset)
1339 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1340 if (soffset)
1341 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1342
1343 if (allow_smem && !(cache_policy & ac_slc) &&
1344 (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
1345 assert(vindex == NULL);
1346
1347 LLVMValueRef result[8];
1348
1349 for (int i = 0; i < num_channels; i++) {
1350 if (i) {
1351 offset = LLVMBuildAdd(ctx->builder, offset,
1352 LLVMConstInt(ctx->i32, 4, 0), "");
1353 }
1354 LLVMValueRef args[3] = {
1355 rsrc,
1356 offset,
1357 LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
1358 };
1359 result[i] = ac_build_intrinsic(ctx,
1360 "llvm.amdgcn.s.buffer.load.f32",
1361 ctx->f32, args, 3,
1362 AC_FUNC_ATTR_READNONE);
1363 }
1364 if (num_channels == 1)
1365 return result[0];
1366
1367 if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
1368 result[num_channels++] = LLVMGetUndef(ctx->f32);
1369 return ac_build_gather_values(ctx, result, num_channels);
1370 }
1371
1372 return ac_build_buffer_load_common(ctx, rsrc, vindex,
1373 offset, ctx->i32_0,
1374 num_channels, ctx->f32,
1375 cache_policy,
1376 can_speculate, false, false);
1377 }
1378
1379 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1380 LLVMValueRef rsrc,
1381 LLVMValueRef vindex,
1382 LLVMValueRef voffset,
1383 unsigned num_channels,
1384 unsigned cache_policy,
1385 bool can_speculate)
1386 {
1387 return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
1388 ctx->i32_0, num_channels, ctx->f32,
1389 cache_policy, can_speculate,
1390 true, true);
1391 }
1392
1393 static LLVMValueRef
1394 ac_build_tbuffer_load(struct ac_llvm_context *ctx,
1395 LLVMValueRef rsrc,
1396 LLVMValueRef vindex,
1397 LLVMValueRef voffset,
1398 LLVMValueRef soffset,
1399 LLVMValueRef immoffset,
1400 unsigned num_channels,
1401 unsigned dfmt,
1402 unsigned nfmt,
1403 unsigned cache_policy,
1404 bool can_speculate,
1405 bool structurized)
1406 {
1407 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1408
1409 LLVMValueRef args[6];
1410 int idx = 0;
1411 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1412 if (structurized)
1413 args[idx++] = vindex ? vindex : ctx->i32_0;
1414 args[idx++] = voffset ? voffset : ctx->i32_0;
1415 args[idx++] = soffset ? soffset : ctx->i32_0;
1416 args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
1417 args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
1418 unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1419 const char *indexing_kind = structurized ? "struct" : "raw";
1420 char name[256], type_name[8];
1421
1422 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1423 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1424
1425 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
1426 indexing_kind, type_name);
1427
1428 return ac_build_intrinsic(ctx, name, type, args, idx,
1429 ac_get_load_intr_attribs(can_speculate));
1430 }
1431
1432 LLVMValueRef
1433 ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx,
1434 LLVMValueRef rsrc,
1435 LLVMValueRef vindex,
1436 LLVMValueRef voffset,
1437 LLVMValueRef soffset,
1438 LLVMValueRef immoffset,
1439 unsigned num_channels,
1440 unsigned dfmt,
1441 unsigned nfmt,
1442 unsigned cache_policy,
1443 bool can_speculate)
1444 {
1445 return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset,
1446 immoffset, num_channels, dfmt, nfmt,
1447 cache_policy, can_speculate, true);
1448 }
1449
1450 LLVMValueRef
1451 ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx,
1452 LLVMValueRef rsrc,
1453 LLVMValueRef voffset,
1454 LLVMValueRef soffset,
1455 LLVMValueRef immoffset,
1456 unsigned num_channels,
1457 unsigned dfmt,
1458 unsigned nfmt,
1459 unsigned cache_policy,
1460 bool can_speculate)
1461 {
1462 return ac_build_tbuffer_load(ctx, rsrc, NULL, voffset, soffset,
1463 immoffset, num_channels, dfmt, nfmt,
1464 cache_policy, can_speculate, false);
1465 }
1466
1467 LLVMValueRef
1468 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1469 LLVMValueRef rsrc,
1470 LLVMValueRef voffset,
1471 LLVMValueRef soffset,
1472 LLVMValueRef immoffset,
1473 unsigned cache_policy)
1474 {
1475 LLVMValueRef res;
1476
1477 if (LLVM_VERSION_MAJOR >= 9) {
1478 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1479
1480 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1481 res = ac_build_buffer_load_common(ctx, rsrc, NULL,
1482 voffset, soffset,
1483 1, ctx->i16, cache_policy,
1484 false, false, false);
1485 } else {
1486 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1487 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1488
1489 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1490 immoffset, 1, dfmt, nfmt, cache_policy,
1491 false);
1492
1493 res = LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1494 }
1495
1496 return res;
1497 }
1498
1499 LLVMValueRef
1500 ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx,
1501 LLVMValueRef rsrc,
1502 LLVMValueRef voffset,
1503 LLVMValueRef soffset,
1504 LLVMValueRef immoffset,
1505 unsigned cache_policy)
1506 {
1507 LLVMValueRef res;
1508
1509 if (LLVM_VERSION_MAJOR >= 9) {
1510 voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
1511
1512 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1513 res = ac_build_buffer_load_common(ctx, rsrc, NULL,
1514 voffset, soffset,
1515 1, ctx->i8, cache_policy,
1516 false, false, false);
1517 } else {
1518 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1519 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1520
1521 res = ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset,
1522 immoffset, 1, dfmt, nfmt, cache_policy,
1523 false);
1524
1525 res = LLVMBuildTrunc(ctx->builder, res, ctx->i8, "");
1526 }
1527
1528 return res;
1529 }
1530
1531 /**
1532 * Convert an 11- or 10-bit unsigned floating point number to an f32.
1533 *
1534 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1535 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1536 */
1537 static LLVMValueRef
1538 ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned exp_bits, unsigned mant_bits)
1539 {
1540 assert(LLVMTypeOf(src) == ctx->i32);
1541
1542 LLVMValueRef tmp;
1543 LLVMValueRef mantissa;
1544 mantissa = LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1545
1546 /* Converting normal numbers is just a shift + correcting the exponent bias */
1547 unsigned normal_shift = 23 - mant_bits;
1548 unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1549 LLVMValueRef shifted, normal;
1550
1551 shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1552 normal = LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1553
1554 /* Converting nan/inf numbers is the same, but with a different exponent update */
1555 LLVMValueRef naninf;
1556 naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1557
1558 /* Converting denormals is the complex case: determine the leading zeros of the
1559 * mantissa to obtain the correct shift for the mantissa and exponent correction.
1560 */
1561 LLVMValueRef denormal;
1562 LLVMValueRef params[2] = {
1563 mantissa,
1564 ctx->i1true, /* result can be undef when arg is 0 */
1565 };
1566 LLVMValueRef ctlz = ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32,
1567 params, 2, AC_FUNC_ATTR_READNONE);
1568
1569 /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1570 tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1571 denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1572
1573 unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1574 tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1575 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1576 denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1577
1578 /* Select the final result. */
1579 LLVMValueRef result;
1580
1581 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1582 LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1583 result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1584
1585 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1586 LLVMConstInt(ctx->i32, 1 << mant_bits, false), "");
1587 result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1588
1589 tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1590 result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1591
1592 return ac_to_float(ctx, result);
1593 }
1594
1595 /**
1596 * Generate a fully general open coded buffer format fetch with all required
1597 * fixups suitable for vertex fetch, using non-format buffer loads.
1598 *
1599 * Some combinations of argument values have special interpretations:
1600 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1601 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1602 *
1603 * \param log_size log(size of channel in bytes)
1604 * \param num_channels number of channels (1 to 4)
1605 * \param format AC_FETCH_FORMAT_xxx value
1606 * \param reverse whether XYZ channels are reversed
1607 * \param known_aligned whether the source is known to be aligned to hardware's
1608 * effective element size for loading the given format
1609 * (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1610 * \param rsrc buffer resource descriptor
1611 * \return the resulting vector of floats or integers bitcast to <4 x i32>
1612 */
1613 LLVMValueRef
1614 ac_build_opencoded_load_format(struct ac_llvm_context *ctx,
1615 unsigned log_size,
1616 unsigned num_channels,
1617 unsigned format,
1618 bool reverse,
1619 bool known_aligned,
1620 LLVMValueRef rsrc,
1621 LLVMValueRef vindex,
1622 LLVMValueRef voffset,
1623 LLVMValueRef soffset,
1624 unsigned cache_policy,
1625 bool can_speculate)
1626 {
1627 LLVMValueRef tmp;
1628 unsigned load_log_size = log_size;
1629 unsigned load_num_channels = num_channels;
1630 if (log_size == 3) {
1631 load_log_size = 2;
1632 if (format == AC_FETCH_FORMAT_FLOAT) {
1633 load_num_channels = 2 * num_channels;
1634 } else {
1635 load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1636 }
1637 }
1638
1639 int log_recombine = 0;
1640 if (ctx->chip_class == GFX6 && !known_aligned) {
1641 /* Avoid alignment restrictions by loading one byte at a time. */
1642 load_num_channels <<= load_log_size;
1643 log_recombine = load_log_size;
1644 load_log_size = 0;
1645 } else if (load_num_channels == 2 || load_num_channels == 4) {
1646 log_recombine = -util_logbase2(load_num_channels);
1647 load_num_channels = 1;
1648 load_log_size += -log_recombine;
1649 }
1650
1651 assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9);
1652
1653 LLVMValueRef loads[32]; /* up to 32 bytes */
1654 for (unsigned i = 0; i < load_num_channels; ++i) {
1655 tmp = LLVMBuildAdd(ctx->builder, soffset,
1656 LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1657 LLVMTypeRef channel_type = load_log_size == 0 ? ctx->i8 :
1658 load_log_size == 1 ? ctx->i16 : ctx->i32;
1659 unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1660 loads[i] = ac_build_buffer_load_common(
1661 ctx, rsrc, vindex, voffset, tmp,
1662 num_channels, channel_type, cache_policy,
1663 can_speculate, false, true);
1664 if (load_log_size >= 2)
1665 loads[i] = ac_to_integer(ctx, loads[i]);
1666 }
1667
1668 if (log_recombine > 0) {
1669 /* Recombine bytes if necessary (GFX6 only) */
1670 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1671
1672 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1673 LLVMValueRef accum = NULL;
1674 for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1675 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1676 if (i == 0) {
1677 accum = tmp;
1678 } else {
1679 tmp = LLVMBuildShl(ctx->builder, tmp,
1680 LLVMConstInt(dst_type, 8 * i, false), "");
1681 accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1682 }
1683 }
1684 loads[dst] = accum;
1685 }
1686 } else if (log_recombine < 0) {
1687 /* Split vectors of dwords */
1688 if (load_log_size > 2) {
1689 assert(load_num_channels == 1);
1690 LLVMValueRef loaded = loads[0];
1691 unsigned log_split = load_log_size - 2;
1692 log_recombine += log_split;
1693 load_num_channels = 1 << log_split;
1694 load_log_size = 2;
1695 for (unsigned i = 0; i < load_num_channels; ++i) {
1696 tmp = LLVMConstInt(ctx->i32, i, false);
1697 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1698 }
1699 }
1700
1701 /* Further split dwords and shorts if required */
1702 if (log_recombine < 0) {
1703 for (unsigned src = load_num_channels,
1704 dst = load_num_channels << -log_recombine;
1705 src > 0; --src) {
1706 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1707 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1708 LLVMValueRef loaded = loads[src - 1];
1709 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1710 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1711 tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1712 tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1713 loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1714 }
1715 }
1716 }
1717 }
1718
1719 if (log_size == 3) {
1720 if (format == AC_FETCH_FORMAT_FLOAT) {
1721 for (unsigned i = 0; i < num_channels; ++i) {
1722 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1723 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1724 }
1725 } else if (format == AC_FETCH_FORMAT_FIXED) {
1726 /* 10_11_11_FLOAT */
1727 LLVMValueRef data = loads[0];
1728 LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1729 LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1730 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1731 LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1732 LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1733
1734 loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1735 loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1736 loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1737
1738 num_channels = 3;
1739 log_size = 2;
1740 format = AC_FETCH_FORMAT_FLOAT;
1741 } else {
1742 /* 2_10_10_10 data formats */
1743 LLVMValueRef data = loads[0];
1744 LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1745 LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1746 loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1747 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1748 loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1749 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1750 loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1751 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1752 loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1753
1754 num_channels = 4;
1755 }
1756 }
1757
1758 if (format == AC_FETCH_FORMAT_FLOAT) {
1759 if (log_size != 2) {
1760 for (unsigned chan = 0; chan < num_channels; ++chan) {
1761 tmp = ac_to_float(ctx, loads[chan]);
1762 if (log_size == 3)
1763 tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1764 else if (log_size == 1)
1765 tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1766 loads[chan] = ac_to_integer(ctx, tmp);
1767 }
1768 }
1769 } else if (format == AC_FETCH_FORMAT_UINT) {
1770 if (log_size != 2) {
1771 for (unsigned chan = 0; chan < num_channels; ++chan)
1772 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1773 }
1774 } else if (format == AC_FETCH_FORMAT_SINT) {
1775 if (log_size != 2) {
1776 for (unsigned chan = 0; chan < num_channels; ++chan)
1777 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1778 }
1779 } else {
1780 bool unsign = format == AC_FETCH_FORMAT_UNORM ||
1781 format == AC_FETCH_FORMAT_USCALED ||
1782 format == AC_FETCH_FORMAT_UINT;
1783
1784 for (unsigned chan = 0; chan < num_channels; ++chan) {
1785 if (unsign) {
1786 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1787 } else {
1788 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1789 }
1790
1791 LLVMValueRef scale = NULL;
1792 if (format == AC_FETCH_FORMAT_FIXED) {
1793 assert(log_size == 2);
1794 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1795 } else if (format == AC_FETCH_FORMAT_UNORM) {
1796 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1797 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1798 } else if (format == AC_FETCH_FORMAT_SNORM) {
1799 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1800 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1801 }
1802 if (scale)
1803 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1804
1805 if (format == AC_FETCH_FORMAT_SNORM) {
1806 /* Clamp to [-1, 1] */
1807 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1808 LLVMValueRef clamp =
1809 LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1810 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1811 }
1812
1813 loads[chan] = ac_to_integer(ctx, tmp);
1814 }
1815 }
1816
1817 while (num_channels < 4) {
1818 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1819 loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1820 } else {
1821 loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1822 }
1823 num_channels++;
1824 }
1825
1826 if (reverse) {
1827 tmp = loads[0];
1828 loads[0] = loads[2];
1829 loads[2] = tmp;
1830 }
1831
1832 return ac_build_gather_values(ctx, loads, 4);
1833 }
1834
1835 static void
1836 ac_build_tbuffer_store(struct ac_llvm_context *ctx,
1837 LLVMValueRef rsrc,
1838 LLVMValueRef vdata,
1839 LLVMValueRef vindex,
1840 LLVMValueRef voffset,
1841 LLVMValueRef soffset,
1842 LLVMValueRef immoffset,
1843 unsigned num_channels,
1844 unsigned dfmt,
1845 unsigned nfmt,
1846 unsigned cache_policy,
1847 bool structurized)
1848 {
1849 voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
1850 immoffset, "");
1851
1852 LLVMValueRef args[7];
1853 int idx = 0;
1854 args[idx++] = vdata;
1855 args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
1856 if (structurized)
1857 args[idx++] = vindex ? vindex : ctx->i32_0;
1858 args[idx++] = voffset ? voffset : ctx->i32_0;
1859 args[idx++] = soffset ? soffset : ctx->i32_0;
1860 args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
1861 args[idx++] = LLVMConstInt(ctx->i32, cache_policy, 0);
1862 unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
1863 const char *indexing_kind = structurized ? "struct" : "raw";
1864 char name[256], type_name[8];
1865
1866 LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
1867 ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
1868
1869 snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s",
1870 indexing_kind, type_name);
1871
1872 ac_build_intrinsic(ctx, name, ctx->voidt, args, idx,
1873 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
1874 }
1875
1876 void
1877 ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx,
1878 LLVMValueRef rsrc,
1879 LLVMValueRef vdata,
1880 LLVMValueRef vindex,
1881 LLVMValueRef voffset,
1882 LLVMValueRef soffset,
1883 LLVMValueRef immoffset,
1884 unsigned num_channels,
1885 unsigned dfmt,
1886 unsigned nfmt,
1887 unsigned cache_policy)
1888 {
1889 ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset,
1890 immoffset, num_channels, dfmt, nfmt, cache_policy,
1891 true);
1892 }
1893
1894 void
1895 ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx,
1896 LLVMValueRef rsrc,
1897 LLVMValueRef vdata,
1898 LLVMValueRef voffset,
1899 LLVMValueRef soffset,
1900 LLVMValueRef immoffset,
1901 unsigned num_channels,
1902 unsigned dfmt,
1903 unsigned nfmt,
1904 unsigned cache_policy)
1905 {
1906 ac_build_tbuffer_store(ctx, rsrc, vdata, NULL, voffset, soffset,
1907 immoffset, num_channels, dfmt, nfmt, cache_policy,
1908 false);
1909 }
1910
1911 void
1912 ac_build_tbuffer_store_short(struct ac_llvm_context *ctx,
1913 LLVMValueRef rsrc,
1914 LLVMValueRef vdata,
1915 LLVMValueRef voffset,
1916 LLVMValueRef soffset,
1917 unsigned cache_policy)
1918 {
1919 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
1920
1921 if (LLVM_VERSION_MAJOR >= 9) {
1922 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1923 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL,
1924 voffset, soffset, 1,
1925 ctx->i16, cache_policy,
1926 false, false);
1927 } else {
1928 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_16;
1929 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1930
1931 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1932
1933 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1934 ctx->i32_0, 1, dfmt, nfmt, cache_policy);
1935 }
1936 }
1937
1938 void
1939 ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx,
1940 LLVMValueRef rsrc,
1941 LLVMValueRef vdata,
1942 LLVMValueRef voffset,
1943 LLVMValueRef soffset,
1944 unsigned cache_policy)
1945 {
1946 vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
1947
1948 if (LLVM_VERSION_MAJOR >= 9) {
1949 /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
1950 ac_build_buffer_store_common(ctx, rsrc, vdata, NULL,
1951 voffset, soffset, 1,
1952 ctx->i8, cache_policy,
1953 false, false);
1954 } else {
1955 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_8;
1956 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
1957
1958 vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
1959
1960 ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset,
1961 ctx->i32_0, 1, dfmt, nfmt, cache_policy);
1962 }
1963 }
1964 /**
1965 * Set range metadata on an instruction. This can only be used on load and
1966 * call instructions. If you know an instruction can only produce the values
1967 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1968 * \p lo is the minimum value inclusive.
1969 * \p hi is the maximum value exclusive.
1970 */
1971 static void set_range_metadata(struct ac_llvm_context *ctx,
1972 LLVMValueRef value, unsigned lo, unsigned hi)
1973 {
1974 LLVMValueRef range_md, md_args[2];
1975 LLVMTypeRef type = LLVMTypeOf(value);
1976 LLVMContextRef context = LLVMGetTypeContext(type);
1977
1978 md_args[0] = LLVMConstInt(type, lo, false);
1979 md_args[1] = LLVMConstInt(type, hi, false);
1980 range_md = LLVMMDNodeInContext(context, md_args, 2);
1981 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1982 }
1983
1984 LLVMValueRef
1985 ac_get_thread_id(struct ac_llvm_context *ctx)
1986 {
1987 LLVMValueRef tid;
1988
1989 LLVMValueRef tid_args[2];
1990 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1991 tid_args[1] = ctx->i32_0;
1992 tid_args[1] = ac_build_intrinsic(ctx,
1993 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1994 tid_args, 2, AC_FUNC_ATTR_READNONE);
1995
1996 if (ctx->wave_size == 32) {
1997 tid = tid_args[1];
1998 } else {
1999 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
2000 ctx->i32, tid_args,
2001 2, AC_FUNC_ATTR_READNONE);
2002 }
2003 set_range_metadata(ctx, tid, 0, ctx->wave_size);
2004 return tid;
2005 }
2006
2007 /*
2008 * AMD GCN implements derivatives using the local data store (LDS)
2009 * All writes to the LDS happen in all executing threads at
2010 * the same time. TID is the Thread ID for the current
2011 * thread and is a value between 0 and 63, representing
2012 * the thread's position in the wavefront.
2013 *
2014 * For the pixel shader threads are grouped into quads of four pixels.
2015 * The TIDs of the pixels of a quad are:
2016 *
2017 * +------+------+
2018 * |4n + 0|4n + 1|
2019 * +------+------+
2020 * |4n + 2|4n + 3|
2021 * +------+------+
2022 *
2023 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
2024 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
2025 * the current pixel's column, and masking with 0xfffffffe yields the TID
2026 * of the left pixel of the current pixel's row.
2027 *
2028 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
2029 * adding 2 yields the TID of the pixel below the top pixel.
2030 */
2031 LLVMValueRef
2032 ac_build_ddxy(struct ac_llvm_context *ctx,
2033 uint32_t mask,
2034 int idx,
2035 LLVMValueRef val)
2036 {
2037 unsigned tl_lanes[4], trbl_lanes[4];
2038 char name[32], type[8];
2039 LLVMValueRef tl, trbl;
2040 LLVMTypeRef result_type;
2041 LLVMValueRef result;
2042
2043 result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
2044
2045 if (result_type == ctx->f16)
2046 val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
2047
2048 for (unsigned i = 0; i < 4; ++i) {
2049 tl_lanes[i] = i & mask;
2050 trbl_lanes[i] = (i & mask) + idx;
2051 }
2052
2053 tl = ac_build_quad_swizzle(ctx, val,
2054 tl_lanes[0], tl_lanes[1],
2055 tl_lanes[2], tl_lanes[3]);
2056 trbl = ac_build_quad_swizzle(ctx, val,
2057 trbl_lanes[0], trbl_lanes[1],
2058 trbl_lanes[2], trbl_lanes[3]);
2059
2060 if (result_type == ctx->f16) {
2061 tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
2062 trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
2063 }
2064
2065 tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
2066 trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
2067 result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
2068
2069 ac_build_type_name_for_intr(result_type, type, sizeof(type));
2070 snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
2071
2072 return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
2073 }
2074
2075 void
2076 ac_build_sendmsg(struct ac_llvm_context *ctx,
2077 uint32_t msg,
2078 LLVMValueRef wave_id)
2079 {
2080 LLVMValueRef args[2];
2081 args[0] = LLVMConstInt(ctx->i32, msg, false);
2082 args[1] = wave_id;
2083 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
2084 }
2085
2086 LLVMValueRef
2087 ac_build_imsb(struct ac_llvm_context *ctx,
2088 LLVMValueRef arg,
2089 LLVMTypeRef dst_type)
2090 {
2091 LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
2092 dst_type, &arg, 1,
2093 AC_FUNC_ATTR_READNONE);
2094
2095 /* The HW returns the last bit index from MSB, but NIR/TGSI wants
2096 * the index from LSB. Invert it by doing "31 - msb". */
2097 msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
2098 msb, "");
2099
2100 LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
2101 LLVMValueRef cond = LLVMBuildOr(ctx->builder,
2102 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2103 arg, ctx->i32_0, ""),
2104 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
2105 arg, all_ones, ""), "");
2106
2107 return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
2108 }
2109
2110 LLVMValueRef
2111 ac_build_umsb(struct ac_llvm_context *ctx,
2112 LLVMValueRef arg,
2113 LLVMTypeRef dst_type)
2114 {
2115 const char *intrin_name;
2116 LLVMTypeRef type;
2117 LLVMValueRef highest_bit;
2118 LLVMValueRef zero;
2119 unsigned bitsize;
2120
2121 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
2122 switch (bitsize) {
2123 case 64:
2124 intrin_name = "llvm.ctlz.i64";
2125 type = ctx->i64;
2126 highest_bit = LLVMConstInt(ctx->i64, 63, false);
2127 zero = ctx->i64_0;
2128 break;
2129 case 32:
2130 intrin_name = "llvm.ctlz.i32";
2131 type = ctx->i32;
2132 highest_bit = LLVMConstInt(ctx->i32, 31, false);
2133 zero = ctx->i32_0;
2134 break;
2135 case 16:
2136 intrin_name = "llvm.ctlz.i16";
2137 type = ctx->i16;
2138 highest_bit = LLVMConstInt(ctx->i16, 15, false);
2139 zero = ctx->i16_0;
2140 break;
2141 case 8:
2142 intrin_name = "llvm.ctlz.i8";
2143 type = ctx->i8;
2144 highest_bit = LLVMConstInt(ctx->i8, 7, false);
2145 zero = ctx->i8_0;
2146 break;
2147 default:
2148 unreachable(!"invalid bitsize");
2149 break;
2150 }
2151
2152 LLVMValueRef params[2] = {
2153 arg,
2154 ctx->i1true,
2155 };
2156
2157 LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
2158 params, 2,
2159 AC_FUNC_ATTR_READNONE);
2160
2161 /* The HW returns the last bit index from MSB, but TGSI/NIR wants
2162 * the index from LSB. Invert it by doing "31 - msb". */
2163 msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
2164
2165 if (bitsize == 64) {
2166 msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
2167 } else if (bitsize < 32) {
2168 msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
2169 }
2170
2171 /* check for zero */
2172 return LLVMBuildSelect(ctx->builder,
2173 LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
2174 LLVMConstInt(ctx->i32, -1, true), msb, "");
2175 }
2176
2177 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
2178 LLVMValueRef b)
2179 {
2180 char name[64];
2181 snprintf(name, sizeof(name), "llvm.minnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2182 LLVMValueRef args[2] = {a, b};
2183 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2184 AC_FUNC_ATTR_READNONE);
2185 }
2186
2187 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
2188 LLVMValueRef b)
2189 {
2190 char name[64];
2191 snprintf(name, sizeof(name), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, LLVMTypeOf(a)));
2192 LLVMValueRef args[2] = {a, b};
2193 return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2,
2194 AC_FUNC_ATTR_READNONE);
2195 }
2196
2197 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
2198 LLVMValueRef b)
2199 {
2200 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
2201 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2202 }
2203
2204 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
2205 LLVMValueRef b)
2206 {
2207 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
2208 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2209 }
2210
2211 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
2212 LLVMValueRef b)
2213 {
2214 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
2215 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2216 }
2217
2218 LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a,
2219 LLVMValueRef b)
2220 {
2221 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
2222 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
2223 }
2224
2225 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
2226 {
2227 LLVMTypeRef t = LLVMTypeOf(value);
2228 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0)),
2229 LLVMConstReal(t, 1.0));
2230 }
2231
2232 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
2233 {
2234 LLVMValueRef args[9];
2235
2236 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
2237 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
2238
2239 if (a->compr) {
2240 LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
2241 LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
2242
2243 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
2244 v2i16, "");
2245 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
2246 v2i16, "");
2247 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
2248 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2249
2250 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
2251 ctx->voidt, args, 6, 0);
2252 } else {
2253 args[2] = a->out[0];
2254 args[3] = a->out[1];
2255 args[4] = a->out[2];
2256 args[5] = a->out[3];
2257 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
2258 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
2259
2260 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
2261 ctx->voidt, args, 8, 0);
2262 }
2263 }
2264
2265 void ac_build_export_null(struct ac_llvm_context *ctx)
2266 {
2267 struct ac_export_args args;
2268
2269 args.enabled_channels = 0x0; /* enabled channels */
2270 args.valid_mask = 1; /* whether the EXEC mask is valid */
2271 args.done = 1; /* DONE bit */
2272 args.target = V_008DFC_SQ_EXP_NULL;
2273 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
2274 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2275 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2276 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2277 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2278
2279 ac_build_export(ctx, &args);
2280 }
2281
2282 static unsigned ac_num_coords(enum ac_image_dim dim)
2283 {
2284 switch (dim) {
2285 case ac_image_1d:
2286 return 1;
2287 case ac_image_2d:
2288 case ac_image_1darray:
2289 return 2;
2290 case ac_image_3d:
2291 case ac_image_cube:
2292 case ac_image_2darray:
2293 case ac_image_2dmsaa:
2294 return 3;
2295 case ac_image_2darraymsaa:
2296 return 4;
2297 default:
2298 unreachable("ac_num_coords: bad dim");
2299 }
2300 }
2301
2302 static unsigned ac_num_derivs(enum ac_image_dim dim)
2303 {
2304 switch (dim) {
2305 case ac_image_1d:
2306 case ac_image_1darray:
2307 return 2;
2308 case ac_image_2d:
2309 case ac_image_2darray:
2310 case ac_image_cube:
2311 return 4;
2312 case ac_image_3d:
2313 return 6;
2314 case ac_image_2dmsaa:
2315 case ac_image_2darraymsaa:
2316 default:
2317 unreachable("derivatives not supported");
2318 }
2319 }
2320
2321 static const char *get_atomic_name(enum ac_atomic_op op)
2322 {
2323 switch (op) {
2324 case ac_atomic_swap: return "swap";
2325 case ac_atomic_add: return "add";
2326 case ac_atomic_sub: return "sub";
2327 case ac_atomic_smin: return "smin";
2328 case ac_atomic_umin: return "umin";
2329 case ac_atomic_smax: return "smax";
2330 case ac_atomic_umax: return "umax";
2331 case ac_atomic_and: return "and";
2332 case ac_atomic_or: return "or";
2333 case ac_atomic_xor: return "xor";
2334 case ac_atomic_inc_wrap: return "inc";
2335 case ac_atomic_dec_wrap: return "dec";
2336 }
2337 unreachable("bad atomic op");
2338 }
2339
/**
 * Build a call to one of the llvm.amdgcn.image.* intrinsics.
 *
 * The operand list and the intrinsic name are both assembled from the
 * fields of \p a: opcode, dimension, optional modifiers (offset, bias,
 * compare, derivatives, lod, level_zero), coordinates, resource/sampler
 * and cache policy.
 *
 * \return  i32 for atomics, nothing (void call) for stores, otherwise a
 *          4-component vector: v4f32 for sample-like opcodes and v4i32
 *          (bitcast) for everything else.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
				   struct ac_image_args *a)
{
	const char *overload[3] = { "", "", "" };
	unsigned num_overloads = 0;
	LLVMValueRef args[18];
	unsigned num_args = 0;
	enum ac_image_dim dim = a->dim;

	/* level_zero may only be combined with a lod that is the constant 0. */
	assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
	       !a->level_zero);
	/* get_resinfo, load_mip and store_mip require a lod. */
	assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
		a->opcode != ac_image_store_mip) ||
	       a->lod);
	/* compare and offset are only allowed for sample and gather4. */
	assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
	       (!a->compare && !a->offset));
	/* bias is only allowed for sample, gather4 and get_lod. */
	assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
		a->opcode == ac_image_get_lod) ||
	       !a->bias);
	/* At most one of bias, lod, level_zero and derivatives may be used. */
	assert((a->bias ? 1 : 0) +
	       (a->lod ? 1 : 0) +
	       (a->level_zero ? 1 : 0) +
	       (a->derivs[0] ? 1 : 0) <= 1);

	/* get_lod uses the non-array dimension: arrays and cubes are
	 * reduced to plain 1D/2D. */
	if (a->opcode == ac_image_get_lod) {
		switch (dim) {
		case ac_image_1darray:
			dim = ac_image_1d;
			break;
		case ac_image_2darray:
		case ac_image_cube:
			dim = ac_image_2d;
			break;
		default:
			break;
		}
	}

	/* "sample" opcodes take float coordinates and a sampler operand. */
	bool sample = a->opcode == ac_image_sample ||
		      a->opcode == ac_image_gather4 ||
		      a->opcode == ac_image_get_lod;
	bool atomic = a->opcode == ac_image_atomic ||
		      a->opcode == ac_image_atomic_cmpswap;
	/* "load" opcodes get their cache policy from get_load_cache_policy(). */
	bool load = a->opcode == ac_image_sample ||
		    a->opcode == ac_image_gather4 ||
		    a->opcode == ac_image_load ||
		    a->opcode == ac_image_load_mip;
	LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;

	/* Data operands come first (two of them for cmpswap). */
	if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
		args[num_args++] = a->data[0];
		if (a->opcode == ac_image_atomic_cmpswap)
			args[num_args++] = a->data[1];
	}

	/* Atomics have no dmask operand. */
	if (!atomic)
		args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);

	/* Optional modifiers, in the order the operands are passed. */
	if (a->offset)
		args[num_args++] = ac_to_integer(ctx, a->offset);
	if (a->bias) {
		args[num_args++] = ac_to_float(ctx, a->bias);
		overload[num_overloads++] = ".f32";
	}
	if (a->compare)
		args[num_args++] = ac_to_float(ctx, a->compare);
	if (a->derivs[0]) {
		unsigned count = ac_num_derivs(dim);
		for (unsigned i = 0; i < count; ++i)
			args[num_args++] = ac_to_float(ctx, a->derivs[i]);
		overload[num_overloads++] = ".f32";
	}
	/* get_resinfo takes no coordinates, only the lod below. */
	unsigned num_coords =
		a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
	for (unsigned i = 0; i < num_coords; ++i)
		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
	if (a->lod)
		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
	/* The coordinate type always contributes the last overload suffix. */
	overload[num_overloads++] = sample ? ".f32" : ".i32";

	args[num_args++] = a->resource;
	if (sample) {
		args[num_args++] = a->sampler;
		args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
	}

	args[num_args++] = ctx->i32_0; /* texfailctrl */
	args[num_args++] = LLVMConstInt(ctx->i32,
					load ? get_load_cache_policy(ctx, a->cache_policy) :
					       a->cache_policy, false);

	/* Base name of the intrinsic; atomics append the operation name. */
	const char *name;
	const char *atomic_subop = "";
	switch (a->opcode) {
	case ac_image_sample: name = "sample"; break;
	case ac_image_gather4: name = "gather4"; break;
	case ac_image_load: name = "load"; break;
	case ac_image_load_mip: name = "load.mip"; break;
	case ac_image_store: name = "store"; break;
	case ac_image_store_mip: name = "store.mip"; break;
	case ac_image_atomic:
		name = "atomic.";
		atomic_subop = get_atomic_name(a->atomic);
		break;
	case ac_image_atomic_cmpswap:
		name = "atomic.";
		atomic_subop = "cmpswap";
		break;
	case ac_image_get_lod: name = "getlod"; break;
	case ac_image_get_resinfo: name = "getresinfo"; break;
	default: unreachable("invalid image opcode");
	}

	const char *dimname;
	switch (dim) {
	case ac_image_1d: dimname = "1d"; break;
	case ac_image_2d: dimname = "2d"; break;
	case ac_image_3d: dimname = "3d"; break;
	case ac_image_cube: dimname = "cube"; break;
	case ac_image_1darray: dimname = "1darray"; break;
	case ac_image_2darray: dimname = "2darray"; break;
	case ac_image_2dmsaa: dimname = "2dmsaa"; break;
	case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
	default: unreachable("invalid dim");
	}

	/* Only sample and gather4 get the ".l" suffix for an explicit lod;
	 * the other opcodes that take a lod encode it in their base name
	 * or not at all. */
	bool lod_suffix =
		a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
	char intr_name[96];
	snprintf(intr_name, sizeof(intr_name),
		 "llvm.amdgcn.image.%s%s" /* base name */
		 "%s%s%s" /* sample/gather modifiers */
		 ".%s.%s%s%s%s", /* dimension and type overloads */
		 name, atomic_subop,
		 a->compare ? ".c" : "",
		 a->bias ? ".b" :
		 lod_suffix ? ".l" :
		 a->derivs[0] ? ".d" :
		 a->level_zero ? ".lz" : "",
		 a->offset ? ".o" : "",
		 dimname,
		 atomic ? "i32" : "v4f32",
		 overload[0], overload[1], overload[2]);

	LLVMTypeRef retty;
	if (atomic)
		retty = ctx->i32;
	else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
		retty = ctx->voidt;
	else
		retty = ctx->v4f32;

	LLVMValueRef result =
		ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
				   a->attributes);
	/* Non-sample opcodes that return v4f32 hand integer data back to the
	 * caller, so reinterpret the result as v4i32. */
	if (!sample && retty == ctx->v4f32) {
		result = LLVMBuildBitCast(ctx->builder, result,
					  ctx->v4i32, "");
	}
	return result;
}
2501
2502 LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx,
2503 LLVMValueRef rsrc)
2504 {
2505 LLVMValueRef samples;
2506
2507 /* Read the samples from the descriptor directly.
2508 * Hardware doesn't have any instruction for this.
2509 */
2510 samples = LLVMBuildExtractElement(ctx->builder, rsrc,
2511 LLVMConstInt(ctx->i32, 3, 0), "");
2512 samples = LLVMBuildLShr(ctx->builder, samples,
2513 LLVMConstInt(ctx->i32, 16, 0), "");
2514 samples = LLVMBuildAnd(ctx->builder, samples,
2515 LLVMConstInt(ctx->i32, 0xf, 0), "");
2516 samples = LLVMBuildShl(ctx->builder, ctx->i32_1,
2517 samples, "");
2518 return samples;
2519 }
2520
2521 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
2522 LLVMValueRef args[2])
2523 {
2524 LLVMTypeRef v2f16 =
2525 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
2526
2527 return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
2528 args, 2, AC_FUNC_ATTR_READNONE);
2529 }
2530
2531 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
2532 LLVMValueRef args[2])
2533 {
2534 LLVMValueRef res =
2535 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
2536 ctx->v2i16, args, 2,
2537 AC_FUNC_ATTR_READNONE);
2538 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2539 }
2540
2541 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
2542 LLVMValueRef args[2])
2543 {
2544 LLVMValueRef res =
2545 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
2546 ctx->v2i16, args, 2,
2547 AC_FUNC_ATTR_READNONE);
2548 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2549 }
2550
2551 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2552 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
2553 LLVMValueRef args[2], unsigned bits, bool hi)
2554 {
2555 assert(bits == 8 || bits == 10 || bits == 16);
2556
2557 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2558 bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2559 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
2560 bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2561 LLVMValueRef max_alpha =
2562 bits != 10 ? max_rgb : ctx->i32_1;
2563 LLVMValueRef min_alpha =
2564 bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2565
2566 /* Clamp. */
2567 if (bits != 16) {
2568 for (int i = 0; i < 2; i++) {
2569 bool alpha = hi && i == 1;
2570 args[i] = ac_build_imin(ctx, args[i],
2571 alpha ? max_alpha : max_rgb);
2572 args[i] = ac_build_imax(ctx, args[i],
2573 alpha ? min_alpha : min_rgb);
2574 }
2575 }
2576
2577 LLVMValueRef res =
2578 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
2579 ctx->v2i16, args, 2,
2580 AC_FUNC_ATTR_READNONE);
2581 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2582 }
2583
2584 /* The 8-bit and 10-bit clamping is for HW workarounds. */
2585 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
2586 LLVMValueRef args[2], unsigned bits, bool hi)
2587 {
2588 assert(bits == 8 || bits == 10 || bits == 16);
2589
2590 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
2591 bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2592 LLVMValueRef max_alpha =
2593 bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2594
2595 /* Clamp. */
2596 if (bits != 16) {
2597 for (int i = 0; i < 2; i++) {
2598 bool alpha = hi && i == 1;
2599 args[i] = ac_build_umin(ctx, args[i],
2600 alpha ? max_alpha : max_rgb);
2601 }
2602 }
2603
2604 LLVMValueRef res =
2605 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
2606 ctx->v2i16, args, 2,
2607 AC_FUNC_ATTR_READNONE);
2608 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2609 }
2610
2611 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
2612 {
2613 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
2614 &i1, 1, AC_FUNC_ATTR_READNONE);
2615 }
2616
2617 void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
2618 {
2619 ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
2620 &i1, 1, 0);
2621 }
2622
2623 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
2624 LLVMValueRef offset, LLVMValueRef width,
2625 bool is_signed)
2626 {
2627 LLVMValueRef args[] = {
2628 input,
2629 offset,
2630 width,
2631 };
2632
2633 return ac_build_intrinsic(ctx, is_signed ? "llvm.amdgcn.sbfe.i32" :
2634 "llvm.amdgcn.ubfe.i32",
2635 ctx->i32, args, 3, AC_FUNC_ATTR_READNONE);
2636
2637 }
2638
2639 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2640 LLVMValueRef s1, LLVMValueRef s2)
2641 {
2642 return LLVMBuildAdd(ctx->builder,
2643 LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
2644 }
2645
2646 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
2647 LLVMValueRef s1, LLVMValueRef s2)
2648 {
2649 /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
2650 if (ctx->chip_class >= GFX10) {
2651 return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32,
2652 (LLVMValueRef []) {s0, s1, s2}, 3,
2653 AC_FUNC_ATTR_READNONE);
2654 }
2655
2656 return LLVMBuildFAdd(ctx->builder,
2657 LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
2658 }
2659
2660 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2661 {
2662 if (!wait_flags)
2663 return;
2664
2665 unsigned lgkmcnt = 63;
2666 unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
2667 unsigned vscnt = 63;
2668
2669 if (wait_flags & AC_WAIT_LGKM)
2670 lgkmcnt = 0;
2671 if (wait_flags & AC_WAIT_VLOAD)
2672 vmcnt = 0;
2673
2674 if (wait_flags & AC_WAIT_VSTORE) {
2675 if (ctx->chip_class >= GFX10)
2676 vscnt = 0;
2677 else
2678 vmcnt = 0;
2679 }
2680
2681 /* There is no intrinsic for vscnt(0), so use a fence. */
2682 if ((wait_flags & AC_WAIT_LGKM &&
2683 wait_flags & AC_WAIT_VLOAD &&
2684 wait_flags & AC_WAIT_VSTORE) ||
2685 vscnt == 0) {
2686 LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2687 return;
2688 }
2689
2690 unsigned simm16 = (lgkmcnt << 8) |
2691 (7 << 4) | /* expcnt */
2692 (vmcnt & 0xf) |
2693 ((vmcnt >> 4) << 14);
2694
2695 LLVMValueRef args[1] = {
2696 LLVMConstInt(ctx->i32, simm16, false),
2697 };
2698 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
2699 ctx->voidt, args, 1, 0);
2700 }
2701
2702 LLVMValueRef ac_build_fmed3(struct ac_llvm_context *ctx, LLVMValueRef src0,
2703 LLVMValueRef src1, LLVMValueRef src2,
2704 unsigned bitsize)
2705 {
2706 LLVMTypeRef type;
2707 char *intr;
2708
2709 if (bitsize == 16) {
2710 intr = "llvm.amdgcn.fmed3.f16";
2711 type = ctx->f16;
2712 } else if (bitsize == 32) {
2713 intr = "llvm.amdgcn.fmed3.f32";
2714 type = ctx->f32;
2715 } else {
2716 intr = "llvm.amdgcn.fmed3.f64";
2717 type = ctx->f64;
2718 }
2719
2720 LLVMValueRef params[] = {
2721 src0,
2722 src1,
2723 src2,
2724 };
2725 return ac_build_intrinsic(ctx, intr, type, params, 3,
2726 AC_FUNC_ATTR_READNONE);
2727 }
2728
2729 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
2730 unsigned bitsize)
2731 {
2732 LLVMTypeRef type;
2733 char *intr;
2734
2735 if (bitsize == 16) {
2736 intr = "llvm.amdgcn.fract.f16";
2737 type = ctx->f16;
2738 } else if (bitsize == 32) {
2739 intr = "llvm.amdgcn.fract.f32";
2740 type = ctx->f32;
2741 } else {
2742 intr = "llvm.amdgcn.fract.f64";
2743 type = ctx->f64;
2744 }
2745
2746 LLVMValueRef params[] = {
2747 src0,
2748 };
2749 return ac_build_intrinsic(ctx, intr, type, params, 1,
2750 AC_FUNC_ATTR_READNONE);
2751 }
2752
2753 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2754 unsigned bitsize)
2755 {
2756 LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
2757 LLVMValueRef zero = LLVMConstInt(type, 0, false);
2758 LLVMValueRef one = LLVMConstInt(type, 1, false);
2759
2760 LLVMValueRef cmp, val;
2761 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
2762 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2763 cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
2764 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
2765 return val;
2766 }
2767
2768 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
2769 unsigned bitsize)
2770 {
2771 LLVMValueRef cmp, val, zero, one;
2772 LLVMTypeRef type;
2773
2774 if (bitsize == 16) {
2775 type = ctx->f16;
2776 zero = ctx->f16_0;
2777 one = ctx->f16_1;
2778 } else if (bitsize == 32) {
2779 type = ctx->f32;
2780 zero = ctx->f32_0;
2781 one = ctx->f32_1;
2782 } else {
2783 type = ctx->f64;
2784 zero = ctx->f64_0;
2785 one = ctx->f64_1;
2786 }
2787
2788 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
2789 val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
2790 cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
2791 val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
2792 return val;
2793 }
2794
2795 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
2796 {
2797 LLVMValueRef result;
2798 unsigned bitsize;
2799
2800 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2801
2802 switch (bitsize) {
2803 case 64:
2804 result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
2805 (LLVMValueRef []) { src0 }, 1,
2806 AC_FUNC_ATTR_READNONE);
2807
2808 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2809 break;
2810 case 32:
2811 result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
2812 (LLVMValueRef []) { src0 }, 1,
2813 AC_FUNC_ATTR_READNONE);
2814 break;
2815 case 16:
2816 result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
2817 (LLVMValueRef []) { src0 }, 1,
2818 AC_FUNC_ATTR_READNONE);
2819
2820 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2821 break;
2822 case 8:
2823 result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8,
2824 (LLVMValueRef []) { src0 }, 1,
2825 AC_FUNC_ATTR_READNONE);
2826
2827 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2828 break;
2829 default:
2830 unreachable(!"invalid bitsize");
2831 break;
2832 }
2833
2834 return result;
2835 }
2836
2837 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
2838 LLVMValueRef src0)
2839 {
2840 LLVMValueRef result;
2841 unsigned bitsize;
2842
2843 bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
2844
2845 switch (bitsize) {
2846 case 64:
2847 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64,
2848 (LLVMValueRef []) { src0 }, 1,
2849 AC_FUNC_ATTR_READNONE);
2850
2851 result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
2852 break;
2853 case 32:
2854 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32,
2855 (LLVMValueRef []) { src0 }, 1,
2856 AC_FUNC_ATTR_READNONE);
2857 break;
2858 case 16:
2859 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16,
2860 (LLVMValueRef []) { src0 }, 1,
2861 AC_FUNC_ATTR_READNONE);
2862
2863 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2864 break;
2865 case 8:
2866 result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8,
2867 (LLVMValueRef []) { src0 }, 1,
2868 AC_FUNC_ATTR_READNONE);
2869
2870 result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
2871 break;
2872 default:
2873 unreachable(!"invalid bitsize");
2874 break;
2875 }
2876
2877 return result;
2878 }
2879
/* NOTE(review): these look like operand indices into an export intrinsic
 * call (target, enabled-channel mask, first output value); they match the
 * argument order built by ac_build_export(). Confirm against their users. */
#define AC_EXP_TARGET 0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0 2

/* Classification of one exported channel. */
enum ac_ir_type {
	AC_IR_UNDEF, /* channel is undefined */
	AC_IR_CONST, /* channel is a constant; see ac_vs_exp_chan::const_float */
	AC_IR_VALUE, /* channel is some other value; see ac_vs_exp_chan::value */
};

/* One channel of an export. */
struct ac_vs_exp_chan
{
	LLVMValueRef value; /* compared by pointer when type is AC_IR_VALUE */
	float const_float; /* constant value when type is AC_IR_CONST */
	enum ac_ir_type type;
};

/* One export instruction together with its four channels. */
struct ac_vs_exp_inst {
	unsigned offset; /* matched against entries of vs_output_param_offset[] */
	LLVMValueRef inst; /* the export instruction; erased when eliminated */
	struct ac_vs_exp_chan chan[4];
};

/* A list of export instructions. */
struct ac_vs_exports {
	unsigned num; /* number of valid entries in exp[] */
	struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
2907
2908 /* Return true if the PARAM export has been eliminated. */
2909 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
2910 uint32_t num_outputs,
2911 struct ac_vs_exp_inst *exp)
2912 {
2913 unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
2914 bool is_zero[4] = {}, is_one[4] = {};
2915
2916 for (i = 0; i < 4; i++) {
2917 /* It's a constant expression. Undef outputs are eliminated too. */
2918 if (exp->chan[i].type == AC_IR_UNDEF) {
2919 is_zero[i] = true;
2920 is_one[i] = true;
2921 } else if (exp->chan[i].type == AC_IR_CONST) {
2922 if (exp->chan[i].const_float == 0)
2923 is_zero[i] = true;
2924 else if (exp->chan[i].const_float == 1)
2925 is_one[i] = true;
2926 else
2927 return false; /* other constant */
2928 } else
2929 return false;
2930 }
2931
2932 /* Only certain combinations of 0 and 1 can be eliminated. */
2933 if (is_zero[0] && is_zero[1] && is_zero[2])
2934 default_val = is_zero[3] ? 0 : 1;
2935 else if (is_one[0] && is_one[1] && is_one[2])
2936 default_val = is_zero[3] ? 2 : 3;
2937 else
2938 return false;
2939
2940 /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
2941 LLVMInstructionEraseFromParent(exp->inst);
2942
2943 /* Change OFFSET to DEFAULT_VAL. */
2944 for (i = 0; i < num_outputs; i++) {
2945 if (vs_output_param_offset[i] == exp->offset) {
2946 vs_output_param_offset[i] =
2947 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
2948 break;
2949 }
2950 }
2951 return true;
2952 }
2953
2954 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2955 uint8_t *vs_output_param_offset,
2956 uint32_t num_outputs,
2957 struct ac_vs_exports *processed,
2958 struct ac_vs_exp_inst *exp)
2959 {
2960 unsigned p, copy_back_channels = 0;
2961
2962 /* See if the output is already in the list of processed outputs.
2963 * The LLVMValueRef comparison relies on SSA.
2964 */
2965 for (p = 0; p < processed->num; p++) {
2966 bool different = false;
2967
2968 for (unsigned j = 0; j < 4; j++) {
2969 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2970 struct ac_vs_exp_chan *c2 = &exp->chan[j];
2971
2972 /* Treat undef as a match. */
2973 if (c2->type == AC_IR_UNDEF)
2974 continue;
2975
2976 /* If c1 is undef but c2 isn't, we can copy c2 to c1
2977 * and consider the instruction duplicated.
2978 */
2979 if (c1->type == AC_IR_UNDEF) {
2980 copy_back_channels |= 1 << j;
2981 continue;
2982 }
2983
2984 /* Test whether the channels are not equal. */
2985 if (c1->type != c2->type ||
2986 (c1->type == AC_IR_CONST &&
2987 c1->const_float != c2->const_float) ||
2988 (c1->type == AC_IR_VALUE &&
2989 c1->value != c2->value)) {
2990 different = true;
2991 break;
2992 }
2993 }
2994 if (!different)
2995 break;
2996
2997 copy_back_channels = 0;
2998 }
2999 if (p == processed->num)
3000 return false;
3001
3002 /* If a match was found, but the matching export has undef where the new
3003 * one has a normal value, copy the normal value to the undef channel.
3004 */
3005 struct ac_vs_exp_inst *match = &processed->exp[p];
3006
3007 /* Get current enabled channels mask. */
3008 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
3009 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
3010
3011 while (copy_back_channels) {
3012 unsigned chan = u_bit_scan(&copy_back_channels);
3013
3014 assert(match->chan[chan].type == AC_IR_UNDEF);
3015 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
3016 exp->chan[chan].value);
3017 match->chan[chan] = exp->chan[chan];
3018
3019 /* Update number of enabled channels because the original mask
3020 * is not always 0xf.
3021 */
3022 enabled_channels |= (1 << chan);
3023 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
3024 LLVMConstInt(ctx->i32, enabled_channels, 0));
3025 }
3026
3027 /* The PARAM export is duplicated. Kill it. */
3028 LLVMInstructionEraseFromParent(exp->inst);
3029
3030 /* Change OFFSET to the matching export. */
3031 for (unsigned i = 0; i < num_outputs; i++) {
3032 if (vs_output_param_offset[i] == exp->offset) {
3033 vs_output_param_offset[i] = match->offset;
3034 break;
3035 }
3036 }
3037 return true;
3038 }
3039
3040 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
3041 LLVMValueRef main_fn,
3042 uint8_t *vs_output_param_offset,
3043 uint32_t num_outputs,
3044 uint8_t *num_param_exports)
3045 {
3046 LLVMBasicBlockRef bb;
3047 bool removed_any = false;
3048 struct ac_vs_exports exports;
3049
3050 exports.num = 0;
3051
3052 /* Process all LLVM instructions. */
3053 bb = LLVMGetFirstBasicBlock(main_fn);
3054 while (bb) {
3055 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
3056
3057 while (inst) {
3058 LLVMValueRef cur = inst;
3059 inst = LLVMGetNextInstruction(inst);
3060 struct ac_vs_exp_inst exp;
3061
3062 if (LLVMGetInstructionOpcode(cur) != LLVMCall)
3063 continue;
3064
3065 LLVMValueRef callee = ac_llvm_get_called_value(cur);
3066
3067 if (!ac_llvm_is_function(callee))
3068 continue;
3069
3070 const char *name = LLVMGetValueName(callee);
3071 unsigned num_args = LLVMCountParams(callee);
3072
3073 /* Check if this is an export instruction. */
3074 if ((num_args != 9 && num_args != 8) ||
3075 (strcmp(name, "llvm.SI.export") &&
3076 strcmp(name, "llvm.amdgcn.exp.f32")))
3077 continue;
3078
3079 LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
3080 unsigned target = LLVMConstIntGetZExtValue(arg);
3081
3082 if (target < V_008DFC_SQ_EXP_PARAM)
3083 continue;
3084
3085 target -= V_008DFC_SQ_EXP_PARAM;
3086
3087 /* Parse the instruction. */
3088 memset(&exp, 0, sizeof(exp));
3089 exp.offset = target;
3090 exp.inst = cur;
3091
3092 for (unsigned i = 0; i < 4; i++) {
3093 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
3094
3095 exp.chan[i].value = v;
3096
3097 if (LLVMIsUndef(v)) {
3098 exp.chan[i].type = AC_IR_UNDEF;
3099 } else if (LLVMIsAConstantFP(v)) {
3100 LLVMBool loses_info;
3101 exp.chan[i].type = AC_IR_CONST;
3102 exp.chan[i].const_float =
3103 LLVMConstRealGetDouble(v, &loses_info);
3104 } else {
3105 exp.chan[i].type = AC_IR_VALUE;
3106 }
3107 }
3108
3109 /* Eliminate constant and duplicated PARAM exports. */
3110 if (ac_eliminate_const_output(vs_output_param_offset,
3111 num_outputs, &exp) ||
3112 ac_eliminate_duplicated_output(ctx,
3113 vs_output_param_offset,
3114 num_outputs, &exports,
3115 &exp)) {
3116 removed_any = true;
3117 } else {
3118 exports.exp[exports.num++] = exp;
3119 }
3120 }
3121 bb = LLVMGetNextBasicBlock(bb);
3122 }
3123
3124 /* Remove holes in export memory due to removed PARAM exports.
3125 * This is done by renumbering all PARAM exports.
3126 */
3127 if (removed_any) {
3128 uint8_t old_offset[VARYING_SLOT_MAX];
3129 unsigned out, i;
3130
3131 /* Make a copy of the offsets. We need the old version while
3132 * we are modifying some of them. */
3133 memcpy(old_offset, vs_output_param_offset,
3134 sizeof(old_offset));
3135
3136 for (i = 0; i < exports.num; i++) {
3137 unsigned offset = exports.exp[i].offset;
3138
3139 /* Update vs_output_param_offset. Multiple outputs can
3140 * have the same offset.
3141 */
3142 for (out = 0; out < num_outputs; out++) {
3143 if (old_offset[out] == offset)
3144 vs_output_param_offset[out] = i;
3145 }
3146
3147 /* Change the PARAM offset in the instruction. */
3148 LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
3149 LLVMConstInt(ctx->i32,
3150 V_008DFC_SQ_EXP_PARAM + i, 0));
3151 }
3152 *num_param_exports = exports.num;
3153 }
3154 }
3155
3156 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
3157 {
3158 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
3159 ac_build_intrinsic(ctx,
3160 "llvm.amdgcn.init.exec", ctx->voidt,
3161 &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
3162 }
3163
3164 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
3165 {
3166 unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
3167 ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
3168 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS),
3169 "lds");
3170 }
3171
3172 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
3173 LLVMValueRef dw_addr)
3174 {
3175 return LLVMBuildLoad(ctx->builder, ac_build_gep0(ctx, ctx->lds, dw_addr), "");
3176 }
3177
3178 void ac_lds_store(struct ac_llvm_context *ctx,
3179 LLVMValueRef dw_addr,
3180 LLVMValueRef value)
3181 {
3182 value = ac_to_integer(ctx, value);
3183 ac_build_indexed_store(ctx, ctx->lds,
3184 dw_addr, value);
3185 }
3186
3187 LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
3188 LLVMTypeRef dst_type,
3189 LLVMValueRef src0)
3190 {
3191 unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
3192 const char *intrin_name;
3193 LLVMTypeRef type;
3194 LLVMValueRef zero;
3195
3196 switch (src0_bitsize) {
3197 case 64:
3198 intrin_name = "llvm.cttz.i64";
3199 type = ctx->i64;
3200 zero = ctx->i64_0;
3201 break;
3202 case 32:
3203 intrin_name = "llvm.cttz.i32";
3204 type = ctx->i32;
3205 zero = ctx->i32_0;
3206 break;
3207 case 16:
3208 intrin_name = "llvm.cttz.i16";
3209 type = ctx->i16;
3210 zero = ctx->i16_0;
3211 break;
3212 case 8:
3213 intrin_name = "llvm.cttz.i8";
3214 type = ctx->i8;
3215 zero = ctx->i8_0;
3216 break;
3217 default:
3218 unreachable(!"invalid bitsize");
3219 }
3220
3221 LLVMValueRef params[2] = {
3222 src0,
3223
3224 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
3225 * add special code to check for x=0. The reason is that
3226 * the LLVM behavior for x=0 is different from what we
3227 * need here. However, LLVM also assumes that ffs(x) is
3228 * in [0, 31], but GLSL expects that ffs(0) = -1, so
3229 * a conditional assignment to handle 0 is still required.
3230 *
3231 * The hardware already implements the correct behavior.
3232 */
3233 ctx->i1true,
3234 };
3235
3236 LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
3237 params, 2,
3238 AC_FUNC_ATTR_READNONE);
3239
3240 if (src0_bitsize == 64) {
3241 lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
3242 } else if (src0_bitsize < 32) {
3243 lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
3244 }
3245
3246 /* TODO: We need an intrinsic to skip this conditional. */
3247 /* Check for zero: */
3248 return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
3249 LLVMIntEQ, src0,
3250 zero, ""),
3251 LLVMConstInt(ctx->i32, -1, 0), lsb, "");
3252 }
3253
3254 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
3255 {
3256 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
3257 }
3258
3259 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
3260 {
3261 return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
3262 }
3263
3264 static struct ac_llvm_flow *
3265 get_current_flow(struct ac_llvm_context *ctx)
3266 {
3267 if (ctx->flow->depth > 0)
3268 return &ctx->flow->stack[ctx->flow->depth - 1];
3269 return NULL;
3270 }
3271
3272 static struct ac_llvm_flow *
3273 get_innermost_loop(struct ac_llvm_context *ctx)
3274 {
3275 for (unsigned i = ctx->flow->depth; i > 0; --i) {
3276 if (ctx->flow->stack[i - 1].loop_entry_block)
3277 return &ctx->flow->stack[i - 1];
3278 }
3279 return NULL;
3280 }
3281
3282 static struct ac_llvm_flow *
3283 push_flow(struct ac_llvm_context *ctx)
3284 {
3285 struct ac_llvm_flow *flow;
3286
3287 if (ctx->flow->depth >= ctx->flow->depth_max) {
3288 unsigned new_max = MAX2(ctx->flow->depth << 1,
3289 AC_LLVM_INITIAL_CF_DEPTH);
3290
3291 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
3292 ctx->flow->depth_max = new_max;
3293 }
3294
3295 flow = &ctx->flow->stack[ctx->flow->depth];
3296 ctx->flow->depth++;
3297
3298 flow->next_block = NULL;
3299 flow->loop_entry_block = NULL;
3300 return flow;
3301 }
3302
3303 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
3304 int label_id)
3305 {
3306 char buf[32];
3307 snprintf(buf, sizeof(buf), "%s%d", base, label_id);
3308 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
3309 }
3310
3311 /* Append a basic block at the level of the parent flow.
3312 */
3313 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
3314 const char *name)
3315 {
3316 assert(ctx->flow->depth >= 1);
3317
3318 if (ctx->flow->depth >= 2) {
3319 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
3320
3321 return LLVMInsertBasicBlockInContext(ctx->context,
3322 flow->next_block, name);
3323 }
3324
3325 LLVMValueRef main_fn =
3326 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
3327 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
3328 }
3329
3330 /* Emit a branch to the given default target for the current block if
3331 * applicable -- that is, if the current block does not already contain a
3332 * branch from a break or continue.
3333 */
3334 static void emit_default_branch(LLVMBuilderRef builder,
3335 LLVMBasicBlockRef target)
3336 {
3337 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
3338 LLVMBuildBr(builder, target);
3339 }
3340
3341 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
3342 {
3343 struct ac_llvm_flow *flow = push_flow(ctx);
3344 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
3345 flow->next_block = append_basic_block(ctx, "ENDLOOP");
3346 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
3347 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3348 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
3349 }
3350
3351 void ac_build_break(struct ac_llvm_context *ctx)
3352 {
3353 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3354 LLVMBuildBr(ctx->builder, flow->next_block);
3355 }
3356
3357 void ac_build_continue(struct ac_llvm_context *ctx)
3358 {
3359 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
3360 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
3361 }
3362
3363 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
3364 {
3365 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3366 LLVMBasicBlockRef endif_block;
3367
3368 assert(!current_branch->loop_entry_block);
3369
3370 endif_block = append_basic_block(ctx, "ENDIF");
3371 emit_default_branch(ctx->builder, endif_block);
3372
3373 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3374 set_basicblock_name(current_branch->next_block, "else", label_id);
3375
3376 current_branch->next_block = endif_block;
3377 }
3378
3379 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
3380 {
3381 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
3382
3383 assert(!current_branch->loop_entry_block);
3384
3385 emit_default_branch(ctx->builder, current_branch->next_block);
3386 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
3387 set_basicblock_name(current_branch->next_block, "endif", label_id);
3388
3389 ctx->flow->depth--;
3390 }
3391
3392 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
3393 {
3394 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
3395
3396 assert(current_loop->loop_entry_block);
3397
3398 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
3399
3400 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
3401 set_basicblock_name(current_loop->next_block, "endloop", label_id);
3402 ctx->flow->depth--;
3403 }
3404
3405 void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
3406 {
3407 struct ac_llvm_flow *flow = push_flow(ctx);
3408 LLVMBasicBlockRef if_block;
3409
3410 if_block = append_basic_block(ctx, "IF");
3411 flow->next_block = append_basic_block(ctx, "ELSE");
3412 set_basicblock_name(if_block, "if", label_id);
3413 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
3414 LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3415 }
3416
3417 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
3418 int label_id)
3419 {
3420 LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
3421 value, ctx->f32_0, "");
3422 ac_build_ifcc(ctx, cond, label_id);
3423 }
3424
3425 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
3426 int label_id)
3427 {
3428 LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3429 ac_to_integer(ctx, value),
3430 ctx->i32_0, "");
3431 ac_build_ifcc(ctx, cond, label_id);
3432 }
3433
3434 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type,
3435 const char *name)
3436 {
3437 LLVMBuilderRef builder = ac->builder;
3438 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3439 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3440 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3441 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3442 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3443 LLVMValueRef res;
3444
3445 if (first_instr) {
3446 LLVMPositionBuilderBefore(first_builder, first_instr);
3447 } else {
3448 LLVMPositionBuilderAtEnd(first_builder, first_block);
3449 }
3450
3451 res = LLVMBuildAlloca(first_builder, type, name);
3452 LLVMDisposeBuilder(first_builder);
3453 return res;
3454 }
3455
3456 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac,
3457 LLVMTypeRef type, const char *name)
3458 {
3459 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3460 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3461 return ptr;
3462 }
3463
3464 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
3465 LLVMTypeRef type)
3466 {
3467 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
3468 return LLVMBuildBitCast(ctx->builder, ptr,
3469 LLVMPointerType(type, addr_space), "");
3470 }
3471
3472 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
3473 unsigned count)
3474 {
3475 unsigned num_components = ac_get_llvm_num_components(value);
3476 if (count == num_components)
3477 return value;
3478
3479 LLVMValueRef masks[MAX2(count, 2)];
3480 masks[0] = ctx->i32_0;
3481 masks[1] = ctx->i32_1;
3482 for (unsigned i = 2; i < count; i++)
3483 masks[i] = LLVMConstInt(ctx->i32, i, false);
3484
3485 if (count == 1)
3486 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
3487 "");
3488
3489 LLVMValueRef swizzle = LLVMConstVector(masks, count);
3490 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
3491 }
3492
3493 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
3494 unsigned rshift, unsigned bitwidth)
3495 {
3496 LLVMValueRef value = param;
3497 if (rshift)
3498 value = LLVMBuildLShr(ctx->builder, value,
3499 LLVMConstInt(ctx->i32, rshift, false), "");
3500
3501 if (rshift + bitwidth < 32) {
3502 unsigned mask = (1 << bitwidth) - 1;
3503 value = LLVMBuildAnd(ctx->builder, value,
3504 LLVMConstInt(ctx->i32, mask, false), "");
3505 }
3506 return value;
3507 }
3508
3509 /* Adjust the sample index according to FMASK.
3510 *
3511 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3512 * which is the identity mapping. Each nibble says which physical sample
3513 * should be fetched to get that sample.
3514 *
3515 * For example, 0x11111100 means there are only 2 samples stored and
3516 * the second sample covers 3/4 of the pixel. When reading samples 0
3517 * and 1, return physical sample 0 (determined by the first two 0s
3518 * in FMASK), otherwise return physical sample 1.
3519 *
3520 * The sample index should be adjusted as follows:
3521 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3522 */
3523 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
3524 LLVMValueRef *addr, bool is_array_tex)
3525 {
3526 struct ac_image_args fmask_load = {};
3527 fmask_load.opcode = ac_image_load;
3528 fmask_load.resource = fmask;
3529 fmask_load.dmask = 0xf;
3530 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3531 fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3532
3533 fmask_load.coords[0] = addr[0];
3534 fmask_load.coords[1] = addr[1];
3535 if (is_array_tex)
3536 fmask_load.coords[2] = addr[2];
3537
3538 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3539 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
3540 ac->i32_0, "");
3541
3542 /* Apply the formula. */
3543 unsigned sample_chan = is_array_tex ? 3 : 2;
3544 LLVMValueRef final_sample;
3545 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3546 LLVMConstInt(ac->i32, 4, 0), "");
3547 final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
3548 /* Mask the sample index by 0x7, because 0x8 means an unknown value
3549 * with EQAA, so those will map to 0. */
3550 final_sample = LLVMBuildAnd(ac->builder, final_sample,
3551 LLVMConstInt(ac->i32, 0x7, 0), "");
3552
3553 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3554 * resource descriptor is 0 (invalid).
3555 */
3556 LLVMValueRef tmp;
3557 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3558 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3559 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3560
3561 /* Replace the MSAA sample index. */
3562 addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
3563 addr[sample_chan], "");
3564 }
3565
3566 static LLVMValueRef
3567 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3568 {
3569 LLVMTypeRef type = LLVMTypeOf(src);
3570 LLVMValueRef result;
3571
3572 ac_build_optimization_barrier(ctx, &src);
3573
3574 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3575 if (lane)
3576 lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");
3577
3578 result = ac_build_intrinsic(ctx,
3579 lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
3580 ctx->i32, (LLVMValueRef []) { src, lane },
3581 lane == NULL ? 1 : 2,
3582 AC_FUNC_ATTR_READNONE |
3583 AC_FUNC_ATTR_CONVERGENT);
3584
3585 return LLVMBuildTrunc(ctx->builder, result, type, "");
3586 }
3587
3588 /**
3589 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3590 * @param ctx
3591 * @param src
3592 * @param lane - id of the lane or NULL for the first active lane
3593 * @return value of the lane
3594 */
3595 LLVMValueRef
3596 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
3597 {
3598 LLVMTypeRef src_type = LLVMTypeOf(src);
3599 src = ac_to_integer(ctx, src);
3600 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3601 LLVMValueRef ret;
3602
3603 if (bits > 32) {
3604 assert(bits % 32 == 0);
3605 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3606 LLVMValueRef src_vector =
3607 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3608 ret = LLVMGetUndef(vec_type);
3609 for (unsigned i = 0; i < bits / 32; i++) {
3610 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3611 LLVMConstInt(ctx->i32, i, 0), "");
3612 LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
3613 ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
3614 LLVMConstInt(ctx->i32, i, 0), "");
3615 }
3616 } else {
3617 ret = _ac_build_readlane(ctx, src, lane);
3618 }
3619
3620 if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
3621 return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
3622 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3623 }
3624
3625 LLVMValueRef
3626 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
3627 {
3628 return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
3629 (LLVMValueRef []) {value, lane, src}, 3,
3630 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3631 }
3632
3633 LLVMValueRef
3634 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
3635 {
3636 if (ctx->wave_size == 32) {
3637 return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3638 (LLVMValueRef []) { mask, ctx->i32_0 },
3639 2, AC_FUNC_ATTR_READNONE);
3640 }
3641 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
3642 LLVMVectorType(ctx->i32, 2),
3643 "");
3644 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
3645 ctx->i32_0, "");
3646 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
3647 ctx->i32_1, "");
3648 LLVMValueRef val =
3649 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
3650 (LLVMValueRef []) { mask_lo, ctx->i32_0 },
3651 2, AC_FUNC_ATTR_READNONE);
3652 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
3653 (LLVMValueRef []) { mask_hi, val },
3654 2, AC_FUNC_ATTR_READNONE);
3655 return val;
3656 }
3657
/* Values for the dpp_ctrl operand of the DPP intrinsic.
 *
 * The entries with a leading underscore are base values that the helper
 * functions below combine with a lane selection or an amount.
 */
enum dpp_ctrl {
	_dpp_quad_perm      = 0x000,
	_dpp_row_sl         = 0x100,
	_dpp_row_sr         = 0x110,
	_dpp_row_rr         = 0x120,
	dpp_wf_sl1          = 0x130,
	dpp_wf_rl1          = 0x134,
	dpp_wf_sr1          = 0x138,
	dpp_wf_rr1          = 0x13C,
	dpp_row_mirror      = 0x140,
	dpp_row_half_mirror = 0x141,
	dpp_row_bcast15     = 0x142,
	dpp_row_bcast31     = 0x143
};
3672
3673 static inline enum dpp_ctrl
3674 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3675 {
3676 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3677 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3678 }
3679
3680 static inline enum dpp_ctrl
3681 dpp_row_sl(unsigned amount)
3682 {
3683 assert(amount > 0 && amount < 16);
3684 return _dpp_row_sl | amount;
3685 }
3686
3687 static inline enum dpp_ctrl
3688 dpp_row_sr(unsigned amount)
3689 {
3690 assert(amount > 0 && amount < 16);
3691 return _dpp_row_sr | amount;
3692 }
3693
3694 static LLVMValueRef
3695 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3696 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3697 bool bound_ctrl)
3698 {
3699 return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
3700 LLVMTypeOf(old),
3701 (LLVMValueRef[]) {
3702 old, src,
3703 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
3704 LLVMConstInt(ctx->i32, row_mask, 0),
3705 LLVMConstInt(ctx->i32, bank_mask, 0),
3706 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
3707 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3708 }
3709
3710 static LLVMValueRef
3711 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
3712 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
3713 bool bound_ctrl)
3714 {
3715 LLVMTypeRef src_type = LLVMTypeOf(src);
3716 src = ac_to_integer(ctx, src);
3717 old = ac_to_integer(ctx, old);
3718 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3719 LLVMValueRef ret;
3720 if (bits == 32) {
3721 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
3722 bank_mask, bound_ctrl);
3723 } else {
3724 assert(bits % 32 == 0);
3725 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3726 LLVMValueRef src_vector =
3727 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3728 LLVMValueRef old_vector =
3729 LLVMBuildBitCast(ctx->builder, old, vec_type, "");
3730 ret = LLVMGetUndef(vec_type);
3731 for (unsigned i = 0; i < bits / 32; i++) {
3732 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3733 LLVMConstInt(ctx->i32, i,
3734 0), "");
3735 old = LLVMBuildExtractElement(ctx->builder, old_vector,
3736 LLVMConstInt(ctx->i32, i,
3737 0), "");
3738 LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
3739 dpp_ctrl,
3740 row_mask,
3741 bank_mask,
3742 bound_ctrl);
3743 ret = LLVMBuildInsertElement(ctx->builder, ret,
3744 ret_comp,
3745 LLVMConstInt(ctx->i32, i,
3746 0), "");
3747 }
3748 }
3749 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3750 }
3751
3752 static LLVMValueRef
3753 _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3754 bool exchange_rows, bool bound_ctrl)
3755 {
3756 LLVMValueRef args[6] = {
3757 src,
3758 src,
3759 LLVMConstInt(ctx->i32, sel, false),
3760 LLVMConstInt(ctx->i32, sel >> 32, false),
3761 ctx->i1true, /* fi */
3762 bound_ctrl ? ctx->i1true : ctx->i1false,
3763 };
3764 return ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16"
3765 : "llvm.amdgcn.permlane16",
3766 ctx->i32, args, 6,
3767 AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3768 }
3769
3770 static LLVMValueRef
3771 ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
3772 bool exchange_rows, bool bound_ctrl)
3773 {
3774 LLVMTypeRef src_type = LLVMTypeOf(src);
3775 src = ac_to_integer(ctx, src);
3776 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3777 LLVMValueRef ret;
3778 if (bits == 32) {
3779 ret = _ac_build_permlane16(ctx, src, sel, exchange_rows,
3780 bound_ctrl);
3781 } else {
3782 assert(bits % 32 == 0);
3783 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3784 LLVMValueRef src_vector =
3785 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3786 ret = LLVMGetUndef(vec_type);
3787 for (unsigned i = 0; i < bits / 32; i++) {
3788 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3789 LLVMConstInt(ctx->i32, i,
3790 0), "");
3791 LLVMValueRef ret_comp =
3792 _ac_build_permlane16(ctx, src, sel,
3793 exchange_rows,
3794 bound_ctrl);
3795 ret = LLVMBuildInsertElement(ctx->builder, ret,
3796 ret_comp,
3797 LLVMConstInt(ctx->i32, i,
3798 0), "");
3799 }
3800 }
3801 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3802 }
3803
/* Pack three 5-bit masks into one pattern value: and_mask in bits 0-4,
 * or_mask in bits 5-9 and xor_mask in bits 10-14.
 */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
	assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

	unsigned pattern = and_mask;

	pattern |= or_mask << 5;
	pattern |= xor_mask << 10;
	return pattern;
}
3810
3811 static LLVMValueRef
3812 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3813 {
3814 return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
3815 LLVMTypeOf(src), (LLVMValueRef []) {
3816 src, LLVMConstInt(ctx->i32, mask, 0) },
3817 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
3818 }
3819
3820 LLVMValueRef
3821 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
3822 {
3823 LLVMTypeRef src_type = LLVMTypeOf(src);
3824 src = ac_to_integer(ctx, src);
3825 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
3826 LLVMValueRef ret;
3827 if (bits == 32) {
3828 ret = _ac_build_ds_swizzle(ctx, src, mask);
3829 } else {
3830 assert(bits % 32 == 0);
3831 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
3832 LLVMValueRef src_vector =
3833 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
3834 ret = LLVMGetUndef(vec_type);
3835 for (unsigned i = 0; i < bits / 32; i++) {
3836 src = LLVMBuildExtractElement(ctx->builder, src_vector,
3837 LLVMConstInt(ctx->i32, i,
3838 0), "");
3839 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
3840 mask);
3841 ret = LLVMBuildInsertElement(ctx->builder, ret,
3842 ret_comp,
3843 LLVMConstInt(ctx->i32, i,
3844 0), "");
3845 }
3846 }
3847 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
3848 }
3849
3850 static LLVMValueRef
3851 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
3852 {
3853 char name[32], type[8];
3854 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3855 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
3856 return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
3857 (LLVMValueRef []) { src }, 1,
3858 AC_FUNC_ATTR_READNONE);
3859 }
3860
3861 static LLVMValueRef
3862 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
3863 LLVMValueRef inactive)
3864 {
3865 char name[33], type[8];
3866 LLVMTypeRef src_type = LLVMTypeOf(src);
3867 unsigned bitsize = ac_get_elem_bits(ctx, src_type);
3868 src = ac_to_integer(ctx, src);
3869 inactive = ac_to_integer(ctx, inactive);
3870
3871 if (bitsize < 32) {
3872 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
3873 inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
3874 }
3875
3876 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
3877 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
3878 LLVMValueRef ret =
3879 ac_build_intrinsic(ctx, name,
3880 LLVMTypeOf(src), (LLVMValueRef []) {
3881 src, inactive }, 2,
3882 AC_FUNC_ATTR_READNONE |
3883 AC_FUNC_ATTR_CONVERGENT);
3884 if (bitsize < 32)
3885 ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");
3886
3887 return ret;
3888 }
3889
3890 static LLVMValueRef
3891 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
3892 {
3893 if (type_size == 4) {
3894 switch (op) {
3895 case nir_op_iadd: return ctx->i32_0;
3896 case nir_op_fadd: return ctx->f32_0;
3897 case nir_op_imul: return ctx->i32_1;
3898 case nir_op_fmul: return ctx->f32_1;
3899 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3900 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3901 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
3902 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3903 case nir_op_umax: return ctx->i32_0;
3904 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
3905 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
3906 case nir_op_ior: return ctx->i32_0;
3907 case nir_op_ixor: return ctx->i32_0;
3908 default:
3909 unreachable("bad reduction intrinsic");
3910 }
3911 } else { /* type_size == 64bit */
3912 switch (op) {
3913 case nir_op_iadd: return ctx->i64_0;
3914 case nir_op_fadd: return ctx->f64_0;
3915 case nir_op_imul: return ctx->i64_1;
3916 case nir_op_fmul: return ctx->f64_1;
3917 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3918 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3919 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
3920 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3921 case nir_op_umax: return ctx->i64_0;
3922 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
3923 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
3924 case nir_op_ior: return ctx->i64_0;
3925 case nir_op_ixor: return ctx->i64_0;
3926 default:
3927 unreachable("bad reduction intrinsic");
3928 }
3929 }
3930 }
3931
3932 static LLVMValueRef
3933 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
3934 {
3935 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3936 switch (op) {
3937 case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3938 case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3939 case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3940 case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3941 case nir_op_imin: return LLVMBuildSelect(ctx->builder,
3942 LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3943 lhs, rhs, "");
3944 case nir_op_umin: return LLVMBuildSelect(ctx->builder,
3945 LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3946 lhs, rhs, "");
3947 case nir_op_fmin: return ac_build_intrinsic(ctx,
3948 _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
3949 _64bit ? ctx->f64 : ctx->f32,
3950 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3951 case nir_op_imax: return LLVMBuildSelect(ctx->builder,
3952 LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3953 lhs, rhs, "");
3954 case nir_op_umax: return LLVMBuildSelect(ctx->builder,
3955 LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3956 lhs, rhs, "");
3957 case nir_op_fmax: return ac_build_intrinsic(ctx,
3958 _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
3959 _64bit ? ctx->f64 : ctx->f32,
3960 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3961 case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3962 case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3963 case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3964 default:
3965 unreachable("bad reduction intrinsic");
3966 }
3967 }
3968
3969 /**
3970 * \param maxprefix specifies that the result only needs to be correct for a
3971 * prefix of this many threads
3972 *
3973 * TODO: add inclusive and excluse scan functions for GFX6.
3974 */
3975 static LLVMValueRef
3976 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity,
3977 unsigned maxprefix, bool inclusive)
3978 {
3979 LLVMValueRef result, tmp;
3980
3981 if (ctx->chip_class >= GFX10) {
3982 result = inclusive ? src : identity;
3983 } else {
3984 if (!inclusive)
3985 src = ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3986 result = src;
3987 }
3988 if (maxprefix <= 1)
3989 return result;
3990 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3991 result = ac_build_alu_op(ctx, result, tmp, op);
3992 if (maxprefix <= 2)
3993 return result;
3994 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3995 result = ac_build_alu_op(ctx, result, tmp, op);
3996 if (maxprefix <= 3)
3997 return result;
3998 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3999 result = ac_build_alu_op(ctx, result, tmp, op);
4000 if (maxprefix <= 4)
4001 return result;
4002 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
4003 result = ac_build_alu_op(ctx, result, tmp, op);
4004 if (maxprefix <= 8)
4005 return result;
4006 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
4007 result = ac_build_alu_op(ctx, result, tmp, op);
4008 if (maxprefix <= 16)
4009 return result;
4010
4011 if (ctx->chip_class >= GFX10) {
4012 /* dpp_row_bcast{15,31} are not supported on gfx10. */
4013 LLVMBuilderRef builder = ctx->builder;
4014 LLVMValueRef tid = ac_get_thread_id(ctx);
4015 LLVMValueRef cc;
4016 /* TODO-GFX10: Can we get better code-gen by putting this into
4017 * a branch so that LLVM generates EXEC mask manipulations? */
4018 if (inclusive)
4019 tmp = result;
4020 else
4021 tmp = ac_build_alu_op(ctx, result, src, op);
4022 tmp = ac_build_permlane16(ctx, tmp, ~(uint64_t)0, true, false);
4023 tmp = ac_build_alu_op(ctx, result, tmp, op);
4024 cc = LLVMBuildAnd(builder, tid, LLVMConstInt(ctx->i32, 16, false), "");
4025 cc = LLVMBuildICmp(builder, LLVMIntNE, cc, ctx->i32_0, "");
4026 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4027 if (maxprefix <= 32)
4028 return result;
4029
4030 if (inclusive)
4031 tmp = result;
4032 else
4033 tmp = ac_build_alu_op(ctx, result, src, op);
4034 tmp = ac_build_readlane(ctx, tmp, LLVMConstInt(ctx->i32, 31, false));
4035 tmp = ac_build_alu_op(ctx, result, tmp, op);
4036 cc = LLVMBuildICmp(builder, LLVMIntUGE, tid,
4037 LLVMConstInt(ctx->i32, 32, false), "");
4038 result = LLVMBuildSelect(builder, cc, tmp, result, "");
4039 return result;
4040 }
4041
4042 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4043 result = ac_build_alu_op(ctx, result, tmp, op);
4044 if (maxprefix <= 32)
4045 return result;
4046 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4047 result = ac_build_alu_op(ctx, result, tmp, op);
4048 return result;
4049 }
4050
4051 LLVMValueRef
4052 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4053 {
4054 LLVMValueRef result;
4055
4056 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4057 LLVMBuilderRef builder = ctx->builder;
4058 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4059 result = ac_build_ballot(ctx, src);
4060 result = ac_build_mbcnt(ctx, result);
4061 result = LLVMBuildAdd(builder, result, src, "");
4062 return result;
4063 }
4064
4065 ac_build_optimization_barrier(ctx, &src);
4066
4067 LLVMValueRef identity =
4068 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4069 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4070 LLVMTypeOf(identity), "");
4071 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, true);
4072
4073 return ac_build_wwm(ctx, result);
4074 }
4075
4076 LLVMValueRef
4077 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
4078 {
4079 LLVMValueRef result;
4080
4081 if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
4082 LLVMBuilderRef builder = ctx->builder;
4083 src = LLVMBuildZExt(builder, src, ctx->i32, "");
4084 result = ac_build_ballot(ctx, src);
4085 result = ac_build_mbcnt(ctx, result);
4086 return result;
4087 }
4088
4089 ac_build_optimization_barrier(ctx, &src);
4090
4091 LLVMValueRef identity =
4092 get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
4093 result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
4094 LLVMTypeOf(identity), "");
4095 result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);
4096
4097 return ac_build_wwm(ctx, result);
4098 }
4099
4100 LLVMValueRef
4101 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
4102 {
4103 if (cluster_size == 1) return src;
4104 ac_build_optimization_barrier(ctx, &src);
4105 LLVMValueRef result, swap;
4106 LLVMValueRef identity = get_reduction_identity(ctx, op,
4107 ac_get_type_size(LLVMTypeOf(src)));
4108 result = LLVMBuildBitCast(ctx->builder,
4109 ac_build_set_inactive(ctx, src, identity),
4110 LLVMTypeOf(identity), "");
4111 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
4112 result = ac_build_alu_op(ctx, result, swap, op);
4113 if (cluster_size == 2) return ac_build_wwm(ctx, result);
4114
4115 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
4116 result = ac_build_alu_op(ctx, result, swap, op);
4117 if (cluster_size == 4) return ac_build_wwm(ctx, result);
4118
4119 if (ctx->chip_class >= GFX8)
4120 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
4121 else
4122 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
4123 result = ac_build_alu_op(ctx, result, swap, op);
4124 if (cluster_size == 8) return ac_build_wwm(ctx, result);
4125
4126 if (ctx->chip_class >= GFX8)
4127 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
4128 else
4129 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
4130 result = ac_build_alu_op(ctx, result, swap, op);
4131 if (cluster_size == 16) return ac_build_wwm(ctx, result);
4132
4133 if (ctx->chip_class >= GFX10)
4134 swap = ac_build_permlane16(ctx, result, 0, true, false);
4135 else if (ctx->chip_class >= GFX8 && cluster_size != 32)
4136 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
4137 else
4138 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
4139 result = ac_build_alu_op(ctx, result, swap, op);
4140 if (cluster_size == 32) return ac_build_wwm(ctx, result);
4141
4142 if (ctx->chip_class >= GFX8) {
4143 if (ctx->chip_class >= GFX10)
4144 swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
4145 else
4146 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
4147 result = ac_build_alu_op(ctx, result, swap, op);
4148 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
4149 return ac_build_wwm(ctx, result);
4150 } else {
4151 swap = ac_build_readlane(ctx, result, ctx->i32_0);
4152 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
4153 result = ac_build_alu_op(ctx, result, swap, op);
4154 return ac_build_wwm(ctx, result);
4155 }
4156 }
4157
4158 /**
4159 * "Top half" of a scan that reduces per-wave values across an entire
4160 * workgroup.
4161 *
4162 * The source value must be present in the highest lane of the wave, and the
4163 * highest lane must be live.
4164 */
4165 void
4166 ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4167 {
4168 if (ws->maxwaves <= 1)
4169 return;
4170
4171 const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4172 LLVMBuilderRef builder = ctx->builder;
4173 LLVMValueRef tid = ac_get_thread_id(ctx);
4174 LLVMValueRef tmp;
4175
4176 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4177 ac_build_ifcc(ctx, tmp, 1000);
4178 LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4179 ac_build_endif(ctx, 1000);
4180 }
4181
4182 /**
4183 * "Bottom half" of a scan that reduces per-wave values across an entire
4184 * workgroup.
4185 *
4186 * The caller must place a barrier between the top and bottom halves.
4187 */
4188 void
4189 ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4190 {
4191 const LLVMTypeRef type = LLVMTypeOf(ws->src);
4192 const LLVMValueRef identity =
4193 get_reduction_identity(ctx, ws->op, ac_get_type_size(type));
4194
4195 if (ws->maxwaves <= 1) {
4196 ws->result_reduce = ws->src;
4197 ws->result_inclusive = ws->src;
4198 ws->result_exclusive = identity;
4199 return;
4200 }
4201 assert(ws->maxwaves <= 32);
4202
4203 LLVMBuilderRef builder = ctx->builder;
4204 LLVMValueRef tid = ac_get_thread_id(ctx);
4205 LLVMBasicBlockRef bbs[2];
4206 LLVMValueRef phivalues_scan[2];
4207 LLVMValueRef tmp, tmp2;
4208
4209 bbs[0] = LLVMGetInsertBlock(builder);
4210 phivalues_scan[0] = LLVMGetUndef(type);
4211
4212 if (ws->enable_reduce)
4213 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
4214 else if (ws->enable_inclusive)
4215 tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
4216 else
4217 tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
4218 ac_build_ifcc(ctx, tmp, 1001);
4219 {
4220 tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");
4221
4222 ac_build_optimization_barrier(ctx, &tmp);
4223
4224 bbs[1] = LLVMGetInsertBlock(builder);
4225 phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
4226 }
4227 ac_build_endif(ctx, 1001);
4228
4229 const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);
4230
4231 if (ws->enable_reduce) {
4232 tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
4233 ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
4234 }
4235 if (ws->enable_inclusive)
4236 ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
4237 if (ws->enable_exclusive) {
4238 tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
4239 tmp = ac_build_readlane(ctx, scan, tmp);
4240 tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
4241 ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
4242 }
4243 }
4244
/**
 * Inclusive scan of a per-wave value across an entire workgroup.
 *
 * Emits the top half, an s_barrier, and then the bottom half, in that
 * order.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void
ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
	/* Top half. */
	ac_build_wg_wavescan_top(ctx, ws);

	/* The barrier separates the top half from the bottom half. */
	ac_build_s_barrier(ctx);

	/* Bottom half. */
	ac_build_wg_wavescan_bottom(ctx, ws);
}
4261
4262 /**
4263 * "Top half" of a scan that reduces per-thread values across an entire
4264 * workgroup.
4265 *
4266 * All lanes must be active when this code runs.
4267 */
4268 void
4269 ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4270 {
4271 if (ws->enable_exclusive) {
4272 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4273 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4274 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4275 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4276 } else {
4277 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4278 }
4279
4280 bool enable_inclusive = ws->enable_inclusive;
4281 bool enable_exclusive = ws->enable_exclusive;
4282 ws->enable_inclusive = false;
4283 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4284 ac_build_wg_wavescan_top(ctx, ws);
4285 ws->enable_inclusive = enable_inclusive;
4286 ws->enable_exclusive = enable_exclusive;
4287 }
4288
4289 /**
4290 * "Bottom half" of a scan that reduces per-thread values across an entire
4291 * workgroup.
4292 *
4293 * The caller must place a barrier between the top and bottom halves.
4294 */
4295 void
4296 ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4297 {
4298 bool enable_inclusive = ws->enable_inclusive;
4299 bool enable_exclusive = ws->enable_exclusive;
4300 ws->enable_inclusive = false;
4301 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4302 ac_build_wg_wavescan_bottom(ctx, ws);
4303 ws->enable_inclusive = enable_inclusive;
4304 ws->enable_exclusive = enable_exclusive;
4305
4306 /* ws->result_reduce is already the correct value */
4307 if (ws->enable_inclusive)
4308 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4309 if (ws->enable_exclusive)
4310 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4311 }
4312
/**
 * A scan that reduces per-thread values across an entire workgroup.
 *
 * Emits the top half, an s_barrier, and then the bottom half, in that
 * order.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void
ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
	/* Top half. */
	ac_build_wg_scan_top(ctx, ws);

	/* The barrier separates the top half from the bottom half. */
	ac_build_s_barrier(ctx);

	/* Bottom half. */
	ac_build_wg_scan_bottom(ctx, ws);
}
4326
4327 LLVMValueRef
4328 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
4329 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
4330 {
4331 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
4332 if (ctx->chip_class >= GFX8) {
4333 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
4334 } else {
4335 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
4336 }
4337 }
4338
4339 LLVMValueRef
4340 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
4341 {
4342 LLVMTypeRef type = LLVMTypeOf(src);
4343 LLVMValueRef result;
4344
4345 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
4346 src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
4347
4348 result = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32,
4349 (LLVMValueRef []) {index, src}, 2,
4350 AC_FUNC_ATTR_READNONE |
4351 AC_FUNC_ATTR_CONVERGENT);
4352 return LLVMBuildTrunc(ctx->builder, result, type, "");
4353 }
4354
4355 LLVMValueRef
4356 ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0,
4357 unsigned bitsize)
4358 {
4359 LLVMTypeRef type;
4360 char *intr;
4361
4362 if (bitsize == 16) {
4363 intr = "llvm.amdgcn.frexp.exp.i16.f16";
4364 type = ctx->i16;
4365 } else if (bitsize == 32) {
4366 intr = "llvm.amdgcn.frexp.exp.i32.f32";
4367 type = ctx->i32;
4368 } else {
4369 intr = "llvm.amdgcn.frexp.exp.i32.f64";
4370 type = ctx->i32;
4371 }
4372
4373 LLVMValueRef params[] = {
4374 src0,
4375 };
4376 return ac_build_intrinsic(ctx, intr, type, params, 1,
4377 AC_FUNC_ATTR_READNONE);
4378 }
4379 LLVMValueRef
4380 ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0,
4381 unsigned bitsize)
4382 {
4383 LLVMTypeRef type;
4384 char *intr;
4385
4386 if (bitsize == 16) {
4387 intr = "llvm.amdgcn.frexp.mant.f16";
4388 type = ctx->f16;
4389 } else if (bitsize == 32) {
4390 intr = "llvm.amdgcn.frexp.mant.f32";
4391 type = ctx->f32;
4392 } else {
4393 intr = "llvm.amdgcn.frexp.mant.f64";
4394 type = ctx->f64;
4395 }
4396
4397 LLVMValueRef params[] = {
4398 src0,
4399 };
4400 return ac_build_intrinsic(ctx, intr, type, params, 1,
4401 AC_FUNC_ATTR_READNONE);
4402 }
4403
4404 LLVMValueRef
4405 ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0,
4406 unsigned bitsize)
4407 {
4408 LLVMTypeRef type;
4409 char *intr;
4410
4411 if (bitsize == 16) {
4412 intr = "llvm.canonicalize.f16";
4413 type = ctx->f16;
4414 } else if (bitsize == 32) {
4415 intr = "llvm.canonicalize.f32";
4416 type = ctx->f32;
4417 } else if (bitsize == 64) {
4418 intr = "llvm.canonicalize.f64";
4419 type = ctx->f64;
4420 }
4421
4422 LLVMValueRef params[] = {
4423 src0,
4424 };
4425 return ac_build_intrinsic(ctx, intr, type, params, 1,
4426 AC_FUNC_ATTR_READNONE);
4427 }
4428
4429 /*
4430 * this takes an I,J coordinate pair,
4431 * and works out the X and Y derivatives.
4432 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4433 */
4434 LLVMValueRef
4435 ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
4436 {
4437 LLVMValueRef result[4], a;
4438 unsigned i;
4439
4440 for (i = 0; i < 2; i++) {
4441 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
4442 LLVMConstInt(ctx->i32, i, false), "");
4443 result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
4444 result[2+i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
4445 }
4446 return ac_build_gather_values(ctx, result, 4);
4447 }
4448
4449 LLVMValueRef
4450 ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4451 {
4452 LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live",
4453 ctx->i1, NULL, 0,
4454 AC_FUNC_ATTR_READNONE);
4455 result = LLVMBuildNot(ctx->builder, result, "");
4456 return LLVMBuildSExt(ctx->builder, result, ctx->i32, "");
4457 }
4458
4459 LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func,
4460 LLVMValueRef *args, unsigned num_args)
4461 {
4462 LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
4463 LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
4464 return ret;
4465 }
4466
4467 void
4468 ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth,
4469 LLVMValueRef stencil, LLVMValueRef samplemask,
4470 struct ac_export_args *args)
4471 {
4472 unsigned mask = 0;
4473 unsigned format = ac_get_spi_shader_z_format(depth != NULL,
4474 stencil != NULL,
4475 samplemask != NULL);
4476
4477 assert(depth || stencil || samplemask);
4478
4479 memset(args, 0, sizeof(*args));
4480
4481 args->valid_mask = 1; /* whether the EXEC mask is valid */
4482 args->done = 1; /* DONE bit */
4483
4484 /* Specify the target we are exporting */
4485 args->target = V_008DFC_SQ_EXP_MRTZ;
4486
4487 args->compr = 0; /* COMP flag */
4488 args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
4489 args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
4490 args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
4491 args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */
4492
4493 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
4494 assert(!depth);
4495 args->compr = 1; /* COMPR flag */
4496
4497 if (stencil) {
4498 /* Stencil should be in X[23:16]. */
4499 stencil = ac_to_integer(ctx, stencil);
4500 stencil = LLVMBuildShl(ctx->builder, stencil,
4501 LLVMConstInt(ctx->i32, 16, 0), "");
4502 args->out[0] = ac_to_float(ctx, stencil);
4503 mask |= 0x3;
4504 }
4505 if (samplemask) {
4506 /* SampleMask should be in Y[15:0]. */
4507 args->out[1] = samplemask;
4508 mask |= 0xc;
4509 }
4510 } else {
4511 if (depth) {
4512 args->out[0] = depth;
4513 mask |= 0x1;
4514 }
4515 if (stencil) {
4516 args->out[1] = stencil;
4517 mask |= 0x2;
4518 }
4519 if (samplemask) {
4520 args->out[2] = samplemask;
4521 mask |= 0x4;
4522 }
4523 }
4524
4525 /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
4526 * at the X writemask component. */
4527 if (ctx->chip_class == GFX6 &&
4528 ctx->family != CHIP_OLAND &&
4529 ctx->family != CHIP_HAINAN)
4530 mask |= 0x1;
4531
4532 /* Specify which components to enable */
4533 args->enabled_channels = mask;
4534 }
4535