amd/common: round cube array slice in ac_prepare_cube_coords
[mesa.git] src/amd/common/ac_llvm_build.c
/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 */
/* based on pieces from si_pipe.c and radeon_llvm_emit.c */
#include "ac_llvm_build.h"

#include <llvm-c/Core.h>

#include "c11/threads.h"

#include <assert.h>
#include <stdio.h>

#include "ac_llvm_util.h"
#include "ac_exp_param.h"
#include "util/bitscan.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "sid.h"

#include "shader_enums.h"

/* Initialize module-independent parts of the context.
 *
 * The caller is responsible for initializing ctx::module and ctx::builder.
 */
void
ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context)
{
	LLVMValueRef args[1];

	ctx->context = context;
	ctx->module = NULL;
	ctx->builder = NULL;

	ctx->voidt = LLVMVoidTypeInContext(ctx->context);
	ctx->i1 = LLVMInt1TypeInContext(ctx->context);
	ctx->i8 = LLVMInt8TypeInContext(ctx->context);
	ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
	ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
	ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
	ctx->f16 = LLVMHalfTypeInContext(ctx->context);
	ctx->f32 = LLVMFloatTypeInContext(ctx->context);
	ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
	ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
	ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
	ctx->v8i32 = LLVMVectorType(ctx->i32, 8);

	ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
	ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
	ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
	ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);

	ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
						      "range", 5);

	ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
							       "invariant.load", 14);

	ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);

	args[0] = LLVMConstReal(ctx->f32, 2.5);
	ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);

	ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
							"amdgpu.uniform", 14);

	ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
}
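
/* Usage sketch (illustrative, not part of this file): the module and builder
 * are created by the caller, e.g.:
 *
 *   struct ac_llvm_context ac;
 *   ac_llvm_context_init(&ac, LLVMContextCreate());
 *   ac.module = LLVMModuleCreateWithNameInContext("shader", ac.context);
 *   ac.builder = LLVMCreateBuilderInContext(ac.context);
 */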

unsigned
ac_get_type_size(LLVMTypeRef type)
{
	LLVMTypeKind kind = LLVMGetTypeKind(type);

	switch (kind) {
	case LLVMIntegerTypeKind:
		return LLVMGetIntTypeWidth(type) / 8;
	case LLVMFloatTypeKind:
		return 4;
	case LLVMDoubleTypeKind:
	case LLVMPointerTypeKind:
		return 8;
	case LLVMVectorTypeKind:
		return LLVMGetVectorSize(type) *
		       ac_get_type_size(LLVMGetElementType(type));
	case LLVMArrayTypeKind:
		return LLVMGetArrayLength(type) *
		       ac_get_type_size(LLVMGetElementType(type));
	default:
		assert(0);
		return 0;
	}
}
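
/* For example, ac_get_type_size(ctx->f32) == 4 and
 * ac_get_type_size(ctx->v4i32) == 16. Note that all pointers are counted as
 * 8 bytes, regardless of address space.
 */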

static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
	if (t == ctx->f16 || t == ctx->i16)
		return ctx->i16;
	else if (t == ctx->f32 || t == ctx->i32)
		return ctx->i32;
	else if (t == ctx->f64 || t == ctx->i64)
		return ctx->i64;
	else
		unreachable("Unhandled integer size");
}

LLVMTypeRef
ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
	if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
		LLVMTypeRef elem_type = LLVMGetElementType(t);
		return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
				      LLVMGetVectorSize(t));
	}
	return to_integer_type_scalar(ctx, t);
}

LLVMValueRef
ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
	LLVMTypeRef type = LLVMTypeOf(v);
	return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
}

static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
	if (t == ctx->i16 || t == ctx->f16)
		return ctx->f16;
	else if (t == ctx->i32 || t == ctx->f32)
		return ctx->f32;
	else if (t == ctx->i64 || t == ctx->f64)
		return ctx->f64;
	else
		unreachable("Unhandled float size");
}

LLVMTypeRef
ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
	if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
		LLVMTypeRef elem_type = LLVMGetElementType(t);
		return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
				      LLVMGetVectorSize(t));
	}
	return to_float_type_scalar(ctx, t);
}

LLVMValueRef
ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
	LLVMTypeRef type = LLVMTypeOf(v);
	return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
}
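
/* Both directions are pure bitcasts, so ac_to_integer(ctx, ac_to_float(ctx, v))
 * preserves the bit pattern of v; e.g. the f32 value 1.0 becomes the i32
 * 0x3f800000 and converts back unchanged.
 */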


LLVMValueRef
ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
		   LLVMTypeRef return_type, LLVMValueRef *params,
		   unsigned param_count, unsigned attrib_mask)
{
	LLVMValueRef function, call;
	bool set_callsite_attrs = HAVE_LLVM >= 0x0400 &&
				  !(attrib_mask & AC_FUNC_ATTR_LEGACY);

	function = LLVMGetNamedFunction(ctx->module, name);
	if (!function) {
		LLVMTypeRef param_types[32], function_type;
		unsigned i;

		assert(param_count <= 32);

		for (i = 0; i < param_count; ++i) {
			assert(params[i]);
			param_types[i] = LLVMTypeOf(params[i]);
		}
		function_type =
			LLVMFunctionType(return_type, param_types, param_count, 0);
		function = LLVMAddFunction(ctx->module, name, function_type);

		LLVMSetFunctionCallConv(function, LLVMCCallConv);
		LLVMSetLinkage(function, LLVMExternalLinkage);

		if (!set_callsite_attrs)
			ac_add_func_attributes(ctx->context, function, attrib_mask);
	}

	call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
	if (set_callsite_attrs)
		ac_add_func_attributes(ctx->context, call, attrib_mask);
	return call;
}
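
/* Example (illustrative): calling an overloaded intrinsic. The declaration is
 * added to the module on first use and reused afterwards:
 *
 *   LLVMValueRef abs_x = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32,
 *                                           &x, 1, AC_FUNC_ATTR_READNONE);
 *
 * where x is a placeholder for some f32 value.
 */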

/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names).
 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
	LLVMTypeRef elem_type = type;

	assert(bufsize >= 8);

	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
		int ret = snprintf(buf, bufsize, "v%u",
				   LLVMGetVectorSize(type));
		if (ret < 0) {
			char *type_name = LLVMPrintTypeToString(type);
			fprintf(stderr, "Error building type name for: %s\n",
				type_name);
			return;
		}
		elem_type = LLVMGetElementType(type);
		buf += ret;
		bufsize -= ret;
	}
	switch (LLVMGetTypeKind(elem_type)) {
	default: break;
	case LLVMIntegerTypeKind:
		snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
		break;
	case LLVMFloatTypeKind:
		snprintf(buf, bufsize, "f32");
		break;
	case LLVMDoubleTypeKind:
		snprintf(buf, bufsize, "f64");
		break;
	}
}

/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
void
ac_build_optimization_barrier(struct ac_llvm_context *ctx,
			      LLVMValueRef *pvgpr)
{
	static int counter = 0;

	LLVMBuilderRef builder = ctx->builder;
	char code[16];

	snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

	if (!pvgpr) {
		LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
		LLVMBuildCall(builder, inlineasm, NULL, 0, "");
	} else {
		LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
		LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
		LLVMValueRef vgpr = *pvgpr;
		LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
		unsigned vgpr_size = ac_get_type_size(vgpr_type);
		LLVMValueRef vgpr0;

		assert(vgpr_size % 4 == 0);

		vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
		vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
		vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
		vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
		vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

		*pvgpr = vgpr;
	}
}

LLVMValueRef
ac_build_ballot(struct ac_llvm_context *ctx,
		LLVMValueRef value)
{
	LLVMValueRef args[3] = {
		value,
		ctx->i32_0,
		LLVMConstInt(ctx->i32, LLVMIntNE, 0)
	};

	/* We currently have no other way to prevent LLVM from lifting the icmp
	 * calls to a dominating basic block.
	 */
	ac_build_optimization_barrier(ctx, &args[0]);

	if (LLVMTypeOf(args[0]) != ctx->i32)
		args[0] = LLVMBuildBitCast(ctx->builder, args[0], ctx->i32, "");

	return ac_build_intrinsic(ctx,
				  "llvm.amdgcn.icmp.i32",
				  ctx->i64, args, 3,
				  AC_FUNC_ATTR_NOUNWIND |
				  AC_FUNC_ATTR_READNONE |
				  AC_FUNC_ATTR_CONVERGENT);
}

LLVMValueRef
ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
	LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
	LLVMValueRef vote_set = ac_build_ballot(ctx, value);
	return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
}

LLVMValueRef
ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
	LLVMValueRef vote_set = ac_build_ballot(ctx, value);
	return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
			     LLVMConstInt(ctx->i64, 0, 0), "");
}

LLVMValueRef
ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
	LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
	LLVMValueRef vote_set = ac_build_ballot(ctx, value);

	LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
					 vote_set, active_set, "");
	LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
					  vote_set,
					  LLVMConstInt(ctx->i64, 0, 0), "");
	return LLVMBuildOr(ctx->builder, all, none, "");
}
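
/* In other words, with ballot(v) = the bitmask of active lanes where v is
 * true: vote_all(v) holds iff every active lane passed true, vote_any(v) iff
 * at least one did, and vote_eq(v) iff all active lanes agree (all true or
 * all false).
 */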

LLVMValueRef
ac_build_gather_values_extended(struct ac_llvm_context *ctx,
				LLVMValueRef *values,
				unsigned value_count,
				unsigned value_stride,
				bool load,
				bool always_vector)
{
	LLVMBuilderRef builder = ctx->builder;
	LLVMValueRef vec = NULL;
	unsigned i;

	if (value_count == 1 && !always_vector) {
		if (load)
			return LLVMBuildLoad(builder, values[0], "");
		return values[0];
	} else if (!value_count)
		unreachable("value_count is 0");

	for (i = 0; i < value_count; i++) {
		LLVMValueRef value = values[i * value_stride];
		if (load)
			value = LLVMBuildLoad(builder, value, "");

		if (!i)
			vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(value), value_count));
		LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
		vec = LLVMBuildInsertElement(builder, vec, value, index, "");
	}
	return vec;
}

LLVMValueRef
ac_build_gather_values(struct ac_llvm_context *ctx,
		       LLVMValueRef *values,
		       unsigned value_count)
{
	return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}

LLVMValueRef
ac_build_fdiv(struct ac_llvm_context *ctx,
	      LLVMValueRef num,
	      LLVMValueRef den)
{
	LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, "");

	if (!LLVMIsConstant(ret))
		LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
	return ret;
}
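
/* The 2.5 ULP fpmath metadata tells LLVM that this division may be lowered
 * to the faster v_rcp_f32-based sequence instead of a precise divide. A
 * reciprocal is simply:
 *
 *   LLVMValueRef rcp = ac_build_fdiv(ctx, ctx->f32_1, x);
 *
 * with x a placeholder for the f32 denominator.
 */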

/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
	LLVMValueRef stc[2];
	LLVMValueRef ma;
	LLVMValueRef id;
};

static void
build_cube_intrinsic(struct ac_llvm_context *ctx,
		     LLVMValueRef in[3],
		     struct cube_selection_coords *out)
{
	LLVMTypeRef f32 = ctx->f32;

	out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
					 f32, in, 3, AC_FUNC_ATTR_READNONE);
	out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
					 f32, in, 3, AC_FUNC_ATTR_READNONE);
	out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
				     f32, in, 3, AC_FUNC_ATTR_READNONE);
	out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
				     f32, in, 3, AC_FUNC_ATTR_READNONE);
}

/**
 * Build a manual selection sequence for cube face sc/tc coordinates and
 * major axis vector (multiplied by 2 for consistency) for the given
 * vec3 \p coords, for the face implied by \p selcoords.
 *
 * For the major axis, we always adjust the sign to be in the direction of
 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 * the selcoords major axis.
 */
static void build_cube_select(LLVMBuilderRef builder,
			      const struct cube_selection_coords *selcoords,
			      const LLVMValueRef *coords,
			      LLVMValueRef *out_st,
			      LLVMValueRef *out_ma)
{
	LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
	LLVMValueRef is_ma_positive;
	LLVMValueRef sgn_ma;
	LLVMValueRef is_ma_z, is_not_ma_z;
	LLVMValueRef is_ma_y;
	LLVMValueRef is_ma_x;
	LLVMValueRef sgn;
	LLVMValueRef tmp;

	is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
				       selcoords->ma, LLVMConstReal(f32, 0.0), "");
	sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
				 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");

	is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
	is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
	is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
			       LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
	is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

	/* Select sc */
	tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], coords[0], "");
	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
			      LLVMBuildSelect(builder, is_ma_x, sgn_ma,
					      LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
	out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

	/* Select tc */
	tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
	sgn = LLVMBuildSelect(builder, is_ma_y, LLVMBuildFNeg(builder, sgn_ma, ""),
			      LLVMConstReal(f32, -1.0), "");
	out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

	/* Select ma */
	tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
			      LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
	sgn = LLVMBuildSelect(builder, is_ma_positive,
			      LLVMConstReal(f32, 2.0), LLVMConstReal(f32, -2.0), "");
	*out_ma = LLVMBuildFMul(builder, tmp, sgn, "");
}

void
ac_prepare_cube_coords(struct ac_llvm_context *ctx,
		       bool is_deriv, bool is_array, bool is_lod,
		       LLVMValueRef *coords_arg,
		       LLVMValueRef *derivs_arg)
{
	LLVMBuilderRef builder = ctx->builder;
	struct cube_selection_coords selcoords;
	LLVMValueRef coords[3];
	LLVMValueRef invma;

	if (is_array && !is_lod) {
		coords_arg[3] = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32,
						   &coords_arg[3], 1, 0);
	}

	build_cube_intrinsic(ctx, coords_arg, &selcoords);

	invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
				   ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
	invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

	for (int i = 0; i < 2; ++i)
		coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

	coords[2] = selcoords.id;

	if (is_deriv && derivs_arg) {
		LLVMValueRef derivs[4];
		int axis;

		/* Convert cube derivatives to 2D derivatives. */
		for (axis = 0; axis < 2; axis++) {
			LLVMValueRef deriv_st[2];
			LLVMValueRef deriv_ma;

			/* Transform the derivative alongside the texture
			 * coordinate. Mathematically, the correct formula is
			 * as follows. Assume we're projecting onto the +Z face
			 * and denote by dx/dh the derivative of the (original)
			 * X texture coordinate with respect to horizontal
			 * window coordinates. The projection onto the +Z face
			 * plane is:
			 *
			 *   f(x,z) = x/z
			 *
			 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
			 *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
			 *
			 * This motivates the implementation below.
			 *
			 * Whether this actually gives the expected results for
			 * apps that might feed in derivatives obtained via
			 * finite differences is anyone's guess. The OpenGL spec
			 * seems awfully quiet about how textureGrad for cube
			 * maps should be handled.
			 */
			build_cube_select(builder, &selcoords, &derivs_arg[axis * 3],
					  deriv_st, &deriv_ma);

			deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

			for (int i = 0; i < 2; ++i)
				derivs[axis * 2 + i] =
					LLVMBuildFSub(builder,
						      LLVMBuildFMul(builder, deriv_st[i], invma, ""),
						      LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
		}

		memcpy(derivs_arg, derivs, sizeof(derivs));
	}

	/* Shift the texture coordinate. This must be applied after the
	 * derivative calculation.
	 */
	for (int i = 0; i < 2; ++i)
		coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

	if (is_array) {
		/* for cube arrays coord.z = coord.w(array_index) * 8 + face */
		/* coords_arg.w component - array_index for cube arrays */
		LLVMValueRef tmp = LLVMBuildFMul(ctx->builder, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), "");
		coords[2] = LLVMBuildFAdd(ctx->builder, tmp, coords[2], "");
	}

	memcpy(coords_arg, coords, sizeof(coords));
}
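
/* Worked example (illustrative): for a cube array sample with array index
 * 1.6, the llvm.rint call above rounds the slice to 2.0 up front, so if the
 * selected face is +X (id = 0.0) the final coords[2] is 2.0 * 8 + 0.0 = 16.0.
 * Rounding the slice first keeps the layer selection at the nearest integer
 * layer, instead of letting the fractional 1.6 leak into the
 * "* 8 + face" computation.
 */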


LLVMValueRef
ac_build_fs_interp(struct ac_llvm_context *ctx,
		   LLVMValueRef llvm_chan,
		   LLVMValueRef attr_number,
		   LLVMValueRef params,
		   LLVMValueRef i,
		   LLVMValueRef j)
{
	LLVMValueRef args[5];
	LLVMValueRef p1;

	if (HAVE_LLVM < 0x0400) {
		LLVMValueRef ij[2];
		ij[0] = LLVMBuildBitCast(ctx->builder, i, ctx->i32, "");
		ij[1] = LLVMBuildBitCast(ctx->builder, j, ctx->i32, "");

		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;
		args[3] = ac_build_gather_values(ctx, ij, 2);
		return ac_build_intrinsic(ctx, "llvm.SI.fs.interp",
					  ctx->f32, args, 4,
					  AC_FUNC_ATTR_READNONE);
	}

	args[0] = i;
	args[1] = llvm_chan;
	args[2] = attr_number;
	args[3] = params;

	p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
				ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

	args[0] = p1;
	args[1] = j;
	args[2] = llvm_chan;
	args[3] = attr_number;
	args[4] = params;

	return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
				  ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef
ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
		       LLVMValueRef parameter,
		       LLVMValueRef llvm_chan,
		       LLVMValueRef attr_number,
		       LLVMValueRef params)
{
	LLVMValueRef args[4];
	if (HAVE_LLVM < 0x0400) {
		args[0] = llvm_chan;
		args[1] = attr_number;
		args[2] = params;

		return ac_build_intrinsic(ctx,
					  "llvm.SI.fs.constant",
					  ctx->f32, args, 3,
					  AC_FUNC_ATTR_READNONE);
	}

	args[0] = parameter;
	args[1] = llvm_chan;
	args[2] = attr_number;
	args[3] = params;

	return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
				  ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
}

LLVMValueRef
ac_build_gep0(struct ac_llvm_context *ctx,
	      LLVMValueRef base_ptr,
	      LLVMValueRef index)
{
	LLVMValueRef indices[2] = {
		LLVMConstInt(ctx->i32, 0, 0),
		index,
	};
	return LLVMBuildGEP(ctx->builder, base_ptr,
			    indices, 2, "");
}

void
ac_build_indexed_store(struct ac_llvm_context *ctx,
		       LLVMValueRef base_ptr, LLVMValueRef index,
		       LLVMValueRef value)
{
	LLVMBuildStore(ctx->builder, value,
		       ac_build_gep0(ctx, base_ptr, index));
}

/**
 * Build an LLVM IR indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform.
 */
LLVMValueRef
ac_build_indexed_load(struct ac_llvm_context *ctx,
		      LLVMValueRef base_ptr, LLVMValueRef index,
		      bool uniform)
{
	LLVMValueRef pointer;

	pointer = ac_build_gep0(ctx, base_ptr, index);
	if (uniform)
		LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
	return LLVMBuildLoad(ctx->builder, pointer, "");
}

/**
 * Do a load from &base_ptr[index], but also add a flag that it's loading
 * a constant from a dynamically uniform index.
 */
LLVMValueRef
ac_build_indexed_load_const(struct ac_llvm_context *ctx,
			    LLVMValueRef base_ptr, LLVMValueRef index)
{
	LLVMValueRef result = ac_build_indexed_load(ctx, base_ptr, index, true);
	LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
	return result;
}
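
/* Typical use (illustrative): descriptor fetches go through this helper so
 * that both SMEM scalarization (amdgpu.uniform) and redundant-load
 * elimination (invariant.load) can kick in:
 *
 *   LLVMValueRef rsrc = ac_build_indexed_load_const(ctx, list, index);
 *
 * where list and index are placeholders for a descriptor array pointer and
 * a dynamically uniform index.
 */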

/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 */
void
ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
			    LLVMValueRef rsrc,
			    LLVMValueRef vdata,
			    unsigned num_channels,
			    LLVMValueRef voffset,
			    LLVMValueRef soffset,
			    unsigned inst_offset,
			    bool glc,
			    bool slc,
			    bool writeonly_memory,
			    bool has_add_tid)
{
	/* TODO: Fix stores with ADD_TID and remove the "has_add_tid" flag. */
	if (!has_add_tid) {
		/* Split 3-channel stores, because LLVM doesn't support
		 * 3-channel intrinsics. */
		if (num_channels == 3) {
			LLVMValueRef v[3], v01;

			for (int i = 0; i < 3; i++) {
				v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
							       LLVMConstInt(ctx->i32, i, 0), "");
			}
			v01 = ac_build_gather_values(ctx, v, 2);

			ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
						    soffset, inst_offset, glc, slc,
						    writeonly_memory, has_add_tid);
			ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
						    soffset, inst_offset + 8,
						    glc, slc,
						    writeonly_memory, has_add_tid);
			return;
		}

		unsigned func = CLAMP(num_channels, 1, 3) - 1;
		static const char *types[] = {"f32", "v2f32", "v4f32"};
		char name[256];
		LLVMValueRef offset = soffset;

		if (inst_offset)
			offset = LLVMBuildAdd(ctx->builder, offset,
					      LLVMConstInt(ctx->i32, inst_offset, 0), "");
		if (voffset)
			offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");

		LLVMValueRef args[] = {
			ac_to_float(ctx, vdata),
			LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
			LLVMConstInt(ctx->i32, 0, 0),
			offset,
			LLVMConstInt(ctx->i1, glc, 0),
			LLVMConstInt(ctx->i1, slc, 0),
		};

		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
			 types[func]);

		ac_build_intrinsic(ctx, name, ctx->voidt,
				   args, ARRAY_SIZE(args),
				   writeonly_memory ?
					   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
					   AC_FUNC_ATTR_WRITEONLY);
		return;
	}

	static unsigned dfmt[] = {
		V_008F0C_BUF_DATA_FORMAT_32,
		V_008F0C_BUF_DATA_FORMAT_32_32,
		V_008F0C_BUF_DATA_FORMAT_32_32_32,
		V_008F0C_BUF_DATA_FORMAT_32_32_32_32
	};
	assert(num_channels >= 1 && num_channels <= 4);

	LLVMValueRef args[] = {
		rsrc,
		vdata,
		LLVMConstInt(ctx->i32, num_channels, 0),
		voffset ? voffset : LLVMGetUndef(ctx->i32),
		soffset,
		LLVMConstInt(ctx->i32, inst_offset, 0),
		LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
		LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
		LLVMConstInt(ctx->i32, voffset != NULL, 0),
		LLVMConstInt(ctx->i32, 0, 0), /* idxen */
		LLVMConstInt(ctx->i32, glc, 0),
		LLVMConstInt(ctx->i32, slc, 0),
		LLVMConstInt(ctx->i32, 0, 0), /* tfe */
	};

	/* The instruction offset field has 12 bits */
	assert(voffset || inst_offset < (1 << 12));

	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
	unsigned func = CLAMP(num_channels, 1, 3) - 1;
	const char *types[] = {"i32", "v2i32", "v4i32"};
	char name[256];
	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);

	ac_build_intrinsic(ctx, name, ctx->voidt,
			   args, ARRAY_SIZE(args),
			   AC_FUNC_ATTR_LEGACY);
}
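
/* Example (illustrative): storing a v4i32 vdata at a byte offset held in
 * voffset, with no scalar offset:
 *
 *   ac_build_buffer_store_dword(ctx, rsrc, vdata, 4, voffset, ctx->i32_0,
 *                               0, false, false, false, false);
 *
 * rsrc, vdata and voffset are placeholders for the buffer descriptor, the
 * value and the offset.
 */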

LLVMValueRef
ac_build_buffer_load(struct ac_llvm_context *ctx,
		     LLVMValueRef rsrc,
		     int num_channels,
		     LLVMValueRef vindex,
		     LLVMValueRef voffset,
		     LLVMValueRef soffset,
		     unsigned inst_offset,
		     unsigned glc,
		     unsigned slc,
		     bool can_speculate,
		     bool allow_smem)
{
	LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
	if (voffset)
		offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
	if (soffset)
		offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

	/* TODO: VI and later generations can use SMEM with GLC=1. */
	if (allow_smem && !glc && !slc) {
		assert(vindex == NULL);

		LLVMValueRef result[4];

		for (int i = 0; i < num_channels; i++) {
			if (i) {
				offset = LLVMBuildAdd(ctx->builder, offset,
						      LLVMConstInt(ctx->i32, 4, 0), "");
			}
			LLVMValueRef args[2] = {rsrc, offset};
			result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
						       ctx->f32, args, 2,
						       AC_FUNC_ATTR_READNONE |
						       AC_FUNC_ATTR_LEGACY);
		}
		if (num_channels == 1)
			return result[0];

		if (num_channels == 3)
			result[num_channels++] = LLVMGetUndef(ctx->f32);
		return ac_build_gather_values(ctx, result, num_channels);
	}

	unsigned func = CLAMP(num_channels, 1, 3) - 1;

	LLVMValueRef args[] = {
		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
		vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
		offset,
		LLVMConstInt(ctx->i1, glc, 0),
		LLVMConstInt(ctx->i1, slc, 0)
	};

	LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
			       ctx->v4f32};
	const char *type_names[] = {"f32", "v2f32", "v4f32"};
	char name[256];

	snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
		 type_names[func]);

	return ac_build_intrinsic(ctx, name, types[func], args,
				  ARRAY_SIZE(args),
				  /* READNONE means writes can't affect it, while
				   * READONLY means that writes can affect it. */
				  can_speculate && HAVE_LLVM >= 0x0400 ?
					  AC_FUNC_ATTR_READNONE :
					  AC_FUNC_ATTR_READONLY);
}

LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
					 LLVMValueRef rsrc,
					 LLVMValueRef vindex,
					 LLVMValueRef voffset,
					 bool can_speculate)
{
	LLVMValueRef args[] = {
		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
		vindex,
		voffset,
		LLVMConstInt(ctx->i1, 0, 0), /* glc */
		LLVMConstInt(ctx->i1, 0, 0), /* slc */
	};

	return ac_build_intrinsic(ctx,
				  "llvm.amdgcn.buffer.load.format.v4f32",
				  ctx->v4f32, args, ARRAY_SIZE(args),
				  /* READNONE means writes can't affect it, while
				   * READONLY means that writes can affect it. */
				  can_speculate && HAVE_LLVM >= 0x0400 ?
					  AC_FUNC_ATTR_READNONE :
					  AC_FUNC_ATTR_READONLY);
}

/**
 * Set range metadata on an instruction. This can only be used on load and
 * call instructions. If you know an instruction can only produce the values
 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
 * \p lo is the minimum value inclusive.
 * \p hi is the maximum value exclusive.
 */
static void set_range_metadata(struct ac_llvm_context *ctx,
			       LLVMValueRef value, unsigned lo, unsigned hi)
{
	LLVMValueRef range_md, md_args[2];
	LLVMTypeRef type = LLVMTypeOf(value);
	LLVMContextRef context = LLVMGetTypeContext(type);

	md_args[0] = LLVMConstInt(type, lo, false);
	md_args[1] = LLVMConstInt(type, hi, false);
	range_md = LLVMMDNodeInContext(context, md_args, 2);
	LLVMSetMetadata(value, ctx->range_md_kind, range_md);
}

LLVMValueRef
ac_get_thread_id(struct ac_llvm_context *ctx)
{
	LLVMValueRef tid;

	LLVMValueRef tid_args[2];
	tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
	tid_args[1] = LLVMConstInt(ctx->i32, 0, false);
	tid_args[1] = ac_build_intrinsic(ctx,
					 "llvm.amdgcn.mbcnt.lo", ctx->i32,
					 tid_args, 2, AC_FUNC_ATTR_READNONE);

	tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
				 ctx->i32, tid_args,
				 2, AC_FUNC_ATTR_READNONE);
	set_range_metadata(ctx, tid, 0, 64);
	return tid;
}
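
/* mbcnt.lo counts how many bits of the ~0u mask are set among the 32 lanes
 * below the current lane; mbcnt.hi adds the count for the upper 32 lanes.
 * The result is therefore the lane index within the 64-wide wavefront,
 * which is what the [0, 64) range metadata above asserts.
 */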

/*
 * SI implements derivatives using the local data store (LDS).
 * All writes to the LDS happen in all executing threads at
 * the same time. TID is the Thread ID for the current
 * thread and is a value between 0 and 63, representing
 * the thread's position in the wavefront.
 *
 * For the pixel shader, threads are grouped into quads of four pixels.
 * The TIDs of the pixels of a quad are:
 *
 *   +------+------+
 *   |4n + 0|4n + 1|
 *   +------+------+
 *   |4n + 2|4n + 3|
 *   +------+------+
 *
 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
 * the current pixel's column, and masking with 0xfffffffe yields the TID
 * of the left pixel of the current pixel's row.
 *
 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
 * adding 2 yields the TID of the pixel below the top pixel.
 */
LLVMValueRef
ac_build_ddxy(struct ac_llvm_context *ctx,
	      bool has_ds_bpermute,
	      uint32_t mask,
	      int idx,
	      LLVMValueRef val)
{
	LLVMValueRef tl, trbl, args[2];
	LLVMValueRef result;

	if (has_ds_bpermute) {
		LLVMValueRef thread_id, tl_tid, trbl_tid;
		thread_id = ac_get_thread_id(ctx);

		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
				      LLVMConstInt(ctx->i32, mask, false), "");

		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
					LLVMConstInt(ctx->i32, idx, false), "");

		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
				       LLVMConstInt(ctx->i32, 4, false), "");
		args[1] = val;
		tl = ac_build_intrinsic(ctx,
					"llvm.amdgcn.ds.bpermute", ctx->i32,
					args, 2,
					AC_FUNC_ATTR_READNONE |
					AC_FUNC_ATTR_CONVERGENT);

		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
				       LLVMConstInt(ctx->i32, 4, false), "");
		trbl = ac_build_intrinsic(ctx,
					  "llvm.amdgcn.ds.bpermute", ctx->i32,
					  args, 2,
					  AC_FUNC_ATTR_READNONE |
					  AC_FUNC_ATTR_CONVERGENT);
	} else {
		uint32_t masks[2];

		switch (mask) {
		case AC_TID_MASK_TOP_LEFT:
			masks[0] = 0x8000;
			if (idx == 1)
				masks[1] = 0x8055;
			else
				masks[1] = 0x80aa;

			break;
		case AC_TID_MASK_TOP:
			masks[0] = 0x8044;
			masks[1] = 0x80ee;
			break;
		case AC_TID_MASK_LEFT:
			masks[0] = 0x80a0;
			masks[1] = 0x80f5;
			break;
		}

		args[0] = val;
		args[1] = LLVMConstInt(ctx->i32, masks[0], false);

		tl = ac_build_intrinsic(ctx,
					"llvm.amdgcn.ds.swizzle", ctx->i32,
					args, 2,
					AC_FUNC_ATTR_READNONE |
					AC_FUNC_ATTR_CONVERGENT);

		args[1] = LLVMConstInt(ctx->i32, masks[1], false);
		trbl = ac_build_intrinsic(ctx,
					  "llvm.amdgcn.ds.swizzle", ctx->i32,
					  args, 2,
					  AC_FUNC_ATTR_READNONE |
					  AC_FUNC_ATTR_CONVERGENT);
	}

	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
	trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
	result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
	return result;
}

void
ac_build_sendmsg(struct ac_llvm_context *ctx,
		 uint32_t msg,
		 LLVMValueRef wave_id)
{
	LLVMValueRef args[2];
	const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.SI.sendmsg" : "llvm.amdgcn.s.sendmsg";
	args[0] = LLVMConstInt(ctx->i32, msg, false);
	args[1] = wave_id;
	ac_build_intrinsic(ctx, intr_name, ctx->voidt, args, 2, 0);
}

LLVMValueRef
ac_build_imsb(struct ac_llvm_context *ctx,
	      LLVMValueRef arg,
	      LLVMTypeRef dst_type)
{
	const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.AMDGPU.flbit.i32" :
						       "llvm.amdgcn.sffbh.i32";
	LLVMValueRef msb = ac_build_intrinsic(ctx, intr_name,
					      dst_type, &arg, 1,
					      AC_FUNC_ATTR_READNONE);

	/* The HW returns the last bit index from MSB, but NIR/TGSI wants
	 * the index from LSB. Invert it by doing "31 - msb". */
	msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
			   msb, "");

	LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
	LLVMValueRef cond = LLVMBuildOr(ctx->builder,
					LLVMBuildICmp(ctx->builder, LLVMIntEQ,
						      arg, LLVMConstInt(ctx->i32, 0, 0), ""),
					LLVMBuildICmp(ctx->builder, LLVMIntEQ,
						      arg, all_ones, ""), "");

	return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
}

LLVMValueRef
ac_build_umsb(struct ac_llvm_context *ctx,
	      LLVMValueRef arg,
	      LLVMTypeRef dst_type)
{
	LLVMValueRef args[2] = {
		arg,
		LLVMConstInt(ctx->i1, 1, 0),
	};
	LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32",
					      dst_type, args, ARRAY_SIZE(args),
					      AC_FUNC_ATTR_READNONE);

	/* The HW returns the last bit index from MSB, but TGSI/NIR wants
	 * the index from LSB. Invert it by doing "31 - msb". */
	msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
			   msb, "");

	/* check for zero */
	return LLVMBuildSelect(ctx->builder,
			       LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg,
					     LLVMConstInt(ctx->i32, 0, 0), ""),
			       LLVMConstInt(ctx->i32, -1, true), msb, "");
}
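
/* Worked example: for arg = 0x00000100, llvm.ctlz.i32 returns 23, and
 * 31 - 23 = 8 is the MSB index counted from the LSB. The i1 true argument
 * declares ctlz(0) undefined, which is why zero is special-cased to -1 by
 * the select above.
 */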

LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
			   LLVMValueRef b)
{
	LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
	return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
}

LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
	if (HAVE_LLVM >= 0x0500) {
		LLVMValueRef max[2] = {
			value,
			LLVMConstReal(ctx->f32, 0),
		};
		LLVMValueRef min[2] = {
			LLVMConstReal(ctx->f32, 1),
		};

		min[1] = ac_build_intrinsic(ctx, "llvm.maxnum.f32",
					    ctx->f32, max, 2,
					    AC_FUNC_ATTR_READNONE);
		return ac_build_intrinsic(ctx, "llvm.minnum.f32",
					  ctx->f32, min, 2,
					  AC_FUNC_ATTR_READNONE);
	}

	LLVMValueRef args[3] = {
		value,
		LLVMConstReal(ctx->f32, 0),
		LLVMConstReal(ctx->f32, 1),
	};

	return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
				  AC_FUNC_ATTR_READNONE |
				  AC_FUNC_ATTR_LEGACY);
}
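
/* Example (illustrative): saturating a value to [0, 1] before an export,
 * as done for unorm color outputs:
 *
 *   value = ac_build_clamp(ctx, value);
 *
 * On the LLVM >= 5.0 path, maxnum-then-minnum also canonicalizes a NaN
 * input to 0, since maxnum returns the non-NaN operand.
 */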

void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
	LLVMValueRef args[9];

	if (HAVE_LLVM >= 0x0500) {
		args[0] = LLVMConstInt(ctx->i32, a->target, 0);
		args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);

		if (a->compr) {
			LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
			LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);

			args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
						   v2i16, "");
			args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
						   v2i16, "");
			args[4] = LLVMConstInt(ctx->i1, a->done, 0);
			args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
					   ctx->voidt, args, 6, 0);
		} else {
			args[2] = a->out[0];
			args[3] = a->out[1];
			args[4] = a->out[2];
			args[5] = a->out[3];
			args[6] = LLVMConstInt(ctx->i1, a->done, 0);
			args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
					   ctx->voidt, args, 8, 0);
		}
		return;
	}

	args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
	args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
	args[2] = LLVMConstInt(ctx->i32, a->done, 0);
	args[3] = LLVMConstInt(ctx->i32, a->target, 0);
	args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
	memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);

	ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
			   AC_FUNC_ATTR_LEGACY);
}

LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
				   struct ac_image_args *a)
{
	LLVMTypeRef dst_type;
	LLVMValueRef args[11];
	unsigned num_args = 0;
	const char *name;
	char intr_name[128], type[64];

	if (HAVE_LLVM >= 0x0400) {
		bool sample = a->opcode == ac_image_sample ||
			      a->opcode == ac_image_gather4 ||
			      a->opcode == ac_image_get_lod;

		if (sample)
			args[num_args++] = ac_to_float(ctx, a->addr);
		else
			args[num_args++] = a->addr;

		args[num_args++] = a->resource;
		if (sample)
			args[num_args++] = a->sampler;
		args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
		if (sample)
			args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
		args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* glc */
		args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* slc */
		args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* lwe */
		args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0);

		switch (a->opcode) {
		case ac_image_sample:
			name = "llvm.amdgcn.image.sample";
			break;
		case ac_image_gather4:
			name = "llvm.amdgcn.image.gather4";
			break;
		case ac_image_load:
			name = "llvm.amdgcn.image.load";
			break;
		case ac_image_load_mip:
			name = "llvm.amdgcn.image.load.mip";
			break;
		case ac_image_get_lod:
			name = "llvm.amdgcn.image.getlod";
			break;
		case ac_image_get_resinfo:
			name = "llvm.amdgcn.image.getresinfo";
			break;
		default:
			unreachable("invalid image opcode");
		}

		ac_build_type_name_for_intr(LLVMTypeOf(args[0]), type,
					    sizeof(type));

		snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
			 name,
			 a->compare ? ".c" : "",
			 a->bias ? ".b" :
			 a->lod ? ".l" :
			 a->deriv ? ".d" :
			 a->level_zero ? ".lz" : "",
			 a->offset ? ".o" : "",
			 type);

		LLVMValueRef result =
			ac_build_intrinsic(ctx, intr_name,
					   ctx->v4f32, args, num_args,
					   AC_FUNC_ATTR_READNONE);
		if (!sample) {
			result = LLVMBuildBitCast(ctx->builder, result,
						  ctx->v4i32, "");
		}
		return result;
	}

	args[num_args++] = a->addr;
	args[num_args++] = a->resource;

	if (a->opcode == ac_image_load ||
	    a->opcode == ac_image_load_mip ||
	    a->opcode == ac_image_get_resinfo) {
		dst_type = ctx->v4i32;
	} else {
		dst_type = ctx->v4f32;
		args[num_args++] = a->sampler;
	}

	args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
	args[num_args++] = LLVMConstInt(ctx->i32, a->unorm, 0);
	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* r128 */
	args[num_args++] = LLVMConstInt(ctx->i32, a->da, 0);
	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* glc */
	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* slc */
	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* tfe */
	args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* lwe */

	switch (a->opcode) {
	case ac_image_sample:
		name = "llvm.SI.image.sample";
		break;
	case ac_image_gather4:
		name = "llvm.SI.gather4";
		break;
	case ac_image_load:
		name = "llvm.SI.image.load";
		break;
	case ac_image_load_mip:
		name = "llvm.SI.image.load.mip";
		break;
	case ac_image_get_lod:
		name = "llvm.SI.getlod";
		break;
	case ac_image_get_resinfo:
		name = "llvm.SI.getresinfo";
		break;
	}

	ac_build_type_name_for_intr(LLVMTypeOf(a->addr), type, sizeof(type));
	snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.%s",
		 name,
		 a->compare ? ".c" : "",
		 a->bias ? ".b" :
		 a->lod ? ".l" :
		 a->deriv ? ".d" :
		 a->level_zero ? ".lz" : "",
		 a->offset ? ".o" : "",
		 type);

	return ac_build_intrinsic(ctx, intr_name,
				  dst_type, args, num_args,
				  AC_FUNC_ATTR_READNONE |
				  AC_FUNC_ATTR_LEGACY);
}

LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
				    LLVMValueRef args[2])
{
	if (HAVE_LLVM >= 0x0500) {
		LLVMTypeRef v2f16 =
			LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
		LLVMValueRef res =
			ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
					   v2f16, args, 2,
					   AC_FUNC_ATTR_READNONE);
		return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
	}

	return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
				  AC_FUNC_ATTR_READNONE |
				  AC_FUNC_ATTR_LEGACY);
}

/**
 * KILL, AKA discard in GLSL.
 *
 * \param value  kill if value < 0.0 or value == NULL.
 */
void ac_build_kill(struct ac_llvm_context *ctx, LLVMValueRef value)
{
	if (value) {
		ac_build_intrinsic(ctx, "llvm.AMDGPU.kill", ctx->voidt,
				   &value, 1, AC_FUNC_ATTR_LEGACY);
	} else {
		ac_build_intrinsic(ctx, "llvm.AMDGPU.kilp", ctx->voidt,
				   NULL, 0, AC_FUNC_ATTR_LEGACY);
	}
}

LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
			  LLVMValueRef offset, LLVMValueRef width,
			  bool is_signed)
{
	LLVMValueRef args[] = {
		input,
		offset,
		width,
	};

	if (HAVE_LLVM >= 0x0500) {
		return ac_build_intrinsic(ctx,
					  is_signed ? "llvm.amdgcn.sbfe.i32" :
						      "llvm.amdgcn.ubfe.i32",
					  ctx->i32, args, 3,
					  AC_FUNC_ATTR_READNONE);
	}

	return ac_build_intrinsic(ctx,
				  is_signed ? "llvm.AMDGPU.bfe.i32" :
					      "llvm.AMDGPU.bfe.u32",
				  ctx->i32, args, 3,
				  AC_FUNC_ATTR_READNONE |
				  AC_FUNC_ATTR_LEGACY);
}

void ac_get_image_intr_name(const char *base_name,
			    LLVMTypeRef data_type,
			    LLVMTypeRef coords_type,
			    LLVMTypeRef rsrc_type,
			    char *out_name, unsigned out_len)
{
	char coords_type_name[8];

	ac_build_type_name_for_intr(coords_type, coords_type_name,
				    sizeof(coords_type_name));

	if (HAVE_LLVM <= 0x0309) {
		snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name);
	} else {
		char data_type_name[8];
		char rsrc_type_name[8];

		ac_build_type_name_for_intr(data_type, data_type_name,
					    sizeof(data_type_name));
		ac_build_type_name_for_intr(rsrc_type, rsrc_type_name,
					    sizeof(rsrc_type_name));
		snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
			 data_type_name, coords_type_name, rsrc_type_name);
	}
}
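
/* Example: for an image load returning v4f32 with v4i32 coordinates and a
 * v8i32 resource, this produces "llvm.amdgcn.image.load.v4f32.v4i32.v8i32"
 * on LLVM > 3.9, and just "llvm.amdgcn.image.load.v4i32" on older versions.
 */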

#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)

enum ac_ir_type {
	AC_IR_UNDEF,
	AC_IR_CONST,
	AC_IR_VALUE,
};

struct ac_vs_exp_chan
{
	LLVMValueRef value;
	float const_float;
	enum ac_ir_type type;
};

struct ac_vs_exp_inst {
	unsigned offset;
	LLVMValueRef inst;
	struct ac_vs_exp_chan chan[4];
};

struct ac_vs_exports {
	unsigned num;
	struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};

/* Return true if the PARAM export has been eliminated. */
static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
				      uint32_t num_outputs,
				      struct ac_vs_exp_inst *exp)
{
	unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
	bool is_zero[4] = {}, is_one[4] = {};

	for (i = 0; i < 4; i++) {
		/* It's a constant expression. Undef outputs are eliminated too. */
		if (exp->chan[i].type == AC_IR_UNDEF) {
			is_zero[i] = true;
			is_one[i] = true;
		} else if (exp->chan[i].type == AC_IR_CONST) {
			if (exp->chan[i].const_float == 0)
				is_zero[i] = true;
			else if (exp->chan[i].const_float == 1)
				is_one[i] = true;
			else
				return false; /* other constant */
		} else
			return false;
	}

	/* Only certain combinations of 0 and 1 can be eliminated. */
	if (is_zero[0] && is_zero[1] && is_zero[2])
		default_val = is_zero[3] ? 0 : 1;
	else if (is_one[0] && is_one[1] && is_one[2])
		default_val = is_zero[3] ? 2 : 3;
	else
		return false;

	/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
	LLVMInstructionEraseFromParent(exp->inst);

	/* Change OFFSET to DEFAULT_VAL. */
	for (i = 0; i < num_outputs; i++) {
		if (vs_output_param_offset[i] == exp->offset) {
			vs_output_param_offset[i] =
				AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
			break;
		}
	}
	return true;
}

static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
					   uint32_t num_outputs,
					   struct ac_vs_exports *processed,
					   struct ac_vs_exp_inst *exp)
{
	unsigned p, copy_back_channels = 0;

	/* See if the output is already in the list of processed outputs.
	 * The LLVMValueRef comparison relies on SSA.
	 */
	for (p = 0; p < processed->num; p++) {
		bool different = false;

		for (unsigned j = 0; j < 4; j++) {
			struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
			struct ac_vs_exp_chan *c2 = &exp->chan[j];

			/* Treat undef as a match. */
			if (c2->type == AC_IR_UNDEF)
				continue;

			/* If c1 is undef but c2 isn't, we can copy c2 to c1
			 * and consider the instruction duplicated.
			 */
			if (c1->type == AC_IR_UNDEF) {
				copy_back_channels |= 1 << j;
				continue;
			}

			/* Test whether the channels are not equal. */
			if (c1->type != c2->type ||
			    (c1->type == AC_IR_CONST &&
			     c1->const_float != c2->const_float) ||
			    (c1->type == AC_IR_VALUE &&
			     c1->value != c2->value)) {
				different = true;
				break;
			}
		}
		if (!different)
			break;

		copy_back_channels = 0;
	}
	if (p == processed->num)
		return false;

	/* If a match was found, but the matching export has undef where the new
	 * one has a normal value, copy the normal value to the undef channel.
	 */
	struct ac_vs_exp_inst *match = &processed->exp[p];

	while (copy_back_channels) {
		unsigned chan = u_bit_scan(&copy_back_channels);

		assert(match->chan[chan].type == AC_IR_UNDEF);
		LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
			       exp->chan[chan].value);
		match->chan[chan] = exp->chan[chan];
	}

	/* The PARAM export is duplicated. Kill it. */
	LLVMInstructionEraseFromParent(exp->inst);

	/* Change OFFSET to the matching export. */
	for (unsigned i = 0; i < num_outputs; i++) {
		if (vs_output_param_offset[i] == exp->offset) {
			vs_output_param_offset[i] = match->offset;
			break;
		}
	}
	return true;
}

void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
			    LLVMValueRef main_fn,
			    uint8_t *vs_output_param_offset,
			    uint32_t num_outputs,
			    uint8_t *num_param_exports)
{
	LLVMBasicBlockRef bb;
	bool removed_any = false;
	struct ac_vs_exports exports;

	exports.num = 0;

	/* Process all LLVM instructions. */
	bb = LLVMGetFirstBasicBlock(main_fn);
	while (bb) {
		LLVMValueRef inst = LLVMGetFirstInstruction(bb);

		while (inst) {
			LLVMValueRef cur = inst;
			inst = LLVMGetNextInstruction(inst);
			struct ac_vs_exp_inst exp;

			if (LLVMGetInstructionOpcode(cur) != LLVMCall)
				continue;

			LLVMValueRef callee = ac_llvm_get_called_value(cur);

			if (!ac_llvm_is_function(callee))
				continue;

			const char *name = LLVMGetValueName(callee);
			unsigned num_args = LLVMCountParams(callee);

			/* Check if this is an export instruction. */
			if ((num_args != 9 && num_args != 8) ||
			    (strcmp(name, "llvm.SI.export") &&
			     strcmp(name, "llvm.amdgcn.exp.f32")))
				continue;

			LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
			unsigned target = LLVMConstIntGetZExtValue(arg);

			if (target < V_008DFC_SQ_EXP_PARAM)
				continue;

			target -= V_008DFC_SQ_EXP_PARAM;

			/* Parse the instruction. */
			memset(&exp, 0, sizeof(exp));
			exp.offset = target;
			exp.inst = cur;

			for (unsigned i = 0; i < 4; i++) {
				LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

				exp.chan[i].value = v;

				if (LLVMIsUndef(v)) {
					exp.chan[i].type = AC_IR_UNDEF;
				} else if (LLVMIsAConstantFP(v)) {
					LLVMBool loses_info;
					exp.chan[i].type = AC_IR_CONST;
					exp.chan[i].const_float =
						LLVMConstRealGetDouble(v, &loses_info);
				} else {
					exp.chan[i].type = AC_IR_VALUE;
				}
			}

			/* Eliminate constant and duplicated PARAM exports. */
			if (ac_eliminate_const_output(vs_output_param_offset,
						      num_outputs, &exp) ||
			    ac_eliminate_duplicated_output(vs_output_param_offset,
							   num_outputs, &exports,
							   &exp)) {
				removed_any = true;
			} else {
				exports.exp[exports.num++] = exp;
			}
		}
		bb = LLVMGetNextBasicBlock(bb);
	}

	/* Remove holes in export memory due to removed PARAM exports.
	 * This is done by renumbering all PARAM exports.
	 */
	if (removed_any) {
		uint8_t old_offset[VARYING_SLOT_MAX];
		unsigned out, i;

		/* Make a copy of the offsets. We need the old version while
		 * we are modifying some of them. */
		memcpy(old_offset, vs_output_param_offset,
		       sizeof(old_offset));

		for (i = 0; i < exports.num; i++) {
			unsigned offset = exports.exp[i].offset;

			/* Update vs_output_param_offset. Multiple outputs can
			 * have the same offset.
			 */
			for (out = 0; out < num_outputs; out++) {
				if (old_offset[out] == offset)
					vs_output_param_offset[out] = i;
			}

			/* Change the PARAM offset in the instruction. */
			LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
				       LLVMConstInt(ctx->i32,
						    V_008DFC_SQ_EXP_PARAM + i, 0));
		}
		*num_param_exports = exports.num;
	}
}