src/amd/common/ac_llvm_build.c

   1 /*
   2  * Copyright 2014 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the
   6  * "Software"), to deal in the Software without restriction, including
   7  * without limitation the rights to use, copy, modify, merge, publish,
   8  * distribute, sub license, and/or sell copies of the Software, and to
   9  * permit persons to whom the Software is furnished to do so, subject to
  10  * the following conditions:
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  19  *
  20  * The above copyright notice and this permission notice (including the
  21  * next paragraph) shall be included in all copies or substantial portions
  22  * of the Software.
  23  *
  24  */
  25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
  26 #include "ac_llvm_build.h"
  27
  28 #include <llvm-c/Core.h>
  29
  30 #include "c11/threads.h"
  31
  32 #include <assert.h>
  33 #include <stdio.h>
  34
  35 #include "ac_llvm_util.h"
  36 #include "ac_exp_param.h"
  37 #include "util/bitscan.h"
  38 #include "util/macros.h"
  39 #include "sid.h"
  40
  41 #include "shader_enums.h"
  42
  43 /* Initialize module-independent parts of the context.
  44  *
  45  * The caller is responsible for initializing ctx::module and ctx::builder.
  46  */
  47 void
  48 ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context)
  49 {
  50         LLVMValueRef args[1];
  51
  52         ctx->context = context;
  53         ctx->module = NULL;
  54         ctx->builder = NULL;
  55
  56         ctx->voidt = LLVMVoidTypeInContext(ctx->context);
  57         ctx->i1 = LLVMInt1TypeInContext(ctx->context);
  58         ctx->i8 = LLVMInt8TypeInContext(ctx->context);
  59         ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
  60         ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
  61         ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
  62         ctx->f16 = LLVMHalfTypeInContext(ctx->context);
  63         ctx->f32 = LLVMFloatTypeInContext(ctx->context);
  64         ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
  65         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
  66         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  67         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
  68
  69         ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
  70         ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
  71         ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
  72         ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
  73
  74         ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
  75                                                      "range", 5);
  76
  77         ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
  78                                                                "invariant.load", 14);
  79
  80         ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
  81
  82         args[0] = LLVMConstReal(ctx->f32, 2.5);
  83         ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
  84
  85         ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
  86                                                         "amdgpu.uniform", 14);
  87
  88         ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
  89 }
  90
  91 unsigned
  92 ac_get_type_size(LLVMTypeRef type)
  93 {
  94         LLVMTypeKind kind = LLVMGetTypeKind(type);
  95
  96         switch (kind) {
  97         case LLVMIntegerTypeKind:
  98                 return LLVMGetIntTypeWidth(type) / 8;
  99         case LLVMFloatTypeKind:
 100                 return 4;
 101         case LLVMPointerTypeKind:
 102                 return 8;
 103         case LLVMVectorTypeKind:
 104                 return LLVMGetVectorSize(type) *
 105                        ac_get_type_size(LLVMGetElementType(type));
 106         case LLVMArrayTypeKind:
 107                 return LLVMGetArrayLength(type) *
 108                        ac_get_type_size(LLVMGetElementType(type));
 109         default:
 110                 assert(0);
 111                 return 0;
 112         }
 113 }
 114
 115 LLVMValueRef
 116 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
 117                    LLVMTypeRef return_type, LLVMValueRef *params,
 118                    unsigned param_count, unsigned attrib_mask)
 119 {
 120         LLVMValueRef function, call;
 121         bool set_callsite_attrs = HAVE_LLVM >= 0x0400 &&
 122                                   !(attrib_mask & AC_FUNC_ATTR_LEGACY);
 123
 124         function = LLVMGetNamedFunction(ctx->module, name);
 125         if (!function) {
 126                 LLVMTypeRef param_types[32], function_type;
 127                 unsigned i;
 128
 129                 assert(param_count <= 32);
 130
 131                 for (i = 0; i < param_count; ++i) {
 132                         assert(params[i]);
 133                         param_types[i] = LLVMTypeOf(params[i]);
 134                 }
 135                 function_type =
 136                     LLVMFunctionType(return_type, param_types, param_count, 0);
 137                 function = LLVMAddFunction(ctx->module, name, function_type);
 138
 139                 LLVMSetFunctionCallConv(function, LLVMCCallConv);
 140                 LLVMSetLinkage(function, LLVMExternalLinkage);
 141
 142                 if (!set_callsite_attrs)
 143                         ac_add_func_attributes(ctx->context, function, attrib_mask);
 144         }
 145
 146         call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
 147         if (set_callsite_attrs)
 148                 ac_add_func_attributes(ctx->context, call, attrib_mask);
 149         return call;
 150 }
 151
 152 static LLVMValueRef bitcast_to_float(struct ac_llvm_context *ctx,
 153                                      LLVMValueRef value)
 154 {
 155         LLVMTypeRef type = LLVMTypeOf(value);
 156         LLVMTypeRef new_type;
 157
 158         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
 159                 new_type = LLVMVectorType(ctx->f32, LLVMGetVectorSize(type));
 160         else
 161                 new_type = ctx->f32;
 162
 163         return LLVMBuildBitCast(ctx->builder, value, new_type, "");
 164 }
 165
 166 /**
 167  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 168  * intrinsic names).
 169  */
 170 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
 171 {
 172         LLVMTypeRef elem_type = type;
 173
 174         assert(bufsize >= 8);
 175
 176         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
 177                 int ret = snprintf(buf, bufsize, "v%u",
 178                                         LLVMGetVectorSize(type));
 179                 if (ret < 0) {
 180                         char *type_name = LLVMPrintTypeToString(type);
 181                         fprintf(stderr, "Error building type name for: %s\n",
 182                                 type_name);
 183                         return;
 184                 }
 185                 elem_type = LLVMGetElementType(type);
 186                 buf += ret;
 187                 bufsize -= ret;
 188         }
 189         switch (LLVMGetTypeKind(elem_type)) {
 190         default: break;
 191         case LLVMIntegerTypeKind:
 192                 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
 193                 break;
 194         case LLVMFloatTypeKind:
 195                 snprintf(buf, bufsize, "f32");
 196                 break;
 197         case LLVMDoubleTypeKind:
 198                 snprintf(buf, bufsize, "f64");
 199                 break;
 200         }
 201 }
 202
 203 LLVMValueRef
 204 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
 205                                 LLVMValueRef *values,
 206                                 unsigned value_count,
 207                                 unsigned value_stride,
 208                                 bool load,
 209                                 bool always_vector)
 210 {
 211         LLVMBuilderRef builder = ctx->builder;
 212         LLVMValueRef vec = NULL;
 213         unsigned i;
 214
 215         if (value_count == 1 && !always_vector) {
 216                 if (load)
 217                         return LLVMBuildLoad(builder, values[0], "");
 218                 return values[0];
 219         } else if (!value_count)
 220                 unreachable("value_count is 0");
 221
 222         for (i = 0; i < value_count; i++) {
 223                 LLVMValueRef value = values[i * value_stride];
 224                 if (load)
 225                         value = LLVMBuildLoad(builder, value, "");
 226
 227                 if (!i)
 228                         vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
 229                 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
 230                 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
 231         }
 232         return vec;
 233 }
 234
 235 LLVMValueRef
 236 ac_build_gather_values(struct ac_llvm_context *ctx,
 237                        LLVMValueRef *values,
 238                        unsigned value_count)
 239 {
 240         return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 241 }
 242
 243 LLVMValueRef
 244 ac_build_fdiv(struct ac_llvm_context *ctx,
 245               LLVMValueRef num,
 246               LLVMValueRef den)
 247 {
 248         LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, "");
 249
 250         if (!LLVMIsConstant(ret))
 251                 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
 252         return ret;
 253 }
 254
 255 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 256  * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 257  * already multiplied by two. id is the cube face number.
 258  */
 259 struct cube_selection_coords {
 260         LLVMValueRef stc[2];
 261         LLVMValueRef ma;
 262         LLVMValueRef id;
 263 };
 264
 265 static void
 266 build_cube_intrinsic(struct ac_llvm_context *ctx,
 267                      LLVMValueRef in[3],
 268                      struct cube_selection_coords *out)
 269 {
 270         LLVMTypeRef f32 = ctx->f32;
 271
 272         out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
 273                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 274         out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
 275                                          f32, in, 3, AC_FUNC_ATTR_READNONE);
 276         out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
 277                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 278         out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
 279                                      f32, in, 3, AC_FUNC_ATTR_READNONE);
 280 }
 281
 282 /**
 283  * Build a manual selection sequence for cube face sc/tc coordinates and
 284  * major axis vector (multiplied by 2 for consistency) for the given
 285  * vec3 \p coords, for the face implied by \p selcoords.
 286  *
 287  * For the major axis, we always adjust the sign to be in the direction of
 288  * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 289  * the selcoords major axis.
 290  */
 291 static void build_cube_select(LLVMBuilderRef builder,
 292                               const struct cube_selection_coords *selcoords,
 293                               const LLVMValueRef *coords,
 294                               LLVMValueRef *out_st,
 295                               LLVMValueRef *out_ma)
 296 {
 297         LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
 298         LLVMValueRef is_ma_positive;
 299         LLVMValueRef sgn_ma;
 300         LLVMValueRef is_ma_z, is_not_ma_z;
 301         LLVMValueRef is_ma_y;
 302         LLVMValueRef is_ma_x;
 303         LLVMValueRef sgn;
 304         LLVMValueRef tmp;
 305
 306         is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
 307                 selcoords->ma, LLVMConstReal(f32, 0.0), "");
 308         sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
 309                 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
 310
 311         is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
 312         is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
 313         is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
 314                 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
 315         is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
 316
 317         /* Select sc */
 318         tmp = LLVMBuildSelect(builder, is_ma_z, coords[2], coords[0], "");
 319         sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
 320                 LLVMBuildSelect(builder, is_ma_x, sgn_ma,
 321                         LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
 322         out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
 323
 324         /* Select tc */
 325         tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
 326         sgn = LLVMBuildSelect(builder, is_ma_y, LLVMBuildFNeg(builder, sgn_ma, ""),
 327                 LLVMConstReal(f32, -1.0), "");
 328         out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
 329
 330         /* Select ma */
 331         tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
 332                 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
 333         sgn = LLVMBuildSelect(builder, is_ma_positive,
 334                 LLVMConstReal(f32, 2.0), LLVMConstReal(f32, -2.0), "");
 335         *out_ma = LLVMBuildFMul(builder, tmp, sgn, "");
 336 }
 337
 338 void
 339 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
 340                        bool is_deriv, bool is_array,
 341                        LLVMValueRef *coords_arg,
 342                        LLVMValueRef *derivs_arg)
 343 {
 344
 345         LLVMBuilderRef builder = ctx->builder;
 346         struct cube_selection_coords selcoords;
 347         LLVMValueRef coords[3];
 348         LLVMValueRef invma;
 349
 350         build_cube_intrinsic(ctx, coords_arg, &selcoords);
 351
 352         invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
 353                         ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
 354         invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
 355
 356         for (int i = 0; i < 2; ++i)
 357                 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
 358
 359         coords[2] = selcoords.id;
 360
 361         if (is_deriv && derivs_arg) {
 362                 LLVMValueRef derivs[4];
 363                 int axis;
 364
 365                 /* Convert cube derivatives to 2D derivatives. */
 366                 for (axis = 0; axis < 2; axis++) {
 367                         LLVMValueRef deriv_st[2];
 368                         LLVMValueRef deriv_ma;
 369
 370                         /* Transform the derivative alongside the texture
 371                          * coordinate. Mathematically, the correct formula is
 372                          * as follows. Assume we're projecting onto the +Z face
 373                          * and denote by dx/dh the derivative of the (original)
 374                          * X texture coordinate with respect to horizontal
 375                          * window coordinates. The projection onto the +Z face
 376                          * plane is:
 377                          *
 378                          *   f(x,z) = x/z
 379                          *
 380                          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
 381                          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
 382                          *
 383                          * This motivatives the implementation below.
 384                          *
 385                          * Whether this actually gives the expected results for
 386                          * apps that might feed in derivatives obtained via
 387                          * finite differences is anyone's guess. The OpenGL spec
 388                          * seems awfully quiet about how textureGrad for cube
 389                          * maps should be handled.
 390                          */
 391                         build_cube_select(builder, &selcoords, &derivs_arg[axis * 3],
 392                                           deriv_st, &deriv_ma);
 393
 394                         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
 395
 396                         for (int i = 0; i < 2; ++i)
 397                                 derivs[axis * 2 + i] =
 398                                         LLVMBuildFSub(builder,
 399                                                 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
 400                                                 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
 401                 }
 402
 403                 memcpy(derivs_arg, derivs, sizeof(derivs));
 404         }
 405
 406         /* Shift the texture coordinate. This must be applied after the
 407          * derivative calculation.
 408          */
 409         for (int i = 0; i < 2; ++i)
 410                 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
 411
 412         if (is_array) {
 413                 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
 414                 /* coords_arg.w component - array_index for cube arrays */
 415                 LLVMValueRef tmp = LLVMBuildFMul(ctx->builder, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), "");
 416                 coords[2] = LLVMBuildFAdd(ctx->builder, tmp, coords[2], "");
 417         }
 418
 419         memcpy(coords_arg, coords, sizeof(coords));
 420 }
 421
 422
 423 LLVMValueRef
 424 ac_build_fs_interp(struct ac_llvm_context *ctx,
 425                    LLVMValueRef llvm_chan,
 426                    LLVMValueRef attr_number,
 427                    LLVMValueRef params,
 428                    LLVMValueRef i,
 429                    LLVMValueRef j)
 430 {
 431         LLVMValueRef args[5];
 432         LLVMValueRef p1;
 433
 434         if (HAVE_LLVM < 0x0400) {
 435                 LLVMValueRef ij[2];
 436                 ij[0] = LLVMBuildBitCast(ctx->builder, i, ctx->i32, "");
 437                 ij[1] = LLVMBuildBitCast(ctx->builder, j, ctx->i32, "");
 438
 439                 args[0] = llvm_chan;
 440                 args[1] = attr_number;
 441                 args[2] = params;
 442                 args[3] = ac_build_gather_values(ctx, ij, 2);
 443                 return ac_build_intrinsic(ctx, "llvm.SI.fs.interp",
 444                                           ctx->f32, args, 4,
 445                                           AC_FUNC_ATTR_READNONE);
 446         }
 447
 448         args[0] = i;
 449         args[1] = llvm_chan;
 450         args[2] = attr_number;
 451         args[3] = params;
 452
 453         p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
 454                                 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 455
 456         args[0] = p1;
 457         args[1] = j;
 458         args[2] = llvm_chan;
 459         args[3] = attr_number;
 460         args[4] = params;
 461
 462         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
 463                                   ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 464 }
 465
 466 LLVMValueRef
 467 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 468                        LLVMValueRef parameter,
 469                        LLVMValueRef llvm_chan,
 470                        LLVMValueRef attr_number,
 471                        LLVMValueRef params)
 472 {
 473         LLVMValueRef args[4];
 474         if (HAVE_LLVM < 0x0400) {
 475                 args[0] = llvm_chan;
 476                 args[1] = attr_number;
 477                 args[2] = params;
 478
 479                 return ac_build_intrinsic(ctx,
 480                                           "llvm.SI.fs.constant",
 481                                           ctx->f32, args, 3,
 482                                           AC_FUNC_ATTR_READNONE);
 483         }
 484
 485         args[0] = parameter;
 486         args[1] = llvm_chan;
 487         args[2] = attr_number;
 488         args[3] = params;
 489
 490         return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
 491                                   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
 492 }
 493
 494 LLVMValueRef
 495 ac_build_gep0(struct ac_llvm_context *ctx,
 496               LLVMValueRef base_ptr,
 497               LLVMValueRef index)
 498 {
 499         LLVMValueRef indices[2] = {
 500                 LLVMConstInt(ctx->i32, 0, 0),
 501                 index,
 502         };
 503         return LLVMBuildGEP(ctx->builder, base_ptr,
 504                             indices, 2, "");
 505 }
 506
 507 void
 508 ac_build_indexed_store(struct ac_llvm_context *ctx,
 509                        LLVMValueRef base_ptr, LLVMValueRef index,
 510                        LLVMValueRef value)
 511 {
 512         LLVMBuildStore(ctx->builder, value,
 513                        ac_build_gep0(ctx, base_ptr, index));
 514 }
 515
 516 /**
 517  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 518  * It's equivalent to doing a load from &base_ptr[index].
 519  *
 520  * \param base_ptr  Where the array starts.
 521  * \param index     The element index into the array.
 522  * \param uniform   Whether the base_ptr and index can be assumed to be
 523  *                  dynamically uniform
 524  */
 525 LLVMValueRef
 526 ac_build_indexed_load(struct ac_llvm_context *ctx,
 527                       LLVMValueRef base_ptr, LLVMValueRef index,
 528                       bool uniform)
 529 {
 530         LLVMValueRef pointer;
 531
 532         pointer = ac_build_gep0(ctx, base_ptr, index);
 533         if (uniform)
 534                 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
 535         return LLVMBuildLoad(ctx->builder, pointer, "");
 536 }
 537
 538 /**
 539  * Do a load from &base_ptr[index], but also add a flag that it's loading
 540  * a constant from a dynamically uniform index.
 541  */
 542 LLVMValueRef
 543 ac_build_indexed_load_const(struct ac_llvm_context *ctx,
 544                             LLVMValueRef base_ptr, LLVMValueRef index)
 545 {
 546         LLVMValueRef result = ac_build_indexed_load(ctx, base_ptr, index, true);
 547         LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
 548         return result;
 549 }
 550
 551 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 552  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 553  * or v4i32 (num_channels=3,4).
 554  */
 555 void
 556 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 557                             LLVMValueRef rsrc,
 558                             LLVMValueRef vdata,
 559                             unsigned num_channels,
 560                             LLVMValueRef voffset,
 561                             LLVMValueRef soffset,
 562                             unsigned inst_offset,
 563                             bool glc,
 564                             bool slc,
 565                             bool writeonly_memory,
 566                             bool has_add_tid)
 567 {
 568         /* TODO: Fix stores with ADD_TID and remove the "has_add_tid" flag. */
 569         if (!has_add_tid) {
 570                 /* Split 3 channel stores, becase LLVM doesn't support 3-channel
 571                  * intrinsics. */
 572                 if (num_channels == 3) {
 573                         LLVMValueRef v[3], v01;
 574
 575                         for (int i = 0; i < 3; i++) {
 576                                 v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
 577                                                 LLVMConstInt(ctx->i32, i, 0), "");
 578                         }
 579                         v01 = ac_build_gather_values(ctx, v, 2);
 580
 581                         ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
 582                                                     soffset, inst_offset, glc, slc,
 583                                                     writeonly_memory, has_add_tid);
 584                         ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
 585                                                     soffset, inst_offset + 8,
 586                                                     glc, slc,
 587                                                     writeonly_memory, has_add_tid);
 588                         return;
 589                 }
 590
 591                 unsigned func = CLAMP(num_channels, 1, 3) - 1;
 592                 static const char *types[] = {"f32", "v2f32", "v4f32"};
 593                 char name[256];
 594                 LLVMValueRef offset = soffset;
 595
 596                 if (inst_offset)
 597                         offset = LLVMBuildAdd(ctx->builder, offset,
 598                                               LLVMConstInt(ctx->i32, inst_offset, 0), "");
 599                 if (voffset)
 600                         offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
 601
 602                 LLVMValueRef args[] = {
 603                         bitcast_to_float(ctx, vdata),
 604                         LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
 605                         LLVMConstInt(ctx->i32, 0, 0),
 606                         offset,
 607                         LLVMConstInt(ctx->i1, glc, 0),
 608                         LLVMConstInt(ctx->i1, slc, 0),
 609                 };
 610
 611                 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
 612                          types[func]);
 613
 614                 ac_build_intrinsic(ctx, name, ctx->voidt,
 615                                    args, ARRAY_SIZE(args),
 616                                    writeonly_memory ?
 617                                            AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
 618                                            AC_FUNC_ATTR_WRITEONLY);
 619                 return;
 620         }
 621
 622         static unsigned dfmt[] = {
 623                 V_008F0C_BUF_DATA_FORMAT_32,
 624                 V_008F0C_BUF_DATA_FORMAT_32_32,
 625                 V_008F0C_BUF_DATA_FORMAT_32_32_32,
 626                 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
 627         };
 628         assert(num_channels >= 1 && num_channels <= 4);
 629
 630         LLVMValueRef args[] = {
 631                 rsrc,
 632                 vdata,
 633                 LLVMConstInt(ctx->i32, num_channels, 0),
 634                 voffset ? voffset : LLVMGetUndef(ctx->i32),
 635                 soffset,
 636                 LLVMConstInt(ctx->i32, inst_offset, 0),
 637                 LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
 638                 LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
 639                 LLVMConstInt(ctx->i32, voffset != NULL, 0),
 640                 LLVMConstInt(ctx->i32, 0, 0), /* idxen */
 641                 LLVMConstInt(ctx->i32, glc, 0),
 642                 LLVMConstInt(ctx->i32, slc, 0),
 643                 LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
 644         };
 645
 646         /* The instruction offset field has 12 bits */
 647         assert(voffset || inst_offset < (1 << 12));
 648
 649         /* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
 650         unsigned func = CLAMP(num_channels, 1, 3) - 1;
 651         const char *types[] = {"i32", "v2i32", "v4i32"};
 652         char name[256];
 653         snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
 654
 655         ac_build_intrinsic(ctx, name, ctx->voidt,
 656                            args, ARRAY_SIZE(args),
 657                            AC_FUNC_ATTR_LEGACY);
 658 }
 659
 660 LLVMValueRef
 661 ac_build_buffer_load(struct ac_llvm_context *ctx,
 662                      LLVMValueRef rsrc,
 663                      int num_channels,
 664                      LLVMValueRef vindex,
 665                      LLVMValueRef voffset,
 666                      LLVMValueRef soffset,
 667                      unsigned inst_offset,
 668                      unsigned glc,
 669                      unsigned slc,
 670                      bool can_speculate,
 671                      bool allow_smem)
 672 {
 673         LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
 674         if (voffset)
 675                 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
 676         if (soffset)
 677                 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
 678
 679         /* TODO: VI and later generations can use SMEM with GLC=1.*/
 680         if (allow_smem && !glc && !slc) {
 681                 assert(vindex == NULL);
 682
 683                 LLVMValueRef result[4];
 684
 685                 for (int i = 0; i < num_channels; i++) {
 686                         if (i) {
 687                                 offset = LLVMBuildAdd(ctx->builder, offset,
 688                                                       LLVMConstInt(ctx->i32, 4, 0), "");
 689                         }
 690                         LLVMValueRef args[2] = {rsrc, offset};
 691                         result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
 692                                                        ctx->f32, args, 2,
 693                                                        AC_FUNC_ATTR_READNONE |
 694                                                        AC_FUNC_ATTR_LEGACY);
 695                 }
 696                 if (num_channels == 1)
 697                         return result[0];
 698
 699                 if (num_channels == 3)
 700                         result[num_channels++] = LLVMGetUndef(ctx->f32);
 701                 return ac_build_gather_values(ctx, result, num_channels);
 702         }
 703
 704         unsigned func = CLAMP(num_channels, 1, 3) - 1;
 705
 706         LLVMValueRef args[] = {
 707                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
 708                 vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
 709                 offset,
 710                 LLVMConstInt(ctx->i1, glc, 0),
 711                 LLVMConstInt(ctx->i1, slc, 0)
 712         };
 713
 714         LLVMTypeRef types[] = {ctx->f32, LLVMVectorType(ctx->f32, 2),
 715                                ctx->v4f32};
 716         const char *type_names[] = {"f32", "v2f32", "v4f32"};
 717         char name[256];
 718
 719         snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
 720                  type_names[func]);
 721
 722         return ac_build_intrinsic(ctx, name, types[func], args,
 723                                   ARRAY_SIZE(args),
 724                                   /* READNONE means writes can't affect it, while
 725                                    * READONLY means that writes can affect it. */
 726                                   can_speculate && HAVE_LLVM >= 0x0400 ?
 727                                           AC_FUNC_ATTR_READNONE :
 728                                           AC_FUNC_ATTR_READONLY);
 729 }
 730
 731 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
 732                                          LLVMValueRef rsrc,
 733                                          LLVMValueRef vindex,
 734                                          LLVMValueRef voffset,
 735                                          bool can_speculate)
 736 {
 737         LLVMValueRef args [] = {
 738                 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
 739                 vindex,
 740                 voffset,
 741                 LLVMConstInt(ctx->i1, 0, 0), /* glc */
 742                 LLVMConstInt(ctx->i1, 0, 0), /* slc */
 743         };
 744
 745         return ac_build_intrinsic(ctx,
 746                                   "llvm.amdgcn.buffer.load.format.v4f32",
 747                                   ctx->v4f32, args, ARRAY_SIZE(args),
 748                                   /* READNONE means writes can't affect it, while
 749                                    * READONLY means that writes can affect it. */
 750                                   can_speculate && HAVE_LLVM >= 0x0400 ?
 751                                           AC_FUNC_ATTR_READNONE :
 752                                           AC_FUNC_ATTR_READONLY);
 753 }
 754
 755 /**
 756  * Set range metadata on an instruction.  This can only be used on load and
 757  * call instructions.  If you know an instruction can only produce the values
 758  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
 759  * \p lo is the minimum value inclusive.
 760  * \p hi is the maximum value exclusive.
 761  */
 762 static void set_range_metadata(struct ac_llvm_context *ctx,
 763                                LLVMValueRef value, unsigned lo, unsigned hi)
 764 {
 765         LLVMValueRef range_md, md_args[2];
 766         LLVMTypeRef type = LLVMTypeOf(value);
 767         LLVMContextRef context = LLVMGetTypeContext(type);
 768
 769         md_args[0] = LLVMConstInt(type, lo, false);
 770         md_args[1] = LLVMConstInt(type, hi, false);
 771         range_md = LLVMMDNodeInContext(context, md_args, 2);
 772         LLVMSetMetadata(value, ctx->range_md_kind, range_md);
 773 }
 774
 775 LLVMValueRef
 776 ac_get_thread_id(struct ac_llvm_context *ctx)
 777 {
 778         LLVMValueRef tid;
 779
 780         LLVMValueRef tid_args[2];
 781         tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
 782         tid_args[1] = LLVMConstInt(ctx->i32, 0, false);
 783         tid_args[1] = ac_build_intrinsic(ctx,
 784                                          "llvm.amdgcn.mbcnt.lo", ctx->i32,
 785                                          tid_args, 2, AC_FUNC_ATTR_READNONE);
 786
 787         tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
 788                                  ctx->i32, tid_args,
 789                                  2, AC_FUNC_ATTR_READNONE);
 790         set_range_metadata(ctx, tid, 0, 64);
 791         return tid;
 792 }
 793
 794 /*
 795  * SI implements derivatives using the local data store (LDS)
 796  * All writes to the LDS happen in all executing threads at
 797  * the same time. TID is the Thread ID for the current
 798  * thread and is a value between 0 and 63, representing
 799  * the thread's position in the wavefront.
 800  *
 801  * For the pixel shader threads are grouped into quads of four pixels.
 802  * The TIDs of the pixels of a quad are:
 803  *
 804  *  +------+------+
 805  *  |4n + 0|4n + 1|
 806  *  +------+------+
 807  *  |4n + 2|4n + 3|
 808  *  +------+------+
 809  *
 810  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
 811  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
 812  * the current pixel's column, and masking with 0xfffffffe yields the TID
 813  * of the left pixel of the current pixel's row.
 814  *
 815  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
 816  * adding 2 yields the TID of the pixel below the top pixel.
 817  */
 818 LLVMValueRef
 819 ac_build_ddxy(struct ac_llvm_context *ctx,
 820               bool has_ds_bpermute,
 821               uint32_t mask,
 822               int idx,
 823               LLVMValueRef val)
 824 {
 825         LLVMValueRef tl, trbl, args[2];
 826         LLVMValueRef result;
 827
 828         if (has_ds_bpermute) {
 829                 LLVMValueRef thread_id, tl_tid, trbl_tid;
 830                 thread_id = ac_get_thread_id(ctx);
 831
 832                 tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
 833                                       LLVMConstInt(ctx->i32, mask, false), "");
 834
 835                 trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
 836                                         LLVMConstInt(ctx->i32, idx, false), "");
 837
 838                 args[0] = LLVMBuildMul(ctx->builder, tl_tid,
 839                                        LLVMConstInt(ctx->i32, 4, false), "");
 840                 args[1] = val;
 841                 tl = ac_build_intrinsic(ctx,
 842                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
 843                                         args, 2,
 844                                         AC_FUNC_ATTR_READNONE |
 845                                         AC_FUNC_ATTR_CONVERGENT);
 846
 847                 args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
 848                                        LLVMConstInt(ctx->i32, 4, false), "");
 849                 trbl = ac_build_intrinsic(ctx,
 850                                           "llvm.amdgcn.ds.bpermute", ctx->i32,
 851                                           args, 2,
 852                                           AC_FUNC_ATTR_READNONE |
 853                                           AC_FUNC_ATTR_CONVERGENT);
 854         } else {
 855                 uint32_t masks[2];
 856
 857                 switch (mask) {
 858                 case AC_TID_MASK_TOP_LEFT:
 859                         masks[0] = 0x8000;
 860                         if (idx == 1)
 861                                 masks[1] = 0x8055;
 862                         else
 863                                 masks[1] = 0x80aa;
 864
 865                         break;
 866                 case AC_TID_MASK_TOP:
 867                         masks[0] = 0x8044;
 868                         masks[1] = 0x80ee;
 869                         break;
 870                 case AC_TID_MASK_LEFT:
 871                         masks[0] = 0x80a0;
 872                         masks[1] = 0x80f5;
 873                         break;
 874                 }
 875
 876                 args[0] = val;
 877                 args[1] = LLVMConstInt(ctx->i32, masks[0], false);
 878
 879                 tl = ac_build_intrinsic(ctx,
 880                                         "llvm.amdgcn.ds.swizzle", ctx->i32,
 881                                         args, 2,
 882                                         AC_FUNC_ATTR_READNONE |
 883                                         AC_FUNC_ATTR_CONVERGENT);
 884
 885                 args[1] = LLVMConstInt(ctx->i32, masks[1], false);
 886                 trbl = ac_build_intrinsic(ctx,
 887                                         "llvm.amdgcn.ds.swizzle", ctx->i32,
 888                                         args, 2,
 889                                         AC_FUNC_ATTR_READNONE |
 890                                         AC_FUNC_ATTR_CONVERGENT);
 891         }
 892
 893         tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
 894         trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
 895         result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
 896         return result;
 897 }
 898
 899 void
 900 ac_build_sendmsg(struct ac_llvm_context *ctx,
 901                  uint32_t msg,
 902                  LLVMValueRef wave_id)
 903 {
 904         LLVMValueRef args[2];
 905         const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.SI.sendmsg" : "llvm.amdgcn.s.sendmsg";
 906         args[0] = LLVMConstInt(ctx->i32, msg, false);
 907         args[1] = wave_id;
 908         ac_build_intrinsic(ctx, intr_name, ctx->voidt, args, 2, 0);
 909 }
 910
 911 LLVMValueRef
 912 ac_build_imsb(struct ac_llvm_context *ctx,
 913               LLVMValueRef arg,
 914               LLVMTypeRef dst_type)
 915 {
 916         const char *intr_name = (HAVE_LLVM < 0x0400) ? "llvm.AMDGPU.flbit.i32" :
 917                                                        "llvm.amdgcn.sffbh.i32";
 918         LLVMValueRef msb = ac_build_intrinsic(ctx, intr_name,
 919                                               dst_type, &arg, 1,
 920                                               AC_FUNC_ATTR_READNONE);
 921
 922         /* The HW returns the last bit index from MSB, but NIR/TGSI wants
 923          * the index from LSB. Invert it by doing "31 - msb". */
 924         msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
 925                            msb, "");
 926
 927         LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
 928         LLVMValueRef cond = LLVMBuildOr(ctx->builder,
 929                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 930                                                       arg, LLVMConstInt(ctx->i32, 0, 0), ""),
 931                                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
 932                                                       arg, all_ones, ""), "");
 933
 934         return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
 935 }
 936
 937 LLVMValueRef
 938 ac_build_umsb(struct ac_llvm_context *ctx,
 939               LLVMValueRef arg,
 940               LLVMTypeRef dst_type)
 941 {
 942         LLVMValueRef args[2] = {
 943                 arg,
 944                 LLVMConstInt(ctx->i1, 1, 0),
 945         };
 946         LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32",
 947                                               dst_type, args, ARRAY_SIZE(args),
 948                                               AC_FUNC_ATTR_READNONE);
 949
 950         /* The HW returns the last bit index from MSB, but TGSI/NIR wants
 951          * the index from LSB. Invert it by doing "31 - msb". */
 952         msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),
 953                            msb, "");
 954
 955         /* check for zero */
 956         return LLVMBuildSelect(ctx->builder,
 957                                LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg,
 958                                              LLVMConstInt(ctx->i32, 0, 0), ""),
 959                                LLVMConstInt(ctx->i32, -1, true), msb, "");
 960 }
 961
 962 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
 963                            LLVMValueRef b)
 964 {
 965         LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
 966         return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
 967 }
 968
 969 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
 970 {
 971         if (HAVE_LLVM >= 0x0500) {
 972                 LLVMValueRef max[2] = {
 973                         value,
 974                         LLVMConstReal(ctx->f32, 0),
 975                 };
 976                 LLVMValueRef min[2] = {
 977                         LLVMConstReal(ctx->f32, 1),
 978                 };
 979
 980                 min[1] = ac_build_intrinsic(ctx, "llvm.maxnum.f32",
 981                                             ctx->f32, max, 2,
 982                                             AC_FUNC_ATTR_READNONE);
 983                 return ac_build_intrinsic(ctx, "llvm.minnum.f32",
 984                                           ctx->f32, min, 2,
 985                                           AC_FUNC_ATTR_READNONE);
 986         }
 987
 988         LLVMValueRef args[3] = {
 989                 value,
 990                 LLVMConstReal(ctx->f32, 0),
 991                 LLVMConstReal(ctx->f32, 1),
 992         };
 993
 994         return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
 995                                   AC_FUNC_ATTR_READNONE |
 996                                   AC_FUNC_ATTR_LEGACY);
 997 }
 998
 999 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1000 {
1001         LLVMValueRef args[9];
1002
1003         if (HAVE_LLVM >= 0x0500) {
1004                 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1005                 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1006
1007                 if (a->compr) {
1008                         LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
1009                         LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
1010
1011                         args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
1012                                                    v2i16, "");
1013                         args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
1014                                                    v2i16, "");
1015                         args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1016                         args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1017
1018                         ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
1019                                            ctx->voidt, args, 6, 0);
1020                 } else {
1021                         args[2] = a->out[0];
1022                         args[3] = a->out[1];
1023                         args[4] = a->out[2];
1024                         args[5] = a->out[3];
1025                         args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1026                         args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1027
1028                         ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
1029                                            ctx->voidt, args, 8, 0);
1030                 }
1031                 return;
1032         }
1033
1034         args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1035         args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
1036         args[2] = LLVMConstInt(ctx->i32, a->done, 0);
1037         args[3] = LLVMConstInt(ctx->i32, a->target, 0);
1038         args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
1039         memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
1040
1041         ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
1042                            AC_FUNC_ATTR_LEGACY);
1043 }
1044
1045 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
1046                                    struct ac_image_args *a)
1047 {
1048         LLVMTypeRef dst_type;
1049         LLVMValueRef args[11];
1050         unsigned num_args = 0;
1051         const char *name;
1052         char intr_name[128], type[64];
1053
1054         if (HAVE_LLVM >= 0x0400) {
1055                 bool sample = a->opcode == ac_image_sample ||
1056                               a->opcode == ac_image_gather4 ||
1057                               a->opcode == ac_image_get_lod;
1058
1059                 if (sample)
1060                         args[num_args++] = bitcast_to_float(ctx, a->addr);
1061                 else
1062                         args[num_args++] = a->addr;
1063
1064                 args[num_args++] = a->resource;
1065                 if (sample)
1066                         args[num_args++] = a->sampler;
1067                 args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
1068                 if (sample)
1069                         args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
1070                 args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* glc */
1071                 args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* slc */
1072                 args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* lwe */
1073                 args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0);
1074
1075                 switch (a->opcode) {
1076                 case ac_image_sample:
1077                         name = "llvm.amdgcn.image.sample";
1078                         break;
1079                 case ac_image_gather4:
1080                         name = "llvm.amdgcn.image.gather4";
1081                         break;
1082                 case ac_image_load:
1083                         name = "llvm.amdgcn.image.load";
1084                         break;
1085                 case ac_image_load_mip:
1086                         name = "llvm.amdgcn.image.load.mip";
1087                         break;
1088                 case ac_image_get_lod:
1089                         name = "llvm.amdgcn.image.getlod";
1090                         break;
1091                 case ac_image_get_resinfo:
1092                         name = "llvm.amdgcn.image.getresinfo";
1093                         break;
1094                 default:
1095                         unreachable("invalid image opcode");
1096                 }
1097
1098                 ac_build_type_name_for_intr(LLVMTypeOf(args[0]), type,
1099                                             sizeof(type));
1100
1101                 snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
1102                         name,
1103                         a->compare ? ".c" : "",
1104                         a->bias ? ".b" :
1105                         a->lod ? ".l" :
1106                         a->deriv ? ".d" :
1107                         a->level_zero ? ".lz" : "",
1108                         a->offset ? ".o" : "",
1109                         type);
1110
1111                 LLVMValueRef result =
1112                         ac_build_intrinsic(ctx, intr_name,
1113                                            ctx->v4f32, args, num_args,
1114                                            AC_FUNC_ATTR_READNONE);
1115                 if (!sample) {
1116                         result = LLVMBuildBitCast(ctx->builder, result,
1117                                                   ctx->v4i32, "");
1118                 }
1119                 return result;
1120         }
1121
1122         args[num_args++] = a->addr;
1123         args[num_args++] = a->resource;
1124
1125         if (a->opcode == ac_image_load ||
1126             a->opcode == ac_image_load_mip ||
1127             a->opcode == ac_image_get_resinfo) {
1128                 dst_type = ctx->v4i32;
1129         } else {
1130                 dst_type = ctx->v4f32;
1131                 args[num_args++] = a->sampler;
1132         }
1133
1134         args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
1135         args[num_args++] = LLVMConstInt(ctx->i32, a->unorm, 0);
1136         args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* r128 */
1137         args[num_args++] = LLVMConstInt(ctx->i32, a->da, 0);
1138         args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* glc */
1139         args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* slc */
1140         args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* tfe */
1141         args[num_args++] = LLVMConstInt(ctx->i32, 0, 0); /* lwe */
1142
1143         switch (a->opcode) {
1144         case ac_image_sample:
1145                 name = "llvm.SI.image.sample";
1146                 break;
1147         case ac_image_gather4:
1148                 name = "llvm.SI.gather4";
1149                 break;
1150         case ac_image_load:
1151                 name = "llvm.SI.image.load";
1152                 break;
1153         case ac_image_load_mip:
1154                 name = "llvm.SI.image.load.mip";
1155                 break;
1156         case ac_image_get_lod:
1157                 name = "llvm.SI.getlod";
1158                 break;
1159         case ac_image_get_resinfo:
1160                 name = "llvm.SI.getresinfo";
1161                 break;
1162         }
1163
1164         ac_build_type_name_for_intr(LLVMTypeOf(a->addr), type, sizeof(type));
1165         snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.%s",
1166                 name,
1167                 a->compare ? ".c" : "",
1168                 a->bias ? ".b" :
1169                 a->lod ? ".l" :
1170                 a->deriv ? ".d" :
1171                 a->level_zero ? ".lz" : "",
1172                 a->offset ? ".o" : "",
1173                 type);
1174
1175         return ac_build_intrinsic(ctx, intr_name,
1176                                   dst_type, args, num_args,
1177                                   AC_FUNC_ATTR_READNONE |
1178                                   AC_FUNC_ATTR_LEGACY);
1179 }
1180
1181 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
1182                                     LLVMValueRef args[2])
1183 {
1184         if (HAVE_LLVM >= 0x0500) {
1185                 LLVMTypeRef v2f16 =
1186                         LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
1187                 LLVMValueRef res =
1188                         ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
1189                                            v2f16, args, 2,
1190                                            AC_FUNC_ATTR_READNONE);
1191                 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1192         }
1193
1194         return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
1195                                   AC_FUNC_ATTR_READNONE |
1196                                   AC_FUNC_ATTR_LEGACY);
1197 }
1198
1199 /**
1200  * KILL, AKA discard in GLSL.
1201  *
1202  * \param value  kill if value < 0.0 or value == NULL.
1203  */
1204 void ac_build_kill(struct ac_llvm_context *ctx, LLVMValueRef value)
1205 {
1206         if (value) {
1207                 ac_build_intrinsic(ctx, "llvm.AMDGPU.kill", ctx->voidt,
1208                                    &value, 1, AC_FUNC_ATTR_LEGACY);
1209         } else {
1210                 ac_build_intrinsic(ctx, "llvm.AMDGPU.kilp", ctx->voidt,
1211                                    NULL, 0, AC_FUNC_ATTR_LEGACY);
1212         }
1213 }
1214
1215 LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
1216                           LLVMValueRef offset, LLVMValueRef width,
1217                           bool is_signed)
1218 {
1219         LLVMValueRef args[] = {
1220                 input,
1221                 offset,
1222                 width,
1223         };
1224
1225         if (HAVE_LLVM >= 0x0500) {
1226                 return ac_build_intrinsic(ctx,
1227                                           is_signed ? "llvm.amdgcn.sbfe.i32" :
1228                                                       "llvm.amdgcn.ubfe.i32",
1229                                           ctx->i32, args, 3,
1230                                           AC_FUNC_ATTR_READNONE);
1231         }
1232
1233         return ac_build_intrinsic(ctx,
1234                                   is_signed ? "llvm.AMDGPU.bfe.i32" :
1235                                               "llvm.AMDGPU.bfe.u32",
1236                                   ctx->i32, args, 3,
1237                                   AC_FUNC_ATTR_READNONE |
1238                                   AC_FUNC_ATTR_LEGACY);
1239 }
1240
1241 void ac_get_image_intr_name(const char *base_name,
1242                             LLVMTypeRef data_type,
1243                             LLVMTypeRef coords_type,
1244                             LLVMTypeRef rsrc_type,
1245                             char *out_name, unsigned out_len)
1246 {
1247         char coords_type_name[8];
1248
1249         ac_build_type_name_for_intr(coords_type, coords_type_name,
1250                             sizeof(coords_type_name));
1251
1252         if (HAVE_LLVM <= 0x0309) {
1253                 snprintf(out_name, out_len, "%s.%s", base_name, coords_type_name);
1254         } else {
1255                 char data_type_name[8];
1256                 char rsrc_type_name[8];
1257
1258                 ac_build_type_name_for_intr(data_type, data_type_name,
1259                                         sizeof(data_type_name));
1260                 ac_build_type_name_for_intr(rsrc_type, rsrc_type_name,
1261                                         sizeof(rsrc_type_name));
1262                 snprintf(out_name, out_len, "%s.%s.%s.%s", base_name,
1263                          data_type_name, coords_type_name, rsrc_type_name);
1264         }
1265 }
1266
/* Operand indices inside an export call, matching what ac_build_export()
 * emits: "llvm.amdgcn.exp.f32" (HAVE_LLVM >= 0x0500) puts the target at
 * operand 0 and the first output channel at operand 2, while the legacy
 * "llvm.SI.export" puts them at operands 3 and 5.
 */
#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
1269
/* Classification of one output channel of a parsed export instruction. */
enum ac_ir_type {
        AC_IR_UNDEF, /* the operand is an LLVM undef */
        AC_IR_CONST, /* the operand is a floating-point constant */
        AC_IR_VALUE, /* any other operand */
};
1275
/* One output channel of a parsed export instruction. */
struct ac_vs_exp_chan
{
        LLVMValueRef value;   /* the export operand for this channel */
        float const_float;    /* constant value; set only for AC_IR_CONST */
        enum ac_ir_type type; /* how the operand was classified */
};
1282
/* A parsed PARAM export instruction. */
struct ac_vs_exp_inst {
        unsigned offset;               /* export target minus V_008DFC_SQ_EXP_PARAM */
        LLVMValueRef inst;             /* the export call instruction itself */
        struct ac_vs_exp_chan chan[4]; /* the four output channels */
};
1288
/* List of export instructions; the first "num" entries of "exp" are valid. */
struct ac_vs_exports {
        unsigned num;
        struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
1293
1294 /* Return true if the PARAM export has been eliminated. */
1295 static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
1296                                       uint32_t num_outputs,
1297                                       struct ac_vs_exp_inst *exp)
1298 {
1299         unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
1300         bool is_zero[4] = {}, is_one[4] = {};
1301
1302         for (i = 0; i < 4; i++) {
1303                 /* It's a constant expression. Undef outputs are eliminated too. */
1304                 if (exp->chan[i].type == AC_IR_UNDEF) {
1305                         is_zero[i] = true;
1306                         is_one[i] = true;
1307                 } else if (exp->chan[i].type == AC_IR_CONST) {
1308                         if (exp->chan[i].const_float == 0)
1309                                 is_zero[i] = true;
1310                         else if (exp->chan[i].const_float == 1)
1311                                 is_one[i] = true;
1312                         else
1313                                 return false; /* other constant */
1314                 } else
1315                         return false;
1316         }
1317
1318         /* Only certain combinations of 0 and 1 can be eliminated. */
1319         if (is_zero[0] && is_zero[1] && is_zero[2])
1320                 default_val = is_zero[3] ? 0 : 1;
1321         else if (is_one[0] && is_one[1] && is_one[2])
1322                 default_val = is_zero[3] ? 2 : 3;
1323         else
1324                 return false;
1325
1326         /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
1327         LLVMInstructionEraseFromParent(exp->inst);
1328
1329         /* Change OFFSET to DEFAULT_VAL. */
1330         for (i = 0; i < num_outputs; i++) {
1331                 if (vs_output_param_offset[i] == exp->offset) {
1332                         vs_output_param_offset[i] =
1333                                 AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
1334                         break;
1335                 }
1336         }
1337         return true;
1338 }
1339
1340 static bool ac_eliminate_duplicated_output(uint8_t *vs_output_param_offset,
1341                                            uint32_t num_outputs,
1342                                            struct ac_vs_exports *processed,
1343                                            struct ac_vs_exp_inst *exp)
1344 {
1345         unsigned p, copy_back_channels = 0;
1346
1347         /* See if the output is already in the list of processed outputs.
1348          * The LLVMValueRef comparison relies on SSA.
1349          */
1350         for (p = 0; p < processed->num; p++) {
1351                 bool different = false;
1352
1353                 for (unsigned j = 0; j < 4; j++) {
1354                         struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
1355                         struct ac_vs_exp_chan *c2 = &exp->chan[j];
1356
1357                         /* Treat undef as a match. */
1358                         if (c2->type == AC_IR_UNDEF)
1359                                 continue;
1360
1361                         /* If c1 is undef but c2 isn't, we can copy c2 to c1
1362                          * and consider the instruction duplicated.
1363                          */
1364                         if (c1->type == AC_IR_UNDEF) {
1365                                 copy_back_channels |= 1 << j;
1366                                 continue;
1367                         }
1368
1369                         /* Test whether the channels are not equal. */
1370                         if (c1->type != c2->type ||
1371                             (c1->type == AC_IR_CONST &&
1372                              c1->const_float != c2->const_float) ||
1373                             (c1->type == AC_IR_VALUE &&
1374                              c1->value != c2->value)) {
1375                                 different = true;
1376                                 break;
1377                         }
1378                 }
1379                 if (!different)
1380                         break;
1381
1382                 copy_back_channels = 0;
1383         }
1384         if (p == processed->num)
1385                 return false;
1386
1387         /* If a match was found, but the matching export has undef where the new
1388          * one has a normal value, copy the normal value to the undef channel.
1389          */
1390         struct ac_vs_exp_inst *match = &processed->exp[p];
1391
1392         while (copy_back_channels) {
1393                 unsigned chan = u_bit_scan(&copy_back_channels);
1394
1395                 assert(match->chan[chan].type == AC_IR_UNDEF);
1396                 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
1397                                exp->chan[chan].value);
1398                 match->chan[chan] = exp->chan[chan];
1399         }
1400
1401         /* The PARAM export is duplicated. Kill it. */
1402         LLVMInstructionEraseFromParent(exp->inst);
1403
1404         /* Change OFFSET to the matching export. */
1405         for (unsigned i = 0; i < num_outputs; i++) {
1406                 if (vs_output_param_offset[i] == exp->offset) {
1407                         vs_output_param_offset[i] = match->offset;
1408                         break;
1409                 }
1410         }
1411         return true;
1412 }
1413
1414 void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
1415                             LLVMValueRef main_fn,
1416                             uint8_t *vs_output_param_offset,
1417                             uint32_t num_outputs,
1418                             uint8_t *num_param_exports)
1419 {
1420         LLVMBasicBlockRef bb;
1421         bool removed_any = false;
1422         struct ac_vs_exports exports;
1423
1424         exports.num = 0;
1425
1426         /* Process all LLVM instructions. */
1427         bb = LLVMGetFirstBasicBlock(main_fn);
1428         while (bb) {
1429                 LLVMValueRef inst = LLVMGetFirstInstruction(bb);
1430
1431                 while (inst) {
1432                         LLVMValueRef cur = inst;
1433                         inst = LLVMGetNextInstruction(inst);
1434                         struct ac_vs_exp_inst exp;
1435
1436                         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
1437                                 continue;
1438
1439                         LLVMValueRef callee = ac_llvm_get_called_value(cur);
1440
1441                         if (!ac_llvm_is_function(callee))
1442                                 continue;
1443
1444                         const char *name = LLVMGetValueName(callee);
1445                         unsigned num_args = LLVMCountParams(callee);
1446
1447                         /* Check if this is an export instruction. */
1448                         if ((num_args != 9 && num_args != 8) ||
1449                             (strcmp(name, "llvm.SI.export") &&
1450                              strcmp(name, "llvm.amdgcn.exp.f32")))
1451                                 continue;
1452
1453                         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
1454                         unsigned target = LLVMConstIntGetZExtValue(arg);
1455
1456                         if (target < V_008DFC_SQ_EXP_PARAM)
1457                                 continue;
1458
1459                         target -= V_008DFC_SQ_EXP_PARAM;
1460
1461                         /* Parse the instruction. */
1462                         memset(&exp, 0, sizeof(exp));
1463                         exp.offset = target;
1464                         exp.inst = cur;
1465
1466                         for (unsigned i = 0; i < 4; i++) {
1467                                 LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);
1468
1469                                 exp.chan[i].value = v;
1470
1471                                 if (LLVMIsUndef(v)) {
1472                                         exp.chan[i].type = AC_IR_UNDEF;
1473                                 } else if (LLVMIsAConstantFP(v)) {
1474                                         LLVMBool loses_info;
1475                                         exp.chan[i].type = AC_IR_CONST;
1476                                         exp.chan[i].const_float =
1477                                                 LLVMConstRealGetDouble(v, &loses_info);
1478                                 } else {
1479                                         exp.chan[i].type = AC_IR_VALUE;
1480                                 }
1481                         }
1482
1483                         /* Eliminate constant and duplicated PARAM exports. */
1484                         if (ac_eliminate_const_output(vs_output_param_offset,
1485                                                       num_outputs, &exp) ||
1486                             ac_eliminate_duplicated_output(vs_output_param_offset,
1487                                                            num_outputs, &exports,
1488                                                            &exp)) {
1489                                 removed_any = true;
1490                         } else {
1491                                 exports.exp[exports.num++] = exp;
1492                         }
1493                 }
1494                 bb = LLVMGetNextBasicBlock(bb);
1495         }
1496
1497         /* Remove holes in export memory due to removed PARAM exports.
1498          * This is done by renumbering all PARAM exports.
1499          */
1500         if (removed_any) {
1501                 uint8_t old_offset[VARYING_SLOT_MAX];
1502                 unsigned out, i;
1503
1504                 /* Make a copy of the offsets. We need the old version while
1505                  * we are modifying some of them. */
1506                 memcpy(old_offset, vs_output_param_offset,
1507                        sizeof(old_offset));
1508
1509                 for (i = 0; i < exports.num; i++) {
1510                         unsigned offset = exports.exp[i].offset;
1511
1512                         /* Update vs_output_param_offset. Multiple outputs can
1513                          * have the same offset.
1514                          */
1515                         for (out = 0; out < num_outputs; out++) {
1516                                 if (old_offset[out] == offset)
1517                                         vs_output_param_offset[out] = i;
1518                         }
1519
1520                         /* Change the PARAM offset in the instruction. */
1521                         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
1522                                        LLVMConstInt(ctx->i32,
1523                                                     V_008DFC_SQ_EXP_PARAM + i, 0));
1524                 }
1525                 *num_param_exports = exports.num;
1526         }
1527 }